FEAT(otel): add OTel Collector for logs and traces
- Add OpenTelemetry Operator for CR management
- Deploy OTel Collector as DaemonSet via CR
- Enable filelog receiver for container log collection
- Replace Promtail with OTel filelog receiver
- Keep Prometheus for ServiceMonitor-based metrics scraping
This commit is contained in:
@@ -10,8 +10,9 @@ resources:
|
|||||||
- alertmanager/argocd.yaml
|
- alertmanager/argocd.yaml
|
||||||
- grafana/argocd.yaml
|
- grafana/argocd.yaml
|
||||||
- loki/argocd.yaml
|
- loki/argocd.yaml
|
||||||
- promtail/argocd.yaml
|
# promtail removed - OTel filelog receiver handles log collection
|
||||||
- tempo/argocd.yaml
|
- tempo/argocd.yaml
|
||||||
|
- opentelemetry-operator/argocd.yaml
|
||||||
- opentelemetry-collector/argocd.yaml
|
- opentelemetry-collector/argocd.yaml
|
||||||
- node-exporter/argocd.yaml
|
- node-exporter/argocd.yaml
|
||||||
- kube-state-metrics/argocd.yaml
|
- kube-state-metrics/argocd.yaml
|
||||||
|
|||||||
@@ -5,18 +5,14 @@ metadata:
|
|||||||
namespace: argocd
|
namespace: argocd
|
||||||
finalizers:
|
finalizers:
|
||||||
- resources-finalizer.argocd.argoproj.io
|
- resources-finalizer.argocd.argoproj.io
|
||||||
|
annotations:
|
||||||
|
argocd.argoproj.io/sync-wave: "1"
|
||||||
spec:
|
spec:
|
||||||
project: default
|
project: default
|
||||||
sources:
|
source:
|
||||||
- repoURL: https://open-telemetry.github.io/opentelemetry-helm-charts
|
repoURL: https://github.com/K3S-HOME/observability.git
|
||||||
chart: opentelemetry-collector
|
|
||||||
targetRevision: 0.108.0
|
|
||||||
helm:
|
|
||||||
valueFiles:
|
|
||||||
- $values/opentelemetry-collector/helm-values.yaml
|
|
||||||
- repoURL: https://github.com/K3S-HOME/observability.git
|
|
||||||
targetRevision: main
|
targetRevision: main
|
||||||
ref: values
|
path: opentelemetry-collector/manifests
|
||||||
destination:
|
destination:
|
||||||
server: https://kubernetes.default.svc
|
server: https://kubernetes.default.svc
|
||||||
namespace: opentelemetry
|
namespace: opentelemetry
|
||||||
@@ -29,6 +25,7 @@ spec:
|
|||||||
- CreateNamespace=true
|
- CreateNamespace=true
|
||||||
- PrunePropagationPolicy=foreground
|
- PrunePropagationPolicy=foreground
|
||||||
- PruneLast=true
|
- PruneLast=true
|
||||||
|
- ServerSideApply=true
|
||||||
retry:
|
retry:
|
||||||
limit: 5
|
limit: 5
|
||||||
backoff:
|
backoff:
|
||||||
|
|||||||
@@ -4,6 +4,8 @@
|
|||||||
# Architecture:
|
# Architecture:
|
||||||
# - DaemonSet mode: one collector per node for efficient data collection
|
# - DaemonSet mode: one collector per node for efficient data collection
|
||||||
# - OTLP receiver for traces, metrics, and logs
|
# - OTLP receiver for traces, metrics, and logs
|
||||||
|
# - Filelog receiver for container logs (replaces Promtail)
|
||||||
|
# - Prometheus receiver for collector self-metrics only (Prometheus still scrapes infrastructure metrics)
|
||||||
# - Exports to: Tempo (traces), Prometheus (metrics), Loki (logs)
|
# - Exports to: Tempo (traces), Prometheus (metrics), Loki (logs)
|
||||||
#
|
#
|
||||||
# Pipeline:
|
# Pipeline:
|
||||||
@@ -21,14 +23,14 @@ image:
|
|||||||
mode: daemonset
|
mode: daemonset
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Resource Limits (optimized for small cluster)
|
# Resource Limits (increased for log + metrics collection)
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 25m
|
cpu: 50m
|
||||||
memory: 64Mi
|
memory: 256Mi
|
||||||
limits:
|
limits:
|
||||||
memory: 64Mi
|
memory: 512Mi
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Tolerations (run on all nodes including master)
|
# Tolerations (run on all nodes including master)
|
||||||
@@ -38,6 +40,25 @@ tolerations:
|
|||||||
operator: Exists
|
operator: Exists
|
||||||
effect: NoSchedule
|
effect: NoSchedule
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Extra Volumes for Log Collection
|
||||||
|
# =============================================================================
|
||||||
|
extraVolumes:
|
||||||
|
- name: varlogpods
|
||||||
|
hostPath:
|
||||||
|
path: /var/log/pods
|
||||||
|
- name: varlibdockercontainers
|
||||||
|
hostPath:
|
||||||
|
path: /var/lib/docker/containers
|
||||||
|
|
||||||
|
extraVolumeMounts:
|
||||||
|
- name: varlogpods
|
||||||
|
mountPath: /var/log/pods
|
||||||
|
readOnly: true
|
||||||
|
- name: varlibdockercontainers
|
||||||
|
mountPath: /var/lib/docker/containers
|
||||||
|
readOnly: true
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Ports
|
# Ports
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -64,8 +85,11 @@ ports:
|
|||||||
# OpenTelemetry Collector Configuration
|
# OpenTelemetry Collector Configuration
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
config:
|
config:
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
# Receivers - what data the collector accepts
|
# Receivers - what data the collector accepts
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
receivers:
|
receivers:
|
||||||
|
# OTLP receiver for application telemetry
|
||||||
otlp:
|
otlp:
|
||||||
protocols:
|
protocols:
|
||||||
grpc:
|
grpc:
|
||||||
@@ -73,7 +97,84 @@ config:
|
|||||||
http:
|
http:
|
||||||
endpoint: 0.0.0.0:4318
|
endpoint: 0.0.0.0:4318
|
||||||
|
|
||||||
|
# Filelog receiver for container logs (replaces Promtail)
|
||||||
|
filelog:
|
||||||
|
include:
|
||||||
|
- /var/log/pods/*/*/*.log
|
||||||
|
exclude:
|
||||||
|
# Exclude collector's own logs to prevent feedback loop
|
||||||
|
- /var/log/pods/opentelemetry_opentelemetry-collector*/*/*.log
|
||||||
|
start_at: end
|
||||||
|
include_file_path: true
|
||||||
|
include_file_name: false
|
||||||
|
operators:
|
||||||
|
# Route based on log format
|
||||||
|
- type: router
|
||||||
|
id: get-format
|
||||||
|
routes:
|
||||||
|
- output: parser-docker
|
||||||
|
expr: 'body matches "^\\{"'
|
||||||
|
- output: parser-containerd
|
||||||
|
expr: 'body matches "^[^ Z]+Z"'
|
||||||
|
default: parser-containerd
|
||||||
|
|
||||||
|
# Docker JSON format parser
|
||||||
|
- type: json_parser
|
||||||
|
id: parser-docker
|
||||||
|
output: extract-metadata-from-filepath
|
||||||
|
timestamp:
|
||||||
|
parse_from: attributes.time
|
||||||
|
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
|
||||||
|
|
||||||
|
# Containerd/CRI format parser
|
||||||
|
- type: regex_parser
|
||||||
|
id: parser-containerd
|
||||||
|
regex: '^(?P<time>[^ ^Z]+Z) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$'
|
||||||
|
output: extract-metadata-from-filepath
|
||||||
|
timestamp:
|
||||||
|
parse_from: attributes.time
|
||||||
|
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
|
||||||
|
|
||||||
|
# Extract metadata from file path
|
||||||
|
- type: regex_parser
|
||||||
|
id: extract-metadata-from-filepath
|
||||||
|
regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9-]+)\/(?P<container_name>[^\/]+)\/.*$'
|
||||||
|
parse_from: attributes["log.file.path"]
|
||||||
|
|
||||||
|
# Move attributes to resource
|
||||||
|
- type: move
|
||||||
|
from: attributes.namespace
|
||||||
|
to: resource["k8s.namespace.name"]
|
||||||
|
- type: move
|
||||||
|
from: attributes.pod_name
|
||||||
|
to: resource["k8s.pod.name"]
|
||||||
|
- type: move
|
||||||
|
from: attributes.container_name
|
||||||
|
to: resource["k8s.container.name"]
|
||||||
|
- type: move
|
||||||
|
from: attributes.uid
|
||||||
|
to: resource["k8s.pod.uid"]
|
||||||
|
- type: move
|
||||||
|
from: attributes.stream
|
||||||
|
to: attributes["log.iostream"]
|
||||||
|
- type: move
|
||||||
|
from: attributes.log
|
||||||
|
to: body
|
||||||
|
|
||||||
|
# Prometheus receiver - self metrics only
|
||||||
|
# Infrastructure metrics (node-exporter, kube-state-metrics) handled by Prometheus
|
||||||
|
prometheus:
|
||||||
|
config:
|
||||||
|
scrape_configs:
|
||||||
|
# OTel Collector self metrics only
|
||||||
|
- job_name: 'otel-collector'
|
||||||
|
scrape_interval: 30s
|
||||||
|
static_configs:
|
||||||
|
- targets: ['${env:MY_POD_IP}:8888']
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
# Processors - how data is transformed
|
# Processors - how data is transformed
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
processors:
|
processors:
|
||||||
# Batch processor for efficient exports
|
# Batch processor for efficient exports
|
||||||
batch:
|
batch:
|
||||||
@@ -84,8 +185,8 @@ config:
|
|||||||
# Memory limiter to prevent OOM
|
# Memory limiter to prevent OOM
|
||||||
memory_limiter:
|
memory_limiter:
|
||||||
check_interval: 5s
|
check_interval: 5s
|
||||||
limit_mib: 200
|
limit_mib: 400
|
||||||
spike_limit_mib: 50
|
spike_limit_mib: 100
|
||||||
|
|
||||||
# Add Kubernetes metadata
|
# Add Kubernetes metadata
|
||||||
k8sattributes:
|
k8sattributes:
|
||||||
@@ -112,7 +213,9 @@ config:
|
|||||||
timeout: 5s
|
timeout: 5s
|
||||||
override: false
|
override: false
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
# Exporters - where data goes
|
# Exporters - where data goes
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
exporters:
|
exporters:
|
||||||
# Tempo for traces
|
# Tempo for traces
|
||||||
otlp/tempo:
|
otlp/tempo:
|
||||||
@@ -137,12 +240,16 @@ config:
|
|||||||
debug:
|
debug:
|
||||||
verbosity: basic
|
verbosity: basic
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
# Extensions
|
# Extensions
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
extensions:
|
extensions:
|
||||||
health_check:
|
health_check:
|
||||||
endpoint: 0.0.0.0:13133
|
endpoint: 0.0.0.0:13133
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
# Service pipelines
|
# Service pipelines
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
service:
|
service:
|
||||||
extensions: [health_check]
|
extensions: [health_check]
|
||||||
pipelines:
|
pipelines:
|
||||||
@@ -152,15 +259,15 @@ config:
|
|||||||
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
|
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
|
||||||
exporters: [otlp/tempo]
|
exporters: [otlp/tempo]
|
||||||
|
|
||||||
# Metrics pipeline
|
# Metrics pipeline (OTLP + Prometheus scraping)
|
||||||
metrics:
|
metrics:
|
||||||
receivers: [otlp]
|
receivers: [otlp, prometheus]
|
||||||
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
|
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
|
||||||
exporters: [prometheusremotewrite]
|
exporters: [prometheusremotewrite]
|
||||||
|
|
||||||
# Logs pipeline
|
# Logs pipeline (OTLP + Filelog)
|
||||||
logs:
|
logs:
|
||||||
receivers: [otlp]
|
receivers: [otlp, filelog]
|
||||||
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
|
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
|
||||||
exporters: [loki]
|
exporters: [loki]
|
||||||
|
|
||||||
@@ -171,20 +278,23 @@ serviceAccount:
|
|||||||
create: true
|
create: true
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# RBAC for k8sattributes processor
|
# RBAC for k8sattributes processor and prometheus receiver
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
clusterRole:
|
clusterRole:
|
||||||
create: true
|
create: true
|
||||||
rules:
|
rules:
|
||||||
- apiGroups: [""]
|
- apiGroups: [""]
|
||||||
resources: ["pods", "namespaces", "nodes"]
|
resources: ["pods", "namespaces", "nodes", "endpoints", "services"]
|
||||||
verbs: ["get", "watch", "list"]
|
verbs: ["get", "watch", "list"]
|
||||||
- apiGroups: ["apps"]
|
- apiGroups: ["apps"]
|
||||||
resources: ["replicasets", "deployments"]
|
resources: ["replicasets", "deployments"]
|
||||||
verbs: ["get", "watch", "list"]
|
verbs: ["get", "watch", "list"]
|
||||||
|
- apiGroups: ["discovery.k8s.io"]
|
||||||
|
resources: ["endpointslices"]
|
||||||
|
verbs: ["get", "watch", "list"]
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# ServiceMonitor for Prometheus
|
# ServiceMonitor for Prometheus (keep for backward compatibility)
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
serviceMonitor:
|
serviceMonitor:
|
||||||
enabled: true
|
enabled: true
|
||||||
|
|||||||
231
opentelemetry-collector/manifests/collector.yaml
Normal file
231
opentelemetry-collector/manifests/collector.yaml
Normal file
@@ -0,0 +1,231 @@
|
|||||||
|
# OpenTelemetry Collector (Target Allocator disabled)
|
||||||
|
# Managed by OpenTelemetry Operator
|
||||||
|
#
|
||||||
|
# Architecture:
|
||||||
|
# - DaemonSet mode: one collector per node for log collection
|
||||||
|
# - Target Allocator: disabled - metrics are collected by Prometheus directly
|
||||||
|
# - Filelog receiver for container logs
|
||||||
|
# - Prometheus receiver for collector self-metrics only
|
||||||
|
# - Exports to: Tempo (traces), Prometheus (metrics), Loki (logs)
|
||||||
|
apiVersion: opentelemetry.io/v1beta1
|
||||||
|
kind: OpenTelemetryCollector
|
||||||
|
metadata:
|
||||||
|
name: otel-collector
|
||||||
|
namespace: opentelemetry
|
||||||
|
spec:
|
||||||
|
mode: daemonset
|
||||||
|
image: otel/opentelemetry-collector-contrib:0.113.0
|
||||||
|
serviceAccount: otel-collector
|
||||||
|
|
||||||
|
# Target Allocator disabled - metrics collected by Prometheus directly
|
||||||
|
# OTel handles logs (filelog) and traces (otlp) only
|
||||||
|
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 50m
|
||||||
|
memory: 256Mi
|
||||||
|
limits:
|
||||||
|
memory: 512Mi
|
||||||
|
|
||||||
|
tolerations:
|
||||||
|
- key: node-role.kubernetes.io/control-plane
|
||||||
|
operator: Exists
|
||||||
|
effect: NoSchedule
|
||||||
|
|
||||||
|
volumeMounts:
|
||||||
|
- name: varlogpods
|
||||||
|
mountPath: /var/log/pods
|
||||||
|
readOnly: true
|
||||||
|
- name: varlibdockercontainers
|
||||||
|
mountPath: /var/lib/docker/containers
|
||||||
|
readOnly: true
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
- name: varlogpods
|
||||||
|
hostPath:
|
||||||
|
path: /var/log/pods
|
||||||
|
- name: varlibdockercontainers
|
||||||
|
hostPath:
|
||||||
|
path: /var/lib/docker/containers
|
||||||
|
|
||||||
|
ports:
|
||||||
|
- name: otlp-grpc
|
||||||
|
port: 4317
|
||||||
|
protocol: TCP
|
||||||
|
targetPort: 4317
|
||||||
|
- name: otlp-http
|
||||||
|
port: 4318
|
||||||
|
protocol: TCP
|
||||||
|
targetPort: 4318
|
||||||
|
- name: metrics
|
||||||
|
port: 8888
|
||||||
|
protocol: TCP
|
||||||
|
targetPort: 8888
|
||||||
|
|
||||||
|
env:
|
||||||
|
- name: K8S_NODE_NAME
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: spec.nodeName
|
||||||
|
- name: K8S_POD_NAME
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: metadata.name
|
||||||
|
- name: K8S_POD_IP
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: status.podIP
|
||||||
|
|
||||||
|
config:
|
||||||
|
receivers:
|
||||||
|
otlp:
|
||||||
|
protocols:
|
||||||
|
grpc:
|
||||||
|
endpoint: 0.0.0.0:4317
|
||||||
|
http:
|
||||||
|
endpoint: 0.0.0.0:4318
|
||||||
|
|
||||||
|
# Filelog receiver for container logs
|
||||||
|
filelog:
|
||||||
|
include:
|
||||||
|
- /var/log/pods/*/*/*.log
|
||||||
|
exclude:
|
||||||
|
- /var/log/pods/opentelemetry_otel-collector*/*/*.log
|
||||||
|
start_at: end
|
||||||
|
include_file_path: true
|
||||||
|
include_file_name: false
|
||||||
|
operators:
|
||||||
|
- type: router
|
||||||
|
id: get-format
|
||||||
|
routes:
|
||||||
|
- output: parser-docker
|
||||||
|
expr: 'body matches "^\\{"'
|
||||||
|
- output: parser-containerd
|
||||||
|
expr: 'body matches "^[^ Z]+Z"'
|
||||||
|
default: parser-containerd
|
||||||
|
|
||||||
|
- type: json_parser
|
||||||
|
id: parser-docker
|
||||||
|
output: extract-metadata-from-filepath
|
||||||
|
timestamp:
|
||||||
|
parse_from: attributes.time
|
||||||
|
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
|
||||||
|
|
||||||
|
- type: regex_parser
|
||||||
|
id: parser-containerd
|
||||||
|
regex: '^(?P<time>[^ ^Z]+Z) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$'
|
||||||
|
output: extract-metadata-from-filepath
|
||||||
|
timestamp:
|
||||||
|
parse_from: attributes.time
|
||||||
|
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
|
||||||
|
|
||||||
|
- type: regex_parser
|
||||||
|
id: extract-metadata-from-filepath
|
||||||
|
regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9-]+)\/(?P<container_name>[^\/]+)\/.*$'
|
||||||
|
parse_from: attributes["log.file.path"]
|
||||||
|
|
||||||
|
- type: move
|
||||||
|
from: attributes.namespace
|
||||||
|
to: resource["k8s.namespace.name"]
|
||||||
|
- type: move
|
||||||
|
from: attributes.pod_name
|
||||||
|
to: resource["k8s.pod.name"]
|
||||||
|
- type: move
|
||||||
|
from: attributes.container_name
|
||||||
|
to: resource["k8s.container.name"]
|
||||||
|
- type: move
|
||||||
|
from: attributes.uid
|
||||||
|
to: resource["k8s.pod.uid"]
|
||||||
|
- type: move
|
||||||
|
from: attributes.stream
|
||||||
|
to: attributes["log.iostream"]
|
||||||
|
- type: move
|
||||||
|
from: attributes.log
|
||||||
|
to: body
|
||||||
|
|
||||||
|
# Prometheus receiver - self metrics only
|
||||||
|
prometheus:
|
||||||
|
config:
|
||||||
|
scrape_configs:
|
||||||
|
- job_name: otel-collector
|
||||||
|
scrape_interval: 60s
|
||||||
|
static_configs:
|
||||||
|
- targets: ['${env:K8S_POD_IP}:8888']
|
||||||
|
|
||||||
|
processors:
|
||||||
|
batch:
|
||||||
|
timeout: 10s
|
||||||
|
send_batch_size: 1024
|
||||||
|
send_batch_max_size: 2048
|
||||||
|
|
||||||
|
memory_limiter:
|
||||||
|
check_interval: 5s
|
||||||
|
limit_mib: 400
|
||||||
|
spike_limit_mib: 100
|
||||||
|
|
||||||
|
k8sattributes:
|
||||||
|
extract:
|
||||||
|
metadata:
|
||||||
|
- k8s.namespace.name
|
||||||
|
- k8s.deployment.name
|
||||||
|
- k8s.pod.name
|
||||||
|
- k8s.node.name
|
||||||
|
passthrough: false
|
||||||
|
pod_association:
|
||||||
|
- sources:
|
||||||
|
- from: resource_attribute
|
||||||
|
name: k8s.pod.ip
|
||||||
|
- sources:
|
||||||
|
- from: resource_attribute
|
||||||
|
name: k8s.pod.uid
|
||||||
|
- sources:
|
||||||
|
- from: connection
|
||||||
|
|
||||||
|
resourcedetection:
|
||||||
|
detectors: [env, system]
|
||||||
|
timeout: 5s
|
||||||
|
override: false
|
||||||
|
|
||||||
|
exporters:
|
||||||
|
otlp/tempo:
|
||||||
|
endpoint: tempo.tempo.svc.cluster.local:4317
|
||||||
|
tls:
|
||||||
|
insecure: true
|
||||||
|
|
||||||
|
prometheusremotewrite:
|
||||||
|
endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write
|
||||||
|
tls:
|
||||||
|
insecure: true
|
||||||
|
external_labels:
|
||||||
|
otel_collector: ${env:K8S_POD_NAME}
|
||||||
|
|
||||||
|
loki:
|
||||||
|
endpoint: http://loki.loki.svc.cluster.local:3100/loki/api/v1/push
|
||||||
|
default_labels_enabled:
|
||||||
|
exporter: true
|
||||||
|
level: true
|
||||||
|
|
||||||
|
debug:
|
||||||
|
verbosity: basic
|
||||||
|
|
||||||
|
extensions:
|
||||||
|
health_check:
|
||||||
|
endpoint: 0.0.0.0:13133
|
||||||
|
|
||||||
|
service:
|
||||||
|
extensions: [health_check]
|
||||||
|
pipelines:
|
||||||
|
traces:
|
||||||
|
receivers: [otlp]
|
||||||
|
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
|
||||||
|
exporters: [otlp/tempo]
|
||||||
|
|
||||||
|
metrics:
|
||||||
|
receivers: [otlp, prometheus]
|
||||||
|
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
|
||||||
|
exporters: [prometheusremotewrite]
|
||||||
|
|
||||||
|
logs:
|
||||||
|
receivers: [otlp, filelog]
|
||||||
|
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
|
||||||
|
exporters: [loki]
|
||||||
6
opentelemetry-collector/manifests/kustomization.yaml
Normal file
6
opentelemetry-collector/manifests/kustomization.yaml
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
|
kind: Kustomization
|
||||||
|
|
||||||
|
resources:
|
||||||
|
- rbac.yaml
|
||||||
|
- collector.yaml
|
||||||
85
opentelemetry-collector/manifests/rbac.yaml
Normal file
85
opentelemetry-collector/manifests/rbac.yaml
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
# RBAC for OpenTelemetry Collector and Target Allocator
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
name: otel-collector
|
||||||
|
namespace: opentelemetry
|
||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRole
|
||||||
|
metadata:
|
||||||
|
name: otel-collector
|
||||||
|
rules:
|
||||||
|
# For k8sattributes processor
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["pods", "namespaces", "nodes", "endpoints", "services"]
|
||||||
|
verbs: ["get", "watch", "list"]
|
||||||
|
- apiGroups: ["apps"]
|
||||||
|
resources: ["replicasets", "deployments", "statefulsets", "daemonsets"]
|
||||||
|
verbs: ["get", "watch", "list"]
|
||||||
|
- apiGroups: ["discovery.k8s.io"]
|
||||||
|
resources: ["endpointslices"]
|
||||||
|
verbs: ["get", "watch", "list"]
|
||||||
|
# For Target Allocator - ServiceMonitor/PodMonitor discovery
|
||||||
|
- apiGroups: ["monitoring.coreos.com"]
|
||||||
|
resources: ["servicemonitors", "podmonitors"]
|
||||||
|
verbs: ["get", "watch", "list"]
|
||||||
|
# For node metrics
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["nodes/metrics", "nodes/stats", "nodes/proxy"]
|
||||||
|
verbs: ["get"]
|
||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
metadata:
|
||||||
|
name: otel-collector
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: ClusterRole
|
||||||
|
name: otel-collector
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: otel-collector
|
||||||
|
namespace: opentelemetry
|
||||||
|
---
|
||||||
|
# Target Allocator ServiceAccount and RBAC
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
name: otel-collector-targetallocator
|
||||||
|
namespace: opentelemetry
|
||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRole
|
||||||
|
metadata:
|
||||||
|
name: otel-targetallocator
|
||||||
|
rules:
|
||||||
|
# Core resources for service discovery
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["pods", "nodes", "services", "endpoints", "namespaces"]
|
||||||
|
verbs: ["get", "watch", "list"]
|
||||||
|
- apiGroups: ["discovery.k8s.io"]
|
||||||
|
resources: ["endpointslices"]
|
||||||
|
verbs: ["get", "watch", "list"]
|
||||||
|
# Prometheus CRs
|
||||||
|
- apiGroups: ["monitoring.coreos.com"]
|
||||||
|
resources: ["servicemonitors", "podmonitors", "probes", "scrapeconfigs"]
|
||||||
|
verbs: ["get", "watch", "list"]
|
||||||
|
# For allocator coordination
|
||||||
|
- apiGroups: ["opentelemetry.io"]
|
||||||
|
resources: ["opentelemetrycollectors"]
|
||||||
|
verbs: ["get", "watch", "list"]
|
||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
metadata:
|
||||||
|
name: otel-targetallocator
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: ClusterRole
|
||||||
|
name: otel-targetallocator
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: otel-collector-targetallocator
|
||||||
|
namespace: opentelemetry
|
||||||
44
opentelemetry-operator/argocd.yaml
Normal file
44
opentelemetry-operator/argocd.yaml
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
apiVersion: argoproj.io/v1alpha1
|
||||||
|
kind: Application
|
||||||
|
metadata:
|
||||||
|
name: opentelemetry-operator
|
||||||
|
namespace: argocd
|
||||||
|
finalizers:
|
||||||
|
- resources-finalizer.argocd.argoproj.io
|
||||||
|
annotations:
|
||||||
|
argocd.argoproj.io/sync-wave: "0"
|
||||||
|
spec:
|
||||||
|
project: default
|
||||||
|
sources:
|
||||||
|
- repoURL: https://open-telemetry.github.io/opentelemetry-helm-charts
|
||||||
|
chart: opentelemetry-operator
|
||||||
|
targetRevision: 0.74.0
|
||||||
|
helm:
|
||||||
|
valueFiles:
|
||||||
|
- $values/opentelemetry-operator/helm-values.yaml
|
||||||
|
- repoURL: https://github.com/K3S-HOME/observability.git
|
||||||
|
targetRevision: main
|
||||||
|
ref: values
|
||||||
|
destination:
|
||||||
|
server: https://kubernetes.default.svc
|
||||||
|
namespace: opentelemetry-operator
|
||||||
|
syncPolicy:
|
||||||
|
automated:
|
||||||
|
prune: true
|
||||||
|
selfHeal: true
|
||||||
|
allowEmpty: false
|
||||||
|
syncOptions:
|
||||||
|
- CreateNamespace=true
|
||||||
|
- PrunePropagationPolicy=foreground
|
||||||
|
- PruneLast=true
|
||||||
|
- ServerSideApply=true
|
||||||
|
retry:
|
||||||
|
limit: 5
|
||||||
|
backoff:
|
||||||
|
duration: 5s
|
||||||
|
factor: 2
|
||||||
|
maxDuration: 3m
|
||||||
|
managedNamespaceMetadata:
|
||||||
|
labels:
|
||||||
|
goldilocks.fairwinds.com/enabled: 'true'
|
||||||
|
revisionHistoryLimit: 10
|
||||||
41
opentelemetry-operator/helm-values.yaml
Normal file
41
opentelemetry-operator/helm-values.yaml
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
# OpenTelemetry Operator Helm Values
|
||||||
|
# Chart: https://github.com/open-telemetry/opentelemetry-helm-charts/tree/main/charts/opentelemetry-operator
|
||||||
|
|
||||||
|
# Manager (Operator) configuration
|
||||||
|
manager:
|
||||||
|
collectorImage:
|
||||||
|
repository: otel/opentelemetry-collector-contrib
|
||||||
|
targetAllocatorImage:
|
||||||
|
repository: ghcr.io/open-telemetry/opentelemetry-operator/target-allocator
|
||||||
|
autoInstrumentationImage:
|
||||||
|
java:
|
||||||
|
repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-java
|
||||||
|
nodejs:
|
||||||
|
repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-nodejs
|
||||||
|
python:
|
||||||
|
repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-python
|
||||||
|
dotnet:
|
||||||
|
repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-dotnet
|
||||||
|
go:
|
||||||
|
repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-go
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
memory: 256Mi
|
||||||
|
requests:
|
||||||
|
cpu: 10m
|
||||||
|
memory: 64Mi
|
||||||
|
|
||||||
|
# Admission webhooks (uses cert-manager self-signed CA)
|
||||||
|
admissionWebhooks:
|
||||||
|
certManager:
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Kube RBAC Proxy
|
||||||
|
kubeRBACProxy:
|
||||||
|
enabled: true
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
memory: 64Mi
|
||||||
|
requests:
|
||||||
|
cpu: 5m
|
||||||
|
memory: 32Mi
|
||||||
5
opentelemetry-operator/kustomization.yaml
Normal file
5
opentelemetry-operator/kustomization.yaml
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
|
kind: Kustomization
|
||||||
|
|
||||||
|
resources:
|
||||||
|
- argocd.yaml
|
||||||
@@ -45,6 +45,9 @@ prometheus:
|
|||||||
enabled: true
|
enabled: true
|
||||||
|
|
||||||
prometheusSpec:
|
prometheusSpec:
|
||||||
|
# Enable remote write receiver for OTel Collector
|
||||||
|
enableRemoteWriteReceiver: true
|
||||||
|
|
||||||
# HA: 2 replicas on different worker nodes
|
# HA: 2 replicas on different worker nodes
|
||||||
replicas: 2
|
replicas: 2
|
||||||
replicaExternalLabelName: prometheus_replica
|
replicaExternalLabelName: prometheus_replica
|
||||||
@@ -81,7 +84,7 @@ prometheus:
|
|||||||
limits:
|
limits:
|
||||||
memory: 768Mi
|
memory: 768Mi
|
||||||
|
|
||||||
# ServiceMonitor 자동 발견 - 모든 ServiceMonitor 선택
|
# ServiceMonitor selector - scrape all ServiceMonitors
|
||||||
serviceMonitorSelectorNilUsesHelmValues: false
|
serviceMonitorSelectorNilUsesHelmValues: false
|
||||||
serviceMonitorSelector: {}
|
serviceMonitorSelector: {}
|
||||||
podMonitorSelectorNilUsesHelmValues: false
|
podMonitorSelectorNilUsesHelmValues: false
|
||||||
@@ -100,69 +103,9 @@ prometheus:
|
|||||||
externalLabels:
|
externalLabels:
|
||||||
cluster: "mayne-cluster"
|
cluster: "mayne-cluster"
|
||||||
|
|
||||||
additionalScrapeConfigs:
|
# additionalScrapeConfigs removed - static scrape jobs are no longer defined here
|
||||||
# ArgoCD metrics
|
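# NOTE(review): the OTel prometheus receiver in this change only scrapes collector self-metrics (no kubernetes-pods job) - confirm these targets are covered by ServiceMonitors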
|
||||||
- job_name: 'argocd-metrics'
|
additionalScrapeConfigs: []
|
||||||
static_configs:
|
|
||||||
- targets:
|
|
||||||
- 'argocd-application-controller-metrics.argocd.svc.cluster.local:8082'
|
|
||||||
labels:
|
|
||||||
service: argocd-controller
|
|
||||||
- targets:
|
|
||||||
- 'argocd-server-metrics.argocd.svc.cluster.local:8083'
|
|
||||||
labels:
|
|
||||||
service: argocd-server
|
|
||||||
- targets:
|
|
||||||
- 'argocd-repo-server-metrics.argocd.svc.cluster.local:8084'
|
|
||||||
labels:
|
|
||||||
service: argocd-repo
|
|
||||||
|
|
||||||
# Cert-Manager
|
|
||||||
- job_name: 'cert-manager'
|
|
||||||
static_configs:
|
|
||||||
- targets:
|
|
||||||
- 'cert-manager.cert-manager.svc.cluster.local:9402'
|
|
||||||
|
|
||||||
# MinIO
|
|
||||||
- job_name: 'minio-cluster'
|
|
||||||
static_configs:
|
|
||||||
- targets:
|
|
||||||
- 'minio.minio.svc.cluster.local:9000'
|
|
||||||
metrics_path: /minio/v2/metrics/cluster
|
|
||||||
scheme: http
|
|
||||||
|
|
||||||
- job_name: 'minio-node'
|
|
||||||
static_configs:
|
|
||||||
- targets:
|
|
||||||
- 'minio.minio.svc.cluster.local:9000'
|
|
||||||
metrics_path: /minio/v2/metrics/node
|
|
||||||
scheme: http
|
|
||||||
|
|
||||||
# Ingress NGINX
|
|
||||||
- job_name: 'ingress-nginx'
|
|
||||||
kubernetes_sd_configs:
|
|
||||||
- role: pod
|
|
||||||
namespaces:
|
|
||||||
names:
|
|
||||||
- ingress-nginx
|
|
||||||
relabel_configs:
|
|
||||||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
|
|
||||||
action: keep
|
|
||||||
regex: ingress-nginx
|
|
||||||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
|
|
||||||
action: keep
|
|
||||||
regex: controller
|
|
||||||
- source_labels: [__address__]
|
|
||||||
action: replace
|
|
||||||
regex: ([^:]+)(?::\d+)?
|
|
||||||
replacement: $1:10254
|
|
||||||
target_label: __address__
|
|
||||||
- source_labels: [__meta_kubernetes_pod_name]
|
|
||||||
action: replace
|
|
||||||
target_label: pod
|
|
||||||
- source_labels: [__meta_kubernetes_namespace]
|
|
||||||
action: replace
|
|
||||||
target_label: namespace
|
|
||||||
|
|
||||||
# API Server 메트릭 수집 비활성화 (메모리 절감: ~37k series)
|
# API Server 메트릭 수집 비활성화 (메모리 절감: ~37k series)
|
||||||
kubeApiServer:
|
kubeApiServer:
|
||||||
|
|||||||
Reference in New Issue
Block a user