# OpenTelemetry Collector Helm Values
# Chart: https://github.com/open-telemetry/opentelemetry-helm-charts
#
# Architecture:
# - DaemonSet mode: one collector per node for efficient data collection
# - OTLP receiver for traces, metrics, and logs
# - Filelog receiver for container logs (replaces Promtail)
# - Prometheus receiver for metrics scraping (replaces Prometheus scrape)
# - Exports to: Tempo (traces), Prometheus (metrics), Loki (logs)
#
# Pipeline:
# Applications → OTel Collector → Tempo/Prometheus/Loki → Grafana
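#
# Example install (illustrative; the release name and namespace are assumptions,
# not taken from this file):
#   helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts
#   helm upgrade --install otel-collector open-telemetry/opentelemetry-collector \
#     --namespace opentelemetry --create-namespace \
#     -f observability/opentelemetry/helm-values.yaml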
# =============================================================================
# Name Override
# =============================================================================
fullnameOverride: otel-collector
# =============================================================================
# Image Configuration
# =============================================================================
image:
  repository: otel/opentelemetry-collector-contrib
# =============================================================================
# Deployment Mode
# =============================================================================
mode: daemonset
# =============================================================================
# Resource Limits (no CPU limit, to avoid throttling; memory limit capped at 1024Mi)
# =============================================================================
resources:
  requests:
    cpu: 34m
    memory: 142Mi
  limits:
    memory: 1024Mi
# =============================================================================
# Environment Variables
# =============================================================================
extraEnvs:
  - name: K8S_NODE_NAME
    valueFrom:
      fieldRef:
        fieldPath: spec.nodeName
  - name: K8S_POD_NAME
    valueFrom:
      fieldRef:
        fieldPath: metadata.name
  - name: K8S_POD_IP
    valueFrom:
      fieldRef:
        fieldPath: status.podIP
# =============================================================================
# Extra Volumes for Log Collection
# =============================================================================
extraVolumes:
  - name: varlogpods
    hostPath:
      path: /var/log/pods
  - name: varlibdockercontainers
    hostPath:
      path: /var/lib/docker/containers
extraVolumeMounts:
  - name: varlogpods
    mountPath: /var/log/pods
    readOnly: true
  - name: varlibdockercontainers
    mountPath: /var/lib/docker/containers
    readOnly: true
# =============================================================================
# Ports
# =============================================================================
ports:
  otlp:
    enabled: true
    containerPort: 4317
    servicePort: 4317
    hostPort: 4317
    protocol: TCP
  otlp-http:
    enabled: true
    containerPort: 4318
    servicePort: 4318
    hostPort: 4318
    protocol: TCP
  metrics:
    enabled: true
    containerPort: 8888
    servicePort: 8888
    protocol: TCP
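# Illustrative application-side wiring (an assumption, not part of this chart's
# values): a workload can reach the node-local collector through the hostPorts
# above by resolving its node IP via the downward API:
#   env:
#     - name: NODE_IP
#       valueFrom:
#         fieldRef:
#           fieldPath: status.hostIP
#     - name: OTEL_EXPORTER_OTLP_ENDPOINT
#       value: "http://$(NODE_IP):4318"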
# =============================================================================
# OpenTelemetry Collector Configuration
# =============================================================================
config:
  # ---------------------------------------------------------------------------
  # Receivers - what data the collector accepts
  # ---------------------------------------------------------------------------
  receivers:
    # OTLP receiver for application telemetry
    otlp:
      protocols:
        grpc:
          endpoint: 0.0.0.0:4317
        http:
          endpoint: 0.0.0.0:4318
    # Filelog receiver for container logs (replaces Promtail)
    filelog:
      include:
        - /var/log/pods/*/*/*.log
      exclude:
        # Exclude the collector's own logs (pods are named after fullnameOverride) to prevent a feedback loop
        - /var/log/pods/opentelemetry_otel-collector*/*/*.log
      start_at: end
      include_file_path: true
      include_file_name: false
      operators:
        # Route based on log format
        - type: router
          id: get-format
          routes:
            - output: parser-docker
              expr: 'body matches "^\\{"'
            - output: parser-containerd
              expr: 'body matches "^[^ Z]+Z"'
          default: parser-containerd
        # Docker JSON format parser
        - type: json_parser
          id: parser-docker
          output: extract-metadata-from-filepath
          timestamp:
            parse_from: attributes.time
            layout: '%Y-%m-%dT%H:%M:%S.%LZ'
        # Containerd/CRI format parser
        - type: regex_parser
          id: parser-containerd
          regex: '^(?P<time>[^ ^Z]+Z) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$'
          output: extract-metadata-from-filepath
          timestamp:
            parse_from: attributes.time
            layout: '%Y-%m-%dT%H:%M:%S.%LZ'
        # Extract metadata from the log file path
        - type: regex_parser
          id: extract-metadata-from-filepath
          regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9-]+)\/(?P<container_name>[^\/]+)\/.*$'
          parse_from: attributes["log.file.path"]
        # Move extracted attributes to resource attributes
        - type: move
          from: attributes.namespace
          to: resource["k8s.namespace.name"]
        - type: move
          from: attributes.pod_name
          to: resource["k8s.pod.name"]
        - type: move
          from: attributes.container_name
          to: resource["k8s.container.name"]
        - type: move
          from: attributes.uid
          to: resource["k8s.pod.uid"]
        - type: move
          from: attributes.stream
          to: attributes["log.iostream"]
        - type: move
          from: attributes.log
          to: body
        # Loki label hints - tell the Loki exporter which attributes to use as labels
        - type: add
          field: resource["loki.resource.labels"]
          value: "k8s.namespace.name, k8s.pod.name, k8s.container.name, k8s.node.name"
        - type: add
          field: attributes["loki.attribute.labels"]
          value: "log.iostream"
    # Prometheus receiver - self metrics only
    # Infrastructure metrics (node-exporter, kube-state-metrics) are scraped directly by Prometheus
    prometheus:
      config:
        scrape_configs:
          # OTel Collector self metrics only
          - job_name: 'otel-collector'
            scrape_interval: 60s
            static_configs:
              - targets: ['${env:K8S_POD_IP}:8888']
  # ---------------------------------------------------------------------------
  # Processors - how data is transformed
  # ---------------------------------------------------------------------------
  processors:
    # Batch processor for efficient exports
    batch:
      timeout: 10s
      send_batch_size: 1024
      send_batch_max_size: 2048
    # Memory limiter to prevent OOM
    memory_limiter:
      check_interval: 5s
      limit_mib: 400
      spike_limit_mib: 100
    # Add Kubernetes metadata
    k8sattributes:
      extract:
        metadata:
          - k8s.namespace.name
          - k8s.deployment.name
          - k8s.pod.name
          - k8s.node.name
      passthrough: false
      pod_association:
        - sources:
            - from: resource_attribute
              name: k8s.pod.ip
        - sources:
            - from: resource_attribute
              name: k8s.pod.uid
        - sources:
            - from: connection
    # Resource detection
    resourcedetection:
      detectors: [env, system]
      timeout: 5s
      override: false
  # ---------------------------------------------------------------------------
  # Exporters - where data goes
  # ---------------------------------------------------------------------------
  exporters:
    # Tempo for traces
    otlp/tempo:
      endpoint: tempo.tempo.svc.cluster.local:4317
      tls:
        insecure: true
    # Prometheus remote write for metrics
    prometheusremotewrite:
      endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write
      tls:
        insecure: true
      external_labels:
        otel_collector: ${env:K8S_POD_NAME}
    # Loki for logs
    loki:
      endpoint: http://loki.loki.svc.cluster.local:3100/loki/api/v1/push
      default_labels_enabled:
        exporter: false
        level: true
    # Debug exporter (for troubleshooting)
    debug:
      verbosity: basic
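  # Note on the prometheusremotewrite exporter above: Prometheus only accepts
  # pushes on /api/v1/write when its remote-write receiver is enabled. With
  # kube-prometheus-stack that roughly corresponds to the following on the
  # Prometheus side (an assumption about that chart's values, not set here):
  #   prometheus:
  #     prometheusSpec:
  #       enableRemoteWriteReceiver: true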
  # ---------------------------------------------------------------------------
  # Extensions
  # ---------------------------------------------------------------------------
  extensions:
    health_check:
      endpoint: 0.0.0.0:13133
  # ---------------------------------------------------------------------------
  # Service pipelines
  # ---------------------------------------------------------------------------
  service:
    extensions: [health_check]
    pipelines:
      # Traces pipeline
      traces:
        receivers: [otlp]
        processors: [memory_limiter, k8sattributes, resourcedetection, batch]
        exporters: [otlp/tempo]
      # Metrics pipeline (OTLP + Prometheus scraping)
      metrics:
        receivers: [otlp, prometheus]
        processors: [memory_limiter, k8sattributes, resourcedetection, batch]
        exporters: [prometheusremotewrite]
      # Logs pipeline (OTLP + Filelog)
      logs:
        receivers: [otlp, filelog]
        processors: [memory_limiter, k8sattributes, resourcedetection, batch]
        exporters: [loki]
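      # Troubleshooting sketch (not enabled here): the debug exporter defined
      # above can be added to any pipeline to print telemetry to the collector's
      # stdout, e.g.:
      #   logs:
      #     receivers: [otlp, filelog]
      #     processors: [memory_limiter, k8sattributes, resourcedetection, batch]
      #     exporters: [loki, debug]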
# =============================================================================
# Service Account
# =============================================================================
serviceAccount:
  create: true
# =============================================================================
# RBAC for k8sattributes processor and prometheus receiver
# =============================================================================
clusterRole:
  create: true
  rules:
    - apiGroups: [""]
      resources: ["pods", "namespaces", "nodes", "endpoints", "services"]
      verbs: ["get", "watch", "list"]
    - apiGroups: ["apps"]
      resources: ["replicasets", "deployments"]
      verbs: ["get", "watch", "list"]
    - apiGroups: ["discovery.k8s.io"]
      resources: ["endpointslices"]
      verbs: ["get", "watch", "list"]
# =============================================================================
# ServiceMonitor for Prometheus (kept for backward compatibility)
# =============================================================================
serviceMonitor:
  enabled: true
  metricsEndpoints:
    - port: metrics
  extraLabels:
    release: prometheus
# =============================================================================
# Pod Monitor (disabled; collector self-metrics are scraped via the ServiceMonitor above)
# =============================================================================
podMonitor:
  enabled: false
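# =============================================================================
# Post-install checks (illustrative; assumes the "opentelemetry" namespace and
# that the DaemonSet keeps the fullnameOverride name without a chart suffix)
# =============================================================================
#   kubectl -n opentelemetry rollout status daemonset/otel-collector
#   kubectl -n opentelemetry logs daemonset/otel-collector --tail=50
#   # OTLP/HTTP smoke test against a node's hostPort (expects HTTP 200):
#   curl -s -X POST http://<node-ip>:4318/v1/traces \
#     -H 'Content-Type: application/json' -d '{"resourceSpans":[]}'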