From 1fdbb5e1dd551d519534a48691f5f190fc8dd1b4 Mon Sep 17 00:00:00 2001
From: Mayne0213
Date: Fri, 9 Jan 2026 23:30:41 +0900
Subject: [PATCH] FEAT(otel): enable Target Allocator for metrics

- Enable Target Allocator with consistent-hashing strategy
- Configure prometheus receiver to use Target Allocator
- Add RBAC permissions for secrets and events
- Use prometheusCR for ServiceMonitor/PodMonitor discovery
---
NOTE(review): this copy of the patch had been collapsed onto two lines. Line
breaks and YAML indentation below are reconstructed (2-space nesting assumed);
verify the context lines against the target files before running `git am`.
NOTE(review): `target_allocator` is laid out as a sibling of `config` under the
prometheus receiver; the collapsed text does not show its nesting - confirm.
NOTE(review): prometheusCR.scrapeInterval is 30s while global.scrape_interval
is 60s - confirm the difference is intended.
NOTE(review): collector_id reads ${env:K8S_POD_NAME}; nothing in these hunks
defines that variable - confirm the collector pod spec sets it.
NOTE(review): the removed `otel-collector` static job was the only scrape of
the collector's own :8888 endpoint visible here - confirm a ServiceMonitor or
PodMonitor now covers it.

 .../manifests/collector.yaml                  | 38 ++++++++++++++-----
 opentelemetry-collector/manifests/rbac.yaml   |  8 ++++
 2 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/opentelemetry-collector/manifests/collector.yaml b/opentelemetry-collector/manifests/collector.yaml
index fcab6d7..7dd9bb1 100644
--- a/opentelemetry-collector/manifests/collector.yaml
+++ b/opentelemetry-collector/manifests/collector.yaml
@@ -3,9 +3,9 @@
 #
 # Architecture:
 # - DaemonSet mode: one collector per node for log collection
-# - Target Allocator: distributes scrape targets across collectors
+# - Target Allocator (consistent-hashing): distributes scrape targets across collectors
 # - Filelog receiver for container logs
-# - Prometheus receiver with Target Allocator for metrics
+# - Prometheus receiver with Target Allocator for metrics (replaces Prometheus scraping)
 # - Exports to: Tempo (traces), Prometheus (metrics), Loki (logs)
 apiVersion: opentelemetry.io/v1beta1
 kind: OpenTelemetryCollector
@@ -17,8 +17,25 @@ spec:
   image: otel/opentelemetry-collector-contrib:0.113.0
   serviceAccount: otel-collector
 
-  # Target Allocator disabled - metrics collected by Prometheus directly
-  # OTel handles logs (filelog) and traces (otlp) only
+  # Target Allocator - distributes Prometheus scrape targets across collectors
+  # Using consistent-hashing strategy (not per-node due to collector-node mapping bug)
+  targetAllocator:
+    enabled: true
+    serviceAccount: otel-collector-targetallocator
+    image: ghcr.io/open-telemetry/opentelemetry-operator/target-allocator:0.113.0
+    allocationStrategy: consistent-hashing
+    filterStrategy: relabel-config
+    prometheusCR:
+      enabled: true
+      serviceMonitorSelector: {}
+      podMonitorSelector: {}
+      scrapeInterval: 30s
+    resources:
+      requests:
+        cpu: 10m
+        memory: 64Mi
+      limits:
+        memory: 128Mi
 
   resources:
     requests:
@@ -143,14 +160,15 @@ spec:
               from: attributes.log
               to: body
 
-      # Prometheus receiver - self metrics only
+      # Prometheus receiver - uses Target Allocator for ServiceMonitor/PodMonitor discovery
       prometheus:
         config:
-          scrape_configs:
-            - job_name: otel-collector
-              scrape_interval: 60s
-              static_configs:
-                - targets: ['${env:K8S_POD_IP}:8888']
+          global:
+            scrape_interval: 60s
+        target_allocator:
+          endpoint: http://otel-collector-targetallocator:80
+          interval: 30s
+          collector_id: ${env:K8S_POD_NAME}
 
     processors:
       batch:
diff --git a/opentelemetry-collector/manifests/rbac.yaml b/opentelemetry-collector/manifests/rbac.yaml
index f617665..f4022a0 100644
--- a/opentelemetry-collector/manifests/rbac.yaml
+++ b/opentelemetry-collector/manifests/rbac.yaml
@@ -59,6 +59,14 @@ rules:
   - apiGroups: [""]
     resources: ["pods", "nodes", "services", "endpoints", "namespaces"]
     verbs: ["get", "watch", "list"]
+  # Secrets and ConfigMaps (e.g. TLS certificates) referenced by ServiceMonitors
+  - apiGroups: [""]
+    resources: ["secrets", "configmaps"]
+    verbs: ["get", "watch", "list"]
+  # Events for status reporting
+  - apiGroups: [""]
+    resources: ["events"]
+    verbs: ["create", "patch"]
   - apiGroups: ["discovery.k8s.io"]
     resources: ["endpointslices"]
     verbs: ["get", "watch", "list"]