diff --git a/opentelemetry-collector/manifests/collector-metrics.yaml b/opentelemetry-collector/manifests/collector-metrics.yaml deleted file mode 100644 index 628a8ea..0000000 --- a/opentelemetry-collector/manifests/collector-metrics.yaml +++ /dev/null @@ -1,147 +0,0 @@ -# OpenTelemetry Collector for Metrics -# Deployment mode with Target Allocator (consistent-hashing) -apiVersion: opentelemetry.io/v1beta1 -kind: OpenTelemetryCollector -metadata: - name: otel-metrics - namespace: opentelemetry -spec: - mode: statefulset - replicas: 2 - image: otel/opentelemetry-collector-contrib:0.113.0 - serviceAccount: otel-collector - - # Target Allocator - distributes scrape targets across collector replicas - targetAllocator: - enabled: true - serviceAccount: otel-collector-targetallocator - image: ghcr.io/open-telemetry/opentelemetry-operator/target-allocator:0.113.0 - allocationStrategy: consistent-hashing - filterStrategy: relabel-config - prometheusCR: - enabled: true - serviceMonitorSelector: {} - podMonitorSelector: {} - scrapeInterval: 30s - resources: - requests: - cpu: 10m - memory: 64Mi - limits: - memory: 128Mi - - resources: - requests: - cpu: 50m - memory: 512Mi - limits: - memory: 1Gi - - ports: - - name: otlp-grpc - port: 4317 - protocol: TCP - targetPort: 4317 - - name: otlp-http - port: 4318 - protocol: TCP - targetPort: 4318 - - name: metrics - port: 8888 - protocol: TCP - targetPort: 8888 - - env: - - name: K8S_NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: K8S_POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: K8S_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - config: - receivers: - otlp: - protocols: - grpc: - endpoint: 0.0.0.0:4317 - http: - endpoint: 0.0.0.0:4318 - - # Prometheus receiver with Target Allocator - prometheus: - config: - global: - scrape_interval: 60s - scrape_configs: - - job_name: otel-metrics-self - scrape_interval: 60s - static_configs: - - targets: ['${env:K8S_POD_IP}:8888'] 
- target_allocator: - endpoint: http://otel-metrics-targetallocator:80 - interval: 30s - collector_id: ${env:K8S_POD_NAME} - - processors: - batch: - timeout: 10s - send_batch_size: 1024 - send_batch_max_size: 2048 - - memory_limiter: - check_interval: 5s - limit_mib: 400 - spike_limit_mib: 100 - - k8sattributes: - extract: - metadata: - - k8s.namespace.name - - k8s.deployment.name - - k8s.pod.name - - k8s.node.name - passthrough: false - pod_association: - - sources: - - from: resource_attribute - name: k8s.pod.ip - - sources: - - from: resource_attribute - name: k8s.pod.uid - - sources: - - from: connection - - resourcedetection: - detectors: [env, system] - timeout: 5s - override: false - - exporters: - prometheusremotewrite: - endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write - tls: - insecure: true - external_labels: - otel_collector: ${env:K8S_POD_NAME} - - debug: - verbosity: basic - - extensions: - health_check: - endpoint: 0.0.0.0:13133 - - service: - extensions: [health_check] - pipelines: - metrics: - receivers: [otlp, prometheus] - processors: [memory_limiter, k8sattributes, resourcedetection, batch] - exporters: [prometheusremotewrite] diff --git a/opentelemetry-collector/manifests/collector-logs.yaml b/opentelemetry-collector/manifests/collector.yaml similarity index 79% rename from opentelemetry-collector/manifests/collector-logs.yaml rename to opentelemetry-collector/manifests/collector.yaml index 13eb1b4..fcab6d7 100644 --- a/opentelemetry-collector/manifests/collector-logs.yaml +++ b/opentelemetry-collector/manifests/collector.yaml @@ -1,15 +1,25 @@ -# OpenTelemetry Collector for Logs and Traces -# DaemonSet mode - runs on every node for log collection +# OpenTelemetry Collector (logs, traces, and metrics) +# Managed by OpenTelemetry Operator +# +# Architecture: +# - DaemonSet mode: one collector per node for log collection +# - Target Allocator: disabled (Prometheus scrapes targets directly) +# - Filelog receiver for 
container logs +# - Prometheus receiver for the collector's own metrics (self-scrape only) +# - Exports to: Tempo (traces), Prometheus (metrics), Loki (logs) apiVersion: opentelemetry.io/v1beta1 kind: OpenTelemetryCollector metadata: - name: otel-logs + name: otel-collector namespace: opentelemetry spec: mode: daemonset image: otel/opentelemetry-collector-contrib:0.113.0 serviceAccount: otel-collector + # Target Allocator disabled - metrics collected by Prometheus directly + # OTel handles logs (filelog), traces (otlp), OTLP metrics, and its own self-metrics + resources: requests: cpu: 50m @@ -80,7 +90,7 @@ spec: include: - /var/log/pods/*/*/*.log exclude: - - /var/log/pods/opentelemetry_otel-*/*/*.log + - /var/log/pods/opentelemetry_otel-collector*/*/*.log start_at: end include_file_path: true include_file_name: false @@ -133,6 +143,15 @@ spec: from: attributes.log to: body + # Prometheus receiver - self metrics only + prometheus: + config: + scrape_configs: + - job_name: otel-collector + scrape_interval: 60s + static_configs: + - targets: ['${env:K8S_POD_IP}:8888'] + processors: batch: timeout: 10s @@ -173,6 +192,13 @@ spec: tls: insecure: true + prometheusremotewrite: + endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write + tls: + insecure: true + external_labels: + otel_collector: ${env:K8S_POD_NAME} + loki: endpoint: http://loki.loki.svc.cluster.local:3100/loki/api/v1/push default_labels_enabled: @@ -194,6 +220,11 @@ spec: processors: [memory_limiter, k8sattributes, resourcedetection, batch] exporters: [otlp/tempo] + metrics: + receivers: [otlp, prometheus] + processors: [memory_limiter, k8sattributes, resourcedetection, batch] + exporters: [prometheusremotewrite] + logs: receivers: [otlp, filelog] processors: [memory_limiter, k8sattributes, resourcedetection, batch] diff --git a/opentelemetry-collector/manifests/kustomization.yaml b/opentelemetry-collector/manifests/kustomization.yaml index e0cc2c4..73fb16e 100644 --- a/opentelemetry-collector/manifests/kustomization.yaml 
+++ b/opentelemetry-collector/manifests/kustomization.yaml @@ -3,5 +3,4 @@ kind: Kustomization resources: - rbac.yaml - - collector-logs.yaml - - collector-metrics.yaml + - collector.yaml diff --git a/opentelemetry-collector/manifests/rbac.yaml b/opentelemetry-collector/manifests/rbac.yaml index f4022a0..f617665 100644 --- a/opentelemetry-collector/manifests/rbac.yaml +++ b/opentelemetry-collector/manifests/rbac.yaml @@ -59,14 +59,6 @@ rules: - apiGroups: [""] resources: ["pods", "nodes", "services", "endpoints", "namespaces"] verbs: ["get", "watch", "list"] - # Secrets for TLS certificates referenced by ServiceMonitors - - apiGroups: [""] - resources: ["secrets", "configmaps"] - verbs: ["get", "watch", "list"] - # Events for status reporting - - apiGroups: [""] - resources: ["events"] - verbs: ["create", "patch"] - apiGroups: ["discovery.k8s.io"] resources: ["endpointslices"] verbs: ["get", "watch", "list"] diff --git a/prometheus/helm-values.yaml b/prometheus/helm-values.yaml index bba3aa4..144dedb 100644 --- a/prometheus/helm-values.yaml +++ b/prometheus/helm-values.yaml @@ -48,9 +48,8 @@ prometheus: # Enable remote write receiver for OTel Collector enableRemoteWriteReceiver: true - # Single replica due to cluster resource constraints - # Thanos provides HA query capability - replicas: 1 + # HA: 2 replicas on different worker nodes + replicas: 2 replicaExternalLabelName: prometheus_replica # Pod anti-affinity for HA @@ -68,10 +67,6 @@ prometheus: evaluationInterval: 60s # 30s → 60s retention: 3d # Local retention only (no S3 upload) - # Allow out-of-order samples from OTel collectors - tsdb: - outOfOrderTimeWindow: 5m - # Thanos Sidecar configuration (query only, no S3 upload) thanos: image: quay.io/thanos/thanos:v0.37.2 @@ -85,20 +80,15 @@ prometheus: resources: requests: cpu: 50m - memory: 1536Mi + memory: 768Mi limits: - memory: 1536Mi + memory: 768Mi - # ServiceMonitor selector - disable direct scraping (OTel handles it) - # Set to non-existent label to 
effectively disable + # ServiceMonitor selector - scrape all ServiceMonitors serviceMonitorSelectorNilUsesHelmValues: false - serviceMonitorSelector: - matchLabels: - prometheus-scrape: "direct" # No ServiceMonitors have this label + serviceMonitorSelector: {} podMonitorSelectorNilUsesHelmValues: false - podMonitorSelector: - matchLabels: - prometheus-scrape: "direct" # No PodMonitors have this label + podMonitorSelector: {} probeSelectorNilUsesHelmValues: false ruleSelector: {}