REVERT(otel): remove metrics collection, keep logs/traces only

- Revert to simpler architecture where Prometheus scrapes metrics directly via ServiceMonitors
- OTel Collector only handles logs (filelog) and traces (otlp)
- Remove Target Allocator and metrics-related config
- This reduces complexity and resource usage for the home cluster
2026-01-10 00:33:10 +09:00
parent a506ca3f58
commit 9e87e6fbcb
5 changed files with 43 additions and 178 deletions
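
For context on the first bullet: with direct scraping, Prometheus discovers targets through ServiceMonitor objects instead of receiving remote-written samples from the collector. A minimal ServiceMonitor sketch (illustrative names, not part of this commit):

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: example-app        # hypothetical name
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app: example-app     # must match the target Service's labels
  endpoints:
    - port: metrics        # named port on the Service exposing /metrics
      interval: 60s
      path: /metrics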


@@ -1,147 +0,0 @@
# OpenTelemetry Collector for Metrics
# Deployment mode with Target Allocator (consistent-hashing)
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-metrics
  namespace: opentelemetry
spec:
  mode: statefulset
  replicas: 2
  image: otel/opentelemetry-collector-contrib:0.113.0
  serviceAccount: otel-collector
  # Target Allocator - distributes scrape targets across collector replicas
  targetAllocator:
    enabled: true
    serviceAccount: otel-collector-targetallocator
    image: ghcr.io/open-telemetry/opentelemetry-operator/target-allocator:0.113.0
    allocationStrategy: consistent-hashing
    filterStrategy: relabel-config
    prometheusCR:
      enabled: true
      serviceMonitorSelector: {}
      podMonitorSelector: {}
      scrapeInterval: 30s
    resources:
      requests:
        cpu: 10m
        memory: 64Mi
      limits:
        memory: 128Mi
  resources:
    requests:
      cpu: 50m
      memory: 512Mi
    limits:
      memory: 1Gi
  ports:
    - name: otlp-grpc
      port: 4317
      protocol: TCP
      targetPort: 4317
    - name: otlp-http
      port: 4318
      protocol: TCP
      targetPort: 4318
    - name: metrics
      port: 8888
      protocol: TCP
      targetPort: 8888
  env:
    - name: K8S_NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
    - name: K8S_POD_NAME
      valueFrom:
        fieldRef:
          fieldPath: metadata.name
    - name: K8S_POD_IP
      valueFrom:
        fieldRef:
          fieldPath: status.podIP
  config:
    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318
      # Prometheus receiver with Target Allocator
      prometheus:
        config:
          global:
            scrape_interval: 60s
          scrape_configs:
            - job_name: otel-metrics-self
              scrape_interval: 60s
              static_configs:
                - targets: ['${env:K8S_POD_IP}:8888']
        target_allocator:
          endpoint: http://otel-metrics-targetallocator:80
          interval: 30s
          collector_id: ${env:K8S_POD_NAME}
    processors:
      batch:
        timeout: 10s
        send_batch_size: 1024
        send_batch_max_size: 2048
      memory_limiter:
        check_interval: 5s
        limit_mib: 400
        spike_limit_mib: 100
      k8sattributes:
        extract:
          metadata:
            - k8s.namespace.name
            - k8s.deployment.name
            - k8s.pod.name
            - k8s.node.name
        passthrough: false
        pod_association:
          - sources:
              - from: resource_attribute
                name: k8s.pod.ip
          - sources:
              - from: resource_attribute
                name: k8s.pod.uid
          - sources:
              - from: connection
      resourcedetection:
        detectors: [env, system]
        timeout: 5s
        override: false
    exporters:
      prometheusremotewrite:
        endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write
        tls:
          insecure: true
        external_labels:
          otel_collector: ${env:K8S_POD_NAME}
      debug:
        verbosity: basic
    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
    service:
      extensions: [health_check]
      pipelines:
        metrics:
          receivers: [otlp, prometheus]
          processors: [memory_limiter, k8sattributes, resourcedetection, batch]
          exporters: [prometheusremotewrite]
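
Note the contract implied by the prometheusremotewrite exporter above: Prometheus must accept writes on /api/v1/write. Both halves appear in this commit; side by side (the prometheusSpec nesting is assumed from the standard kube-prometheus-stack layout):

# Collector side (from the config above)
exporters:
  prometheusremotewrite:
    endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write

# Prometheus side (from the chart values below)
prometheus:
  prometheusSpec:
    enableRemoteWriteReceiver: true   # enables the /api/v1/write endpoint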


@@ -1,15 +1,25 @@
# OpenTelemetry Collector for Logs and Traces
# DaemonSet mode - runs on every node for log collection
# OpenTelemetry Collector with Target Allocator
# Managed by OpenTelemetry Operator
#
# Architecture:
# - DaemonSet mode: one collector per node for log collection
# - Target Allocator: distributes scrape targets across collectors
# - Filelog receiver for container logs
# - Prometheus receiver with Target Allocator for metrics
# - Exports to: Tempo (traces), Prometheus (metrics), Loki (logs)
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-logs
  name: otel-collector
  namespace: opentelemetry
spec:
  mode: daemonset
  image: otel/opentelemetry-collector-contrib:0.113.0
  serviceAccount: otel-collector
  # Target Allocator disabled - metrics collected by Prometheus directly
  # OTel handles logs (filelog) and traces (otlp) only
  resources:
    requests:
      cpu: 50m
@@ -80,7 +90,7 @@ spec:
        include:
          - /var/log/pods/*/*/*.log
        exclude:
          - /var/log/pods/opentelemetry_otel-*/*/*.log
          - /var/log/pods/opentelemetry_otel-collector*/*/*.log
        start_at: end
        include_file_path: true
        include_file_name: false
@@ -133,6 +143,15 @@ spec:
            from: attributes.log
            to: body
      # Prometheus receiver - self metrics only
      prometheus:
        config:
          scrape_configs:
            - job_name: otel-collector
              scrape_interval: 60s
              static_configs:
                - targets: ['${env:K8S_POD_IP}:8888']
    processors:
      batch:
        timeout: 10s
@@ -173,6 +192,13 @@ spec:
        tls:
          insecure: true
      prometheusremotewrite:
        endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write
        tls:
          insecure: true
        external_labels:
          otel_collector: ${env:K8S_POD_NAME}
      loki:
        endpoint: http://loki.loki.svc.cluster.local:3100/loki/api/v1/push
        default_labels_enabled:
@@ -194,6 +220,11 @@ spec:
          processors: [memory_limiter, k8sattributes, resourcedetection, batch]
          exporters: [otlp/tempo]
        metrics:
          receivers: [otlp, prometheus]
          processors: [memory_limiter, k8sattributes, resourcedetection, batch]
          exporters: [prometheusremotewrite]
        logs:
          receivers: [otlp, filelog]
          processors: [memory_limiter, k8sattributes, resourcedetection, batch]
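
Because the collector runs as a DaemonSet, every node gets a local instance, so a common pattern is to point each workload at the collector on its own node. A sketch (hypothetical workload snippet, assuming the collector's OTLP/HTTP port 4318 is reachable on the node, e.g. via hostPort; not part of this commit):

env:
  - name: NODE_IP
    valueFrom:
      fieldRef:
        fieldPath: status.hostIP
  - name: OTEL_EXPORTER_OTLP_ENDPOINT   # standard OTel SDK endpoint variable
    value: http://$(NODE_IP):4318       # OTLP/HTTP port from the config above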


@@ -3,5 +3,4 @@ kind: Kustomization
resources:
- rbac.yaml
- collector-logs.yaml
- collector-metrics.yaml
- collector.yaml


@@ -59,14 +59,6 @@ rules:
- apiGroups: [""]
  resources: ["pods", "nodes", "services", "endpoints", "namespaces"]
  verbs: ["get", "watch", "list"]
# Secrets for TLS certificates referenced by ServiceMonitors
- apiGroups: [""]
  resources: ["secrets", "configmaps"]
  verbs: ["get", "watch", "list"]
# Events for status reporting
- apiGroups: [""]
  resources: ["events"]
  verbs: ["create", "patch"]
- apiGroups: ["discovery.k8s.io"]
  resources: ["endpointslices"]
  verbs: ["get", "watch", "list"]


@@ -48,9 +48,8 @@ prometheus:
    # Enable remote write receiver for OTel Collector
    enableRemoteWriteReceiver: true
    # Single replica due to cluster resource constraints
    # Thanos provides HA query capability
    replicas: 1
    # HA: 2 replicas on different worker nodes
    replicas: 2
    replicaExternalLabelName: prometheus_replica
    # Pod anti-affinity for HA
@@ -68,10 +67,6 @@ prometheus:
    evaluationInterval: 60s  # 30s → 60s
    retention: 3d  # Local retention only (no S3 upload)
    # Allow out-of-order samples from OTel collectors
    tsdb:
      outOfOrderTimeWindow: 5m
    # Thanos Sidecar configuration (query only, no S3 upload)
    thanos:
      image: quay.io/thanos/thanos:v0.37.2
@@ -85,20 +80,15 @@ prometheus:
    resources:
      requests:
        cpu: 50m
        memory: 1536Mi
        memory: 768Mi
      limits:
        memory: 1536Mi
        memory: 768Mi
    # ServiceMonitor selector - disable direct scraping (OTel handles it)
    # Set to non-existent label to effectively disable
    # ServiceMonitor selector - scrape all ServiceMonitors
    serviceMonitorSelectorNilUsesHelmValues: false
    serviceMonitorSelector:
      matchLabels:
        prometheus-scrape: "direct" # No ServiceMonitors have this label
    serviceMonitorSelector: {}
    podMonitorSelectorNilUsesHelmValues: false
    podMonitorSelector:
      matchLabels:
        prometheus-scrape: "direct" # No PodMonitors have this label
    podMonitorSelector: {}
    probeSelectorNilUsesHelmValues: false
    ruleSelector: {}
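
The values above reference pod anti-affinity for HA without showing the block itself. A sketch of one way kube-prometheus-stack expresses the "2 replicas on different worker nodes" constraint (using the chart's podAntiAffinity helper values; assumed here, not taken from this commit):

prometheus:
  prometheusSpec:
    podAntiAffinity: "hard"                             # never co-schedule replicas
    podAntiAffinityTopologyKey: kubernetes.io/hostname  # spread across nodes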