REVERT(otel): remove metrics collection, keep logs/traces only

- Revert to simpler architecture where Prometheus scrapes metrics directly via ServiceMonitors
- OTel Collector only handles logs (filelog) and traces (otlp)
- Remove Target Allocator and metrics-related config
- This reduces complexity and resource usage for the home cluster
2026-01-10 00:33:10 +09:00
parent a506ca3f58
commit 9e87e6fbcb
5 changed files with 43 additions and 178 deletions
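
For context on the first bullet: with direct scraping, Prometheus discovers targets through ServiceMonitor objects instead of receiving remote-written samples from the collector. A minimal ServiceMonitor sketch (illustrative names, not part of this commit):

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: example-app        # hypothetical name
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app: example-app     # must match the target Service's labels
  endpoints:
    - port: metrics        # named port on the Service exposing /metrics
      interval: 60s
      path: /metrics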


@@ -1,147 +0,0 @@
# OpenTelemetry Collector for Metrics
# Deployment mode with Target Allocator (consistent-hashing)
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-metrics
  namespace: opentelemetry
spec:
  mode: statefulset
  replicas: 2
  image: otel/opentelemetry-collector-contrib:0.113.0
  serviceAccount: otel-collector
  # Target Allocator - distributes scrape targets across collector replicas
  targetAllocator:
    enabled: true
    serviceAccount: otel-collector-targetallocator
    image: ghcr.io/open-telemetry/opentelemetry-operator/target-allocator:0.113.0
    allocationStrategy: consistent-hashing
    filterStrategy: relabel-config
    prometheusCR:
      enabled: true
      serviceMonitorSelector: {}
      podMonitorSelector: {}
      scrapeInterval: 30s
    resources:
      requests:
        cpu: 10m
        memory: 64Mi
      limits:
        memory: 128Mi
  resources:
    requests:
      cpu: 50m
      memory: 512Mi
    limits:
      memory: 1Gi
  ports:
    - name: otlp-grpc
      port: 4317
      protocol: TCP
      targetPort: 4317
    - name: otlp-http
      port: 4318
      protocol: TCP
      targetPort: 4318
    - name: metrics
      port: 8888
      protocol: TCP
      targetPort: 8888
  env:
    - name: K8S_NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
    - name: K8S_POD_NAME
      valueFrom:
        fieldRef:
          fieldPath: metadata.name
    - name: K8S_POD_IP
      valueFrom:
        fieldRef:
          fieldPath: status.podIP
  config:
    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318
      # Prometheus receiver with Target Allocator
      prometheus:
        config:
          global:
            scrape_interval: 60s
          scrape_configs:
            - job_name: otel-metrics-self
              scrape_interval: 60s
              static_configs:
                - targets: ['${env:K8S_POD_IP}:8888']
        target_allocator:
          endpoint: http://otel-metrics-targetallocator:80
          interval: 30s
          collector_id: ${env:K8S_POD_NAME}
    processors:
      batch:
        timeout: 10s
        send_batch_size: 1024
        send_batch_max_size: 2048
      memory_limiter:
        check_interval: 5s
        limit_mib: 400
        spike_limit_mib: 100
      k8sattributes:
        extract:
          metadata:
            - k8s.namespace.name
            - k8s.deployment.name
            - k8s.pod.name
            - k8s.node.name
        passthrough: false
        pod_association:
          - sources:
              - from: resource_attribute
                name: k8s.pod.ip
          - sources:
              - from: resource_attribute
                name: k8s.pod.uid
          - sources:
              - from: connection
      resourcedetection:
        detectors: [env, system]
        timeout: 5s
        override: false
    exporters:
      prometheusremotewrite:
        endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write
        tls:
          insecure: true
        external_labels:
          otel_collector: ${env:K8S_POD_NAME}
      debug:
        verbosity: basic
    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
    service:
      extensions: [health_check]
      pipelines:
        metrics:
          receivers: [otlp, prometheus]
          processors: [memory_limiter, k8sattributes, resourcedetection, batch]
          exporters: [prometheusremotewrite]
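
Note the contract implied by the prometheusremotewrite exporter above: Prometheus must accept writes on /api/v1/write. Both halves appear in this commit; side by side (the prometheusSpec nesting is assumed from the standard kube-prometheus-stack layout):

# Collector side (from the config above)
exporters:
  prometheusremotewrite:
    endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write

# Prometheus side (from the chart values below)
prometheus:
  prometheusSpec:
    enableRemoteWriteReceiver: true   # enables the /api/v1/write endpoint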


@@ -1,15 +1,25 @@
# OpenTelemetry Collector for Logs and Traces
# DaemonSet mode - runs on every node for log collection
# OpenTelemetry Collector with Target Allocator
# Managed by OpenTelemetry Operator
#
# Architecture:
# - DaemonSet mode: one collector per node for log collection
# - Target Allocator: distributes scrape targets across collectors
# - Filelog receiver for container logs
# - Prometheus receiver with Target Allocator for metrics
# - Exports to: Tempo (traces), Prometheus (metrics), Loki (logs)
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-logs
  name: otel-collector
  namespace: opentelemetry
spec:
  mode: daemonset
  image: otel/opentelemetry-collector-contrib:0.113.0
  serviceAccount: otel-collector
  # Target Allocator disabled - metrics collected by Prometheus directly
  # OTel handles logs (filelog) and traces (otlp) only
  resources:
    requests:
      cpu: 50m
@@ -80,7 +90,7 @@ spec:
        include:
          - /var/log/pods/*/*/*.log
        exclude:
          - /var/log/pods/opentelemetry_otel-*/*/*.log
          - /var/log/pods/opentelemetry_otel-collector*/*/*.log
        start_at: end
        include_file_path: true
        include_file_name: false
@@ -133,6 +143,15 @@ spec:
            from: attributes.log
            to: body
      # Prometheus receiver - self metrics only
      prometheus:
        config:
          scrape_configs:
            - job_name: otel-collector
              scrape_interval: 60s
              static_configs:
                - targets: ['${env:K8S_POD_IP}:8888']
    processors:
      batch:
        timeout: 10s
@@ -173,6 +192,13 @@ spec:
        tls:
          insecure: true
      prometheusremotewrite:
        endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write
        tls:
          insecure: true
        external_labels:
          otel_collector: ${env:K8S_POD_NAME}
      loki:
        endpoint: http://loki.loki.svc.cluster.local:3100/loki/api/v1/push
        default_labels_enabled:
@@ -194,6 +220,11 @@ spec:
          processors: [memory_limiter, k8sattributes, resourcedetection, batch]
          exporters: [otlp/tempo]
        metrics:
          receivers: [otlp, prometheus]
          processors: [memory_limiter, k8sattributes, resourcedetection, batch]
          exporters: [prometheusremotewrite]
        logs:
          receivers: [otlp, filelog]
          processors: [memory_limiter, k8sattributes, resourcedetection, batch]
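
Because the collector runs as a DaemonSet, every node gets a local instance, so a common pattern is to point each workload at the collector on its own node. A sketch (hypothetical workload snippet, assuming the collector's OTLP/HTTP port 4318 is reachable on the node, e.g. via hostPort; not part of this commit):

env:
  - name: NODE_IP
    valueFrom:
      fieldRef:
        fieldPath: status.hostIP
  - name: OTEL_EXPORTER_OTLP_ENDPOINT   # standard OTel SDK endpoint variable
    value: http://$(NODE_IP):4318       # OTLP/HTTP port from the config above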


@@ -3,5 +3,4 @@ kind: Kustomization
resources:
- rbac.yaml
- collector-logs.yaml
- collector-metrics.yaml
- collector.yaml


@@ -59,14 +59,6 @@ rules:
- apiGroups: [""]
  resources: ["pods", "nodes", "services", "endpoints", "namespaces"]
  verbs: ["get", "watch", "list"]
# Secrets for TLS certificates referenced by ServiceMonitors
- apiGroups: [""]
  resources: ["secrets", "configmaps"]
  verbs: ["get", "watch", "list"]
# Events for status reporting
- apiGroups: [""]
  resources: ["events"]
  verbs: ["create", "patch"]
- apiGroups: ["discovery.k8s.io"]
  resources: ["endpointslices"]
  verbs: ["get", "watch", "list"]


@@ -48,9 +48,8 @@ prometheus:
    # Enable remote write receiver for OTel Collector
    enableRemoteWriteReceiver: true
    # Single replica due to cluster resource constraints
    # Thanos provides HA query capability
    replicas: 1
    # HA: 2 replicas on different worker nodes
    replicas: 2
    replicaExternalLabelName: prometheus_replica
    # Pod anti-affinity for HA
@@ -68,10 +67,6 @@ prometheus:
    evaluationInterval: 60s  # 30s → 60s
    retention: 3d  # Local retention only (no S3 upload)
    # Allow out-of-order samples from OTel collectors
    tsdb:
      outOfOrderTimeWindow: 5m
    # Thanos Sidecar configuration (query only, no S3 upload)
    thanos:
      image: quay.io/thanos/thanos:v0.37.2
@@ -85,20 +80,15 @@ prometheus:
    resources:
      requests:
        cpu: 50m
        memory: 1536Mi
        memory: 768Mi
      limits:
        memory: 1536Mi
        memory: 768Mi
    # ServiceMonitor selector - disable direct scraping (OTel handles it)
    # Set to non-existent label to effectively disable
    # ServiceMonitor selector - scrape all ServiceMonitors
    serviceMonitorSelectorNilUsesHelmValues: false
    serviceMonitorSelector:
      matchLabels:
        prometheus-scrape: "direct" # No ServiceMonitors have this label
    serviceMonitorSelector: {}
    podMonitorSelectorNilUsesHelmValues: false
    podMonitorSelector:
      matchLabels:
        prometheus-scrape: "direct" # No PodMonitors have this label
    podMonitorSelector: {}
    probeSelectorNilUsesHelmValues: false
    ruleSelector: {}
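
The values above reference pod anti-affinity for HA without showing the block itself. A sketch of one way kube-prometheus-stack expresses the "2 replicas on different worker nodes" constraint (using the chart's podAntiAffinity helper values; assumed here, not taken from this commit):

prometheus:
  prometheusSpec:
    podAntiAffinity: "hard"                             # never co-schedule replicas
    podAntiAffinityTopologyKey: kubernetes.io/hostname  # spread across nodes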