REVERT(otel): remove metrics collection, keep logs/traces only
- Revert to a simpler architecture where Prometheus scrapes metrics directly via ServiceMonitors
- OTel Collector only handles logs (filelog) and traces (otlp)
- Remove Target Allocator and metrics-related config
- This reduces complexity and resource usage for the home cluster
This commit is contained in:
@@ -1,147 +0,0 @@
|
|||||||
# OpenTelemetry Collector for Metrics
|
|
||||||
# Deployment mode with Target Allocator (consistent-hashing)
|
|
||||||
apiVersion: opentelemetry.io/v1beta1
|
|
||||||
kind: OpenTelemetryCollector
|
|
||||||
metadata:
|
|
||||||
name: otel-metrics
|
|
||||||
namespace: opentelemetry
|
|
||||||
spec:
|
|
||||||
mode: statefulset
|
|
||||||
replicas: 2
|
|
||||||
image: otel/opentelemetry-collector-contrib:0.113.0
|
|
||||||
serviceAccount: otel-collector
|
|
||||||
|
|
||||||
# Target Allocator - distributes scrape targets across collector replicas
|
|
||||||
targetAllocator:
|
|
||||||
enabled: true
|
|
||||||
serviceAccount: otel-collector-targetallocator
|
|
||||||
image: ghcr.io/open-telemetry/opentelemetry-operator/target-allocator:0.113.0
|
|
||||||
allocationStrategy: consistent-hashing
|
|
||||||
filterStrategy: relabel-config
|
|
||||||
prometheusCR:
|
|
||||||
enabled: true
|
|
||||||
serviceMonitorSelector: {}
|
|
||||||
podMonitorSelector: {}
|
|
||||||
scrapeInterval: 30s
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 10m
|
|
||||||
memory: 64Mi
|
|
||||||
limits:
|
|
||||||
memory: 128Mi
|
|
||||||
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 50m
|
|
||||||
memory: 512Mi
|
|
||||||
limits:
|
|
||||||
memory: 1Gi
|
|
||||||
|
|
||||||
ports:
|
|
||||||
- name: otlp-grpc
|
|
||||||
port: 4317
|
|
||||||
protocol: TCP
|
|
||||||
targetPort: 4317
|
|
||||||
- name: otlp-http
|
|
||||||
port: 4318
|
|
||||||
protocol: TCP
|
|
||||||
targetPort: 4318
|
|
||||||
- name: metrics
|
|
||||||
port: 8888
|
|
||||||
protocol: TCP
|
|
||||||
targetPort: 8888
|
|
||||||
|
|
||||||
env:
|
|
||||||
- name: K8S_NODE_NAME
|
|
||||||
valueFrom:
|
|
||||||
fieldRef:
|
|
||||||
fieldPath: spec.nodeName
|
|
||||||
- name: K8S_POD_NAME
|
|
||||||
valueFrom:
|
|
||||||
fieldRef:
|
|
||||||
fieldPath: metadata.name
|
|
||||||
- name: K8S_POD_IP
|
|
||||||
valueFrom:
|
|
||||||
fieldRef:
|
|
||||||
fieldPath: status.podIP
|
|
||||||
|
|
||||||
config:
|
|
||||||
receivers:
|
|
||||||
otlp:
|
|
||||||
protocols:
|
|
||||||
grpc:
|
|
||||||
endpoint: 0.0.0.0:4317
|
|
||||||
http:
|
|
||||||
endpoint: 0.0.0.0:4318
|
|
||||||
|
|
||||||
# Prometheus receiver with Target Allocator
|
|
||||||
prometheus:
|
|
||||||
config:
|
|
||||||
global:
|
|
||||||
scrape_interval: 60s
|
|
||||||
scrape_configs:
|
|
||||||
- job_name: otel-metrics-self
|
|
||||||
scrape_interval: 60s
|
|
||||||
static_configs:
|
|
||||||
- targets: ['${env:K8S_POD_IP}:8888']
|
|
||||||
target_allocator:
|
|
||||||
endpoint: http://otel-metrics-targetallocator:80
|
|
||||||
interval: 30s
|
|
||||||
collector_id: ${env:K8S_POD_NAME}
|
|
||||||
|
|
||||||
processors:
|
|
||||||
batch:
|
|
||||||
timeout: 10s
|
|
||||||
send_batch_size: 1024
|
|
||||||
send_batch_max_size: 2048
|
|
||||||
|
|
||||||
memory_limiter:
|
|
||||||
check_interval: 5s
|
|
||||||
limit_mib: 400
|
|
||||||
spike_limit_mib: 100
|
|
||||||
|
|
||||||
k8sattributes:
|
|
||||||
extract:
|
|
||||||
metadata:
|
|
||||||
- k8s.namespace.name
|
|
||||||
- k8s.deployment.name
|
|
||||||
- k8s.pod.name
|
|
||||||
- k8s.node.name
|
|
||||||
passthrough: false
|
|
||||||
pod_association:
|
|
||||||
- sources:
|
|
||||||
- from: resource_attribute
|
|
||||||
name: k8s.pod.ip
|
|
||||||
- sources:
|
|
||||||
- from: resource_attribute
|
|
||||||
name: k8s.pod.uid
|
|
||||||
- sources:
|
|
||||||
- from: connection
|
|
||||||
|
|
||||||
resourcedetection:
|
|
||||||
detectors: [env, system]
|
|
||||||
timeout: 5s
|
|
||||||
override: false
|
|
||||||
|
|
||||||
exporters:
|
|
||||||
prometheusremotewrite:
|
|
||||||
endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write
|
|
||||||
tls:
|
|
||||||
insecure: true
|
|
||||||
external_labels:
|
|
||||||
otel_collector: ${env:K8S_POD_NAME}
|
|
||||||
|
|
||||||
debug:
|
|
||||||
verbosity: basic
|
|
||||||
|
|
||||||
extensions:
|
|
||||||
health_check:
|
|
||||||
endpoint: 0.0.0.0:13133
|
|
||||||
|
|
||||||
service:
|
|
||||||
extensions: [health_check]
|
|
||||||
pipelines:
|
|
||||||
metrics:
|
|
||||||
receivers: [otlp, prometheus]
|
|
||||||
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
|
|
||||||
exporters: [prometheusremotewrite]
|
|
||||||
@@ -1,15 +1,25 @@
|
|||||||
# OpenTelemetry Collector for Logs and Traces
|
# OpenTelemetry Collector for logs and traces (Target Allocator disabled)
|
||||||
# DaemonSet mode - runs on every node for log collection
|
# Managed by OpenTelemetry Operator
|
||||||
|
#
|
||||||
|
# Architecture:
|
||||||
|
# - DaemonSet mode: one collector per node for log collection
|
||||||
|
# - Target Allocator: disabled; Prometheus scrapes targets directly via ServiceMonitors
|
||||||
|
# - Filelog receiver for container logs
|
||||||
|
# - Prometheus receiver for the collector's own metrics only (static self-scrape)
|
||||||
|
# - Exports to: Tempo (traces), Prometheus (metrics), Loki (logs)
|
||||||
apiVersion: opentelemetry.io/v1beta1
|
apiVersion: opentelemetry.io/v1beta1
|
||||||
kind: OpenTelemetryCollector
|
kind: OpenTelemetryCollector
|
||||||
metadata:
|
metadata:
|
||||||
name: otel-logs
|
name: otel-collector
|
||||||
namespace: opentelemetry
|
namespace: opentelemetry
|
||||||
spec:
|
spec:
|
||||||
mode: daemonset
|
mode: daemonset
|
||||||
image: otel/opentelemetry-collector-contrib:0.113.0
|
image: otel/opentelemetry-collector-contrib:0.113.0
|
||||||
serviceAccount: otel-collector
|
serviceAccount: otel-collector
|
||||||
|
|
||||||
|
# Target Allocator disabled - metrics collected by Prometheus directly
|
||||||
|
# OTel handles logs (filelog) and traces (otlp) only
|
||||||
|
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 50m
|
cpu: 50m
|
||||||
@@ -80,7 +90,7 @@ spec:
|
|||||||
include:
|
include:
|
||||||
- /var/log/pods/*/*/*.log
|
- /var/log/pods/*/*/*.log
|
||||||
exclude:
|
exclude:
|
||||||
- /var/log/pods/opentelemetry_otel-*/*/*.log
|
- /var/log/pods/opentelemetry_otel-collector*/*/*.log
|
||||||
start_at: end
|
start_at: end
|
||||||
include_file_path: true
|
include_file_path: true
|
||||||
include_file_name: false
|
include_file_name: false
|
||||||
@@ -133,6 +143,15 @@ spec:
|
|||||||
from: attributes.log
|
from: attributes.log
|
||||||
to: body
|
to: body
|
||||||
|
|
||||||
|
# Prometheus receiver - self metrics only
|
||||||
|
prometheus:
|
||||||
|
config:
|
||||||
|
scrape_configs:
|
||||||
|
- job_name: otel-collector
|
||||||
|
scrape_interval: 60s
|
||||||
|
static_configs:
|
||||||
|
- targets: ['${env:K8S_POD_IP}:8888']
|
||||||
|
|
||||||
processors:
|
processors:
|
||||||
batch:
|
batch:
|
||||||
timeout: 10s
|
timeout: 10s
|
||||||
@@ -173,6 +192,13 @@ spec:
|
|||||||
tls:
|
tls:
|
||||||
insecure: true
|
insecure: true
|
||||||
|
|
||||||
|
prometheusremotewrite:
|
||||||
|
endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write
|
||||||
|
tls:
|
||||||
|
insecure: true
|
||||||
|
external_labels:
|
||||||
|
otel_collector: ${env:K8S_POD_NAME}
|
||||||
|
|
||||||
loki:
|
loki:
|
||||||
endpoint: http://loki.loki.svc.cluster.local:3100/loki/api/v1/push
|
endpoint: http://loki.loki.svc.cluster.local:3100/loki/api/v1/push
|
||||||
default_labels_enabled:
|
default_labels_enabled:
|
||||||
@@ -194,6 +220,11 @@ spec:
|
|||||||
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
|
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
|
||||||
exporters: [otlp/tempo]
|
exporters: [otlp/tempo]
|
||||||
|
|
||||||
|
metrics:
|
||||||
|
receivers: [otlp, prometheus]
|
||||||
|
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
|
||||||
|
exporters: [prometheusremotewrite]
|
||||||
|
|
||||||
logs:
|
logs:
|
||||||
receivers: [otlp, filelog]
|
receivers: [otlp, filelog]
|
||||||
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
|
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
|
||||||
@@ -3,5 +3,4 @@ kind: Kustomization
|
|||||||
|
|
||||||
resources:
|
resources:
|
||||||
- rbac.yaml
|
- rbac.yaml
|
||||||
- collector-logs.yaml
|
- collector.yaml
|
||||||
- collector-metrics.yaml
|
|
||||||
|
|||||||
@@ -59,14 +59,6 @@ rules:
|
|||||||
- apiGroups: [""]
|
- apiGroups: [""]
|
||||||
resources: ["pods", "nodes", "services", "endpoints", "namespaces"]
|
resources: ["pods", "nodes", "services", "endpoints", "namespaces"]
|
||||||
verbs: ["get", "watch", "list"]
|
verbs: ["get", "watch", "list"]
|
||||||
# Secrets for TLS certificates referenced by ServiceMonitors
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources: ["secrets", "configmaps"]
|
|
||||||
verbs: ["get", "watch", "list"]
|
|
||||||
# Events for status reporting
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources: ["events"]
|
|
||||||
verbs: ["create", "patch"]
|
|
||||||
- apiGroups: ["discovery.k8s.io"]
|
- apiGroups: ["discovery.k8s.io"]
|
||||||
resources: ["endpointslices"]
|
resources: ["endpointslices"]
|
||||||
verbs: ["get", "watch", "list"]
|
verbs: ["get", "watch", "list"]
|
||||||
|
|||||||
@@ -48,9 +48,8 @@ prometheus:
|
|||||||
# Enable remote write receiver for OTel Collector
|
# Enable remote write receiver for OTel Collector
|
||||||
enableRemoteWriteReceiver: true
|
enableRemoteWriteReceiver: true
|
||||||
|
|
||||||
# Single replica due to cluster resource constraints
|
# HA: 2 replicas on different worker nodes
|
||||||
# Thanos provides HA query capability
|
replicas: 2
|
||||||
replicas: 1
|
|
||||||
replicaExternalLabelName: prometheus_replica
|
replicaExternalLabelName: prometheus_replica
|
||||||
|
|
||||||
# Pod anti-affinity for HA
|
# Pod anti-affinity for HA
|
||||||
@@ -68,10 +67,6 @@ prometheus:
|
|||||||
evaluationInterval: 60s # 30s → 60s
|
evaluationInterval: 60s # 30s → 60s
|
||||||
retention: 3d # Local retention only (no S3 upload)
|
retention: 3d # Local retention only (no S3 upload)
|
||||||
|
|
||||||
# Allow out-of-order samples from OTel collectors
|
|
||||||
tsdb:
|
|
||||||
outOfOrderTimeWindow: 5m
|
|
||||||
|
|
||||||
# Thanos Sidecar configuration (query only, no S3 upload)
|
# Thanos Sidecar configuration (query only, no S3 upload)
|
||||||
thanos:
|
thanos:
|
||||||
image: quay.io/thanos/thanos:v0.37.2
|
image: quay.io/thanos/thanos:v0.37.2
|
||||||
@@ -85,20 +80,15 @@ prometheus:
|
|||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 50m
|
cpu: 50m
|
||||||
memory: 1536Mi
|
memory: 768Mi
|
||||||
limits:
|
limits:
|
||||||
memory: 1536Mi
|
memory: 768Mi
|
||||||
|
|
||||||
# ServiceMonitor selector - disable direct scraping (OTel handles it)
|
# ServiceMonitor selector - scrape all ServiceMonitors
|
||||||
# Set to non-existent label to effectively disable
|
|
||||||
serviceMonitorSelectorNilUsesHelmValues: false
|
serviceMonitorSelectorNilUsesHelmValues: false
|
||||||
serviceMonitorSelector:
|
serviceMonitorSelector: {}
|
||||||
matchLabels:
|
|
||||||
prometheus-scrape: "direct" # No ServiceMonitors have this label
|
|
||||||
podMonitorSelectorNilUsesHelmValues: false
|
podMonitorSelectorNilUsesHelmValues: false
|
||||||
podMonitorSelector:
|
podMonitorSelector: {}
|
||||||
matchLabels:
|
|
||||||
prometheus-scrape: "direct" # No PodMonitors have this label
|
|
||||||
probeSelectorNilUsesHelmValues: false
|
probeSelectorNilUsesHelmValues: false
|
||||||
ruleSelector: {}
|
ruleSelector: {}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user