# Change note: set tsdb.outOfOrderTimeWindow to 5m so that slightly
# out-of-order samples from distributed collectors are accepted, preventing
# data loss caused by timing differences.
# File-viewer metadata: 181 lines, 4.8 KiB, YAML.
# Kube-Prometheus-Stack Helm Values
# Chart: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack
# Includes: Prometheus Operator, Prometheus, Alertmanager, Grafana, and various exporters

# Global settings
# Empty string: no explicit full-name override is supplied to the chart.
fullnameOverride: ""

# CRD management
crds:
  # Disabled because of the CRD annotation size problem
  # (the CRDs are already installed).
  enabled: false

# Prometheus Operator
prometheusOperator:
  enabled: true
  # Disable CRD creation.
  # NOTE(review): this looks like a key from the legacy prometheus-operator
  # chart; confirm the current chart still reads it (crds.enabled may be the
  # only switch that matters).
  createCustomResource: false

# Kubelet ServiceMonitor with cluster label.
# Each relabeling list below stamps a constant `cluster` label on the
# corresponding kubelet endpoint.
kubelet:
  enabled: true
  serviceMonitor:
    # cAdvisor metrics (e.g. container_memory_working_set_bytes)
    cAdvisorRelabelings:
      - targetLabel: cluster
        replacement: "mayne-cluster"
      # Copy the scrape path into a regular `metrics_path` label.
      - sourceLabels: [__metrics_path__]
        targetLabel: metrics_path
    # Resource metrics
    resourceRelabelings:
      - targetLabel: cluster
        replacement: "mayne-cluster"
    # Probes metrics
    probesRelabelings:
      - targetLabel: cluster
        replacement: "mayne-cluster"

# Prometheus
prometheus:
  enabled: true

  # Service and ServiceMonitor for the Thanos sidecar.
  # The sidecar is configured as query-only (see prometheusSpec.thanos below).
  thanosService:
    enabled: true
  thanosServiceMonitor:
    enabled: true

  prometheusSpec:
    # Enable remote write receiver for OTel Collector
    enableRemoteWriteReceiver: true

    # HA: 2 replicas on different worker nodes
    replicas: 2
    replicaExternalLabelName: prometheus_replica

    # Pod anti-affinity for HA.
    # "preferred" (soft) rule: the scheduler tries to place replicas on
    # different hostnames but may still co-locate them.
    affinity:
      podAntiAffinity:
        preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            podAffinityTerm:
              labelSelector:
                matchLabels:
                  app.kubernetes.io/name: prometheus
              topologyKey: kubernetes.io/hostname

    scrapeInterval: 60s  # 30s -> 60s (memory savings)
    evaluationInterval: 60s  # 30s -> 60s
    retention: 3d  # Local retention only (no S3 upload)

    # Allow out-of-order samples from OTel collectors
    tsdb:
      outOfOrderTimeWindow: 5m

    # Thanos Sidecar configuration (query only, no S3 upload)
    thanos:
      image: quay.io/thanos/thanos:v0.37.2
      # objectStorageConfig removed - no upload to MinIO, local storage only

    # Use emptyDir instead of PVC (data is treated as temporary).
    # NOTE(review): the original comment said "Thanos handles long-term", but
    # no objectStorageConfig is set above, so nothing appears to be uploaded;
    # confirm that losing data beyond `retention` or on pod restart is
    # acceptable.
    # NOTE(review): no retentionSize is set; confirm 3d of data fits within
    # the 5Gi sizeLimit.
    storageSpec:
      emptyDir:
        sizeLimit: 5Gi

    # Memory request equals the memory limit; no CPU limit is set.
    resources:
      requests:
        cpu: 50m
        memory: 768Mi
      limits:
        memory: 768Mi

    # ServiceMonitor selector - disable direct scraping (OTel handles it)
    # Set to non-existent label to effectively disable
    serviceMonitorSelectorNilUsesHelmValues: false
    serviceMonitorSelector:
      matchLabels:
        prometheus-scrape: "direct"  # No ServiceMonitors have this label
    podMonitorSelectorNilUsesHelmValues: false
    podMonitorSelector:
      matchLabels:
        prometheus-scrape: "direct"  # No PodMonitors have this label
    probeSelectorNilUsesHelmValues: false
    ruleSelector: {}

    # Alertmanager settings: send alerts to the `alertmanager` service in the
    # `alertmanager` namespace (the bundled Alertmanager is disabled in this
    # file).
    alertingEndpoints:
      - name: alertmanager
        namespace: alertmanager
        port: http
        scheme: http

    # External labels added to all metrics
    externalLabels:
      cluster: "mayne-cluster"

    # additionalScrapeConfigs removed - OTel handles scraping now
    # Targets moved to OTel prometheus receiver kubernetes-pods job
    additionalScrapeConfigs: []

# Disable API server metrics collection (memory savings: ~37k series)
kubeApiServer:
  enabled: false

# Disable etcd metrics collection (memory savings: ~26k series)
kubeEtcd:
  enabled: false

# Alertmanager (use the existing Alertmanager instead of the bundled one)
alertmanager:
  enabled: false

# Grafana (use the existing Grafana instead of the bundled one)
grafana:
  enabled: false

# Node Exporter (use the existing node-exporter instead of the bundled one)
nodeExporter:
  enabled: false

# Kube State Metrics (use the existing kube-state-metrics)
# The separately deployed kube-state-metrics creates its own ServiceMonitor.
kubeStateMetrics:
  enabled: false

# Default rules shipped with the chart.
# `create` turns the default rules on, `disabled` switches off individual
# alerts by name, and `rules` toggles whole rule groups.
defaultRules:
  create: true
  # Individual alert rules to disable
  disabled:
    KubeCPUOvercommit: true
    KubeMemoryOvercommit: true
    PrometheusDuplicateTimestamps: true
  # NOTE(review): the kubeApiserver* groups stay enabled although
  # kubeApiServer scraping is disabled in this file; confirm API server
  # metrics still arrive by another path (e.g. OTel), otherwise these groups
  # evaluate on no data.
  rules:
    alertmanager: true
    etcd: false
    configReloaders: true
    general: true
    k8s: true
    kubeApiserverAvailability: true
    kubeApiserverBurnrate: true
    kubeApiserverHistogram: true
    kubeApiserverSlos: true
    kubeControllerManager: false
    kubelet: true
    kubeProxy: false
    kubePrometheusGeneral: true
    kubePrometheusNodeRecording: true
    kubernetesApps: true
    kubernetesResources: true
    kubernetesStorage: true
    kubernetesSystem: true
    kubeSchedulerAlerting: false
    kubeSchedulerRecording: false
    kubeStateMetrics: true
    network: true
    node: true
    nodeExporterAlerting: true
    nodeExporterRecording: true
    prometheus: true
    prometheusOperator: true