Files
observability/prometheus/helm-values.yaml
Mayne0213 ea4d7d4ecf PERF(prometheus): reduce CPU request from 200m to 50m
- Actual usage is ~17m, 200m was over-provisioned
- Fixes "Insufficient cpu" scheduling error for replica 2
2026-01-09 21:41:52 +09:00

235 lines
6.3 KiB
YAML

# Kube-Prometheus-Stack Helm Values
# Chart: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack
# Includes: Prometheus Operator, Prometheus, Alertmanager, Grafana, and various exporters
# Global settings
# An empty override leaves resource naming to the chart.
fullnameOverride: ""
# CRD management
# Disabled because of the CRD annotation size problem; the CRDs are already
# installed in the cluster.
crds:
  enabled: false
# Prometheus Operator
prometheusOperator:
  enabled: true
  # Do not let the operator create CRDs.
  createCustomResource: false
# Kubelet ServiceMonitor with a static cluster label.
#
# Each *Relabelings list below REPLACES the chart's default list for that
# endpoint, so the chart's default metrics_path relabeling has to be repeated
# in every list that is overridden. Without it, series scraped from different
# kubelet endpoints lose the metrics_path label that tells them apart.
#
# NOTE(review): the plain /metrics endpoint (`relabelings`) is not overridden
# here, so it does not receive the cluster label - confirm this is intended.
kubelet:
  enabled: true
  serviceMonitor:
    # cAdvisor metrics (container_memory_working_set_bytes, etc.)
    cAdvisorRelabelings:
      - targetLabel: cluster
        replacement: "mayne-cluster"
      - sourceLabels: [__metrics_path__]
        targetLabel: metrics_path
    # Resource metrics
    resourceRelabelings:
      - targetLabel: cluster
        replacement: "mayne-cluster"
      - sourceLabels: [__metrics_path__]
        targetLabel: metrics_path
    # Probes metrics
    probesRelabelings:
      - targetLabel: cluster
        replacement: "mayne-cluster"
      - sourceLabels: [__metrics_path__]
        targetLabel: metrics_path
# Prometheus
prometheus:
  enabled: true

  # Thanos sidecar - for long-term storage and HA
  thanosService:
    enabled: true
  thanosServiceMonitor:
    enabled: true

  prometheusSpec:
    # HA: 2 replicas; each replica is distinguished by the prometheus_replica
    # external label.
    replicas: 2
    replicaExternalLabelName: prometheus_replica

    # Soft (preferred) pod anti-affinity keyed on hostname: replicas are spread
    # across nodes when possible, but may still share a node.
    affinity:
      podAntiAffinity:
        preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            podAffinityTerm:
              labelSelector:
                matchLabels:
                  app.kubernetes.io/name: prometheus
              topologyKey: kubernetes.io/hostname

    scrapeInterval: 60s  # 30s -> 60s (memory savings)
    evaluationInterval: 60s  # 30s -> 60s
    retention: 3d  # Local retention (S3 has longer retention via Thanos)

    # Thanos sidecar configuration; the object-store config is read from an
    # existing secret.
    thanos:
      image: quay.io/thanos/thanos:v0.37.2
      objectStorageConfig:
        existingSecret:
          name: thanos-objstore-secret
          key: objstore.yml

    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: local-path
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 5Gi

    # No CPU limit is set; only memory is capped.
    resources:
      requests:
        cpu: 50m
        memory: 512Mi
      limits:
        memory: 768Mi

    # Auto-discovery: select every ServiceMonitor / PodMonitor / Probe /
    # PrometheusRule instead of only those labelled with this Helm release.
    # Each "<x>SelectorNilUsesHelmValues: false" is what makes the empty
    # selector mean "select all"; an empty selector alone falls back to the
    # chart's release-label selector.
    serviceMonitorSelectorNilUsesHelmValues: false
    serviceMonitorSelector: {}
    podMonitorSelectorNilUsesHelmValues: false
    podMonitorSelector: {}
    probeSelectorNilUsesHelmValues: false
    probeSelector: {}
    # Previously only `ruleSelector: {}` was set, which had no effect on its
    # own. NOTE(review): this loads PrometheusRules that do not carry the
    # release label as well - confirm that is the intent.
    ruleSelectorNilUsesHelmValues: false
    ruleSelector: {}

    # Alertmanager settings: alerts go to the alertmanager service in the
    # alertmanager namespace.
    alertingEndpoints:
      - name: alertmanager
        namespace: alertmanager
        port: http
        scheme: http

    # External labels added to all metrics
    externalLabels:
      cluster: "mayne-cluster"

    additionalScrapeConfigs:
      # ArgoCD metrics
      - job_name: 'argocd-metrics'
        static_configs:
          - targets:
              - 'argocd-metrics.argocd.svc.cluster.local:8082'
            labels:
              service: argocd-controller
          - targets:
              - 'argocd-server-metrics.argocd.svc.cluster.local:8083'
            labels:
              service: argocd-server
          - targets:
              - 'argocd-repo-server.argocd.svc.cluster.local:8084'
            labels:
              service: argocd-repo

      # Cert-Manager
      - job_name: 'cert-manager'
        static_configs:
          - targets:
              - 'cert-manager.cert-manager.svc.cluster.local:9402'

      # MinIO (cluster-level and node-level endpoints on the same service)
      - job_name: 'minio-cluster'
        static_configs:
          - targets:
              - 'minio.minio.svc.cluster.local:9000'
        metrics_path: /minio/v2/metrics/cluster
        scheme: http
      - job_name: 'minio-node'
        static_configs:
          - targets:
              - 'minio.minio.svc.cluster.local:9000'
        metrics_path: /minio/v2/metrics/node
        scheme: http

      # Ingress NGINX: discover pods in the ingress-nginx namespace and keep
      # only the controller pods.
      - job_name: 'ingress-nginx'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - ingress-nginx
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            action: keep
            regex: ingress-nginx
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            action: keep
            regex: controller
          # Rewrite the target address to port 10254, dropping any discovered
          # port. Quoted so YAML never reinterprets the regex or "$1:10254".
          - source_labels: [__address__]
            action: replace
            regex: '([^:]+)(?::\d+)?'
            replacement: '$1:10254'
            target_label: __address__
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: pod
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: namespace
# API server metrics collection is turned off (memory savings: ~37k series).
kubeApiServer:
  enabled: false
# etcd metrics collection is turned off (memory savings: ~26k series).
kubeEtcd:
  enabled: false
# Alertmanager is not deployed by this chart (the existing alertmanager is used).
alertmanager:
  enabled: false
# Grafana is not deployed by this chart (the existing grafana is used).
grafana:
  enabled: false
# Node Exporter is not deployed by this chart (the existing node-exporter is used).
nodeExporter:
  enabled: false
# Kube State Metrics is not deployed by this chart (the existing
# kube-state-metrics is used). The separately deployed kube-state-metrics
# creates its own ServiceMonitor.
kubeStateMetrics:
  enabled: false
# Default rules shipped with the chart
defaultRules:
  create: true

  # Individual alert rules to disable
  disabled:
    KubeCPUOvercommit: true
    KubeMemoryOvercommit: true

  rules:
    # Rule groups that are enabled
    alertmanager: true
    configReloaders: true
    general: true
    k8s: true
    kubeApiserverAvailability: true
    kubeApiserverBurnrate: true
    kubeApiserverHistogram: true
    kubeApiserverSlos: true
    kubelet: true
    kubePrometheusGeneral: true
    kubePrometheusNodeRecording: true
    kubernetesApps: true
    kubernetesResources: true
    kubernetesStorage: true
    kubernetesSystem: true
    kubeStateMetrics: true
    network: true
    node: true
    nodeExporterAlerting: true
    nodeExporterRecording: true
    prometheus: true
    prometheusOperator: true

    # Rule groups that are disabled
    etcd: false
    kubeControllerManager: false
    kubeProxy: false
    kubeSchedulerAlerting: false
    kubeSchedulerRecording: false