Align memory limits with memory requests for guaranteed QoS class. - prometheus, thanos (query, storegateway, compactor) - alertmanager, tempo, goldilocks (dashboard, controller) - node-exporter, opentelemetry-collector, vpa, kube-state-metrics
235 lines
6.3 KiB
YAML
235 lines
6.3 KiB
YAML
# Kube-Prometheus-Stack Helm Values
# Chart: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack
# Includes: Prometheus Operator, Prometheus, Alertmanager, Grafana, and various exporters

# Global settings
# Empty string: no explicit full-name override is supplied.
fullnameOverride: ''

# CRD management
crds:
  # Disabled because of the CRD annotation size problem; the CRDs are
  # already installed.
  enabled: false

# Prometheus Operator
prometheusOperator:
  enabled: true
  # Disable CRD creation.
  # NOTE(review): `createCustomResource` looks like a key from older operator
  # charts - confirm the chart version in use still reads it.
  createCustomResource: false

# Kubelet ServiceMonitor with cluster label
# Each kubelet endpoint group below gets a relabeling that writes the fixed
# value "mayne-cluster" into the `cluster` label.
kubelet:
  enabled: true
  serviceMonitor:
    # cAdvisor metrics (e.g. container_memory_working_set_bytes)
    cAdvisorRelabelings:
      - replacement: 'mayne-cluster'
        targetLabel: cluster
      # Copy the scrape path into a regular `metrics_path` label.
      - sourceLabels:
          - __metrics_path__
        targetLabel: metrics_path
    # Resource metrics
    resourceRelabelings:
      - replacement: 'mayne-cluster'
        targetLabel: cluster
    # Probes metrics
    probesRelabelings:
      - replacement: 'mayne-cluster'
        targetLabel: cluster

# Prometheus
prometheus:
  enabled: true

  # Thanos Sidecar - for long-term storage and HA
  thanosService:
    enabled: true
  thanosServiceMonitor:
    enabled: true

  prometheusSpec:
    # HA: 2 replicas on different worker nodes
    replicas: 2
    replicaExternalLabelName: prometheus_replica

    # Pod anti-affinity for HA. This is a scheduling preference (weight 100),
    # not a hard requirement.
    affinity:
      podAntiAffinity:
        preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            podAffinityTerm:
              labelSelector:
                matchLabels:
                  app.kubernetes.io/name: prometheus
              topologyKey: kubernetes.io/hostname

    scrapeInterval: 60s  # raised from 30s to 60s (memory savings)
    evaluationInterval: 60s  # raised from 30s to 60s
    retention: 3d  # Local retention (S3 has longer retention via Thanos)

    # Thanos Sidecar configuration
    thanos:
      image: quay.io/thanos/thanos:v0.37.2
      objectStorageConfig:
        existingSecret:
          name: thanos-objstore-secret
          key: objstore.yml

    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: local-path
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 5Gi

    # Memory request and memory limit are equal.
    # NOTE(review): no CPU limit is set. Kubernetes only assigns the
    # Guaranteed QoS class when CPU request and limit match as well, so
    # confirm whether a CPU limit is wanted here.
    resources:
      requests:
        cpu: 50m
        memory: 512Mi
      limits:
        memory: 512Mi

    # Automatic discovery - select every ServiceMonitor, PodMonitor, Probe
    # and PrometheusRule rather than only those labelled with the Helm
    # release. The chart treats an empty selector as "not set" and falls
    # back to the release-label selector unless the matching
    # *SelectorNilUsesHelmValues flag is false, so every flag is set
    # explicitly (ruleSelectorNilUsesHelmValues was previously missing).
    serviceMonitorSelectorNilUsesHelmValues: false
    serviceMonitorSelector: {}
    podMonitorSelectorNilUsesHelmValues: false
    podMonitorSelector: {}
    probeSelectorNilUsesHelmValues: false
    probeSelector: {}
    ruleSelectorNilUsesHelmValues: false
    ruleSelector: {}

    # Alertmanager configuration: alerts are sent to the `alertmanager`
    # endpoint in the `alertmanager` namespace over plain HTTP.
    alertingEndpoints:
      - name: alertmanager
        namespace: alertmanager
        port: http
        scheme: http

    # External labels added to all metrics
    externalLabels:
      cluster: "mayne-cluster"

    additionalScrapeConfigs:
      # ArgoCD metrics
      - job_name: 'argocd-metrics'
        static_configs:
          - targets:
              - 'argocd-application-controller-metrics.argocd.svc.cluster.local:8082'
            labels:
              service: argocd-controller
          - targets:
              - 'argocd-server-metrics.argocd.svc.cluster.local:8083'
            labels:
              service: argocd-server
          - targets:
              - 'argocd-repo-server-metrics.argocd.svc.cluster.local:8084'
            labels:
              service: argocd-repo

      # Cert-Manager
      - job_name: 'cert-manager'
        static_configs:
          - targets:
              - 'cert-manager.cert-manager.svc.cluster.local:9402'

      # MinIO
      - job_name: 'minio-cluster'
        static_configs:
          - targets:
              - 'minio.minio.svc.cluster.local:9000'
        metrics_path: /minio/v2/metrics/cluster
        scheme: http

      - job_name: 'minio-node'
        static_configs:
          - targets:
              - 'minio.minio.svc.cluster.local:9000'
        metrics_path: /minio/v2/metrics/node
        scheme: http

      # Ingress NGINX
      - job_name: 'ingress-nginx'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - ingress-nginx
        relabel_configs:
          # Keep only pods labelled as the ingress-nginx controller.
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            action: keep
            regex: ingress-nginx
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            action: keep
            regex: controller
          # Rewrite the target address so the scrape goes to port 10254,
          # whatever port (if any) was discovered.
          - source_labels: [__address__]
            action: replace
            regex: '([^:]+)(?::\d+)?'
            replacement: '$1:10254'
            target_label: __address__
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: pod
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: namespace

# API server metrics collection disabled (memory savings: ~37k series)
kubeApiServer:
  enabled: false

# etcd metrics collection disabled (memory savings: ~26k series)
kubeEtcd:
  enabled: false

# Alertmanager (the existing Alertmanager is used instead)
alertmanager:
  enabled: false

# Grafana (the existing Grafana is used instead)
grafana:
  enabled: false

# Node Exporter (the existing node-exporter is used instead)
nodeExporter:
  enabled: false

# Kube State Metrics (the existing kube-state-metrics is used instead)
# The separately deployed kube-state-metrics creates its own ServiceMonitor.
kubeStateMetrics:
  enabled: false

# Default rules: which of the chart's bundled rule groups are created
defaultRules:
  create: true
  # Individual alert rules to disable
  disabled:
    KubeCPUOvercommit: true
    KubeMemoryOvercommit: true
  rules:
    alertmanager: true
    etcd: false
    configReloaders: true
    general: true
    k8s: true
    # API server metrics are not scraped (kubeApiServer.enabled is false in
    # this file), so the rule groups computed from them are switched off
    # too, matching how `etcd` is handled.
    kubeApiserverAvailability: false
    kubeApiserverBurnrate: false
    kubeApiserverHistogram: false
    kubeApiserverSlos: false
    kubeControllerManager: false
    kubelet: true
    kubeProxy: false
    kubePrometheusGeneral: true
    kubePrometheusNodeRecording: true
    kubernetesApps: true
    kubernetesResources: true
    kubernetesStorage: true
    kubernetesSystem: true
    kubeSchedulerAlerting: false
    kubeSchedulerRecording: false
    kubeStateMetrics: true
    network: true
    node: true
    nodeExporterAlerting: true
    nodeExporterRecording: true
    prometheus: true
    prometheusOperator: true