From 42862965918161ee3b074a51536b9b43e3a7036b Mon Sep 17 00:00:00 2001 From: Mayne0213 Date: Wed, 7 Jan 2026 23:48:35 +0900 Subject: [PATCH] PERF(resources): remove CPU limits - keep memory limits only - CPU throttling prevents app startup, not crashes - Memory OOM is the real cascading failure cause - CPU request ensures fair scheduling --- alertmanager/helm-values.yaml | 6 ++++-- goldilocks/helm-values.yaml | 18 ++++++++---------- grafana/helm-values.yaml | 6 ++++-- kube-state-metrics/helm-values.yaml | 6 ++++-- loki/helm-values.yaml | 6 ++++-- node-exporter/helm-values.yaml | 6 ++++-- prometheus/helm-values.yaml | 6 +++--- promtail/helm-values.yaml | 6 ++++-- uptime-kuma/helm-values.yaml | 6 +++--- vpa/helm-values.yaml | 9 ++++----- 10 files changed, 42 insertions(+), 33 deletions(-) diff --git a/alertmanager/helm-values.yaml b/alertmanager/helm-values.yaml index 3d47114..609445d 100644 --- a/alertmanager/helm-values.yaml +++ b/alertmanager/helm-values.yaml @@ -10,8 +10,10 @@ persistence: resources: requests: - cpu: 10m - memory: 32Mi + cpu: 15m + memory: 100Mi + limits: + memory: 150Mi # Disable default config - use secret instead config: diff --git a/goldilocks/helm-values.yaml b/goldilocks/helm-values.yaml index 7a26341..6fdd85a 100644 --- a/goldilocks/helm-values.yaml +++ b/goldilocks/helm-values.yaml @@ -7,12 +7,11 @@ dashboard: replicaCount: 1 resources: - limits: - cpu: null - memory: 256Mi requests: - cpu: 25m - memory: 128Mi + cpu: 15m + memory: 100Mi + limits: + memory: 150Mi service: type: ClusterIP @@ -39,12 +38,11 @@ controller: enabled: true resources: - limits: - cpu: null - memory: 256Mi requests: - cpu: 25m - memory: 128Mi + cpu: 15m + memory: 100Mi + limits: + memory: 150Mi # Enable VPA recommendations for all namespaces # Set to false to only monitor namespaces with the label: goldilocks.fairwinds.com/enabled=true diff --git a/grafana/helm-values.yaml b/grafana/helm-values.yaml index c61c5ed..b7130be 100644 --- a/grafana/helm-values.yaml +++ b/grafana/helm-values.yaml @@ -24,8 +24,10 @@ podSecurityContext: resources: requests: - cpu: 25m - memory: 128Mi + cpu: 11m + memory: 425Mi + limits: + memory: 425Mi service: type: ClusterIP diff --git a/kube-state-metrics/helm-values.yaml b/kube-state-metrics/helm-values.yaml index bdd82b5..e517bee 100644 --- a/kube-state-metrics/helm-values.yaml +++ b/kube-state-metrics/helm-values.yaml @@ -5,8 +5,10 @@ fullnameOverride: kube-state-metrics resources: requests: - cpu: 10m - memory: 64Mi + cpu: 15m + memory: 100Mi + limits: + memory: 150Mi service: type: ClusterIP diff --git a/loki/helm-values.yaml b/loki/helm-values.yaml index 2613814..e31efc5 100644 --- a/loki/helm-values.yaml +++ b/loki/helm-values.yaml @@ -43,8 +43,10 @@ singleBinary: storageClass: local-path resources: requests: - cpu: 15m # Reduced from 50m based on actual usage (10m) - memory: 128Mi + cpu: 23m + memory: 462Mi + limits: + memory: 462Mi # Disable components not needed in single binary mode backend: diff --git a/node-exporter/helm-values.yaml b/node-exporter/helm-values.yaml index 4386a98..00a877c 100644 --- a/node-exporter/helm-values.yaml +++ b/node-exporter/helm-values.yaml @@ -8,8 +8,10 @@ hostPID: true resources: requests: - cpu: 10m - memory: 50Mi + cpu: 15m + memory: 64Mi + limits: + memory: 96Mi service: type: ClusterIP diff --git a/prometheus/helm-values.yaml b/prometheus/helm-values.yaml index 9989b3b..c5f2891 100644 --- a/prometheus/helm-values.yaml +++ b/prometheus/helm-values.yaml @@ -53,11 +53,11 @@ prometheus: storage: 5Gi resources: - limits: - memory: 1Gi # limit 추가 (무제한 증가 방지) requests: cpu: 200m - memory: 512Mi # 256Mi → 512Mi (실제 사용량 반영) + memory: 512Mi + limits: + memory: 768Mi # ServiceMonitor 자동 발견 - 모든 ServiceMonitor 선택 serviceMonitorSelectorNilUsesHelmValues: false diff --git a/promtail/helm-values.yaml b/promtail/helm-values.yaml index 738e716..b4a3667 100644 --- a/promtail/helm-values.yaml +++ b/promtail/helm-values.yaml @@ -32,8 +32,10 @@ defaultVolumeMounts: # Resources resources: requests: - cpu: 25m # Reduced from 50m based on actual usage (8-17m) - memory: 64Mi + cpu: 23m + memory: 182Mi + limits: + memory: 182Mi # Tolerations to run on all nodes including master tolerations: diff --git a/uptime-kuma/helm-values.yaml b/uptime-kuma/helm-values.yaml index ebaa1fe..f5806c3 100644 --- a/uptime-kuma/helm-values.yaml +++ b/uptime-kuma/helm-values.yaml @@ -13,10 +13,10 @@ persistence: resources: requests: - cpu: 50m - memory: 128Mi + cpu: 15m + memory: 200Mi limits: - memory: 256Mi + memory: 300Mi livenessProbe: enabled: true diff --git a/vpa/helm-values.yaml b/vpa/helm-values.yaml index cf652f4..b4715bf 100644 --- a/vpa/helm-values.yaml +++ b/vpa/helm-values.yaml @@ -7,12 +7,11 @@ recommender: replicaCount: 1 resources: - limits: - cpu: null - memory: 1Gi requests: - cpu: 50m - memory: 512Mi + cpu: 15m + memory: 128Mi + limits: + memory: 192Mi # Updater - applies recommended resource requests to pods # Disabled because we're using updateMode: Off (recommendations only)