FIX(opentelemetry-operator): remove cpu null values

- Remove cpu: null (not allowed in new chart schema)
- Keep only memory limits
CHORE(opentelemetry-operator): upgrade chart to 0.102.0
2026-01-10 18:55:23 +09:00 · 2026-01-10 18:53:34 +09:00 · 2026-01-10 18:42:02 +09:00 · 2026-01-10 18:35:15 +09:00 · 2026-01-10 18:28:12 +09:00 · 2026-01-10 18:00:06 +09:00
10 changed files with 9 additions and 64 deletions
--- a/goldilocks/helm-values.yaml
+++ b/goldilocks/helm-values.yaml
@@ -60,15 +60,6 @@ controller:
  # Set to false to only monitor namespaces with the label: goldilocks.fairwinds.com/enabled=true
  enableCostRecommendations: true
-  # Schedule on control-plane node
-  nodeSelector:
-    node-role.kubernetes.io/control-plane: "true"
-  tolerations:
-    - key: node-role.kubernetes.io/control-plane
-      operator: Exists
-      effect: NoSchedule
 # VPA configuration (should already be installed)
 vpa:
  # Set to false since we're installing VPA separately
--- a/grafana/dashboards/main.json
+++ b/grafana/dashboards/main.json
@@ -1987,14 +1987,14 @@
          },
          "editorMode": "code",
          "exemplar": true,
-          "expr": "sum(rate(container_cpu_cfs_throttled_seconds_total{image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0 or vector(0)",
+          "expr": "sum(rate(container_pressure_cpu_waiting_seconds_total{pod!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0.1",
          "interval": "$resolution",
          "legendFormat": "{{ namespace }}",
          "range": true,
          "refId": "A"
        }
      ],
-      "title": "CPU Throttled seconds by namespace",
+      "title": "CPU Pressure (waiting) by namespace",
      "type": "timeseries"
    },
    {
@@ -2099,14 +2099,14 @@
          },
          "editorMode": "code",
          "exemplar": true,
-          "expr": "sum(rate(node_cpu_core_throttles_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance) or vector(0)",
+          "expr": "sum(rate(node_pressure_cpu_waiting_seconds_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance) > 0.1",
          "interval": "$resolution",
          "legendFormat": "{{ instance }}",
          "range": true,
          "refId": "A"
        }
      ],
-      "title": "CPU Core Throttled by instance",
+      "title": "Node CPU Pressure (waiting) by instance",
      "type": "timeseries"
    },
    {
--- a/grafana/dashboards/minio.json
+++ b/grafana/dashboards/minio.json
@@ -2242,8 +2242,9 @@
              }
            ]
          },
-          "unit": "s",
+          "unit": "percent",
-          "unitScale": true
+          "min": 0,
+          "max": 100
        },
        "overrides": []
      },
@@ -2278,7 +2279,7 @@
          },
          "editorMode": "code",
          "exemplar": true,
-          "expr": "rate(minio_node_process_cpu_total_seconds{job=~\"$scrape_jobs\"}[5m])",
+          "expr": "rate(minio_node_process_cpu_total_seconds{job=\"minio\"}[5m]) * 100",
          "interval": "",
          "legendFormat": "{{server}}",
          "range": true,
--- a/kube-state-metrics/helm-values.yaml
+++ b/kube-state-metrics/helm-values.yaml
@@ -14,15 +14,6 @@ resources:
  limits:
    memory: 105Mi
-# Schedule on control-plane node
-nodeSelector:
-  node-role.kubernetes.io/control-plane: "true"
-tolerations:
-  - key: node-role.kubernetes.io/control-plane
-    operator: Exists
-    effect: NoSchedule
 service:
  type: ClusterIP
  clusterIP: None
--- a/opentelemetry-collector/helm-values.yaml
+++ b/opentelemetry-collector/helm-values.yaml
@@ -32,14 +32,6 @@ resources:
  limits:
    memory: 512Mi
-# =============================================================================
-# Tolerations (run on all nodes including master)
-# =============================================================================
-tolerations:
-  - key: node-role.kubernetes.io/control-plane
-    operator: Exists
-    effect: NoSchedule
 # =============================================================================
 # Extra Volumes for Log Collection
 # =============================================================================
--- a/opentelemetry-collector/manifests/collector.yaml
+++ b/opentelemetry-collector/manifests/collector.yaml
@@ -27,11 +27,6 @@ spec:
    limits:
      memory: 512Mi
-  tolerations:
-    - key: node-role.kubernetes.io/control-plane
-      operator: Exists
-      effect: NoSchedule
  volumeMounts:
    - name: varlogpods
      mountPath: /var/log/pods
--- a/opentelemetry-operator/argocd.yaml
+++ b/opentelemetry-operator/argocd.yaml
@@ -12,7 +12,7 @@ spec:
  sources:
  - repoURL: https://open-telemetry.github.io/opentelemetry-helm-charts
    chart: opentelemetry-operator
-    targetRevision: 0.74.0
+    targetRevision: 0.102.0
    helm:
      valueFiles:
      - $values/opentelemetry-operator/helm-values.yaml
--- a/opentelemetry-operator/helm-values.yaml
+++ b/opentelemetry-operator/helm-values.yaml
@@ -20,7 +20,6 @@ manager:
      repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-go
  resources:
    limits:
-      cpu: null  # Disable chart default CPU limits
      memory: 256Mi
    requests:
      cpu: 10m
@@ -39,17 +38,8 @@ kubeRBACProxy:
  enabled: true
  resources:
    limits:
-      cpu: null  # Disable chart default CPU limits
      memory: 64Mi
    requests:
      cpu: 5m
      memory: 64Mi
-# Schedule on master node
-tolerations:
-  - key: node-role.kubernetes.io/control-plane
-    operator: Exists
-    effect: NoSchedule
-nodeSelector:
-  node-role.kubernetes.io/control-plane: "true"
--- a/promtail/helm-values.yaml
+++ b/promtail/helm-values.yaml
@@ -37,12 +37,6 @@ resources:
  limits:
    memory: 182Mi
-# Tolerations to run on all nodes including control-plane
-tolerations:
-  - key: node-role.kubernetes.io/control-plane
-    operator: Exists
-    effect: NoSchedule
 # ServiceMonitor disabled
 serviceMonitor:
  enabled: false
--- a/vpa/helm-values.yaml
+++ b/vpa/helm-values.yaml
@@ -13,15 +13,6 @@ recommender:
    limits:
      memory: 100Mi
-  # Schedule on control-plane node
-  nodeSelector:
-    node-role.kubernetes.io/control-plane: "true"
-  tolerations:
-    - key: node-role.kubernetes.io/control-plane
-      operator: Exists
-      effect: NoSchedule
 # Updater - applies recommended resource requests to pods
 # Disabled because we're using updateMode: Off (recommendations only)
 updater:
Author	SHA1	Message	Date
Mayne0213	7d0c8aa5f3	FIX(opentelemetry-operator): remove cpu null values - Remove cpu: null (not allowed in new chart schema) - Keep only memory limits	2026-01-10 18:55:23 +09:00
Mayne0213	9c00c42946	CHORE(opentelemetry-operator): upgrade chart to 0.102.0 - Fix ServiceMonitor duplicate creation bug (Issue #3446) - Upgrade from 0.74.0 to 0.102.0	2026-01-10 18:53:34 +09:00
Mayne0213	a08d989fc3	FIX(opentelemetry-operator): remove invalid serviceMonitor - Remove top-level serviceMonitor (not in chart schema) - Keep manager.serviceMonitor.enabled: false	2026-01-10 18:42:02 +09:00
Mayne0213	203a8debac	REFACTOR(repo): remove control-plane scheduling - Remove nodeSelector for control-plane node - Remove tolerations for control-plane taint - Allow pods to schedule on any available node	2026-01-10 18:35:15 +09:00
Mayne0213	c128ece672	FIX(opentelemetry-operator): disable serviceMonitor - Add top-level serviceMonitor.enabled: false - Prevent duplicate ServiceMonitor creation on restart	2026-01-10 18:28:12 +09:00
Mayne0213	bcf60b2428	fix: set CPU pressure threshold to 10%	2026-01-10 18:00:06 +09:00
Mayne0213	da89c8dbf0	FIX(grafana): restore gauge design with percentage display - Restore original gauge panel type - Keep * 100 query and percent unit - Set max to 100 for proper gauge range	2026-01-10 17:58:11 +09:00
Mayne0213	11f9457236	fix: increase CPU pressure threshold to 30%	2026-01-10 17:57:34 +09:00
Mayne0213	7e375e20c6	FIX(grafana): show CPU Usage as percentage per node - Change panel type from gauge to stat - Add * 100 to query for percentage - Show each node's CPU usage horizontally - Set thresholds at 50% (orange), 80% (red)	2026-01-10 17:57:05 +09:00
Mayne0213	b818a8c1fe	fix: update CPU throttling panels to use PSI metrics with 10% threshold	2026-01-10 17:54:55 +09:00