Compare commits

...

10 Commits

Author SHA1 Message Date
7d0c8aa5f3 FIX(opentelemetry-operator): remove cpu null values
- Remove cpu: null (not allowed in new chart schema)
- Keep only memory limits
2026-01-10 18:55:23 +09:00
9c00c42946 CHORE(opentelemetry-operator): upgrade chart to 0.102.0
- Fix ServiceMonitor duplicate creation bug (Issue #3446)
- Upgrade from 0.74.0 to 0.102.0
2026-01-10 18:53:34 +09:00
a08d989fc3 FIX(opentelemetry-operator): remove invalid serviceMonitor
- Remove top-level serviceMonitor (not in chart schema)
- Keep manager.serviceMonitor.enabled: false
2026-01-10 18:42:02 +09:00
203a8debac REFACTOR(repo): remove control-plane scheduling
- Remove nodeSelector for control-plane node
- Remove tolerations for control-plane taint
- Allow pods to schedule on any available node
2026-01-10 18:35:15 +09:00
c128ece672 FIX(opentelemetry-operator): disable serviceMonitor
- Add top-level serviceMonitor.enabled: false
- Prevent duplicate ServiceMonitor creation on restart
2026-01-10 18:28:12 +09:00
bcf60b2428 fix: set CPU pressure threshold to 10%
2026-01-10 18:00:06 +09:00
da89c8dbf0 FIX(grafana): restore gauge design with percentage display
- Restore original gauge panel type
- Keep * 100 query and percent unit
- Set max to 100 for proper gauge range
2026-01-10 17:58:11 +09:00
11f9457236 fix: increase CPU pressure threshold to 30%
2026-01-10 17:57:34 +09:00
7e375e20c6 FIX(grafana): show CPU Usage as percentage per node
- Change panel type from gauge to stat
- Add * 100 to query for percentage
- Show each node's CPU usage horizontally
- Set thresholds at 50% (orange), 80% (red)
2026-01-10 17:57:05 +09:00
b818a8c1fe fix: update CPU throttling panels to use PSI metrics with 10% threshold
2026-01-10 17:54:55 +09:00
10 changed files with 9 additions and 64 deletions

View File

@@ -60,15 +60,6 @@ controller:
# Set to false to only monitor namespaces with the label: goldilocks.fairwinds.com/enabled=true # Set to false to only monitor namespaces with the label: goldilocks.fairwinds.com/enabled=true
enableCostRecommendations: true enableCostRecommendations: true
# Schedule on control-plane node
nodeSelector:
node-role.kubernetes.io/control-plane: "true"
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
# VPA configuration (should already be installed) # VPA configuration (should already be installed)
vpa: vpa:
# Set to false since we're installing VPA separately # Set to false since we're installing VPA separately

View File

@@ -1987,14 +1987,14 @@
}, },
"editorMode": "code", "editorMode": "code",
"exemplar": true, "exemplar": true,
"expr": "sum(rate(container_cpu_cfs_throttled_seconds_total{image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0 or vector(0)", "expr": "sum(rate(container_pressure_cpu_waiting_seconds_total{pod!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0.1",
"interval": "$resolution", "interval": "$resolution",
"legendFormat": "{{ namespace }}", "legendFormat": "{{ namespace }}",
"range": true, "range": true,
"refId": "A" "refId": "A"
} }
], ],
"title": "CPU Throttled seconds by namespace", "title": "CPU Pressure (waiting) by namespace",
"type": "timeseries" "type": "timeseries"
}, },
{ {
@@ -2099,14 +2099,14 @@
}, },
"editorMode": "code", "editorMode": "code",
"exemplar": true, "exemplar": true,
"expr": "sum(rate(node_cpu_core_throttles_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance) or vector(0)", "expr": "sum(rate(node_pressure_cpu_waiting_seconds_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance) > 0.1",
"interval": "$resolution", "interval": "$resolution",
"legendFormat": "{{ instance }}", "legendFormat": "{{ instance }}",
"range": true, "range": true,
"refId": "A" "refId": "A"
} }
], ],
"title": "CPU Core Throttled by instance", "title": "Node CPU Pressure (waiting) by instance",
"type": "timeseries" "type": "timeseries"
}, },
{ {

View File

@@ -2242,8 +2242,9 @@
} }
] ]
}, },
"unit": "s", "unit": "percent",
"unitScale": true "min": 0,
"max": 100
}, },
"overrides": [] "overrides": []
}, },
@@ -2278,7 +2279,7 @@
}, },
"editorMode": "code", "editorMode": "code",
"exemplar": true, "exemplar": true,
"expr": "rate(minio_node_process_cpu_total_seconds{job=~\"$scrape_jobs\"}[5m])", "expr": "rate(minio_node_process_cpu_total_seconds{job=\"minio\"}[5m]) * 100",
"interval": "", "interval": "",
"legendFormat": "{{server}}", "legendFormat": "{{server}}",
"range": true, "range": true,

View File

@@ -14,15 +14,6 @@ resources:
limits: limits:
memory: 105Mi memory: 105Mi
# Schedule on control-plane node
nodeSelector:
node-role.kubernetes.io/control-plane: "true"
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
service: service:
type: ClusterIP type: ClusterIP
clusterIP: None clusterIP: None

View File

@@ -32,14 +32,6 @@ resources:
limits: limits:
memory: 512Mi memory: 512Mi
# =============================================================================
# Tolerations (run on all nodes including master)
# =============================================================================
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
# ============================================================================= # =============================================================================
# Extra Volumes for Log Collection # Extra Volumes for Log Collection
# ============================================================================= # =============================================================================

View File

@@ -27,11 +27,6 @@ spec:
limits: limits:
memory: 512Mi memory: 512Mi
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
volumeMounts: volumeMounts:
- name: varlogpods - name: varlogpods
mountPath: /var/log/pods mountPath: /var/log/pods

View File

@@ -12,7 +12,7 @@ spec:
sources: sources:
- repoURL: https://open-telemetry.github.io/opentelemetry-helm-charts - repoURL: https://open-telemetry.github.io/opentelemetry-helm-charts
chart: opentelemetry-operator chart: opentelemetry-operator
targetRevision: 0.74.0 targetRevision: 0.102.0
helm: helm:
valueFiles: valueFiles:
- $values/opentelemetry-operator/helm-values.yaml - $values/opentelemetry-operator/helm-values.yaml

View File

@@ -20,7 +20,6 @@ manager:
repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-go repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-go
resources: resources:
limits: limits:
cpu: null # Disable chart default CPU limits
memory: 256Mi memory: 256Mi
requests: requests:
cpu: 10m cpu: 10m
@@ -39,17 +38,8 @@ kubeRBACProxy:
enabled: true enabled: true
resources: resources:
limits: limits:
cpu: null # Disable chart default CPU limits
memory: 64Mi memory: 64Mi
requests: requests:
cpu: 5m cpu: 5m
memory: 64Mi memory: 64Mi
# Schedule on master node
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
nodeSelector:
node-role.kubernetes.io/control-plane: "true"

View File

@@ -37,12 +37,6 @@ resources:
limits: limits:
memory: 182Mi memory: 182Mi
# Tolerations to run on all nodes including control-plane
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
# ServiceMonitor disabled # ServiceMonitor disabled
serviceMonitor: serviceMonitor:
enabled: false enabled: false

View File

@@ -13,15 +13,6 @@ recommender:
limits: limits:
memory: 100Mi memory: 100Mi
# Schedule on control-plane node
nodeSelector:
node-role.kubernetes.io/control-plane: "true"
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
# Updater - applies recommended resource requests to pods # Updater - applies recommended resource requests to pods
# Disabled because we're using updateMode: Off (recommendations only) # Disabled because we're using updateMode: Off (recommendations only)
updater: updater: