FIX(alertmanager): improve OOMKilled alert detection

- Only fire when container restarted in last 10 minutes
- Prevent stale alerts from old OOM events
This commit is contained in:
2026-01-09 15:13:44 +09:00
parent e3c615b5c1
commit bb8b1c193e

View File

@@ -12,10 +12,16 @@ spec:
 rules:
   - alert: KubeContainerOOMKilled
     annotations:
-      description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} was OOMKilled."
-      summary: "Container was OOMKilled"
+      description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} was OOMKilled in the last 10 minutes."
+      summary: "Container was recently OOMKilled"
     expr: |
+      (
+      increase(kube_pod_container_status_restarts_total[10m]) > 0
+      )
+      and on (namespace, pod, container)
+      (
       kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} == 1
+      )
     for: 0m
     labels:
       severity: warning