FEAT(blackbox-exporter): add external endpoint monitoring
- Add blackbox-exporter with prometheus-community Helm chart
- Configure HTTP probes for 25 external endpoints
- Include SSL certificate expiry alerting rules
- Add probe failure and slow response alerts
- Deploy 2 replicas with anti-affinity for HA
This commit is contained in:
41
blackbox-exporter/argocd.yaml
Normal file
41
blackbox-exporter/argocd.yaml
Normal file
@@ -0,0 +1,41 @@
|
||||
---
# Argo CD Application for the prometheus-blackbox-exporter Helm chart.
# The chart comes from the prometheus-community Helm repository; its values
# file is read from the Git source that is declared with `ref: values`.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: blackbox-exporter
  namespace: argocd
  finalizers:
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default

  sources:
    # Helm chart, pinned to an exact chart version.
    - repoURL: https://prometheus-community.github.io/helm-charts
      chart: prometheus-blackbox-exporter
      targetRevision: 9.2.0
      helm:
        valueFiles:
          # `$values` refers to the source below that sets `ref: values`.
          - $values/blackbox-exporter/helm-values.yaml
    # Git repository that only supplies the values file referenced above.
    - repoURL: https://github.com/K3S-HOME/observability.git
      targetRevision: main
      ref: values

  destination:
    server: https://kubernetes.default.svc
    namespace: blackbox-exporter

  syncPolicy:
    automated:
      prune: true
      selfHeal: true
      allowEmpty: false
    syncOptions:
      - CreateNamespace=true
      - PrunePropagationPolicy=foreground
      - PruneLast=true
    # Retry failed syncs up to 5 times: 5s initial backoff, doubling, capped at 3m.
    retry:
      limit: 5
      backoff:
        duration: 5s
        factor: 2
        maxDuration: 3m
    # Labels applied to the namespace created via CreateNamespace=true.
    managedNamespaceMetadata:
      labels:
        goldilocks.fairwinds.com/enabled: 'true'

  revisionHistoryLimit: 10
|
||||
186
blackbox-exporter/helm-values.yaml
Normal file
186
blackbox-exporter/helm-values.yaml
Normal file
@@ -0,0 +1,186 @@
|
||||
# Prometheus Blackbox Exporter Helm Values
# Chart: https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-blackbox-exporter

# Name override passed to the chart.
fullnameOverride: blackbox-exporter

# Two replicas; see the `affinity` section of this file for node spreading.
replicas: 2

resources:
  requests:
    cpu: 15m
    memory: 64Mi
  limits:
    # Only a memory limit is set (equal to the request); no CPU limit.
    memory: 64Mi
|
||||
|
||||
# Blackbox exporter probe modules.
config:
  modules:
    # HTTPS/HTTP GET probe with certificate verification enabled.
    http_2xx:
      prober: http
      timeout: 10s
      http:
        method: GET
        preferred_ip_protocol: ip4
        follow_redirects: true
        valid_http_versions: ['HTTP/1.1', 'HTTP/2.0']
        # 200 plus the listed 3xx redirect codes count as success.
        valid_status_codes: [200, 301, 302, 303]
        tls_config:
          insecure_skip_verify: false

    # Same as http_2xx, but TLS certificate verification is skipped.
    http_2xx_insecure:
      prober: http
      timeout: 10s
      http:
        method: GET
        preferred_ip_protocol: ip4
        follow_redirects: true
        valid_http_versions: ['HTTP/1.1', 'HTTP/2.0']
        valid_status_codes: [200, 301, 302, 303]
        tls_config:
          insecure_skip_verify: true

    # Plain TCP connect probe.
    tcp_connect:
      prober: tcp
      timeout: 5s

    # ICMP echo probe over IPv4.
    # NOTE(review): ICMP probing usually needs extra container privileges
    # (e.g. NET_RAW) — confirm before pointing a target at this module.
    icmp:
      prober: icmp
      timeout: 5s
      icmp:
        preferred_ip_protocol: ip4
|
||||
|
||||
# One ServiceMonitor is generated per entry in `targets`; values under
# `defaults` apply to every target unless the target overrides them.
serviceMonitor:
  enabled: true
  defaults:
    additionalMetricsRelabels: {}
    # Label used by the Prometheus operator to select these ServiceMonitors.
    # The chart takes ServiceMonitor labels from `defaults.labels` (or a
    # per-target `labels` map); the previous `additionalLabels` key is not
    # read for serviceMonitor, so the label was never applied.
    labels:
      release: prometheus
    interval: 60s
    scrapeTimeout: 30s
    module: http_2xx
  targets:
    # Infrastructure Services
    - name: argocd
      url: https://argocd0213.kro.kr
      module: http_2xx
    - name: grafana
      url: https://grafana0213.kro.kr
      module: http_2xx
    - name: vault
      url: https://vault0213.kro.kr
      module: http_2xx
    - name: authelia
      url: https://auth0213.kro.kr
      module: http_2xx
    - name: karma
      url: https://karma0213.kro.kr
      module: http_2xx
    - name: gitea
      url: https://github0213.com
      module: http_2xx
    - name: minio-console
      url: https://minio0213.kro.kr
      module: http_2xx
    - name: velero-ui
      url: https://velero0213.kro.kr
      module: http_2xx
    - name: headlamp
      url: https://kubernetes0213.kro.kr
      module: http_2xx
    - name: goldilocks
      url: https://goldilocks0213.kro.kr
      module: http_2xx
    - name: code-server
      url: https://vscode0213.kro.kr
      module: http_2xx
    - name: pgweb
      url: https://pgweb0213.kro.kr
      module: http_2xx
    - name: zot
      url: https://zot0213.kro.kr
      module: http_2xx

    # User Applications
    - name: homer
      url: https://mayne.kro.kr
      module: http_2xx
    - name: portfolio
      url: https://minjo0213.kro.kr
      module: http_2xx
    - name: docusaurus
      url: https://docusaurus0213.kro.kr
      module: http_2xx
    - name: jotion
      url: https://jotion0213.kro.kr
      module: http_2xx
    - name: jovies
      url: https://jovies.kro.kr
      module: http_2xx
    - name: todo
      url: https://todo0213.kro.kr
      module: http_2xx
    - name: umami
      url: https://umami0213.kro.kr
      module: http_2xx
    - name: mas
      url: https://mas0213.kro.kr
      module: http_2xx
    - name: jaejadle
      url: https://jaejadle.kro.kr
      module: http_2xx
    - name: jaejadle-dev
      url: https://dev.jaejadle.kro.kr
      module: http_2xx
    - name: joossam
      url: https://joossameng.kro.kr
      module: http_2xx
    - name: joossam-dev
      url: https://dev.joossameng.kro.kr
      module: http_2xx
|
||||
|
||||
# Alerting rules shipped as a PrometheusRule alongside the exporter.
prometheusRule:
  enabled: true
  additionalLabels:
    release: prometheus
  # NOTE(review): if the chart renders `rules` through Helm `tpl`, the
  # `{{ $labels.* }}` / `{{ $value }}` expressions below must be escaped so
  # Helm passes them through to Prometheus — confirm with `helm template`.
  rules:
    # A probe has reported failure continuously for 5 minutes.
    - alert: BlackboxProbeFailed
      expr: probe_success == 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "Blackbox probe failed for {{ $labels.target }}"
        description: "Probe {{ $labels.instance }} has been failing for more than 5 minutes."

    # Average probe duration over 5 minutes exceeds 5 seconds.
    - alert: BlackboxSlowProbe
      expr: avg_over_time(probe_duration_seconds[5m]) > 5
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Blackbox slow probe for {{ $labels.target }}"
        description: "Probe {{ $labels.instance }} took more than 5s to complete."

    # Certificate expires in less than 30 days.
    - alert: BlackboxSslCertificateWillExpireSoon
      # Compare in seconds (30 days = 86400 * 30) rather than dividing by
      # 86400: `humanizeDuration` in the description expects $value in
      # seconds, so a value expressed in days would render as e.g. "25s".
      expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: "SSL certificate will expire soon for {{ $labels.target }}"
        description: "SSL certificate expires in {{ $value | humanizeDuration }} for {{ $labels.instance }}."

    # Certificate expiry time is now or in the past.
    - alert: BlackboxSslCertificateExpired
      expr: (probe_ssl_earliest_cert_expiry - time()) <= 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "SSL certificate has expired for {{ $labels.target }}"
        description: "SSL certificate has expired for {{ $labels.instance }}."
|
||||
|
||||
# Soft anti-affinity: prefer (but do not require) placing pods labelled
# app.kubernetes.io/name=prometheus-blackbox-exporter on different nodes.
affinity:
  podAntiAffinity:
    preferredDuringSchedulingIgnoredDuringExecution:
      - weight: 100
        podAffinityTerm:
          topologyKey: kubernetes.io/hostname
          labelSelector:
            matchExpressions:
              - key: app.kubernetes.io/name
                operator: In
                values:
                  - prometheus-blackbox-exporter
|
||||
3
blackbox-exporter/kustomization.yaml
Normal file
3
blackbox-exporter/kustomization.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
---
# Kustomization for the blackbox-exporter directory; it lists no resources.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources: []
|
||||
@@ -17,3 +17,4 @@ resources:
|
||||
- kube-state-metrics/argocd.yaml
|
||||
- goldilocks/argocd.yaml
|
||||
- vpa/argocd.yaml
|
||||
- blackbox-exporter/argocd.yaml
|
||||
|
||||
Reference in New Issue
Block a user