FEAT(thanos): add Thanos for Prometheus HA and long-term storage
- Add Thanos Query, Store Gateway, Compactor
- Enable Prometheus Sidecar with S3 (MinIO) storage
- Configure Prometheus replicas: 2 with pod anti-affinity
- Add ExternalSecrets for MinIO credentials
- Retention: raw 7d, 5m downsampled 30d, 1h downsampled 90d
This commit is contained in:
@@ -4,6 +4,7 @@ kind: Kustomization
|
||||
resources:
|
||||
- application.yaml
|
||||
- prometheus/argocd.yaml
|
||||
- thanos/argocd.yaml
|
||||
- alertmanager/argocd.yaml
|
||||
- grafana/argocd.yaml
|
||||
- loki/argocd.yaml
|
||||
|
||||
@@ -38,10 +38,39 @@ kubelet:
|
||||
prometheus:
|
||||
enabled: true
|
||||
|
||||
# Thanos Sidecar - for long-term storage and HA
|
||||
thanosService:
|
||||
enabled: true
|
||||
thanosServiceMonitor:
|
||||
enabled: true
|
||||
|
||||
prometheusSpec:
|
||||
# HA: 2 replicas, preferably scheduled on different nodes (soft anti-affinity, not guaranteed)
|
||||
replicas: 2
|
||||
replicaExternalLabelName: prometheus_replica
|
||||
|
||||
# Pod anti-affinity for HA
|
||||
affinity:
|
||||
podAntiAffinity:
|
||||
preferredDuringSchedulingIgnoredDuringExecution:
|
||||
- weight: 100
|
||||
podAffinityTerm:
|
||||
labelSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: prometheus
|
||||
topologyKey: kubernetes.io/hostname
|
||||
|
||||
scrapeInterval: 60s # 30s → 60s (reduce memory usage)
|
||||
evaluationInterval: 60s # 30s → 60s
|
||||
retention: 3d # 7d → 3d (reduce memory usage)
|
||||
retention: 3d # Local retention (S3 has longer retention via Thanos)
|
||||
|
||||
# Thanos Sidecar configuration
|
||||
thanos:
|
||||
image: quay.io/thanos/thanos:v0.37.2
|
||||
objectStorageConfig:
|
||||
existingSecret:
|
||||
name: thanos-objstore-secret
|
||||
key: objstore.yml
|
||||
|
||||
storageSpec:
|
||||
volumeClaimTemplate:
|
||||
|
||||
@@ -16,3 +16,36 @@ spec:
|
||||
remoteRef:
|
||||
key: postgresql
|
||||
property: PASSWORD
|
||||
---
|
||||
apiVersion: external-secrets.io/v1
|
||||
kind: ExternalSecret
|
||||
metadata:
|
||||
name: thanos-objstore-secret
|
||||
namespace: prometheus
|
||||
spec:
|
||||
refreshInterval: 1h
|
||||
secretStoreRef:
|
||||
name: vault-backend
|
||||
kind: ClusterSecretStore
|
||||
target:
|
||||
name: thanos-objstore-secret
|
||||
template:
|
||||
engineVersion: v2
|
||||
data:
|
||||
objstore.yml: |
|
||||
type: S3
|
||||
config:
|
||||
bucket: thanos
|
||||
endpoint: minio.minio.svc.cluster.local:9000
|
||||
access_key: {{ .access_key }}
|
||||
secret_key: {{ .secret_key }}
|
||||
insecure: true
|
||||
data:
|
||||
- secretKey: access_key
|
||||
remoteRef:
|
||||
key: secret/minio
|
||||
property: root-user
|
||||
- secretKey: secret_key
|
||||
remoteRef:
|
||||
key: secret/minio
|
||||
property: root-password
|
||||
|
||||
45
thanos/argocd.yaml
Normal file
45
thanos/argocd.yaml
Normal file
@@ -0,0 +1,45 @@
|
||||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
metadata:
|
||||
name: thanos
|
||||
namespace: argocd
|
||||
finalizers:
|
||||
- resources-finalizer.argocd.argoproj.io
|
||||
spec:
|
||||
project: default
|
||||
sources:
|
||||
- repoURL: https://charts.bitnami.com/bitnami
|
||||
chart: thanos
|
||||
targetRevision: 17.3.1
|
||||
helm:
|
||||
valueFiles:
|
||||
- $values/thanos/helm-values.yaml
|
||||
- repoURL: https://github.com/K3S-HOME/observability.git
|
||||
targetRevision: main
|
||||
ref: values
|
||||
- repoURL: https://github.com/K3S-HOME/observability.git
|
||||
targetRevision: main
|
||||
path: thanos/manifests
|
||||
destination:
|
||||
server: https://kubernetes.default.svc
|
||||
namespace: thanos
|
||||
syncPolicy:
|
||||
automated:
|
||||
prune: true
|
||||
selfHeal: true
|
||||
allowEmpty: false
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
- PrunePropagationPolicy=foreground
|
||||
- PruneLast=true
|
||||
retry:
|
||||
limit: 5
|
||||
backoff:
|
||||
duration: 5s
|
||||
factor: 2
|
||||
maxDuration: 3m
|
||||
managedNamespaceMetadata:
|
||||
labels:
|
||||
goldilocks.fairwinds.com/enabled: 'true'
|
||||
minio-s3: enabled
|
||||
revisionHistoryLimit: 10
|
||||
117
thanos/helm-values.yaml
Normal file
117
thanos/helm-values.yaml
Normal file
@@ -0,0 +1,117 @@
|
||||
# Thanos Helm Values
|
||||
# Chart: https://github.com/bitnami/charts/tree/main/bitnami/thanos
|
||||
#
|
||||
# Architecture:
|
||||
# - Prometheus (prometheus namespace) + Sidecar → uploads to MinIO
|
||||
# - Query: queries Sidecar + Store Gateway, deduplicates data
|
||||
# - Store Gateway: reads historical data from MinIO
|
||||
# - Compactor: compacts and downsamples data in MinIO
|
||||
|
||||
# Object storage configuration (MinIO S3)
|
||||
# Uses secret created by ExternalSecret
|
||||
existingObjstoreSecret: thanos-objstore-secret
|
||||
|
||||
# =============================================================================
|
||||
# Query - Main query endpoint (Grafana connects here)
|
||||
# =============================================================================
|
||||
query:
|
||||
enabled: true
|
||||
replicaCount: 1
|
||||
|
||||
# Discover the Prometheus Thanos sidecars via DNS (service in the prometheus namespace)
|
||||
dnsDiscovery:
|
||||
enabled: true
|
||||
sidecarsService: prometheus-kube-prometheus-thanos-discovery
|
||||
sidecarsNamespace: prometheus
|
||||
|
||||
# Store endpoints for historical data
|
||||
stores:
|
||||
- dnssrv+_grpc._tcp.thanos-storegateway.thanos.svc.cluster.local
|
||||
|
||||
# Deduplication settings
|
||||
extraFlags:
|
||||
- --query.replica-label=prometheus_replica
|
||||
- --query.auto-downsampling
|
||||
|
||||
resources:
|
||||
requests:
|
||||
cpu: 15m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
memory: 256Mi
|
||||
|
||||
# =============================================================================
|
||||
# Query Frontend - Caching layer for Query (optional, disabled for small cluster)
|
||||
# =============================================================================
|
||||
queryFrontend:
|
||||
enabled: false
|
||||
|
||||
# =============================================================================
|
||||
# Store Gateway - Reads historical data from S3
|
||||
# =============================================================================
|
||||
storegateway:
|
||||
enabled: true
|
||||
replicaCount: 1
|
||||
|
||||
resources:
|
||||
requests:
|
||||
cpu: 15m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
memory: 512Mi
|
||||
|
||||
persistence:
|
||||
enabled: true
|
||||
storageClass: local-path
|
||||
size: 2Gi
|
||||
|
||||
# =============================================================================
|
||||
# Compactor - Compacts and downsamples data in S3
|
||||
# =============================================================================
|
||||
compactor:
|
||||
enabled: true
|
||||
|
||||
# Retention settings
|
||||
retentionResolutionRaw: 7d # Keep raw data for 7 days
|
||||
retentionResolution5m: 30d # Keep 5m downsampled for 30 days
|
||||
retentionResolution1h: 90d # Keep 1h downsampled for 90 days
|
||||
|
||||
resources:
|
||||
requests:
|
||||
cpu: 15m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
memory: 512Mi
|
||||
|
||||
persistence:
|
||||
enabled: true
|
||||
storageClass: local-path
|
||||
size: 2Gi
|
||||
|
||||
# =============================================================================
|
||||
# Ruler - Alerting rules evaluation (disabled, using Prometheus rules)
|
||||
# =============================================================================
|
||||
ruler:
|
||||
enabled: false
|
||||
|
||||
# =============================================================================
|
||||
# Receive - Remote write endpoint (disabled, using Sidecar)
|
||||
# =============================================================================
|
||||
receive:
|
||||
enabled: false
|
||||
|
||||
# =============================================================================
|
||||
# Sidecar - Disabled here, enabled in Prometheus helm-values
|
||||
# =============================================================================
|
||||
# The sidecar is deployed alongside Prometheus via kube-prometheus-stack
|
||||
|
||||
# =============================================================================
|
||||
# Metrics
|
||||
# =============================================================================
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
namespace: thanos
|
||||
labels:
|
||||
release: prometheus
|
||||
5
thanos/kustomization.yaml
Normal file
5
thanos/kustomization.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
resources:
|
||||
- argocd.yaml
|
||||
32
thanos/manifests/secret.yaml
Normal file
32
thanos/manifests/secret.yaml
Normal file
@@ -0,0 +1,32 @@
|
||||
apiVersion: external-secrets.io/v1
|
||||
kind: ExternalSecret
|
||||
metadata:
|
||||
name: thanos-objstore-secret
|
||||
namespace: thanos
|
||||
spec:
|
||||
refreshInterval: 1h
|
||||
secretStoreRef:
|
||||
name: vault-backend
|
||||
kind: ClusterSecretStore
|
||||
target:
|
||||
name: thanos-objstore-secret
|
||||
template:
|
||||
engineVersion: v2
|
||||
data:
|
||||
objstore.yml: |
|
||||
type: S3
|
||||
config:
|
||||
bucket: thanos
|
||||
endpoint: minio.minio.svc.cluster.local:9000
|
||||
access_key: {{ .access_key }}
|
||||
secret_key: {{ .secret_key }}
|
||||
insecure: true
|
||||
data:
|
||||
- secretKey: access_key
|
||||
remoteRef:
|
||||
key: secret/minio
|
||||
property: root-user
|
||||
- secretKey: secret_key
|
||||
remoteRef:
|
||||
key: secret/minio
|
||||
property: root-password
|
||||
Reference in New Issue
Block a user