FEAT(thanos): add Thanos for Prometheus HA and long-term storage

- Add Thanos Query, Store Gateway, Compactor
- Enable Prometheus Sidecar with S3 (MinIO) storage
- Configure Prometheus replicas: 2 with pod anti-affinity
- Add ExternalSecrets for MinIO credentials
- Retention in object storage (enforced by the Compactor): raw 7d, 5m downsampled 30d, 1h downsampled 90d; local Prometheus retention stays at 3d
This commit is contained in:
2026-01-08 20:21:37 +09:00
parent 9f3b768cd9
commit 6b576d6a16
7 changed files with 264 additions and 2 deletions

View File

@@ -4,6 +4,7 @@ kind: Kustomization
resources:
- application.yaml
- prometheus/argocd.yaml
- thanos/argocd.yaml
- alertmanager/argocd.yaml
- grafana/argocd.yaml
- loki/argocd.yaml

View File

@@ -37,11 +37,40 @@ kubelet:
# Prometheus
prometheus:
enabled: true
# Thanos Sidecar - for long-term storage and HA
thanosService:
enabled: true
thanosServiceMonitor:
enabled: true
prometheusSpec:
# HA: run 2 Prometheus replicas. The anti-affinity rule below is only a
# preference, so placement on different worker nodes is attempted but not
# guaranteed.
replicas: 2
# External label that tells the replicas apart. The Thanos Query values in
# this change deduplicate on the same name
# (--query.replica-label=prometheus_replica), so the two must stay in sync.
replicaExternalLabelName: prometheus_replica
# Soft pod anti-affinity for HA: prefer not to co-locate pods labelled
# app.kubernetes.io/name=prometheus on the same kubernetes.io/hostname.
# NOTE(review): switch to requiredDuringSchedulingIgnoredDuringExecution if
# the replicas must never share a node - confirm the cluster has enough nodes.
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchLabels:
app.kubernetes.io/name: prometheus
topologyKey: kubernetes.io/hostname
# Scrape and rule-evaluation intervals were raised from 30s to 60s to reduce
# memory usage.
scrapeInterval: 60s
evaluationInterval: 60s
# Local retention only (previously reduced from 7d to 3d to save memory).
# Longer retention is provided by S3 object storage via Thanos.
# This key must appear exactly once: duplicate mapping keys are invalid YAML
# and parsers either reject them or silently keep the last value.
retention: 3d
# Thanos Sidecar configuration
thanos:
image: quay.io/thanos/thanos:v0.37.2
objectStorageConfig:
existingSecret:
name: thanos-objstore-secret
key: objstore.yml
storageSpec:
volumeClaimTemplate:

View File

@@ -16,3 +16,36 @@ spec:
remoteRef:
key: postgresql
property: PASSWORD
---
# ExternalSecret that renders the Thanos object-storage config (objstore.yml)
# for the Prometheus Thanos sidecar in the `prometheus` namespace.
# The two values listed under spec.data are read from the `vault-backend`
# ClusterSecretStore and substituted into the template below.
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: thanos-objstore-secret
  namespace: prometheus
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: vault-backend
    kind: ClusterSecretStore
  target:
    name: thanos-objstore-secret
    template:
      engineVersion: v2
      data:
        # The credentials are piped through `quote` so the rendered file is
        # valid YAML, and stays a string, even when a value contains
        # characters such as `#`, `: `, a leading `*`/`&`, or looks numeric.
        # `insecure: true` means plain HTTP to the in-cluster MinIO endpoint.
        objstore.yml: |
          type: S3
          config:
            bucket: thanos
            endpoint: minio.minio.svc.cluster.local:9000
            access_key: {{ .access_key | quote }}
            secret_key: {{ .secret_key | quote }}
            insecure: true
  # NOTE(review): these are the MinIO root credentials (root-user /
  # root-password). Consider a dedicated access key limited to the `thanos`
  # bucket - confirm with the MinIO owner.
  data:
    - secretKey: access_key
      remoteRef:
        key: secret/minio
        property: root-user
    - secretKey: secret_key
      remoteRef:
        key: secret/minio
        property: root-password

45
thanos/argocd.yaml Normal file
View File

@@ -0,0 +1,45 @@
# Argo CD Application that deploys Thanos into the `thanos` namespace.
# It combines three sources: the Bitnami Thanos chart, this repository as a
# `values` ref for the chart's value file, and the plain manifests under
# thanos/manifests.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: thanos
  namespace: argocd
  finalizers:
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default
  sources:
    # Helm chart; values are resolved through the `values` ref declared below.
    - repoURL: https://charts.bitnami.com/bitnami
      chart: thanos
      targetRevision: 17.3.1
      helm:
        valueFiles:
          - $values/thanos/helm-values.yaml
    # Git source that only provides the `$values` reference.
    - repoURL: https://github.com/K3S-HOME/observability.git
      targetRevision: main
      ref: values
    # Additional raw manifests synced alongside the chart.
    - repoURL: https://github.com/K3S-HOME/observability.git
      targetRevision: main
      path: thanos/manifests
  destination:
    server: https://kubernetes.default.svc
    namespace: thanos
  syncPolicy:
    automated:
      prune: true
      selfHeal: true
      allowEmpty: false
    syncOptions:
      - CreateNamespace=true
      - PrunePropagationPolicy=foreground
      - PruneLast=true
    # Retry a failed sync up to 5 times with exponential backoff
    # (5s initial delay, doubling, capped at 3m).
    retry:
      limit: 5
      backoff:
        duration: 5s
        factor: 2
        maxDuration: 3m
    # Labels set on the namespace that Argo CD creates (CreateNamespace=true).
    # NOTE(review): `minio-s3: enabled` presumably opts the namespace into
    # MinIO access via a policy defined elsewhere - confirm.
    managedNamespaceMetadata:
      labels:
        goldilocks.fairwinds.com/enabled: 'true'
        minio-s3: enabled
  revisionHistoryLimit: 10

117
thanos/helm-values.yaml Normal file
View File

@@ -0,0 +1,117 @@
# Thanos Helm Values
# Chart: https://github.com/bitnami/charts/tree/main/bitnami/thanos
#
# Architecture:
# - Prometheus (prometheus namespace) + Sidecar → uploads to MinIO
# - Query: queries Sidecar + Store Gateway, deduplicates data
# - Store Gateway: reads historical data from MinIO
# - Compactor: compacts and downsamples data in MinIO
# Object storage configuration (MinIO S3)
# Name of an existing Secret holding the object-storage config. It is created
# by the ExternalSecret `thanos-objstore-secret` in the `thanos` namespace,
# which renders the config under the key `objstore.yml`.
existingObjstoreSecret: thanos-objstore-secret
# =============================================================================
# Query - Main query endpoint (Grafana connects here)
# =============================================================================
query:
  enabled: true
  replicaCount: 1

  # Locate the Prometheus Thanos sidecars through DNS, using the discovery
  # service in the `prometheus` namespace.
  dnsDiscovery:
    enabled: true
    sidecarsService: prometheus-kube-prometheus-thanos-discovery
    sidecarsNamespace: prometheus

  # Extra store endpoint for historical data (Store Gateway, via DNS SRV).
  # NOTE(review): the chart may already register its own Store Gateway when
  # storegateway.enabled is true, which would make this entry redundant -
  # confirm against the chart's query deployment template.
  stores:
    - dnssrv+_grpc._tcp.thanos-storegateway.thanos.svc.cluster.local

  extraFlags:
    # Deduplicate series coming from the Prometheus replicas; the label name
    # must match replicaExternalLabelName in the Prometheus values.
    - --query.replica-label=prometheus_replica
    # Enable automatic downsampling on queries.
    - --query.auto-downsampling

  resources:
    requests:
      cpu: 15m
      memory: 128Mi
    limits:
      memory: 256Mi
# =============================================================================
# Query Frontend - optional caching layer in front of Query.
# Turned off: not needed for a small cluster.
# =============================================================================
queryFrontend:
  enabled: false
# =============================================================================
# Store Gateway - Serves historical data read from S3 object storage
# =============================================================================
storegateway:
  enabled: true
  replicaCount: 1

  # Memory is capped; no CPU limit is set.
  resources:
    requests:
      cpu: 15m
      memory: 128Mi
    limits:
      memory: 512Mi

  # Persistent volume for the Store Gateway's local data.
  persistence:
    enabled: true
    storageClass: local-path
    size: 2Gi
# =============================================================================
# Compactor - Compacts and downsamples data in S3, and applies retention
# =============================================================================
compactor:
  enabled: true

  # Retention per resolution in object storage
  retentionResolutionRaw: 7d   # raw samples: 7 days
  retentionResolution5m: 30d   # 5m downsampled: 30 days
  retentionResolution1h: 90d   # 1h downsampled: 90 days

  # Memory is capped; no CPU limit is set.
  resources:
    requests:
      cpu: 15m
      memory: 128Mi
    limits:
      memory: 512Mi

  # Persistent volume used as the Compactor's local working space.
  persistence:
    enabled: true
    storageClass: local-path
    size: 2Gi
# =============================================================================
# Ruler - Alerting rule evaluation.
# Turned off: rules are evaluated by Prometheus itself.
# =============================================================================
ruler:
  enabled: false

# =============================================================================
# Receive - Remote-write endpoint.
# Turned off: data reaches object storage through the Sidecar instead.
# =============================================================================
receive:
  enabled: false
# =============================================================================
# Sidecar - Disabled here, enabled in Prometheus helm-values
# =============================================================================
# The sidecar is deployed alongside Prometheus via kube-prometheus-stack
# =============================================================================
# Metrics
# =============================================================================
metrics:
enabled: true
serviceMonitor:
enabled: true
namespace: thanos
labels:
release: prometheus

View File

@@ -0,0 +1,5 @@
# Kustomization for the thanos directory: its only resource is the Argo CD
# Application manifest.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - argocd.yaml

View File

@@ -0,0 +1,32 @@
# ExternalSecret that renders the Thanos object-storage config (objstore.yml)
# in the `thanos` namespace; the Thanos Helm values reference the resulting
# Secret through `existingObjstoreSecret`.
# The two values listed under spec.data are read from the `vault-backend`
# ClusterSecretStore and substituted into the template below.
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: thanos-objstore-secret
  namespace: thanos
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: vault-backend
    kind: ClusterSecretStore
  target:
    name: thanos-objstore-secret
    template:
      engineVersion: v2
      data:
        # The credentials are piped through `quote` so the rendered file is
        # valid YAML, and stays a string, even when a value contains
        # characters such as `#`, `: `, a leading `*`/`&`, or looks numeric.
        # `insecure: true` means plain HTTP to the in-cluster MinIO endpoint.
        objstore.yml: |
          type: S3
          config:
            bucket: thanos
            endpoint: minio.minio.svc.cluster.local:9000
            access_key: {{ .access_key | quote }}
            secret_key: {{ .secret_key | quote }}
            insecure: true
  # NOTE(review): these are the MinIO root credentials (root-user /
  # root-password). Consider a dedicated access key limited to the `thanos`
  # bucket - confirm with the MinIO owner.
  data:
    - secretKey: access_key
      remoteRef:
        key: secret/minio
        property: root-user
    - secretKey: secret_key
      remoteRef:
        key: secret/minio
        property: root-password