FEAT(thanos): add Thanos for Prometheus HA and long-term storage

- Add Thanos Query, Store Gateway, Compactor
- Enable Prometheus Sidecar with S3 (MinIO) storage
- Configure Prometheus replicas: 2 with pod anti-affinity
- Add ExternalSecrets for MinIO credentials
- Retention: raw 7d, 5m downsampled 30d, 1h downsampled 90d
This commit is contained in:
2026-01-08 20:21:37 +09:00
parent 9f3b768cd9
commit 6b576d6a16
7 changed files with 264 additions and 2 deletions

45
thanos/argocd.yaml Normal file
View File

@@ -0,0 +1,45 @@
---
# Argo CD Application "thanos" (lives in the argocd namespace).
# Combines three sources: the Bitnami Thanos chart, a git ref that supplies
# the Helm values file, and a git path with additional manifests.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: thanos
  namespace: argocd
  finalizers:
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default
  sources:
    # 1) Helm chart; values are read from the "$values" ref declared below.
    - repoURL: https://charts.bitnami.com/bitnami
      chart: thanos
      targetRevision: 17.3.1
      helm:
        valueFiles:
          - $values/thanos/helm-values.yaml
    # 2) Git repository exposed as "$values" (no path set on this source).
    - repoURL: https://github.com/K3S-HOME/observability.git
      targetRevision: main
      ref: values
    # 3) Same repository, synced from thanos/manifests.
    #    NOTE(review): presumably holds the objstore ExternalSecret - confirm.
    - repoURL: https://github.com/K3S-HOME/observability.git
      targetRevision: main
      path: thanos/manifests
  destination:
    server: https://kubernetes.default.svc
    namespace: thanos
  syncPolicy:
    automated:
      prune: true
      selfHeal: true
      allowEmpty: false
    syncOptions:
      - CreateNamespace=true
      - PrunePropagationPolicy=foreground
      - PruneLast=true
    retry:
      limit: 5
      backoff:
        duration: 5s
        factor: 2
        maxDuration: 3m
    # Labels applied to the namespace Argo CD creates for this Application.
    managedNamespaceMetadata:
      labels:
        goldilocks.fairwinds.com/enabled: 'true'
        minio-s3: enabled
  revisionHistoryLimit: 10

117
thanos/helm-values.yaml Normal file
View File

@@ -0,0 +1,117 @@
---
# Values for the Bitnami Thanos chart
# (https://github.com/bitnami/charts/tree/main/bitnami/thanos).
#
# Data flow:
#   - Prometheus (namespace "prometheus") + Sidecar -> uploads blocks to MinIO
#   - Query         -> fans out to Sidecar and Store Gateway, deduplicates
#   - Store Gateway -> serves historical data read from MinIO
#   - Compactor     -> compacts and downsamples the data stored in MinIO

# Object storage (MinIO, S3 API): use the Secret created by the ExternalSecret.
existingObjstoreSecret: thanos-objstore-secret

# -----------------------------------------------------------------------------
# Query: main query endpoint (Grafana connects here)
# -----------------------------------------------------------------------------
query:
  enabled: true
  replicaCount: 1
  # Locate the Prometheus sidecars through their discovery Service.
  dnsDiscovery:
    enabled: true
    sidecarsService: prometheus-kube-prometheus-thanos-discovery
    sidecarsNamespace: prometheus
  # Store endpoint for historical data (Store Gateway, via DNS SRV lookup).
  # NOTE(review): the chart's dnsDiscovery may already register the Store
  # Gateway; check whether this entry is redundant.
  stores:
    - dnssrv+_grpc._tcp.thanos-storegateway.thanos.svc.cluster.local
  # Deduplicate series from multiple Prometheus replicas and let Query pick
  # downsampled data automatically.
  extraFlags:
    - --query.replica-label=prometheus_replica
    - --query.auto-downsampling
  resources:
    requests:
      cpu: 15m
      memory: 128Mi
    limits:
      memory: 256Mi

# -----------------------------------------------------------------------------
# Query Frontend: caching layer for Query (optional, off for a small cluster)
# -----------------------------------------------------------------------------
queryFrontend:
  enabled: false

# -----------------------------------------------------------------------------
# Store Gateway: reads historical data from S3
# -----------------------------------------------------------------------------
storegateway:
  enabled: true
  replicaCount: 1
  resources:
    requests:
      cpu: 15m
      memory: 128Mi
    limits:
      memory: 512Mi
  persistence:
    enabled: true
    storageClass: local-path
    size: 2Gi

# -----------------------------------------------------------------------------
# Compactor: compacts and downsamples data in S3
# -----------------------------------------------------------------------------
compactor:
  enabled: true
  # Retention per resolution.
  # NOTE(review): Thanos docs advise care when retentions differ between
  # resolutions - confirm these values are intentional.
  retentionResolutionRaw: 7d  # raw samples kept for 7 days
  retentionResolution5m: 30d  # 5m downsampled data kept for 30 days
  retentionResolution1h: 90d  # 1h downsampled data kept for 90 days
  resources:
    requests:
      cpu: 15m
      memory: 128Mi
    limits:
      memory: 512Mi
  persistence:
    enabled: true
    storageClass: local-path
    size: 2Gi

# -----------------------------------------------------------------------------
# Ruler: alerting rule evaluation (off; Prometheus evaluates the rules)
# -----------------------------------------------------------------------------
ruler:
  enabled: false

# -----------------------------------------------------------------------------
# Receive: remote-write endpoint (off; the Sidecar model is used instead)
# -----------------------------------------------------------------------------
receive:
  enabled: false

# -----------------------------------------------------------------------------
# Sidecar: not configured here. It runs next to Prometheus and is enabled in
# the Prometheus helm-values (kube-prometheus-stack).
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# Metrics
# -----------------------------------------------------------------------------
metrics:
  enabled: true
  serviceMonitor:
    enabled: true
    namespace: thanos
    labels:
      release: prometheus

View File

@@ -0,0 +1,5 @@
---
# Kustomization that includes a single resource file, argocd.yaml.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - argocd.yaml

View File

@@ -0,0 +1,32 @@
---
# ExternalSecret that renders the Thanos object-storage configuration
# (objstore.yml) into the Secret "thanos-objstore-secret" in the thanos
# namespace. Credentials are pulled from the ClusterSecretStore
# "vault-backend" and refreshed every hour.
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: thanos-objstore-secret
  namespace: thanos
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: vault-backend
    kind: ClusterSecretStore
  target:
    name: thanos-objstore-secret
    template:
      engineVersion: v2
      data:
        # The credentials are piped through `quote` so that values containing
        # YAML-significant characters (e.g. '#', ': ', a leading '*' or '!')
        # or values that look like booleans/numbers still render as valid
        # YAML strings instead of corrupting the generated config.
        objstore.yml: |
          type: S3
          config:
            bucket: thanos
            endpoint: minio.minio.svc.cluster.local:9000
            access_key: {{ .access_key | quote }}
            secret_key: {{ .secret_key | quote }}
            insecure: true
  # NOTE(review): the properties below are named root-user/root-password,
  # which looks like the MinIO root account - consider a dedicated,
  # bucket-scoped user for Thanos.
  data:
    - secretKey: access_key
      remoteRef:
        key: secret/minio
        property: root-user
    - secretKey: secret_key
      remoteRef:
        key: secret/minio
        property: root-password