FEAT(thanos): add Thanos for Prometheus HA and long-term storage
- Add Thanos Query, Store Gateway, Compactor
- Enable Prometheus Sidecar with S3 (MinIO) storage
- Configure Prometheus replicas: 2 with pod anti-affinity
- Add ExternalSecrets for MinIO credentials
- Retention: raw 7d, 5m downsampled 30d, 1h downsampled 90d
This commit is contained in:
@@ -4,6 +4,7 @@ kind: Kustomization
|
|||||||
resources:
|
resources:
|
||||||
- application.yaml
|
- application.yaml
|
||||||
- prometheus/argocd.yaml
|
- prometheus/argocd.yaml
|
||||||
|
- thanos/argocd.yaml
|
||||||
- alertmanager/argocd.yaml
|
- alertmanager/argocd.yaml
|
||||||
- grafana/argocd.yaml
|
- grafana/argocd.yaml
|
||||||
- loki/argocd.yaml
|
- loki/argocd.yaml
|
||||||
|
|||||||
@@ -37,11 +37,40 @@ kubelet:
|
|||||||
# Prometheus
|
# Prometheus
|
||||||
prometheus:
|
prometheus:
|
||||||
enabled: true
|
enabled: true
|
||||||
|
|
||||||
|
# Thanos Sidecar - for long-term storage and HA
|
||||||
|
thanosService:
|
||||||
|
enabled: true
|
||||||
|
thanosServiceMonitor:
|
||||||
|
enabled: true
|
||||||
|
|
||||||
prometheusSpec:
|
prometheusSpec:
|
||||||
|
# HA: 2 replicas on different worker nodes
|
||||||
|
replicas: 2
|
||||||
|
replicaExternalLabelName: prometheus_replica
|
||||||
|
|
||||||
|
# Pod anti-affinity for HA
|
||||||
|
affinity:
|
||||||
|
podAntiAffinity:
|
||||||
|
preferredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
- weight: 100
|
||||||
|
podAffinityTerm:
|
||||||
|
labelSelector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/name: prometheus
|
||||||
|
topologyKey: kubernetes.io/hostname
|
||||||
|
|
||||||
scrapeInterval: 60s # 30s → 60s (메모리 절감)
|
scrapeInterval: 60s # 30s → 60s (메모리 절감)
|
||||||
evaluationInterval: 60s # 30s → 60s
|
evaluationInterval: 60s # 30s → 60s
|
||||||
retention: 3d # 7d → 3d (메모리 절감)
|
retention: 3d # Local retention (S3 has longer retention via Thanos)
|
||||||
|
|
||||||
|
# Thanos Sidecar configuration
|
||||||
|
thanos:
|
||||||
|
image: quay.io/thanos/thanos:v0.37.2
|
||||||
|
objectStorageConfig:
|
||||||
|
existingSecret:
|
||||||
|
name: thanos-objstore-secret
|
||||||
|
key: objstore.yml
|
||||||
|
|
||||||
storageSpec:
|
storageSpec:
|
||||||
volumeClaimTemplate:
|
volumeClaimTemplate:
|
||||||
|
|||||||
@@ -16,3 +16,36 @@ spec:
|
|||||||
remoteRef:
|
remoteRef:
|
||||||
key: postgresql
|
key: postgresql
|
||||||
property: PASSWORD
|
property: PASSWORD
|
||||||
|
---
|
||||||
|
apiVersion: external-secrets.io/v1
|
||||||
|
kind: ExternalSecret
|
||||||
|
metadata:
|
||||||
|
name: thanos-objstore-secret
|
||||||
|
namespace: prometheus
|
||||||
|
spec:
|
||||||
|
refreshInterval: 1h
|
||||||
|
secretStoreRef:
|
||||||
|
name: vault-backend
|
||||||
|
kind: ClusterSecretStore
|
||||||
|
target:
|
||||||
|
name: thanos-objstore-secret
|
||||||
|
template:
|
||||||
|
engineVersion: v2
|
||||||
|
data:
|
||||||
|
objstore.yml: |
|
||||||
|
type: S3
|
||||||
|
config:
|
||||||
|
bucket: thanos
|
||||||
|
endpoint: minio.minio.svc.cluster.local:9000
|
||||||
|
access_key: {{ .access_key }}
|
||||||
|
secret_key: {{ .secret_key }}
|
||||||
|
insecure: true
|
||||||
|
data:
|
||||||
|
- secretKey: access_key
|
||||||
|
remoteRef:
|
||||||
|
key: secret/minio
|
||||||
|
property: root-user
|
||||||
|
- secretKey: secret_key
|
||||||
|
remoteRef:
|
||||||
|
key: secret/minio
|
||||||
|
property: root-password
|
||||||
|
|||||||
45
thanos/argocd.yaml
Normal file
45
thanos/argocd.yaml
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
apiVersion: argoproj.io/v1alpha1
|
||||||
|
kind: Application
|
||||||
|
metadata:
|
||||||
|
name: thanos
|
||||||
|
namespace: argocd
|
||||||
|
finalizers:
|
||||||
|
- resources-finalizer.argocd.argoproj.io
|
||||||
|
spec:
|
||||||
|
project: default
|
||||||
|
sources:
|
||||||
|
- repoURL: https://charts.bitnami.com/bitnami
|
||||||
|
chart: thanos
|
||||||
|
targetRevision: 17.3.1
|
||||||
|
helm:
|
||||||
|
valueFiles:
|
||||||
|
- $values/thanos/helm-values.yaml
|
||||||
|
- repoURL: https://github.com/K3S-HOME/observability.git
|
||||||
|
targetRevision: main
|
||||||
|
ref: values
|
||||||
|
- repoURL: https://github.com/K3S-HOME/observability.git
|
||||||
|
targetRevision: main
|
||||||
|
path: thanos/manifests
|
||||||
|
destination:
|
||||||
|
server: https://kubernetes.default.svc
|
||||||
|
namespace: thanos
|
||||||
|
syncPolicy:
|
||||||
|
automated:
|
||||||
|
prune: true
|
||||||
|
selfHeal: true
|
||||||
|
allowEmpty: false
|
||||||
|
syncOptions:
|
||||||
|
- CreateNamespace=true
|
||||||
|
- PrunePropagationPolicy=foreground
|
||||||
|
- PruneLast=true
|
||||||
|
retry:
|
||||||
|
limit: 5
|
||||||
|
backoff:
|
||||||
|
duration: 5s
|
||||||
|
factor: 2
|
||||||
|
maxDuration: 3m
|
||||||
|
managedNamespaceMetadata:
|
||||||
|
labels:
|
||||||
|
goldilocks.fairwinds.com/enabled: 'true'
|
||||||
|
minio-s3: enabled
|
||||||
|
revisionHistoryLimit: 10
|
||||||
117
thanos/helm-values.yaml
Normal file
117
thanos/helm-values.yaml
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
# Thanos Helm Values
|
||||||
|
# Chart: https://github.com/bitnami/charts/tree/main/bitnami/thanos
|
||||||
|
#
|
||||||
|
# Architecture:
|
||||||
|
# - Prometheus (prometheus namespace) + Sidecar → uploads to MinIO
|
||||||
|
# - Query: queries Sidecar + Store Gateway, deduplicates data
|
||||||
|
# - Store Gateway: reads historical data from MinIO
|
||||||
|
# - Compactor: compacts and downsamples data in MinIO
|
||||||
|
|
||||||
|
# Object storage configuration (MinIO S3)
|
||||||
|
# Uses secret created by ExternalSecret
|
||||||
|
existingObjstoreSecret: thanos-objstore-secret
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Query - Main query endpoint (Grafana connects here)
|
||||||
|
# =============================================================================
|
||||||
|
query:
|
||||||
|
enabled: true
|
||||||
|
replicaCount: 1
|
||||||
|
|
||||||
|
# Deduplicate metrics from multiple Prometheus replicas
|
||||||
|
dnsDiscovery:
|
||||||
|
enabled: true
|
||||||
|
sidecarsService: prometheus-kube-prometheus-thanos-discovery
|
||||||
|
sidecarsNamespace: prometheus
|
||||||
|
|
||||||
|
# Store endpoints for historical data
|
||||||
|
stores:
|
||||||
|
- dnssrv+_grpc._tcp.thanos-storegateway.thanos.svc.cluster.local
|
||||||
|
|
||||||
|
# Deduplication settings
|
||||||
|
extraFlags:
|
||||||
|
- --query.replica-label=prometheus_replica
|
||||||
|
- --query.auto-downsampling
|
||||||
|
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 15m
|
||||||
|
memory: 128Mi
|
||||||
|
limits:
|
||||||
|
memory: 256Mi
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Query Frontend - Caching layer for Query (optional, disabled for small cluster)
|
||||||
|
# =============================================================================
|
||||||
|
queryFrontend:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Store Gateway - Reads historical data from S3
|
||||||
|
# =============================================================================
|
||||||
|
storegateway:
|
||||||
|
enabled: true
|
||||||
|
replicaCount: 1
|
||||||
|
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 15m
|
||||||
|
memory: 128Mi
|
||||||
|
limits:
|
||||||
|
memory: 512Mi
|
||||||
|
|
||||||
|
persistence:
|
||||||
|
enabled: true
|
||||||
|
storageClass: local-path
|
||||||
|
size: 2Gi
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Compactor - Compacts and downsamples data in S3
|
||||||
|
# =============================================================================
|
||||||
|
compactor:
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Retention settings
|
||||||
|
retentionResolutionRaw: 7d # Keep raw data for 7 days
|
||||||
|
retentionResolution5m: 30d # Keep 5m downsampled for 30 days
|
||||||
|
retentionResolution1h: 90d # Keep 1h downsampled for 90 days
|
||||||
|
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 15m
|
||||||
|
memory: 128Mi
|
||||||
|
limits:
|
||||||
|
memory: 512Mi
|
||||||
|
|
||||||
|
persistence:
|
||||||
|
enabled: true
|
||||||
|
storageClass: local-path
|
||||||
|
size: 2Gi
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Ruler - Alerting rules evaluation (disabled, using Prometheus rules)
|
||||||
|
# =============================================================================
|
||||||
|
ruler:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Receive - Remote write endpoint (disabled, using Sidecar)
|
||||||
|
# =============================================================================
|
||||||
|
receive:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Sidecar - Disabled here, enabled in Prometheus helm-values
|
||||||
|
# =============================================================================
|
||||||
|
# The sidecar is deployed alongside Prometheus via kube-prometheus-stack
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Metrics
|
||||||
|
# =============================================================================
|
||||||
|
metrics:
|
||||||
|
enabled: true
|
||||||
|
serviceMonitor:
|
||||||
|
enabled: true
|
||||||
|
namespace: thanos
|
||||||
|
labels:
|
||||||
|
release: prometheus
|
||||||
5
thanos/kustomization.yaml
Normal file
5
thanos/kustomization.yaml
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
|
kind: Kustomization
|
||||||
|
|
||||||
|
resources:
|
||||||
|
- argocd.yaml
|
||||||
32
thanos/manifests/secret.yaml
Normal file
32
thanos/manifests/secret.yaml
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
apiVersion: external-secrets.io/v1
|
||||||
|
kind: ExternalSecret
|
||||||
|
metadata:
|
||||||
|
name: thanos-objstore-secret
|
||||||
|
namespace: thanos
|
||||||
|
spec:
|
||||||
|
refreshInterval: 1h
|
||||||
|
secretStoreRef:
|
||||||
|
name: vault-backend
|
||||||
|
kind: ClusterSecretStore
|
||||||
|
target:
|
||||||
|
name: thanos-objstore-secret
|
||||||
|
template:
|
||||||
|
engineVersion: v2
|
||||||
|
data:
|
||||||
|
objstore.yml: |
|
||||||
|
type: S3
|
||||||
|
config:
|
||||||
|
bucket: thanos
|
||||||
|
endpoint: minio.minio.svc.cluster.local:9000
|
||||||
|
access_key: {{ .access_key }}
|
||||||
|
secret_key: {{ .secret_key }}
|
||||||
|
insecure: true
|
||||||
|
data:
|
||||||
|
- secretKey: access_key
|
||||||
|
remoteRef:
|
||||||
|
key: secret/minio
|
||||||
|
property: root-user
|
||||||
|
- secretKey: secret_key
|
||||||
|
remoteRef:
|
||||||
|
key: secret/minio
|
||||||
|
property: root-password
|
||||||
Reference in New Issue
Block a user