diff --git a/kustomization.yaml b/kustomization.yaml
index 0e6c10c..5dffbb5 100644
--- a/kustomization.yaml
+++ b/kustomization.yaml
@@ -4,6 +4,7 @@ kind: Kustomization
 resources:
   - application.yaml
   - prometheus/argocd.yaml
+  - thanos/argocd.yaml
   - alertmanager/argocd.yaml
   - grafana/argocd.yaml
   - loki/argocd.yaml
diff --git a/prometheus/helm-values.yaml b/prometheus/helm-values.yaml
index 3488801..80e5118 100644
--- a/prometheus/helm-values.yaml
+++ b/prometheus/helm-values.yaml
@@ -37,11 +37,40 @@ kubelet:
 # Prometheus
 prometheus:
   enabled: true
-  
+
+  # Thanos Sidecar - for long-term storage and HA
+  thanosService:
+    enabled: true
+  thanosServiceMonitor:
+    enabled: true
+
   prometheusSpec:
+    # HA: 2 replicas on different worker nodes
+    replicas: 2
+    replicaExternalLabelName: prometheus_replica
+
+    # Pod anti-affinity for HA
+    affinity:
+      podAntiAffinity:
+        preferredDuringSchedulingIgnoredDuringExecution:
+          - weight: 100
+            podAffinityTerm:
+              labelSelector:
+                matchLabels:
+                  app.kubernetes.io/name: prometheus
+              topologyKey: kubernetes.io/hostname
+
     scrapeInterval: 60s # 30s → 60s (메모리 절감)
     evaluationInterval: 60s # 30s → 60s
-    retention: 3d # 7d → 3d (메모리 절감)
+    retention: 3d  # Local retention (S3 has longer retention via Thanos)
+
+    # Thanos Sidecar configuration
+    thanos:
+      image: quay.io/thanos/thanos:v0.37.2
+      objectStorageConfig:
+        existingSecret:
+          name: thanos-objstore-secret
+          key: objstore.yml
 
     storageSpec:
       volumeClaimTemplate:
diff --git a/prometheus/manifests/secret.yaml b/prometheus/manifests/secret.yaml
index 29f8b4d..4325345 100644
--- a/prometheus/manifests/secret.yaml
+++ b/prometheus/manifests/secret.yaml
@@ -16,3 +16,39 @@ spec:
       remoteRef:
         key: postgresql
         property: PASSWORD
+---
+# Object-store config for the Thanos sidecar; keep in sync with thanos/manifests/secret.yaml.
+apiVersion: external-secrets.io/v1
+kind: ExternalSecret
+metadata:
+  name: thanos-objstore-secret
+  namespace: prometheus
+spec:
+  refreshInterval: 1h
+  secretStoreRef:
+    name: vault-backend
+    kind: ClusterSecretStore
+  target:
+    name: thanos-objstore-secret
+    template:
+      engineVersion: v2
+      data:
+        # Rendered Thanos object-store config; credentials are piped through
+        # `quote` so values containing YAML-special characters still parse.
+        objstore.yml: |
+          type: S3
+          config:
+            bucket: thanos
+            endpoint: minio.minio.svc.cluster.local:9000
+            access_key: {{ .access_key | quote }}
+            secret_key: {{ .secret_key | quote }}
+            insecure: true
+  data:
+    - secretKey: access_key
+      remoteRef:
+        key: secret/minio
+        property: root-user
+    - secretKey: secret_key
+      remoteRef:
+        key: secret/minio
+        property: root-password
diff --git a/thanos/argocd.yaml b/thanos/argocd.yaml
new file mode 100644
index 0000000..b888132
--- /dev/null
+++ b/thanos/argocd.yaml
@@ -0,0 +1,49 @@
+# Argo CD Application that deploys Thanos (Bitnami chart) into the thanos namespace.
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: thanos
+  namespace: argocd
+  finalizers:
+    - resources-finalizer.argocd.argoproj.io
+spec:
+  project: default
+  sources:
+    # Helm chart, configured by the values file read through the "values" ref below
+    - repoURL: https://charts.bitnami.com/bitnami
+      chart: thanos
+      targetRevision: 17.3.1
+      helm:
+        valueFiles:
+          - $values/thanos/helm-values.yaml
+    # Git repo exposed as $values so the chart can load thanos/helm-values.yaml
+    - repoURL: https://github.com/K3S-HOME/observability.git
+      targetRevision: main
+      ref: values
+    # Plain manifests from thanos/manifests (the object-store ExternalSecret)
+    - repoURL: https://github.com/K3S-HOME/observability.git
+      targetRevision: main
+      path: thanos/manifests
+  destination:
+    server: https://kubernetes.default.svc
+    namespace: thanos
+  syncPolicy:
+    automated:
+      prune: true
+      selfHeal: true
+      allowEmpty: false
+    syncOptions:
+      - CreateNamespace=true
+      - PrunePropagationPolicy=foreground
+      - PruneLast=true
+    retry:
+      limit: 5
+      backoff:
+        duration: 5s
+        factor: 2
+        maxDuration: 3m
+    managedNamespaceMetadata:
+      labels:
+        goldilocks.fairwinds.com/enabled: 'true'
+        minio-s3: enabled
+  revisionHistoryLimit: 10
diff --git a/thanos/helm-values.yaml b/thanos/helm-values.yaml
new file mode 100644
index 0000000..baf3ac8
--- /dev/null
+++ b/thanos/helm-values.yaml
@@ -0,0 +1,117 @@
+# Thanos Helm Values
+# Chart: https://github.com/bitnami/charts/tree/main/bitnami/thanos
+#
+# Architecture:
+# - Prometheus (prometheus namespace) + Sidecar → uploads to MinIO
+# - Query: queries Sidecar + Store Gateway, deduplicates data
+# - Store Gateway: reads historical data from MinIO
+# - Compactor: compacts and downsamples data in MinIO
+
+# Object storage configuration (MinIO S3)
+# Uses secret created by ExternalSecret
+existingObjstoreSecret: thanos-objstore-secret
+
+# =============================================================================
+# Query - Main query endpoint (Grafana connects here)
+# =============================================================================
+query:
+  enabled: true
+  replicaCount: 1
+
+  # Discover the Prometheus Thanos sidecars via DNS (Service in the prometheus namespace)
+  dnsDiscovery:
+    enabled: true
+    sidecarsService: prometheus-kube-prometheus-thanos-discovery
+    sidecarsNamespace: prometheus
+
+  # Store endpoints for historical data
+  stores:
+    - dnssrv+_grpc._tcp.thanos-storegateway.thanos.svc.cluster.local
+
+  # Deduplication: replica label must match replicaExternalLabelName set on Prometheus
+  extraFlags:
+    - --query.replica-label=prometheus_replica
+    - --query.auto-downsampling
+
+  resources:
+    requests:
+      cpu: 15m
+      memory: 128Mi
+    limits:
+      memory: 256Mi
+
+# =============================================================================
+# Query Frontend - Caching layer for Query (optional, disabled for small cluster)
+# =============================================================================
+queryFrontend:
+  enabled: false
+
+# =============================================================================
+# Store Gateway - Reads historical data from S3
+# =============================================================================
+storegateway:
+  enabled: true
+  replicaCount: 1
+
+  resources:
+    requests:
+      cpu: 15m
+      memory: 128Mi
+    limits:
+      memory: 512Mi
+
+  persistence:
+    enabled: true
+    storageClass: local-path
+    size: 2Gi
+
+# =============================================================================
+# Compactor - Compacts and downsamples data in S3
+# =============================================================================
+compactor:
+  enabled: true
+
+  # Retention per resolution. NOTE(review): confirm staggered retention is intended (see Thanos compactor docs)
+  retentionResolutionRaw: 7d  # Keep raw data for 7 days
+  retentionResolution5m: 30d  # Keep 5m downsampled for 30 days
+  retentionResolution1h: 90d  # Keep 1h downsampled for 90 days
+
+  resources:
+    requests:
+      cpu: 15m
+      memory: 128Mi
+    limits:
+      memory: 512Mi
+
+  persistence:
+    enabled: true
+    storageClass: local-path
+    size: 2Gi
+
+# =============================================================================
+# Ruler - Alerting rules evaluation (disabled, using Prometheus rules)
+# =============================================================================
+ruler:
+  enabled: false
+
+# =============================================================================
+# Receive - Remote write endpoint (disabled, using Sidecar)
+# =============================================================================
+receive:
+  enabled: false
+
+# =============================================================================
+# Sidecar - Disabled here, enabled in Prometheus helm-values
+# =============================================================================
+# The sidecar is deployed alongside Prometheus via kube-prometheus-stack
+
+# =============================================================================
+# Metrics
+# =============================================================================
+metrics:
+  enabled: true
+  serviceMonitor:
+    enabled: true
+    namespace: thanos
+    labels:
+      release: prometheus
diff --git a/thanos/kustomization.yaml b/thanos/kustomization.yaml
new file mode 100644
index 0000000..418e1ce
--- /dev/null
+++ b/thanos/kustomization.yaml
@@ -0,0 +1,5 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - argocd.yaml
diff --git a/thanos/manifests/secret.yaml b/thanos/manifests/secret.yaml
new file mode 100644
index 0000000..89a9ba5
--- /dev/null
+++ b/thanos/manifests/secret.yaml
@@ -0,0 +1,35 @@
+# Object-store config for the Thanos components; keep in sync with prometheus/manifests/secret.yaml.
+apiVersion: external-secrets.io/v1
+kind: ExternalSecret
+metadata:
+  name: thanos-objstore-secret
+  namespace: thanos
+spec:
+  refreshInterval: 1h
+  secretStoreRef:
+    name: vault-backend
+    kind: ClusterSecretStore
+  target:
+    name: thanos-objstore-secret
+    template:
+      engineVersion: v2
+      data:
+        # Rendered Thanos object-store config; credentials are piped through
+        # `quote` so values containing YAML-special characters still parse.
+        objstore.yml: |
+          type: S3
+          config:
+            bucket: thanos
+            endpoint: minio.minio.svc.cluster.local:9000
+            access_key: {{ .access_key | quote }}
+            secret_key: {{ .secret_key | quote }}
+            insecure: true
+  data:
+    - secretKey: access_key
+      remoteRef:
+        key: secret/minio
+        property: root-user
+    - secretKey: secret_key
+      remoteRef:
+        key: secret/minio
+        property: root-password