FEAT(tempo): configure S3 storage with MinIO

- Enable env var expansion in config
- Configure extraEnv for S3 credentials
- Fix OTel Collector image settings
This commit is contained in:
2026-01-09 13:22:16 +09:00
parent 7139f3e5a2
commit 5f926cb6cf
9 changed files with 459 additions and 0 deletions

33
application.yaml Normal file
View File

@@ -0,0 +1,33 @@
---
# App-of-Apps root: Argo CD Application that syncs every child Application
# manifest referenced by this repository's root kustomization.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: observability
  namespace: argocd
  finalizers:
    # Cascade-delete child resources when this Application is deleted.
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default
  source:
    repoURL: https://github.com/K3S-HOME/observability.git
    targetRevision: main
    path: .
  destination:
    server: https://kubernetes.default.svc
    namespace: argocd
  syncPolicy:
    automated:
      prune: false  # do not auto-delete child apps removed from git
      selfHeal: true
    syncOptions:
      - CreateNamespace=true
    retry:
      limit: 5
      backoff:
        duration: 5s
        factor: 2
        maxDuration: 3m
  revisionHistoryLimit: 10

View File

@@ -2,12 +2,17 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  # Self-reference for App of Apps pattern
  - application.yaml
  - prometheus/argocd.yaml
  - thanos/argocd.yaml
  - alertmanager/argocd.yaml
  - grafana/argocd.yaml
  - loki/argocd.yaml
  - promtail/argocd.yaml
  - tempo/argocd.yaml
  - opentelemetry-collector/argocd.yaml
  - node-exporter/argocd.yaml
  - kube-state-metrics/argocd.yaml
  - goldilocks/argocd.yaml

View File

@@ -0,0 +1,41 @@
---
# Argo CD Application for the OpenTelemetry Collector Helm chart.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: opentelemetry-collector
  namespace: argocd
  finalizers:
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default
  sources:
    # Upstream chart; values file is pulled from this repo via the $values ref.
    - repoURL: https://open-telemetry.github.io/opentelemetry-helm-charts
      chart: opentelemetry-collector
      targetRevision: 0.108.0
      helm:
        valueFiles:
          - $values/opentelemetry-collector/helm-values.yaml
    - repoURL: https://github.com/K3S-HOME/observability.git
      targetRevision: main
      ref: values
  destination:
    server: https://kubernetes.default.svc
    namespace: opentelemetry
  syncPolicy:
    automated:
      prune: true
      selfHeal: true
      allowEmpty: false
    syncOptions:
      - CreateNamespace=true
      - PrunePropagationPolicy=foreground
      - PruneLast=true
    retry:
      limit: 5
      backoff:
        duration: 5s
        factor: 2
        maxDuration: 3m
    managedNamespaceMetadata:
      labels:
        goldilocks.fairwinds.com/enabled: 'true'
  revisionHistoryLimit: 10

View File

@@ -0,0 +1,200 @@
---
# OpenTelemetry Collector Helm Values
# Chart: https://github.com/open-telemetry/opentelemetry-helm-charts
#
# Architecture:
#   - DaemonSet mode: one collector per node for efficient data collection
#   - OTLP receiver for traces, metrics, and logs
#   - Exports to: Tempo (traces), Prometheus (metrics), Loki (logs)
#
# Pipeline:
#   Applications -> OTel Collector -> Tempo/Prometheus/Loki -> Grafana

# =============================================================================
# Image Configuration
# =============================================================================
# contrib image is needed for k8sattributes/resourcedetection/loki components
image:
  repository: otel/opentelemetry-collector-contrib

# =============================================================================
# Deployment Mode
# =============================================================================
mode: daemonset

# =============================================================================
# Resource Limits (optimized for small cluster)
# =============================================================================
resources:
  requests:
    cpu: 25m
    memory: 64Mi
  limits:
    memory: 256Mi

# =============================================================================
# Tolerations (run on all nodes including master)
# =============================================================================
tolerations:
  - key: node-role.kubernetes.io/control-plane
    operator: Exists
    effect: NoSchedule

# =============================================================================
# Ports
# =============================================================================
ports:
  otlp:
    enabled: true
    containerPort: 4317
    servicePort: 4317
    hostPort: 4317
    protocol: TCP
  otlp-http:
    enabled: true
    containerPort: 4318
    servicePort: 4318
    hostPort: 4318
    protocol: TCP
  metrics:
    enabled: true
    containerPort: 8888
    servicePort: 8888
    protocol: TCP

# =============================================================================
# OpenTelemetry Collector Configuration
# =============================================================================
config:
  # Receivers - what data the collector accepts
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: 0.0.0.0:4317
        http:
          endpoint: 0.0.0.0:4318

  # Processors - how data is transformed
  processors:
    # Batch processor for efficient exports
    batch:
      timeout: 10s
      send_batch_size: 1024
      send_batch_max_size: 2048
    # Memory limiter to prevent OOM (listed first in every pipeline below)
    memory_limiter:
      check_interval: 5s
      limit_mib: 200
      spike_limit_mib: 50
    # Add Kubernetes metadata
    k8sattributes:
      extract:
        metadata:
          - k8s.namespace.name
          - k8s.deployment.name
          - k8s.pod.name
          - k8s.node.name
      passthrough: false
      pod_association:
        - sources:
            - from: resource_attribute
              name: k8s.pod.ip
        - sources:
            - from: resource_attribute
              name: k8s.pod.uid
        - sources:
            - from: connection
    # Resource detection
    resourcedetection:
      detectors: [env, system]
      timeout: 5s
      override: false

  # Exporters - where data goes
  exporters:
    # Tempo for traces
    otlp/tempo:
      endpoint: tempo.tempo.svc.cluster.local:4317
      tls:
        insecure: true
    # Prometheus remote write for metrics
    prometheusremotewrite:
      endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write
      tls:
        insecure: true
    # Loki for logs
    # NOTE(review): the dedicated loki exporter is deprecated in newer
    # collector-contrib releases in favor of otlphttp to Loki's OTLP
    # endpoint — confirm against the collector version in chart 0.108.0.
    loki:
      endpoint: http://loki.loki.svc.cluster.local:3100/loki/api/v1/push
      default_labels_enabled:
        exporter: true
        level: true
    # Debug exporter (for troubleshooting)
    debug:
      verbosity: basic

  # Extensions
  extensions:
    health_check:
      endpoint: 0.0.0.0:13133

  # Service pipelines
  service:
    extensions: [health_check]
    pipelines:
      # Traces pipeline
      traces:
        receivers: [otlp]
        processors: [memory_limiter, k8sattributes, resourcedetection, batch]
        exporters: [otlp/tempo]
      # Metrics pipeline
      metrics:
        receivers: [otlp]
        processors: [memory_limiter, k8sattributes, resourcedetection, batch]
        exporters: [prometheusremotewrite]
      # Logs pipeline
      logs:
        receivers: [otlp]
        processors: [memory_limiter, k8sattributes, resourcedetection, batch]
        exporters: [loki]

# =============================================================================
# Service Account
# =============================================================================
serviceAccount:
  create: true

# =============================================================================
# RBAC for k8sattributes processor
# =============================================================================
clusterRole:
  create: true
  rules:
    - apiGroups: [""]
      resources: ["pods", "namespaces", "nodes"]
      verbs: ["get", "watch", "list"]
    - apiGroups: ["apps"]
      resources: ["replicasets", "deployments"]
      verbs: ["get", "watch", "list"]

# =============================================================================
# ServiceMonitor for Prometheus
# =============================================================================
serviceMonitor:
  enabled: true
  metricsEndpoints:
    - port: metrics
  extraLabels:
    release: prometheus

# =============================================================================
# Pod Monitor for self-monitoring
# =============================================================================
podMonitor:
  enabled: false

View File

@@ -0,0 +1,5 @@
# Kustomization entry point for this component's directory; the parent
# (root) kustomization references this directory's argocd.yaml through it.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- argocd.yaml

45
tempo/argocd.yaml Normal file
View File

@@ -0,0 +1,45 @@
---
# Argo CD Application for Grafana Tempo (Helm chart + raw manifests).
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: tempo
  namespace: argocd
  finalizers:
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default
  sources:
    # Upstream chart; values file is pulled from this repo via the $values ref.
    - repoURL: https://grafana.github.io/helm-charts
      chart: tempo
      targetRevision: 1.17.0
      helm:
        valueFiles:
          - $values/tempo/helm-values.yaml
    - repoURL: https://github.com/K3S-HOME/observability.git
      targetRevision: main
      ref: values
    # Additional raw manifests synced from this repo.
    - repoURL: https://github.com/K3S-HOME/observability.git
      targetRevision: main
      path: tempo/manifests
  destination:
    server: https://kubernetes.default.svc
    namespace: tempo
  syncPolicy:
    automated:
      prune: true
      selfHeal: true
      allowEmpty: false
    syncOptions:
      - CreateNamespace=true
      - PrunePropagationPolicy=foreground
      - PruneLast=true
    retry:
      limit: 5
      backoff:
        duration: 5s
        factor: 2
        maxDuration: 3m
    managedNamespaceMetadata:
      labels:
        goldilocks.fairwinds.com/enabled: 'true'
        minio-s3: enabled
  revisionHistoryLimit: 10

99
tempo/helm-values.yaml Normal file
View File

@@ -0,0 +1,99 @@
---
# Tempo Helm Values
# Chart: https://github.com/grafana/helm-charts/tree/main/charts/tempo
#
# Architecture:
#   - Single binary (monolithic) mode for small clusters
#   - MinIO S3 for trace storage
#   - OTLP receiver for OpenTelemetry data
#   - Integrates with Grafana for trace visualization

# Run on master node for stability
tolerations:
  - key: node-role.kubernetes.io/control-plane
    operator: Exists
    effect: NoSchedule
nodeSelector:
  node-role.kubernetes.io/control-plane: "true"

# =============================================================================
# Resource Limits (optimized for small cluster)
# =============================================================================
resources:
  requests:
    cpu: 50m
    memory: 128Mi
  limits:
    memory: 512Mi

# =============================================================================
# Tempo Configuration
# =============================================================================
tempo:
  # Receivers - protocols Tempo accepts
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: 0.0.0.0:4317
        http:
          endpoint: 0.0.0.0:4318
  # Retention settings
  retention: 72h  # Keep traces for 3 days
  # Backend storage (MinIO S3).
  # Uses environment variable expansion for the credentials below.
  # NOTE(review): ${VAR} expansion only happens when Tempo runs with
  # -config.expand-env=true — confirm the chart passes that flag for
  # targetRevision 1.17.0, or add it via tempo.extraArgs.
  storage:
    trace:
      backend: s3
      s3:
        bucket: tempo
        endpoint: minio.minio.svc.cluster.local:9000
        access_key: ${S3_ACCESS_KEY}
        secret_key: ${S3_SECRET_KEY}
        insecure: true
  # Query settings
  querier:
    frontend_worker:
      frontend_address: ""
  # Metrics generator for trace-derived metrics
  metricsGenerator:
    enabled: true
    remoteWriteUrl: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write
  # Environment variables sourced from the tempo-s3-secret Secret,
  # consumed by the ${...} placeholders above.
  extraEnv:
    - name: S3_ACCESS_KEY
      valueFrom:
        secretKeyRef:
          name: tempo-s3-secret
          key: S3_ACCESS_KEY
    - name: S3_SECRET_KEY
      valueFrom:
        secretKeyRef:
          name: tempo-s3-secret
          key: S3_SECRET_KEY

# =============================================================================
# Persistence (local cache)
# =============================================================================
persistence:
  enabled: true
  storageClassName: local-path
  size: 2Gi

# =============================================================================
# Service
# =============================================================================
service:
  type: ClusterIP

# =============================================================================
# ServiceMonitor for Prometheus
# =============================================================================
serviceMonitor:
  enabled: true
  additionalLabels:
    release: prometheus

5
tempo/kustomization.yaml Normal file
View File

@@ -0,0 +1,5 @@
# Kustomization entry point for the tempo directory; the parent (root)
# kustomization references this directory's argocd.yaml through it.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- argocd.yaml

View File

@@ -0,0 +1,26 @@
---
# ExternalSecret that materializes MinIO root credentials from Vault into
# the tempo-s3-secret Secret consumed by tempo/helm-values.yaml extraEnv.
# NOTE(review): apiVersion external-secrets.io/v1 requires a recent ESO
# release (older operators only serve v1beta1) — confirm the installed
# operator version.
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: tempo-s3-secret
  namespace: tempo
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: vault-backend
    kind: ClusterSecretStore
  target:
    name: tempo-s3-secret
    template:
      engineVersion: v2
      # Rename the fetched keys to the env-var names Tempo expects.
      data:
        S3_ACCESS_KEY: "{{ .access_key }}"
        S3_SECRET_KEY: "{{ .secret_key }}"
  # Fetch MinIO root credentials from the "minio" path in Vault.
  data:
    - secretKey: access_key
      remoteRef:
        key: minio
        property: ROOT_USER
    - secretKey: secret_key
      remoteRef:
        key: minio
        property: ROOT_PASSWORD