FEAT(tempo): configure S3 storage with MinIO
- Enable env var expansion in config - Configure extraEnv for S3 credentials - Fix OTel Collector image settings
This commit is contained in:
33
application.yaml
Normal file
33
application.yaml
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
---
# Root Argo CD Application for the observability repository.
# The source path is the repository root, so whatever the root directory
# renders (its kustomization) is what this Application manages.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: observability
  namespace: argocd
  finalizers:
    # Deleting this Application also deletes the resources it manages.
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default
  revisionHistoryLimit: 10

  # Where the manifests come from.
  source:
    repoURL: https://github.com/K3S-HOME/observability.git
    targetRevision: main
    path: .

  # Where the rendered resources are applied (the in-cluster API server).
  destination:
    server: https://kubernetes.default.svc
    namespace: argocd

  syncPolicy:
    automated:
      # Drift is corrected automatically, but resources that disappear from
      # Git are not pruned automatically.
      selfHeal: true
      prune: false
    syncOptions:
      - CreateNamespace=true
    # Failed syncs are retried up to 5 times: 5s, doubling, capped at 3m.
    retry:
      limit: 5
      backoff:
        duration: 5s
        factor: 2
        maxDuration: 3m
|
||||||
@@ -2,12 +2,17 @@ apiVersion: kustomize.config.k8s.io/v1beta1
|
|||||||
kind: Kustomization
|
kind: Kustomization
|
||||||
|
|
||||||
resources:
|
resources:
|
||||||
|
# Self-reference for App of Apps pattern
|
||||||
|
- application.yaml
|
||||||
|
|
||||||
- prometheus/argocd.yaml
|
- prometheus/argocd.yaml
|
||||||
- thanos/argocd.yaml
|
- thanos/argocd.yaml
|
||||||
- alertmanager/argocd.yaml
|
- alertmanager/argocd.yaml
|
||||||
- grafana/argocd.yaml
|
- grafana/argocd.yaml
|
||||||
- loki/argocd.yaml
|
- loki/argocd.yaml
|
||||||
- promtail/argocd.yaml
|
- promtail/argocd.yaml
|
||||||
|
- tempo/argocd.yaml
|
||||||
|
- opentelemetry-collector/argocd.yaml
|
||||||
- node-exporter/argocd.yaml
|
- node-exporter/argocd.yaml
|
||||||
- kube-state-metrics/argocd.yaml
|
- kube-state-metrics/argocd.yaml
|
||||||
- goldilocks/argocd.yaml
|
- goldilocks/argocd.yaml
|
||||||
|
|||||||
41
opentelemetry-collector/argocd.yaml
Normal file
41
opentelemetry-collector/argocd.yaml
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
---
# Argo CD Application that installs the OpenTelemetry Collector Helm chart,
# taking its values file from this Git repository via the `values` ref.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: opentelemetry-collector
  namespace: argocd
  finalizers:
    # Deleting this Application also deletes the resources it manages.
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default
  revisionHistoryLimit: 10

  sources:
    # Source 1: the upstream Helm chart, pinned to an exact chart version.
    - repoURL: https://open-telemetry.github.io/opentelemetry-helm-charts
      chart: opentelemetry-collector
      targetRevision: '0.108.0'
      helm:
        valueFiles:
          # `$values` resolves to the source below that declares `ref: values`.
          - $values/opentelemetry-collector/helm-values.yaml
    # Source 2: this repository, referenced only to supply the values file.
    - repoURL: https://github.com/K3S-HOME/observability.git
      targetRevision: main
      ref: values

  destination:
    server: https://kubernetes.default.svc
    namespace: opentelemetry

  syncPolicy:
    automated:
      selfHeal: true
      prune: true
      allowEmpty: false
    syncOptions:
      - CreateNamespace=true
      - PrunePropagationPolicy=foreground
      - PruneLast=true
    # Failed syncs are retried up to 5 times: 5s, doubling, capped at 3m.
    retry:
      limit: 5
      backoff:
        duration: 5s
        factor: 2
        maxDuration: 3m
    # Labels applied to the namespace that Argo CD creates for this app.
    managedNamespaceMetadata:
      labels:
        goldilocks.fairwinds.com/enabled: 'true'
|
||||||
200
opentelemetry-collector/helm-values.yaml
Normal file
200
opentelemetry-collector/helm-values.yaml
Normal file
@@ -0,0 +1,200 @@
|
|||||||
|
# OpenTelemetry Collector Helm Values
# Chart: https://github.com/open-telemetry/opentelemetry-helm-charts
#
# Architecture:
# - DaemonSet mode: one collector per node for efficient data collection
# - OTLP receiver for traces, metrics, and logs
# - Exports to: Tempo (traces), Prometheus (metrics), Loki (logs)
#
# Pipeline:
#   Applications → OTel Collector → Tempo/Prometheus/Loki → Grafana

# =============================================================================
# Image Configuration
# =============================================================================
# The contrib distribution is needed because the config below uses components
# such as k8sattributes, resourcedetection, prometheusremotewrite and loki.
image:
  repository: otel/opentelemetry-collector-contrib

# =============================================================================
# Deployment Mode
# =============================================================================
mode: daemonset

# =============================================================================
# Resource Limits (optimized for small cluster)
# =============================================================================
resources:
  requests:
    cpu: 25m
    memory: 64Mi
  limits:
    # Keep in sync with processors.memory_limiter.limit_mib below, which must
    # stay under this limit.
    memory: 256Mi

# =============================================================================
# Tolerations (run on all nodes including master)
# =============================================================================
tolerations:
  - key: node-role.kubernetes.io/control-plane
    operator: Exists
    effect: NoSchedule

# =============================================================================
# Ports
# =============================================================================
ports:
  # OTLP over gRPC; also bound on the node via hostPort.
  otlp:
    enabled: true
    containerPort: 4317
    servicePort: 4317
    hostPort: 4317
    protocol: TCP
  # OTLP over HTTP; also bound on the node via hostPort.
  otlp-http:
    enabled: true
    containerPort: 4318
    servicePort: 4318
    hostPort: 4318
    protocol: TCP
  # Collector self-metrics; this is the port the ServiceMonitor scrapes.
  metrics:
    enabled: true
    containerPort: 8888
    servicePort: 8888
    protocol: TCP

# =============================================================================
# Service
# =============================================================================
# In daemonset mode the chart does not create a Service unless it is enabled
# explicitly, and the ServiceMonitor below is only rendered when a Service
# exists. Without this, collector self-metrics would never be scraped
# (podMonitor is disabled as well).
service:
  enabled: true

# =============================================================================
# OpenTelemetry Collector Configuration
# =============================================================================
config:
  # Receivers - what data the collector accepts
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: 0.0.0.0:4317
        http:
          endpoint: 0.0.0.0:4318

  # Processors - how data is transformed
  processors:
    # Batch processor for efficient exports
    batch:
      timeout: 10s
      send_batch_size: 1024
      send_batch_max_size: 2048

    # Memory limiter to prevent OOM (kept below the 256Mi container limit)
    memory_limiter:
      check_interval: 5s
      limit_mib: 200
      spike_limit_mib: 50

    # Add Kubernetes metadata
    k8sattributes:
      extract:
        metadata:
          - k8s.namespace.name
          - k8s.deployment.name
          - k8s.pod.name
          - k8s.node.name
      passthrough: false
      # Ways to match telemetry to a pod, tried in the order listed.
      pod_association:
        - sources:
            - from: resource_attribute
              name: k8s.pod.ip
        - sources:
            - from: resource_attribute
              name: k8s.pod.uid
        - sources:
            - from: connection

    # Resource detection
    resourcedetection:
      detectors: [env, system]
      timeout: 5s
      override: false

  # Exporters - where data goes
  exporters:
    # Tempo for traces (plaintext gRPC inside the cluster)
    otlp/tempo:
      endpoint: tempo.tempo.svc.cluster.local:4317
      tls:
        insecure: true

    # Prometheus remote write for metrics
    prometheusremotewrite:
      endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write
      tls:
        insecure: true

    # Loki for logs
    loki:
      endpoint: http://loki.loki.svc.cluster.local:3100/loki/api/v1/push
      default_labels_enabled:
        exporter: true
        level: true

    # Debug exporter (for troubleshooting); defined but not wired into any
    # pipeline below.
    debug:
      verbosity: basic

  # Extensions
  extensions:
    health_check:
      endpoint: 0.0.0.0:13133

  # Service pipelines
  service:
    extensions: [health_check]
    pipelines:
      # Traces pipeline
      traces:
        receivers: [otlp]
        processors: [memory_limiter, k8sattributes, resourcedetection, batch]
        exporters: [otlp/tempo]

      # Metrics pipeline
      metrics:
        receivers: [otlp]
        processors: [memory_limiter, k8sattributes, resourcedetection, batch]
        exporters: [prometheusremotewrite]

      # Logs pipeline
      logs:
        receivers: [otlp]
        processors: [memory_limiter, k8sattributes, resourcedetection, batch]
        exporters: [loki]

# =============================================================================
# Service Account
# =============================================================================
serviceAccount:
  create: true

# =============================================================================
# RBAC for k8sattributes processor
# =============================================================================
# Read-only access to the objects the k8sattributes processor looks up.
clusterRole:
  create: true
  rules:
    - apiGroups: [""]
      resources: ["pods", "namespaces", "nodes"]
      verbs: ["get", "watch", "list"]
    - apiGroups: ["apps"]
      resources: ["replicasets", "deployments"]
      verbs: ["get", "watch", "list"]

# =============================================================================
# ServiceMonitor for Prometheus
# =============================================================================
# Requires `service.enabled: true` (set above) because mode is daemonset.
serviceMonitor:
  enabled: true
  metricsEndpoints:
    - port: metrics
  extraLabels:
    # presumably the label the Prometheus operator selects monitors on - confirm
    release: prometheus

# =============================================================================
# Pod Monitor for self-monitoring
# =============================================================================
podMonitor:
  enabled: false
|
||||||
5
opentelemetry-collector/kustomization.yaml
Normal file
5
opentelemetry-collector/kustomization.yaml
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
---
# Kustomization for the OpenTelemetry Collector directory; it includes only
# the Argo CD Application manifest.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - argocd.yaml
|
||||||
45
tempo/argocd.yaml
Normal file
45
tempo/argocd.yaml
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
---
# Argo CD Application that installs the Grafana Tempo Helm chart, taking its
# values file from this Git repository and also applying the plain manifests
# under tempo/manifests.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: tempo
  namespace: argocd
  finalizers:
    # Deleting this Application also deletes the resources it manages.
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default
  revisionHistoryLimit: 10

  sources:
    # Source 1: the upstream Helm chart, pinned to an exact chart version.
    - repoURL: https://grafana.github.io/helm-charts
      chart: tempo
      targetRevision: '1.17.0'
      helm:
        valueFiles:
          # `$values` resolves to the source below that declares `ref: values`.
          - $values/tempo/helm-values.yaml
    # Source 2: this repository, referenced only to supply the values file.
    - repoURL: https://github.com/K3S-HOME/observability.git
      targetRevision: main
      ref: values
    # Source 3: extra manifests deployed alongside the chart.
    - repoURL: https://github.com/K3S-HOME/observability.git
      targetRevision: main
      path: tempo/manifests

  destination:
    server: https://kubernetes.default.svc
    namespace: tempo

  syncPolicy:
    automated:
      selfHeal: true
      prune: true
      allowEmpty: false
    syncOptions:
      - CreateNamespace=true
      - PrunePropagationPolicy=foreground
      - PruneLast=true
    # Failed syncs are retried up to 5 times: 5s, doubling, capped at 3m.
    retry:
      limit: 5
      backoff:
        duration: 5s
        factor: 2
        maxDuration: 3m
    # Labels applied to the namespace that Argo CD creates for this app.
    managedNamespaceMetadata:
      labels:
        goldilocks.fairwinds.com/enabled: 'true'
        # NOTE(review): presumably consumed by a MinIO access policy - confirm.
        minio-s3: enabled
|
||||||
99
tempo/helm-values.yaml
Normal file
99
tempo/helm-values.yaml
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
# Tempo Helm Values
# Chart: https://github.com/grafana/helm-charts/tree/main/charts/tempo
#
# Architecture:
# - Single binary (monolithic) mode for small clusters
# - MinIO S3 for trace storage
# - OTLP receiver for OpenTelemetry data
# - Integrates with Grafana for trace visualization

# Run on master node for stability
tolerations:
  - key: node-role.kubernetes.io/control-plane
    operator: Exists
    effect: NoSchedule
nodeSelector:
  node-role.kubernetes.io/control-plane: "true"

# =============================================================================
# Resource Limits (optimized for small cluster)
# =============================================================================
# NOTE(review): the grafana/tempo chart appears to read container resources
# from `tempo.resources`; confirm that this top-level key is actually applied.
resources:
  requests:
    cpu: 50m
    memory: 128Mi
  limits:
    memory: 512Mi

# =============================================================================
# Tempo Configuration
# =============================================================================
tempo:
  # Extra command-line flags for the Tempo binary.
  # `${VAR}` placeholders in the rendered config are only substituted when
  # Tempo is started with -config.expand-env=true; without it the S3
  # credentials below would be sent to MinIO as literal "${...}" strings.
  extraArgs:
    config.expand-env: true

  # Receivers - protocols Tempo accepts
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: 0.0.0.0:4317
        http:
          endpoint: 0.0.0.0:4318

  # Retention settings
  retention: 72h  # Keep traces for 3 days

  # Backend storage (MinIO S3)
  # Credentials are injected through environment variable expansion
  # (see extraArgs above and extraEnv below).
  storage:
    trace:
      backend: s3
      s3:
        bucket: tempo
        endpoint: minio.minio.svc.cluster.local:9000
        access_key: ${S3_ACCESS_KEY}
        secret_key: ${S3_SECRET_KEY}
        # Plain HTTP to the in-cluster MinIO endpoint.
        insecure: true

  # Query settings
  querier:
    frontend_worker:
      frontend_address: ""

  # Metrics generator for trace-derived metrics
  metricsGenerator:
    enabled: true
    remoteWriteUrl: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write

  # Environment variables from secret for S3 credentials
  # (the Secret `tempo-s3-secret` must exist in the release namespace).
  extraEnv:
    - name: S3_ACCESS_KEY
      valueFrom:
        secretKeyRef:
          name: tempo-s3-secret
          key: S3_ACCESS_KEY
    - name: S3_SECRET_KEY
      valueFrom:
        secretKeyRef:
          name: tempo-s3-secret
          key: S3_SECRET_KEY

# =============================================================================
# Persistence (local cache)
# =============================================================================
persistence:
  enabled: true
  storageClassName: local-path
  size: 2Gi

# =============================================================================
# Service
# =============================================================================
service:
  type: ClusterIP

# =============================================================================
# ServiceMonitor for Prometheus
# =============================================================================
serviceMonitor:
  enabled: true
  additionalLabels:
    # presumably the label the Prometheus operator selects monitors on - confirm
    release: prometheus
|
||||||
5
tempo/kustomization.yaml
Normal file
5
tempo/kustomization.yaml
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
---
# Kustomization for the Tempo directory; it includes only the Argo CD
# Application manifest.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - argocd.yaml
|
||||||
26
tempo/manifests/secret.yaml
Normal file
26
tempo/manifests/secret.yaml
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
---
# ExternalSecret that materialises the `tempo-s3-secret` Secret in the tempo
# namespace from the `vault-backend` ClusterSecretStore.
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: tempo-s3-secret
  namespace: tempo
spec:
  # How often the secret is re-read from the store.
  refreshInterval: 1h
  secretStoreRef:
    kind: ClusterSecretStore
    name: vault-backend

  # Values fetched from the store, exposed to the template below under the
  # names given in `secretKey`.
  # NOTE(review): these are the ROOT_USER / ROOT_PASSWORD properties of the
  # `minio` entry - presumably MinIO's root credentials; consider a dedicated,
  # bucket-scoped access key for Tempo.
  data:
    - secretKey: access_key
      remoteRef:
        key: minio
        property: ROOT_USER
    - secretKey: secret_key
      remoteRef:
        key: minio
        property: ROOT_PASSWORD

  # Resulting Kubernetes Secret: its keys match the secretKeyRef entries in
  # the Tempo Helm values (S3_ACCESS_KEY / S3_SECRET_KEY).
  target:
    name: tempo-s3-secret
    template:
      engineVersion: v2
      data:
        S3_ACCESS_KEY: "{{ .access_key }}"
        S3_SECRET_KEY: "{{ .secret_key }}"
|
||||||
Reference in New Issue
Block a user