From 5f926cb6cf6a5fe31e12c9cc5df04d433ccc0b8e Mon Sep 17 00:00:00 2001
From: Mayne0213
Date: Fri, 9 Jan 2026 13:22:16 +0900
Subject: [PATCH] FEAT(tempo): configure S3 storage with MinIO

- Enable env var expansion in config
- Configure extraEnv for S3 credentials
- Fix OTel Collector image settings
---
 application.yaml                           |  33 ++++
 kustomization.yaml                         |   5 +
 opentelemetry-collector/argocd.yaml        |  41 +++++
 opentelemetry-collector/helm-values.yaml   | 200 +++++++++++++++++++++
 opentelemetry-collector/kustomization.yaml |   5 +
 tempo/argocd.yaml                          |  45 +++++
 tempo/helm-values.yaml                     | 104 +++++++++++
 tempo/kustomization.yaml                   |   5 +
 tempo/manifests/secret.yaml                |  26 +++
 9 files changed, 464 insertions(+)
 create mode 100644 application.yaml
 create mode 100644 opentelemetry-collector/argocd.yaml
 create mode 100644 opentelemetry-collector/helm-values.yaml
 create mode 100644 opentelemetry-collector/kustomization.yaml
 create mode 100644 tempo/argocd.yaml
 create mode 100644 tempo/helm-values.yaml
 create mode 100644 tempo/kustomization.yaml
 create mode 100644 tempo/manifests/secret.yaml

diff --git a/application.yaml b/application.yaml
new file mode 100644
index 0000000..68cbe34
--- /dev/null
+++ b/application.yaml
@@ -0,0 +1,33 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: observability
+  namespace: argocd
+  finalizers:
+    - resources-finalizer.argocd.argoproj.io
+spec:
+  project: default
+
+  source:
+    repoURL: https://github.com/K3S-HOME/observability.git
+    targetRevision: main
+    path: .
+
+  destination:
+    server: https://kubernetes.default.svc
+    namespace: argocd
+
+  syncPolicy:
+    automated:
+      prune: false
+      selfHeal: true
+    syncOptions:
+      - CreateNamespace=true
+    retry:
+      limit: 5
+      backoff:
+        duration: 5s
+        factor: 2
+        maxDuration: 3m
+
+  revisionHistoryLimit: 10
diff --git a/kustomization.yaml b/kustomization.yaml
index 2a3bde8..7d69ed4 100644
--- a/kustomization.yaml
+++ b/kustomization.yaml
@@ -2,12 +2,17 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 
 resources:
+  # Self-reference for App of Apps pattern
+  - application.yaml
+
   - prometheus/argocd.yaml
   - thanos/argocd.yaml
   - alertmanager/argocd.yaml
   - grafana/argocd.yaml
   - loki/argocd.yaml
   - promtail/argocd.yaml
+  - tempo/argocd.yaml
+  - opentelemetry-collector/argocd.yaml
   - node-exporter/argocd.yaml
   - kube-state-metrics/argocd.yaml
   - goldilocks/argocd.yaml
diff --git a/opentelemetry-collector/argocd.yaml b/opentelemetry-collector/argocd.yaml
new file mode 100644
index 0000000..bc08499
--- /dev/null
+++ b/opentelemetry-collector/argocd.yaml
@@ -0,0 +1,41 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: opentelemetry-collector
+  namespace: argocd
+  finalizers:
+    - resources-finalizer.argocd.argoproj.io
+spec:
+  project: default
+  sources:
+    - repoURL: https://open-telemetry.github.io/opentelemetry-helm-charts
+      chart: opentelemetry-collector
+      targetRevision: 0.108.0
+      helm:
+        valueFiles:
+          - $values/opentelemetry-collector/helm-values.yaml
+    - repoURL: https://github.com/K3S-HOME/observability.git
+      targetRevision: main
+      ref: values
+  destination:
+    server: https://kubernetes.default.svc
+    namespace: opentelemetry
+  syncPolicy:
+    automated:
+      prune: true
+      selfHeal: true
+      allowEmpty: false
+    syncOptions:
+      - CreateNamespace=true
+      - PrunePropagationPolicy=foreground
+      - PruneLast=true
+    retry:
+      limit: 5
+      backoff:
+        duration: 5s
+        factor: 2
+        maxDuration: 3m
+    managedNamespaceMetadata:
+      labels:
+        goldilocks.fairwinds.com/enabled: 'true'
+  revisionHistoryLimit: 10
diff --git a/opentelemetry-collector/helm-values.yaml b/opentelemetry-collector/helm-values.yaml
new file mode 100644
index 0000000..ae6353b
--- /dev/null
+++ b/opentelemetry-collector/helm-values.yaml
@@ -0,0 +1,200 @@
+# OpenTelemetry Collector Helm Values
+# Chart: https://github.com/open-telemetry/opentelemetry-helm-charts
+#
+# Architecture:
+# - DaemonSet mode: one collector per node for efficient data collection
+# - OTLP receiver for traces, metrics, and logs
+# - Exports to: Tempo (traces), Prometheus (metrics), Loki (logs)
+#
+# Pipeline:
+# Applications → OTel Collector → Tempo/Prometheus/Loki → Grafana
+
+# =============================================================================
+# Image Configuration
+# =============================================================================
+image:
+  repository: otel/opentelemetry-collector-contrib
+
+# =============================================================================
+# Deployment Mode
+# =============================================================================
+mode: daemonset
+
+# =============================================================================
+# Resource Limits (optimized for small cluster)
+# =============================================================================
+resources:
+  requests:
+    cpu: 25m
+    memory: 64Mi
+  limits:
+    memory: 256Mi
+
+# =============================================================================
+# Tolerations (run on all nodes including master)
+# =============================================================================
+tolerations:
+  - key: node-role.kubernetes.io/control-plane
+    operator: Exists
+    effect: NoSchedule
+
+# =============================================================================
+# Ports
+# =============================================================================
+ports:
+  otlp:
+    enabled: true
+    containerPort: 4317
+    servicePort: 4317
+    hostPort: 4317
+    protocol: TCP
+  otlp-http:
+    enabled: true
+    containerPort: 4318
+    servicePort: 4318
+    hostPort: 4318
+    protocol: TCP
+  metrics:
+    enabled: true
+    containerPort: 8888
+    servicePort: 8888
+    protocol: TCP
+
+# =============================================================================
+# OpenTelemetry Collector Configuration
+# =============================================================================
+config:
+  # Receivers - what data the collector accepts
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: 0.0.0.0:4317
+        http:
+          endpoint: 0.0.0.0:4318
+
+  # Processors - how data is transformed
+  processors:
+    # Batch processor for efficient exports
+    batch:
+      timeout: 10s
+      send_batch_size: 1024
+      send_batch_max_size: 2048
+
+    # Memory limiter to prevent OOM
+    memory_limiter:
+      check_interval: 5s
+      limit_mib: 200
+      spike_limit_mib: 50
+
+    # Add Kubernetes metadata
+    k8sattributes:
+      extract:
+        metadata:
+          - k8s.namespace.name
+          - k8s.deployment.name
+          - k8s.pod.name
+          - k8s.node.name
+      passthrough: false
+      pod_association:
+        - sources:
+            - from: resource_attribute
+              name: k8s.pod.ip
+        - sources:
+            - from: resource_attribute
+              name: k8s.pod.uid
+        - sources:
+            - from: connection
+
+    # Resource detection
+    resourcedetection:
+      detectors: [env, system]
+      timeout: 5s
+      override: false
+
+  # Exporters - where data goes
+  exporters:
+    # Tempo for traces
+    otlp/tempo:
+      endpoint: tempo.tempo.svc.cluster.local:4317
+      tls:
+        insecure: true
+
+    # Prometheus remote write for metrics
+    prometheusremotewrite:
+      endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write
+      tls:
+        insecure: true
+
+    # Loki for logs
+    loki:
+      endpoint: http://loki.loki.svc.cluster.local:3100/loki/api/v1/push
+      default_labels_enabled:
+        exporter: true
+        level: true
+
+    # Debug exporter (for troubleshooting)
+    debug:
+      verbosity: basic
+
+  # Extensions
+  extensions:
+    health_check:
+      endpoint: 0.0.0.0:13133
+
+  # Service pipelines
+  service:
+    extensions: [health_check]
+    pipelines:
+      # Traces pipeline
+      traces:
+        receivers: [otlp]
+        processors: [memory_limiter, k8sattributes, resourcedetection, batch]
+        exporters: [otlp/tempo]
+
+      # Metrics pipeline
+      metrics:
+        receivers: [otlp]
+        processors: [memory_limiter, k8sattributes, resourcedetection, batch]
+        exporters: [prometheusremotewrite]
+
+      # Logs pipeline
+      logs:
+        receivers: [otlp]
+        processors: [memory_limiter, k8sattributes, resourcedetection, batch]
+        exporters: [loki]
+
+# =============================================================================
+# Service Account
+# =============================================================================
+serviceAccount:
+  create: true
+
+# =============================================================================
+# RBAC for k8sattributes processor
+# =============================================================================
+clusterRole:
+  create: true
+  rules:
+    - apiGroups: [""]
+      resources: ["pods", "namespaces", "nodes"]
+      verbs: ["get", "watch", "list"]
+    - apiGroups: ["apps"]
+      resources: ["replicasets", "deployments"]
+      verbs: ["get", "watch", "list"]
+
+# =============================================================================
+# ServiceMonitor for Prometheus
+# =============================================================================
+serviceMonitor:
+  enabled: true
+  metricsEndpoints:
+    - port: metrics
+  extraLabels:
+    release: prometheus
+
+# =============================================================================
+# Pod Monitor for self-monitoring
+# =============================================================================
+podMonitor:
+  enabled: false
diff --git a/opentelemetry-collector/kustomization.yaml b/opentelemetry-collector/kustomization.yaml
new file mode 100644
index 0000000..418e1ce
--- /dev/null
+++ b/opentelemetry-collector/kustomization.yaml
@@ -0,0 +1,5 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - argocd.yaml
diff --git a/tempo/argocd.yaml b/tempo/argocd.yaml
new file mode 100644
index 0000000..f40b041
--- /dev/null
+++ b/tempo/argocd.yaml
@@ -0,0 +1,45 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: tempo
+  namespace: argocd
+  finalizers:
+    - resources-finalizer.argocd.argoproj.io
+spec:
+  project: default
+  sources:
+    - repoURL: https://grafana.github.io/helm-charts
+      chart: tempo
+      targetRevision: 1.17.0
+      helm:
+        valueFiles:
+          - $values/tempo/helm-values.yaml
+    - repoURL: https://github.com/K3S-HOME/observability.git
+      targetRevision: main
+      ref: values
+    - repoURL: https://github.com/K3S-HOME/observability.git
+      targetRevision: main
+      path: tempo/manifests
+  destination:
+    server: https://kubernetes.default.svc
+    namespace: tempo
+  syncPolicy:
+    automated:
+      prune: true
+      selfHeal: true
+      allowEmpty: false
+    syncOptions:
+      - CreateNamespace=true
+      - PrunePropagationPolicy=foreground
+      - PruneLast=true
+    retry:
+      limit: 5
+      backoff:
+        duration: 5s
+        factor: 2
+        maxDuration: 3m
+    managedNamespaceMetadata:
+      labels:
+        goldilocks.fairwinds.com/enabled: 'true'
+        minio-s3: enabled
+  revisionHistoryLimit: 10
diff --git a/tempo/helm-values.yaml b/tempo/helm-values.yaml
new file mode 100644
index 0000000..fddecb3
--- /dev/null
+++ b/tempo/helm-values.yaml
@@ -0,0 +1,104 @@
+# Tempo Helm Values
+# Chart: https://github.com/grafana/helm-charts/tree/main/charts/tempo
+#
+# Architecture:
+# - Single binary (monolithic) mode for small clusters
+# - MinIO S3 for trace storage
+# - OTLP receiver for OpenTelemetry data
+# - Integrates with Grafana for trace visualization
+
+# Run on master node for stability
+tolerations:
+  - key: node-role.kubernetes.io/control-plane
+    operator: Exists
+    effect: NoSchedule
+nodeSelector:
+  node-role.kubernetes.io/control-plane: "true"
+
+# =============================================================================
+# Resource Limits (optimized for small cluster)
+# =============================================================================
+resources:
+  requests:
+    cpu: 50m
+    memory: 128Mi
+  limits:
+    memory: 512Mi
+
+# =============================================================================
+# Tempo Configuration
+# =============================================================================
+tempo:
+  # Receivers - protocols Tempo accepts
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: 0.0.0.0:4317
+        http:
+          endpoint: 0.0.0.0:4318
+
+  # Retention settings
+  retention: 72h # Keep traces for 3 days
+
+  # Backend storage (MinIO S3)
+  # Uses environment variable expansion
+  storage:
+    trace:
+      backend: s3
+      s3:
+        bucket: tempo
+        endpoint: minio.minio.svc.cluster.local:9000
+        access_key: ${S3_ACCESS_KEY}
+        secret_key: ${S3_SECRET_KEY}
+        insecure: true
+
+  # Query settings
+  querier:
+    frontend_worker:
+      frontend_address: ""
+
+  # Metrics generator for trace-derived metrics
+  metricsGenerator:
+    enabled: true
+    remoteWriteUrl: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write
+
+  # Start Tempo with -config.expand-env=true; without it the ${S3_*}
+  # placeholders in storage.trace.s3 are read as literal strings.
+  extraArgs:
+    config.expand-env: true
+
+  # Environment variables from secret for S3 credentials
+  extraEnv:
+    - name: S3_ACCESS_KEY
+      valueFrom:
+        secretKeyRef:
+          name: tempo-s3-secret
+          key: S3_ACCESS_KEY
+    - name: S3_SECRET_KEY
+      valueFrom:
+        secretKeyRef:
+          name: tempo-s3-secret
+          key: S3_SECRET_KEY
+
+# =============================================================================
+# Persistence (local cache)
+# =============================================================================
+persistence:
+  enabled: true
+  storageClassName: local-path
+  size: 2Gi
+
+# =============================================================================
+# Service
+# =============================================================================
+service:
+  type: ClusterIP
+
+# =============================================================================
+# ServiceMonitor for Prometheus
+# =============================================================================
+serviceMonitor:
+  enabled: true
+  additionalLabels:
+    release: prometheus
diff --git a/tempo/kustomization.yaml b/tempo/kustomization.yaml
new file mode 100644
index 0000000..418e1ce
--- /dev/null
+++ b/tempo/kustomization.yaml
@@ -0,0 +1,5 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - argocd.yaml
diff --git a/tempo/manifests/secret.yaml b/tempo/manifests/secret.yaml
new file mode 100644
index 0000000..e9c8776
--- /dev/null
+++ b/tempo/manifests/secret.yaml
@@ -0,0 +1,26 @@
+apiVersion: external-secrets.io/v1
+kind: ExternalSecret
+metadata:
+  name: tempo-s3-secret
+  namespace: tempo
+spec:
+  refreshInterval: 1h
+  secretStoreRef:
+    name: vault-backend
+    kind: ClusterSecretStore
+  target:
+    name: tempo-s3-secret
+    template:
+      engineVersion: v2
+      data:
+        S3_ACCESS_KEY: "{{ .access_key }}"
+        S3_SECRET_KEY: "{{ .secret_key }}"
+  data:
+    - secretKey: access_key
+      remoteRef:
+        key: minio
+        property: ROOT_USER
+    - secretKey: secret_key
+      remoteRef:
+        key: minio
+        property: ROOT_PASSWORD