Compare commits


21 Commits

Author SHA1 Message Date
b145881fa2 PERF(prometheus): increase memory limit to 1Gi
- Increase memory request from 768Mi to 1Gi
- Increase memory limit from 768Mi to 1Gi
- Prevents OOM at 97% memory usage
2026-01-12 03:16:40 +09:00
7e61af372b PERF(observability): remove CPU limits for stability
- Remove CPU limits from all observability components
- Prevents CPU throttling issues across the monitoring stack
2026-01-12 02:10:54 +09:00
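
The change applies one pattern across every chart (the individual file diffs below add the same comment line): keep CPU and memory requests, keep a memory limit, and set no CPU limit so the CFS quota never throttles the containers. A minimal sketch of the resulting block, with illustrative values:

  resources:
    requests:
      cpu: 15m
      memory: 100Mi
    limits:
      memory: 100Mi   # no cpu key here: burstable CPU, bounded memory
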
3b5bf20902 PERF(observability): optimize resources via VPA
- alertmanager: CPU 15m/15m, memory 100Mi/100Mi
- blackbox-exporter: CPU 15m/32m, memory 100Mi/100Mi
- goldilocks: controller 15m/25m, dashboard 15m/15m
- grafana: CPU 22m/24m, memory 144Mi/242Mi (upperBound)
- kube-state-metrics: CPU 15m/15m, memory 100Mi/100Mi
- loki: CPU 10m/69m, memory 225Mi/323Mi
- node-exporter: CPU 15m/15m, memory 100Mi/100Mi
- opentelemetry: CPU 34m/410m, memory 142Mi/1024Mi
- prometheus-operator: CPU 15m/15m, memory 100Mi/100Mi
- tempo: CPU 15m/15m, memory 100Mi/109Mi
- thanos: CPU 15m/15m, memory 100Mi/126Mi
- vpa: CPU 15m/15m, memory 100Mi/100Mi
2026-01-12 01:07:58 +09:00
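
The request/limit pairs above come from VPA recommendations surfaced through Goldilocks (namespaces carry the goldilocks.fairwinds.com/enabled label, as seen in the diffs below). The VPA objects themselves are not part of this compare, so the following recommendation-only manifest is an assumed sketch; name and target are illustrative:

  apiVersion: autoscaling.k8s.io/v1
  kind: VerticalPodAutoscaler
  metadata:
    name: grafana          # illustrative
    namespace: grafana     # illustrative
  spec:
    targetRef:
      apiVersion: apps/v1
      kind: Deployment
      name: grafana
    updatePolicy:
      updateMode: "Off"    # recommend only; values are copied into helm-values by hand

Describing such an object (kubectl describe vpa grafana -n grafana) should list target and upperBound values, which correspond to the request/limit figures in the commit message.
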
a70403d1ae FEAT(grafana): add Tempo datasource
- Add Tempo datasource for distributed tracing
- Configure URL to tempo.tempo.svc.cluster.local:3100
2026-01-12 00:34:50 +09:00
7cbc0c810e FIX(tempo): move resources to correct helm path
- Move resources from top-level to tempo.resources
- Fix memory limit not being applied to container
2026-01-12 00:21:12 +09:00
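
Per the commit message, the top-level block was not applied to the container; the chart expects resources under the tempo: key. A sketch of the corrected nesting, using the final values from this compare:

  # old location (not applied to the container)
  resources:
    limits:
      memory: 100Mi

  # new location (applied to the tempo container)
  tempo:
    resources:
      requests:
        cpu: 15m
        memory: 100Mi
      limits:
        memory: 109Mi
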
904cc3cab6 PERF(grafana): increase memory limits
- Increase requests from 175Mi to 256Mi
- Increase limits from 175Mi to 256Mi
- Fix OOM and timeout issues
2026-01-11 23:32:09 +09:00
c1214029a2 refactor: update Vault secret paths to new categorized structure
- alertmanager: alertmanager → observability/alertmanager
- grafana: postgresql → storage/postgresql
- prometheus: postgresql → storage/postgresql, minio → storage/minio
- thanos: minio → storage/minio

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-11 22:36:22 +09:00
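
Only the remoteRef.key values change; everything else in each ExternalSecret stays as in the diffs below. For orientation, a condensed sketch of one affected entry after the move — metadata and secret-store name are assumptions, the data entry matches the alertmanager diff:

  apiVersion: external-secrets.io/v1
  kind: ExternalSecret
  metadata:
    name: alertmanager-smtp     # assumed name
    namespace: alertmanager     # assumed namespace
  spec:
    secretStoreRef:
      kind: ClusterSecretStore
      name: vault               # assumed store name
    data:
      - secretKey: smtp_auth_password
        remoteRef:
          key: observability/alertmanager   # previously: alertmanager
          property: SMTP_PASSWORD
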
4aa7e37f76 PERF(otel): reduce resources based on VPA recommendation
- Add fullnameOverride to simplify pod names
- Reduce memory request from 512Mi to 400Mi
- Reduce CPU request from 50m to 25m
2026-01-11 21:33:58 +09:00
4bdcaf8fcd REFACTOR(otel): rename folder to opentelemetry
- Rename opentelemetry-collector to opentelemetry
- Update ArgoCD Application name to opentelemetry
- Simplify folder structure after operator removal
2026-01-11 21:27:54 +09:00
43cf7e9de7 REFACTOR(otel): migrate collector from Operator to Helm
- Remove opentelemetry-operator (no longer needed)
- Convert opentelemetry-collector to direct Helm Chart
- Remove CRD-based manifests (collector.yaml, rbac.yaml)
- Update helm-values.yaml with Loki labels and env vars
- Simplify architecture: Helm -> DaemonSet (no Operator)
2026-01-11 21:22:39 +09:00
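
Because the later opentelemetry/argocd.yaml diff interleaves old and new lines, the resulting Application shape is easier to read in isolation. A condensed sketch of the post-migration sources stanza, with values taken from that diff (including the later folder rename):

  sources:
    - repoURL: https://open-telemetry.github.io/opentelemetry-helm-charts
      chart: opentelemetry-collector
      targetRevision: 0.108.0
      helm:
        valueFiles:
          - $values/opentelemetry/helm-values.yaml
    - repoURL: https://github0213.com/K3S-HOME/observability.git
      targetRevision: main
      ref: values
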
15d5e58d6c migrate: change repoURLs from GitHub to Gitea
Update all ArgoCD Application references to use Gitea (github0213.com)
instead of GitHub for the K3S-HOME/observability repository.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 20:43:29 +09:00
7d0c8aa5f3 FIX(opentelemetry-operator): remove cpu null values
- Remove cpu: null (not allowed in new chart schema)
- Keep only memory limits
2026-01-10 18:55:23 +09:00
9c00c42946 CHORE(opentelemetry-operator): upgrade chart to 0.102.0
- Fix ServiceMonitor duplicate creation bug (Issue #3446)
- Upgrade from 0.74.0 to 0.102.0
2026-01-10 18:53:34 +09:00
a08d989fc3 FIX(opentelemetry-operator): remove invalid serviceMonitor
- Remove top-level serviceMonitor (not in chart schema)
- Keep manager.serviceMonitor.enabled: false
2026-01-10 18:42:02 +09:00
203a8debac REFACTOR(repo): remove control-plane scheduling
- Remove nodeSelector for control-plane node
- Remove tolerations for control-plane taint
- Allow pods to schedule on any available node
2026-01-10 18:35:15 +09:00
c128ece672 FIX(opentelemetry-operator): disable serviceMonitor
- Add top-level serviceMonitor.enabled: false
- Prevent duplicate ServiceMonitor creation on restart
2026-01-10 18:28:12 +09:00
bcf60b2428 fix: set CPU pressure threshold to 10% 2026-01-10 18:00:06 +09:00
da89c8dbf0 FIX(grafana): restore gauge design with percentage display
- Restore original gauge panel type
- Keep * 100 query and percent unit
- Set max to 100 for proper gauge range
2026-01-10 17:58:11 +09:00
11f9457236 fix: increase CPU pressure threshold to 30% 2026-01-10 17:57:34 +09:00
7e375e20c6 FIX(grafana): show CPU Usage as percentage per node
- Change panel type from gauge to stat
- Add * 100 to query for percentage
- Show each node's CPU usage horizontally
- Set thresholds at 50% (orange), 80% (red)
2026-01-10 17:57:05 +09:00
b818a8c1fe fix: update CPU throttling panels to use PSI metrics with 10% threshold 2026-01-10 17:54:55 +09:00
42 changed files with 2748 additions and 542 deletions

View File

@@ -14,10 +14,10 @@ spec:
helm:
valueFiles:
- $values/alertmanager/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
path: alertmanager
kustomize: {}

View File

@@ -21,6 +21,7 @@ affinity:
persistence:
enabled: false
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m

View File

@@ -14,7 +14,7 @@ spec:
data:
- secretKey: smtp_auth_password
remoteRef:
key: alertmanager
key: observability/alertmanager
property: SMTP_PASSWORD
---
apiVersion: external-secrets.io/v1
@@ -81,5 +81,5 @@ spec:
data:
- secretKey: smtp_password
remoteRef:
key: alertmanager
key: observability/alertmanager
property: SMTP_PASSWORD

View File

@@ -9,7 +9,7 @@ spec:
project: default
source:
repoURL: https://github.com/K3S-HOME/observability.git
repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
path: .

View File

@@ -14,7 +14,7 @@ spec:
helm:
valueFiles:
- $values/blackbox-exporter/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
destination:

View File

@@ -5,9 +5,10 @@ fullnameOverride: blackbox-exporter
replicas: 1
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 23m
cpu: 15m
memory: 100Mi
limits:
memory: 100Mi

View File

@@ -14,10 +14,10 @@ spec:
helm:
valueFiles:
- $values/goldilocks/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
path: goldilocks
destination:

View File

@@ -6,6 +6,7 @@ dashboard:
enabled: true
replicaCount: 1
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m
@@ -49,6 +50,7 @@ controller:
enabled: true
replicaCount: 1
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m
@@ -60,15 +62,6 @@ controller:
# Set to false to only monitor namespaces with the label: goldilocks.fairwinds.com/enabled=true
enableCostRecommendations: true
# Schedule on control-plane node
nodeSelector:
node-role.kubernetes.io/control-plane: "true"
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
# VPA configuration (should already be installed)
vpa:
# Set to false since we're installing VPA separately

View File

@@ -14,10 +14,10 @@ spec:
helm:
valueFiles:
- $values/grafana/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
path: grafana
destination:

grafana/dashboards/APM.json (new file, 2629 lines)

File diff suppressed because it is too large.

View File

@@ -1987,14 +1987,14 @@
},
"editorMode": "code",
"exemplar": true,
"expr": "sum(rate(container_cpu_cfs_throttled_seconds_total{image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0 or vector(0)",
"expr": "sum(rate(container_pressure_cpu_waiting_seconds_total{pod!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0.1",
"interval": "$resolution",
"legendFormat": "{{ namespace }}",
"range": true,
"refId": "A"
}
],
"title": "CPU Throttled seconds by namespace",
"title": "CPU Pressure (waiting) by namespace",
"type": "timeseries"
},
{
@@ -2099,14 +2099,14 @@
},
"editorMode": "code",
"exemplar": true,
"expr": "sum(rate(node_cpu_core_throttles_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance) or vector(0)",
"expr": "sum(rate(node_pressure_cpu_waiting_seconds_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance) > 0.1",
"interval": "$resolution",
"legendFormat": "{{ instance }}",
"range": true,
"refId": "A"
}
],
"title": "CPU Core Throttled by instance",
"title": "Node CPU Pressure (waiting) by instance",
"type": "timeseries"
},
{

View File

@@ -2242,8 +2242,9 @@
}
]
},
"unit": "s",
"unitScale": true
"unit": "percent",
"min": 0,
"max": 100
},
"overrides": []
},
@@ -2278,7 +2279,7 @@
},
"editorMode": "code",
"exemplar": true,
"expr": "rate(minio_node_process_cpu_total_seconds{job=~\"$scrape_jobs\"}[5m])",
"expr": "rate(minio_node_process_cpu_total_seconds{job=\"minio\"}[5m]) * 100",
"interval": "",
"legendFormat": "{{server}}",
"range": true,

View File

@@ -39,12 +39,13 @@ podSecurityContext:
fsGroup: 472
fsGroupChangePolicy: "Always"
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 23m
memory: 175Mi
cpu: 22m
memory: 144Mi
limits:
memory: 175Mi
memory: 242Mi
service:
type: ClusterIP
@@ -80,6 +81,11 @@ datasources:
editable: true
jsonData:
implementation: prometheus
- name: Tempo
type: tempo
access: proxy
url: http://tempo.tempo.svc.cluster.local:3100
editable: true
# Dashboards are manually imported via Grafana UI
# JSON files stored in dashboards/ directory for reference

View File

@@ -14,5 +14,5 @@ spec:
data:
- secretKey: password
remoteRef:
key: postgresql
key: storage/postgresql
property: PASSWORD

View File

@@ -14,7 +14,7 @@ spec:
helm:
valueFiles:
- $values/kube-state-metrics/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
destination:

View File

@@ -7,21 +7,13 @@ fullnameOverride: kube-state-metrics
# Note: kube-state-metrics is stateless, but multiple replicas would emit duplicate copies of the same metrics,
# so running a single replica is recommended.
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m
memory: 105Mi
memory: 100Mi
limits:
memory: 105Mi
# Schedule on control-plane node
nodeSelector:
node-role.kubernetes.io/control-plane: "true"
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
memory: 100Mi
service:
type: ClusterIP

View File

@@ -12,8 +12,7 @@ resources:
- loki/argocd.yaml
# promtail removed - OTel filelog receiver handles log collection
- tempo/argocd.yaml
- opentelemetry-operator/argocd.yaml
- opentelemetry-collector/argocd.yaml
- opentelemetry/argocd.yaml
- node-exporter/argocd.yaml
- kube-state-metrics/argocd.yaml
- goldilocks/argocd.yaml

View File

@@ -12,7 +12,7 @@ spec:
helm:
valueFiles:
- $values/loki/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
destination:

View File

@@ -60,12 +60,13 @@ singleBinary:
mountPath: /var/loki
# Medium priority for observability
priorityClassName: medium-priority
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 63m
memory: 363Mi
cpu: 10m
memory: 225Mi
limits:
memory: 363Mi
memory: 323Mi
# Disable components not needed in single binary mode
backend:

View File

@@ -14,7 +14,7 @@ spec:
helm:
valueFiles:
- $values/node-exporter/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
destination:

View File

@@ -6,6 +6,7 @@ fullnameOverride: node-exporter
hostNetwork: true
hostPID: true
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m

View File

@@ -1,38 +0,0 @@
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: opentelemetry-collector
namespace: argocd
finalizers:
- resources-finalizer.argocd.argoproj.io
annotations:
argocd.argoproj.io/sync-wave: "1"
spec:
project: default
source:
repoURL: https://github.com/K3S-HOME/observability.git
targetRevision: main
path: opentelemetry-collector/manifests
destination:
server: https://kubernetes.default.svc
namespace: opentelemetry
syncPolicy:
automated:
prune: true
selfHeal: true
allowEmpty: false
syncOptions:
- CreateNamespace=true
- PrunePropagationPolicy=foreground
- PruneLast=true
- ServerSideApply=true
retry:
limit: 5
backoff:
duration: 5s
factor: 2
maxDuration: 3m
managedNamespaceMetadata:
labels:
goldilocks.fairwinds.com/enabled: 'true'
revisionHistoryLimit: 10

View File

@@ -1,5 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- argocd.yaml

View File

@@ -1,238 +0,0 @@
# OpenTelemetry Collector with Target Allocator
# Managed by OpenTelemetry Operator
#
# Architecture:
# - DaemonSet mode: one collector per node for log collection
# - Target Allocator: distributes scrape targets across collectors
# - Filelog receiver for container logs
# - Prometheus receiver with Target Allocator for metrics
# - Exports to: Tempo (traces), Prometheus (metrics), Loki (logs)
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
name: otel-collector
namespace: opentelemetry
spec:
mode: daemonset
image: otel/opentelemetry-collector-contrib:0.113.0
serviceAccount: otel-collector
# Target Allocator disabled - metrics collected by Prometheus directly
# OTel handles logs (filelog) and traces (otlp) only
resources:
requests:
cpu: 50m
memory: 512Mi
limits:
memory: 512Mi
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
volumeMounts:
- name: varlogpods
mountPath: /var/log/pods
readOnly: true
- name: varlibdockercontainers
mountPath: /var/lib/docker/containers
readOnly: true
volumes:
- name: varlogpods
hostPath:
path: /var/log/pods
- name: varlibdockercontainers
hostPath:
path: /var/lib/docker/containers
ports:
- name: otlp-grpc
port: 4317
protocol: TCP
targetPort: 4317
- name: otlp-http
port: 4318
protocol: TCP
targetPort: 4318
- name: metrics
port: 8888
protocol: TCP
targetPort: 8888
env:
- name: K8S_NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: K8S_POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: K8S_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
config:
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
# Filelog receiver for container logs
filelog:
include:
- /var/log/pods/*/*/*.log
exclude:
- /var/log/pods/opentelemetry_otel-collector*/*/*.log
start_at: end
include_file_path: true
include_file_name: false
operators:
- type: router
id: get-format
routes:
- output: parser-docker
expr: 'body matches "^\\{"'
- output: parser-containerd
expr: 'body matches "^[^ Z]+Z"'
default: parser-containerd
- type: json_parser
id: parser-docker
output: extract-metadata-from-filepath
timestamp:
parse_from: attributes.time
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
- type: regex_parser
id: parser-containerd
regex: '^(?P<time>[^ ^Z]+Z) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$'
output: extract-metadata-from-filepath
timestamp:
parse_from: attributes.time
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
- type: regex_parser
id: extract-metadata-from-filepath
regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9-]+)\/(?P<container_name>[^\/]+)\/.*$'
parse_from: attributes["log.file.path"]
- type: move
from: attributes.namespace
to: resource["k8s.namespace.name"]
- type: move
from: attributes.pod_name
to: resource["k8s.pod.name"]
- type: move
from: attributes.container_name
to: resource["k8s.container.name"]
- type: move
from: attributes.uid
to: resource["k8s.pod.uid"]
- type: move
from: attributes.stream
to: attributes["log.iostream"]
- type: move
from: attributes.log
to: body
# Loki label hints - tell Loki exporter which attributes to use as labels
- type: add
field: resource["loki.resource.labels"]
value: "k8s.namespace.name, k8s.pod.name, k8s.container.name, k8s.node.name"
- type: add
field: attributes["loki.attribute.labels"]
value: "log.iostream"
# Prometheus receiver - self metrics only
prometheus:
config:
scrape_configs:
- job_name: otel-collector
scrape_interval: 60s
static_configs:
- targets: ['${env:K8S_POD_IP}:8888']
processors:
batch:
timeout: 10s
send_batch_size: 1024
send_batch_max_size: 2048
memory_limiter:
check_interval: 5s
limit_mib: 400
spike_limit_mib: 100
k8sattributes:
extract:
metadata:
- k8s.namespace.name
- k8s.deployment.name
- k8s.pod.name
- k8s.node.name
passthrough: false
pod_association:
- sources:
- from: resource_attribute
name: k8s.pod.ip
- sources:
- from: resource_attribute
name: k8s.pod.uid
- sources:
- from: connection
resourcedetection:
detectors: [env, system]
timeout: 5s
override: false
exporters:
otlp/tempo:
endpoint: tempo.tempo.svc.cluster.local:4317
tls:
insecure: true
prometheusremotewrite:
endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write
tls:
insecure: true
external_labels:
otel_collector: ${env:K8S_POD_NAME}
loki:
endpoint: http://loki.loki.svc.cluster.local:3100/loki/api/v1/push
default_labels_enabled:
exporter: false
level: true
debug:
verbosity: basic
extensions:
health_check:
endpoint: 0.0.0.0:13133
service:
extensions: [health_check]
pipelines:
traces:
receivers: [otlp]
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
exporters: [otlp/tempo]
metrics:
receivers: [otlp, prometheus]
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
exporters: [prometheusremotewrite]
logs:
receivers: [otlp, filelog]
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
exporters: [loki]

View File

@@ -1,6 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- rbac.yaml
- collector.yaml

View File

@@ -1,85 +0,0 @@
# RBAC for OpenTelemetry Collector and Target Allocator
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: otel-collector
namespace: opentelemetry
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: otel-collector
rules:
# For k8sattributes processor
- apiGroups: [""]
resources: ["pods", "namespaces", "nodes", "endpoints", "services"]
verbs: ["get", "watch", "list"]
- apiGroups: ["apps"]
resources: ["replicasets", "deployments", "statefulsets", "daemonsets"]
verbs: ["get", "watch", "list"]
- apiGroups: ["discovery.k8s.io"]
resources: ["endpointslices"]
verbs: ["get", "watch", "list"]
# For Target Allocator - ServiceMonitor/PodMonitor discovery
- apiGroups: ["monitoring.coreos.com"]
resources: ["servicemonitors", "podmonitors"]
verbs: ["get", "watch", "list"]
# For node metrics
- apiGroups: [""]
resources: ["nodes/metrics", "nodes/stats", "nodes/proxy"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: otel-collector
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: otel-collector
subjects:
- kind: ServiceAccount
name: otel-collector
namespace: opentelemetry
---
# Target Allocator ServiceAccount and RBAC
apiVersion: v1
kind: ServiceAccount
metadata:
name: otel-collector-targetallocator
namespace: opentelemetry
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: otel-targetallocator
rules:
# Core resources for service discovery
- apiGroups: [""]
resources: ["pods", "nodes", "services", "endpoints", "namespaces"]
verbs: ["get", "watch", "list"]
- apiGroups: ["discovery.k8s.io"]
resources: ["endpointslices"]
verbs: ["get", "watch", "list"]
# Prometheus CRs
- apiGroups: ["monitoring.coreos.com"]
resources: ["servicemonitors", "podmonitors", "probes", "scrapeconfigs"]
verbs: ["get", "watch", "list"]
# For allocator coordination
- apiGroups: ["opentelemetry.io"]
resources: ["opentelemetrycollectors"]
verbs: ["get", "watch", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: otel-targetallocator
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: otel-targetallocator
subjects:
- kind: ServiceAccount
name: otel-collector-targetallocator
namespace: opentelemetry

View File

@@ -1,55 +0,0 @@
# OpenTelemetry Operator Helm Values
# Chart: https://github.com/open-telemetry/opentelemetry-helm-charts/tree/main/charts/opentelemetry-operator
# Manager (Operator) configuration
manager:
collectorImage:
repository: otel/opentelemetry-collector-contrib
targetAllocatorImage:
repository: ghcr.io/open-telemetry/opentelemetry-operator/target-allocator
autoInstrumentationImage:
java:
repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-java
nodejs:
repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-nodejs
python:
repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-python
dotnet:
repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-dotnet
go:
repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-go
resources:
limits:
cpu: null # Disable chart default CPU limits
memory: 256Mi
requests:
cpu: 10m
memory: 256Mi
# ServiceMonitor configuration
serviceMonitor:
enabled: false # Disable ServiceMonitor creation to prevent conflicts
# Admission webhooks (uses cert-manager self-signed CA)
admissionWebhooks:
certManager:
enabled: true
# Kube RBAC Proxy
kubeRBACProxy:
enabled: true
resources:
limits:
cpu: null # Disable chart default CPU limits
memory: 64Mi
requests:
cpu: 5m
memory: 64Mi
# Schedule on master node
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
nodeSelector:
node-role.kubernetes.io/control-plane: "true"

View File

@@ -1,5 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- argocd.yaml

View File

@@ -1,27 +1,27 @@
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: opentelemetry-operator
name: opentelemetry
namespace: argocd
finalizers:
- resources-finalizer.argocd.argoproj.io
annotations:
argocd.argoproj.io/sync-wave: "0"
argocd.argoproj.io/sync-wave: "1"
spec:
project: default
sources:
- repoURL: https://open-telemetry.github.io/opentelemetry-helm-charts
chart: opentelemetry-operator
targetRevision: 0.74.0
chart: opentelemetry-collector
targetRevision: 0.108.0
helm:
valueFiles:
- $values/opentelemetry-operator/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- $values/opentelemetry/helm-values.yaml
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
destination:
server: https://kubernetes.default.svc
namespace: opentelemetry-operator
namespace: opentelemetry
syncPolicy:
automated:
prune: true

View File

@@ -11,6 +11,11 @@
# Pipeline:
# Applications → OTel Collector → Tempo/Prometheus/Loki → Grafana
# =============================================================================
# Name Override
# =============================================================================
fullnameOverride: otel-collector
# =============================================================================
# Image Configuration
# =============================================================================
@@ -23,22 +28,31 @@ image:
mode: daemonset
# =============================================================================
# Resource Limits (increased for log + metrics collection)
# Resource Limits (no CPU limit for stability, mem limit capped at 1024Mi)
# =============================================================================
resources:
requests:
cpu: 50m
memory: 512Mi
cpu: 34m
memory: 142Mi
limits:
memory: 512Mi
memory: 1024Mi
# =============================================================================
# Tolerations (run on all nodes including master)
# Environment Variables
# =============================================================================
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
extraEnvs:
- name: K8S_NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: K8S_POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: K8S_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
# =============================================================================
# Extra Volumes for Log Collection
@@ -160,6 +174,13 @@ config:
- type: move
from: attributes.log
to: body
# Loki label hints - tell Loki exporter which attributes to use as labels
- type: add
field: resource["loki.resource.labels"]
value: "k8s.namespace.name, k8s.pod.name, k8s.container.name, k8s.node.name"
- type: add
field: attributes["loki.attribute.labels"]
value: "log.iostream"
# Prometheus receiver - self metrics only
# Infrastructure metrics (node-exporter, kube-state-metrics) handled by Prometheus
@@ -168,9 +189,9 @@ config:
scrape_configs:
# OTel Collector self metrics only
- job_name: 'otel-collector'
scrape_interval: 30s
scrape_interval: 60s
static_configs:
- targets: ['${env:MY_POD_IP}:8888']
- targets: ['${env:K8S_POD_IP}:8888']
# ---------------------------------------------------------------------------
# Processors - how data is transformed
@@ -228,12 +249,14 @@ config:
endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write
tls:
insecure: true
external_labels:
otel_collector: ${env:K8S_POD_NAME}
# Loki for logs
loki:
endpoint: http://loki.loki.svc.cluster.local:3100/loki/api/v1/push
default_labels_enabled:
exporter: true
exporter: false
level: true
# Debug exporter (for troubleshooting)

View File

@@ -14,10 +14,10 @@ spec:
helm:
valueFiles:
- $values/prometheus/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
path: prometheus
destination:

View File

@@ -14,6 +14,13 @@ prometheusOperator:
enabled: true
# Disable CRD creation
createCustomResource: false
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m
memory: 100Mi
limits:
memory: 100Mi
# Kubelet ServiceMonitor with cluster label
kubelet:
@@ -105,9 +112,9 @@ prometheus:
resources:
requests:
cpu: 50m
memory: 768Mi
memory: 1Gi
limits:
memory: 768Mi
memory: 1Gi
# ServiceMonitor selector - scrape all ServiceMonitors
serviceMonitorSelectorNilUsesHelmValues: false

View File

@@ -14,7 +14,7 @@ spec:
data:
- secretKey: password
remoteRef:
key: postgresql
key: storage/postgresql
property: PASSWORD
---
apiVersion: external-secrets.io/v1
@@ -43,14 +43,14 @@ spec:
data:
- secretKey: access_key
remoteRef:
key: minio
key: storage/minio
property: ROOT_USER
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
- secretKey: secret_key
remoteRef:
key: minio
key: storage/minio
property: ROOT_PASSWORD
conversionStrategy: Default
decodingStrategy: None

View File

@@ -12,7 +12,7 @@ spec:
helm:
valueFiles:
- $values/promtail/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
destination:

View File

@@ -37,12 +37,6 @@ resources:
limits:
memory: 182Mi
# Tolerations to run on all nodes including control-plane
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
# ServiceMonitor disabled
serviceMonitor:
enabled: false

View File

@@ -14,7 +14,7 @@ spec:
helm:
valueFiles:
- $values/tempo/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
destination:

View File

@@ -14,19 +14,16 @@ priorityClassName: medium-priority
replicas: 1
# =============================================================================
# Resource Limits (optimized for small cluster)
# Tempo Configuration
# =============================================================================
resources:
tempo:
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m
memory: 100Mi
limits:
memory: 100Mi
# =============================================================================
# Tempo Configuration
# =============================================================================
tempo:
memory: 109Mi
# Receivers - protocols Tempo accepts
receivers:
otlp:

View File

@@ -14,10 +14,10 @@ spec:
helm:
valueFiles:
- $values/thanos/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
path: thanos/manifests
destination:

View File

@@ -46,12 +46,13 @@ query:
- --query.replica-label=prometheus_replica
- --query.auto-downsampling
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m
memory: 283Mi
memory: 100Mi
limits:
memory: 283Mi
memory: 126Mi
# =============================================================================
# Query Frontend - Caching layer for Query (optional, disabled for small cluster)

View File

@@ -24,9 +24,9 @@ spec:
data:
- secretKey: access_key
remoteRef:
key: minio
key: storage/minio
property: ROOT_USER
- secretKey: secret_key
remoteRef:
key: minio
key: storage/minio
property: ROOT_PASSWORD

View File

@@ -14,7 +14,7 @@ spec:
helm:
valueFiles:
- $values/vpa/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
destination:

View File

@@ -6,6 +6,7 @@ recommender:
enabled: true
replicaCount: 1
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m
@@ -13,15 +14,6 @@ recommender:
limits:
memory: 100Mi
# Schedule on control-plane node
nodeSelector:
node-role.kubernetes.io/control-plane: "true"
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
# Updater - applies recommended resource requests to pods
# Disabled because we're using updateMode: Off (recommendations only)
updater: