Compare commits


21 Commits

Author SHA1 Message Date
b145881fa2 PERF(prometheus): increase memory limit to 1Gi
- Increase memory request from 768Mi to 1Gi
- Increase memory limit from 768Mi to 1Gi
- Prevents OOM at 97% memory usage
2026-01-12 03:16:40 +09:00
7e61af372b PERF(observability): remove CPU limits for stability
- Remove CPU limits from all observability components
- Prevents CPU throttling issues across the monitoring stack
2026-01-12 02:10:54 +09:00
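
The change applies one pattern across every chart (the individual file diffs below add the same comment line): keep CPU and memory requests, keep a memory limit, and set no CPU limit so the CFS quota never throttles the containers. A minimal sketch of the resulting block, with illustrative values:

  resources:
    requests:
      cpu: 15m
      memory: 100Mi
    limits:
      memory: 100Mi   # no cpu key here: burstable CPU, bounded memory
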
3b5bf20902 PERF(observability): optimize resources via VPA
- alertmanager: CPU 15m/15m, memory 100Mi/100Mi
- blackbox-exporter: CPU 15m/32m, memory 100Mi/100Mi
- goldilocks: controller 15m/25m, dashboard 15m/15m
- grafana: CPU 22m/24m, memory 144Mi/242Mi (upperBound)
- kube-state-metrics: CPU 15m/15m, memory 100Mi/100Mi
- loki: CPU 10m/69m, memory 225Mi/323Mi
- node-exporter: CPU 15m/15m, memory 100Mi/100Mi
- opentelemetry: CPU 34m/410m, memory 142Mi/1024Mi
- prometheus-operator: CPU 15m/15m, memory 100Mi/100Mi
- tempo: CPU 15m/15m, memory 100Mi/109Mi
- thanos: CPU 15m/15m, memory 100Mi/126Mi
- vpa: CPU 15m/15m, memory 100Mi/100Mi
2026-01-12 01:07:58 +09:00
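
The request/limit pairs above come from VPA recommendations surfaced through Goldilocks (namespaces carry the goldilocks.fairwinds.com/enabled label, as seen in the diffs below). The VPA objects themselves are not part of this compare, so the following recommendation-only manifest is an assumed sketch; name and target are illustrative:

  apiVersion: autoscaling.k8s.io/v1
  kind: VerticalPodAutoscaler
  metadata:
    name: grafana          # illustrative
    namespace: grafana     # illustrative
  spec:
    targetRef:
      apiVersion: apps/v1
      kind: Deployment
      name: grafana
    updatePolicy:
      updateMode: "Off"    # recommend only; values are copied into helm-values by hand

Describing such an object (kubectl describe vpa grafana -n grafana) should list target and upperBound values, which correspond to the request/limit figures in the commit message.
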
a70403d1ae FEAT(grafana): add Tempo datasource
- Add Tempo datasource for distributed tracing
- Configure URL to tempo.tempo.svc.cluster.local:3100
2026-01-12 00:34:50 +09:00
7cbc0c810e FIX(tempo): move resources to correct helm path
- Move resources from top-level to tempo.resources
- Fix memory limit not being applied to container
2026-01-12 00:21:12 +09:00
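
Per the commit message, the top-level block was not applied to the container; the chart expects resources under the tempo: key. A sketch of the corrected nesting, using the final values from this compare:

  # old location (not applied to the container)
  resources:
    limits:
      memory: 100Mi

  # new location (applied to the tempo container)
  tempo:
    resources:
      requests:
        cpu: 15m
        memory: 100Mi
      limits:
        memory: 109Mi
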
904cc3cab6 PERF(grafana): increase memory limits
- Increase requests from 175Mi to 256Mi
- Increase limits from 175Mi to 256Mi
- Fix OOM and timeout issues
2026-01-11 23:32:09 +09:00
c1214029a2 refactor: update Vault secret paths to new categorized structure
- alertmanager: alertmanager → observability/alertmanager
- grafana: postgresql → storage/postgresql
- prometheus: postgresql → storage/postgresql, minio → storage/minio
- thanos: minio → storage/minio

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-11 22:36:22 +09:00
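
Only the remoteRef.key values change; everything else in each ExternalSecret stays as in the diffs below. For orientation, a condensed sketch of one affected entry after the move — metadata and secret-store name are assumptions, the data entry matches the alertmanager diff:

  apiVersion: external-secrets.io/v1
  kind: ExternalSecret
  metadata:
    name: alertmanager-smtp     # assumed name
    namespace: alertmanager     # assumed namespace
  spec:
    secretStoreRef:
      kind: ClusterSecretStore
      name: vault               # assumed store name
    data:
      - secretKey: smtp_auth_password
        remoteRef:
          key: observability/alertmanager   # previously: alertmanager
          property: SMTP_PASSWORD
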
4aa7e37f76 PERF(otel): reduce resources based on VPA recommendation
- Add fullnameOverride to simplify pod names
- Reduce memory request from 512Mi to 400Mi
- Reduce CPU request from 50m to 25m
2026-01-11 21:33:58 +09:00
4bdcaf8fcd REFACTOR(otel): rename folder to opentelemetry
- Rename opentelemetry-collector to opentelemetry
- Update ArgoCD Application name to opentelemetry
- Simplify folder structure after operator removal
2026-01-11 21:27:54 +09:00
43cf7e9de7 REFACTOR(otel): migrate collector from Operator to Helm
- Remove opentelemetry-operator (no longer needed)
- Convert opentelemetry-collector to direct Helm Chart
- Remove CRD-based manifests (collector.yaml, rbac.yaml)
- Update helm-values.yaml with Loki labels and env vars
- Simplify architecture: Helm -> DaemonSet (no Operator)
2026-01-11 21:22:39 +09:00
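
Because the later opentelemetry/argocd.yaml diff interleaves old and new lines, the resulting Application shape is easier to read in isolation. A condensed sketch of the post-migration sources stanza, with values taken from that diff (including the later folder rename):

  sources:
    - repoURL: https://open-telemetry.github.io/opentelemetry-helm-charts
      chart: opentelemetry-collector
      targetRevision: 0.108.0
      helm:
        valueFiles:
          - $values/opentelemetry/helm-values.yaml
    - repoURL: https://github0213.com/K3S-HOME/observability.git
      targetRevision: main
      ref: values
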
15d5e58d6c migrate: change repoURLs from GitHub to Gitea
Update all ArgoCD Application references to use Gitea (github0213.com)
instead of GitHub for the K3S-HOME/observability repository.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 20:43:29 +09:00
7d0c8aa5f3 FIX(opentelemetry-operator): remove cpu null values
- Remove cpu: null (not allowed in new chart schema)
- Keep only memory limits
2026-01-10 18:55:23 +09:00
9c00c42946 CHORE(opentelemetry-operator): upgrade chart to 0.102.0
- Fix ServiceMonitor duplicate creation bug (Issue #3446)
- Upgrade from 0.74.0 to 0.102.0
2026-01-10 18:53:34 +09:00
a08d989fc3 FIX(opentelemetry-operator): remove invalid serviceMonitor
- Remove top-level serviceMonitor (not in chart schema)
- Keep manager.serviceMonitor.enabled: false
2026-01-10 18:42:02 +09:00
203a8debac REFACTOR(repo): remove control-plane scheduling
- Remove nodeSelector for control-plane node
- Remove tolerations for control-plane taint
- Allow pods to schedule on any available node
2026-01-10 18:35:15 +09:00
c128ece672 FIX(opentelemetry-operator): disable serviceMonitor
- Add top-level serviceMonitor.enabled: false
- Prevent duplicate ServiceMonitor creation on restart
2026-01-10 18:28:12 +09:00
bcf60b2428 fix: set CPU pressure threshold to 10% 2026-01-10 18:00:06 +09:00
da89c8dbf0 FIX(grafana): restore gauge design with percentage display
- Restore original gauge panel type
- Keep * 100 query and percent unit
- Set max to 100 for proper gauge range
2026-01-10 17:58:11 +09:00
11f9457236 fix: increase CPU pressure threshold to 30% 2026-01-10 17:57:34 +09:00
7e375e20c6 FIX(grafana): show CPU Usage as percentage per node
- Change panel type from gauge to stat
- Add * 100 to query for percentage
- Show each node's CPU usage horizontally
- Set thresholds at 50% (orange), 80% (red)
2026-01-10 17:57:05 +09:00
b818a8c1fe fix: update CPU throttling panels to use PSI metrics with 10% threshold 2026-01-10 17:54:55 +09:00
42 changed files with 2748 additions and 542 deletions

View File

@@ -14,10 +14,10 @@ spec:
helm:
valueFiles:
- $values/alertmanager/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
path: alertmanager
kustomize: {}

View File

@@ -21,6 +21,7 @@ affinity:
persistence:
enabled: false
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m

View File

@@ -14,7 +14,7 @@ spec:
data:
- secretKey: smtp_auth_password
remoteRef:
key: alertmanager
key: observability/alertmanager
property: SMTP_PASSWORD
---
apiVersion: external-secrets.io/v1
@@ -81,5 +81,5 @@ spec:
data:
- secretKey: smtp_password
remoteRef:
key: alertmanager
key: observability/alertmanager
property: SMTP_PASSWORD

View File

@@ -9,7 +9,7 @@ spec:
project: default
source:
repoURL: https://github.com/K3S-HOME/observability.git
repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
path: .

View File

@@ -14,7 +14,7 @@ spec:
helm:
valueFiles:
- $values/blackbox-exporter/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
destination:

View File

@@ -5,9 +5,10 @@ fullnameOverride: blackbox-exporter
replicas: 1
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 23m
cpu: 15m
memory: 100Mi
limits:
memory: 100Mi

View File

@@ -14,10 +14,10 @@ spec:
helm:
valueFiles:
- $values/goldilocks/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
path: goldilocks
destination:

View File

@@ -6,6 +6,7 @@ dashboard:
enabled: true
replicaCount: 1
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m
@@ -49,6 +50,7 @@ controller:
enabled: true
replicaCount: 1
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m
@@ -60,15 +62,6 @@ controller:
# Set to false to only monitor namespaces with the label: goldilocks.fairwinds.com/enabled=true
enableCostRecommendations: true
# Schedule on control-plane node
nodeSelector:
node-role.kubernetes.io/control-plane: "true"
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
# VPA configuration (should already be installed)
vpa:
# Set to false since we're installing VPA separately

View File

@@ -14,10 +14,10 @@ spec:
helm:
valueFiles:
- $values/grafana/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
path: grafana
destination:

grafana/dashboards/APM.json (new file, 2629 lines)

File diff suppressed because it is too large.

View File

@@ -1987,14 +1987,14 @@
},
"editorMode": "code",
"exemplar": true,
"expr": "sum(rate(container_cpu_cfs_throttled_seconds_total{image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0 or vector(0)",
"expr": "sum(rate(container_pressure_cpu_waiting_seconds_total{pod!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0.1",
"interval": "$resolution",
"legendFormat": "{{ namespace }}",
"range": true,
"refId": "A"
}
],
"title": "CPU Throttled seconds by namespace",
"title": "CPU Pressure (waiting) by namespace",
"type": "timeseries"
},
{
@@ -2099,14 +2099,14 @@
},
"editorMode": "code",
"exemplar": true,
"expr": "sum(rate(node_cpu_core_throttles_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance) or vector(0)",
"expr": "sum(rate(node_pressure_cpu_waiting_seconds_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance) > 0.1",
"interval": "$resolution",
"legendFormat": "{{ instance }}",
"range": true,
"refId": "A"
}
],
"title": "CPU Core Throttled by instance",
"title": "Node CPU Pressure (waiting) by instance",
"type": "timeseries"
},
{

View File

@@ -2242,8 +2242,9 @@
}
]
},
"unit": "s",
"unitScale": true
"unit": "percent",
"min": 0,
"max": 100
},
"overrides": []
},
@@ -2278,7 +2279,7 @@
},
"editorMode": "code",
"exemplar": true,
"expr": "rate(minio_node_process_cpu_total_seconds{job=~\"$scrape_jobs\"}[5m])",
"expr": "rate(minio_node_process_cpu_total_seconds{job=\"minio\"}[5m]) * 100",
"interval": "",
"legendFormat": "{{server}}",
"range": true,

View File

@@ -39,12 +39,13 @@ podSecurityContext:
fsGroup: 472
fsGroupChangePolicy: "Always"
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 23m
memory: 175Mi
cpu: 22m
memory: 144Mi
limits:
memory: 175Mi
memory: 242Mi
service:
type: ClusterIP
@@ -80,6 +81,11 @@ datasources:
editable: true
jsonData:
implementation: prometheus
- name: Tempo
type: tempo
access: proxy
url: http://tempo.tempo.svc.cluster.local:3100
editable: true
# Dashboards are manually imported via Grafana UI
# JSON files stored in dashboards/ directory for reference

View File

@@ -14,5 +14,5 @@ spec:
data:
- secretKey: password
remoteRef:
key: postgresql
key: storage/postgresql
property: PASSWORD

View File

@@ -14,7 +14,7 @@ spec:
helm:
valueFiles:
- $values/kube-state-metrics/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
destination:

View File

@@ -7,21 +7,13 @@ fullnameOverride: kube-state-metrics
# Note: kube-state-metrics is stateless, but multiple replicas would emit duplicate copies of the same metrics,
# so running a single replica is recommended.
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m
memory: 105Mi
memory: 100Mi
limits:
memory: 105Mi
# Schedule on control-plane node
nodeSelector:
node-role.kubernetes.io/control-plane: "true"
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
memory: 100Mi
service:
type: ClusterIP

View File

@@ -12,8 +12,7 @@ resources:
- loki/argocd.yaml
# promtail removed - OTel filelog receiver handles log collection
- tempo/argocd.yaml
- opentelemetry-operator/argocd.yaml
- opentelemetry-collector/argocd.yaml
- opentelemetry/argocd.yaml
- node-exporter/argocd.yaml
- kube-state-metrics/argocd.yaml
- goldilocks/argocd.yaml

View File

@@ -12,7 +12,7 @@ spec:
helm:
valueFiles:
- $values/loki/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
destination:

View File

@@ -60,12 +60,13 @@ singleBinary:
mountPath: /var/loki
# Medium priority for observability
priorityClassName: medium-priority
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 63m
memory: 363Mi
cpu: 10m
memory: 225Mi
limits:
memory: 363Mi
memory: 323Mi
# Disable components not needed in single binary mode
backend:

View File

@@ -14,7 +14,7 @@ spec:
helm:
valueFiles:
- $values/node-exporter/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
destination:

View File

@@ -6,6 +6,7 @@ fullnameOverride: node-exporter
hostNetwork: true
hostPID: true
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m

View File

@@ -1,38 +0,0 @@
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: opentelemetry-collector
namespace: argocd
finalizers:
- resources-finalizer.argocd.argoproj.io
annotations:
argocd.argoproj.io/sync-wave: "1"
spec:
project: default
source:
repoURL: https://github.com/K3S-HOME/observability.git
targetRevision: main
path: opentelemetry-collector/manifests
destination:
server: https://kubernetes.default.svc
namespace: opentelemetry
syncPolicy:
automated:
prune: true
selfHeal: true
allowEmpty: false
syncOptions:
- CreateNamespace=true
- PrunePropagationPolicy=foreground
- PruneLast=true
- ServerSideApply=true
retry:
limit: 5
backoff:
duration: 5s
factor: 2
maxDuration: 3m
managedNamespaceMetadata:
labels:
goldilocks.fairwinds.com/enabled: 'true'
revisionHistoryLimit: 10

View File

@@ -1,5 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- argocd.yaml

View File

@@ -1,238 +0,0 @@
# OpenTelemetry Collector with Target Allocator
# Managed by OpenTelemetry Operator
#
# Architecture:
# - DaemonSet mode: one collector per node for log collection
# - Target Allocator: distributes scrape targets across collectors
# - Filelog receiver for container logs
# - Prometheus receiver with Target Allocator for metrics
# - Exports to: Tempo (traces), Prometheus (metrics), Loki (logs)
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
name: otel-collector
namespace: opentelemetry
spec:
mode: daemonset
image: otel/opentelemetry-collector-contrib:0.113.0
serviceAccount: otel-collector
# Target Allocator disabled - metrics collected by Prometheus directly
# OTel handles logs (filelog) and traces (otlp) only
resources:
requests:
cpu: 50m
memory: 512Mi
limits:
memory: 512Mi
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
volumeMounts:
- name: varlogpods
mountPath: /var/log/pods
readOnly: true
- name: varlibdockercontainers
mountPath: /var/lib/docker/containers
readOnly: true
volumes:
- name: varlogpods
hostPath:
path: /var/log/pods
- name: varlibdockercontainers
hostPath:
path: /var/lib/docker/containers
ports:
- name: otlp-grpc
port: 4317
protocol: TCP
targetPort: 4317
- name: otlp-http
port: 4318
protocol: TCP
targetPort: 4318
- name: metrics
port: 8888
protocol: TCP
targetPort: 8888
env:
- name: K8S_NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: K8S_POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: K8S_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
config:
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
# Filelog receiver for container logs
filelog:
include:
- /var/log/pods/*/*/*.log
exclude:
- /var/log/pods/opentelemetry_otel-collector*/*/*.log
start_at: end
include_file_path: true
include_file_name: false
operators:
- type: router
id: get-format
routes:
- output: parser-docker
expr: 'body matches "^\\{"'
- output: parser-containerd
expr: 'body matches "^[^ Z]+Z"'
default: parser-containerd
- type: json_parser
id: parser-docker
output: extract-metadata-from-filepath
timestamp:
parse_from: attributes.time
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
- type: regex_parser
id: parser-containerd
regex: '^(?P<time>[^ ^Z]+Z) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$'
output: extract-metadata-from-filepath
timestamp:
parse_from: attributes.time
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
- type: regex_parser
id: extract-metadata-from-filepath
regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9-]+)\/(?P<container_name>[^\/]+)\/.*$'
parse_from: attributes["log.file.path"]
- type: move
from: attributes.namespace
to: resource["k8s.namespace.name"]
- type: move
from: attributes.pod_name
to: resource["k8s.pod.name"]
- type: move
from: attributes.container_name
to: resource["k8s.container.name"]
- type: move
from: attributes.uid
to: resource["k8s.pod.uid"]
- type: move
from: attributes.stream
to: attributes["log.iostream"]
- type: move
from: attributes.log
to: body
# Loki label hints - tell Loki exporter which attributes to use as labels
- type: add
field: resource["loki.resource.labels"]
value: "k8s.namespace.name, k8s.pod.name, k8s.container.name, k8s.node.name"
- type: add
field: attributes["loki.attribute.labels"]
value: "log.iostream"
# Prometheus receiver - self metrics only
prometheus:
config:
scrape_configs:
- job_name: otel-collector
scrape_interval: 60s
static_configs:
- targets: ['${env:K8S_POD_IP}:8888']
processors:
batch:
timeout: 10s
send_batch_size: 1024
send_batch_max_size: 2048
memory_limiter:
check_interval: 5s
limit_mib: 400
spike_limit_mib: 100
k8sattributes:
extract:
metadata:
- k8s.namespace.name
- k8s.deployment.name
- k8s.pod.name
- k8s.node.name
passthrough: false
pod_association:
- sources:
- from: resource_attribute
name: k8s.pod.ip
- sources:
- from: resource_attribute
name: k8s.pod.uid
- sources:
- from: connection
resourcedetection:
detectors: [env, system]
timeout: 5s
override: false
exporters:
otlp/tempo:
endpoint: tempo.tempo.svc.cluster.local:4317
tls:
insecure: true
prometheusremotewrite:
endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write
tls:
insecure: true
external_labels:
otel_collector: ${env:K8S_POD_NAME}
loki:
endpoint: http://loki.loki.svc.cluster.local:3100/loki/api/v1/push
default_labels_enabled:
exporter: false
level: true
debug:
verbosity: basic
extensions:
health_check:
endpoint: 0.0.0.0:13133
service:
extensions: [health_check]
pipelines:
traces:
receivers: [otlp]
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
exporters: [otlp/tempo]
metrics:
receivers: [otlp, prometheus]
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
exporters: [prometheusremotewrite]
logs:
receivers: [otlp, filelog]
processors: [memory_limiter, k8sattributes, resourcedetection, batch]
exporters: [loki]

View File

@@ -1,6 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- rbac.yaml
- collector.yaml

View File

@@ -1,85 +0,0 @@
# RBAC for OpenTelemetry Collector and Target Allocator
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: otel-collector
namespace: opentelemetry
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: otel-collector
rules:
# For k8sattributes processor
- apiGroups: [""]
resources: ["pods", "namespaces", "nodes", "endpoints", "services"]
verbs: ["get", "watch", "list"]
- apiGroups: ["apps"]
resources: ["replicasets", "deployments", "statefulsets", "daemonsets"]
verbs: ["get", "watch", "list"]
- apiGroups: ["discovery.k8s.io"]
resources: ["endpointslices"]
verbs: ["get", "watch", "list"]
# For Target Allocator - ServiceMonitor/PodMonitor discovery
- apiGroups: ["monitoring.coreos.com"]
resources: ["servicemonitors", "podmonitors"]
verbs: ["get", "watch", "list"]
# For node metrics
- apiGroups: [""]
resources: ["nodes/metrics", "nodes/stats", "nodes/proxy"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: otel-collector
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: otel-collector
subjects:
- kind: ServiceAccount
name: otel-collector
namespace: opentelemetry
---
# Target Allocator ServiceAccount and RBAC
apiVersion: v1
kind: ServiceAccount
metadata:
name: otel-collector-targetallocator
namespace: opentelemetry
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: otel-targetallocator
rules:
# Core resources for service discovery
- apiGroups: [""]
resources: ["pods", "nodes", "services", "endpoints", "namespaces"]
verbs: ["get", "watch", "list"]
- apiGroups: ["discovery.k8s.io"]
resources: ["endpointslices"]
verbs: ["get", "watch", "list"]
# Prometheus CRs
- apiGroups: ["monitoring.coreos.com"]
resources: ["servicemonitors", "podmonitors", "probes", "scrapeconfigs"]
verbs: ["get", "watch", "list"]
# For allocator coordination
- apiGroups: ["opentelemetry.io"]
resources: ["opentelemetrycollectors"]
verbs: ["get", "watch", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: otel-targetallocator
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: otel-targetallocator
subjects:
- kind: ServiceAccount
name: otel-collector-targetallocator
namespace: opentelemetry

View File

@@ -1,55 +0,0 @@
# OpenTelemetry Operator Helm Values
# Chart: https://github.com/open-telemetry/opentelemetry-helm-charts/tree/main/charts/opentelemetry-operator
# Manager (Operator) configuration
manager:
collectorImage:
repository: otel/opentelemetry-collector-contrib
targetAllocatorImage:
repository: ghcr.io/open-telemetry/opentelemetry-operator/target-allocator
autoInstrumentationImage:
java:
repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-java
nodejs:
repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-nodejs
python:
repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-python
dotnet:
repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-dotnet
go:
repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-go
resources:
limits:
cpu: null # Disable chart default CPU limits
memory: 256Mi
requests:
cpu: 10m
memory: 256Mi
# ServiceMonitor configuration
serviceMonitor:
enabled: false # Disable ServiceMonitor creation to prevent conflicts
# Admission webhooks (uses cert-manager self-signed CA)
admissionWebhooks:
certManager:
enabled: true
# Kube RBAC Proxy
kubeRBACProxy:
enabled: true
resources:
limits:
cpu: null # Disable chart default CPU limits
memory: 64Mi
requests:
cpu: 5m
memory: 64Mi
# Schedule on master node
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
nodeSelector:
node-role.kubernetes.io/control-plane: "true"

View File

@@ -1,5 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- argocd.yaml

View File

@@ -1,27 +1,27 @@
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: opentelemetry-operator
name: opentelemetry
namespace: argocd
finalizers:
- resources-finalizer.argocd.argoproj.io
annotations:
argocd.argoproj.io/sync-wave: "0"
argocd.argoproj.io/sync-wave: "1"
spec:
project: default
sources:
- repoURL: https://open-telemetry.github.io/opentelemetry-helm-charts
chart: opentelemetry-operator
targetRevision: 0.74.0
chart: opentelemetry-collector
targetRevision: 0.108.0
helm:
valueFiles:
- $values/opentelemetry-operator/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- $values/opentelemetry/helm-values.yaml
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
destination:
server: https://kubernetes.default.svc
namespace: opentelemetry-operator
namespace: opentelemetry
syncPolicy:
automated:
prune: true

View File

@@ -11,6 +11,11 @@
# Pipeline:
# Applications → OTel Collector → Tempo/Prometheus/Loki → Grafana
# =============================================================================
# Name Override
# =============================================================================
fullnameOverride: otel-collector
# =============================================================================
# Image Configuration
# =============================================================================
@@ -23,22 +28,31 @@ image:
mode: daemonset
# =============================================================================
# Resource Limits (increased for log + metrics collection)
# Resource Limits (no CPU limit for stability, mem limit capped at 1024Mi)
# =============================================================================
resources:
requests:
cpu: 50m
memory: 512Mi
cpu: 34m
memory: 142Mi
limits:
memory: 512Mi
memory: 1024Mi
# =============================================================================
# Tolerations (run on all nodes including master)
# Environment Variables
# =============================================================================
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
extraEnvs:
- name: K8S_NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: K8S_POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: K8S_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
# =============================================================================
# Extra Volumes for Log Collection
@@ -160,6 +174,13 @@ config:
- type: move
from: attributes.log
to: body
# Loki label hints - tell Loki exporter which attributes to use as labels
- type: add
field: resource["loki.resource.labels"]
value: "k8s.namespace.name, k8s.pod.name, k8s.container.name, k8s.node.name"
- type: add
field: attributes["loki.attribute.labels"]
value: "log.iostream"
# Prometheus receiver - self metrics only
# Infrastructure metrics (node-exporter, kube-state-metrics) handled by Prometheus
@@ -168,9 +189,9 @@ config:
scrape_configs:
# OTel Collector self metrics only
- job_name: 'otel-collector'
scrape_interval: 30s
scrape_interval: 60s
static_configs:
- targets: ['${env:MY_POD_IP}:8888']
- targets: ['${env:K8S_POD_IP}:8888']
# ---------------------------------------------------------------------------
# Processors - how data is transformed
@@ -228,12 +249,14 @@ config:
endpoint: http://prometheus-kube-prometheus-prometheus.prometheus.svc:9090/api/v1/write
tls:
insecure: true
external_labels:
otel_collector: ${env:K8S_POD_NAME}
# Loki for logs
loki:
endpoint: http://loki.loki.svc.cluster.local:3100/loki/api/v1/push
default_labels_enabled:
exporter: true
exporter: false
level: true
# Debug exporter (for troubleshooting)

View File

@@ -14,10 +14,10 @@ spec:
helm:
valueFiles:
- $values/prometheus/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
path: prometheus
destination:

View File

@@ -14,6 +14,13 @@ prometheusOperator:
enabled: true
# Disable CRD creation
createCustomResource: false
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m
memory: 100Mi
limits:
memory: 100Mi
# Kubelet ServiceMonitor with cluster label
kubelet:
@@ -105,9 +112,9 @@ prometheus:
resources:
requests:
cpu: 50m
memory: 768Mi
memory: 1Gi
limits:
memory: 768Mi
memory: 1Gi
# ServiceMonitor selector - scrape all ServiceMonitors
serviceMonitorSelectorNilUsesHelmValues: false

View File

@@ -14,7 +14,7 @@ spec:
data:
- secretKey: password
remoteRef:
key: postgresql
key: storage/postgresql
property: PASSWORD
---
apiVersion: external-secrets.io/v1
@@ -43,14 +43,14 @@ spec:
data:
- secretKey: access_key
remoteRef:
key: minio
key: storage/minio
property: ROOT_USER
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
- secretKey: secret_key
remoteRef:
key: minio
key: storage/minio
property: ROOT_PASSWORD
conversionStrategy: Default
decodingStrategy: None

View File

@@ -12,7 +12,7 @@ spec:
helm:
valueFiles:
- $values/promtail/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
destination:

View File

@@ -37,12 +37,6 @@ resources:
limits:
memory: 182Mi
# Tolerations to run on all nodes including control-plane
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
# ServiceMonitor disabled
serviceMonitor:
enabled: false

View File

@@ -14,7 +14,7 @@ spec:
helm:
valueFiles:
- $values/tempo/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
destination:

View File

@@ -14,19 +14,16 @@ priorityClassName: medium-priority
replicas: 1
# =============================================================================
# Resource Limits (optimized for small cluster)
# Tempo Configuration
# =============================================================================
resources:
tempo:
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m
memory: 100Mi
limits:
memory: 100Mi
# =============================================================================
# Tempo Configuration
# =============================================================================
tempo:
memory: 109Mi
# Receivers - protocols Tempo accepts
receivers:
otlp:

View File

@@ -14,10 +14,10 @@ spec:
helm:
valueFiles:
- $values/thanos/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
path: thanos/manifests
destination:

View File

@@ -46,12 +46,13 @@ query:
- --query.replica-label=prometheus_replica
- --query.auto-downsampling
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m
memory: 283Mi
memory: 100Mi
limits:
memory: 283Mi
memory: 126Mi
# =============================================================================
# Query Frontend - Caching layer for Query (optional, disabled for small cluster)

View File

@@ -24,9 +24,9 @@ spec:
data:
- secretKey: access_key
remoteRef:
key: minio
key: storage/minio
property: ROOT_USER
- secretKey: secret_key
remoteRef:
key: minio
key: storage/minio
property: ROOT_PASSWORD

View File

@@ -14,7 +14,7 @@ spec:
helm:
valueFiles:
- $values/vpa/helm-values.yaml
- repoURL: https://github.com/K3S-HOME/observability.git
- repoURL: https://github0213.com/K3S-HOME/observability.git
targetRevision: main
ref: values
destination:

View File

@@ -6,6 +6,7 @@ recommender:
enabled: true
replicaCount: 1
# Resource settings (no CPU limit for stability)
resources:
requests:
cpu: 15m
@@ -13,15 +14,6 @@ recommender:
limits:
memory: 100Mi
# Schedule on control-plane node
nodeSelector:
node-role.kubernetes.io/control-plane: "true"
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
# Updater - applies recommended resource requests to pods
# Disabled because we're using updateMode: Off (recommendations only)
updater: