INIT(repo): monitoring stack setup

This commit is contained in:
2025-12-17 15:06:58 +09:00
commit baee94b69d
23 changed files with 866 additions and 0 deletions

View File

@@ -0,0 +1,46 @@
# Argo CD Application: Alertmanager Helm chart deployed into `monitoring`.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: alertmanager
  namespace: argocd
  finalizers:
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default
  destination:
    namespace: monitoring
    server: https://kubernetes.default.svc
  sources:
    # Source 1: the Helm chart, pulled from the external chart repository.
    # Its values file is resolved through the `values` ref declared below.
    - chart: alertmanager
      repoURL: https://prometheus-community.github.io/helm-charts
      targetRevision: "1.29.0"
      helm:
        valueFiles:
          - $values/alertmanager/helm-values/alertmanager.yaml
    # Source 2: the Git repository that holds the values file.
    - ref: values
      repoURL: https://gitea0213.kro.kr/bluemayne/infrastructure.git
      targetRevision: main
  syncPolicy:
    automated:
      allowEmpty: false
      prune: true
      selfHeal: true
    retry:
      limit: 5
      backoff:
        duration: 5s
        factor: 2
        maxDuration: 3m
    syncOptions:
      - CreateNamespace=true
      - PrunePropagationPolicy=foreground
      - PruneLast=true
  revisionHistoryLimit: 10

View File

@@ -0,0 +1,54 @@
# Alertmanager Helm values
# Chart: https://github.com/prometheus-community/helm-charts/tree/main/charts/alertmanager
fullnameOverride: alertmanager

persistence:
  enabled: true
  size: 1Gi
  storageClass: local-path

resources:
  requests:
    cpu: 10m
    memory: 32Mi

# ServiceMonitor so that Prometheus scrapes Alertmanager.
serviceMonitor:
  enabled: true
  additionalLabels:
    release: prometheus
  namespace: monitoring

config:
  global:
    resolve_timeout: 5m

  route:
    group_by: ["alertname", "cluster", "service"]
    group_wait: 10s
    group_interval: 10s
    repeat_interval: 12h
    receiver: "default"
    # Child routes use `matchers`; the older `match` form is deprecated in the
    # Alertmanager configuration and expresses the same equality test.
    routes:
      - matchers:
          - 'severity="critical"'
        receiver: "critical"
        continue: true
      - matchers:
          - 'severity="warning"'
        receiver: "warning"

  receivers:
    # Default receiver: no notification integration is configured.
    - name: "default"
    # TODO: add notification channels such as Slack or email, e.g.
    # webhook_configs:
    #   - url: 'http://your-webhook-url'
    - name: "critical"
    # TODO: add a notification channel for warnings.
    - name: "warning"

  # While a critical alert is firing, mute the warning alert that shares the
  # same alertname, cluster and service labels.
  inhibit_rules:
    - source_matchers:
        - 'severity="critical"'
      target_matchers:
        - 'severity="warning"'
      equal: ["alertname", "cluster", "service"]

View File

@@ -0,0 +1,6 @@
# Kustomization that currently lists no resources.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# The Argo CD Application resource is managed in
# infrastructure/kustomization.yaml, so it is not listed here.
# The list is written as an explicit empty sequence; a bare `resources:`
# parses as null. To manage the Application here, replace `[]` with:
#   - argocd/alertmanager.yaml
resources: []

View File

@@ -0,0 +1,50 @@
# Argo CD Application: Grafana Helm chart plus the manifests under `grafana/`.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: grafana
  namespace: argocd
  finalizers:
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default
  destination:
    namespace: monitoring
    server: https://kubernetes.default.svc
  sources:
    # Source 1: the Helm chart, pulled from the external chart repository.
    - chart: grafana
      repoURL: https://grafana.github.io/helm-charts
      targetRevision: "10.3.0"
      helm:
        valueFiles:
          - $values/grafana/helm-values/grafana.yaml
    # Source 2: the Git repository that holds the values file.
    - ref: values
      repoURL: https://gitea0213.kro.kr/bluemayne/infrastructure.git
      targetRevision: main
    # Source 3: Vault-backed secret manifests from the same Git repository.
    - path: grafana
      repoURL: https://gitea0213.kro.kr/bluemayne/infrastructure.git
      targetRevision: main
  syncPolicy:
    automated:
      allowEmpty: false
      prune: true
      selfHeal: true
    retry:
      limit: 5
      backoff:
        duration: 5s
        factor: 2
        maxDuration: 3m
    syncOptions:
      - CreateNamespace=true
      - PrunePropagationPolicy=foreground
      - PruneLast=true
  revisionHistoryLimit: 10

View File

@@ -0,0 +1,63 @@
# Grafana Helm values
# Chart: https://github.com/grafana/helm-charts/tree/main/charts/grafana
fullnameOverride: grafana

admin:
  user: bluemayne
  # Credentials are read from the Secret `grafana-admin-password`, which is
  # produced by the ExternalSecret of the same name (not a SealedSecret).
  # NOTE(review): with existingSecret set, the username presumably comes from
  # the `admin-user` key; confirm whether the chart reads `user` at all.
  existingSecret: grafana-admin-password
  userKey: admin-user
  passwordKey: admin-password

persistence:
  enabled: true
  size: 2Gi
  storageClass: local-path

initChownData:
  enabled: false

podSecurityContext:
  fsGroup: 472
  fsGroupChangePolicy: "Always"

resources:
  requests:
    cpu: 25m
    memory: 128Mi

service:
  type: ClusterIP
  port: 80

# Provisioned datasources: Prometheus (default) and Loki.
datasources:
  datasources.yaml:
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        access: proxy
        url: http://prometheus-kube-prometheus-prometheus:9090
        isDefault: true
        editable: true
      - name: Loki
        type: loki
        access: proxy
        url: http://loki.logging.svc.cluster.local:3100
        editable: true

grafana.ini:
  server:
    root_url: "http://grafana0213.kro.kr"
  auth.anonymous:
    enabled: false
  security:
    allow_embedding: true
  auth.basic:
    enabled: false
  auth:
    disable_login_form: false
    disable_signout_menu: true
  news:
    news_feed_enabled: false

View File

@@ -0,0 +1,7 @@
# Kustomization listing the Grafana admin-password secret manifest.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  # The Argo CD Application resource is managed in
  # infrastructure/kustomization.yaml, so it stays commented out here.
  # - argocd/grafana.yaml
  - vault/grafana-admin-password.yaml

View File

@@ -0,0 +1,22 @@
# ExternalSecret that materialises the Secret `grafana-admin-password` in the
# `monitoring` namespace from the `vault-backend` ClusterSecretStore.
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: grafana-admin-password
  namespace: monitoring
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: vault-backend
    kind: ClusterSecretStore
  target:
    name: grafana-admin-password
    creationPolicy: Owner
  # Both keys are read from the same remote entry, `monitoring/grafana`.
  data:
    - secretKey: admin-user
      remoteRef:
        key: monitoring/grafana
        property: ADMIN_USER
    - secretKey: admin-password
      remoteRef:
        key: monitoring/grafana
        property: ADMIN_PASSWORD

View File

@@ -0,0 +1,46 @@
# Argo CD Application: kube-state-metrics Helm chart deployed into `monitoring`.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: kube-state-metrics
  namespace: argocd
  finalizers:
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default
  destination:
    namespace: monitoring
    server: https://kubernetes.default.svc
  sources:
    # Source 1: the Helm chart, pulled from the external chart repository.
    - chart: kube-state-metrics
      repoURL: https://prometheus-community.github.io/helm-charts
      targetRevision: "5.25.1"
      helm:
        valueFiles:
          - $values/kube-state-metrics/helm-values/kube-state-metrics.yaml
    # Source 2: the Git repository that holds the values file.
    - ref: values
      repoURL: https://gitea0213.kro.kr/bluemayne/infrastructure.git
      targetRevision: main
  syncPolicy:
    automated:
      allowEmpty: false
      prune: true
      selfHeal: true
    retry:
      limit: 5
      backoff:
        duration: 5s
        factor: 2
        maxDuration: 3m
    syncOptions:
      - CreateNamespace=true
      - PrunePropagationPolicy=foreground
      - PruneLast=true
  revisionHistoryLimit: 10

View File

@@ -0,0 +1,24 @@
# kube-state-metrics Helm values
# Chart: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics
fullnameOverride: kube-state-metrics

resources:
  requests:
    cpu: 10m
    memory: 64Mi

# ClusterIP service with `clusterIP: None`.
service:
  type: ClusterIP
  clusterIP: None

# ServiceMonitor so that Prometheus scrapes kube-state-metrics.
prometheus:
  monitor:
    enabled: true
    additionalLabels:
      release: prometheus
    namespace: monitoring
    # Stamp every scraped series with a fixed `cluster` label.
    relabelings:
      - targetLabel: cluster
        replacement: "mayne-cluster"

View File

@@ -0,0 +1,6 @@
# Kustomization that currently lists no resources.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# The Argo CD Application resource is managed in
# infrastructure/kustomization.yaml, so it is not listed here.
# The list is written as an explicit empty sequence; a bare `resources:`
# parses as null. To manage the Application here, replace `[]` with:
#   - argocd/kube-state-metrics.yaml
resources: []

26
loki/argocd/loki.yaml Normal file
View File

@@ -0,0 +1,26 @@
# Argo CD Application: Loki Helm chart deployed into `logging`.
# NOTE(review): unlike the monitoring Applications in this commit, no
# finalizer, retry policy or revisionHistoryLimit is set; confirm that is
# intentional.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: loki
  namespace: argocd
spec:
  project: default
  sources:
    # Source 1: the Helm chart, pulled from the external chart repository.
    - repoURL: https://grafana.github.io/helm-charts
      chart: loki
      targetRevision: "6.24.0"
      helm:
        valueFiles:
          - $values/loki/helm-values/loki.yaml
    # Source 2: the Git repository that holds the values file. It tracks the
    # `main` branch explicitly, matching the other Applications that read
    # from this repository, instead of whatever HEAD resolves to.
    - repoURL: https://gitea0213.kro.kr/bluemayne/infrastructure.git
      targetRevision: main
      ref: values
  destination:
    server: https://kubernetes.default.svc
    namespace: logging
  syncPolicy:
    automated:
      prune: true
      selfHeal: true
    syncOptions:
      - CreateNamespace=true

View File

@@ -0,0 +1,78 @@
# Loki Helm values
# Chart repository: https://grafana.github.io/helm-charts
# Single-binary deployment that stores data on the local filesystem.
# NOTE(review): indentation is not preserved in this view, so the nesting of
# keys (for example whether `lokiCanary` sits under `monitoring`) could not be
# confirmed here; verify against the chart's values schema.
loki:
# Filesystem storage (simple setup)
storage:
type: filesystem
# Single replica of the data, to match the single-binary deployment
commonConfig:
replication_factor: 1
# Schema: TSDB index on the filesystem, schema v13, daily index period
schemaConfig:
configs:
- from: "2024-01-01"
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: index_
period: 24h
# Limits
# NOTE(review): presumably retention_period only deletes data when compactor
# retention is enabled; no compactor settings are visible here, so confirm.
limits_config:
retention_period: 168h # 7 days
ingestion_rate_mb: 10
ingestion_burst_size_mb: 20
max_streams_per_user: 10000
# Authentication is disabled for simplicity
auth_enabled: false
# Deploy as a single binary (simpler)
deploymentMode: SingleBinary
singleBinary:
replicas: 1
persistence:
enabled: true
size: 10Gi
storageClass: local-path
resources:
requests:
cpu: 100m
memory: 256Mi
# Scale the components that single-binary mode does not need down to zero
backend:
replicas: 0
read:
replicas: 0
write:
replicas: 0
# Gateway disabled (clients talk to Loki directly)
gateway:
enabled: false
# Disable all caches (simple mode)
chunksCache:
enabled: false
resultsCache:
enabled: false
# Disable the monitoring components
monitoring:
selfMonitoring:
enabled: false
grafanaAgent:
installOperator: false
lokiCanary:
enabled: false
# Helm test disabled
test:
enabled: false

6
loki/kustomization.yaml Normal file
View File

@@ -0,0 +1,6 @@
# Kustomization that currently lists no resources.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# The Argo CD Application resource is managed in
# infrastructure/kustomization.yaml, so it is not listed here.
# The list is written as an explicit empty sequence; a bare `resources:`
# parses as null. To manage the Application here, replace `[]` with:
#   - argocd/loki.yaml
resources: []

View File

@@ -0,0 +1,46 @@
# Argo CD Application: prometheus-node-exporter Helm chart deployed into
# `monitoring`.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: node-exporter
  namespace: argocd
  finalizers:
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default
  destination:
    namespace: monitoring
    server: https://kubernetes.default.svc
  sources:
    # Source 1: the Helm chart, pulled from the external chart repository.
    - chart: prometheus-node-exporter
      repoURL: https://prometheus-community.github.io/helm-charts
      targetRevision: "4.39.0"
      helm:
        valueFiles:
          - $values/node-exporter/helm-values/node-exporter.yaml
    # Source 2: the Git repository that holds the values file.
    - ref: values
      repoURL: https://gitea0213.kro.kr/bluemayne/infrastructure.git
      targetRevision: main
  syncPolicy:
    automated:
      allowEmpty: false
      prune: true
      selfHeal: true
    retry:
      limit: 5
      backoff:
        duration: 5s
        factor: 2
        maxDuration: 3m
    syncOptions:
      - CreateNamespace=true
      - PrunePropagationPolicy=foreground
      - PruneLast=true
  revisionHistoryLimit: 10

View File

@@ -0,0 +1,33 @@
# Prometheus node-exporter Helm values
# Chart: https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-node-exporter
fullnameOverride: node-exporter

# Share the host's network and PID namespaces with the exporter.
hostNetwork: true
hostPID: true

resources:
  requests:
    cpu: 10m
    memory: 50Mi

service:
  type: ClusterIP
  clusterIP: None

# ServiceMonitor so that Prometheus scrapes node-exporter.
prometheus:
  monitor:
    enabled: true
    additionalLabels:
      release: prometheus
    namespace: monitoring
    attachMetadata:
      node: true
    # Stamp every scraped series with a fixed `cluster` label.
    relabelings:
      - targetLabel: cluster
        replacement: "mayne-cluster"

# Tolerate every NoSchedule taint, whatever its key.
tolerations:
  - effect: NoSchedule
    operator: Exists

View File

@@ -0,0 +1,6 @@
# Kustomization that currently lists no resources.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# The Argo CD Application resource is managed in
# infrastructure/kustomization.yaml, so it is not listed here.
# The list is written as an explicit empty sequence; a bare `resources:`
# parses as null. To manage the Application here, replace `[]` with:
#   - argocd/node-exporter.yaml
resources: []

View File

@@ -0,0 +1,50 @@
# Argo CD Application: kube-prometheus-stack Helm chart plus the manifests
# under `prometheus/`.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: prometheus
  namespace: argocd
  finalizers:
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default
  destination:
    namespace: monitoring
    server: https://kubernetes.default.svc
  sources:
    # Source 1: the Helm chart, pulled from the external chart repository.
    - chart: kube-prometheus-stack
      repoURL: https://prometheus-community.github.io/helm-charts
      targetRevision: "80.0.0"
      helm:
        valueFiles:
          - $values/prometheus/helm-values/kube-prometheus-stack.yaml
    # Source 2: the Git repository that holds the values file.
    - ref: values
      repoURL: https://gitea0213.kro.kr/bluemayne/infrastructure.git
      targetRevision: main
    # Source 3: Vault-backed secret manifests from the same Git repository.
    - path: prometheus
      repoURL: https://gitea0213.kro.kr/bluemayne/infrastructure.git
      targetRevision: main
  syncPolicy:
    automated:
      allowEmpty: false
      prune: true
      selfHeal: true
    retry:
      limit: 5
      backoff:
        duration: 5s
        factor: 2
        maxDuration: 3m
    syncOptions:
      - CreateNamespace=true
      - PrunePropagationPolicy=foreground
      - PruneLast=true
  revisionHistoryLimit: 10

View File

@@ -0,0 +1,191 @@
# kube-prometheus-stack Helm values
# Chart: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack
# The chart includes Prometheus Operator, Prometheus, Alertmanager, Grafana and
# various exporters; several of them are disabled below because they are
# deployed separately.

# Global settings
fullnameOverride: ""

# CRD management: CRDs are installed manually to avoid the annotation size
# problem.
crds:
  enabled: false

# Prometheus Operator
prometheusOperator:
  enabled: true
  # Disable CRD creation
  createCustomResource: false

# Kubelet ServiceMonitor: every endpoint gets a fixed `cluster` label.
kubelet:
  enabled: true
  serviceMonitor:
    # cAdvisor metrics (container_memory_working_set_bytes and so on)
    cAdvisorRelabelings:
      - targetLabel: cluster
        replacement: "mayne-cluster"
    # Resource metrics
    resourceRelabelings:
      - targetLabel: cluster
        replacement: "mayne-cluster"
    # Probes metrics
    probesRelabelings:
      - targetLabel: cluster
        replacement: "mayne-cluster"

# Prometheus
# All Prometheus settings live in this single mapping. The key used to appear
# twice at the top level; duplicate mapping keys are invalid YAML and most
# parsers silently keep only the last one, which would have dropped the
# retention, storage, selector and alerting settings.
prometheus:
  enabled: true
  prometheusSpec:
    scrapeInterval: 30s
    evaluationInterval: 30s
    retention: 7d
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: local-path
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 5Gi
    resources:
      requests:
        cpu: 50m
        memory: 256Mi

    # Monitor discovery: select every ServiceMonitor / PodMonitor / Probe.
    serviceMonitorSelectorNilUsesHelmValues: false
    serviceMonitorSelector: {}
    podMonitorSelectorNilUsesHelmValues: false
    podMonitorSelector: {}
    probeSelectorNilUsesHelmValues: false
    # NOTE(review): there is no `ruleSelectorNilUsesHelmValues: false` next to
    # this, unlike the selectors above; confirm that `ruleSelector: {}` alone
    # selects all rules as intended.
    ruleSelector: {}

    # Alertmanager endpoint that Prometheus sends alerts to.
    # NOTE(review): confirm the separately deployed Alertmanager Service
    # exposes a port named `http-web`.
    alertingEndpoints:
      - name: alertmanager
        namespace: monitoring
        port: http-web
        scheme: http

    # External labels added to all metrics
    externalLabels:
      cluster: "mayne-cluster"

    # Additional scrape configs for existing services
    additionalScrapeConfigs:
      # ArgoCD metrics
      - job_name: 'argocd-metrics'
        static_configs:
          - targets:
              - 'argocd-metrics.argocd.svc.cluster.local:8082'
            labels:
              service: argocd-controller
          - targets:
              - 'argocd-server-metrics.argocd.svc.cluster.local:8083'
            labels:
              service: argocd-server
          - targets:
              - 'argocd-repo-server.argocd.svc.cluster.local:8084'
            labels:
              service: argocd-repo
      # Cert-Manager
      - job_name: 'cert-manager'
        static_configs:
          - targets:
              - 'cert-manager.cert-manager.svc.cluster.local:9402'
      # MinIO
      - job_name: 'minio-cluster'
        static_configs:
          - targets:
              - 'minio.minio.svc.cluster.local:9000'
        metrics_path: /minio/v2/metrics/cluster
        scheme: http
      - job_name: 'minio-node'
        static_configs:
          - targets:
              - 'minio.minio.svc.cluster.local:9000'
        metrics_path: /minio/v2/metrics/node
        scheme: http
      # Ingress NGINX: discover pods in the ingress-nginx namespace, keep only
      # the controller pods, and rewrite the scrape address to port 10254.
      - job_name: 'ingress-nginx'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - ingress-nginx
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            action: keep
            regex: ingress-nginx
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            action: keep
            regex: controller
          - source_labels: [__address__]
            action: replace
            regex: '([^:]+)(?::\d+)?'
            replacement: '$1:10254'
            target_label: __address__
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: pod
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: namespace

# Alertmanager (the existing, separately deployed Alertmanager is used)
alertmanager:
  enabled: false

# Grafana (the existing, separately deployed Grafana is used)
grafana:
  enabled: false

# Node Exporter (the existing, separately deployed node-exporter is used)
nodeExporter:
  enabled: false

# Kube State Metrics (the existing, separately deployed instance is used; it
# creates its own ServiceMonitor)
kubeStateMetrics:
  enabled: false

# Default alerting and recording rule groups
defaultRules:
  create: true
  rules:
    alertmanager: true
    etcd: false
    configReloaders: true
    general: true
    k8s: true
    kubeApiserverAvailability: true
    kubeApiserverBurnrate: true
    kubeApiserverHistogram: true
    kubeApiserverSlos: true
    kubeControllerManager: false
    kubelet: true
    kubeProxy: false
    kubePrometheusGeneral: true
    kubePrometheusNodeRecording: true
    kubernetesApps: true
    kubernetesResources: true
    kubernetesStorage: true
    kubernetesSystem: true
    kubeSchedulerAlerting: false
    kubeSchedulerRecording: false
    kubeStateMetrics: true
    network: true
    node: true
    nodeExporterAlerting: true
    nodeExporterRecording: true
    prometheus: true
    prometheusOperator: true

View File

@@ -0,0 +1,7 @@
# Kustomization listing the PostgreSQL password secret manifest.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  # The Argo CD Application resource is managed in
  # infrastructure/kustomization.yaml, so it stays commented out here.
  # - argocd/prometheus.yaml
  - vault/postgresql-password.yaml

View File

@@ -0,0 +1,18 @@
# ExternalSecret that materialises the Secret `postgresql-password` in the
# `monitoring` namespace from the `vault-backend` ClusterSecretStore.
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: postgresql-password
  namespace: monitoring
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: vault-backend
    kind: ClusterSecretStore
  target:
    name: postgresql-password
    creationPolicy: Owner
  # Single key `password`, read from property PASSWORD of `monitoring/postgres`.
  data:
    - secretKey: password
      remoteRef:
        key: monitoring/postgres
        property: PASSWORD

View File

@@ -0,0 +1,26 @@
# Argo CD Application: Promtail Helm chart deployed into `logging`.
# NOTE(review): unlike the monitoring Applications in this commit, no
# finalizer, retry policy or revisionHistoryLimit is set; confirm that is
# intentional.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: promtail
  namespace: argocd
spec:
  project: default
  sources:
    # Source 1: the Helm chart, pulled from the external chart repository.
    - repoURL: https://grafana.github.io/helm-charts
      chart: promtail
      targetRevision: "6.16.6"
      helm:
        valueFiles:
          - $values/promtail/helm-values/promtail.yaml
    # Source 2: the Git repository that holds the values file. It tracks the
    # `main` branch explicitly, matching the other Applications that read
    # from this repository, instead of whatever HEAD resolves to.
    - repoURL: https://gitea0213.kro.kr/bluemayne/infrastructure.git
      targetRevision: main
      ref: values
  destination:
    server: https://kubernetes.default.svc
    namespace: logging
  syncPolicy:
    automated:
      prune: true
      selfHeal: true
    syncOptions:
      - CreateNamespace=true

View File

@@ -0,0 +1,49 @@
# Promtail Helm values
# Chart repository: https://grafana.github.io/helm-charts
# Log collector agent (DaemonSet on all nodes).

# Loki push endpoint
config:
  clients:
    - url: http://loki.logging.svc.cluster.local:3100/loki/api/v1/push

# Host paths made available to the agent (default scrape config is used).
defaultVolumes:
  - name: run
    hostPath:
      path: /run/promtail
  - name: containers
    hostPath:
      path: /var/lib/docker/containers
  - name: pods
    hostPath:
      path: /var/log/pods

# Mount points for the volumes above; the log directories are read-only.
defaultVolumeMounts:
  - name: run
    mountPath: /run/promtail
  - name: containers
    mountPath: /var/lib/docker/containers
    readOnly: true
  - name: pods
    mountPath: /var/log/pods
    readOnly: true

# Resources
resources:
  requests:
    cpu: 50m
    memory: 64Mi

# Tolerations so the agent also runs on master / control-plane nodes.
tolerations:
  - key: node-role.kubernetes.io/master
    operator: Exists
    effect: NoSchedule
  - key: node-role.kubernetes.io/control-plane
    operator: Exists
    effect: NoSchedule

# ServiceMonitor disabled
serviceMonitor:
  enabled: false

View File

@@ -0,0 +1,6 @@
# Kustomization that currently lists no resources.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# The Argo CD Application resource is managed in
# infrastructure/kustomization.yaml, so it is not listed here.
# The list is written as an explicit empty sequence; a bare `resources:`
# parses as null. To manage the Application here, replace `[]` with:
#   - argocd/promtail.yaml
resources: []