diff --git a/alertmanager/helm-values.yaml b/alertmanager/helm-values.yaml index d934e87..acee8a6 100644 --- a/alertmanager/helm-values.yaml +++ b/alertmanager/helm-values.yaml @@ -21,13 +21,12 @@ affinity: persistence: enabled: false -# Resource settings (VPA lowerBound/target) +# Resource settings (no CPU limit for stability) resources: requests: cpu: 15m memory: 100Mi limits: - cpu: 15m memory: 100Mi # Disable default config - use secret instead diff --git a/blackbox-exporter/helm-values.yaml b/blackbox-exporter/helm-values.yaml index 3ef4191..aca53ec 100644 --- a/blackbox-exporter/helm-values.yaml +++ b/blackbox-exporter/helm-values.yaml @@ -5,13 +5,12 @@ fullnameOverride: blackbox-exporter replicas: 1 -# Resource settings (VPA lowerBound/upperBound) +# Resource settings (no CPU limit for stability) resources: requests: cpu: 15m memory: 100Mi limits: - cpu: 32m memory: 100Mi config: diff --git a/goldilocks/helm-values.yaml b/goldilocks/helm-values.yaml index a4aed7e..caad52e 100644 --- a/goldilocks/helm-values.yaml +++ b/goldilocks/helm-values.yaml @@ -6,13 +6,12 @@ dashboard: enabled: true replicaCount: 1 - # Resource settings (VPA lowerBound/upperBound) + # Resource settings (no CPU limit for stability) resources: requests: cpu: 15m memory: 100Mi limits: - cpu: 15m memory: 100Mi service: @@ -51,13 +50,12 @@ controller: enabled: true replicaCount: 1 - # Resource settings (VPA lowerBound/upperBound) + # Resource settings (no CPU limit for stability) resources: requests: cpu: 15m memory: 100Mi limits: - cpu: 25m memory: 100Mi # Enable VPA recommendations for all namespaces diff --git a/grafana/dashboards/APM.json b/grafana/dashboards/APM.json new file mode 100644 index 0000000..4c285ca --- /dev/null +++ b/grafana/dashboards/APM.json @@ -0,0 +1,2629 @@ +{ + "__inputs": [ + { + "name": "DS_GRAFANACLOUD-CLECLERC-PROM", + "label": "grafanacloud-cleclerc-prom", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + }, 
+ { + "name": "DS_GRAFANACLOUD-CLECLERC-LOGS", + "label": "grafanacloud-cleclerc-logs", + "description": "", + "type": "datasource", + "pluginId": "loki", + "pluginName": "Loki" + }, + { + "name": "DS_GRAFANACLOUD-CLECLERC-TRACES", + "label": "grafanacloud-cleclerc-traces", + "description": "", + "type": "datasource", + "pluginId": "tempo", + "pluginName": "Tempo" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "12.1.0-91094" + }, + { + "type": "panel", + "id": "logs", + "name": "Logs", + "version": "" + }, + { + "type": "datasource", + "id": "loki", + "name": "Loki", + "version": "12.1.0-91094" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "datasource", + "id": "tempo", + "name": "Tempo", + "version": "12.1.0-91094" + }, + { + "type": "panel", + "id": "text", + "name": "Text", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "Lightweight APM dashboard for monitoring OpenTelemetry-based services. \n\nInstrument your applications using OpenTelemetry SDKs and send traces, metrics, and logs to Tempo for traces, a Prometheus-compatible database like Mimir for metrics, and Loki for logs. This dashboard provides a centralized view of your application's health and performance. 
\n\nFor a fully managed observability stack, consider using Grafana Cloud. \n\nLearn more about this dashboard on https://github.com/cyrille-leclerc/opentelemetry-service-dashboard.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "gridPos": { + "h": 7, + "w": 20, + "x": 0, + "y": 0 + }, + "id": 42, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "

Dashboard setup instructions (delete this panel after setup)

\n\nThis dashboard requires:\n\n## Metrics\n\nSend OpenTelemetry metrics to the OTLP endpoint of a Prometheus database.\n\n### Prometheus\n\nSend OpenTelemetry metrics to the Prometheus OTLP Endpoint and configure the parameters `keep_identifying_resource_attributes` and `promote_resource_attributes` on the OTLP endpoint. \n\nExample Prometheus OTLP Endpoint configuration snippet:\n\n```yml\notlp:\n keep_identifying_resource_attributes: true\n promote_resource_attributes:\n # REQUIRED FOR THIS DASHBOARD\n - service.instance.id\n - service.name\n - service.namespace\n - deployment.environment.name\n # RECOMMENDED FOR OTEL METRICS IN GENERAL\n - service.version\n - cloud.availability_zone\n - cloud.region\n - container.name\n - deployment.environment\n - k8s.cluster.name\n - k8s.container.name\n - k8s.cronjob.name\n - k8s.daemonset.name\n - k8s.deployment.name\n - k8s.job.name\n - k8s.namespace.name\n - k8s.pod.name\n - k8s.replicaset.name\n - k8s.statefulset.name\n```\n\nLearn more in Prometheus [configuration reference](https://prometheus.io/docs/prometheus/latest/configuration/configuration/) and [OpenTelemetry guide](https://prometheus.io/docs/guides/opentelemetry/).\n\n### Mimir OTLP Endpoint configuration\n\nSend OpenTelemetry metrics to the Mimir OTLP Endpoint and configure the parameters `otel_keep_identifying_resource_attributes` and `promote_otel_resource_attributes` on the OTLP endpoint. 
\n\nExample Mimir OTLP Endpoint configuration snippet:\n\n```yml\n# (experimental) Whether to keep identifying OTel resource attributes in the\n# target_info metric on top of converting to job and instance labels.\n# CLI flag: -distributor.otel-keep-identifying-resource-attributes\notel_keep_identifying_resource_attributes: true\n# (experimental) Optionally specify OTel resource attributes to promote to\n# labels.\n# CLI flag: -distributor.otel-promote-resource-attributes\npromote_otel_resource_attributes: \"service.instance.id, service.name, service.namespace, service.version, cloud.availability_zone, cloud.region, container.name, deployment.environment, deployment.environment.name, k8s.cluster.name, k8s.container.name, k8s.cronjob.name, k8s.daemonset.name, k8s.deployment.name, k8s.job.name, k8s.namespace.name, k8s.pod.name, k8s.replicaset.name, k8s.statefulset.name\"\n```\n\nLearn more in Mimir [configuration parameters](https://github.com/grafana/mimir/blob/main/docs/sources/mimir/configure/configuration-parameters/index.md).\n\n### Grafana Cloud Metrics\n\nSend OpenTelemetry metrics to the Grafana Cloud OTLP Endpoint as documented in [Grafana Cloud / Send OTLP data](https://grafana.com/docs/grafana-cloud/send-data/otlp/send-data-otlp/) and open a support ticket to activate `otel_keep_identifying_resource_attributes`.\n\nNote that the Grafana Cloud OTLP Endpoint is configured by default to promote the following resource attributes, this list can be modified through a support ticket:\n\n```\n# REQUIRED FOR THIS DASHBOARD\n- service.instance.id\n- service.name\n- service.namespace\n- deployment.environment.name\n# RECOMMENDED FOR OTEL METRICS IN GENERAL\n- service.version\n- cloud.availability_zone\n- cloud.region\n- container.name\n- deployment.environment\n- k8s.cluster.name\n- k8s.container.name\n- k8s.cronjob.name\n- k8s.daemonset.name\n- k8s.deployment.name\n- k8s.job.name\n- k8s.namespace.name\n- k8s.pod.name\n- k8s.replicaset.name\n- 
k8s.statefulset.name\n```\n\n## Logs\n\n### Grafana Cloud Logs\n\nSend OpenTelemetry logs to the Grafana Cloud OTLP Endpoint as documented in [Grafana Cloud / Send OTLP data](https://grafana.com/docs/grafana-cloud/send-data/otlp/send-data-otlp/) and open a support ticket to activate `otel_keep_identifying_resource_attributes`.\n\n### Loki\n\nSend OpenTelemetry logs to the Loki OTLP Endpoint as documented in [Loki / Send data / OpenTelemetry](https://grafana.com/docs/loki/latest/send-data/otel/).\n\n## Traces\n\n### Grafana Cloud Traces\n\nSend OpenTelemetry traces to the Grafana Cloud OTLP Endpoint as documented in [Grafana Cloud / Send OTLP data](https://grafana.com/docs/grafana-cloud/send-data/otlp/send-data-otlp/).\n\n### Tempo\n\nSend OpenTelemetry traces to the Tempo OTLP Endpoint which supports both OTLP protocols: HTTP/Protobuf and gRPC.\n\n## Grafana\n\nTo prevent PromQL `rate` function issues with OpenTelemetry metrics in Grafana, either set your Prometheus Datasource's **\"scrape interval\"** to `60s` or, if that's not possible, configure each affected dashboard panel's **\"Min Step\"** option to `60s`.", + "mode": "markdown" + }, + "pluginVersion": "12.1.0-91094", + "type": "text" + }, + { + "description": "service.namespace=${service_namespace}, service.name=${service_name}, deployment.environment.name=${deployment_environment_name}", + "gridPos": { + "h": 2, + "w": 10, + "x": 0, + "y": 7 + }, + "id": 20, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "

\"OpenTelemetry Service ${service_namespace}/${service_name} (env: ${deployment_environment_name})

\n", + "mode": "html" + }, + "pluginVersion": "12.1.0-91094", + "type": "text" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "description": "Shows the timestamp of the latest metrics received in the past 24h.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "#24292e", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 10, + "y": 7 + }, + "hideTimeOverride": true, + "id": 39, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^Time$/", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0-91094", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "editorMode": "code", + "expr": "timestamp(sum by (deployment_environment_name, service_namespace, service_name) (target_info{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}) or absent{})\n", + "interval": "60s", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "timeFrom": "now-24h", + "title": "Latest metrics received", + "type": "stat" + }, + { + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "description": "Shows the timestamp of the latest logs received in the past 24h.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "#24292e", + "mode": "fixed" + }, + "mappings": [], + "noValue": "No data", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": 
"red", + "value": 80 + } + ] + }, + "unit": "dateTimeFromNow" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 13, + "y": 7 + }, + "hideTimeOverride": true, + "id": 40, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^Time$/", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0-91094", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${DS_GRAFANACLOUD-CLECLERC-LOGS}" + }, + "direction": "backward", + "editorMode": "code", + "expr": "sum(count_over_time({service_name=\"$service_name\", deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\"} [5m]))", + "queryType": "range", + "refId": "A" + } + ], + "timeFrom": "now-24h", + "title": "Latest logs received", + "type": "stat" + }, + { + "datasource": { + "type": "tempo", + "uid": "${tempo_datasource}" + }, + "description": "Shows the timestamp of the latest span received in the past 24h.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "#24292e", + "mode": "fixed" + }, + "mappings": [], + "noValue": "No data", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "dateTimeFromNow" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 16, + "y": 7 + }, + "hideTimeOverride": true, + "id": 41, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^time$/", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0-91094", 
+ "targets": [ + { + "datasource": { + "type": "tempo", + "uid": "${DS_GRAFANACLOUD-CLECLERC-TRACES}" + }, + "filters": [ + { + "id": "0344fb49", + "operator": "=", + "scope": "resource", + "tag": "service.namespace", + "value": [ + "$service_namespace" + ], + "valueType": "string" + }, + { + "id": "service-name", + "operator": "=", + "scope": "resource", + "tag": "service.name", + "value": [ + "$service_name" + ], + "valueType": "string" + } + ], + "limit": 20, + "metricsQueryType": "range", + "query": "{resource.service.namespace=\"$service_namespace\" && resource.service.name=\"$service_name\"} | count_over_time()", + "queryType": "traceql", + "refId": "A", + "tableType": "traces" + } + ], + "timeFrom": "now-24h", + "title": "Latest traces received", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 15, + "panels": [], + "title": "Server HTTP RED Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "description": "HTTP endpoints aggregation on the `http.server.request.duration` metric.\n\nSee https://opentelemetry.io/docs/specs/semconv/http/http-metrics/#metric-httpserverrequestduration", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + 
"min": 0, + "noValue": "No HTTP Operation", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 10 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0-91094", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.99, sum by(le, deployment_environment_name, service_namespace, service_name) (rate(http_server_request_duration_seconds_bucket{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "interval": "60s", + "legendFormat": "P99", + "range": true, + "refId": "P99", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le, deployment_environment_name, service_namespace, service_name) (rate(http_server_request_duration_seconds_bucket{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "interval": "60s", + "legendFormat": "P90", + "range": true, + "refId": "P90", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "avg 
by(deployment_environment_name, service_namespace, service_name) (rate(http_server_request_duration_seconds_sum{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval])) / avg by(deployment_environment_name, service_namespace, service_name) (rate(http_server_request_duration_seconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval]))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "interval": "60s", + "legendFormat": "AVG", + "range": true, + "refId": "AVG", + "useBackend": false + } + ], + "title": "Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "description": "HTTP endpoints aggregation on the `http.server.request.duration` metric. \n\nErrors are identified by `http.response.status_code=~\"5..\"`.\n\nSee https://opentelemetry.io/docs/specs/semconv/http/http-metrics/#metric-httpserverrequestduration", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 100, + "axisSoftMin": 0, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "No HTTP Operation", + "thresholds": { + "mode": "absolute", + "steps": [ + { + 
"color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 10 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0-91094", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "(\n sum by(deployment_environment_name, service_namespace, service_name) (\n rate(\n http_server_request_duration_seconds_count{\n deployment_environment_name=~\"$deployment_environment_name\",\n service_namespace=~\"$service_namespace\",\n service_name=\"$service_name\",\n http_response_status_code=~\"5..\"\n }[$__rate_interval]\n )\n ) * 100\n)\n/\n(\n sum by(deployment_environment_name, service_namespace, service_name) (\n rate(\n http_server_request_duration_seconds_count{\n deployment_environment_name=~\"$deployment_environment_name\",\n service_namespace=~\"$service_namespace\",\n service_name=\"$service_name\"\n }[$__rate_interval]\n )\n )\n)\nor\n(\n 0\n *\n sum by(deployment_environment_name, service_namespace, service_name) (\n rate(\n http_server_request_duration_seconds_count{\n deployment_environment_name=~\"$deployment_environment_name\",\n service_namespace=~\"$service_namespace\",\n service_name=\"$service_name\"\n }[$__rate_interval]\n )\n )\n)", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "interval": "60s", + "legendFormat": "5xx", + "range": true, + "refId": "5xx", + "useBackend": false + } + ], + "title": "Error", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "description": "HTTP endpoints aggregation on the 
`http.server.request.duration` metric.\n\nSee https://opentelemetry.io/docs/specs/semconv/http/http-metrics/#metric-httpserverrequestduration", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "noValue": "No HTTP Operation", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 12, + "y": 10 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0-91094", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "editorMode": "code", + "expr": "(sum(rate(http_server_request_duration_seconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval])) by (deployment_environment_name, service_namespace, service_name)) ", + "hide": false, + "interval": "60s", + "legendFormat": "Requests", + "range": true, + "refId": "RequestRate" + } + ], + "title": 
"Request Rate", + "type": "timeseries" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 31, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "description": "gRPC Endpoints aggregation, `rpc.server.duration` metric.\n\nSee https://opentelemetry.io/docs/specs/semconv/rpc/rpc-metrics/#metric-rpcserverduration", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "noValue": "No RPC operation", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 17 + }, + "id": 33, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0-91094", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le, deployment_environment_name, service_namespace, service_name) 
(rate(rpc_server_duration_milliseconds_bucket{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "interval": "60s", + "legendFormat": "P99", + "range": true, + "refId": "P99", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le, deployment_environment_name, service_namespace, service_name) (rate(rpc_server_duration_milliseconds_bucket{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "interval": "60s", + "legendFormat": "P90", + "range": true, + "refId": "P90", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "avg by(deployment_environment_name, service_namespace, service_name) (rate(rpc_server_duration_milliseconds_sum{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval])) / avg by(deployment_environment_name, service_namespace, service_name) (rate(rpc_server_duration_milliseconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval]))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "interval": "60s", + "legendFormat": "AVG", + "range": true, + "refId": "AVG", + "useBackend": false + } + ], + "title": "Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "${prometheus_datasource}" + }, + "description": "RPC endpoints aggregation based on the `rpc.server.duration` metric.\n\nErrors are identified by `rpc.grpc.status_code != 0` which makes the panel specific to the gRPC protocol.\n\nSee https://opentelemetry.io/docs/specs/semconv/rpc/rpc-metrics/#metric-rpcserverduration", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "No RPC operation", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 17 + }, + "id": 34, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0-91094", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "(\n sum without (rpc_grpc_status_code, instance) (\n rate(\n rpc_server_duration_milliseconds_count{\n deployment_environment_name=~\"$deployment_environment_name\",\n service_namespace=~\"$service_namespace\",\n 
service_name=\"$service_name\",\n rpc_grpc_status_code!=\"0\"\n }[$__rate_interval]\n )\n ) * 100\n)\n/\n(\n sum without (rpc_grpc_status_code, instance) (\n rate(\n rpc_server_duration_milliseconds_count{\n deployment_environment_name=~\"$deployment_environment_name\",\n service_namespace=~\"$service_namespace\",\n service_name=\"$service_name\"\n }[$__rate_interval]\n )\n )\n)\nor \n(\n 0\n *\n sum without (rpc_grpc_status_code, instance) (\n rate(\n rpc_server_duration_milliseconds_count{\n deployment_environment_name=~\"$deployment_environment_name\",\n service_namespace=~\"$service_namespace\",\n service_name=\"$service_name\"\n }[$__rate_interval]\n )\n )\n)", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "interval": "60s", + "legendFormat": "Error", + "range": true, + "refId": "ERR", + "useBackend": false + } + ], + "title": "gRPC Error", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "description": "gRPC endpoints aggregation on the `rpc.server.duration` metric.\n\nSee https://opentelemetry.io/docs/specs/semconv/rpc/rpc-metrics/#metric-rpcserverduration", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "noValue": "No RPC operation", + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 12, + "y": 17 + }, + "id": 35, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0-91094", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "editorMode": "code", + "expr": "(sum(rate(rpc_server_duration_milliseconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval])) by (deployment_environment_name, service_namespace, service_name)) * $__interval_ms / 1000", + "hide": false, + "interval": "60s", + "legendFormat": "Requests", + "range": true, + "refId": "RequestRate" + } + ], + "title": "Request Rate", + "type": "timeseries" + } + ], + "title": "Server RPC RED Metrics", + "type": "row" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 22, + "panels": [], + "title": "Inbound Operations", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "description": "Inbound HTTP operations of the service (aka HTTP endpoints) based on the `http.server.request.duration` metric.\n\nErrors are identified by `http.response.status_code=~\"5..\"`.\n\nSee https://opentelemetry.io/docs/specs/semconv/http/http-metrics/#metric-httpserverrequestduration", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "noValue": "No HTTP operation", + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Duration (p99)" + }, + "properties": [ + { + "id": "unit", + "value": "s" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Rate" + }, + "properties": [ + { + "id": "unit", + "value": "reqps" + }, + { + "id": "custom.width", + "value": 219 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Error" + }, + "properties": [ + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 21, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Operation" + } + ] + }, + "pluginVersion": "12.1.0-91094", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "\n sum by (operation) (\n label_join(\n rate(http_server_request_duration_seconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval]),\n \"operation\",\n \" \",\n \"http_request_method\",\n \"http_route\"\n )\n )\n ", + "fullMetaSearch": false, + "includeNullMetadata": true, + "interval": "60s", + "legendFormat": "{{operation}}", + "range": true, + "refId": "RPS", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "editorMode": "code", + "expr": "(\n sum by (operation) (\n label_join(\n rate(http_server_request_duration_seconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\", 
http_response_status_code=~\"5..\"}[$__rate_interval]),\n \"operation\",\n \" \",\n \"http_request_method\",\n \"http_route\"\n )\n )\n / \n sum by (operation) (\n label_join(\n rate(http_server_request_duration_seconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval]),\n \"operation\",\n \" \",\n \"http_request_method\",\n \"http_route\"\n )\n )\n ) or (0 * \n sum by (operation) (\n label_join(\n rate(http_server_request_duration_seconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval]),\n \"operation\",\n \" \",\n \"http_request_method\",\n \"http_route\"\n )\n )\n )", + "hide": false, + "instant": false, + "interval": "60s", + "legendFormat": "{{operation}}", + "range": true, + "refId": "ERR_PCT" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "editorMode": "code", + "expr": "\n histogram_quantile(\n 0.99,\n sum by (le, operation) (\n label_join(\n rate(http_server_request_duration_seconds_bucket{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[5m]),\n \"operation\",\n \" \",\n \"http_request_method\",\n \"http_route\"\n )\n )\n )\n ", + "hide": false, + "instant": false, + "interval": "60s", + "legendFormat": "{{operation}}", + "range": true, + "refId": "Duration" + } + ], + "title": "HTTP Operations", + "transformations": [ + { + "id": "timeSeriesTable", + "options": { + "Duration": { + "timeField": "Time" + }, + "ERR_PCT": { + "timeField": "Time" + }, + "RPS": { + "timeField": "Time" + } + } + }, + { + "id": "joinByField", + "options": { + "byField": "operation", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "includeByName": {}, + "indexByName": { + 
"Trend #Duration": 1, + "Trend #ERR_PCT": 2, + "Trend #RPS": 3, + "operation": 0 + }, + "renameByName": { + "Trend #Duration": "Duration (p99)", + "Trend #ERR_PCT": "Error", + "Trend #RPS": "Rate", + "operation": "Operation" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "description": "Inbound RPC operations of the service (aka RPC endpoints) based on the `rpc.server.request.duration` metric.\n\nErrors are identified by `rpc.grpc.status_code != 0` which make the panel specific to the gRPC protocol.\n\nhttps://opentelemetry.io/docs/specs/semconv/rpc/rpc-metrics/#metric-rpcserverduration", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "noValue": "No RPC operation", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Duration (p99)" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Rate" + }, + "properties": [ + { + "id": "unit", + "value": "reqps" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Error" + }, + "properties": [ + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 27, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Operation" + } + ] + }, + "pluginVersion": "12.1.0-91094", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "disableTextWrap": false, + "editorMode": "code", + 
"expr": "\nsum by (operation) (\n label_join(\n rate(rpc_server_duration_milliseconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval]),\n \"operation\",\n \"/\",\n \"rpc_service\",\n \"rpc_method\"\n )\n)\n ", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "interval": "60s", + "legendFormat": "__auto", + "range": true, + "refId": "RPS", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "editorMode": "code", + "expr": "(\n sum by (operation) (\n label_join(\n rate(rpc_server_duration_milliseconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\", rpc_grpc_status_code!=\"0\"}[$__rate_interval]),\n \"operation\",\n \"/\",\n \"rpc_service\",\n \"rpc_method\"\n )\n )\n / \n sum by (operation) (\n label_join(\n rate(rpc_server_duration_milliseconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval]),\n \"operation\",\n \"/\",\n \"rpc_service\",\n \"rpc_method\"\n )\n )\n ) or (0 * \n sum by (operation) (\n label_join(\n rate(rpc_server_duration_milliseconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval]),\n \"operation\",\n \"/\",\n \"rpc_service\",\n \"rpc_method\"\n )\n )\n )\n ", + "hide": false, + "instant": false, + "interval": "60s", + "legendFormat": "__auto", + "range": true, + "refId": "ERR_PCT" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "editorMode": "code", + "expr": "\n histogram_quantile(\n 0.99,\n sum by (le, operation) (\n label_join(\n 
rate(rpc_server_duration_milliseconds_bucket{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[5m]),\n \"operation\",\n \"/\",\n \"rpc_service\",\n \"rpc_method\"\n )\n )\n )\n ", + "hide": false, + "instant": false, + "interval": "60s", + "legendFormat": "{{operation}}", + "range": true, + "refId": "Duration" + } + ], + "title": "RPC Operations", + "transformations": [ + { + "id": "timeSeriesTable", + "options": { + "Duration": { + "timeField": "Time" + }, + "ERR_PCT": { + "timeField": "Time" + }, + "RPS": { + "timeField": "Time" + } + } + }, + { + "id": "joinByField", + "options": { + "byField": "operation", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "includeByName": {}, + "indexByName": { + "Trend #Duration": 1, + "Trend #ERR_PCT": 2, + "Trend #RPS": 3, + "operation": 0 + }, + "renameByName": { + "Trend #Duration": "Duration (p99)", + "Trend #ERR_PCT": "Error", + "Trend #RPS": "Rate", + "operation": "Operation" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 28, + "panels": [], + "title": "Outbound Services and Databases", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "description": "HTTP calls made by the service based on the `http.client.request.duration` metric.\n\nCalls broken down by remote `server.address` and by `http.request.method`.\n\nSee https://opentelemetry.io/docs/specs/semconv/http/http-metrics/#metric-httpclientrequestduration", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "noValue": "No HTTP call", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", +
"value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Duration (P99)" + }, + "properties": [ + { + "id": "unit", + "value": "s" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Rate" + }, + "properties": [ + { + "id": "unit", + "value": "reqps" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Error" + }, + "properties": [ + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 25 + }, + "id": 23, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "12.1.0-91094", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by (outbound_service) (\n label_join(\n rate(\n http_client_request_duration_seconds_count{\n deployment_environment_name=~\"$deployment_environment_name\",\n service_namespace=~\"$service_namespace\",\n service_name=\"$service_name\"\n }[$__rate_interval]\n ),\n \"outbound_service\",\n \" \",\n \"server_address\",\n \"http_request_method\",\n \"url_template\"\n )\n)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "interval": "60s", + "legendFormat": "{{server_address}} {{http_request_method}} {{url_template}}", + "range": true, + "refId": "RPS", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "editorMode": "code", + "expr": "(\n sum by (outbound_service) (\n label_join(\n rate(\n http_client_request_duration_seconds_count{\n deployment_environment_name=~\"$deployment_environment_name\",\n service_namespace=~\"$service_namespace\",\n service_name=\"$service_name\",\n http_response_status_code=~\"5..\"\n }[$__rate_interval]\n ),\n \"outbound_service\",\n \" \",\n \"server_address\",\n 
\"http_request_method\",\n \"url_template\"\n )\n )\n /\n sum by (outbound_service) (\n label_join(\n rate(\n http_client_request_duration_seconds_count{\n deployment_environment_name=~\"$deployment_environment_name\",\n service_namespace=~\"$service_namespace\",\n service_name=\"$service_name\"\n }[$__rate_interval]\n ),\n \"outbound_service\",\n \" \",\n \"server_address\",\n \"http_request_method\",\n \"url_template\"\n )\n )\n)\nor\n(\n 0\n *\n sum by (outbound_service) (\n label_join(\n rate(\n http_client_request_duration_seconds_count{\n deployment_environment_name=~\"$deployment_environment_name\",\n service_namespace=~\"$service_namespace\",\n service_name=\"$service_name\"\n }[$__rate_interval]\n ),\n \"outbound_service\",\n \" \",\n \"server_address\",\n \"http_request_method\",\n \"url_template\"\n )\n )\n)", + "hide": false, + "instant": false, + "interval": "60s", + "legendFormat": "{{server_address}} {{http_request_method}} {{url_template}}", + "range": true, + "refId": "ERR_PCT" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "editorMode": "code", + "expr": "histogram_quantile(\n 0.99,\n sum by (le, outbound_service) (\n label_join(\n rate(\n http_client_request_duration_seconds_bucket{\n deployment_environment_name=~\"$deployment_environment_name\",\n service_namespace=~\"$service_namespace\",\n service_name=\"$service_name\"\n }[5m]\n ),\n \"outbound_service\",\n \" \",\n \"server_address\",\n \"http_request_method\",\n \"url_template\"\n )\n )\n)", + "hide": false, + "instant": false, + "interval": "60s", + "legendFormat": "{{server_address}} {{http_request_method}} {{url_template}}", + "range": true, + "refId": "DURATION" + } + ], + "title": "Outbound HTTP Services", + "transformations": [ + { + "id": "timeSeriesTable", + "options": { + "Duration": { + "timeField": "Time" + }, + "ERR_PCT": { + "timeField": "Time" + }, + "RPS": { + "timeField": "Time" + } + } + }, + { + "id": "joinByField", + 
"options": { + "byField": "outbound_service", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "includeByName": {}, + "indexByName": { + "Trend #DURATION": 1, + "Trend #ERR_PCT": 2, + "Trend #RPS": 3, + "outbound_service": 0 + }, + "renameByName": { + "Trend #DURATION": "Duration (P99)", + "Trend #Duration": "Duration (p99)", + "Trend #ERR_PCT": "Error", + "Trend #RPS": "Rate", + "operation": "Operation", + "outbound_service": "Service" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "description": "DB calls made by the service based on the `db.client.operation.duration` metric.\n\nCalls broken down by `server.address` and `db.namespace`.\n\nSee https://opentelemetry.io/docs/specs/semconv/database/database-metrics/#metric-dbclientoperationduration\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "noValue": "No database call", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Duration (P99)" + }, + "properties": [ + { + "id": "unit", + "value": "s" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Rate" + }, + "properties": [ + { + "id": "unit", + "value": "reqps" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Error" + }, + "properties": [ + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 24, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "12.1.0-91094", + "targets": [ + { 
+ "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by (database) (\n label_join(\n rate(\n db_client_operation_duration_seconds_count{\n deployment_environment_name=~\"$deployment_environment_name\",\n service_namespace=~\"$service_namespace\",\n service_name=\"$service_name\"\n }[$__rate_interval]\n ),\n \"database\",\n \"/\",\n \"server_address\",\n \"db_namespace\"\n )\n)", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "interval": "60s", + "legendFormat": "{{server_address}} {{db_namespace}}", + "range": true, + "refId": "RPS", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "editorMode": "code", + "expr": "(\n sum by (database) (\n label_join(\n rate(db_client_operation_duration_seconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\", http_response_status_code=~\"5..\"}[$__rate_interval]),\n \"database\",\n \"/\",\n \"server_address\",\n \"db_namespace\"\n )\n )\n / \n sum by (database) (\n label_join(\n rate(db_client_operation_duration_seconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval]),\n \"database\",\n \"/\",\n \"server_address\",\n \"db_namespace\"\n )\n )\n ) or (0 * \n sum by (database) (\n label_join(\n rate(db_client_operation_duration_seconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval]),\n \"database\",\n \"/\",\n \"server_address\",\n \"db_namespace\"\n )\n )\n )", + "hide": false, + "instant": false, + "interval": "60s", + "legendFormat": "{{server_address}} {{db_namespace}}", + "range": true, + "refId": "ERR_PCT" + }, + 
{ + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "editorMode": "code", + "expr": "histogram_quantile(\n 0.99,\n sum by (le, database) (\n label_join(\n rate(\n db_client_operation_duration_seconds_bucket{\n deployment_environment_name=~\"$deployment_environment_name\",\n service_namespace=~\"$service_namespace\",\n service_name=\"$service_name\"\n }[5m]\n ),\n \"database\",\n \"/\",\n \"server_address\",\n \"db_namespace\"\n )\n )\n)", + "hide": false, + "instant": false, + "interval": "60s", + "legendFormat": "{{server_address}} {{db_namespace}}", + "range": true, + "refId": "DURATION" + } + ], + "title": "Outbound Databases", + "transformations": [ + { + "id": "timeSeriesTable", + "options": { + "DURATION": { + "timeField": "Time" + }, + "Duration": { + "timeField": "Time" + }, + "ERR_PCT": { + "timeField": "Time" + }, + "RPS": { + "timeField": "Time" + } + } + }, + { + "id": "joinByField", + "options": { + "byField": "database", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "includeByName": {}, + "indexByName": { + "Trend #DURATION": 1, + "Trend #ERR_PCT": 2, + "Trend #RPS": 3, + "database": 0 + }, + "renameByName": { + "Trend #DURATION": "Duration (P99)", + "Trend #Duration": "Duration (p99)", + "Trend #ERR_PCT": "Error", + "Trend #RPS": "Rate", + "database": "Database", + "database_operation": "Database Operation", + "operation": "Operation", + "outbound_service": "Service" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "description": "RPC calls made by the service based on the `rpc.client.duration` metric.\n\nSpecific to gRPC due to the usage of the `rpc.grpc.status_code` attribute to identify errors.\n\nCalls broken down by `server.address`, `rpc.service`, and `rpc.method`.\n\nSee https://opentelemetry.io/docs/specs/semconv/rpc/rpc-metrics/#rpc-client\n\n", + "fieldConfig": { +
"defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "noValue": "No RPC call", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Duration (P99)" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Rate" + }, + "properties": [ + { + "id": "unit", + "value": "reqps" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Error" + }, + "properties": [ + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 32, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "12.1.0-91094", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "\n sum by (outbound_service) (\n label_join(\n rate(rpc_client_duration_milliseconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval]),\n \"outbound_service\",\n \"/\",\n \"server_address\",\n \"rpc_service\",\n \"rpc_method\"\n )\n )\n ", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "interval": "60s", + "legendFormat": "__auto", + "range": true, + "refId": "RPS", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "editorMode": "code", + "expr": "(\n sum by (outbound_service) (\n label_join(\n 
rate(rpc_client_duration_milliseconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\", http_response_status_code=~\"5..\"}[$__rate_interval]),\n \"outbound_service\",\n \"/\",\n \"server_address\",\n \"rpc_service\",\n \"rpc_method\"\n )\n )\n / \n sum by (outbound_service) (\n label_join(\n rate(rpc_client_duration_milliseconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval]),\n \"outbound_service\",\n \"/\",\n \"server_address\",\n \"rpc_service\",\n \"rpc_method\"\n )\n )\n ) or (0 * \n sum by (outbound_service) (\n label_join(\n rate(rpc_client_duration_milliseconds_count{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval]),\n \"outbound_service\",\n \"/\",\n \"server_address\",\n \"rpc_service\",\n \"rpc_method\"\n )\n )\n )", + "hide": false, + "instant": false, + "interval": "60s", + "legendFormat": "__auto", + "range": true, + "refId": "ERR_PCT" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "editorMode": "code", + "expr": "\nhistogram_quantile(\n 0.99,\n sum by (le, outbound_service) (\n label_join(\n rate(rpc_client_duration_milliseconds_bucket{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[5m]),\n \"outbound_service\",\n \"/\",\n \"server_address\",\n \"rpc_service\",\n \"rpc_method\"\n )\n )\n)", + "hide": false, + "instant": false, + "interval": "60s", + "legendFormat": "__auto", + "range": true, + "refId": "DURATION" + } + ], + "title": "Outbound RPC Services", + "transformations": [ + { + "id": "timeSeriesTable", + "options": { + "Duration": { + "timeField": "Time" + }, + "ERR_PCT": { + "timeField": 
"Time" + }, + "RPS": { + "timeField": "Time" + } + } + }, + { + "id": "joinByField", + "options": { + "byField": "outbound_service", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "includeByName": {}, + "indexByName": { + "Trend #DURATION": 1, + "Trend #ERR_PCT": 2, + "Trend #RPS": 3, + "outbound_service": 0 + }, + "renameByName": { + "Trend #DURATION": "Duration (P99)", + "Trend #Duration": "Duration (p99)", + "Trend #ERR_PCT": "Error", + "Trend #RPS": "Rate", + "operation": "Operation", + "outbound_service": "Service Method" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 25, + "panels": [], + "title": "Logs", + "type": "row" + }, + { + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "description": "Logs of the service, filtered by `service.name` and `service.namespace`.\n\nTo explore the logs, open the menu clicking on the icon `⋮` of this panel and click on `Explore`.", + "gridPos": { + "h": 15, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 26, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": true, + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Ascending", + "wrapLogMessage": false + }, + "pluginVersion": "12.1.0-91094", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${DS_GRAFANACLOUD-CLECLERC-LOGS}" + }, + "direction": "backward", + "editorMode": "code", + "expr": "{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"} | line_format `\u001b[1m{{if .level }}{{alignRight 5 .level}}{{else if .severity_text}}{{alignRight 5 .severity_text}}{{end}}\u001b[0m \u001b[90m[{{alignRight 10 .service_instance_id}}{{if .thread_name}}/{{alignRight 20 .thread_name}}{{else if eq \"java\" 
.telemetry_sdk_language }} {{end}}]\u001b[0m \u001b[36m{{if .scope_name }}{{alignRight 40 .scope_name}}{{end}}{{if .exception_type}} \u001b[1;101m \u001b[0m{{end}} {{if .exception_type}}\u001b[1;91m{{.exception_type}}\u001b[0m{{end}}{{if .exception_message}} \u001b[1;91m{{.exception_message}}\u001b[0m {{end}} \u001b[0m {{__line__}} {{if .trace_id}} \u001b[37m\u001b[3m[trace_id={{.trace_id}}]{{end}}`", + "queryType": "range", + "refId": "A" + } + ], + "type": "logs" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 51 + }, + "id": 29, + "panels": [], + "title": "Traces", + "type": "row" + }, + { + "datasource": { + "type": "tempo", + "uid": "${tempo_datasource}" + }, + "description": "Traces containing a span emitted by the service", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "noValue": "No traces", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Trace Service" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Span ID" + }, + "properties": [ + { + "id": "custom.hidden", + "value": false + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "deployment.environment.name" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "service.name" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "service.namespace" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 15, + "x": 0, + "y": 52 + }, + "id": 30, + 
"options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 0, + "showHeader": true + }, + "pluginVersion": "12.1.0-91094", + "targets": [ + { + "datasource": { + "type": "tempo", + "uid": "${DS_GRAFANACLOUD-CLECLERC-TRACES}" + }, + "filters": [ + { + "id": "service-name", + "operator": "=", + "scope": "resource", + "tag": "service.name", + "value": [ + "$service_name" + ], + "valueType": "string" + }, + { + "id": "6997e808", + "operator": "=", + "scope": "resource", + "tag": "deployment.environment.name", + "value": [ + "$deployment_environment_name" + ], + "valueType": "string" + }, + { + "id": "e9f0e855", + "operator": "=", + "scope": "resource", + "tag": "service.namespace", + "value": [ + "$service_namespace" + ], + "valueType": "string" + } + ], + "limit": 20, + "metricsQueryType": "range", + "query": "{resource.service.name=\"$service_name\" && resource.service.namespace=\"$service_namespace\"} | select(status)", + "queryType": "traceql", + "refId": "A", + "tableType": "spans" + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 60 + }, + "id": 37, + "panels": [], + "title": "Runtime", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "description": "For JVM based services like Java or Kotlin, the JVM metrics.\n\nSee https://opentelemetry.io/docs/specs/semconv/runtime/jvm-metrics/", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "noValue": "No JVM metrics", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Trend #CPU_PCT" + }, + "properties": [ 
+ { + "id": "unit", + "value": "percentunit" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Trend #GC_PCT" + }, + "properties": [ + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 61 + }, + "id": 38, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "12.1.0-91094", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "editorMode": "code", + "expr": "sum by(service_instance_id) (jvm_cpu_recent_utilization_ratio{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"})", + "hide": false, + "interval": "60s", + "legendFormat": "__auto", + "range": true, + "refId": "CPU_PCT" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "editorMode": "code", + "expr": "sum by(service_instance_id) (rate(jvm_gc_duration_seconds_sum{deployment_environment_name=~\"$deployment_environment_name\", service_namespace=~\"$service_namespace\", service_name=\"$service_name\"}[$__rate_interval]))", + "hide": false, + "instant": false, + "interval": "60s", + "legendFormat": "__auto", + "range": true, + "refId": "GC_PCT" + } + ], + "title": "JVM", + "transformations": [ + { + "id": "timeSeriesTable", + "options": { + "A": { + "timeField": "Time" + }, + "CPU_PCT": { + "timeField": "Time" + }, + "GC_PCT": { + "timeField": "Time" + } + } + }, + { + "id": "joinByField", + "options": { + "byField": "service_instance_id", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "includeByName": {}, + "indexByName": {}, + "renameByName": { + "Trend #CPU_PCT": "CPU", + "Trend #GC_PCT": "Garbage Collector Time", + "instance": "Instance", + "service_instance_id": "Instance" + 
} + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 41, + "tags": [], + "templating": { + "list": [ + { + "allowCustomValue": false, + "current": {}, + "description": "OpenTelemetry metrics. \nSend metrics using the Prometheus OTLP endpoint activating `keep_identifying_resource_attributes` and resource attribute promotion (aka `promote_resource_attributes`) including `service.name`, `service.namespace`, `service.instance.id`, and `deployment.environment.name`", + "label": "Metrics", + "name": "prometheus_datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "(?!grafanacloud-usage|grafanacloud-ml-metrics).+", + "type": "datasource" + }, + { + "allowCustomValue": false, + "current": {}, + "description": "OpenTelemetry traces", + "label": "Traces", + "name": "tempo_datasource", + "options": [], + "query": "tempo", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allowCustomValue": false, + "current": {}, + "description": "OpenTelemetry logs.\n\nSend logs using the Loki OTLP endpoint activating resource attribute promotion (aka `default_resource_attributes_as_index_labels`) including `service.name`, `service.namespace`, and `deployment.environment.name`", + "label": "Logs", + "name": "loki_datasource", + "options": [], + "query": "loki", + "refresh": 1, + "regex": "(?!grafanacloud-cleclerc-alert-state-history|grafanacloud-.*-usage-insights).+", + "type": "datasource" + }, + { + "allowCustomValue": false, + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "definition": "label_values(target_info,deployment_environment_name)", + "description": "Deployment environment (e.g.
\"production\").\nResource attribute `deployment.environment.name` via `target_info`", + "includeAll": true, + "label": "Environment", + "name": "deployment_environment_name", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(target_info,deployment_environment_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "allowCustomValue": false, + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${prometheus_datasource}" + }, + "definition": "label_values(target_info{deployment_environment_name=~\"$deployment_environment_name\"},service_namespace)", + "description": "Service namespace.\nResource attribute `service.namespace` via `target_info`", + "includeAll": true, + "label": "Namespace", + "name": "service_namespace", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(target_info{deployment_environment_name=~\"$deployment_environment_name\"},service_namespace)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_GRAFANACLOUD-CLECLERC-PROM}" + }, + "definition": "label_values(target_info{service_namespace=~\"$service_namespace\", deployment_environment_name=~\"$deployment_environment_name\"},service_name)", + "description": "Service name.\nResource attribute `service.name` via `target_info`.", + "label": "Name", + "name": "service_name", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(target_info{service_namespace=~\"$service_namespace\", deployment_environment_name=~\"$deployment_environment_name\"},service_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Lightweight 
APM for OpenTelemetry", + "uid": "febljk0a32qyoa", + "version": 41, + "weekStart": "", + "id": null, + "gnetId": 22784 +} \ No newline at end of file diff --git a/grafana/helm-values.yaml b/grafana/helm-values.yaml index cf545bf..92c6cea 100644 --- a/grafana/helm-values.yaml +++ b/grafana/helm-values.yaml @@ -39,13 +39,12 @@ podSecurityContext: fsGroup: 472 fsGroupChangePolicy: "Always" -# Resource settings (VPA lowerBound/upperBound) +# Resource settings (no CPU limit for stability) resources: requests: cpu: 22m memory: 144Mi limits: - cpu: 24m memory: 242Mi service: diff --git a/kube-state-metrics/helm-values.yaml b/kube-state-metrics/helm-values.yaml index edd15ce..f7f542d 100644 --- a/kube-state-metrics/helm-values.yaml +++ b/kube-state-metrics/helm-values.yaml @@ -7,13 +7,12 @@ fullnameOverride: kube-state-metrics # Note: kube-state-metrics는 stateless이지만, 여러 replica는 동일한 메트릭을 중복 생성하므로 # 단일 replica로 실행하는 것이 권장됩니다. -# Resource settings (VPA lowerBound/upperBound) +# Resource settings (no CPU limit for stability) resources: requests: cpu: 15m memory: 100Mi limits: - cpu: 15m memory: 100Mi service: diff --git a/loki/helm-values.yaml b/loki/helm-values.yaml index 11bd7ea..91d21ff 100644 --- a/loki/helm-values.yaml +++ b/loki/helm-values.yaml @@ -60,13 +60,12 @@ singleBinary: mountPath: /var/loki # Medium priority for observability priorityClassName: medium-priority - # Resource settings (VPA lowerBound/target) + # Resource settings (no CPU limit for stability) resources: requests: cpu: 10m memory: 225Mi limits: - cpu: 69m memory: 323Mi # Disable components not needed in single binary mode diff --git a/node-exporter/helm-values.yaml b/node-exporter/helm-values.yaml index 53e32c7..f6c1098 100644 --- a/node-exporter/helm-values.yaml +++ b/node-exporter/helm-values.yaml @@ -6,13 +6,12 @@ fullnameOverride: node-exporter hostNetwork: true hostPID: true -# Resource settings (VPA lowerBound/upperBound) +# Resource settings (no CPU limit for stability) resources: requests: 
cpu: 15m memory: 100Mi limits: - cpu: 15m memory: 100Mi service: diff --git a/opentelemetry/helm-values.yaml b/opentelemetry/helm-values.yaml index c73f5f6..81e818f 100644 --- a/opentelemetry/helm-values.yaml +++ b/opentelemetry/helm-values.yaml @@ -28,14 +28,13 @@ image: mode: daemonset # ============================================================================= -# Resource Limits (VPA lowerBound/upperBound, mem limit capped at 1024Mi) +# Resource Limits (no CPU limit for stability, mem limit capped at 1024Mi) # ============================================================================= resources: requests: cpu: 34m memory: 142Mi limits: - cpu: 410m memory: 1024Mi # ============================================================================= diff --git a/prometheus/helm-values.yaml b/prometheus/helm-values.yaml index b6b9c69..caea401 100644 --- a/prometheus/helm-values.yaml +++ b/prometheus/helm-values.yaml @@ -14,13 +14,12 @@ prometheusOperator: enabled: true # CRD 생성 비활성화 createCustomResource: false - # Resource settings (VPA lowerBound/upperBound) + # Resource settings (no CPU limit for stability) resources: requests: cpu: 15m memory: 100Mi limits: - cpu: 15m memory: 100Mi # Kubelet ServiceMonitor with cluster label diff --git a/tempo/helm-values.yaml b/tempo/helm-values.yaml index eedd287..8d0ae6e 100644 --- a/tempo/helm-values.yaml +++ b/tempo/helm-values.yaml @@ -17,13 +17,12 @@ replicas: 1 # Tempo Configuration # ============================================================================= tempo: - # Resource settings (VPA lowerBound/target) + # Resource settings (no CPU limit for stability) resources: requests: cpu: 15m memory: 100Mi limits: - cpu: 15m memory: 109Mi # Receivers - protocols Tempo accepts receivers: diff --git a/thanos/helm-values.yaml b/thanos/helm-values.yaml index 38b367d..3543ddf 100644 --- a/thanos/helm-values.yaml +++ b/thanos/helm-values.yaml @@ -46,13 +46,12 @@ query: - --query.replica-label=prometheus_replica - 
--query.auto-downsampling - # Resource settings (VPA lowerBound/target) + # Resource settings (no CPU limit for stability) resources: requests: cpu: 15m memory: 100Mi limits: - cpu: 15m memory: 126Mi # ============================================================================= diff --git a/vpa/helm-values.yaml b/vpa/helm-values.yaml index 58baafe..f8d1ec9 100644 --- a/vpa/helm-values.yaml +++ b/vpa/helm-values.yaml @@ -6,13 +6,12 @@ recommender: enabled: true replicaCount: 1 - # Resource settings (VPA lowerBound/upperBound) + # Resource settings (no CPU limit for stability) resources: requests: cpu: 15m memory: 100Mi limits: - cpu: 15m memory: 100Mi # Updater - applies recommended resource requests to pods