|
20 | 20 | "rules": [
|
21 | 21 | {
|
22 | 22 | "alert": "Amd64 metric missing in cluster ci-dev-aks-mac-eus",
|
23 |
| - "expression": "absent(node_uname_info{machine=\"x86_64\"}) == 1 or node_uname_info{machine=\"x86_64\"} == 0", |
| 23 | + "expression": "absent(node_uname_info{job=\"node\",machine=\"x86_64\"}) == 1 or node_uname_info{job=\"node\",machine=\"x86_64\"} == 0", |
24 | 24 | "for": "PT30M",
|
25 | 25 | "annotations": {
|
26 | 26 | "description": "Amd64 metric missing in cluster ci-dev-aks-mac-eus"
|
|
200 | 200 | },
|
201 | 201 | {
|
202 | 202 | "alert": "CPU usage % greater than 75 for prometheus-collector containers on cluster ci-dev-aks-mac-eus",
|
203 |
| - "expression": "sum(sum by (cluster, namespace, pod, container) ( rate(container_cpu_usage_seconds_total{job=\"cadvisor\", image!=\"\", namespace=\"kube-system\", container=\"prometheus-collector\"}[5m]) ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=\"\", namespace=\"kube-system\"}) )) by (container, pod) *100 > 75", |
| 203 | + "expression": "sum(sum by (cluster, namespace, pod, container) ( rate(container_cpu_usage_seconds_total{job=\"cadvisor\", image!=\"\", namespace=\"kube-system\", container=\"prometheus-collector\"}[5m]) ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( 1, max by(cluster, namespace, pod, node) (kube_pod_info{job=\"kube-state-metrics\",node!=\"\", namespace=\"kube-system\"}) )) by (container, pod) *100 > 75", |
204 | 204 | "for": "PT3M",
|
205 | 205 | "annotations": {
|
206 | 206 | "description": "CPU usage greater than 75% for prometheus-collector on cluster ci-dev-aks-mac-eus"
|
|
218 | 218 | },
|
219 | 219 | {
|
220 | 220 | "alert": "Memory usage % greater than 75 for prometheus-collector containers on cluster ci-dev-aks-mac-eus",
|
221 |
| - "expression": "(sum(container_memory_working_set_bytes{namespace=\"kube-system\", container=\"prometheus-collector\", image!=\"\"}) by (container, pod) / sum(kube_pod_container_resource_limits{namespace=\"kube-system\", container=\"prometheus-collector\", resource=\"memory\"}) by (container, pod)) * 100> 75", |
| 221 | + "expression": "(sum(container_memory_working_set_bytes{job=\"cadvisor\",namespace=\"kube-system\", container=\"prometheus-collector\", image!=\"\"}) by (container, pod) / sum(kube_pod_container_resource_limits{job=\"kube-state-metrics\",namespace=\"kube-system\", container=\"prometheus-collector\", resource=\"memory\"}) by (container, pod)) * 100> 75", |
222 | 222 | "for": "PT3M",
|
223 | 223 | "annotations": {
|
224 | 224 | "description": "Memory usage greater than 75% for prometheus-collector containers on cluster ci-dev-aks-mac-eus"
|
|
254 | 254 | },
|
255 | 255 | {
|
256 | 256 | "alert": "New agent version found for prometheus collector",
|
257 |
| - "expression": "count(count (kube_pod_container_info{image=~\"mcr.microsoft.com/azuremonitor/containerinsights/ciprod/prometheus-collector.*\"}) by (image)) > 4", |
| 257 | + "expression": "count(count (kube_pod_container_info{job=\"kube-state-metrics\",image=~\"mcr.microsoft.com/azuremonitor/containerinsights/ciprod/prometheus-collector.*\"}) by (image)) > 4", |
258 | 258 | "for": "PT60S",
|
259 | 259 | "annotations": {
|
260 | 260 | "description": "New agent version found for prometheus collector. This alert is only used in near ring regions for prod monitoring clusters"
|
|
0 commit comments