
Commit a4dbb49

[DPE-2331] Backport dashboards and alerting rules from vm to k8s (#61)
1 parent 2bf3e1f · commit a4dbb49

5 files changed (+292, -483 lines)


src/alert_rules/loki/.gitkeep

Whitespace-only changes.
@@ -0,0 +1,249 @@
groups:
  - name: kafka.alerts
    rules:
      # ==============
      # Base JMX Rules
      # ==============
      - alert: Kafka Missing
        expr: up{juju_charm!=".*"} == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target missing (instance {{ $labels.instance }})
          description: "Kafka target has disappeared. An exporter might have crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: JvmMemory Filling Up
        expr: (sum by (instance)(jvm_memory_bytes_used{area="heap",juju_charm!=".*"}) / sum by (instance)(jvm_memory_bytes_max{area="heap",juju_charm!=".*"})) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: JVM memory filling up (instance {{ $labels.instance }})
          description: "JVM memory is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: Kafka Threads Dead Locked
        expr: jvm_threads_deadlocked{juju_charm!=".*"} > 0
        labels:
          severity: warning
        annotations:
          summary: "Kafka JVM thread deadlock occurred."
          description: |-
            A JVM thread deadlock is a situation where two or more JVM threads are blocked forever, waiting for each other.
            Deadlock occurs when multiple threads need the same locks but obtain them in a different order.

            Also see the JVM documentation about thread states:
            https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/lang/Thread.State.html

      # ============
      # Broker State
      # ============
      - alert: Broker State
        expr: count(kafka_server_kafkaserver_brokerstate{juju_charm!=".*"}) by (instance) == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Broker {{ $labels.instance }} :: Broker State :: No Brokers alive.'
          description: 'Broker count is 0'

      - alert: Zookeeper Session Connection
        expr: avg(kafka_server_sessionexpirelistener_zookeepersyncconnectspersec{juju_charm!=".*"})by(instance) < 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: 'Broker {{ $labels.instance }} :: Broker State :: Zookeeper Sync Disconnected.'
          description: 'Zookeeper Sync Disconnected.'

      - alert: Zookeeper Session Expiry
        expr: rate(kafka_server_sessionexpirelistener_zookeeperexpirespersec{juju_charm!=".*"}[5m]) != 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: 'Broker {{ $labels.instance }} :: Broker State :: The ZooKeeper session has expired.'
          description: 'When a session expires, we can have leader changes and even a new controller. It is important to keep an eye on the number of such events across a Kafka cluster and to investigate if the overall number is high.'

      # =========================
      # Controller and Partitions
      # =========================
      - alert: Active Controllers
        expr: sum(kafka_controller_kafkacontroller_activecontrollercount{juju_charm!=".*"}) != 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Broker :: Controller and Partitions :: No active controller'
          description: 'No broker in the cluster is reporting as the active controller in the last 1 minute interval. During steady state there should be only one active controller per cluster.'

      - alert: Offline Partitions
        expr: sum(kafka_controller_kafkacontroller_offlinepartitionscount{juju_charm!=".*"}) by (instance) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Broker {{ $labels.instance }} :: Controller and Partitions :: {{ $value }} partitions offline'
          description: 'After successful leader election, if the leader for a partition dies, then the partition moves to the OfflinePartition state. Offline partitions are not available for reading and writing. Restart the brokers, if needed, and check the logs for errors.'

      - alert: High Partitions Count
        expr: kafka_server_replicamanager_partitioncount{juju_charm!=".*"} > 3000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: 'Broker {{ $labels.instance }} :: Controller and Partitions :: Too many partitions :: {{ $value }} partitions in broker'
          description: 'The recommended number of partitions per broker is below 4000. Increase the number of brokers and rebalance partitions to keep this number under control.'

      # =======================
      # Replicas and Partitions
      # =======================
      - alert: Under Replicated Partitions
        expr: sum(kafka_server_replicamanager_underreplicatedpartitions{juju_charm!=".*"}) by (instance) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Broker {{ $labels.instance }} :: Replicas and Partitions :: {{ $value }} under replicated partitions'
          description: 'Under-replicated partitions means that one or more replicas are not available. This is usually because a broker is down. Restart the broker, and check for errors in the logs.'

      - alert: Replica Fetcher Manager Max Lag
        expr: avg(kafka_server_replicafetchermanager_maxlag{juju_charm!=".*"})by(instance) > 50
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Broker {{ $labels.instance }} :: Replicas and Partitions :: Replica Fetcher Manager Max Lag is {{ $value }}!'
          description: 'The maximum lag between the time that messages are received by the leader replica and by the follower replicas.'

      - alert: Not Replicated Topic
        expr: count(kafka_cluster_partition_insyncreplicascount{juju_charm!=".*"})by(topic, partition) <= 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Topic {{ $labels.topic }} - Partition {{ $labels.partition }} :: Replicas and Partitions :: The ISR for the topic is lower than or equal to 1.'
          description: 'The topic is subject to data loss if the partition goes down and data cannot be recovered.'

      # ================
      # In Sync Replicas
      # ================
      - alert: ISR Expands Rate
        expr: max(rate(kafka_server_replicamanager_isrexpandspersec{juju_charm!=".*"}[5m])) by (instance) != 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: 'Broker {{ $labels.instance }} :: In Sync Replicas :: {{ $value }} ISR Expansion Rate.'
          description: 'If a broker goes down, the ISR for some of the partitions shrinks. When that broker is up again, ISRs are expanded once the replicas are fully caught up. Other than that, the expected value for the ISR expansion rate is 0. If the ISR is expanding and shrinking frequently, adjust the allowed replica lag.'

      - alert: ISR Shrinks Rate
        expr: max(rate(kafka_server_replicamanager_isrshrinkspersec{juju_charm!=".*"}[5m])) by (instance) != 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: 'Broker {{ $labels.instance }} :: In Sync Replicas :: {{ $value }} ISR Shrink Rate.'
          description: 'If a broker goes down, the ISR for some of the partitions shrinks. When that broker is up again, ISRs are expanded once the replicas are fully caught up. Other than that, the expected value for the ISR shrink rate is 0. If the ISR is expanding and shrinking frequently, adjust the allowed replica lag.'

      # ================
      # Leader Elections
      # ================
      - alert: Leader Election Rate
        expr: max(rate(kafka_controller_controllerstats_leaderelectionrateandtimems{juju_charm!=".*",quantile=""}[2m]))by(instance) !=0
        for: 60s
        labels:
          severity: critical
        annotations:
          summary: "Broker Kafka :: Leader Elections :: Disputed leader election rate is {{ $value }}"
          description: "Critical: the Kafka disputed leader election rate has been non-zero (!=0) on the instance {{ $labels.instance }} for more than 1 minute"

      - alert: Unclean Leader Election
        expr: max(rate(kafka_controller_controllerstats_uncleanleaderelectionspersec{juju_charm!=".*"}[5m])) by (instance) != 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Broker {{ $labels.instance }} :: Leader Elections :: {{ $value }} unclean leader elections.'
          description: '{{ $value }} unclean partition leader elections in the cluster reported in the last 1 minute interval. When unclean leader election is held among out-of-sync replicas, there is a possibility of data loss if any messages were not synced prior to the loss of the former leader. So if the number of unclean elections is greater than 0, investigate broker logs to determine why leaders were re-elected, and look for WARN or ERROR messages. Consider setting the broker configuration parameter unclean.leader.election.enable to false so that a replica outside of the set of in-sync replicas is never elected leader.'

      # ==============
      # Consumer Level
      # ==============
      - alert: Records Lag Max
        expr: sum(kafka_server_fetcherlagmetrics_consumerlag{juju_charm!=".*"}) by(instance, client_id) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Broker {{ $labels.instance }} :: Consumer :: The maximum lag is {{ $value }}.'
          description: 'The maximum lag in terms of number of records for any partition in this window. An increasing value over time is your best indication that the consumer group is not keeping up with the producers.'

      # ===============
      # Thread Capacity
      # ===============
      - alert: Network Processor Idle Percent
        expr: avg(sum(kafka_network_processor_idlepercent{juju_charm!=".*"}) by (instance, networkProcessor)) by (instance) < 0.3
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Broker {{ $labels.instance }} :: Thread Capacity :: Network Processor Idle Percent is {{ $value }}.'
          description: 'The average fraction of time the network processors are idle. A lower value {{ $value }} indicates that the network workload of the broker is very high.'

      - alert: Request Handler Idle Percent
        expr: avg(kafka_server_kafkarequesthandlerpool_requesthandleravgidlepercent_total{juju_charm!=".*"}) by (instance) < 0.3
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Broker {{ $labels.instance }} :: Thread Capacity :: Request Handler Idle Percent is {{ $value }}.'
          description: 'The average fraction of time the request handler threads (IO) are idle. A lower value {{ $value }} indicates that the workload of a broker is very high.'

      - alert: Request Queue Time Max
        expr: max(kafka_network_requestmetrics_requestqueuetimems{quantile="0.95", juju_charm!=".*"}) by(instance) > 200
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: 'Kafka {{ $labels.instance }}: {{ $value }}ms request queue time'
          description: 'Max request queue time exceeded 200ms for a request. It is the time, in milliseconds, that a request currently spends in the request queue.'

      - alert: Response Queue Time Max
        expr: max(kafka_network_requestmetrics_responsequeuetimems{quantile="0.95", juju_charm!=".*"}) by(instance) > 200
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: 'Kafka {{ $labels.instance }}: {{ $value }}ms response queue time'
          description: 'Max response queue time exceeded 200ms for a request. It is the length of time, in milliseconds, that the request waits in the response queue.'

      # ==========
      # Safe Guard
      # ==========
      - alert: JVM Usage
        expr: ((sum without(area)(jvm_memory_bytes_used{juju_charm!=".*"}) / 1024 / 1024) / (sum without(area)(jvm_memory_bytes_max{juju_charm!=".*"}) / 1024 / 1024)) * 100 > 70
        for: 60s
        labels:
          severity: critical
        annotations:
          summary: "Broker {{ $labels.instance }} :: Critical :: Heap memory usage is {{ $value }}%"
          description: "The broker {{ $labels.instance }} has had high memory usage ({{ $value }} > 70%) for more than 1 minute."

      - alert: Offline Log Directory
        expr: kafka_log_logmanager_offlinelogdirectorycount{juju_charm!=".*"} > 0
        for: 10s
        labels:
          severity: warning
        annotations:
          summary: 'Broker {{ $labels.instance }} :: Kafka offline log directories.'
          description: 'There are {{ $value }} offline log directories on {{ $labels.instance }}.'

      - alert: Topic Count
        expr: count(count by (topic,instance) (rate(kafka_server_brokertopicmetrics_messagesinpersec{juju_charm!=".*"}[5m]))) by (instance) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Broker {{ $labels.instance }} :: Safe Guard :: 1000 topics reached'
          description: 'The number of active topics in the cluster has reached 1000.'
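
The rules above follow the standard Prometheus alerting-rule layout (groups, then rules, each with alert, expr, for, labels, and annotations), so they can be linted before bundling, for example with promtool check rules. As a minimal illustrative sketch (not part of this commit), the snippet below loads a rules file with PyYAML and verifies that every rule defines the fields used throughout this file; the file path is an assumption for the example only.

# Illustrative sketch only (not part of this commit): sanity-check an alert
# rules file before bundling it with the charm. The path below is an assumed
# example location, not necessarily the file added by this change.
from pathlib import Path

import yaml  # PyYAML

RULES_FILE = Path("src/alert_rules/prometheus/kafka_alerts.yaml")  # hypothetical path

# Fields every rule in the file above defines. Prometheus itself only
# requires "alert" and "expr"; "labels" and "annotations" are optional.
EXPECTED_KEYS = {"alert", "expr", "labels", "annotations"}


def check_rules(path: Path) -> None:
    """Raise if any rule in the file is missing one of the expected fields."""
    content = yaml.safe_load(path.read_text())
    for group in content.get("groups", []):
        for rule in group.get("rules", []):
            missing = EXPECTED_KEYS - rule.keys()
            if missing:
                raise ValueError(f"rule {rule.get('alert', '<unnamed>')} is missing {missing}")
    print(f"{path}: all rules define {sorted(EXPECTED_KEYS)}")


if __name__ == "__main__":
    check_rules(RULES_FILE)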

src/charm.py

+4
@@ -36,6 +36,8 @@
     JAVA_HOME,
     JMX_EXPORTER_PORT,
     LOGS_PATH,
+    LOGS_RULES_DIR,
+    METRICS_RULES_DIR,
     PEER,
     REL_NAME,
     ZOOKEEPER_REL_NAME,
@@ -65,11 +67,13 @@ def __init__(self, *args):
         self.metrics_endpoint = MetricsEndpointProvider(
             self,
             jobs=[{"static_configs": [{"targets": [f"*:{JMX_EXPORTER_PORT}"]}]}],
+            alert_rules_path=METRICS_RULES_DIR,
         )
         self.grafana_dashboards = GrafanaDashboardProvider(self)
         self.loki_push = LogProxyConsumer(
             self,
             log_files=[f"{LOGS_PATH}/server.log"],
+            alert_rules_path=LOGS_RULES_DIR,
             relation_name="logging",
             container_name="kafka",
         )
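
Both providers are pointed at directories of rule files bundled with the charm source via alert_rules_path, so that the related Prometheus and Loki charms can pick the rules up over their respective relations. The values of LOGS_RULES_DIR and METRICS_RULES_DIR are not shown in this diff; a rough sketch, under the assumption that they simply resolve to the new rule directories, could look like:

# Sketch only: the real constants live in the charm's constants module and may
# use different values; the paths below are assumptions chosen to match the
# file tree in this commit (src/alert_rules/...).
METRICS_RULES_DIR = "./src/alert_rules/prometheus"  # consumed by MetricsEndpointProvider
LOGS_RULES_DIR = "./src/alert_rules/loki"           # consumed by LogProxyConsumer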

0 commit comments
