Skip to content

Commit 15800ba

Browse files
[DPE-6011] - feat: reconcile vm+k8s feature drift (#152)
1 parent 7e4cfa5 commit 15800ba

27 files changed

+1147
-214
lines changed

.github/workflows/ci.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ jobs:
7171
- integration-upgrade
7272
- integration-balancer-single
7373
- integration-balancer-multi
74+
- integration-kraft-single
75+
- integration-kraft-multi
7476
name: ${{ matrix.tox-environments }}
7577
needs:
7678
- lint

actions.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ get-admin-credentials:
3636
The returned client_properties can be used for Kafka bin commands using `--bootstrap-server` and `--command-config` for admin level administration
3737
This action must be called on the leader unit.
3838

39+
get-listeners:
40+
description: Get all active listeners and their port allocations
41+
3942
rebalance:
4043
description: Trigger a rebalance of cluster partitions based on configured goals
4144
params:

config.yaml

+5-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ options:
55
roles:
66
description: |
77
Comma separated list of the roles assigned to the nodes of this cluster.
8-
This configuration accepts the following roles: 'broker' (standard functionality), 'balancer' (cruise control).
8+
This configuration accepts the following roles: 'broker' (standard functionality), 'balancer' (cruise control), 'controller' (KRaft mode).
99
type: string
1010
default: broker
1111
compression_type:
@@ -92,6 +92,10 @@ options:
9292
description: Config options to add extra-sans to the ones used when requesting server certificates. The extra-sans are specified by comma-separated names to be added when requesting signed certificates. Use "{unit}" as a placeholder to be filled with the unit number, e.g. "worker-{unit}" will be translated as "worker-0" for unit 0 and "worker-1" for unit 1 when requesting the certificate.
9393
type: string
9494
default: ""
95+
extra_listeners:
96+
description: "Config options to add extra SANs to the ones used when requesting server certificates, and to define custom `advertised.listeners` and ports for clients external to the Juju model. These items are comma-separated. Use '{unit}' as a placeholder to be filled with the unit number if necessary. For port allocations, providing the port for a given listener will offset the generated port number by that amount, with an accepted value range of 20001-50000. For example, a provided value of 'worker-{unit}.domain.com:30000' will generate listeners for unit 0 with name 'worker-0.domain.com', and be allocated ports 39092, 39093 etc for each authentication scheme."
97+
type: string
98+
default: ""
9599
log_level:
96100
description: "Level of logging for the different components operated by the charm. Possible values: ERROR, WARNING, INFO, DEBUG"
97101
type: string

metadata.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ resources:
3434
kafka-image:
3535
type: oci-image
3636
description: OCI Image for Apache Kafka
37-
upstream-source: ghcr.io/canonical/charmed-kafka@sha256:67d2729ca6c4f158682c481a512c37e555bae6edc30e8f0acdb0460fcbeffe88
37+
upstream-source: ghcr.io/canonical/charmed-kafka@sha256:0f180540572828e3152ab6a39b80891c6dfb2d6abc79914ec19646e6b487b1c8
3838

3939
peers:
4040
cluster:

src/charm.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,10 @@ def _on_roles_changed(self, _):
9999
This handler is in charge of stopping the workloads, since the sub-operators would not
100100
be instantiated if roles are changed.
101101
"""
102-
if not self.state.runs_broker and self.broker.workload.active():
102+
if (
103+
not (self.state.runs_broker or self.state.runs_controller)
104+
and self.broker.workload.active()
105+
):
103106
self.broker.workload.stop()
104107

105108
if (

src/core/cluster.py

+124-44
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from lightkube.core.exceptions import ApiError as LightKubeApiError
2424
from ops import Object, Relation
2525
from ops.model import Unit
26+
from tenacity import retry, retry_if_exception_cause_type, stop_after_attempt, wait_fixed
2627

2728
from core.models import (
2829
BrokerCapacities,
@@ -37,7 +38,10 @@
3738
ADMIN_USER,
3839
BALANCER,
3940
BROKER,
41+
CONTROLLER,
42+
CONTROLLER_PORT,
4043
INTERNAL_USERS,
44+
KRAFT_NODE_ID_OFFSET,
4145
MIN_REPLICAS,
4246
OAUTH_REL_NAME,
4347
PEER,
@@ -86,7 +90,7 @@ class PeerClusterData(ProviderData, RequirerData):
8690
"""Broker provider data model."""
8791

8892
SECRET_LABEL_MAP = SECRET_LABEL_MAP
89-
SECRET_FIELDS = BALANCER.requested_secrets
93+
SECRET_FIELDS = list(set(BALANCER.requested_secrets) | set(CONTROLLER.requested_secrets))
9094

9195

9296
class ClusterState(Object):
@@ -138,45 +142,48 @@ def peer_cluster_relation(self) -> Relation | None:
138142
@property
139143
def peer_cluster_orchestrator(self) -> PeerCluster:
140144
"""The state for the related `peer-cluster-orchestrator` application that this charm is requiring from."""
141-
balancer_kwargs: dict[str, Any] = (
142-
{
143-
"balancer_username": self.cluster.balancer_username,
144-
"balancer_password": self.cluster.balancer_password,
145-
"balancer_uris": self.cluster.balancer_uris,
146-
}
147-
if self.runs_balancer
148-
else {}
149-
)
145+
extra_kwargs: dict[str, Any] = {}
146+
147+
if self.runs_balancer:
148+
extra_kwargs.update(
149+
{
150+
"balancer_username": self.cluster.balancer_username,
151+
"balancer_password": self.cluster.balancer_password,
152+
"balancer_uris": self.cluster.balancer_uris,
153+
}
154+
)
155+
156+
if self.runs_controller:
157+
extra_kwargs.update(
158+
{
159+
"controller_quorum_uris": self.cluster.controller_quorum_uris,
160+
}
161+
)
162+
150163
return PeerCluster(
151164
relation=self.peer_cluster_relation,
152165
data_interface=PeerClusterData(self.model, PEER_CLUSTER_RELATION),
153-
**balancer_kwargs,
166+
**extra_kwargs,
154167
)
155168

156169
@property
157170
def peer_cluster(self) -> PeerCluster:
158-
"""The state for the related `peer-cluster` application that this charm is providing to."""
159-
return PeerCluster(
160-
relation=self.peer_cluster_orchestrator_relation,
161-
data_interface=PeerClusterOrchestratorData(
162-
self.model, PEER_CLUSTER_ORCHESTRATOR_RELATION
163-
),
164-
)
165-
166-
@property
167-
def balancer(self) -> PeerCluster:
168171
"""The state for the `peer-cluster-orchestrator` related balancer application."""
169-
balancer_kwargs: dict[str, Any] = (
170-
{
171-
"balancer_username": self.cluster.balancer_username,
172-
"balancer_password": self.cluster.balancer_password,
173-
"balancer_uris": self.cluster.balancer_uris,
174-
}
175-
if self.runs_balancer
176-
else {}
177-
)
172+
extra_kwargs: dict[str, Any] = {}
178173

179-
if self.runs_broker: # must be providing, initialise with necessary broker data
174+
if self.runs_controller or self.runs_balancer:
175+
extra_kwargs.update(
176+
{
177+
"balancer_username": self.cluster.balancer_username,
178+
"balancer_password": self.cluster.balancer_password,
179+
"balancer_uris": self.cluster.balancer_uris,
180+
"controller_quorum_uris": self.cluster.controller_quorum_uris,
181+
}
182+
)
183+
184+
# FIXME: `cluster_manager` check instead of running broker
185+
# must be providing, initialise with necessary broker data
186+
if self.runs_broker:
180187
return PeerCluster(
181188
relation=self.peer_cluster_orchestrator_relation, # if same app, this will be None and OK
182189
data_interface=PeerClusterOrchestratorData(
@@ -185,12 +192,13 @@ def balancer(self) -> PeerCluster:
185192
broker_username=ADMIN_USER,
186193
broker_password=self.cluster.internal_user_credentials.get(ADMIN_USER, ""),
187194
broker_uris=self.bootstrap_server,
195+
cluster_uuid=self.cluster.cluster_uuid,
188196
racks=self.racks,
189197
broker_capacities=self.broker_capacities,
190198
zk_username=self.zookeeper.username,
191199
zk_password=self.zookeeper.password,
192200
zk_uris=self.zookeeper.uris,
193-
**balancer_kwargs, # in case of roles=broker,balancer on this app
201+
**extra_kwargs, # in case of roles=broker,[balancer,controller] on this app
194202
)
195203

196204
else: # must be roles=balancer only then, only load with necessary balancer data
@@ -346,7 +354,11 @@ def default_auth(self) -> AuthMap:
346354
def enabled_auth(self) -> list[AuthMap]:
347355
"""The currently enabled auth.protocols and their auth.mechanisms, based on related applications."""
348356
enabled_auth = []
349-
if self.client_relations or self.runs_balancer or self.peer_cluster_orchestrator_relation:
357+
if (
358+
self.client_relations
359+
or self.runs_balancer
360+
or BALANCER.value in self.peer_cluster_orchestrator.roles
361+
):
350362
enabled_auth.append(self.default_auth)
351363
if self.oauth_relation:
352364
enabled_auth.append(AuthMap(self.default_auth.protocol, "OAUTHBEARER"))
@@ -356,6 +368,12 @@ def enabled_auth(self) -> list[AuthMap]:
356368
return enabled_auth
357369

358370
@property
371+
@retry(
372+
wait=wait_fixed(5),
373+
stop=stop_after_attempt(3),
374+
retry=retry_if_exception_cause_type(LightKubeApiError),
375+
reraise=True,
376+
)
359377
def bootstrap_servers_external(self) -> str:
360378
"""Comma-delimited string of `bootstrap-server` for external access."""
361379
return ",".join(
@@ -394,6 +412,21 @@ def bootstrap_server(self) -> str:
394412
)
395413
)
396414

415+
@property
416+
def controller_quorum_uris(self) -> str:
417+
"""The current controller quorum uris when running KRaft mode."""
418+
# FIXME: when running broker node.id will be unit-id + 100. If unit is only running
419+
# the controller node.id == unit-id. This way we can keep a human readable mapping of ids.
420+
if self.runs_controller:
421+
node_offset = KRAFT_NODE_ID_OFFSET if self.runs_broker else 0
422+
return ",".join(
423+
[
424+
f"{broker.unit_id + node_offset}@{broker.internal_address}:{CONTROLLER_PORT}"
425+
for broker in self.brokers
426+
]
427+
)
428+
return ""
429+
397430
@property
398431
def log_dirs(self) -> str:
399432
"""Builds the necessary log.dirs based on mounted storage volumes.
@@ -446,7 +479,7 @@ def ready_to_start(self) -> Status: # noqa: C901
446479
if not self.peer_relation:
447480
return Status.NO_PEER_RELATION
448481

449-
for status in [self._broker_status, self._balancer_status]:
482+
for status in [self._broker_status, self._balancer_status, self._controller_status]:
450483
if status != Status.ACTIVE:
451484
return status
452485

@@ -461,29 +494,40 @@ def _balancer_status(self) -> Status:
461494
if not self.peer_cluster_relation and not self.runs_broker:
462495
return Status.NO_PEER_CLUSTER_RELATION
463496

464-
if not self.balancer.broker_connected:
497+
if not self.peer_cluster.broker_connected:
465498
return Status.NO_BROKER_DATA
466499

467-
if len(self.balancer.broker_capacities.get("brokerCapacities", [])) < MIN_REPLICAS:
500+
if len(self.peer_cluster.broker_capacities.get("brokerCapacities", [])) < MIN_REPLICAS:
468501
return Status.NOT_ENOUGH_BROKERS
469502

470503
return Status.ACTIVE
471504

472505
@property
473-
def _broker_status(self) -> Status:
506+
def _broker_status(self) -> Status: # noqa: C901
474507
"""Checks for role=broker specific readiness."""
475508
if not self.runs_broker:
476509
return Status.ACTIVE
477510

478-
if not self.zookeeper:
479-
return Status.ZK_NOT_RELATED
511+
# Neither ZooKeeper or KRaft are active
512+
if self.kraft_mode is None:
513+
return Status.MISSING_MODE
514+
515+
if self.kraft_mode:
516+
if not self.peer_cluster.controller_quorum_uris: # FIXME: peer_cluster or cluster?
517+
return Status.NO_QUORUM_URIS
518+
if not self.cluster.cluster_uuid:
519+
return Status.NO_CLUSTER_UUID
480520

481-
if not self.zookeeper.zookeeper_connected:
482-
return Status.ZK_NO_DATA
521+
if self.kraft_mode == False: # noqa: E712
522+
if not self.zookeeper:
523+
return Status.ZK_NOT_RELATED
483524

484-
# TLS must be enabled for Kafka and ZK or disabled for both
485-
if self.cluster.tls_enabled ^ self.zookeeper.tls:
486-
return Status.ZK_TLS_MISMATCH
525+
if not self.zookeeper.zookeeper_connected:
526+
return Status.ZK_NO_DATA
527+
528+
# TLS must be enabled for Kafka and ZK or disabled for both
529+
if self.cluster.tls_enabled ^ self.zookeeper.tls:
530+
return Status.ZK_TLS_MISMATCH
487531

488532
if self.cluster.tls_enabled and not self.unit_broker.certificate:
489533
return Status.NO_CERT
@@ -493,6 +537,37 @@ def _broker_status(self) -> Status:
493537

494538
return Status.ACTIVE
495539

540+
@property
541+
def _controller_status(self) -> Status:
542+
"""Checks for role=controller specific readiness."""
543+
if not self.runs_controller:
544+
return Status.ACTIVE
545+
546+
if not self.peer_cluster_relation and not self.runs_broker:
547+
return Status.NO_PEER_CLUSTER_RELATION
548+
549+
if not self.peer_cluster.broker_connected_kraft_mode:
550+
return Status.NO_BROKER_DATA
551+
552+
return Status.ACTIVE
553+
554+
@property
555+
def kraft_mode(self) -> bool | None:
556+
"""Is the deployment running in KRaft mode?
557+
558+
Returns:
559+
True if Kraft mode, False if ZooKeeper, None when undefined.
560+
"""
561+
# NOTE: self.roles when running colocated, peer_cluster.roles when multiapp
562+
if CONTROLLER.value in (self.roles + self.peer_cluster.roles):
563+
return True
564+
if self.zookeeper_relation:
565+
return False
566+
567+
# FIXME raise instead of none. `not kraft_mode` is falsy
568+
# NOTE: if previous checks are not met, we don't know yet how the charm is being deployed
569+
return None
570+
496571
@property
497572
def runs_balancer(self) -> bool:
498573
"""Is the charm enabling the balancer?"""
@@ -502,3 +577,8 @@ def runs_balancer(self) -> bool:
502577
def runs_broker(self) -> bool:
503578
"""Is the charm enabling the broker(s)?"""
504579
return BROKER.value in self.roles
580+
581+
@property
582+
def runs_controller(self) -> bool:
583+
"""Is the charm enabling the controller?"""
584+
return CONTROLLER.value in self.roles

0 commit comments

Comments
 (0)