Use Expr instead of HLG #14610

GitHub Actions / Unit Test Results failed Feb 18, 2025 in 0s

3 errors, 1 009 fail, 259 skipped, 2 252 pass in 15h 17m 10s

18 files - 1 18 suites - 1 15h 17m 10s ⏱️ - 1h 3m 44s
3 523 tests - 214 2 252 ✅ - 188 259 💤 + 16 1 009 ❌ - 42 3 🔥 ±0
16 443 runs - 2 199 9 957 ✅ - 1 263 1 022 💤 - 187 5 453 ❌ - 746 11 🔥 - 3

Results for commit ee38736. ± Comparison against earlier commit 16d2d44.

Annotations

Check warning on line 0 in distributed.cli.tests.test_dask_worker

github-actions / Unit Test Results

1 out of 5 runs failed: test_contact_listen_address[tcp://0.0.0.0:---nanny] (distributed.cli.tests.test_dask_worker)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 1s]

Raw output


            AttributeError: 'dict' object has no attribute '__dask_graph__'
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:43743', workers: 0, cores: 0, tasks: 0>
nanny = '--nanny', listen_address = 'tcp://0.0.0.0:35207'

    @pytest.mark.slow
    @pytest.mark.skipif(not LINUX, reason="Need 127.0.0.2 to mean localhost")
    @pytest.mark.parametrize("nanny", ["--nanny", "--no-nanny"])
    @pytest.mark.parametrize("listen_address", ["tcp://0.0.0.0:", "tcp://127.0.0.2:"])
    @gen_cluster(client=True, nthreads=[])
    async def test_contact_listen_address(c, s, nanny, listen_address):
        port = open_port()
        listen_address += str(port)
        with popen(
            [
                sys.executable,
                "-m",
                "dask",
                "worker",
                s.address,
                nanny,
                "--no-dashboard",
                "--contact-address",
                f"tcp://127.0.0.2:{port}",
                "--listen-address",
                listen_address,
            ]
        ):
            await c.wait_for_workers(1)
            info = c.scheduler_info()
            assert info["workers"].keys() == {f"tcp://127.0.0.2:{port}"}
    
            # roundtrip works
>           assert await c.submit(lambda x: x + 1, 10) == 11

distributed/cli/tests/test_dask_worker.py:500: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
distributed/client.py:408: in _result
    raise exc.with_traceback(tb)
distributed/utils.py:1507: in run_in_executor_with_context
    return await loop.run_in_executor(
../../../miniconda3/envs/dask-distributed/lib/python3.10/concurrent/futures/thread.py:58: in run
    result = self.fn(*self.args, **self.kwargs)
distributed/utils.py:1508: in <lambda>
    executor, lambda: context.run(func, *args, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import dataclasses
    import heapq
    import inspect
    import itertools
    import json
    import logging
    import math
    import operator
    import os
    import pickle
    import random
    import textwrap
    import uuid
    import warnings
    import weakref
    from abc import abstractmethod
    from collections import defaultdict, deque
    from collections.abc import (
        Callable,
        Collection,
        Container,
        Hashable,
        Iterable,
        Iterator,
        Mapping,
        Sequence,
        Set,
    )
    from contextlib import suppress
    from functools import partial
    from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple, cast, overload
    
    import psutil
    import tornado.web
    from sortedcontainers import SortedDict, SortedSet
    from tlz import (
        concat,
        first,
        groupby,
        merge,
        merge_sorted,
        merge_with,
        partition,
        pluck,
        second,
        take,
        valmap,
    )
    from tornado.ioloop import IOLoop
    
    import dask
    import dask.utils
    from dask._task_spec import DependenciesMapping, GraphNode, convert_legacy_graph
    from dask.core import istask, validate_key
    from dask.typing import Key, no_default
    from dask.utils import (
        _deprecated,
        _deprecated_kwarg,
        format_bytes,
        format_time,
        key_split,
        parse_bytes,
        parse_timedelta,
        tmpfile,
    )
    from dask.widgets import get_template
    
    from distributed import cluster_dump, preloading, profile
    from distributed import versions as version_module
    from distributed._asyncio import RLock
    from distributed._stories import scheduler_story
    from distributed.active_memory_manager import ActiveMemoryManagerExtension, RetireWorker
    from distributed.batched import BatchedSend
    from distributed.broker import Broker
    from distributed.client import SourceCode
    from distributed.collections import HeapSet
    from distributed.comm import (
        Comm,
        CommClosedError,
        get_address_host,
        normalize_address,
        resolve_address,
        unparse_host_port,
    )
    from distributed.comm.addressing import addresses_from_user_args
    from distributed.compatibility import PeriodicCallback
    from distributed.core import (
        ErrorMessage,
        OKMessage,
        Status,
        clean_exception,
        error_message,
        rpc,
        send_recv,
    )
    from distributed.diagnostics.memory_sampler import MemorySamplerExtension
    from distributed.diagnostics.plugin import SchedulerPlugin, _get_plugin_name
    from distributed.event import EventExtension
    from distributed.gc import disable_gc_diagnosis, enable_gc_diagnosis
    from distributed.http import get_handlers
    from distributed.metrics import monotonic, time
    from distributed.multi_lock import MultiLockExtension
    from distributed.node import ServerNode
    from distributed.proctitle import setproctitle
    from distributed.protocol import deserialize
    from distributed.protocol.pickle import dumps, loads
    from distributed.protocol.serialize import Serialized, ToPickle, serialize
    from distributed.publish import PublishExtension
    from distributed.pubsub import PubSubSchedulerExtension
    from distributed.queues import QueueExtension
    from distributed.recreate_tasks import ReplayTaskScheduler
    from distributed.security import Security
    from distributed.semaphore import SemaphoreExtension
    from distributed.shuffle import ShuffleSchedulerPlugin
    from distributed.spans import SpanMetadata, SpansSchedulerExtension
    from distributed.stealing import WorkStealing
    from distributed.utils import (
        All,
        Deadline,
        TimeoutError,
        format_dashboard_link,
        get_fileno_limit,
        key_split_group,
        log_errors,
        offload,
        recursive_to_dict,
        wait_for,
    )
    from distributed.utils_comm import (
        gather_from_workers,
        retry_operation,
        scatter_to_workers,
    )
    from distributed.variable import VariableExtension
    
    if TYPE_CHECKING:
        # TODO import from typing (requires Python >=3.10)
        # TODO import from typing (requires Python >=3.11)
        from typing_extensions import Self, TypeAlias
    
        from dask._expr import Expr
    
    # Not to be confused with distributed.worker_state_machine.TaskStateState
    TaskStateState: TypeAlias = Literal[
        "released",
        "waiting",
        "no-worker",
        "queued",
        "processing",
        "memory",
        "erred",
        "forgotten",
    ]
    
    ALL_TASK_STATES: Set[TaskStateState] = set(TaskStateState.__args__)  # type: ignore
    
    # {task key -> finish state}
    # Not to be confused with distributed.worker_state_machine.Recs
    Recs: TypeAlias = dict[Key, TaskStateState]
    # {client or worker address: [{op: <key>, ...}, ...]}
    Msgs: TypeAlias = dict[str, list[dict[str, Any]]]
    # (recommendations, client messages, worker messages)
    RecsMsgs: TypeAlias = tuple[Recs, Msgs, Msgs]
    
    T_runspec: TypeAlias = GraphNode
    
    logger = logging.getLogger(__name__)
    LOG_PDB = dask.config.get("distributed.admin.pdb-on-err")
    DEFAULT_DATA_SIZE = parse_bytes(
        dask.config.get("distributed.scheduler.default-data-size")
    )
    STIMULUS_ID_UNSET = "<stimulus_id unset>"
    
    DEFAULT_EXTENSIONS = {
        "multi_locks": MultiLockExtension,
        "publish": PublishExtension,
        "replay-tasks": ReplayTaskScheduler,
        "queues": QueueExtension,
        "variables": VariableExtension,
        "pubsub": PubSubSchedulerExtension,
        "semaphores": SemaphoreExtension,
        "events": EventExtension,
        "amm": ActiveMemoryManagerExtension,
        "memory_sampler": MemorySamplerExtension,
        "shuffle": ShuffleSchedulerPlugin,
        "spans": SpansSchedulerExtension,
        "stealing": WorkStealing,
    }
    
    
    class ClientState:
        """A simple object holding information about a client."""
    
        #: A unique identifier for this client. This is generally an opaque
        #: string generated by the client itself.
        client_key: str
    
        #: Cached hash of :attr:`~ClientState.client_key`
        _hash: int
    
        #: A set of tasks this client wants to be kept in memory, so that it can download
        #: its result when desired. This is the reverse mapping of
        #: :class:`TaskState.who_wants`. Tasks are typically removed from this set when the
        #: corresponding object in the client's space (for example a ``Future`` or a Dask
        #: collection) gets garbage-collected.
        wants_what: set[TaskState]
    
        #: The last time we received a heartbeat from this client, in local scheduler time.
        last_seen: float
    
        #: Output of :func:`distributed.versions.get_versions` on the client
        versions: dict[str, Any]
    
        __slots__ = tuple(__annotations__)
    
        def __init__(self, client: str, *, versions: dict[str, Any] | None = None):
            self.client_key = client
            self._hash = hash(client)
            self.wants_what = set()
            self.last_seen = time()
            self.versions = versions or {}
    
        def __hash__(self) -> int:
            return self._hash
    
        def __eq__(self, other: object) -> bool:
            if not isinstance(other, ClientState):
                return False
            return self.client_key == other.client_key
    
        def __repr__(self) -> str:
            return f"<Client {self.client_key!r}>"
    
        def __str__(self) -> str:
            return self.client_key
    
        def _to_dict_no_nest(self, *, exclude: Container[str] = ()) -> dict:
            """Dictionary representation for debugging purposes.
            Not type stable and not intended for roundtrips.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            TaskState._to_dict
            """
            return recursive_to_dict(
                self,
                exclude=set(exclude) | {"versions"},  # type: ignore
                members=True,
            )
    
    
    class MemoryState:
        """Memory readings on a worker or on the whole cluster.
    
        See :doc:`worker-memory`.
    
        Attributes / properties:
    
        managed_total
            Sum of the output of sizeof() for all dask keys held by the worker in memory,
            plus number of bytes spilled to disk
    
        managed
            Sum of the output of sizeof() for the dask keys held in RAM. Note that this may
            be inaccurate, which may cause inaccurate unmanaged memory (see below).
    
        spilled
            Number of bytes  for the dask keys spilled to the hard drive.
            Note that this is the size on disk; size in memory may be different due to
            compression and inaccuracies in sizeof(). In other words, given the same keys,
            'managed' will change depending on the keys being in memory or spilled.
    
        process
            Total RSS memory measured by the OS on the worker process.
            This is always exactly equal to managed + unmanaged.
    
        unmanaged
            process - managed. This is the sum of
    
            - Python interpreter and modules
            - global variables
            - memory temporarily allocated by the dask tasks that are currently running
            - memory fragmentation
            - memory leaks
            - memory not yet garbage collected
            - memory not yet free()'d by the Python memory manager to the OS
    
        unmanaged_old
            Minimum of the 'unmanaged' measures over the last
            ``distributed.memory.recent-to-old-time`` seconds
    
        unmanaged_recent
            unmanaged - unmanaged_old; in other words process memory that has been recently
            allocated but is not accounted for by dask; hopefully it's mostly a temporary
            spike.
    
        optimistic
            managed + unmanaged_old; in other words the memory held long-term by
            the process under the hopeful assumption that all unmanaged_recent memory is a
            temporary spike
        """
    
        process: int
        unmanaged_old: int
        managed: int
        spilled: int
    
        __slots__ = tuple(__annotations__)
    
        def __init__(
            self,
            *,
            process: int,
            unmanaged_old: int,
            managed: int,
            spilled: int,
        ):
            # Some data arrives with the heartbeat, some other arrives in realtime as the
            # tasks progress. Also, sizeof() is not guaranteed to return correct results.
            # This can cause glitches where a partial measure is larger than the whole, so
            # we need to force all numbers to add up exactly by definition.
            self.process = process
            self.managed = min(self.process, managed)
            self.spilled = spilled
            # Subtractions between unsigned ints guaranteed by construction to be >= 0
            self.unmanaged_old = min(unmanaged_old, process - self.managed)
    
        @staticmethod
        def sum(*infos: MemoryState) -> MemoryState:
            process = 0
            unmanaged_old = 0
            managed = 0
            spilled = 0
            for ms in infos:
                process += ms.process
                unmanaged_old += ms.unmanaged_old
                spilled += ms.spilled
                managed += ms.managed
            return MemoryState(
                process=process,
                unmanaged_old=unmanaged_old,
                managed=managed,
                spilled=spilled,
            )
    
        @property
        def managed_total(self) -> int:
            return self.managed + self.spilled
    
        @property
        def unmanaged(self) -> int:
            # This is never negative thanks to __init__
            return self.process - self.managed
    
        @property
        def unmanaged_recent(self) -> int:
            # This is never negative thanks to __init__
            return self.process - self.managed - self.unmanaged_old
    
        @property
        def optimistic(self) -> int:
            return self.managed + self.unmanaged_old
    
        @property
        def managed_in_memory(self) -> int:
            warnings.warn("managed_in_memory has been renamed to managed", FutureWarning)
            return self.managed
    
        @property
        def managed_spilled(self) -> int:
            warnings.warn("managed_spilled has been renamed to spilled", FutureWarning)
            return self.spilled
    
        def __repr__(self) -> str:
            return (
                f"Process memory (RSS)  : {format_bytes(self.process)}\n"
                f"  - managed by Dask   : {format_bytes(self.managed)}\n"
                f"  - unmanaged (old)   : {format_bytes(self.unmanaged_old)}\n"
                f"  - unmanaged (recent): {format_bytes(self.unmanaged_recent)}\n"
                f"Spilled to disk       : {format_bytes(self.spilled)}\n"
            )
    
        def _to_dict(self, *, exclude: Container[str] = ()) -> dict:
            """Dictionary representation for debugging purposes.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            """
            return {
                k: getattr(self, k)
                for k in dir(self)
                if not k.startswith("_")
                and k not in {"sum", "managed_in_memory", "managed_spilled"}
            }
    
    
    class WorkerState:
        """A simple object holding information about a worker.
    
        Not to be confused with :class:`distributed.worker_state_machine.WorkerState`.
        """
    
        #: This worker's unique key. This can be its connected address
        #: (such as ``"tcp://127.0.0.1:8891"``) or an alias (such as ``"alice"``).
        address: str
    
        pid: int
        name: Hashable
    
        #: The number of CPU threads made available on this worker
        nthreads: int
    
        #: Memory available to the worker, in bytes
        memory_limit: int
    
        local_directory: str
        services: dict[str, int]
    
        #: Output of :meth:`distributed.versions.get_versions` on the worker
        versions: dict[str, Any]
    
        #: Address of the associated :class:`~distributed.nanny.Nanny`, if present
        nanny: str | None
    
        #: Read-only worker status, synced one way from the remote Worker object
        status: Status
    
        #: Cached hash of :attr:`~WorkerState.server_id`
        _hash: int
    
        #: The total memory size, in bytes, used by the tasks this worker holds in memory
        #: (i.e. the tasks in this worker's :attr:`~WorkerState.has_what`).
        nbytes: int
    
        #: Worker memory unknown to the worker, in bytes, which has been there for more than
        #: 30 seconds. See :class:`MemoryState`.
        _memory_unmanaged_old: int
    
        #: History of the last 30 seconds' worth of unmanaged memory. Used to differentiate
        #: between "old" and "new" unmanaged memory.
        #: Format: ``[(timestamp, bytes), (timestamp, bytes), ...]``
        _memory_unmanaged_history: deque[tuple[float, int]]
    
        metrics: dict[str, Any]
    
        #: The last time we received a heartbeat from this worker, in local scheduler time.
        last_seen: float
    
        time_delay: float
        bandwidth: float
    
        #: A set of all TaskStates on this worker that are actors. This only includes those
        #: actors whose state actually lives on this worker, not actors to which this worker
        #: has a reference.
        actors: set[TaskState]
    
        #: Underlying data of :meth:`WorkerState.has_what`
        _has_what: dict[TaskState, None]
    
        #: A set of tasks that have been submitted to this worker. Multiple tasks may be
        # submitted to a worker in advance and the worker will run them eventually,
        # depending on its execution resources (but see :doc:`work-stealing`).
        #:
        #: All the tasks here are in the "processing" state.
        #: This attribute is kept in sync with :attr:`TaskState.processing_on`.
        processing: set[TaskState]
    
        #: Running tasks that invoked :func:`distributed.secede`
        long_running: set[TaskState]
    
        #: A dictionary of tasks that are currently being run on this worker.
        #: Each task state is associated with the duration in seconds which the task has
        #: been running.
        executing: dict[TaskState, float]
    
        #: The available resources on this worker, e.g. ``{"GPU": 2}``.
        #: These are abstract quantities that constrain certain tasks from running at the
        #: same time on this worker.
        resources: dict[str, float]
    
        #: The sum of each resource used by all tasks allocated to this worker.
        #: The numbers in this dictionary can only be less or equal than those in this
        #: worker's :attr:`~WorkerState.resources`.
        used_resources: dict[str, float]
    
        #: Arbitrary additional metadata to be added to :meth:`~WorkerState.identity`
        extra: dict[str, Any]
    
        # The unique server ID this WorkerState is referencing
        server_id: str
    
        # Reference to scheduler task_groups
        scheduler_ref: weakref.ref[SchedulerState] | None
        task_prefix_count: defaultdict[str, int]
        _network_occ: int
        _occupancy_cache: float | None
    
        #: Keys that may need to be fetched to this worker, and the number of tasks that need them.
        #: All tasks are currently in `memory` on a worker other than this one.
        #: Much like `processing`, this does not exactly reflect worker state:
        #: keys here may be queued to fetch, in flight, or already in memory
        #: on the worker.
        needs_what: dict[TaskState, int]
    
        __slots__ = tuple(__annotations__)
    
        def __init__(
            self,
            *,
            address: str,
            status: Status,
            pid: int,
            name: object,
            nthreads: int = 0,
            memory_limit: int,
            local_directory: str,
            nanny: str | None,
            server_id: str,
            services: dict[str, int] | None = None,
            versions: dict[str, Any] | None = None,
            extra: dict[str, Any] | None = None,
            scheduler: SchedulerState | None = None,
        ):
            self.server_id = server_id
            self.address = address
            self.pid = pid
            self.name = name
            self.nthreads = nthreads
            self.memory_limit = memory_limit
            self.local_directory = local_directory
            self.services = services or {}
            self.versions = versions or {}
            self.nanny = nanny
            self.status = status
            self._hash = hash(self.server_id)
            self.nbytes = 0
            self._memory_unmanaged_old = 0
            self._memory_unmanaged_history = deque()
            self.metrics = {}
            self.last_seen = time()
            self.time_delay = 0
            self.bandwidth = parse_bytes(dask.config.get("distributed.scheduler.bandwidth"))
            self.actors = set()
            self._has_what = {}
            self.processing = set()
            self.long_running = set()
            self.executing = {}
            self.resources = {}
            self.used_resources = {}
            self.extra = extra or {}
            self.scheduler_ref = weakref.ref(scheduler) if scheduler else None
            self.task_prefix_count = defaultdict(int)
            self.needs_what = {}
            self._network_occ = 0
            self._occupancy_cache = None
    
        def __hash__(self) -> int:
            return self._hash
    
        def __eq__(self, other: object) -> bool:
            return self is other or (
                isinstance(other, WorkerState) and other.server_id == self.server_id
            )
    
        @property
        def has_what(self) -> Set[TaskState]:
            """An insertion-sorted set-like of tasks which currently reside on this worker.
            All the tasks here are in the "memory" state.
            This is the reverse mapping of :attr:`TaskState.who_has`.
    
            This is a read-only public accessor. The data is implemented as a dict without
            values, because rebalance() relies on dicts being insertion-sorted.
            """
            return self._has_what.keys()
    
        @property
        def host(self) -> str:
            return get_address_host(self.address)
    
        @property
        def memory(self) -> MemoryState:
            """Polished memory metrics for the worker.
    
            **Design note on managed memory**
    
            There are two measures available for managed memory:
    
            - ``self.nbytes``
            - ``self.metrics["managed_bytes"]``
    
            At rest, the two numbers must be identical. However, ``self.nbytes`` is
            immediately updated through the batched comms as soon as each task lands in
            memory on the worker; ``self.metrics["managed_bytes"]`` instead is updated by
            the heartbeat, which can lag several seconds behind.
    
            Below we are mixing likely newer managed memory info from ``self.nbytes`` with
            process and spilled memory from the heartbeat. This is deliberate, so that
            managed memory total is updated more frequently.
    
            Managed memory directly and immediately contributes to optimistic memory, which
            is in turn used in Active Memory Manager heuristics (at the moment of writing;
            more uses will likely be added in the future). So it's important to have it
            up to date; much more than it is for process memory.
    
            Having up-to-date managed memory info as soon as the scheduler learns about
            task completion also substantially simplifies unit tests.
    
            The flip side of this design is that it may cause some noise in the
            unmanaged_recent measure. e.g.:
    
            1. Delete 100MB of managed data
            2. The updated managed memory reaches the scheduler faster than the
               updated process memory
            3. There's a blip where the scheduler thinks that there's a sudden 100MB
               increase in unmanaged_recent, since process memory hasn't changed but managed
               memory has decreased by 100MB
            4. When the heartbeat arrives, process memory goes down and so does the
               unmanaged_recent.
    
            This is OK - one of the main reasons for the unmanaged_recent / unmanaged_old
            split is exactly to concentrate all the noise in unmanaged_recent and exclude it
            from optimistic memory, which is used for heuristics.
    
            Something that is less OK, but also less frequent, is that the sudden deletion
            of spilled keys will cause a negative blip in managed memory:
    
            1. Delete 100MB of spilled data
            2. The updated managed memory *total* reaches the scheduler faster than the
               updated spilled portion
            3. This causes the managed memory to temporarily plummet and be replaced by
               unmanaged_recent, while spilled memory remains unaltered
            4. When the heartbeat arrives, managed goes back up, unmanaged_recent
               goes back down, and spilled goes down by 100MB as it should have to
               begin with.
    
            :issue:`6002` will let us solve this.
            """
            return MemoryState(
                process=self.metrics["memory"],
                managed=max(0, self.nbytes - self.metrics["spilled_bytes"]["memory"]),
                spilled=self.metrics["spilled_bytes"]["disk"],
                unmanaged_old=self._memory_unmanaged_old,
            )
    
        def clean(self) -> WorkerState:
            """Return a version of this object that is appropriate for serialization"""
            ws = WorkerState(
                address=self.address,
                status=self.status,
                pid=self.pid,
                name=self.name,
                nthreads=self.nthreads,
                memory_limit=self.memory_limit,
                local_directory=self.local_directory,
                services=self.services,
                nanny=self.nanny,
                extra=self.extra,
                server_id=self.server_id,
            )
            ws._occupancy_cache = self.occupancy
    
            ws.executing = {ts.key: duration for ts, duration in self.executing.items()}  # type: ignore
            return ws
    
        def __repr__(self) -> str:
            name = f", name: {self.name}" if self.name != self.address else ""
            return (
                f"<WorkerState {self.address!r}{name}, "
                f"status: {self.status.name}, "
                f"memory: {len(self.has_what)}, "
                f"processing: {len(self.processing)}>"
            )
    
        def _repr_html_(self) -> str:
            return get_template("worker_state.html.j2").render(
                address=self.address,
                name=self.name,
                status=self.status.name,
                has_what=self.has_what,
                processing=self.processing,
            )
    
        def identity(self) -> dict[str, Any]:
            return {
                "type": "Worker",
                "id": self.name,
                "host": self.host,
                "resources": self.resources,
                "local_directory": self.local_directory,
                "name": self.name,
                "nthreads": self.nthreads,
                "memory_limit": self.memory_limit,
                "last_seen": self.last_seen,
                "services": self.services,
                "metrics": self.metrics,
                "status": self.status.name,
                "nanny": self.nanny,
                **self.extra,
            }
    
        def _to_dict_no_nest(self, *, exclude: Container[str] = ()) -> dict[str, Any]:
            """Dictionary representation for debugging purposes.
            Not type stable and not intended for roundtrips.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            TaskState._to_dict
            """
            return recursive_to_dict(
                self,
                exclude=set(exclude) | {"versions"},  # type: ignore
                members=True,
            )
    
        @property
        def scheduler(self) -> SchedulerState:
            assert self.scheduler_ref
            s = self.scheduler_ref()
            assert s
            return s
    
        def add_to_processing(self, ts: TaskState) -> None:
            """Assign a task to this worker for compute."""
            if self.scheduler.validate:
                assert ts not in self.processing
    
            tp = ts.prefix
            self.task_prefix_count[tp.name] += 1
            self.scheduler._task_prefix_count_global[tp.name] += 1
            self.processing.add(ts)
            for dts in ts.dependencies:
                assert dts.who_has
                if self not in dts.who_has:
                    self._inc_needs_replica(dts)
    
        def add_to_long_running(self, ts: TaskState) -> None:
            if self.scheduler.validate:
                assert ts in self.processing
                assert ts not in self.long_running
    
            self._remove_from_task_prefix_count(ts)
            # Cannot remove from processing since we're using this for things like
            # idleness detection. Idle workers are typically targeted for
            # downscaling but we should not downscale workers with long running
            # tasks
            self.long_running.add(ts)
    
        def remove_from_processing(self, ts: TaskState) -> None:
            """Remove a task from a workers processing"""
            if self.scheduler.validate:
                assert ts in self.processing
    
            if ts in self.long_running:
                self.long_running.discard(ts)
            else:
                self._remove_from_task_prefix_count(ts)
            self.processing.remove(ts)
            for dts in ts.dependencies:
                if dts in self.needs_what:
                    self._dec_needs_replica(dts)
    
        def _remove_from_task_prefix_count(self, ts: TaskState) -> None:
            prefix_name = ts.prefix.name
            count = self.task_prefix_count[prefix_name] - 1
            tp_count = self.task_prefix_count
            tp_count_global = self.scheduler._task_prefix_count_global
            if count:
                tp_count[prefix_name] = count
            else:
                del tp_count[prefix_name]
    
            count = tp_count_global[prefix_name] - 1
            if count:
                tp_count_global[prefix_name] = count
            else:
                del tp_count_global[prefix_name]
    
        def remove_replica(self, ts: TaskState) -> None:
            """The worker no longer has a task in memory"""
            if self.scheduler.validate:
                assert ts.who_has
                assert self in ts.who_has
                assert ts in self.has_what
                assert ts not in self.needs_wh…cheduler, title="Scheduler Profile (administrative)"
            )
            task_stream = TabPanel(child=task_stream, title="Task Stream")
            bandwidth_workers = TabPanel(
                child=bandwidth_workers.root, title="Bandwidth (Workers)"
            )
            bandwidth_types = TabPanel(
                child=bandwidth_types.root, title="Bandwidth (Types)"
            )
            system = TabPanel(child=sysmon.root, title="System")
            logs = TabPanel(child=logs.root, title="Scheduler Logs")
    
            tabs = Tabs(
                tabs=[
                    html,
                    task_stream,
                    system,
                    logs,
                    compute,
                    workers,
                    scheduler,
                    bandwidth_workers,
                    bandwidth_types,
                ],
                sizing_mode="stretch_both",
            )
    
            from bokeh.core.templates import get_env
            from bokeh.plotting import output_file, save
    
            with tmpfile(extension=".html") as fn:
                output_file(filename=fn, title="Dask Performance Report", mode=mode)
                template_directory = os.path.join(
                    os.path.dirname(os.path.abspath(__file__)), "dashboard", "templates"
                )
                template_environment = get_env()
                template_environment.loader.searchpath.append(template_directory)
                template = template_environment.get_template("performance_report.html")
                save(tabs, filename=fn, template=template)
    
                with open(fn) as f:
                    data = f.read()
    
            return data
    
        async def get_worker_logs(self, n=None, workers=None, nanny=False):
            results = await self.broadcast(
                msg={"op": "get_logs", "n": n}, workers=workers, nanny=nanny
            )
            return results
    
        def log_event(self, topic: str | Collection[str], msg: Any) -> None:
            """Log an event under a given topic
    
            Parameters
            ----------
            topic : str, list[str]
                Name of the topic under which to log an event. To log the same
                event under multiple topics, pass a list of topic names.
            msg
                Event message to log. Note this must be msgpack serializable.
    
            See also
            --------
            Client.log_event
            """
            self._broker.publish(topic, msg)
    
        def subscribe_topic(self, topic: str, client: str) -> None:
            self._broker.subscribe(topic, client)
    
        def unsubscribe_topic(self, topic: str, client: str) -> None:
            self._broker.unsubscribe(topic, client)
    
        @overload
        def get_events(self, topic: str) -> tuple[tuple[float, Any], ...]: ...
    
        @overload
        def get_events(self) -> dict[str, tuple[tuple[float, Any], ...]]: ...
    
        def get_events(
            self, topic: str | None = None
        ) -> tuple[tuple[float, Any], ...] | dict[str, tuple[tuple[float, Any], ...]]:
            return self._broker.get_events(topic)
    
        async def get_worker_monitor_info(self, recent=False, starts=None):
            if starts is None:
                starts = {}
            results = await asyncio.gather(
                *(
                    self.rpc(w).get_monitor_info(recent=recent, start=starts.get(w, 0))
                    for w in self.workers
                )
            )
            return dict(zip(self.workers, results))
    
        ###########
        # Cleanup #
        ###########
    
        @log_errors
        async def check_worker_ttl(self) -> None:
            now = time()
            stimulus_id = f"check-worker-ttl-{now}"
            assert self.worker_ttl
            ttl = max(self.worker_ttl, 10 * heartbeat_interval(len(self.workers)))
            to_restart = []
    
            for ws in self.workers.values():
                last_seen = now - ws.last_seen
                if last_seen > ttl:
                    to_restart.append(ws.address)
                    logger.warning(
                        f"Worker failed to heartbeat for {last_seen:.0f}s; "
                        f"{'attempting restart' if ws.nanny else 'removing'}: {ws}"
                    )
    
            if to_restart:
                self.log_event(
                    "scheduler",
                    {
                        "action": "worker-ttl-timed-out",
                        "workers": to_restart.copy(),
                        "ttl": ttl,
                    },
                )
                await self.restart_workers(
                    to_restart,
                    wait_for_workers=False,
                    stimulus_id=stimulus_id,
                )
    
        def check_idle(self) -> float | None:
            if self.status in (Status.closing, Status.closed):
                return None  # pragma: nocover
    
            if self.transition_counter != self._idle_transition_counter:
                self._idle_transition_counter = self.transition_counter
                self.idle_since = None
                return None
    
            if self._active_graph_updates > 0:
                self.idle_since = None
                return None
    
            if (
                self.queued
                or self.unrunnable
                or any(ws.processing for ws in self.workers.values())
            ):
                self.idle_since = None
                return None
    
            if not self.idle_since:
                self.idle_since = time()
                return self.idle_since
    
            if self.jupyter:
                last_activity = (
                    self._jupyter_server_application.web_app.last_activity().timestamp()
                )
                if last_activity > self.idle_since:
                    self.idle_since = last_activity
                    return self.idle_since
    
            if self.idle_timeout:
                if time() > self.idle_since + self.idle_timeout:
                    assert self.idle_since
                    logger.info(
                        "Scheduler closing after being idle for %s",
                        format_time(self.idle_timeout),
                    )
                    self._ongoing_background_tasks.call_soon(
                        self.close, reason="idle-timeout-exceeded"
                    )
            return self.idle_since
    
        def _check_no_workers(self) -> None:
            if (
                self.status in (Status.closing, Status.closed)
                or self.no_workers_timeout is None
            ):
                return
    
            now = monotonic()
            stimulus_id = f"check-no-workers-timeout-{time()}"
    
            recommendations: Recs = {}
    
            self._refresh_no_workers_since(now)
    
            affected = self._check_unrunnable_task_timeouts(
                now, recommendations=recommendations, stimulus_id=stimulus_id
            )
    
            affected.update(
                self._check_queued_task_timeouts(
                    now, recommendations=recommendations, stimulus_id=stimulus_id
                )
            )
            self.transitions(recommendations, stimulus_id=stimulus_id)
            if affected:
                self.log_event(
                    "scheduler",
                    {"action": "no-workers-timeout-exceeded", "keys": affected},
                )
    
        def _check_unrunnable_task_timeouts(
            self, timestamp: float, recommendations: Recs, stimulus_id: str
        ) -> set[Key]:
            assert self.no_workers_timeout
            unsatisfied = []
            no_workers = []
            for ts, unrunnable_since in self.unrunnable.items():
                if timestamp <= unrunnable_since + self.no_workers_timeout:
                    # unrunnable is insertion-ordered, which means that unrunnable_since will
                    # be monotonically increasing in this loop.
                    break
                if (
                    self._no_workers_since is None
                    or self._no_workers_since >= unrunnable_since
                ):
                    unsatisfied.append(ts)
                else:
                    no_workers.append(ts)
            if not unsatisfied and not no_workers:
                return set()
    
            for ts in unsatisfied:
                e = pickle.dumps(
                    NoValidWorkerError(
                        task=ts.key,
                        host_restrictions=(ts.host_restrictions or set()).copy(),
                        worker_restrictions=(ts.worker_restrictions or set()).copy(),
                        resource_restrictions=(ts.resource_restrictions or {}).copy(),
                        timeout=self.no_workers_timeout,
                    ),
                )
                r = self.transition(
                    ts.key,
                    "erred",
                    exception=e,
                    cause=ts.key,
                    stimulus_id=stimulus_id,
                )
                recommendations.update(r)
                logger.error(
                    "Task %s marked as failed because it timed out waiting "
                    "for its restrictions to become satisfied.",
                    ts.key,
                )
            self._fail_tasks_after_no_workers_timeout(
                no_workers, recommendations, stimulus_id
            )
            return {ts.key for ts in concat([unsatisfied, no_workers])}
    
        def _check_queued_task_timeouts(
            self, timestamp: float, recommendations: Recs, stimulus_id: str
        ) -> set[Key]:
            assert self.no_workers_timeout
    
            if self._no_workers_since is None:
                return set()
    
            if timestamp <= self._no_workers_since + self.no_workers_timeout:
                return set()
            affected = list(self.queued)
            self._fail_tasks_after_no_workers_timeout(
                affected, recommendations, stimulus_id
            )
            return {ts.key for ts in affected}
    
        def _fail_tasks_after_no_workers_timeout(
            self, timed_out: Iterable[TaskState], recommendations: Recs, stimulus_id: str
        ) -> None:
            assert self.no_workers_timeout
    
            for ts in timed_out:
                e = pickle.dumps(
                    NoWorkerError(
                        task=ts.key,
                        timeout=self.no_workers_timeout,
                    ),
                )
                r = self.transition(
                    ts.key,
                    "erred",
                    exception=e,
                    cause=ts.key,
                    stimulus_id=stimulus_id,
                )
                recommendations.update(r)
                logger.error(
                    "Task %s marked as failed because it timed out waiting "
                    "without any running workers.",
                    ts.key,
                )
    
        def _refresh_no_workers_since(self, timestamp: float | None = None) -> None:
            if self.running or not (self.queued or self.unrunnable):
                self._no_workers_since = None
                return
    
            if not self._no_workers_since:
                self._no_workers_since = timestamp or monotonic()
                return
    
        def adaptive_target(self, target_duration=None):
            """Desired number of workers based on the current workload
    
            This looks at the current running tasks and memory use, and returns a
            number of desired workers.  This is often used by adaptive scheduling.
    
            Parameters
            ----------
            target_duration : str
                A desired duration of time for computations to take.  This affects
                how rapidly the scheduler will ask to scale.
    
            See Also
            --------
            distributed.deploy.Adaptive
            """
            if target_duration is None:
                target_duration = dask.config.get("distributed.adaptive.target-duration")
            target_duration = parse_timedelta(target_duration)
    
            # CPU
            queued = take(100, concat([self.queued, self.unrunnable.keys()]))
            queued_occupancy = 0
            for ts in queued:
                queued_occupancy += self._get_prefix_duration(ts.prefix)
    
            tasks_ready = len(self.queued) + len(self.unrunnable)
            if tasks_ready > 100:
                queued_occupancy *= tasks_ready / 100
    
            cpu = math.ceil((self.total_occupancy + queued_occupancy) / target_duration)
    
            # Avoid a few long tasks from asking for many cores
            for ws in self.workers.values():
                if tasks_ready > cpu:
                    break
                tasks_ready += len(ws.processing)
            else:
                cpu = min(tasks_ready, cpu)
    
            # Divide by average nthreads per worker
            if self.workers:
                nthreads = sum(ws.nthreads for ws in self.workers.values())
                cpu = math.ceil(cpu / nthreads * len(self.workers))
    
            if (self.unrunnable or self.queued) and not self.workers:
                cpu = max(1, cpu)
    
            # add more workers if more than 60% of memory is used
            limit = sum(ws.memory_limit for ws in self.workers.values())
            used = sum(ws.nbytes for ws in self.workers.values())
            memory = 0
            if used > 0.6 * limit and limit > 0:
                memory = 2 * len(self.workers)
    
            target = max(memory, cpu)
            if target >= len(self.workers):
                return target
            else:  # Scale down?
                to_close = self.workers_to_close()
                return len(self.workers) - len(to_close)
    
        def request_acquire_replicas(
            self, addr: str, keys: Iterable[Key], *, stimulus_id: str
        ) -> None:
            """Asynchronously ask a worker to acquire a replica of the listed keys from
            other workers. This is a fire-and-forget operation which offers no feedback for
            success or failure, and is intended for housekeeping and not for computation.
            """
            who_has = {}
            nbytes = {}
            for key in keys:
                ts = self.tasks[key]
                assert ts.who_has
                who_has[key] = [ws.address for ws in ts.who_has or ()]
                nbytes[key] = ts.nbytes
    
            self.stream_comms[addr].send(
                {
                    "op": "acquire-replicas",
                    "who_has": who_has,
                    "nbytes": nbytes,
                    "stimulus_id": stimulus_id,
                },
            )
    
        def request_remove_replicas(
            self, addr: str, keys: list[Key], *, stimulus_id: str
        ) -> None:
            """Asynchronously ask a worker to discard its replica of the listed keys.
            This must never be used to destroy the last replica of a key. This is a
            fire-and-forget operation, intended for housekeeping and not for computation.
    
            The replica disappears immediately from TaskState.who_has on the Scheduler side;
            if the worker refuses to delete, e.g. because the task is a dependency of
            another task running on it, it will (also asynchronously) inform the scheduler
            to re-add itself to who_has. If the worker agrees to discard the task, there is
            no feedback.
            """
            ws = self.workers[addr]
    
            # The scheduler immediately forgets about the replica and suggests the worker to
            # drop it. The worker may refuse, at which point it will send back an add-keys
            # message to reinstate it.
            for key in keys:
                ts = self.tasks[key]
                if self.validate:
                    # Do not destroy the last copy
                    assert ts.who_has
                    assert len(ts.who_has) > 1
                self.remove_replica(ts, ws)
    
            self.stream_comms[addr].send(
                {
                    "op": "remove-replicas",
                    "keys": keys,
                    "stimulus_id": stimulus_id,
                }
            )
    
    
    def _task_to_report_msg(ts: TaskState) -> dict[str, Any] | None:
        if ts.state == "forgotten":
            return {"op": "cancelled-keys", "keys": [ts.key]}
        elif ts.state == "memory":
            return {"op": "key-in-memory", "key": ts.key}
        elif ts.state == "erred":
            failing_ts = ts.exception_blame
            assert failing_ts
            return {
                "op": "task-erred",
                "key": ts.key,
                "exception": failing_ts.exception,
                "traceback": failing_ts.traceback,
            }
        else:
            return None
    
    
    def _task_to_client_msgs(ts: TaskState) -> Msgs:
        if ts.who_wants:
            report_msg = _task_to_report_msg(ts)
            if report_msg is not None:
                return {cs.client_key: [report_msg] for cs in ts.who_wants}
        return {}
    
    
    def decide_worker(
        ts: TaskState,
        all_workers: set[WorkerState],
        valid_workers: set[WorkerState] | None,
        objective: Callable[[WorkerState], Any],
    ) -> WorkerState | None:
        """
        Decide which worker should take task *ts*.
    
        We choose the worker that has the data on which *ts* depends.
    
        If several workers have dependencies then we choose the less-busy worker.
    
        Optionally provide *valid_workers* of where jobs are allowed to occur
        (if all workers are allowed to take the task, pass None instead).
    
        If the task requires data communication because no eligible worker has
        all the dependencies already, then we choose to minimize the number
        of bytes sent between workers.  This is determined by calling the
        *objective* function.
        """
        assert all(dts.who_has for dts in ts.dependencies)
        if ts.actor:
            candidates = all_workers.copy()
        else:
            candidates = {wws for dts in ts.dependencies for wws in dts.who_has or ()}
            candidates &= all_workers
        if valid_workers is None:
            if not candidates:
                candidates = all_workers.copy()
        else:
            candidates &= valid_workers
            if not candidates:
                candidates = valid_workers
                if not candidates:
                    if ts.loose_restrictions:
                        return decide_worker(ts, all_workers, None, objective)
    
        if not candidates:
            return None
        elif len(candidates) == 1:
            return next(iter(candidates))
        else:
            return min(candidates, key=objective)
    
    
    def validate_task_state(ts: TaskState) -> None:
        """Validate the given TaskState"""
        assert ts.state in ALL_TASK_STATES, ts
    
        if ts.waiting_on:
            assert ts.waiting_on.issubset(ts.dependencies), (
                "waiting not subset of dependencies",
                str(ts.waiting_on),
                str(ts.dependencies),
            )
        if ts.waiters:
            assert ts.waiters.issubset(ts.dependents), (
                "waiters not subset of dependents",
                str(ts.waiters),
                str(ts.dependents),
            )
    
        for dts in ts.waiting_on or ():
            assert not dts.who_has, ("waiting on in-memory dep", str(ts), str(dts))
            assert dts.state != "released", ("waiting on released dep", str(ts), str(dts))
        for dts in ts.dependencies:
            assert ts in dts.dependents, (
                "not in dependency's dependents",
                str(ts),
                str(dts),
                str(dts.dependents),
            )
            if ts.state in ("waiting", "queued", "processing", "no-worker"):
                assert ts.waiting_on and dts in ts.waiting_on or dts.who_has, (
                    "dep missing",
                    str(ts),
                    str(dts),
                )
            assert dts.state != "forgotten"
    
        for dts in ts.waiters or ():
            assert dts.state in ("waiting", "queued", "processing", "no-worker"), (
                "waiter not in play",
                str(ts),
                str(dts),
            )
        for dts in ts.dependents:
            assert ts in dts.dependencies, (
                "not in dependent's dependencies",
                str(ts),
                str(dts),
                str(dts.dependencies),
            )
            assert dts.state != "forgotten"
    
        assert (ts.processing_on is not None) == (ts.state == "processing")
        assert bool(ts.who_has) == (ts.state == "memory"), (ts, ts.who_has, ts.state)
    
        if ts.state == "queued":
            assert not ts.processing_on
            assert not ts.who_has
            assert all(dts.who_has for dts in ts.dependencies), (
                "task queued without all deps",
                str(ts),
                str(ts.dependencies),
            )
    
        if ts.state == "processing":
            assert all(dts.who_has for dts in ts.dependencies), (
                "task processing without all deps",
                str(ts),
                str(ts.dependencies),
            )
            assert not ts.waiting_on
    
        if ts.who_has:
            assert ts.waiters or ts.who_wants, (
                "unneeded task in memory",
                str(ts),
                str(ts.who_has),
            )
            if ts.run_spec:  # was computed
                assert ts.type
                assert isinstance(ts.type, str)
            assert not any(
                [
                    ts in dts.waiting_on
                    for dts in ts.dependents
                    if dts.waiting_on is not None
                ]
            )
            for ws in ts.who_has:
                assert ts in ws.has_what, (
                    "not in who_has' has_what",
                    str(ts),
                    str(ws),
                    str(ws.has_what),
                )
    
        for cs in ts.who_wants or ():
            assert ts in cs.wants_what, (
                "not in who_wants' wants_what",
                str(ts),
                str(cs),
                str(cs.wants_what),
            )
    
        if ts.actor:
            if ts.state == "memory":
                assert ts.who_has
                assert sum(ts in ws.actors for ws in ts.who_has) == 1
            if ts.state == "processing":
                assert ts.processing_on
                assert ts in ts.processing_on.actors
            assert ts.state != "queued"
    
    
    def validate_unrunnable(unrunnable: dict[TaskState, float]) -> None:
        prev_unrunnable_since: float | None = None
        prev_ts: TaskState | None = None
        for ts, unrunnable_since in unrunnable.items():
            assert ts.state == "no-worker"
            if prev_ts is not None:
                assert prev_unrunnable_since is not None
                # Ensure that unrunnable_since is monotonically increasing when iterating over unrunnable.
                # _check_no_workers relies on this.
                assert prev_unrunnable_since <= unrunnable_since, (
                    prev_ts,
                    ts,
                    prev_unrunnable_since,
                    unrunnable_since,
                )
            prev_ts = ts
            prev_unrunnable_since = unrunnable_since
    
    
    def validate_worker_state(ws: WorkerState) -> None:
        for ts in ws.has_what or ():
            assert ts.who_has
            assert ws in ts.who_has, (
                "not in has_what' who_has",
                str(ws),
                str(ts),
                str(ts.who_has),
            )
    
        for ts in ws.actors:
            assert ts.state in ("memory", "processing")
    
    
    def validate_state(
        tasks: dict[Key, TaskState],
        workers: dict[str, WorkerState],
        clients: dict[str, ClientState],
    ) -> None:
        """Validate a current runtime state.
    
        This performs a sequence of checks on the entire graph, running in about linear
        time. This raises assert errors if anything doesn't check out.
        """
        for ts in tasks.values():
            validate_task_state(ts)
    
        for ws in workers.values():
            validate_worker_state(ws)
    
        for cs in clients.values():
            for ts in cs.wants_what or ():
                assert ts.who_wants
                assert cs in ts.who_wants, (
                    "not in wants_what' who_wants",
                    str(cs),
                    str(ts),
                    str(ts.who_wants),
                )
    
    
    def heartbeat_interval(n: int) -> float:
        """Interval in seconds that we desire heartbeats based on number of workers"""
        if n <= 10:
            return 0.5
        elif n < 50:
            return 1
        elif n < 200:
            return 2
        else:
            # No more than 200 heartbeats a second scaled by workers
            return n / 200 + 1
    
    
    def _task_slots_available(ws: WorkerState, saturation_factor: float) -> int:
        """Number of tasks that can be sent to this worker without oversaturating it"""
        assert not math.isinf(saturation_factor)
        return max(math.ceil(saturation_factor * ws.nthreads), 1) - (
            len(ws.processing) - len(ws.long_running)
        )
    
    
    def _worker_full(ws: WorkerState, saturation_factor: float) -> bool:
        if math.isinf(saturation_factor):
            return False
        return _task_slots_available(ws, saturation_factor) <= 0
    
    
    class KilledWorker(Exception):
        def __init__(self, task: Key, last_worker: WorkerState, allowed_failures: int):
            super().__init__(task, last_worker, allowed_failures)
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def last_worker(self) -> WorkerState:
            return self.args[1]
    
        @property
        def allowed_failures(self) -> int:
            return self.args[2]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} on {self.allowed_failures + 1} "
                "different workers, but all those workers died while running it. "
                f"The last worker that attempt to run the task was {self.last_worker.address}. "
                "Inspecting worker logs is often a good next step to diagnose what went wrong. "
                "For more information see https://distributed.dask.org/en/stable/killed.html."
            )
    
    
    class NoValidWorkerError(Exception):
        def __init__(
            self,
            task: Key,
            host_restrictions: set[str],
            worker_restrictions: set[str],
            resource_restrictions: dict[str, float],
            timeout: float,
        ):
            super().__init__(
                task, host_restrictions, worker_restrictions, resource_restrictions, timeout
            )
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def host_restrictions(self) -> Any:
            return self.args[1]
    
        @property
        def worker_restrictions(self) -> Any:
            return self.args[2]
    
        @property
        def resource_restrictions(self) -> Any:
            return self.args[3]
    
        @property
        def timeout(self) -> float:
            return self.args[4]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} but timed out after {format_time(self.timeout)} "
                "waiting for a valid worker matching all restrictions.\n\nRestrictions:\n"
                f"host_restrictions={self.host_restrictions!s}\n"
                f"worker_restrictions={self.worker_restrictions!s}\n"
                f"resource_restrictions={self.resource_restrictions!s}\n"
            )
    
    
    class NoWorkerError(Exception):
        def __init__(self, task: Key, timeout: float):
            super().__init__(task, timeout)
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def timeout(self) -> float:
            return self.args[1]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} but timed out after {format_time(self.timeout)} "
                "waiting without any running workers."
            )
    
    
    class WorkerStatusPlugin(SchedulerPlugin):
        """A plugin to share worker status with a remote observer
    
        This is used in cluster managers to keep updated about the status of the scheduler.
        """
    
        name: ClassVar[str] = "worker-status"
        bcomm: BatchedSend
    
        def __init__(self, scheduler: Scheduler, comm: Comm):
            self.bcomm = BatchedSend(interval="5ms")
            self.bcomm.start(comm)
            scheduler.add_plugin(self)
    
        def add_worker(self, scheduler: Scheduler, worker: str) -> None:
            ident = scheduler.workers[worker].identity()
            del ident["metrics"]
            del ident["last_seen"]
            try:
                self.bcomm.send(["add", {"workers": {worker: ident}}])
            except CommClosedError:
                scheduler.remove_plugin(name=self.name)
    
        def remove_worker(self, scheduler: Scheduler, worker: str, **kwargs: Any) -> None:
            try:
                self.bcomm.send(["remove", worker])
            except CommClosedError:
                scheduler.remove_plugin(name=self.name)
    
        def teardown(self) -> None:
            self.bcomm.close()
    
    
    class CollectTaskMetaDataPlugin(SchedulerPlugin):
        scheduler: Scheduler
        name: str
        keys: set[Key]
        metadata: dict[Key, Any]
        state: dict[Key, TaskStateState]
    
        def __init__(self, scheduler: Scheduler, name: str):
            self.scheduler = scheduler
            self.name = name
            self.keys = set()
            self.metadata = {}
            self.state = {}
    
        def update_graph(
            self,
            scheduler: Scheduler,
            *,
            keys: set[Key],
            **kwargs: Any,
        ) -> None:
            self.keys.update(keys)
    
        def transition(
            self,
            key: Key,
            start: TaskStateState,
            finish: TaskStateState,
            *args: Any,
            **kwargs: Any,
        ) -> None:
            if finish in ("memory", "erred"):
                ts = self.scheduler.tasks.get(key)
                if ts is not None and ts.key in self.keys:
                    self.metadata[key] = ts.metadata
                    self.state[key] = finish
                    self.keys.discard(key)
    
    
    def _materialize_graph(
        expr: Expr,
        global_annotations: dict[str, Any],
        validate: bool,
    ) -> tuple[dict[Key, T_runspec], dict[Key, set[Key]], dict[str, dict[Key, Any]]]:
>       dsk: dict = expr.__dask_graph__()
E       AttributeError: 'dict' object has no attribute '__dask_graph__'

distributed/scheduler.py:9383: AttributeError

Check warning on line 0 in distributed.cli.tests.test_dask_worker

github-actions / Unit Test Results

1 out of 5 runs failed: test_contact_listen_address[tcp://0.0.0.0:---no-nanny] (distributed.cli.tests.test_dask_worker)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 0s]

Raw output


            AttributeError: 'dict' object has no attribute '__dask_graph__'
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:40515', workers: 0, cores: 0, tasks: 0>
nanny = '--no-nanny', listen_address = 'tcp://0.0.0.0:54333'

    @pytest.mark.slow
    @pytest.mark.skipif(not LINUX, reason="Need 127.0.0.2 to mean localhost")
    @pytest.mark.parametrize("nanny", ["--nanny", "--no-nanny"])
    @pytest.mark.parametrize("listen_address", ["tcp://0.0.0.0:", "tcp://127.0.0.2:"])
    @gen_cluster(client=True, nthreads=[])
    async def test_contact_listen_address(c, s, nanny, listen_address):
        port = open_port()
        listen_address += str(port)
        with popen(
            [
                sys.executable,
                "-m",
                "dask",
                "worker",
                s.address,
                nanny,
                "--no-dashboard",
                "--contact-address",
                f"tcp://127.0.0.2:{port}",
                "--listen-address",
                listen_address,
            ]
        ):
            await c.wait_for_workers(1)
            info = c.scheduler_info()
            assert info["workers"].keys() == {f"tcp://127.0.0.2:{port}"}
    
            # roundtrip works
>           assert await c.submit(lambda x: x + 1, 10) == 11

distributed/cli/tests/test_dask_worker.py:500: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
distributed/client.py:408: in _result
    raise exc.with_traceback(tb)
distributed/utils.py:1507: in run_in_executor_with_context
    return await loop.run_in_executor(
../../../miniconda3/envs/dask-distributed/lib/python3.10/concurrent/futures/thread.py:58: in run
    result = self.fn(*self.args, **self.kwargs)
distributed/utils.py:1508: in <lambda>
    executor, lambda: context.run(func, *args, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import dataclasses
    import heapq
    import inspect
    import itertools
    import json
    import logging
    import math
    import operator
    import os
    import pickle
    import random
    import textwrap
    import uuid
    import warnings
    import weakref
    from abc import abstractmethod
    from collections import defaultdict, deque
    from collections.abc import (
        Callable,
        Collection,
        Container,
        Hashable,
        Iterable,
        Iterator,
        Mapping,
        Sequence,
        Set,
    )
    from contextlib import suppress
    from functools import partial
    from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple, cast, overload
    
    import psutil
    import tornado.web
    from sortedcontainers import SortedDict, SortedSet
    from tlz import (
        concat,
        first,
        groupby,
        merge,
        merge_sorted,
        merge_with,
        partition,
        pluck,
        second,
        take,
        valmap,
    )
    from tornado.ioloop import IOLoop
    
    import dask
    import dask.utils
    from dask._task_spec import DependenciesMapping, GraphNode, convert_legacy_graph
    from dask.core import istask, validate_key
    from dask.typing import Key, no_default
    from dask.utils import (
        _deprecated,
        _deprecated_kwarg,
        format_bytes,
        format_time,
        key_split,
        parse_bytes,
        parse_timedelta,
        tmpfile,
    )
    from dask.widgets import get_template
    
    from distributed import cluster_dump, preloading, profile
    from distributed import versions as version_module
    from distributed._asyncio import RLock
    from distributed._stories import scheduler_story
    from distributed.active_memory_manager import ActiveMemoryManagerExtension, RetireWorker
    from distributed.batched import BatchedSend
    from distributed.broker import Broker
    from distributed.client import SourceCode
    from distributed.collections import HeapSet
    from distributed.comm import (
        Comm,
        CommClosedError,
        get_address_host,
        normalize_address,
        resolve_address,
        unparse_host_port,
    )
    from distributed.comm.addressing import addresses_from_user_args
    from distributed.compatibility import PeriodicCallback
    from distributed.core import (
        ErrorMessage,
        OKMessage,
        Status,
        clean_exception,
        error_message,
        rpc,
        send_recv,
    )
    from distributed.diagnostics.memory_sampler import MemorySamplerExtension
    from distributed.diagnostics.plugin import SchedulerPlugin, _get_plugin_name
    from distributed.event import EventExtension
    from distributed.gc import disable_gc_diagnosis, enable_gc_diagnosis
    from distributed.http import get_handlers
    from distributed.metrics import monotonic, time
    from distributed.multi_lock import MultiLockExtension
    from distributed.node import ServerNode
    from distributed.proctitle import setproctitle
    from distributed.protocol import deserialize
    from distributed.protocol.pickle import dumps, loads
    from distributed.protocol.serialize import Serialized, ToPickle, serialize
    from distributed.publish import PublishExtension
    from distributed.pubsub import PubSubSchedulerExtension
    from distributed.queues import QueueExtension
    from distributed.recreate_tasks import ReplayTaskScheduler
    from distributed.security import Security
    from distributed.semaphore import SemaphoreExtension
    from distributed.shuffle import ShuffleSchedulerPlugin
    from distributed.spans import SpanMetadata, SpansSchedulerExtension
    from distributed.stealing import WorkStealing
    from distributed.utils import (
        All,
        Deadline,
        TimeoutError,
        format_dashboard_link,
        get_fileno_limit,
        key_split_group,
        log_errors,
        offload,
        recursive_to_dict,
        wait_for,
    )
    from distributed.utils_comm import (
        gather_from_workers,
        retry_operation,
        scatter_to_workers,
    )
    from distributed.variable import VariableExtension
    
    if TYPE_CHECKING:
        # TODO import from typing (requires Python >=3.10)
        # TODO import from typing (requires Python >=3.11)
        from typing_extensions import Self, TypeAlias
    
        from dask._expr import Expr
    
    # Not to be confused with distributed.worker_state_machine.TaskStateState
    TaskStateState: TypeAlias = Literal[
        "released",
        "waiting",
        "no-worker",
        "queued",
        "processing",
        "memory",
        "erred",
        "forgotten",
    ]
    
    ALL_TASK_STATES: Set[TaskStateState] = set(TaskStateState.__args__)  # type: ignore
    
    # {task key -> finish state}
    # Not to be confused with distributed.worker_state_machine.Recs
    Recs: TypeAlias = dict[Key, TaskStateState]
    # {client or worker address: [{op: <key>, ...}, ...]}
    Msgs: TypeAlias = dict[str, list[dict[str, Any]]]
    # (recommendations, client messages, worker messages)
    RecsMsgs: TypeAlias = tuple[Recs, Msgs, Msgs]
    
    T_runspec: TypeAlias = GraphNode
    
    logger = logging.getLogger(__name__)
    LOG_PDB = dask.config.get("distributed.admin.pdb-on-err")
    DEFAULT_DATA_SIZE = parse_bytes(
        dask.config.get("distributed.scheduler.default-data-size")
    )
    STIMULUS_ID_UNSET = "<stimulus_id unset>"
    
    DEFAULT_EXTENSIONS = {
        "multi_locks": MultiLockExtension,
        "publish": PublishExtension,
        "replay-tasks": ReplayTaskScheduler,
        "queues": QueueExtension,
        "variables": VariableExtension,
        "pubsub": PubSubSchedulerExtension,
        "semaphores": SemaphoreExtension,
        "events": EventExtension,
        "amm": ActiveMemoryManagerExtension,
        "memory_sampler": MemorySamplerExtension,
        "shuffle": ShuffleSchedulerPlugin,
        "spans": SpansSchedulerExtension,
        "stealing": WorkStealing,
    }
    
    
    class ClientState:
        """A simple object holding information about a client."""
    
        #: A unique identifier for this client. This is generally an opaque
        #: string generated by the client itself.
        client_key: str
    
        #: Cached hash of :attr:`~ClientState.client_key`
        _hash: int
    
        #: A set of tasks this client wants to be kept in memory, so that it can download
        #: its result when desired. This is the reverse mapping of
        #: :class:`TaskState.who_wants`. Tasks are typically removed from this set when the
        #: corresponding object in the client's space (for example a ``Future`` or a Dask
        #: collection) gets garbage-collected.
        wants_what: set[TaskState]
    
        #: The last time we received a heartbeat from this client, in local scheduler time.
        last_seen: float
    
        #: Output of :func:`distributed.versions.get_versions` on the client
        versions: dict[str, Any]
    
        __slots__ = tuple(__annotations__)
    
        def __init__(self, client: str, *, versions: dict[str, Any] | None = None):
            self.client_key = client
            self._hash = hash(client)
            self.wants_what = set()
            self.last_seen = time()
            self.versions = versions or {}
    
        def __hash__(self) -> int:
            return self._hash
    
        def __eq__(self, other: object) -> bool:
            if not isinstance(other, ClientState):
                return False
            return self.client_key == other.client_key
    
        def __repr__(self) -> str:
            return f"<Client {self.client_key!r}>"
    
        def __str__(self) -> str:
            return self.client_key
    
        def _to_dict_no_nest(self, *, exclude: Container[str] = ()) -> dict:
            """Dictionary representation for debugging purposes.
            Not type stable and not intended for roundtrips.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            TaskState._to_dict
            """
            return recursive_to_dict(
                self,
                exclude=set(exclude) | {"versions"},  # type: ignore
                members=True,
            )
    
    
    class MemoryState:
        """Memory readings on a worker or on the whole cluster.
    
        See :doc:`worker-memory`.
    
        Attributes / properties:
    
        managed_total
            Sum of the output of sizeof() for all dask keys held by the worker in memory,
            plus number of bytes spilled to disk
    
        managed
            Sum of the output of sizeof() for the dask keys held in RAM. Note that this may
            be inaccurate, which may cause inaccurate unmanaged memory (see below).
    
        spilled
            Number of bytes  for the dask keys spilled to the hard drive.
            Note that this is the size on disk; size in memory may be different due to
            compression and inaccuracies in sizeof(). In other words, given the same keys,
            'managed' will change depending on the keys being in memory or spilled.
    
        process
            Total RSS memory measured by the OS on the worker process.
            This is always exactly equal to managed + unmanaged.
    
        unmanaged
            process - managed. This is the sum of
    
            - Python interpreter and modules
            - global variables
            - memory temporarily allocated by the dask tasks that are currently running
            - memory fragmentation
            - memory leaks
            - memory not yet garbage collected
            - memory not yet free()'d by the Python memory manager to the OS
    
        unmanaged_old
            Minimum of the 'unmanaged' measures over the last
            ``distributed.memory.recent-to-old-time`` seconds
    
        unmanaged_recent
            unmanaged - unmanaged_old; in other words process memory that has been recently
            allocated but is not accounted for by dask; hopefully it's mostly a temporary
            spike.
    
        optimistic
            managed + unmanaged_old; in other words the memory held long-term by
            the process under the hopeful assumption that all unmanaged_recent memory is a
            temporary spike
        """
    
        process: int
        unmanaged_old: int
        managed: int
        spilled: int
    
        __slots__ = tuple(__annotations__)
    
        def __init__(
            self,
            *,
            process: int,
            unmanaged_old: int,
            managed: int,
            spilled: int,
        ):
            # Some data arrives with the heartbeat, some other arrives in realtime as the
            # tasks progress. Also, sizeof() is not guaranteed to return correct results.
            # This can cause glitches where a partial measure is larger than the whole, so
            # we need to force all numbers to add up exactly by definition.
            self.process = process
            self.managed = min(self.process, managed)
            self.spilled = spilled
            # Subtractions between unsigned ints guaranteed by construction to be >= 0
            self.unmanaged_old = min(unmanaged_old, process - self.managed)
    
        @staticmethod
        def sum(*infos: MemoryState) -> MemoryState:
            process = 0
            unmanaged_old = 0
            managed = 0
            spilled = 0
            for ms in infos:
                process += ms.process
                unmanaged_old += ms.unmanaged_old
                spilled += ms.spilled
                managed += ms.managed
            return MemoryState(
                process=process,
                unmanaged_old=unmanaged_old,
                managed=managed,
                spilled=spilled,
            )
    
        @property
        def managed_total(self) -> int:
            return self.managed + self.spilled
    
        @property
        def unmanaged(self) -> int:
            # This is never negative thanks to __init__
            return self.process - self.managed
    
        @property
        def unmanaged_recent(self) -> int:
            # This is never negative thanks to __init__
            return self.process - self.managed - self.unmanaged_old
    
        @property
        def optimistic(self) -> int:
            return self.managed + self.unmanaged_old
    
        @property
        def managed_in_memory(self) -> int:
            warnings.warn("managed_in_memory has been renamed to managed", FutureWarning)
            return self.managed
    
        @property
        def managed_spilled(self) -> int:
            warnings.warn("managed_spilled has been renamed to spilled", FutureWarning)
            return self.spilled
    
        def __repr__(self) -> str:
            return (
                f"Process memory (RSS)  : {format_bytes(self.process)}\n"
                f"  - managed by Dask   : {format_bytes(self.managed)}\n"
                f"  - unmanaged (old)   : {format_bytes(self.unmanaged_old)}\n"
                f"  - unmanaged (recent): {format_bytes(self.unmanaged_recent)}\n"
                f"Spilled to disk       : {format_bytes(self.spilled)}\n"
            )
    
        def _to_dict(self, *, exclude: Container[str] = ()) -> dict:
            """Dictionary representation for debugging purposes.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            """
            return {
                k: getattr(self, k)
                for k in dir(self)
                if not k.startswith("_")
                and k not in {"sum", "managed_in_memory", "managed_spilled"}
            }
    
    
    class WorkerState:
        """A simple object holding information about a worker.
    
        Not to be confused with :class:`distributed.worker_state_machine.WorkerState`.
        """
    
        #: This worker's unique key. This can be its connected address
        #: (such as ``"tcp://127.0.0.1:8891"``) or an alias (such as ``"alice"``).
        address: str
    
        pid: int
        name: Hashable
    
        #: The number of CPU threads made available on this worker
        nthreads: int
    
        #: Memory available to the worker, in bytes
        memory_limit: int
    
        local_directory: str
        services: dict[str, int]
    
        #: Output of :meth:`distributed.versions.get_versions` on the worker
        versions: dict[str, Any]
    
        #: Address of the associated :class:`~distributed.nanny.Nanny`, if present
        nanny: str | None
    
        #: Read-only worker status, synced one way from the remote Worker object
        status: Status
    
        #: Cached hash of :attr:`~WorkerState.server_id`
        _hash: int
    
        #: The total memory size, in bytes, used by the tasks this worker holds in memory
        #: (i.e. the tasks in this worker's :attr:`~WorkerState.has_what`).
        nbytes: int
    
        #: Worker memory unknown to the worker, in bytes, which has been there for more than
        #: 30 seconds. See :class:`MemoryState`.
        _memory_unmanaged_old: int
    
        #: History of the last 30 seconds' worth of unmanaged memory. Used to differentiate
        #: between "old" and "new" unmanaged memory.
        #: Format: ``[(timestamp, bytes), (timestamp, bytes), ...]``
        _memory_unmanaged_history: deque[tuple[float, int]]
    
        metrics: dict[str, Any]
    
        #: The last time we received a heartbeat from this worker, in local scheduler time.
        last_seen: float
    
        time_delay: float
        bandwidth: float
    
        #: A set of all TaskStates on this worker that are actors. This only includes those
        #: actors whose state actually lives on this worker, not actors to which this worker
        #: has a reference.
        actors: set[TaskState]
    
        #: Underlying data of :meth:`WorkerState.has_what`
        _has_what: dict[TaskState, None]
    
        #: A set of tasks that have been submitted to this worker. Multiple tasks may be
        # submitted to a worker in advance and the worker will run them eventually,
        # depending on its execution resources (but see :doc:`work-stealing`).
        #:
        #: All the tasks here are in the "processing" state.
        #: This attribute is kept in sync with :attr:`TaskState.processing_on`.
        processing: set[TaskState]
    
        #: Running tasks that invoked :func:`distributed.secede`
        long_running: set[TaskState]
    
        #: A dictionary of tasks that are currently being run on this worker.
        #: Each task state is associated with the duration in seconds which the task has
        #: been running.
        executing: dict[TaskState, float]
    
        #: The available resources on this worker, e.g. ``{"GPU": 2}``.
        #: These are abstract quantities that constrain certain tasks from running at the
        #: same time on this worker.
        resources: dict[str, float]
    
        #: The sum of each resource used by all tasks allocated to this worker.
        #: The numbers in this dictionary can only be less or equal than those in this
        #: worker's :attr:`~WorkerState.resources`.
        used_resources: dict[str, float]
    
        #: Arbitrary additional metadata to be added to :meth:`~WorkerState.identity`
        extra: dict[str, Any]
    
        # The unique server ID this WorkerState is referencing
        server_id: str
    
        # Reference to scheduler task_groups
        scheduler_ref: weakref.ref[SchedulerState] | None
        task_prefix_count: defaultdict[str, int]
        _network_occ: int
        _occupancy_cache: float | None
    
        #: Keys that may need to be fetched to this worker, and the number of tasks that need them.
        #: All tasks are currently in `memory` on a worker other than this one.
        #: Much like `processing`, this does not exactly reflect worker state:
        #: keys here may be queued to fetch, in flight, or already in memory
        #: on the worker.
        needs_what: dict[TaskState, int]
    
        __slots__ = tuple(__annotations__)
    
        def __init__(
            self,
            *,
            address: str,
            status: Status,
            pid: int,
            name: object,
            nthreads: int = 0,
            memory_limit: int,
            local_directory: str,
            nanny: str | None,
            server_id: str,
            services: dict[str, int] | None = None,
            versions: dict[str, Any] | None = None,
            extra: dict[str, Any] | None = None,
            scheduler: SchedulerState | None = None,
        ):
            self.server_id = server_id
            self.address = address
            self.pid = pid
            self.name = name
            self.nthreads = nthreads
            self.memory_limit = memory_limit
            self.local_directory = local_directory
            self.services = services or {}
            self.versions = versions or {}
            self.nanny = nanny
            self.status = status
            self._hash = hash(self.server_id)
            self.nbytes = 0
            self._memory_unmanaged_old = 0
            self._memory_unmanaged_history = deque()
            self.metrics = {}
            self.last_seen = time()
            self.time_delay = 0
            self.bandwidth = parse_bytes(dask.config.get("distributed.scheduler.bandwidth"))
            self.actors = set()
            self._has_what = {}
            self.processing = set()
            self.long_running = set()
            self.executing = {}
            self.resources = {}
            self.used_resources = {}
            self.extra = extra or {}
            self.scheduler_ref = weakref.ref(scheduler) if scheduler else None
            self.task_prefix_count = defaultdict(int)
            self.needs_what = {}
            self._network_occ = 0
            self._occupancy_cache = None
    
        def __hash__(self) -> int:
            return self._hash
    
        def __eq__(self, other: object) -> bool:
            return self is other or (
                isinstance(other, WorkerState) and other.server_id == self.server_id
            )
    
        @property
        def has_what(self) -> Set[TaskState]:
            """An insertion-sorted set-like of tasks which currently reside on this worker.
            All the tasks here are in the "memory" state.
            This is the reverse mapping of :attr:`TaskState.who_has`.
    
            This is a read-only public accessor. The data is implemented as a dict without
            values, because rebalance() relies on dicts being insertion-sorted.
            """
            return self._has_what.keys()
    
        @property
        def host(self) -> str:
            return get_address_host(self.address)
    
        @property
        def memory(self) -> MemoryState:
            """Polished memory metrics for the worker.
    
            **Design note on managed memory**
    
            There are two measures available for managed memory:
    
            - ``self.nbytes``
            - ``self.metrics["managed_bytes"]``
    
            At rest, the two numbers must be identical. However, ``self.nbytes`` is
            immediately updated through the batched comms as soon as each task lands in
            memory on the worker; ``self.metrics["managed_bytes"]`` instead is updated by
            the heartbeat, which can lag several seconds behind.
    
            Below we are mixing likely newer managed memory info from ``self.nbytes`` with
            process and spilled memory from the heartbeat. This is deliberate, so that
            managed memory total is updated more frequently.
    
            Managed memory directly and immediately contributes to optimistic memory, which
            is in turn used in Active Memory Manager heuristics (at the moment of writing;
            more uses will likely be added in the future). So it's important to have it
            up to date; much more than it is for process memory.
    
            Having up-to-date managed memory info as soon as the scheduler learns about
            task completion also substantially simplifies unit tests.
    
            The flip side of this design is that it may cause some noise in the
            unmanaged_recent measure. e.g.:
    
            1. Delete 100MB of managed data
            2. The updated managed memory reaches the scheduler faster than the
               updated process memory
            3. There's a blip where the scheduler thinks that there's a sudden 100MB
               increase in unmanaged_recent, since process memory hasn't changed but managed
               memory has decreased by 100MB
            4. When the heartbeat arrives, process memory goes down and so does the
               unmanaged_recent.
    
            This is OK - one of the main reasons for the unmanaged_recent / unmanaged_old
            split is exactly to concentrate all the noise in unmanaged_recent and exclude it
            from optimistic memory, which is used for heuristics.
    
            Something that is less OK, but also less frequent, is that the sudden deletion
            of spilled keys will cause a negative blip in managed memory:
    
            1. Delete 100MB of spilled data
            2. The updated managed memory *total* reaches the scheduler faster than the
               updated spilled portion
            3. This causes the managed memory to temporarily plummet and be replaced by
               unmanaged_recent, while spilled memory remains unaltered
            4. When the heartbeat arrives, managed goes back up, unmanaged_recent
               goes back down, and spilled goes down by 100MB as it should have to
               begin with.
    
            :issue:`6002` will let us solve this.
            """
            return MemoryState(
                process=self.metrics["memory"],
                managed=max(0, self.nbytes - self.metrics["spilled_bytes"]["memory"]),
                spilled=self.metrics["spilled_bytes"]["disk"],
                unmanaged_old=self._memory_unmanaged_old,
            )
    
        def clean(self) -> WorkerState:
            """Return a version of this object that is appropriate for serialization"""
            ws = WorkerState(
                address=self.address,
                status=self.status,
                pid=self.pid,
                name=self.name,
                nthreads=self.nthreads,
                memory_limit=self.memory_limit,
                local_directory=self.local_directory,
                services=self.services,
                nanny=self.nanny,
                extra=self.extra,
                server_id=self.server_id,
            )
            ws._occupancy_cache = self.occupancy
    
            ws.executing = {ts.key: duration for ts, duration in self.executing.items()}  # type: ignore
            return ws
    
        def __repr__(self) -> str:
            name = f", name: {self.name}" if self.name != self.address else ""
            return (
                f"<WorkerState {self.address!r}{name}, "
                f"status: {self.status.name}, "
                f"memory: {len(self.has_what)}, "
                f"processing: {len(self.processing)}>"
            )
    
        def _repr_html_(self) -> str:
            return get_template("worker_state.html.j2").render(
                address=self.address,
                name=self.name,
                status=self.status.name,
                has_what=self.has_what,
                processing=self.processing,
            )
    
        def identity(self) -> dict[str, Any]:
            return {
                "type": "Worker",
                "id": self.name,
                "host": self.host,
                "resources": self.resources,
                "local_directory": self.local_directory,
                "name": self.name,
                "nthreads": self.nthreads,
                "memory_limit": self.memory_limit,
                "last_seen": self.last_seen,
                "services": self.services,
                "metrics": self.metrics,
                "status": self.status.name,
                "nanny": self.nanny,
                **self.extra,
            }
    
        def _to_dict_no_nest(self, *, exclude: Container[str] = ()) -> dict[str, Any]:
            """Dictionary representation for debugging purposes.
            Not type stable and not intended for roundtrips.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            TaskState._to_dict
            """
            return recursive_to_dict(
                self,
                exclude=set(exclude) | {"versions"},  # type: ignore
                members=True,
            )
    
        @property
        def scheduler(self) -> SchedulerState:
            assert self.scheduler_ref
            s = self.scheduler_ref()
            assert s
            return s
    
        def add_to_processing(self, ts: TaskState) -> None:
            """Assign a task to this worker for compute."""
            if self.scheduler.validate:
                assert ts not in self.processing
    
            tp = ts.prefix
            self.task_prefix_count[tp.name] += 1
            self.scheduler._task_prefix_count_global[tp.name] += 1
            self.processing.add(ts)
            for dts in ts.dependencies:
                assert dts.who_has
                if self not in dts.who_has:
                    self._inc_needs_replica(dts)
    
        def add_to_long_running(self, ts: TaskState) -> None:
            if self.scheduler.validate:
                assert ts in self.processing
                assert ts not in self.long_running
    
            self._remove_from_task_prefix_count(ts)
            # Cannot remove from processing since we're using this for things like
            # idleness detection. Idle workers are typically targeted for
            # downscaling but we should not downscale workers with long running
            # tasks
            self.long_running.add(ts)
    
        def remove_from_processing(self, ts: TaskState) -> None:
            """Remove a task from a workers processing"""
            if self.scheduler.validate:
                assert ts in self.processing
    
            if ts in self.long_running:
                self.long_running.discard(ts)
            else:
                self._remove_from_task_prefix_count(ts)
            self.processing.remove(ts)
            for dts in ts.dependencies:
                if dts in self.needs_what:
                    self._dec_needs_replica(dts)
    
        def _remove_from_task_prefix_count(self, ts: TaskState) -> None:
            prefix_name = ts.prefix.name
            count = self.task_prefix_count[prefix_name] - 1
            tp_count = self.task_prefix_count
            tp_count_global = self.scheduler._task_prefix_count_global
            if count:
                tp_count[prefix_name] = count
            else:
                del tp_count[prefix_name]
    
            count = tp_count_global[prefix_name] - 1
            if count:
                tp_count_global[prefix_name] = count
            else:
                del tp_count_global[prefix_name]
    
        def remove_replica(self, ts: TaskState) -> None:
            """The worker no longer has a task in memory"""
            if self.scheduler.validate:
                assert ts.who_has
                assert self in ts.who_has
                assert ts in self.has_what
                assert ts not in self.needs…cheduler, title="Scheduler Profile (administrative)"
            )
            task_stream = TabPanel(child=task_stream, title="Task Stream")
            bandwidth_workers = TabPanel(
                child=bandwidth_workers.root, title="Bandwidth (Workers)"
            )
            bandwidth_types = TabPanel(
                child=bandwidth_types.root, title="Bandwidth (Types)"
            )
            system = TabPanel(child=sysmon.root, title="System")
            logs = TabPanel(child=logs.root, title="Scheduler Logs")
    
            tabs = Tabs(
                tabs=[
                    html,
                    task_stream,
                    system,
                    logs,
                    compute,
                    workers,
                    scheduler,
                    bandwidth_workers,
                    bandwidth_types,
                ],
                sizing_mode="stretch_both",
            )
    
            from bokeh.core.templates import get_env
            from bokeh.plotting import output_file, save
    
            with tmpfile(extension=".html") as fn:
                output_file(filename=fn, title="Dask Performance Report", mode=mode)
                template_directory = os.path.join(
                    os.path.dirname(os.path.abspath(__file__)), "dashboard", "templates"
                )
                template_environment = get_env()
                template_environment.loader.searchpath.append(template_directory)
                template = template_environment.get_template("performance_report.html")
                save(tabs, filename=fn, template=template)
    
                with open(fn) as f:
                    data = f.read()
    
            return data
    
        async def get_worker_logs(self, n=None, workers=None, nanny=False):
            results = await self.broadcast(
                msg={"op": "get_logs", "n": n}, workers=workers, nanny=nanny
            )
            return results
    
        def log_event(self, topic: str | Collection[str], msg: Any) -> None:
            """Log an event under a given topic
    
            Parameters
            ----------
            topic : str, list[str]
                Name of the topic under which to log an event. To log the same
                event under multiple topics, pass a list of topic names.
            msg
                Event message to log. Note this must be msgpack serializable.
    
            See also
            --------
            Client.log_event
            """
            self._broker.publish(topic, msg)
    
        def subscribe_topic(self, topic: str, client: str) -> None:
            self._broker.subscribe(topic, client)
    
        def unsubscribe_topic(self, topic: str, client: str) -> None:
            self._broker.unsubscribe(topic, client)
    
        @overload
        def get_events(self, topic: str) -> tuple[tuple[float, Any], ...]: ...
    
        @overload
        def get_events(self) -> dict[str, tuple[tuple[float, Any], ...]]: ...
    
        def get_events(
            self, topic: str | None = None
        ) -> tuple[tuple[float, Any], ...] | dict[str, tuple[tuple[float, Any], ...]]:
            return self._broker.get_events(topic)
    
        async def get_worker_monitor_info(self, recent=False, starts=None):
            if starts is None:
                starts = {}
            results = await asyncio.gather(
                *(
                    self.rpc(w).get_monitor_info(recent=recent, start=starts.get(w, 0))
                    for w in self.workers
                )
            )
            return dict(zip(self.workers, results))
    
        ###########
        # Cleanup #
        ###########
    
        @log_errors
        async def check_worker_ttl(self) -> None:
            now = time()
            stimulus_id = f"check-worker-ttl-{now}"
            assert self.worker_ttl
            ttl = max(self.worker_ttl, 10 * heartbeat_interval(len(self.workers)))
            to_restart = []
    
            for ws in self.workers.values():
                last_seen = now - ws.last_seen
                if last_seen > ttl:
                    to_restart.append(ws.address)
                    logger.warning(
                        f"Worker failed to heartbeat for {last_seen:.0f}s; "
                        f"{'attempting restart' if ws.nanny else 'removing'}: {ws}"
                    )
    
            if to_restart:
                self.log_event(
                    "scheduler",
                    {
                        "action": "worker-ttl-timed-out",
                        "workers": to_restart.copy(),
                        "ttl": ttl,
                    },
                )
                await self.restart_workers(
                    to_restart,
                    wait_for_workers=False,
                    stimulus_id=stimulus_id,
                )
    
        def check_idle(self) -> float | None:
            if self.status in (Status.closing, Status.closed):
                return None  # pragma: nocover
    
            if self.transition_counter != self._idle_transition_counter:
                self._idle_transition_counter = self.transition_counter
                self.idle_since = None
                return None
    
            if self._active_graph_updates > 0:
                self.idle_since = None
                return None
    
            if (
                self.queued
                or self.unrunnable
                or any(ws.processing for ws in self.workers.values())
            ):
                self.idle_since = None
                return None
    
            if not self.idle_since:
                self.idle_since = time()
                return self.idle_since
    
            if self.jupyter:
                last_activity = (
                    self._jupyter_server_application.web_app.last_activity().timestamp()
                )
                if last_activity > self.idle_since:
                    self.idle_since = last_activity
                    return self.idle_since
    
            if self.idle_timeout:
                if time() > self.idle_since + self.idle_timeout:
                    assert self.idle_since
                    logger.info(
                        "Scheduler closing after being idle for %s",
                        format_time(self.idle_timeout),
                    )
                    self._ongoing_background_tasks.call_soon(
                        self.close, reason="idle-timeout-exceeded"
                    )
            return self.idle_since
    
        def _check_no_workers(self) -> None:
            if (
                self.status in (Status.closing, Status.closed)
                or self.no_workers_timeout is None
            ):
                return
    
            now = monotonic()
            stimulus_id = f"check-no-workers-timeout-{time()}"
    
            recommendations: Recs = {}
    
            self._refresh_no_workers_since(now)
    
            affected = self._check_unrunnable_task_timeouts(
                now, recommendations=recommendations, stimulus_id=stimulus_id
            )
    
            affected.update(
                self._check_queued_task_timeouts(
                    now, recommendations=recommendations, stimulus_id=stimulus_id
                )
            )
            self.transitions(recommendations, stimulus_id=stimulus_id)
            if affected:
                self.log_event(
                    "scheduler",
                    {"action": "no-workers-timeout-exceeded", "keys": affected},
                )
    
        def _check_unrunnable_task_timeouts(
            self, timestamp: float, recommendations: Recs, stimulus_id: str
        ) -> set[Key]:
            assert self.no_workers_timeout
            unsatisfied = []
            no_workers = []
            for ts, unrunnable_since in self.unrunnable.items():
                if timestamp <= unrunnable_since + self.no_workers_timeout:
                    # unrunnable is insertion-ordered, which means that unrunnable_since will
                    # be monotonically increasing in this loop.
                    break
                if (
                    self._no_workers_since is None
                    or self._no_workers_since >= unrunnable_since
                ):
                    unsatisfied.append(ts)
                else:
                    no_workers.append(ts)
            if not unsatisfied and not no_workers:
                return set()
    
            for ts in unsatisfied:
                e = pickle.dumps(
                    NoValidWorkerError(
                        task=ts.key,
                        host_restrictions=(ts.host_restrictions or set()).copy(),
                        worker_restrictions=(ts.worker_restrictions or set()).copy(),
                        resource_restrictions=(ts.resource_restrictions or {}).copy(),
                        timeout=self.no_workers_timeout,
                    ),
                )
                r = self.transition(
                    ts.key,
                    "erred",
                    exception=e,
                    cause=ts.key,
                    stimulus_id=stimulus_id,
                )
                recommendations.update(r)
                logger.error(
                    "Task %s marked as failed because it timed out waiting "
                    "for its restrictions to become satisfied.",
                    ts.key,
                )
            self._fail_tasks_after_no_workers_timeout(
                no_workers, recommendations, stimulus_id
            )
            return {ts.key for ts in concat([unsatisfied, no_workers])}
    
        def _check_queued_task_timeouts(
            self, timestamp: float, recommendations: Recs, stimulus_id: str
        ) -> set[Key]:
            assert self.no_workers_timeout
    
            if self._no_workers_since is None:
                return set()
    
            if timestamp <= self._no_workers_since + self.no_workers_timeout:
                return set()
            affected = list(self.queued)
            self._fail_tasks_after_no_workers_timeout(
                affected, recommendations, stimulus_id
            )
            return {ts.key for ts in affected}
    
        def _fail_tasks_after_no_workers_timeout(
            self, timed_out: Iterable[TaskState], recommendations: Recs, stimulus_id: str
        ) -> None:
            assert self.no_workers_timeout
    
            for ts in timed_out:
                e = pickle.dumps(
                    NoWorkerError(
                        task=ts.key,
                        timeout=self.no_workers_timeout,
                    ),
                )
                r = self.transition(
                    ts.key,
                    "erred",
                    exception=e,
                    cause=ts.key,
                    stimulus_id=stimulus_id,
                )
                recommendations.update(r)
                logger.error(
                    "Task %s marked as failed because it timed out waiting "
                    "without any running workers.",
                    ts.key,
                )
    
        def _refresh_no_workers_since(self, timestamp: float | None = None) -> None:
            if self.running or not (self.queued or self.unrunnable):
                self._no_workers_since = None
                return
    
            if not self._no_workers_since:
                self._no_workers_since = timestamp or monotonic()
                return
    
        def adaptive_target(self, target_duration=None):
            """Desired number of workers based on the current workload
    
            This looks at the current running tasks and memory use, and returns a
            number of desired workers.  This is often used by adaptive scheduling.
    
            Parameters
            ----------
            target_duration : str
                A desired duration of time for computations to take.  This affects
                how rapidly the scheduler will ask to scale.
    
            See Also
            --------
            distributed.deploy.Adaptive
            """
            if target_duration is None:
                target_duration = dask.config.get("distributed.adaptive.target-duration")
            target_duration = parse_timedelta(target_duration)
    
            # CPU
            queued = take(100, concat([self.queued, self.unrunnable.keys()]))
            queued_occupancy = 0
            for ts in queued:
                queued_occupancy += self._get_prefix_duration(ts.prefix)
    
            tasks_ready = len(self.queued) + len(self.unrunnable)
            if tasks_ready > 100:
                queued_occupancy *= tasks_ready / 100
    
            cpu = math.ceil((self.total_occupancy + queued_occupancy) / target_duration)
    
            # Avoid a few long tasks from asking for many cores
            for ws in self.workers.values():
                if tasks_ready > cpu:
                    break
                tasks_ready += len(ws.processing)
            else:
                cpu = min(tasks_ready, cpu)
    
            # Divide by average nthreads per worker
            if self.workers:
                nthreads = sum(ws.nthreads for ws in self.workers.values())
                cpu = math.ceil(cpu / nthreads * len(self.workers))
    
            if (self.unrunnable or self.queued) and not self.workers:
                cpu = max(1, cpu)
    
            # add more workers if more than 60% of memory is used
            limit = sum(ws.memory_limit for ws in self.workers.values())
            used = sum(ws.nbytes for ws in self.workers.values())
            memory = 0
            if used > 0.6 * limit and limit > 0:
                memory = 2 * len(self.workers)
    
            target = max(memory, cpu)
            if target >= len(self.workers):
                return target
            else:  # Scale down?
                to_close = self.workers_to_close()
                return len(self.workers) - len(to_close)
    
        def request_acquire_replicas(
            self, addr: str, keys: Iterable[Key], *, stimulus_id: str
        ) -> None:
            """Asynchronously ask a worker to acquire a replica of the listed keys from
            other workers. This is a fire-and-forget operation which offers no feedback for
            success or failure, and is intended for housekeeping and not for computation.
            """
            who_has = {}
            nbytes = {}
            for key in keys:
                ts = self.tasks[key]
                assert ts.who_has
                who_has[key] = [ws.address for ws in ts.who_has or ()]
                nbytes[key] = ts.nbytes
    
            self.stream_comms[addr].send(
                {
                    "op": "acquire-replicas",
                    "who_has": who_has,
                    "nbytes": nbytes,
                    "stimulus_id": stimulus_id,
                },
            )
    
        def request_remove_replicas(
            self, addr: str, keys: list[Key], *, stimulus_id: str
        ) -> None:
            """Asynchronously ask a worker to discard its replica of the listed keys.
            This must never be used to destroy the last replica of a key. This is a
            fire-and-forget operation, intended for housekeeping and not for computation.
    
            The replica disappears immediately from TaskState.who_has on the Scheduler side;
            if the worker refuses to delete, e.g. because the task is a dependency of
            another task running on it, it will (also asynchronously) inform the scheduler
            to re-add itself to who_has. If the worker agrees to discard the task, there is
            no feedback.
            """
            ws = self.workers[addr]
    
            # The scheduler immediately forgets about the replica and suggests the worker to
            # drop it. The worker may refuse, at which point it will send back an add-keys
            # message to reinstate it.
            for key in keys:
                ts = self.tasks[key]
                if self.validate:
                    # Do not destroy the last copy
                    assert ts.who_has
                    assert len(ts.who_has) > 1
                self.remove_replica(ts, ws)
    
            self.stream_comms[addr].send(
                {
                    "op": "remove-replicas",
                    "keys": keys,
                    "stimulus_id": stimulus_id,
                }
            )
    
    
    def _task_to_report_msg(ts: TaskState) -> dict[str, Any] | None:
        if ts.state == "forgotten":
            return {"op": "cancelled-keys", "keys": [ts.key]}
        elif ts.state == "memory":
            return {"op": "key-in-memory", "key": ts.key}
        elif ts.state == "erred":
            failing_ts = ts.exception_blame
            assert failing_ts
            return {
                "op": "task-erred",
                "key": ts.key,
                "exception": failing_ts.exception,
                "traceback": failing_ts.traceback,
            }
        else:
            return None
    
    
    def _task_to_client_msgs(ts: TaskState) -> Msgs:
        if ts.who_wants:
            report_msg = _task_to_report_msg(ts)
            if report_msg is not None:
                return {cs.client_key: [report_msg] for cs in ts.who_wants}
        return {}
    
    
    def decide_worker(
        ts: TaskState,
        all_workers: set[WorkerState],
        valid_workers: set[WorkerState] | None,
        objective: Callable[[WorkerState], Any],
    ) -> WorkerState | None:
        """
        Decide which worker should take task *ts*.
    
        We choose the worker that has the data on which *ts* depends.
    
        If several workers have dependencies then we choose the less-busy worker.
    
        Optionally provide *valid_workers* of where jobs are allowed to occur
        (if all workers are allowed to take the task, pass None instead).
    
        If the task requires data communication because no eligible worker has
        all the dependencies already, then we choose to minimize the number
        of bytes sent between workers.  This is determined by calling the
        *objective* function.
        """
        assert all(dts.who_has for dts in ts.dependencies)
        if ts.actor:
            candidates = all_workers.copy()
        else:
            candidates = {wws for dts in ts.dependencies for wws in dts.who_has or ()}
            candidates &= all_workers
        if valid_workers is None:
            if not candidates:
                candidates = all_workers.copy()
        else:
            candidates &= valid_workers
            if not candidates:
                candidates = valid_workers
                if not candidates:
                    if ts.loose_restrictions:
                        return decide_worker(ts, all_workers, None, objective)
    
        if not candidates:
            return None
        elif len(candidates) == 1:
            return next(iter(candidates))
        else:
            return min(candidates, key=objective)
    
    
    def validate_task_state(ts: TaskState) -> None:
        """Validate the given TaskState"""
        assert ts.state in ALL_TASK_STATES, ts
    
        if ts.waiting_on:
            assert ts.waiting_on.issubset(ts.dependencies), (
                "waiting not subset of dependencies",
                str(ts.waiting_on),
                str(ts.dependencies),
            )
        if ts.waiters:
            assert ts.waiters.issubset(ts.dependents), (
                "waiters not subset of dependents",
                str(ts.waiters),
                str(ts.dependents),
            )
    
        for dts in ts.waiting_on or ():
            assert not dts.who_has, ("waiting on in-memory dep", str(ts), str(dts))
            assert dts.state != "released", ("waiting on released dep", str(ts), str(dts))
        for dts in ts.dependencies:
            assert ts in dts.dependents, (
                "not in dependency's dependents",
                str(ts),
                str(dts),
                str(dts.dependents),
            )
            if ts.state in ("waiting", "queued", "processing", "no-worker"):
                assert ts.waiting_on and dts in ts.waiting_on or dts.who_has, (
                    "dep missing",
                    str(ts),
                    str(dts),
                )
            assert dts.state != "forgotten"
    
        for dts in ts.waiters or ():
            assert dts.state in ("waiting", "queued", "processing", "no-worker"), (
                "waiter not in play",
                str(ts),
                str(dts),
            )
        for dts in ts.dependents:
            assert ts in dts.dependencies, (
                "not in dependent's dependencies",
                str(ts),
                str(dts),
                str(dts.dependencies),
            )
            assert dts.state != "forgotten"
    
        assert (ts.processing_on is not None) == (ts.state == "processing")
        assert bool(ts.who_has) == (ts.state == "memory"), (ts, ts.who_has, ts.state)
    
        if ts.state == "queued":
            assert not ts.processing_on
            assert not ts.who_has
            assert all(dts.who_has for dts in ts.dependencies), (
                "task queued without all deps",
                str(ts),
                str(ts.dependencies),
            )
    
        if ts.state == "processing":
            assert all(dts.who_has for dts in ts.dependencies), (
                "task processing without all deps",
                str(ts),
                str(ts.dependencies),
            )
            assert not ts.waiting_on
    
        if ts.who_has:
            assert ts.waiters or ts.who_wants, (
                "unneeded task in memory",
                str(ts),
                str(ts.who_has),
            )
            if ts.run_spec:  # was computed
                assert ts.type
                assert isinstance(ts.type, str)
            assert not any(
                [
                    ts in dts.waiting_on
                    for dts in ts.dependents
                    if dts.waiting_on is not None
                ]
            )
            for ws in ts.who_has:
                assert ts in ws.has_what, (
                    "not in who_has' has_what",
                    str(ts),
                    str(ws),
                    str(ws.has_what),
                )
    
        for cs in ts.who_wants or ():
            assert ts in cs.wants_what, (
                "not in who_wants' wants_what",
                str(ts),
                str(cs),
                str(cs.wants_what),
            )
    
        if ts.actor:
            if ts.state == "memory":
                assert ts.who_has
                assert sum(ts in ws.actors for ws in ts.who_has) == 1
            if ts.state == "processing":
                assert ts.processing_on
                assert ts in ts.processing_on.actors
            assert ts.state != "queued"
    
    
    def validate_unrunnable(unrunnable: dict[TaskState, float]) -> None:
        prev_unrunnable_since: float | None = None
        prev_ts: TaskState | None = None
        for ts, unrunnable_since in unrunnable.items():
            assert ts.state == "no-worker"
            if prev_ts is not None:
                assert prev_unrunnable_since is not None
                # Ensure that unrunnable_since is monotonically increasing when iterating over unrunnable.
                # _check_no_workers relies on this.
                assert prev_unrunnable_since <= unrunnable_since, (
                    prev_ts,
                    ts,
                    prev_unrunnable_since,
                    unrunnable_since,
                )
            prev_ts = ts
            prev_unrunnable_since = unrunnable_since
    
    
    def validate_worker_state(ws: WorkerState) -> None:
        for ts in ws.has_what or ():
            assert ts.who_has
            assert ws in ts.who_has, (
                "not in has_what' who_has",
                str(ws),
                str(ts),
                str(ts.who_has),
            )
    
        for ts in ws.actors:
            assert ts.state in ("memory", "processing")
    
    
    def validate_state(
        tasks: dict[Key, TaskState],
        workers: dict[str, WorkerState],
        clients: dict[str, ClientState],
    ) -> None:
        """Validate a current runtime state.
    
        This performs a sequence of checks on the entire graph, running in about linear
        time. This raises assert errors if anything doesn't check out.
        """
        for ts in tasks.values():
            validate_task_state(ts)
    
        for ws in workers.values():
            validate_worker_state(ws)
    
        for cs in clients.values():
            for ts in cs.wants_what or ():
                assert ts.who_wants
                assert cs in ts.who_wants, (
                    "not in wants_what' who_wants",
                    str(cs),
                    str(ts),
                    str(ts.who_wants),
                )
    
    
    def heartbeat_interval(n: int) -> float:
        """Interval in seconds that we desire heartbeats based on number of workers"""
        if n <= 10:
            return 0.5
        elif n < 50:
            return 1
        elif n < 200:
            return 2
        else:
            # No more than 200 heartbeats a second scaled by workers
            return n / 200 + 1
    
    
    def _task_slots_available(ws: WorkerState, saturation_factor: float) -> int:
        """Number of tasks that can be sent to this worker without oversaturating it"""
        assert not math.isinf(saturation_factor)
        return max(math.ceil(saturation_factor * ws.nthreads), 1) - (
            len(ws.processing) - len(ws.long_running)
        )
    
    
    def _worker_full(ws: WorkerState, saturation_factor: float) -> bool:
        if math.isinf(saturation_factor):
            return False
        return _task_slots_available(ws, saturation_factor) <= 0
    
    
    class KilledWorker(Exception):
        def __init__(self, task: Key, last_worker: WorkerState, allowed_failures: int):
            super().__init__(task, last_worker, allowed_failures)
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def last_worker(self) -> WorkerState:
            return self.args[1]
    
        @property
        def allowed_failures(self) -> int:
            return self.args[2]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} on {self.allowed_failures + 1} "
                "different workers, but all those workers died while running it. "
                f"The last worker that attempt to run the task was {self.last_worker.address}. "
                "Inspecting worker logs is often a good next step to diagnose what went wrong. "
                "For more information see https://distributed.dask.org/en/stable/killed.html."
            )
    
    
    class NoValidWorkerError(Exception):
        def __init__(
            self,
            task: Key,
            host_restrictions: set[str],
            worker_restrictions: set[str],
            resource_restrictions: dict[str, float],
            timeout: float,
        ):
            super().__init__(
                task, host_restrictions, worker_restrictions, resource_restrictions, timeout
            )
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def host_restrictions(self) -> Any:
            return self.args[1]
    
        @property
        def worker_restrictions(self) -> Any:
            return self.args[2]
    
        @property
        def resource_restrictions(self) -> Any:
            return self.args[3]
    
        @property
        def timeout(self) -> float:
            return self.args[4]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} but timed out after {format_time(self.timeout)} "
                "waiting for a valid worker matching all restrictions.\n\nRestrictions:\n"
                f"host_restrictions={self.host_restrictions!s}\n"
                f"worker_restrictions={self.worker_restrictions!s}\n"
                f"resource_restrictions={self.resource_restrictions!s}\n"
            )
    
    
    class NoWorkerError(Exception):
        def __init__(self, task: Key, timeout: float):
            super().__init__(task, timeout)
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def timeout(self) -> float:
            return self.args[1]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} but timed out after {format_time(self.timeout)} "
                "waiting without any running workers."
            )
    
    
    class WorkerStatusPlugin(SchedulerPlugin):
        """A plugin to share worker status with a remote observer
    
        This is used in cluster managers to keep updated about the status of the scheduler.
        """
    
        name: ClassVar[str] = "worker-status"
        bcomm: BatchedSend
    
        def __init__(self, scheduler: Scheduler, comm: Comm):
            self.bcomm = BatchedSend(interval="5ms")
            self.bcomm.start(comm)
            scheduler.add_plugin(self)
    
        def add_worker(self, scheduler: Scheduler, worker: str) -> None:
            ident = scheduler.workers[worker].identity()
            del ident["metrics"]
            del ident["last_seen"]
            try:
                self.bcomm.send(["add", {"workers": {worker: ident}}])
            except CommClosedError:
                scheduler.remove_plugin(name=self.name)
    
        def remove_worker(self, scheduler: Scheduler, worker: str, **kwargs: Any) -> None:
            try:
                self.bcomm.send(["remove", worker])
            except CommClosedError:
                scheduler.remove_plugin(name=self.name)
    
        def teardown(self) -> None:
            self.bcomm.close()
    
    
    class CollectTaskMetaDataPlugin(SchedulerPlugin):
        scheduler: Scheduler
        name: str
        keys: set[Key]
        metadata: dict[Key, Any]
        state: dict[Key, TaskStateState]
    
        def __init__(self, scheduler: Scheduler, name: str):
            self.scheduler = scheduler
            self.name = name
            self.keys = set()
            self.metadata = {}
            self.state = {}
    
        def update_graph(
            self,
            scheduler: Scheduler,
            *,
            keys: set[Key],
            **kwargs: Any,
        ) -> None:
            self.keys.update(keys)
    
        def transition(
            self,
            key: Key,
            start: TaskStateState,
            finish: TaskStateState,
            *args: Any,
            **kwargs: Any,
        ) -> None:
            if finish in ("memory", "erred"):
                ts = self.scheduler.tasks.get(key)
                if ts is not None and ts.key in self.keys:
                    self.metadata[key] = ts.metadata
                    self.state[key] = finish
                    self.keys.discard(key)
    
    
    def _materialize_graph(
        expr: Expr,
        global_annotations: dict[str, Any],
        validate: bool,
    ) -> tuple[dict[Key, T_runspec], dict[Key, set[Key]], dict[str, dict[Key, Any]]]:
>       dsk: dict = expr.__dask_graph__()
E       AttributeError: 'dict' object has no attribute '__dask_graph__'

distributed/scheduler.py:9383: AttributeError

Check warning on line 0 in distributed.cli.tests.test_dask_worker

github-actions / Unit Test Results

1 out of 5 runs failed: test_contact_listen_address[tcp://127.0.0.2:---nanny] (distributed.cli.tests.test_dask_worker)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 1s]

Raw output


            AttributeError: 'dict' object has no attribute '__dask_graph__'
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:37619', workers: 0, cores: 0, tasks: 0>
nanny = '--nanny', listen_address = 'tcp://127.0.0.2:38987'

    @pytest.mark.slow
    @pytest.mark.skipif(not LINUX, reason="Need 127.0.0.2 to mean localhost")
    @pytest.mark.parametrize("nanny", ["--nanny", "--no-nanny"])
    @pytest.mark.parametrize("listen_address", ["tcp://0.0.0.0:", "tcp://127.0.0.2:"])
    @gen_cluster(client=True, nthreads=[])
    async def test_contact_listen_address(c, s, nanny, listen_address):
        port = open_port()
        listen_address += str(port)
        with popen(
            [
                sys.executable,
                "-m",
                "dask",
                "worker",
                s.address,
                nanny,
                "--no-dashboard",
                "--contact-address",
                f"tcp://127.0.0.2:{port}",
                "--listen-address",
                listen_address,
            ]
        ):
            await c.wait_for_workers(1)
            info = c.scheduler_info()
            assert info["workers"].keys() == {f"tcp://127.0.0.2:{port}"}
    
            # roundtrip works
>           assert await c.submit(lambda x: x + 1, 10) == 11

distributed/cli/tests/test_dask_worker.py:500: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
distributed/client.py:408: in _result
    raise exc.with_traceback(tb)
distributed/utils.py:1507: in run_in_executor_with_context
    return await loop.run_in_executor(
../../../miniconda3/envs/dask-distributed/lib/python3.10/concurrent/futures/thread.py:58: in run
    result = self.fn(*self.args, **self.kwargs)
distributed/utils.py:1508: in <lambda>
    executor, lambda: context.run(func, *args, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import dataclasses
    import heapq
    import inspect
    import itertools
    import json
    import logging
    import math
    import operator
    import os
    import pickle
    import random
    import textwrap
    import uuid
    import warnings
    import weakref
    from abc import abstractmethod
    from collections import defaultdict, deque
    from collections.abc import (
        Callable,
        Collection,
        Container,
        Hashable,
        Iterable,
        Iterator,
        Mapping,
        Sequence,
        Set,
    )
    from contextlib import suppress
    from functools import partial
    from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple, cast, overload
    
    import psutil
    import tornado.web
    from sortedcontainers import SortedDict, SortedSet
    from tlz import (
        concat,
        first,
        groupby,
        merge,
        merge_sorted,
        merge_with,
        partition,
        pluck,
        second,
        take,
        valmap,
    )
    from tornado.ioloop import IOLoop
    
    import dask
    import dask.utils
    from dask._task_spec import DependenciesMapping, GraphNode, convert_legacy_graph
    from dask.core import istask, validate_key
    from dask.typing import Key, no_default
    from dask.utils import (
        _deprecated,
        _deprecated_kwarg,
        format_bytes,
        format_time,
        key_split,
        parse_bytes,
        parse_timedelta,
        tmpfile,
    )
    from dask.widgets import get_template
    
    from distributed import cluster_dump, preloading, profile
    from distributed import versions as version_module
    from distributed._asyncio import RLock
    from distributed._stories import scheduler_story
    from distributed.active_memory_manager import ActiveMemoryManagerExtension, RetireWorker
    from distributed.batched import BatchedSend
    from distributed.broker import Broker
    from distributed.client import SourceCode
    from distributed.collections import HeapSet
    from distributed.comm import (
        Comm,
        CommClosedError,
        get_address_host,
        normalize_address,
        resolve_address,
        unparse_host_port,
    )
    from distributed.comm.addressing import addresses_from_user_args
    from distributed.compatibility import PeriodicCallback
    from distributed.core import (
        ErrorMessage,
        OKMessage,
        Status,
        clean_exception,
        error_message,
        rpc,
        send_recv,
    )
    from distributed.diagnostics.memory_sampler import MemorySamplerExtension
    from distributed.diagnostics.plugin import SchedulerPlugin, _get_plugin_name
    from distributed.event import EventExtension
    from distributed.gc import disable_gc_diagnosis, enable_gc_diagnosis
    from distributed.http import get_handlers
    from distributed.metrics import monotonic, time
    from distributed.multi_lock import MultiLockExtension
    from distributed.node import ServerNode
    from distributed.proctitle import setproctitle
    from distributed.protocol import deserialize
    from distributed.protocol.pickle import dumps, loads
    from distributed.protocol.serialize import Serialized, ToPickle, serialize
    from distributed.publish import PublishExtension
    from distributed.pubsub import PubSubSchedulerExtension
    from distributed.queues import QueueExtension
    from distributed.recreate_tasks import ReplayTaskScheduler
    from distributed.security import Security
    from distributed.semaphore import SemaphoreExtension
    from distributed.shuffle import ShuffleSchedulerPlugin
    from distributed.spans import SpanMetadata, SpansSchedulerExtension
    from distributed.stealing import WorkStealing
    from distributed.utils import (
        All,
        Deadline,
        TimeoutError,
        format_dashboard_link,
        get_fileno_limit,
        key_split_group,
        log_errors,
        offload,
        recursive_to_dict,
        wait_for,
    )
    from distributed.utils_comm import (
        gather_from_workers,
        retry_operation,
        scatter_to_workers,
    )
    from distributed.variable import VariableExtension
    
    if TYPE_CHECKING:
        # TODO import from typing (requires Python >=3.10)
        # TODO import from typing (requires Python >=3.11)
        from typing_extensions import Self, TypeAlias
    
        from dask._expr import Expr
    
    # Not to be confused with distributed.worker_state_machine.TaskStateState
    TaskStateState: TypeAlias = Literal[
        "released",
        "waiting",
        "no-worker",
        "queued",
        "processing",
        "memory",
        "erred",
        "forgotten",
    ]
    
    ALL_TASK_STATES: Set[TaskStateState] = set(TaskStateState.__args__)  # type: ignore
    
    # {task key -> finish state}
    # Not to be confused with distributed.worker_state_machine.Recs
    Recs: TypeAlias = dict[Key, TaskStateState]
    # {client or worker address: [{op: <key>, ...}, ...]}
    Msgs: TypeAlias = dict[str, list[dict[str, Any]]]
    # (recommendations, client messages, worker messages)
    RecsMsgs: TypeAlias = tuple[Recs, Msgs, Msgs]
    
    T_runspec: TypeAlias = GraphNode
    
    logger = logging.getLogger(__name__)
    LOG_PDB = dask.config.get("distributed.admin.pdb-on-err")
    DEFAULT_DATA_SIZE = parse_bytes(
        dask.config.get("distributed.scheduler.default-data-size")
    )
    STIMULUS_ID_UNSET = "<stimulus_id unset>"
    
    DEFAULT_EXTENSIONS = {
        "multi_locks": MultiLockExtension,
        "publish": PublishExtension,
        "replay-tasks": ReplayTaskScheduler,
        "queues": QueueExtension,
        "variables": VariableExtension,
        "pubsub": PubSubSchedulerExtension,
        "semaphores": SemaphoreExtension,
        "events": EventExtension,
        "amm": ActiveMemoryManagerExtension,
        "memory_sampler": MemorySamplerExtension,
        "shuffle": ShuffleSchedulerPlugin,
        "spans": SpansSchedulerExtension,
        "stealing": WorkStealing,
    }
    
    
    class ClientState:
        """A simple object holding information about a client."""
    
        #: A unique identifier for this client. This is generally an opaque
        #: string generated by the client itself.
        client_key: str
    
        #: Cached hash of :attr:`~ClientState.client_key`
        _hash: int
    
        #: A set of tasks this client wants to be kept in memory, so that it can download
        #: its result when desired. This is the reverse mapping of
        #: :class:`TaskState.who_wants`. Tasks are typically removed from this set when the
        #: corresponding object in the client's space (for example a ``Future`` or a Dask
        #: collection) gets garbage-collected.
        wants_what: set[TaskState]
    
        #: The last time we received a heartbeat from this client, in local scheduler time.
        last_seen: float
    
        #: Output of :func:`distributed.versions.get_versions` on the client
        versions: dict[str, Any]
    
        __slots__ = tuple(__annotations__)
    
        def __init__(self, client: str, *, versions: dict[str, Any] | None = None):
            self.client_key = client
            self._hash = hash(client)
            self.wants_what = set()
            self.last_seen = time()
            self.versions = versions or {}
    
        def __hash__(self) -> int:
            return self._hash
    
        def __eq__(self, other: object) -> bool:
            if not isinstance(other, ClientState):
                return False
            return self.client_key == other.client_key
    
        def __repr__(self) -> str:
            return f"<Client {self.client_key!r}>"
    
        def __str__(self) -> str:
            return self.client_key
    
        def _to_dict_no_nest(self, *, exclude: Container[str] = ()) -> dict:
            """Dictionary representation for debugging purposes.
            Not type stable and not intended for roundtrips.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            TaskState._to_dict
            """
            return recursive_to_dict(
                self,
                exclude=set(exclude) | {"versions"},  # type: ignore
                members=True,
            )
    
    
    class MemoryState:
        """Memory readings on a worker or on the whole cluster.
    
        See :doc:`worker-memory`.
    
        Attributes / properties:
    
        managed_total
            Sum of the output of sizeof() for all dask keys held by the worker in memory,
            plus number of bytes spilled to disk
    
        managed
            Sum of the output of sizeof() for the dask keys held in RAM. Note that this may
            be inaccurate, which may cause inaccurate unmanaged memory (see below).
    
        spilled
            Number of bytes  for the dask keys spilled to the hard drive.
            Note that this is the size on disk; size in memory may be different due to
            compression and inaccuracies in sizeof(). In other words, given the same keys,
            'managed' will change depending on the keys being in memory or spilled.
    
        process
            Total RSS memory measured by the OS on the worker process.
            This is always exactly equal to managed + unmanaged.
    
        unmanaged
            process - managed. This is the sum of
    
            - Python interpreter and modules
            - global variables
            - memory temporarily allocated by the dask tasks that are currently running
            - memory fragmentation
            - memory leaks
            - memory not yet garbage collected
            - memory not yet free()'d by the Python memory manager to the OS
    
        unmanaged_old
            Minimum of the 'unmanaged' measures over the last
            ``distributed.memory.recent-to-old-time`` seconds
    
        unmanaged_recent
            unmanaged - unmanaged_old; in other words process memory that has been recently
            allocated but is not accounted for by dask; hopefully it's mostly a temporary
            spike.
    
        optimistic
            managed + unmanaged_old; in other words the memory held long-term by
            the process under the hopeful assumption that all unmanaged_recent memory is a
            temporary spike
        """
    
        process: int
        unmanaged_old: int
        managed: int
        spilled: int
    
        __slots__ = tuple(__annotations__)
    
        def __init__(
            self,
            *,
            process: int,
            unmanaged_old: int,
            managed: int,
            spilled: int,
        ):
            # Some data arrives with the heartbeat, some other arrives in realtime as the
            # tasks progress. Also, sizeof() is not guaranteed to return correct results.
            # This can cause glitches where a partial measure is larger than the whole, so
            # we need to force all numbers to add up exactly by definition.
            self.process = process
            self.managed = min(self.process, managed)
            self.spilled = spilled
            # Subtractions between unsigned ints guaranteed by construction to be >= 0
            self.unmanaged_old = min(unmanaged_old, process - self.managed)
    
        @staticmethod
        def sum(*infos: MemoryState) -> MemoryState:
            process = 0
            unmanaged_old = 0
            managed = 0
            spilled = 0
            for ms in infos:
                process += ms.process
                unmanaged_old += ms.unmanaged_old
                spilled += ms.spilled
                managed += ms.managed
            return MemoryState(
                process=process,
                unmanaged_old=unmanaged_old,
                managed=managed,
                spilled=spilled,
            )
    
        @property
        def managed_total(self) -> int:
            return self.managed + self.spilled
    
        @property
        def unmanaged(self) -> int:
            # This is never negative thanks to __init__
            return self.process - self.managed
    
        @property
        def unmanaged_recent(self) -> int:
            # This is never negative thanks to __init__
            return self.process - self.managed - self.unmanaged_old
    
        @property
        def optimistic(self) -> int:
            return self.managed + self.unmanaged_old
    
        @property
        def managed_in_memory(self) -> int:
            warnings.warn("managed_in_memory has been renamed to managed", FutureWarning)
            return self.managed
    
        @property
        def managed_spilled(self) -> int:
            warnings.warn("managed_spilled has been renamed to spilled", FutureWarning)
            return self.spilled
    
        def __repr__(self) -> str:
            return (
                f"Process memory (RSS)  : {format_bytes(self.process)}\n"
                f"  - managed by Dask   : {format_bytes(self.managed)}\n"
                f"  - unmanaged (old)   : {format_bytes(self.unmanaged_old)}\n"
                f"  - unmanaged (recent): {format_bytes(self.unmanaged_recent)}\n"
                f"Spilled to disk       : {format_bytes(self.spilled)}\n"
            )
    
        def _to_dict(self, *, exclude: Container[str] = ()) -> dict:
            """Dictionary representation for debugging purposes.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            """
            return {
                k: getattr(self, k)
                for k in dir(self)
                if not k.startswith("_")
                and k not in {"sum", "managed_in_memory", "managed_spilled"}
            }
    
    
    class WorkerState:
        """A simple object holding information about a worker.
    
        Not to be confused with :class:`distributed.worker_state_machine.WorkerState`.
        """
    
        #: This worker's unique key. This can be its connected address
        #: (such as ``"tcp://127.0.0.1:8891"``) or an alias (such as ``"alice"``).
        address: str
    
        pid: int
        name: Hashable
    
        #: The number of CPU threads made available on this worker
        nthreads: int
    
        #: Memory available to the worker, in bytes
        memory_limit: int
    
        local_directory: str
        services: dict[str, int]
    
        #: Output of :meth:`distributed.versions.get_versions` on the worker
        versions: dict[str, Any]
    
        #: Address of the associated :class:`~distributed.nanny.Nanny`, if present
        nanny: str | None
    
        #: Read-only worker status, synced one way from the remote Worker object
        status: Status
    
        #: Cached hash of :attr:`~WorkerState.server_id`
        _hash: int
    
        #: The total memory size, in bytes, used by the tasks this worker holds in memory
        #: (i.e. the tasks in this worker's :attr:`~WorkerState.has_what`).
        nbytes: int
    
        #: Worker memory unknown to the worker, in bytes, which has been there for more than
        #: 30 seconds. See :class:`MemoryState`.
        _memory_unmanaged_old: int
    
        #: History of the last 30 seconds' worth of unmanaged memory. Used to differentiate
        #: between "old" and "new" unmanaged memory.
        #: Format: ``[(timestamp, bytes), (timestamp, bytes), ...]``
        _memory_unmanaged_history: deque[tuple[float, int]]
    
        metrics: dict[str, Any]
    
        #: The last time we received a heartbeat from this worker, in local scheduler time.
        last_seen: float
    
        time_delay: float
        bandwidth: float
    
        #: A set of all TaskStates on this worker that are actors. This only includes those
        #: actors whose state actually lives on this worker, not actors to which this worker
        #: has a reference.
        actors: set[TaskState]
    
        #: Underlying data of :meth:`WorkerState.has_what`
        _has_what: dict[TaskState, None]
    
        #: A set of tasks that have been submitted to this worker. Multiple tasks may be
        # submitted to a worker in advance and the worker will run them eventually,
        # depending on its execution resources (but see :doc:`work-stealing`).
        #:
        #: All the tasks here are in the "processing" state.
        #: This attribute is kept in sync with :attr:`TaskState.processing_on`.
        processing: set[TaskState]
    
        #: Running tasks that invoked :func:`distributed.secede`
        long_running: set[TaskState]
    
        #: A dictionary of tasks that are currently being run on this worker.
        #: Each task state is associated with the duration in seconds which the task has
        #: been running.
        executing: dict[TaskState, float]
    
        #: The available resources on this worker, e.g. ``{"GPU": 2}``.
        #: These are abstract quantities that constrain certain tasks from running at the
        #: same time on this worker.
        resources: dict[str, float]
    
        #: The sum of each resource used by all tasks allocated to this worker.
        #: The numbers in this dictionary can only be less or equal than those in this
        #: worker's :attr:`~WorkerState.resources`.
        used_resources: dict[str, float]
    
        #: Arbitrary additional metadata to be added to :meth:`~WorkerState.identity`
        extra: dict[str, Any]
    
        # The unique server ID this WorkerState is referencing
        server_id: str
    
        # Reference to scheduler task_groups
        scheduler_ref: weakref.ref[SchedulerState] | None
        task_prefix_count: defaultdict[str, int]
        _network_occ: int
        _occupancy_cache: float | None
    
        #: Keys that may need to be fetched to this worker, and the number of tasks that need them.
        #: All tasks are currently in `memory` on a worker other than this one.
        #: Much like `processing`, this does not exactly reflect worker state:
        #: keys here may be queued to fetch, in flight, or already in memory
        #: on the worker.
        needs_what: dict[TaskState, int]
    
        __slots__ = tuple(__annotations__)
    
        def __init__(
            self,
            *,
            address: str,
            status: Status,
            pid: int,
            name: object,
            nthreads: int = 0,
            memory_limit: int,
            local_directory: str,
            nanny: str | None,
            server_id: str,
            services: dict[str, int] | None = None,
            versions: dict[str, Any] | None = None,
            extra: dict[str, Any] | None = None,
            scheduler: SchedulerState | None = None,
        ):
            self.server_id = server_id
            self.address = address
            self.pid = pid
            self.name = name
            self.nthreads = nthreads
            self.memory_limit = memory_limit
            self.local_directory = local_directory
            self.services = services or {}
            self.versions = versions or {}
            self.nanny = nanny
            self.status = status
            self._hash = hash(self.server_id)
            self.nbytes = 0
            self._memory_unmanaged_old = 0
            self._memory_unmanaged_history = deque()
            self.metrics = {}
            self.last_seen = time()
            self.time_delay = 0
            self.bandwidth = parse_bytes(dask.config.get("distributed.scheduler.bandwidth"))
            self.actors = set()
            self._has_what = {}
            self.processing = set()
            self.long_running = set()
            self.executing = {}
            self.resources = {}
            self.used_resources = {}
            self.extra = extra or {}
            self.scheduler_ref = weakref.ref(scheduler) if scheduler else None
            self.task_prefix_count = defaultdict(int)
            self.needs_what = {}
            self._network_occ = 0
            self._occupancy_cache = None
    
        def __hash__(self) -> int:
            return self._hash
    
        def __eq__(self, other: object) -> bool:
            return self is other or (
                isinstance(other, WorkerState) and other.server_id == self.server_id
            )
    
        @property
        def has_what(self) -> Set[TaskState]:
            """An insertion-sorted set-like of tasks which currently reside on this worker.
            All the tasks here are in the "memory" state.
            This is the reverse mapping of :attr:`TaskState.who_has`.
    
            This is a read-only public accessor. The data is implemented as a dict without
            values, because rebalance() relies on dicts being insertion-sorted.
            """
            return self._has_what.keys()
    
        @property
        def host(self) -> str:
            return get_address_host(self.address)
    
        @property
        def memory(self) -> MemoryState:
            """Polished memory metrics for the worker.
    
            **Design note on managed memory**
    
            There are two measures available for managed memory:
    
            - ``self.nbytes``
            - ``self.metrics["managed_bytes"]``
    
            At rest, the two numbers must be identical. However, ``self.nbytes`` is
            immediately updated through the batched comms as soon as each task lands in
            memory on the worker; ``self.metrics["managed_bytes"]`` instead is updated by
            the heartbeat, which can lag several seconds behind.
    
            Below we are mixing likely newer managed memory info from ``self.nbytes`` with
            process and spilled memory from the heartbeat. This is deliberate, so that
            managed memory total is updated more frequently.
    
            Managed memory directly and immediately contributes to optimistic memory, which
            is in turn used in Active Memory Manager heuristics (at the moment of writing;
            more uses will likely be added in the future). So it's important to have it
            up to date; much more than it is for process memory.
    
            Having up-to-date managed memory info as soon as the scheduler learns about
            task completion also substantially simplifies unit tests.
    
            The flip side of this design is that it may cause some noise in the
            unmanaged_recent measure. e.g.:
    
            1. Delete 100MB of managed data
            2. The updated managed memory reaches the scheduler faster than the
               updated process memory
            3. There's a blip where the scheduler thinks that there's a sudden 100MB
               increase in unmanaged_recent, since process memory hasn't changed but managed
               memory has decreased by 100MB
            4. When the heartbeat arrives, process memory goes down and so does the
               unmanaged_recent.
    
            This is OK - one of the main reasons for the unmanaged_recent / unmanaged_old
            split is exactly to concentrate all the noise in unmanaged_recent and exclude it
            from optimistic memory, which is used for heuristics.
    
            Something that is less OK, but also less frequent, is that the sudden deletion
            of spilled keys will cause a negative blip in managed memory:
    
            1. Delete 100MB of spilled data
            2. The updated managed memory *total* reaches the scheduler faster than the
               updated spilled portion
            3. This causes the managed memory to temporarily plummet and be replaced by
               unmanaged_recent, while spilled memory remains unaltered
            4. When the heartbeat arrives, managed goes back up, unmanaged_recent
               goes back down, and spilled goes down by 100MB as it should have to
               begin with.
    
            :issue:`6002` will let us solve this.
            """
            return MemoryState(
                process=self.metrics["memory"],
                managed=max(0, self.nbytes - self.metrics["spilled_bytes"]["memory"]),
                spilled=self.metrics["spilled_bytes"]["disk"],
                unmanaged_old=self._memory_unmanaged_old,
            )
    
        def clean(self) -> WorkerState:
            """Return a version of this object that is appropriate for serialization"""
            ws = WorkerState(
                address=self.address,
                status=self.status,
                pid=self.pid,
                name=self.name,
                nthreads=self.nthreads,
                memory_limit=self.memory_limit,
                local_directory=self.local_directory,
                services=self.services,
                nanny=self.nanny,
                extra=self.extra,
                server_id=self.server_id,
            )
            ws._occupancy_cache = self.occupancy
    
            ws.executing = {ts.key: duration for ts, duration in self.executing.items()}  # type: ignore
            return ws
    
        def __repr__(self) -> str:
            name = f", name: {self.name}" if self.name != self.address else ""
            return (
                f"<WorkerState {self.address!r}{name}, "
                f"status: {self.status.name}, "
                f"memory: {len(self.has_what)}, "
                f"processing: {len(self.processing)}>"
            )
    
        def _repr_html_(self) -> str:
            return get_template("worker_state.html.j2").render(
                address=self.address,
                name=self.name,
                status=self.status.name,
                has_what=self.has_what,
                processing=self.processing,
            )
    
        def identity(self) -> dict[str, Any]:
            return {
                "type": "Worker",
                "id": self.name,
                "host": self.host,
                "resources": self.resources,
                "local_directory": self.local_directory,
                "name": self.name,
                "nthreads": self.nthreads,
                "memory_limit": self.memory_limit,
                "last_seen": self.last_seen,
                "services": self.services,
                "metrics": self.metrics,
                "status": self.status.name,
                "nanny": self.nanny,
                **self.extra,
            }
    
        def _to_dict_no_nest(self, *, exclude: Container[str] = ()) -> dict[str, Any]:
            """Dictionary representation for debugging purposes.
            Not type stable and not intended for roundtrips.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            TaskState._to_dict
            """
            return recursive_to_dict(
                self,
                exclude=set(exclude) | {"versions"},  # type: ignore
                members=True,
            )
    
        @property
        def scheduler(self) -> SchedulerState:
            assert self.scheduler_ref
            s = self.scheduler_ref()
            assert s
            return s
    
        def add_to_processing(self, ts: TaskState) -> None:
            """Assign a task to this worker for compute."""
            if self.scheduler.validate:
                assert ts not in self.processing
    
            tp = ts.prefix
            self.task_prefix_count[tp.name] += 1
            self.scheduler._task_prefix_count_global[tp.name] += 1
            self.processing.add(ts)
            for dts in ts.dependencies:
                assert dts.who_has
                if self not in dts.who_has:
                    self._inc_needs_replica(dts)
    
        def add_to_long_running(self, ts: TaskState) -> None:
            if self.scheduler.validate:
                assert ts in self.processing
                assert ts not in self.long_running
    
            self._remove_from_task_prefix_count(ts)
            # Cannot remove from processing since we're using this for things like
            # idleness detection. Idle workers are typically targeted for
            # downscaling but we should not downscale workers with long running
            # tasks
            self.long_running.add(ts)
    
        def remove_from_processing(self, ts: TaskState) -> None:
            """Remove a task from a workers processing"""
            if self.scheduler.validate:
                assert ts in self.processing
    
            if ts in self.long_running:
                self.long_running.discard(ts)
            else:
                self._remove_from_task_prefix_count(ts)
            self.processing.remove(ts)
            for dts in ts.dependencies:
                if dts in self.needs_what:
                    self._dec_needs_replica(dts)
    
        def _remove_from_task_prefix_count(self, ts: TaskState) -> None:
            prefix_name = ts.prefix.name
            count = self.task_prefix_count[prefix_name] - 1
            tp_count = self.task_prefix_count
            tp_count_global = self.scheduler._task_prefix_count_global
            if count:
                tp_count[prefix_name] = count
            else:
                del tp_count[prefix_name]
    
            count = tp_count_global[prefix_name] - 1
            if count:
                tp_count_global[prefix_name] = count
            else:
                del tp_count_global[prefix_name]
    
        def remove_replica(self, ts: TaskState) -> None:
            """The worker no longer has a task in memory"""
            if self.scheduler.validate:
                assert ts.who_has
                assert self in ts.who_has
                assert ts in self.has_what
                assert ts not in self.needs_…cheduler, title="Scheduler Profile (administrative)"
            )
            task_stream = TabPanel(child=task_stream, title="Task Stream")
            bandwidth_workers = TabPanel(
                child=bandwidth_workers.root, title="Bandwidth (Workers)"
            )
            bandwidth_types = TabPanel(
                child=bandwidth_types.root, title="Bandwidth (Types)"
            )
            system = TabPanel(child=sysmon.root, title="System")
            logs = TabPanel(child=logs.root, title="Scheduler Logs")
    
            tabs = Tabs(
                tabs=[
                    html,
                    task_stream,
                    system,
                    logs,
                    compute,
                    workers,
                    scheduler,
                    bandwidth_workers,
                    bandwidth_types,
                ],
                sizing_mode="stretch_both",
            )
    
            from bokeh.core.templates import get_env
            from bokeh.plotting import output_file, save
    
            with tmpfile(extension=".html") as fn:
                output_file(filename=fn, title="Dask Performance Report", mode=mode)
                template_directory = os.path.join(
                    os.path.dirname(os.path.abspath(__file__)), "dashboard", "templates"
                )
                template_environment = get_env()
                template_environment.loader.searchpath.append(template_directory)
                template = template_environment.get_template("performance_report.html")
                save(tabs, filename=fn, template=template)
    
                with open(fn) as f:
                    data = f.read()
    
            return data
    
        async def get_worker_logs(self, n=None, workers=None, nanny=False):
            results = await self.broadcast(
                msg={"op": "get_logs", "n": n}, workers=workers, nanny=nanny
            )
            return results
    
        def log_event(self, topic: str | Collection[str], msg: Any) -> None:
            """Log an event under a given topic
    
            Parameters
            ----------
            topic : str, list[str]
                Name of the topic under which to log an event. To log the same
                event under multiple topics, pass a list of topic names.
            msg
                Event message to log. Note this must be msgpack serializable.
    
            See also
            --------
            Client.log_event
            """
            self._broker.publish(topic, msg)
    
        def subscribe_topic(self, topic: str, client: str) -> None:
            self._broker.subscribe(topic, client)
    
        def unsubscribe_topic(self, topic: str, client: str) -> None:
            self._broker.unsubscribe(topic, client)
    
        @overload
        def get_events(self, topic: str) -> tuple[tuple[float, Any], ...]: ...
    
        @overload
        def get_events(self) -> dict[str, tuple[tuple[float, Any], ...]]: ...
    
        def get_events(
            self, topic: str | None = None
        ) -> tuple[tuple[float, Any], ...] | dict[str, tuple[tuple[float, Any], ...]]:
            return self._broker.get_events(topic)
    
        async def get_worker_monitor_info(self, recent=False, starts=None):
            if starts is None:
                starts = {}
            results = await asyncio.gather(
                *(
                    self.rpc(w).get_monitor_info(recent=recent, start=starts.get(w, 0))
                    for w in self.workers
                )
            )
            return dict(zip(self.workers, results))
    
        ###########
        # Cleanup #
        ###########
    
        @log_errors
        async def check_worker_ttl(self) -> None:
            now = time()
            stimulus_id = f"check-worker-ttl-{now}"
            assert self.worker_ttl
            ttl = max(self.worker_ttl, 10 * heartbeat_interval(len(self.workers)))
            to_restart = []
    
            for ws in self.workers.values():
                last_seen = now - ws.last_seen
                if last_seen > ttl:
                    to_restart.append(ws.address)
                    logger.warning(
                        f"Worker failed to heartbeat for {last_seen:.0f}s; "
                        f"{'attempting restart' if ws.nanny else 'removing'}: {ws}"
                    )
    
            if to_restart:
                self.log_event(
                    "scheduler",
                    {
                        "action": "worker-ttl-timed-out",
                        "workers": to_restart.copy(),
                        "ttl": ttl,
                    },
                )
                await self.restart_workers(
                    to_restart,
                    wait_for_workers=False,
                    stimulus_id=stimulus_id,
                )
    
        def check_idle(self) -> float | None:
            if self.status in (Status.closing, Status.closed):
                return None  # pragma: nocover
    
            if self.transition_counter != self._idle_transition_counter:
                self._idle_transition_counter = self.transition_counter
                self.idle_since = None
                return None
    
            if self._active_graph_updates > 0:
                self.idle_since = None
                return None
    
            if (
                self.queued
                or self.unrunnable
                or any(ws.processing for ws in self.workers.values())
            ):
                self.idle_since = None
                return None
    
            if not self.idle_since:
                self.idle_since = time()
                return self.idle_since
    
            if self.jupyter:
                last_activity = (
                    self._jupyter_server_application.web_app.last_activity().timestamp()
                )
                if last_activity > self.idle_since:
                    self.idle_since = last_activity
                    return self.idle_since
    
            if self.idle_timeout:
                if time() > self.idle_since + self.idle_timeout:
                    assert self.idle_since
                    logger.info(
                        "Scheduler closing after being idle for %s",
                        format_time(self.idle_timeout),
                    )
                    self._ongoing_background_tasks.call_soon(
                        self.close, reason="idle-timeout-exceeded"
                    )
            return self.idle_since
    
        def _check_no_workers(self) -> None:
            if (
                self.status in (Status.closing, Status.closed)
                or self.no_workers_timeout is None
            ):
                return
    
            now = monotonic()
            stimulus_id = f"check-no-workers-timeout-{time()}"
    
            recommendations: Recs = {}
    
            self._refresh_no_workers_since(now)
    
            affected = self._check_unrunnable_task_timeouts(
                now, recommendations=recommendations, stimulus_id=stimulus_id
            )
    
            affected.update(
                self._check_queued_task_timeouts(
                    now, recommendations=recommendations, stimulus_id=stimulus_id
                )
            )
            self.transitions(recommendations, stimulus_id=stimulus_id)
            if affected:
                self.log_event(
                    "scheduler",
                    {"action": "no-workers-timeout-exceeded", "keys": affected},
                )
    
        def _check_unrunnable_task_timeouts(
            self, timestamp: float, recommendations: Recs, stimulus_id: str
        ) -> set[Key]:
            assert self.no_workers_timeout
            unsatisfied = []
            no_workers = []
            for ts, unrunnable_since in self.unrunnable.items():
                if timestamp <= unrunnable_since + self.no_workers_timeout:
                    # unrunnable is insertion-ordered, which means that unrunnable_since will
                    # be monotonically increasing in this loop.
                    break
                if (
                    self._no_workers_since is None
                    or self._no_workers_since >= unrunnable_since
                ):
                    unsatisfied.append(ts)
                else:
                    no_workers.append(ts)
            if not unsatisfied and not no_workers:
                return set()
    
            for ts in unsatisfied:
                e = pickle.dumps(
                    NoValidWorkerError(
                        task=ts.key,
                        host_restrictions=(ts.host_restrictions or set()).copy(),
                        worker_restrictions=(ts.worker_restrictions or set()).copy(),
                        resource_restrictions=(ts.resource_restrictions or {}).copy(),
                        timeout=self.no_workers_timeout,
                    ),
                )
                r = self.transition(
                    ts.key,
                    "erred",
                    exception=e,
                    cause=ts.key,
                    stimulus_id=stimulus_id,
                )
                recommendations.update(r)
                logger.error(
                    "Task %s marked as failed because it timed out waiting "
                    "for its restrictions to become satisfied.",
                    ts.key,
                )
            self._fail_tasks_after_no_workers_timeout(
                no_workers, recommendations, stimulus_id
            )
            return {ts.key for ts in concat([unsatisfied, no_workers])}
    
        def _check_queued_task_timeouts(
            self, timestamp: float, recommendations: Recs, stimulus_id: str
        ) -> set[Key]:
            assert self.no_workers_timeout
    
            if self._no_workers_since is None:
                return set()
    
            if timestamp <= self._no_workers_since + self.no_workers_timeout:
                return set()
            affected = list(self.queued)
            self._fail_tasks_after_no_workers_timeout(
                affected, recommendations, stimulus_id
            )
            return {ts.key for ts in affected}
    
        def _fail_tasks_after_no_workers_timeout(
            self, timed_out: Iterable[TaskState], recommendations: Recs, stimulus_id: str
        ) -> None:
            assert self.no_workers_timeout
    
            for ts in timed_out:
                e = pickle.dumps(
                    NoWorkerError(
                        task=ts.key,
                        timeout=self.no_workers_timeout,
                    ),
                )
                r = self.transition(
                    ts.key,
                    "erred",
                    exception=e,
                    cause=ts.key,
                    stimulus_id=stimulus_id,
                )
                recommendations.update(r)
                logger.error(
                    "Task %s marked as failed because it timed out waiting "
                    "without any running workers.",
                    ts.key,
                )
    
        def _refresh_no_workers_since(self, timestamp: float | None = None) -> None:
            if self.running or not (self.queued or self.unrunnable):
                self._no_workers_since = None
                return
    
            if not self._no_workers_since:
                self._no_workers_since = timestamp or monotonic()
                return
    
        def adaptive_target(self, target_duration=None):
            """Desired number of workers based on the current workload
    
            This looks at the current running tasks and memory use, and returns a
            number of desired workers.  This is often used by adaptive scheduling.
    
            Parameters
            ----------
            target_duration : str
                A desired duration of time for computations to take.  This affects
                how rapidly the scheduler will ask to scale.
    
            See Also
            --------
            distributed.deploy.Adaptive
            """
            if target_duration is None:
                target_duration = dask.config.get("distributed.adaptive.target-duration")
            target_duration = parse_timedelta(target_duration)
    
            # CPU
            queued = take(100, concat([self.queued, self.unrunnable.keys()]))
            queued_occupancy = 0
            for ts in queued:
                queued_occupancy += self._get_prefix_duration(ts.prefix)
    
            tasks_ready = len(self.queued) + len(self.unrunnable)
            if tasks_ready > 100:
                queued_occupancy *= tasks_ready / 100
    
            cpu = math.ceil((self.total_occupancy + queued_occupancy) / target_duration)
    
            # Avoid a few long tasks from asking for many cores
            for ws in self.workers.values():
                if tasks_ready > cpu:
                    break
                tasks_ready += len(ws.processing)
            else:
                cpu = min(tasks_ready, cpu)
    
            # Divide by average nthreads per worker
            if self.workers:
                nthreads = sum(ws.nthreads for ws in self.workers.values())
                cpu = math.ceil(cpu / nthreads * len(self.workers))
    
            if (self.unrunnable or self.queued) and not self.workers:
                cpu = max(1, cpu)
    
            # add more workers if more than 60% of memory is used
            limit = sum(ws.memory_limit for ws in self.workers.values())
            used = sum(ws.nbytes for ws in self.workers.values())
            memory = 0
            if used > 0.6 * limit and limit > 0:
                memory = 2 * len(self.workers)
    
            target = max(memory, cpu)
            if target >= len(self.workers):
                return target
            else:  # Scale down?
                to_close = self.workers_to_close()
                return len(self.workers) - len(to_close)
    
        def request_acquire_replicas(
            self, addr: str, keys: Iterable[Key], *, stimulus_id: str
        ) -> None:
            """Asynchronously ask a worker to acquire a replica of the listed keys from
            other workers. This is a fire-and-forget operation which offers no feedback for
            success or failure, and is intended for housekeeping and not for computation.
            """
            who_has = {}
            nbytes = {}
            for key in keys:
                ts = self.tasks[key]
                assert ts.who_has
                who_has[key] = [ws.address for ws in ts.who_has or ()]
                nbytes[key] = ts.nbytes
    
            self.stream_comms[addr].send(
                {
                    "op": "acquire-replicas",
                    "who_has": who_has,
                    "nbytes": nbytes,
                    "stimulus_id": stimulus_id,
                },
            )
    
        def request_remove_replicas(
            self, addr: str, keys: list[Key], *, stimulus_id: str
        ) -> None:
            """Asynchronously ask a worker to discard its replica of the listed keys.
            This must never be used to destroy the last replica of a key. This is a
            fire-and-forget operation, intended for housekeeping and not for computation.
    
            The replica disappears immediately from TaskState.who_has on the Scheduler side;
            if the worker refuses to delete, e.g. because the task is a dependency of
            another task running on it, it will (also asynchronously) inform the scheduler
            to re-add itself to who_has. If the worker agrees to discard the task, there is
            no feedback.
            """
            ws = self.workers[addr]
    
            # The scheduler immediately forgets about the replica and suggests the worker to
            # drop it. The worker may refuse, at which point it will send back an add-keys
            # message to reinstate it.
            for key in keys:
                ts = self.tasks[key]
                if self.validate:
                    # Do not destroy the last copy
                    assert ts.who_has
                    assert len(ts.who_has) > 1
                self.remove_replica(ts, ws)
    
            self.stream_comms[addr].send(
                {
                    "op": "remove-replicas",
                    "keys": keys,
                    "stimulus_id": stimulus_id,
                }
            )
    
    
    def _task_to_report_msg(ts: TaskState) -> dict[str, Any] | None:
        if ts.state == "forgotten":
            return {"op": "cancelled-keys", "keys": [ts.key]}
        elif ts.state == "memory":
            return {"op": "key-in-memory", "key": ts.key}
        elif ts.state == "erred":
            failing_ts = ts.exception_blame
            assert failing_ts
            return {
                "op": "task-erred",
                "key": ts.key,
                "exception": failing_ts.exception,
                "traceback": failing_ts.traceback,
            }
        else:
            return None
    
    
    def _task_to_client_msgs(ts: TaskState) -> Msgs:
        if ts.who_wants:
            report_msg = _task_to_report_msg(ts)
            if report_msg is not None:
                return {cs.client_key: [report_msg] for cs in ts.who_wants}
        return {}
    
    
    def decide_worker(
        ts: TaskState,
        all_workers: set[WorkerState],
        valid_workers: set[WorkerState] | None,
        objective: Callable[[WorkerState], Any],
    ) -> WorkerState | None:
        """
        Decide which worker should take task *ts*.
    
        We choose the worker that has the data on which *ts* depends.
    
        If several workers have dependencies then we choose the less-busy worker.
    
        Optionally provide *valid_workers* of where jobs are allowed to occur
        (if all workers are allowed to take the task, pass None instead).
    
        If the task requires data communication because no eligible worker has
        all the dependencies already, then we choose to minimize the number
        of bytes sent between workers.  This is determined by calling the
        *objective* function.
        """
        assert all(dts.who_has for dts in ts.dependencies)
        if ts.actor:
            candidates = all_workers.copy()
        else:
            candidates = {wws for dts in ts.dependencies for wws in dts.who_has or ()}
            candidates &= all_workers
        if valid_workers is None:
            if not candidates:
                candidates = all_workers.copy()
        else:
            candidates &= valid_workers
            if not candidates:
                candidates = valid_workers
                if not candidates:
                    if ts.loose_restrictions:
                        return decide_worker(ts, all_workers, None, objective)
    
        if not candidates:
            return None
        elif len(candidates) == 1:
            return next(iter(candidates))
        else:
            return min(candidates, key=objective)
    
    
    def validate_task_state(ts: TaskState) -> None:
        """Validate the given TaskState"""
        assert ts.state in ALL_TASK_STATES, ts
    
        if ts.waiting_on:
            assert ts.waiting_on.issubset(ts.dependencies), (
                "waiting not subset of dependencies",
                str(ts.waiting_on),
                str(ts.dependencies),
            )
        if ts.waiters:
            assert ts.waiters.issubset(ts.dependents), (
                "waiters not subset of dependents",
                str(ts.waiters),
                str(ts.dependents),
            )
    
        for dts in ts.waiting_on or ():
            assert not dts.who_has, ("waiting on in-memory dep", str(ts), str(dts))
            assert dts.state != "released", ("waiting on released dep", str(ts), str(dts))
        for dts in ts.dependencies:
            assert ts in dts.dependents, (
                "not in dependency's dependents",
                str(ts),
                str(dts),
                str(dts.dependents),
            )
            if ts.state in ("waiting", "queued", "processing", "no-worker"):
                assert ts.waiting_on and dts in ts.waiting_on or dts.who_has, (
                    "dep missing",
                    str(ts),
                    str(dts),
                )
            assert dts.state != "forgotten"
    
        for dts in ts.waiters or ():
            assert dts.state in ("waiting", "queued", "processing", "no-worker"), (
                "waiter not in play",
                str(ts),
                str(dts),
            )
        for dts in ts.dependents:
            assert ts in dts.dependencies, (
                "not in dependent's dependencies",
                str(ts),
                str(dts),
                str(dts.dependencies),
            )
            assert dts.state != "forgotten"
    
        assert (ts.processing_on is not None) == (ts.state == "processing")
        assert bool(ts.who_has) == (ts.state == "memory"), (ts, ts.who_has, ts.state)
    
        if ts.state == "queued":
            assert not ts.processing_on
            assert not ts.who_has
            assert all(dts.who_has for dts in ts.dependencies), (
                "task queued without all deps",
                str(ts),
                str(ts.dependencies),
            )
    
        if ts.state == "processing":
            assert all(dts.who_has for dts in ts.dependencies), (
                "task processing without all deps",
                str(ts),
                str(ts.dependencies),
            )
            assert not ts.waiting_on
    
        if ts.who_has:
            assert ts.waiters or ts.who_wants, (
                "unneeded task in memory",
                str(ts),
                str(ts.who_has),
            )
            if ts.run_spec:  # was computed
                assert ts.type
                assert isinstance(ts.type, str)
            assert not any(
                [
                    ts in dts.waiting_on
                    for dts in ts.dependents
                    if dts.waiting_on is not None
                ]
            )
            for ws in ts.who_has:
                assert ts in ws.has_what, (
                    "not in who_has' has_what",
                    str(ts),
                    str(ws),
                    str(ws.has_what),
                )
    
        for cs in ts.who_wants or ():
            assert ts in cs.wants_what, (
                "not in who_wants' wants_what",
                str(ts),
                str(cs),
                str(cs.wants_what),
            )
    
        if ts.actor:
            if ts.state == "memory":
                assert ts.who_has
                assert sum(ts in ws.actors for ws in ts.who_has) == 1
            if ts.state == "processing":
                assert ts.processing_on
                assert ts in ts.processing_on.actors
            assert ts.state != "queued"
    
    
    def validate_unrunnable(unrunnable: dict[TaskState, float]) -> None:
        prev_unrunnable_since: float | None = None
        prev_ts: TaskState | None = None
        for ts, unrunnable_since in unrunnable.items():
            assert ts.state == "no-worker"
            if prev_ts is not None:
                assert prev_unrunnable_since is not None
                # Ensure that unrunnable_since is monotonically increasing when iterating over unrunnable.
                # _check_no_workers relies on this.
                assert prev_unrunnable_since <= unrunnable_since, (
                    prev_ts,
                    ts,
                    prev_unrunnable_since,
                    unrunnable_since,
                )
            prev_ts = ts
            prev_unrunnable_since = unrunnable_since
    
    
    def validate_worker_state(ws: WorkerState) -> None:
        for ts in ws.has_what or ():
            assert ts.who_has
            assert ws in ts.who_has, (
                "not in has_what' who_has",
                str(ws),
                str(ts),
                str(ts.who_has),
            )
    
        for ts in ws.actors:
            assert ts.state in ("memory", "processing")
    
    
    def validate_state(
        tasks: dict[Key, TaskState],
        workers: dict[str, WorkerState],
        clients: dict[str, ClientState],
    ) -> None:
        """Validate a current runtime state.
    
        This performs a sequence of checks on the entire graph, running in about linear
        time. This raises assert errors if anything doesn't check out.
        """
        for ts in tasks.values():
            validate_task_state(ts)
    
        for ws in workers.values():
            validate_worker_state(ws)
    
        for cs in clients.values():
            for ts in cs.wants_what or ():
                assert ts.who_wants
                assert cs in ts.who_wants, (
                    "not in wants_what' who_wants",
                    str(cs),
                    str(ts),
                    str(ts.who_wants),
                )
    
    
    def heartbeat_interval(n: int) -> float:
        """Interval in seconds that we desire heartbeats based on number of workers"""
        if n <= 10:
            return 0.5
        elif n < 50:
            return 1
        elif n < 200:
            return 2
        else:
            # No more than 200 heartbeats a second scaled by workers
            return n / 200 + 1
    
    
    def _task_slots_available(ws: WorkerState, saturation_factor: float) -> int:
        """Number of tasks that can be sent to this worker without oversaturating it"""
        assert not math.isinf(saturation_factor)
        return max(math.ceil(saturation_factor * ws.nthreads), 1) - (
            len(ws.processing) - len(ws.long_running)
        )
    
    
    def _worker_full(ws: WorkerState, saturation_factor: float) -> bool:
        if math.isinf(saturation_factor):
            return False
        return _task_slots_available(ws, saturation_factor) <= 0
    
    
    class KilledWorker(Exception):
        def __init__(self, task: Key, last_worker: WorkerState, allowed_failures: int):
            super().__init__(task, last_worker, allowed_failures)
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def last_worker(self) -> WorkerState:
            return self.args[1]
    
        @property
        def allowed_failures(self) -> int:
            return self.args[2]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} on {self.allowed_failures + 1} "
                "different workers, but all those workers died while running it. "
                f"The last worker that attempt to run the task was {self.last_worker.address}. "
                "Inspecting worker logs is often a good next step to diagnose what went wrong. "
                "For more information see https://distributed.dask.org/en/stable/killed.html."
            )
    
    
    class NoValidWorkerError(Exception):
        def __init__(
            self,
            task: Key,
            host_restrictions: set[str],
            worker_restrictions: set[str],
            resource_restrictions: dict[str, float],
            timeout: float,
        ):
            super().__init__(
                task, host_restrictions, worker_restrictions, resource_restrictions, timeout
            )
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def host_restrictions(self) -> Any:
            return self.args[1]
    
        @property
        def worker_restrictions(self) -> Any:
            return self.args[2]
    
        @property
        def resource_restrictions(self) -> Any:
            return self.args[3]
    
        @property
        def timeout(self) -> float:
            return self.args[4]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} but timed out after {format_time(self.timeout)} "
                "waiting for a valid worker matching all restrictions.\n\nRestrictions:\n"
                f"host_restrictions={self.host_restrictions!s}\n"
                f"worker_restrictions={self.worker_restrictions!s}\n"
                f"resource_restrictions={self.resource_restrictions!s}\n"
            )
    
    
    class NoWorkerError(Exception):
        def __init__(self, task: Key, timeout: float):
            super().__init__(task, timeout)
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def timeout(self) -> float:
            return self.args[1]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} but timed out after {format_time(self.timeout)} "
                "waiting without any running workers."
            )
    
    
    class WorkerStatusPlugin(SchedulerPlugin):
        """A plugin to share worker status with a remote observer
    
        This is used in cluster managers to keep updated about the status of the scheduler.
        """
    
        name: ClassVar[str] = "worker-status"
        bcomm: BatchedSend
    
        def __init__(self, scheduler: Scheduler, comm: Comm):
            self.bcomm = BatchedSend(interval="5ms")
            self.bcomm.start(comm)
            scheduler.add_plugin(self)
    
        def add_worker(self, scheduler: Scheduler, worker: str) -> None:
            ident = scheduler.workers[worker].identity()
            del ident["metrics"]
            del ident["last_seen"]
            try:
                self.bcomm.send(["add", {"workers": {worker: ident}}])
            except CommClosedError:
                scheduler.remove_plugin(name=self.name)
    
        def remove_worker(self, scheduler: Scheduler, worker: str, **kwargs: Any) -> None:
            try:
                self.bcomm.send(["remove", worker])
            except CommClosedError:
                scheduler.remove_plugin(name=self.name)
    
        def teardown(self) -> None:
            self.bcomm.close()
    
    
    class CollectTaskMetaDataPlugin(SchedulerPlugin):
        scheduler: Scheduler
        name: str
        keys: set[Key]
        metadata: dict[Key, Any]
        state: dict[Key, TaskStateState]
    
        def __init__(self, scheduler: Scheduler, name: str):
            self.scheduler = scheduler
            self.name = name
            self.keys = set()
            self.metadata = {}
            self.state = {}
    
        def update_graph(
            self,
            scheduler: Scheduler,
            *,
            keys: set[Key],
            **kwargs: Any,
        ) -> None:
            self.keys.update(keys)
    
        def transition(
            self,
            key: Key,
            start: TaskStateState,
            finish: TaskStateState,
            *args: Any,
            **kwargs: Any,
        ) -> None:
            if finish in ("memory", "erred"):
                ts = self.scheduler.tasks.get(key)
                if ts is not None and ts.key in self.keys:
                    self.metadata[key] = ts.metadata
                    self.state[key] = finish
                    self.keys.discard(key)
    
    
    def _materialize_graph(
        expr: Expr,
        global_annotations: dict[str, Any],
        validate: bool,
    ) -> tuple[dict[Key, T_runspec], dict[Key, set[Key]], dict[str, dict[Key, Any]]]:
>       dsk: dict = expr.__dask_graph__()
E       AttributeError: 'dict' object has no attribute '__dask_graph__'

distributed/scheduler.py:9383: AttributeError

Check warning on line 0 in distributed.cli.tests.test_dask_worker

github-actions / Unit Test Results

1 out of 5 runs failed: test_contact_listen_address[tcp://127.0.0.2:---no-nanny] (distributed.cli.tests.test_dask_worker)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 0s]

Raw output


            AttributeError: 'dict' object has no attribute '__dask_graph__'
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:34705', workers: 0, cores: 0, tasks: 0>
nanny = '--no-nanny', listen_address = 'tcp://127.0.0.2:60837'

    @pytest.mark.slow
    @pytest.mark.skipif(not LINUX, reason="Need 127.0.0.2 to mean localhost")
    @pytest.mark.parametrize("nanny", ["--nanny", "--no-nanny"])
    @pytest.mark.parametrize("listen_address", ["tcp://0.0.0.0:", "tcp://127.0.0.2:"])
    @gen_cluster(client=True, nthreads=[])
    async def test_contact_listen_address(c, s, nanny, listen_address):
        port = open_port()
        listen_address += str(port)
        with popen(
            [
                sys.executable,
                "-m",
                "dask",
                "worker",
                s.address,
                nanny,
                "--no-dashboard",
                "--contact-address",
                f"tcp://127.0.0.2:{port}",
                "--listen-address",
                listen_address,
            ]
        ):
            await c.wait_for_workers(1)
            info = c.scheduler_info()
            assert info["workers"].keys() == {f"tcp://127.0.0.2:{port}"}
    
            # roundtrip works
>           assert await c.submit(lambda x: x + 1, 10) == 11

distributed/cli/tests/test_dask_worker.py:500: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
distributed/client.py:408: in _result
    raise exc.with_traceback(tb)
distributed/utils.py:1507: in run_in_executor_with_context
    return await loop.run_in_executor(
../../../miniconda3/envs/dask-distributed/lib/python3.10/concurrent/futures/thread.py:58: in run
    result = self.fn(*self.args, **self.kwargs)
distributed/utils.py:1508: in <lambda>
    executor, lambda: context.run(func, *args, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import dataclasses
    import heapq
    import inspect
    import itertools
    import json
    import logging
    import math
    import operator
    import os
    import pickle
    import random
    import textwrap
    import uuid
    import warnings
    import weakref
    from abc import abstractmethod
    from collections import defaultdict, deque
    from collections.abc import (
        Callable,
        Collection,
        Container,
        Hashable,
        Iterable,
        Iterator,
        Mapping,
        Sequence,
        Set,
    )
    from contextlib import suppress
    from functools import partial
    from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple, cast, overload
    
    import psutil
    import tornado.web
    from sortedcontainers import SortedDict, SortedSet
    from tlz import (
        concat,
        first,
        groupby,
        merge,
        merge_sorted,
        merge_with,
        partition,
        pluck,
        second,
        take,
        valmap,
    )
    from tornado.ioloop import IOLoop
    
    import dask
    import dask.utils
    from dask._task_spec import DependenciesMapping, GraphNode, convert_legacy_graph
    from dask.core import istask, validate_key
    from dask.typing import Key, no_default
    from dask.utils import (
        _deprecated,
        _deprecated_kwarg,
        format_bytes,
        format_time,
        key_split,
        parse_bytes,
        parse_timedelta,
        tmpfile,
    )
    from dask.widgets import get_template
    
    from distributed import cluster_dump, preloading, profile
    from distributed import versions as version_module
    from distributed._asyncio import RLock
    from distributed._stories import scheduler_story
    from distributed.active_memory_manager import ActiveMemoryManagerExtension, RetireWorker
    from distributed.batched import BatchedSend
    from distributed.broker import Broker
    from distributed.client import SourceCode
    from distributed.collections import HeapSet
    from distributed.comm import (
        Comm,
        CommClosedError,
        get_address_host,
        normalize_address,
        resolve_address,
        unparse_host_port,
    )
    from distributed.comm.addressing import addresses_from_user_args
    from distributed.compatibility import PeriodicCallback
    from distributed.core import (
        ErrorMessage,
        OKMessage,
        Status,
        clean_exception,
        error_message,
        rpc,
        send_recv,
    )
    from distributed.diagnostics.memory_sampler import MemorySamplerExtension
    from distributed.diagnostics.plugin import SchedulerPlugin, _get_plugin_name
    from distributed.event import EventExtension
    from distributed.gc import disable_gc_diagnosis, enable_gc_diagnosis
    from distributed.http import get_handlers
    from distributed.metrics import monotonic, time
    from distributed.multi_lock import MultiLockExtension
    from distributed.node import ServerNode
    from distributed.proctitle import setproctitle
    from distributed.protocol import deserialize
    from distributed.protocol.pickle import dumps, loads
    from distributed.protocol.serialize import Serialized, ToPickle, serialize
    from distributed.publish import PublishExtension
    from distributed.pubsub import PubSubSchedulerExtension
    from distributed.queues import QueueExtension
    from distributed.recreate_tasks import ReplayTaskScheduler
    from distributed.security import Security
    from distributed.semaphore import SemaphoreExtension
    from distributed.shuffle import ShuffleSchedulerPlugin
    from distributed.spans import SpanMetadata, SpansSchedulerExtension
    from distributed.stealing import WorkStealing
    from distributed.utils import (
        All,
        Deadline,
        TimeoutError,
        format_dashboard_link,
        get_fileno_limit,
        key_split_group,
        log_errors,
        offload,
        recursive_to_dict,
        wait_for,
    )
    from distributed.utils_comm import (
        gather_from_workers,
        retry_operation,
        scatter_to_workers,
    )
    from distributed.variable import VariableExtension
    
    if TYPE_CHECKING:
        # TODO import from typing (requires Python >=3.10)
        # TODO import from typing (requires Python >=3.11)
        from typing_extensions import Self, TypeAlias
    
        from dask._expr import Expr
    
    # Not to be confused with distributed.worker_state_machine.TaskStateState
    TaskStateState: TypeAlias = Literal[
        "released",
        "waiting",
        "no-worker",
        "queued",
        "processing",
        "memory",
        "erred",
        "forgotten",
    ]
    
    ALL_TASK_STATES: Set[TaskStateState] = set(TaskStateState.__args__)  # type: ignore
    
    # {task key -> finish state}
    # Not to be confused with distributed.worker_state_machine.Recs
    Recs: TypeAlias = dict[Key, TaskStateState]
    # {client or worker address: [{op: <key>, ...}, ...]}
    Msgs: TypeAlias = dict[str, list[dict[str, Any]]]
    # (recommendations, client messages, worker messages)
    RecsMsgs: TypeAlias = tuple[Recs, Msgs, Msgs]
    
    T_runspec: TypeAlias = GraphNode
    
    logger = logging.getLogger(__name__)
    LOG_PDB = dask.config.get("distributed.admin.pdb-on-err")
    DEFAULT_DATA_SIZE = parse_bytes(
        dask.config.get("distributed.scheduler.default-data-size")
    )
    STIMULUS_ID_UNSET = "<stimulus_id unset>"
    
    DEFAULT_EXTENSIONS = {
        "multi_locks": MultiLockExtension,
        "publish": PublishExtension,
        "replay-tasks": ReplayTaskScheduler,
        "queues": QueueExtension,
        "variables": VariableExtension,
        "pubsub": PubSubSchedulerExtension,
        "semaphores": SemaphoreExtension,
        "events": EventExtension,
        "amm": ActiveMemoryManagerExtension,
        "memory_sampler": MemorySamplerExtension,
        "shuffle": ShuffleSchedulerPlugin,
        "spans": SpansSchedulerExtension,
        "stealing": WorkStealing,
    }
    
    
    class ClientState:
        """A simple object holding information about a client."""
    
        #: A unique identifier for this client. This is generally an opaque
        #: string generated by the client itself.
        client_key: str
    
        #: Cached hash of :attr:`~ClientState.client_key`
        _hash: int
    
        #: A set of tasks this client wants to be kept in memory, so that it can download
        #: its result when desired. This is the reverse mapping of
        #: :class:`TaskState.who_wants`. Tasks are typically removed from this set when the
        #: corresponding object in the client's space (for example a ``Future`` or a Dask
        #: collection) gets garbage-collected.
        wants_what: set[TaskState]
    
        #: The last time we received a heartbeat from this client, in local scheduler time.
        last_seen: float
    
        #: Output of :func:`distributed.versions.get_versions` on the client
        versions: dict[str, Any]
    
        __slots__ = tuple(__annotations__)
    
        def __init__(self, client: str, *, versions: dict[str, Any] | None = None):
            self.client_key = client
            self._hash = hash(client)
            self.wants_what = set()
            self.last_seen = time()
            self.versions = versions or {}
    
        def __hash__(self) -> int:
            return self._hash
    
        def __eq__(self, other: object) -> bool:
            if not isinstance(other, ClientState):
                return False
            return self.client_key == other.client_key
    
        def __repr__(self) -> str:
            return f"<Client {self.client_key!r}>"
    
        def __str__(self) -> str:
            return self.client_key
    
        def _to_dict_no_nest(self, *, exclude: Container[str] = ()) -> dict:
            """Dictionary representation for debugging purposes.
            Not type stable and not intended for roundtrips.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            TaskState._to_dict
            """
            return recursive_to_dict(
                self,
                exclude=set(exclude) | {"versions"},  # type: ignore
                members=True,
            )
    
    
    class MemoryState:
        """Memory readings on a worker or on the whole cluster.
    
        See :doc:`worker-memory`.
    
        Attributes / properties:
    
        managed_total
            Sum of the output of sizeof() for all dask keys held by the worker in memory,
            plus number of bytes spilled to disk
    
        managed
            Sum of the output of sizeof() for the dask keys held in RAM. Note that this may
            be inaccurate, which may cause inaccurate unmanaged memory (see below).
    
        spilled
            Number of bytes  for the dask keys spilled to the hard drive.
            Note that this is the size on disk; size in memory may be different due to
            compression and inaccuracies in sizeof(). In other words, given the same keys,
            'managed' will change depending on the keys being in memory or spilled.
    
        process
            Total RSS memory measured by the OS on the worker process.
            This is always exactly equal to managed + unmanaged.
    
        unmanaged
            process - managed. This is the sum of
    
            - Python interpreter and modules
            - global variables
            - memory temporarily allocated by the dask tasks that are currently running
            - memory fragmentation
            - memory leaks
            - memory not yet garbage collected
            - memory not yet free()'d by the Python memory manager to the OS
    
        unmanaged_old
            Minimum of the 'unmanaged' measures over the last
            ``distributed.memory.recent-to-old-time`` seconds
    
        unmanaged_recent
            unmanaged - unmanaged_old; in other words process memory that has been recently
            allocated but is not accounted for by dask; hopefully it's mostly a temporary
            spike.
    
        optimistic
            managed + unmanaged_old; in other words the memory held long-term by
            the process under the hopeful assumption that all unmanaged_recent memory is a
            temporary spike
        """
    
        process: int
        unmanaged_old: int
        managed: int
        spilled: int
    
        __slots__ = tuple(__annotations__)
    
        def __init__(
            self,
            *,
            process: int,
            unmanaged_old: int,
            managed: int,
            spilled: int,
        ):
            # Some data arrives with the heartbeat, some other arrives in realtime as the
            # tasks progress. Also, sizeof() is not guaranteed to return correct results.
            # This can cause glitches where a partial measure is larger than the whole, so
            # we need to force all numbers to add up exactly by definition.
            self.process = process
            self.managed = min(self.process, managed)
            self.spilled = spilled
            # Subtractions between unsigned ints guaranteed by construction to be >= 0
            self.unmanaged_old = min(unmanaged_old, process - self.managed)
    
        @staticmethod
        def sum(*infos: MemoryState) -> MemoryState:
            process = 0
            unmanaged_old = 0
            managed = 0
            spilled = 0
            for ms in infos:
                process += ms.process
                unmanaged_old += ms.unmanaged_old
                spilled += ms.spilled
                managed += ms.managed
            return MemoryState(
                process=process,
                unmanaged_old=unmanaged_old,
                managed=managed,
                spilled=spilled,
            )
    
        @property
        def managed_total(self) -> int:
            return self.managed + self.spilled
    
        @property
        def unmanaged(self) -> int:
            # This is never negative thanks to __init__
            return self.process - self.managed
    
        @property
        def unmanaged_recent(self) -> int:
            # This is never negative thanks to __init__
            return self.process - self.managed - self.unmanaged_old
    
        @property
        def optimistic(self) -> int:
            return self.managed + self.unmanaged_old
    
        @property
        def managed_in_memory(self) -> int:
            warnings.warn("managed_in_memory has been renamed to managed", FutureWarning)
            return self.managed
    
        @property
        def managed_spilled(self) -> int:
            warnings.warn("managed_spilled has been renamed to spilled", FutureWarning)
            return self.spilled
    
        def __repr__(self) -> str:
            return (
                f"Process memory (RSS)  : {format_bytes(self.process)}\n"
                f"  - managed by Dask   : {format_bytes(self.managed)}\n"
                f"  - unmanaged (old)   : {format_bytes(self.unmanaged_old)}\n"
                f"  - unmanaged (recent): {format_bytes(self.unmanaged_recent)}\n"
                f"Spilled to disk       : {format_bytes(self.spilled)}\n"
            )
    
        def _to_dict(self, *, exclude: Container[str] = ()) -> dict:
            """Dictionary representation for debugging purposes.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            """
            return {
                k: getattr(self, k)
                for k in dir(self)
                if not k.startswith("_")
                and k not in {"sum", "managed_in_memory", "managed_spilled"}
            }
    
    
    class WorkerState:
        """A simple object holding information about a worker.
    
        Not to be confused with :class:`distributed.worker_state_machine.WorkerState`.
        """
    
        #: This worker's unique key. This can be its connected address
        #: (such as ``"tcp://127.0.0.1:8891"``) or an alias (such as ``"alice"``).
        address: str
    
        pid: int
        name: Hashable
    
        #: The number of CPU threads made available on this worker
        nthreads: int
    
        #: Memory available to the worker, in bytes
        memory_limit: int
    
        local_directory: str
        services: dict[str, int]
    
        #: Output of :meth:`distributed.versions.get_versions` on the worker
        versions: dict[str, Any]
    
        #: Address of the associated :class:`~distributed.nanny.Nanny`, if present
        nanny: str | None
    
        #: Read-only worker status, synced one way from the remote Worker object
        status: Status
    
        #: Cached hash of :attr:`~WorkerState.server_id`
        _hash: int
    
        #: The total memory size, in bytes, used by the tasks this worker holds in memory
        #: (i.e. the tasks in this worker's :attr:`~WorkerState.has_what`).
        nbytes: int
    
        #: Worker memory unknown to the worker, in bytes, which has been there for more than
        #: 30 seconds. See :class:`MemoryState`.
        _memory_unmanaged_old: int
    
        #: History of the last 30 seconds' worth of unmanaged memory. Used to differentiate
        #: between "old" and "new" unmanaged memory.
        #: Format: ``[(timestamp, bytes), (timestamp, bytes), ...]``
        _memory_unmanaged_history: deque[tuple[float, int]]
    
        metrics: dict[str, Any]
    
        #: The last time we received a heartbeat from this worker, in local scheduler time.
        last_seen: float
    
        time_delay: float
        bandwidth: float
    
        #: A set of all TaskStates on this worker that are actors. This only includes those
        #: actors whose state actually lives on this worker, not actors to which this worker
        #: has a reference.
        actors: set[TaskState]
    
        #: Underlying data of :meth:`WorkerState.has_what`
        _has_what: dict[TaskState, None]
    
        #: A set of tasks that have been submitted to this worker. Multiple tasks may be
        # submitted to a worker in advance and the worker will run them eventually,
        # depending on its execution resources (but see :doc:`work-stealing`).
        #:
        #: All the tasks here are in the "processing" state.
        #: This attribute is kept in sync with :attr:`TaskState.processing_on`.
        processing: set[TaskState]
    
        #: Running tasks that invoked :func:`distributed.secede`
        long_running: set[TaskState]
    
        #: A dictionary of tasks that are currently being run on this worker.
        #: Each task state is associated with the duration in seconds which the task has
        #: been running.
        executing: dict[TaskState, float]
    
        #: The available resources on this worker, e.g. ``{"GPU": 2}``.
        #: These are abstract quantities that constrain certain tasks from running at the
        #: same time on this worker.
        resources: dict[str, float]
    
        #: The sum of each resource used by all tasks allocated to this worker.
        #: The numbers in this dictionary can only be less or equal than those in this
        #: worker's :attr:`~WorkerState.resources`.
        used_resources: dict[str, float]
    
        #: Arbitrary additional metadata to be added to :meth:`~WorkerState.identity`
        extra: dict[str, Any]
    
        # The unique server ID this WorkerState is referencing
        server_id: str
    
        # Reference to scheduler task_groups
        scheduler_ref: weakref.ref[SchedulerState] | None
        task_prefix_count: defaultdict[str, int]
        _network_occ: int
        _occupancy_cache: float | None
    
        #: Keys that may need to be fetched to this worker, and the number of tasks that need them.
        #: All tasks are currently in `memory` on a worker other than this one.
        #: Much like `processing`, this does not exactly reflect worker state:
        #: keys here may be queued to fetch, in flight, or already in memory
        #: on the worker.
        needs_what: dict[TaskState, int]
    
        __slots__ = tuple(__annotations__)
    
        def __init__(
            self,
            *,
            address: str,
            status: Status,
            pid: int,
            name: object,
            nthreads: int = 0,
            memory_limit: int,
            local_directory: str,
            nanny: str | None,
            server_id: str,
            services: dict[str, int] | None = None,
            versions: dict[str, Any] | None = None,
            extra: dict[str, Any] | None = None,
            scheduler: SchedulerState | None = None,
        ):
            self.server_id = server_id
            self.address = address
            self.pid = pid
            self.name = name
            self.nthreads = nthreads
            self.memory_limit = memory_limit
            self.local_directory = local_directory
            self.services = services or {}
            self.versions = versions or {}
            self.nanny = nanny
            self.status = status
            self._hash = hash(self.server_id)
            self.nbytes = 0
            self._memory_unmanaged_old = 0
            self._memory_unmanaged_history = deque()
            self.metrics = {}
            self.last_seen = time()
            self.time_delay = 0
            self.bandwidth = parse_bytes(dask.config.get("distributed.scheduler.bandwidth"))
            self.actors = set()
            self._has_what = {}
            self.processing = set()
            self.long_running = set()
            self.executing = {}
            self.resources = {}
            self.used_resources = {}
            self.extra = extra or {}
            self.scheduler_ref = weakref.ref(scheduler) if scheduler else None
            self.task_prefix_count = defaultdict(int)
            self.needs_what = {}
            self._network_occ = 0
            self._occupancy_cache = None
    
        def __hash__(self) -> int:
            return self._hash
    
        def __eq__(self, other: object) -> bool:
            return self is other or (
                isinstance(other, WorkerState) and other.server_id == self.server_id
            )
    
        @property
        def has_what(self) -> Set[TaskState]:
            """An insertion-sorted set-like of tasks which currently reside on this worker.
            All the tasks here are in the "memory" state.
            This is the reverse mapping of :attr:`TaskState.who_has`.
    
            This is a read-only public accessor. The data is implemented as a dict without
            values, because rebalance() relies on dicts being insertion-sorted.
            """
            return self._has_what.keys()
    
        @property
        def host(self) -> str:
            return get_address_host(self.address)
    
        @property
        def memory(self) -> MemoryState:
            """Polished memory metrics for the worker.
    
            **Design note on managed memory**
    
            There are two measures available for managed memory:
    
            - ``self.nbytes``
            - ``self.metrics["managed_bytes"]``
    
            At rest, the two numbers must be identical. However, ``self.nbytes`` is
            immediately updated through the batched comms as soon as each task lands in
            memory on the worker; ``self.metrics["managed_bytes"]`` instead is updated by
            the heartbeat, which can lag several seconds behind.
    
            Below we are mixing likely newer managed memory info from ``self.nbytes`` with
            process and spilled memory from the heartbeat. This is deliberate, so that
            managed memory total is updated more frequently.
    
            Managed memory directly and immediately contributes to optimistic memory, which
            is in turn used in Active Memory Manager heuristics (at the moment of writing;
            more uses will likely be added in the future). So it's important to have it
            up to date; much more than it is for process memory.
    
            Having up-to-date managed memory info as soon as the scheduler learns about
            task completion also substantially simplifies unit tests.
    
            The flip side of this design is that it may cause some noise in the
            unmanaged_recent measure. e.g.:
    
            1. Delete 100MB of managed data
            2. The updated managed memory reaches the scheduler faster than the
               updated process memory
            3. There's a blip where the scheduler thinks that there's a sudden 100MB
               increase in unmanaged_recent, since process memory hasn't changed but managed
               memory has decreased by 100MB
            4. When the heartbeat arrives, process memory goes down and so does the
               unmanaged_recent.
    
            This is OK - one of the main reasons for the unmanaged_recent / unmanaged_old
            split is exactly to concentrate all the noise in unmanaged_recent and exclude it
            from optimistic memory, which is used for heuristics.
    
            Something that is less OK, but also less frequent, is that the sudden deletion
            of spilled keys will cause a negative blip in managed memory:
    
            1. Delete 100MB of spilled data
            2. The updated managed memory *total* reaches the scheduler faster than the
               updated spilled portion
            3. This causes the managed memory to temporarily plummet and be replaced by
               unmanaged_recent, while spilled memory remains unaltered
            4. When the heartbeat arrives, managed goes back up, unmanaged_recent
               goes back down, and spilled goes down by 100MB as it should have to
               begin with.
    
            :issue:`6002` will let us solve this.
            """
            return MemoryState(
                process=self.metrics["memory"],
                managed=max(0, self.nbytes - self.metrics["spilled_bytes"]["memory"]),
                spilled=self.metrics["spilled_bytes"]["disk"],
                unmanaged_old=self._memory_unmanaged_old,
            )
    
        def clean(self) -> WorkerState:
            """Return a version of this object that is appropriate for serialization"""
            ws = WorkerState(
                address=self.address,
                status=self.status,
                pid=self.pid,
                name=self.name,
                nthreads=self.nthreads,
                memory_limit=self.memory_limit,
                local_directory=self.local_directory,
                services=self.services,
                nanny=self.nanny,
                extra=self.extra,
                server_id=self.server_id,
            )
            ws._occupancy_cache = self.occupancy
    
            ws.executing = {ts.key: duration for ts, duration in self.executing.items()}  # type: ignore
            return ws
    
        def __repr__(self) -> str:
            name = f", name: {self.name}" if self.name != self.address else ""
            return (
                f"<WorkerState {self.address!r}{name}, "
                f"status: {self.status.name}, "
                f"memory: {len(self.has_what)}, "
                f"processing: {len(self.processing)}>"
            )
    
        def _repr_html_(self) -> str:
            return get_template("worker_state.html.j2").render(
                address=self.address,
                name=self.name,
                status=self.status.name,
                has_what=self.has_what,
                processing=self.processing,
            )
    
        def identity(self) -> dict[str, Any]:
            return {
                "type": "Worker",
                "id": self.name,
                "host": self.host,
                "resources": self.resources,
                "local_directory": self.local_directory,
                "name": self.name,
                "nthreads": self.nthreads,
                "memory_limit": self.memory_limit,
                "last_seen": self.last_seen,
                "services": self.services,
                "metrics": self.metrics,
                "status": self.status.name,
                "nanny": self.nanny,
                **self.extra,
            }
    
        def _to_dict_no_nest(self, *, exclude: Container[str] = ()) -> dict[str, Any]:
            """Dictionary representation for debugging purposes.
            Not type stable and not intended for roundtrips.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            TaskState._to_dict
            """
            return recursive_to_dict(
                self,
                exclude=set(exclude) | {"versions"},  # type: ignore
                members=True,
            )
    
        @property
        def scheduler(self) -> SchedulerState:
            assert self.scheduler_ref
            s = self.scheduler_ref()
            assert s
            return s
    
        def add_to_processing(self, ts: TaskState) -> None:
            """Assign a task to this worker for compute."""
            if self.scheduler.validate:
                assert ts not in self.processing
    
            tp = ts.prefix
            self.task_prefix_count[tp.name] += 1
            self.scheduler._task_prefix_count_global[tp.name] += 1
            self.processing.add(ts)
            for dts in ts.dependencies:
                assert dts.who_has
                if self not in dts.who_has:
                    self._inc_needs_replica(dts)
    
        def add_to_long_running(self, ts: TaskState) -> None:
            if self.scheduler.validate:
                assert ts in self.processing
                assert ts not in self.long_running
    
            self._remove_from_task_prefix_count(ts)
            # Cannot remove from processing since we're using this for things like
            # idleness detection. Idle workers are typically targeted for
            # downscaling but we should not downscale workers with long running
            # tasks
            self.long_running.add(ts)
    
        def remove_from_processing(self, ts: TaskState) -> None:
            """Remove a task from a workers processing"""
            if self.scheduler.validate:
                assert ts in self.processing
    
            if ts in self.long_running:
                self.long_running.discard(ts)
            else:
                self._remove_from_task_prefix_count(ts)
            self.processing.remove(ts)
            for dts in ts.dependencies:
                if dts in self.needs_what:
                    self._dec_needs_replica(dts)
    
        def _remove_from_task_prefix_count(self, ts: TaskState) -> None:
            prefix_name = ts.prefix.name
            count = self.task_prefix_count[prefix_name] - 1
            tp_count = self.task_prefix_count
            tp_count_global = self.scheduler._task_prefix_count_global
            if count:
                tp_count[prefix_name] = count
            else:
                del tp_count[prefix_name]
    
            count = tp_count_global[prefix_name] - 1
            if count:
                tp_count_global[prefix_name] = count
            else:
                del tp_count_global[prefix_name]
    
        def remove_replica(self, ts: TaskState) -> None:
            """The worker no longer has a task in memory"""
            if self.scheduler.validate:
                assert ts.who_has
                assert self in ts.who_has
                assert ts in self.has_what
                assert ts not in self.nee…cheduler, title="Scheduler Profile (administrative)"
            )
            task_stream = TabPanel(child=task_stream, title="Task Stream")
            bandwidth_workers = TabPanel(
                child=bandwidth_workers.root, title="Bandwidth (Workers)"
            )
            bandwidth_types = TabPanel(
                child=bandwidth_types.root, title="Bandwidth (Types)"
            )
            system = TabPanel(child=sysmon.root, title="System")
            logs = TabPanel(child=logs.root, title="Scheduler Logs")
    
            tabs = Tabs(
                tabs=[
                    html,
                    task_stream,
                    system,
                    logs,
                    compute,
                    workers,
                    scheduler,
                    bandwidth_workers,
                    bandwidth_types,
                ],
                sizing_mode="stretch_both",
            )
    
            from bokeh.core.templates import get_env
            from bokeh.plotting import output_file, save
    
            with tmpfile(extension=".html") as fn:
                output_file(filename=fn, title="Dask Performance Report", mode=mode)
                template_directory = os.path.join(
                    os.path.dirname(os.path.abspath(__file__)), "dashboard", "templates"
                )
                template_environment = get_env()
                template_environment.loader.searchpath.append(template_directory)
                template = template_environment.get_template("performance_report.html")
                save(tabs, filename=fn, template=template)
    
                with open(fn) as f:
                    data = f.read()
    
            return data
    
        async def get_worker_logs(self, n=None, workers=None, nanny=False):
            results = await self.broadcast(
                msg={"op": "get_logs", "n": n}, workers=workers, nanny=nanny
            )
            return results
    
        def log_event(self, topic: str | Collection[str], msg: Any) -> None:
            """Log an event under a given topic
    
            Parameters
            ----------
            topic : str, list[str]
                Name of the topic under which to log an event. To log the same
                event under multiple topics, pass a list of topic names.
            msg
                Event message to log. Note this must be msgpack serializable.
    
            See also
            --------
            Client.log_event
            """
            self._broker.publish(topic, msg)
    
        def subscribe_topic(self, topic: str, client: str) -> None:
            self._broker.subscribe(topic, client)
    
        def unsubscribe_topic(self, topic: str, client: str) -> None:
            self._broker.unsubscribe(topic, client)
    
        @overload
        def get_events(self, topic: str) -> tuple[tuple[float, Any], ...]: ...
    
        @overload
        def get_events(self) -> dict[str, tuple[tuple[float, Any], ...]]: ...
    
        def get_events(
            self, topic: str | None = None
        ) -> tuple[tuple[float, Any], ...] | dict[str, tuple[tuple[float, Any], ...]]:
            return self._broker.get_events(topic)
    
        async def get_worker_monitor_info(self, recent=False, starts=None):
            if starts is None:
                starts = {}
            results = await asyncio.gather(
                *(
                    self.rpc(w).get_monitor_info(recent=recent, start=starts.get(w, 0))
                    for w in self.workers
                )
            )
            return dict(zip(self.workers, results))
    
        ###########
        # Cleanup #
        ###########
    
        @log_errors
        async def check_worker_ttl(self) -> None:
            now = time()
            stimulus_id = f"check-worker-ttl-{now}"
            assert self.worker_ttl
            ttl = max(self.worker_ttl, 10 * heartbeat_interval(len(self.workers)))
            to_restart = []
    
            for ws in self.workers.values():
                last_seen = now - ws.last_seen
                if last_seen > ttl:
                    to_restart.append(ws.address)
                    logger.warning(
                        f"Worker failed to heartbeat for {last_seen:.0f}s; "
                        f"{'attempting restart' if ws.nanny else 'removing'}: {ws}"
                    )
    
            if to_restart:
                self.log_event(
                    "scheduler",
                    {
                        "action": "worker-ttl-timed-out",
                        "workers": to_restart.copy(),
                        "ttl": ttl,
                    },
                )
                await self.restart_workers(
                    to_restart,
                    wait_for_workers=False,
                    stimulus_id=stimulus_id,
                )
    
        def check_idle(self) -> float | None:
            if self.status in (Status.closing, Status.closed):
                return None  # pragma: nocover
    
            if self.transition_counter != self._idle_transition_counter:
                self._idle_transition_counter = self.transition_counter
                self.idle_since = None
                return None
    
            if self._active_graph_updates > 0:
                self.idle_since = None
                return None
    
            if (
                self.queued
                or self.unrunnable
                or any(ws.processing for ws in self.workers.values())
            ):
                self.idle_since = None
                return None
    
            if not self.idle_since:
                self.idle_since = time()
                return self.idle_since
    
            if self.jupyter:
                last_activity = (
                    self._jupyter_server_application.web_app.last_activity().timestamp()
                )
                if last_activity > self.idle_since:
                    self.idle_since = last_activity
                    return self.idle_since
    
            if self.idle_timeout:
                if time() > self.idle_since + self.idle_timeout:
                    assert self.idle_since
                    logger.info(
                        "Scheduler closing after being idle for %s",
                        format_time(self.idle_timeout),
                    )
                    self._ongoing_background_tasks.call_soon(
                        self.close, reason="idle-timeout-exceeded"
                    )
            return self.idle_since
    
        def _check_no_workers(self) -> None:
            if (
                self.status in (Status.closing, Status.closed)
                or self.no_workers_timeout is None
            ):
                return
    
            now = monotonic()
            stimulus_id = f"check-no-workers-timeout-{time()}"
    
            recommendations: Recs = {}
    
            self._refresh_no_workers_since(now)
    
            affected = self._check_unrunnable_task_timeouts(
                now, recommendations=recommendations, stimulus_id=stimulus_id
            )
    
            affected.update(
                self._check_queued_task_timeouts(
                    now, recommendations=recommendations, stimulus_id=stimulus_id
                )
            )
            self.transitions(recommendations, stimulus_id=stimulus_id)
            if affected:
                self.log_event(
                    "scheduler",
                    {"action": "no-workers-timeout-exceeded", "keys": affected},
                )
    
        def _check_unrunnable_task_timeouts(
            self, timestamp: float, recommendations: Recs, stimulus_id: str
        ) -> set[Key]:
            assert self.no_workers_timeout
            unsatisfied = []
            no_workers = []
            for ts, unrunnable_since in self.unrunnable.items():
                if timestamp <= unrunnable_since + self.no_workers_timeout:
                    # unrunnable is insertion-ordered, which means that unrunnable_since will
                    # be monotonically increasing in this loop.
                    break
                if (
                    self._no_workers_since is None
                    or self._no_workers_since >= unrunnable_since
                ):
                    unsatisfied.append(ts)
                else:
                    no_workers.append(ts)
            if not unsatisfied and not no_workers:
                return set()
    
            for ts in unsatisfied:
                e = pickle.dumps(
                    NoValidWorkerError(
                        task=ts.key,
                        host_restrictions=(ts.host_restrictions or set()).copy(),
                        worker_restrictions=(ts.worker_restrictions or set()).copy(),
                        resource_restrictions=(ts.resource_restrictions or {}).copy(),
                        timeout=self.no_workers_timeout,
                    ),
                )
                r = self.transition(
                    ts.key,
                    "erred",
                    exception=e,
                    cause=ts.key,
                    stimulus_id=stimulus_id,
                )
                recommendations.update(r)
                logger.error(
                    "Task %s marked as failed because it timed out waiting "
                    "for its restrictions to become satisfied.",
                    ts.key,
                )
            self._fail_tasks_after_no_workers_timeout(
                no_workers, recommendations, stimulus_id
            )
            return {ts.key for ts in concat([unsatisfied, no_workers])}
    
        def _check_queued_task_timeouts(
            self, timestamp: float, recommendations: Recs, stimulus_id: str
        ) -> set[Key]:
            assert self.no_workers_timeout
    
            if self._no_workers_since is None:
                return set()
    
            if timestamp <= self._no_workers_since + self.no_workers_timeout:
                return set()
            affected = list(self.queued)
            self._fail_tasks_after_no_workers_timeout(
                affected, recommendations, stimulus_id
            )
            return {ts.key for ts in affected}
    
        def _fail_tasks_after_no_workers_timeout(
            self, timed_out: Iterable[TaskState], recommendations: Recs, stimulus_id: str
        ) -> None:
            assert self.no_workers_timeout
    
            for ts in timed_out:
                e = pickle.dumps(
                    NoWorkerError(
                        task=ts.key,
                        timeout=self.no_workers_timeout,
                    ),
                )
                r = self.transition(
                    ts.key,
                    "erred",
                    exception=e,
                    cause=ts.key,
                    stimulus_id=stimulus_id,
                )
                recommendations.update(r)
                logger.error(
                    "Task %s marked as failed because it timed out waiting "
                    "without any running workers.",
                    ts.key,
                )
    
        def _refresh_no_workers_since(self, timestamp: float | None = None) -> None:
            if self.running or not (self.queued or self.unrunnable):
                self._no_workers_since = None
                return
    
            if not self._no_workers_since:
                self._no_workers_since = timestamp or monotonic()
                return
    
        def adaptive_target(self, target_duration=None):
            """Desired number of workers based on the current workload
    
            This looks at the current running tasks and memory use, and returns a
            number of desired workers.  This is often used by adaptive scheduling.
    
            Parameters
            ----------
            target_duration : str
                A desired duration of time for computations to take.  This affects
                how rapidly the scheduler will ask to scale.
    
            See Also
            --------
            distributed.deploy.Adaptive
            """
            if target_duration is None:
                target_duration = dask.config.get("distributed.adaptive.target-duration")
            target_duration = parse_timedelta(target_duration)
    
            # CPU
            queued = take(100, concat([self.queued, self.unrunnable.keys()]))
            queued_occupancy = 0
            for ts in queued:
                queued_occupancy += self._get_prefix_duration(ts.prefix)
    
            tasks_ready = len(self.queued) + len(self.unrunnable)
            if tasks_ready > 100:
                queued_occupancy *= tasks_ready / 100
    
            cpu = math.ceil((self.total_occupancy + queued_occupancy) / target_duration)
    
            # Avoid a few long tasks from asking for many cores
            for ws in self.workers.values():
                if tasks_ready > cpu:
                    break
                tasks_ready += len(ws.processing)
            else:
                cpu = min(tasks_ready, cpu)
    
            # Divide by average nthreads per worker
            if self.workers:
                nthreads = sum(ws.nthreads for ws in self.workers.values())
                cpu = math.ceil(cpu / nthreads * len(self.workers))
    
            if (self.unrunnable or self.queued) and not self.workers:
                cpu = max(1, cpu)
    
            # add more workers if more than 60% of memory is used
            limit = sum(ws.memory_limit for ws in self.workers.values())
            used = sum(ws.nbytes for ws in self.workers.values())
            memory = 0
            if used > 0.6 * limit and limit > 0:
                memory = 2 * len(self.workers)
    
            target = max(memory, cpu)
            if target >= len(self.workers):
                return target
            else:  # Scale down?
                to_close = self.workers_to_close()
                return len(self.workers) - len(to_close)
    
        def request_acquire_replicas(
            self, addr: str, keys: Iterable[Key], *, stimulus_id: str
        ) -> None:
            """Asynchronously ask a worker to acquire a replica of the listed keys from
            other workers. This is a fire-and-forget operation which offers no feedback for
            success or failure, and is intended for housekeeping and not for computation.
            """
            who_has = {}
            nbytes = {}
            for key in keys:
                ts = self.tasks[key]
                assert ts.who_has
                who_has[key] = [ws.address for ws in ts.who_has or ()]
                nbytes[key] = ts.nbytes
    
            self.stream_comms[addr].send(
                {
                    "op": "acquire-replicas",
                    "who_has": who_has,
                    "nbytes": nbytes,
                    "stimulus_id": stimulus_id,
                },
            )
    
        def request_remove_replicas(
            self, addr: str, keys: list[Key], *, stimulus_id: str
        ) -> None:
            """Asynchronously ask a worker to discard its replica of the listed keys.
            This must never be used to destroy the last replica of a key. This is a
            fire-and-forget operation, intended for housekeeping and not for computation.
    
            The replica disappears immediately from TaskState.who_has on the Scheduler side;
            if the worker refuses to delete, e.g. because the task is a dependency of
            another task running on it, it will (also asynchronously) inform the scheduler
            to re-add itself to who_has. If the worker agrees to discard the task, there is
            no feedback.
            """
            ws = self.workers[addr]
    
            # The scheduler immediately forgets about the replica and suggests the worker to
            # drop it. The worker may refuse, at which point it will send back an add-keys
            # message to reinstate it.
            for key in keys:
                ts = self.tasks[key]
                if self.validate:
                    # Do not destroy the last copy
                    assert ts.who_has
                    assert len(ts.who_has) > 1
                self.remove_replica(ts, ws)
    
            self.stream_comms[addr].send(
                {
                    "op": "remove-replicas",
                    "keys": keys,
                    "stimulus_id": stimulus_id,
                }
            )
    
    
    def _task_to_report_msg(ts: TaskState) -> dict[str, Any] | None:
        if ts.state == "forgotten":
            return {"op": "cancelled-keys", "keys": [ts.key]}
        elif ts.state == "memory":
            return {"op": "key-in-memory", "key": ts.key}
        elif ts.state == "erred":
            failing_ts = ts.exception_blame
            assert failing_ts
            return {
                "op": "task-erred",
                "key": ts.key,
                "exception": failing_ts.exception,
                "traceback": failing_ts.traceback,
            }
        else:
            return None
    
    
    def _task_to_client_msgs(ts: TaskState) -> Msgs:
        if ts.who_wants:
            report_msg = _task_to_report_msg(ts)
            if report_msg is not None:
                return {cs.client_key: [report_msg] for cs in ts.who_wants}
        return {}
    
    
    def decide_worker(
        ts: TaskState,
        all_workers: set[WorkerState],
        valid_workers: set[WorkerState] | None,
        objective: Callable[[WorkerState], Any],
    ) -> WorkerState | None:
        """
        Decide which worker should take task *ts*.
    
        We choose the worker that has the data on which *ts* depends.
    
        If several workers have dependencies then we choose the less-busy worker.
    
        Optionally provide *valid_workers* of where jobs are allowed to occur
        (if all workers are allowed to take the task, pass None instead).
    
        If the task requires data communication because no eligible worker has
        all the dependencies already, then we choose to minimize the number
        of bytes sent between workers.  This is determined by calling the
        *objective* function.
        """
        assert all(dts.who_has for dts in ts.dependencies)
        if ts.actor:
            candidates = all_workers.copy()
        else:
            candidates = {wws for dts in ts.dependencies for wws in dts.who_has or ()}
            candidates &= all_workers
        if valid_workers is None:
            if not candidates:
                candidates = all_workers.copy()
        else:
            candidates &= valid_workers
            if not candidates:
                candidates = valid_workers
                if not candidates:
                    if ts.loose_restrictions:
                        return decide_worker(ts, all_workers, None, objective)
    
        if not candidates:
            return None
        elif len(candidates) == 1:
            return next(iter(candidates))
        else:
            return min(candidates, key=objective)
    
    
    def validate_task_state(ts: TaskState) -> None:
        """Validate the given TaskState"""
        assert ts.state in ALL_TASK_STATES, ts
    
        if ts.waiting_on:
            assert ts.waiting_on.issubset(ts.dependencies), (
                "waiting not subset of dependencies",
                str(ts.waiting_on),
                str(ts.dependencies),
            )
        if ts.waiters:
            assert ts.waiters.issubset(ts.dependents), (
                "waiters not subset of dependents",
                str(ts.waiters),
                str(ts.dependents),
            )
    
        for dts in ts.waiting_on or ():
            assert not dts.who_has, ("waiting on in-memory dep", str(ts), str(dts))
            assert dts.state != "released", ("waiting on released dep", str(ts), str(dts))
        for dts in ts.dependencies:
            assert ts in dts.dependents, (
                "not in dependency's dependents",
                str(ts),
                str(dts),
                str(dts.dependents),
            )
            if ts.state in ("waiting", "queued", "processing", "no-worker"):
                assert ts.waiting_on and dts in ts.waiting_on or dts.who_has, (
                    "dep missing",
                    str(ts),
                    str(dts),
                )
            assert dts.state != "forgotten"
    
        for dts in ts.waiters or ():
            assert dts.state in ("waiting", "queued", "processing", "no-worker"), (
                "waiter not in play",
                str(ts),
                str(dts),
            )
        for dts in ts.dependents:
            assert ts in dts.dependencies, (
                "not in dependent's dependencies",
                str(ts),
                str(dts),
                str(dts.dependencies),
            )
            assert dts.state != "forgotten"
    
        assert (ts.processing_on is not None) == (ts.state == "processing")
        assert bool(ts.who_has) == (ts.state == "memory"), (ts, ts.who_has, ts.state)
    
        if ts.state == "queued":
            assert not ts.processing_on
            assert not ts.who_has
            assert all(dts.who_has for dts in ts.dependencies), (
                "task queued without all deps",
                str(ts),
                str(ts.dependencies),
            )
    
        if ts.state == "processing":
            assert all(dts.who_has for dts in ts.dependencies), (
                "task processing without all deps",
                str(ts),
                str(ts.dependencies),
            )
            assert not ts.waiting_on
    
        if ts.who_has:
            assert ts.waiters or ts.who_wants, (
                "unneeded task in memory",
                str(ts),
                str(ts.who_has),
            )
            if ts.run_spec:  # was computed
                assert ts.type
                assert isinstance(ts.type, str)
            assert not any(
                [
                    ts in dts.waiting_on
                    for dts in ts.dependents
                    if dts.waiting_on is not None
                ]
            )
            for ws in ts.who_has:
                assert ts in ws.has_what, (
                    "not in who_has' has_what",
                    str(ts),
                    str(ws),
                    str(ws.has_what),
                )
    
        for cs in ts.who_wants or ():
            assert ts in cs.wants_what, (
                "not in who_wants' wants_what",
                str(ts),
                str(cs),
                str(cs.wants_what),
            )
    
        if ts.actor:
            if ts.state == "memory":
                assert ts.who_has
                assert sum(ts in ws.actors for ws in ts.who_has) == 1
            if ts.state == "processing":
                assert ts.processing_on
                assert ts in ts.processing_on.actors
            assert ts.state != "queued"
    
    
    def validate_unrunnable(unrunnable: dict[TaskState, float]) -> None:
        prev_unrunnable_since: float | None = None
        prev_ts: TaskState | None = None
        for ts, unrunnable_since in unrunnable.items():
            assert ts.state == "no-worker"
            if prev_ts is not None:
                assert prev_unrunnable_since is not None
                # Ensure that unrunnable_since is monotonically increasing when iterating over unrunnable.
                # _check_no_workers relies on this.
                assert prev_unrunnable_since <= unrunnable_since, (
                    prev_ts,
                    ts,
                    prev_unrunnable_since,
                    unrunnable_since,
                )
            prev_ts = ts
            prev_unrunnable_since = unrunnable_since
    
    
    def validate_worker_state(ws: WorkerState) -> None:
        for ts in ws.has_what or ():
            assert ts.who_has
            assert ws in ts.who_has, (
                "not in has_what' who_has",
                str(ws),
                str(ts),
                str(ts.who_has),
            )
    
        for ts in ws.actors:
            assert ts.state in ("memory", "processing")
    
    
    def validate_state(
        tasks: dict[Key, TaskState],
        workers: dict[str, WorkerState],
        clients: dict[str, ClientState],
    ) -> None:
        """Validate a current runtime state.
    
        This performs a sequence of checks on the entire graph, running in about linear
        time. This raises assert errors if anything doesn't check out.
        """
        for ts in tasks.values():
            validate_task_state(ts)
    
        for ws in workers.values():
            validate_worker_state(ws)
    
        for cs in clients.values():
            for ts in cs.wants_what or ():
                assert ts.who_wants
                assert cs in ts.who_wants, (
                    "not in wants_what' who_wants",
                    str(cs),
                    str(ts),
                    str(ts.who_wants),
                )
    
    
    def heartbeat_interval(n: int) -> float:
        """Interval in seconds that we desire heartbeats based on number of workers"""
        if n <= 10:
            return 0.5
        elif n < 50:
            return 1
        elif n < 200:
            return 2
        else:
            # No more than 200 heartbeats a second scaled by workers
            return n / 200 + 1
    
    
    def _task_slots_available(ws: WorkerState, saturation_factor: float) -> int:
        """Number of tasks that can be sent to this worker without oversaturating it"""
        assert not math.isinf(saturation_factor)
        return max(math.ceil(saturation_factor * ws.nthreads), 1) - (
            len(ws.processing) - len(ws.long_running)
        )
    
    
    def _worker_full(ws: WorkerState, saturation_factor: float) -> bool:
        if math.isinf(saturation_factor):
            return False
        return _task_slots_available(ws, saturation_factor) <= 0
    
    
    class KilledWorker(Exception):
        def __init__(self, task: Key, last_worker: WorkerState, allowed_failures: int):
            super().__init__(task, last_worker, allowed_failures)
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def last_worker(self) -> WorkerState:
            return self.args[1]
    
        @property
        def allowed_failures(self) -> int:
            return self.args[2]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} on {self.allowed_failures + 1} "
                "different workers, but all those workers died while running it. "
                f"The last worker that attempt to run the task was {self.last_worker.address}. "
                "Inspecting worker logs is often a good next step to diagnose what went wrong. "
                "For more information see https://distributed.dask.org/en/stable/killed.html."
            )
    
    
    class NoValidWorkerError(Exception):
        def __init__(
            self,
            task: Key,
            host_restrictions: set[str],
            worker_restrictions: set[str],
            resource_restrictions: dict[str, float],
            timeout: float,
        ):
            super().__init__(
                task, host_restrictions, worker_restrictions, resource_restrictions, timeout
            )
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def host_restrictions(self) -> Any:
            return self.args[1]
    
        @property
        def worker_restrictions(self) -> Any:
            return self.args[2]
    
        @property
        def resource_restrictions(self) -> Any:
            return self.args[3]
    
        @property
        def timeout(self) -> float:
            return self.args[4]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} but timed out after {format_time(self.timeout)} "
                "waiting for a valid worker matching all restrictions.\n\nRestrictions:\n"
                f"host_restrictions={self.host_restrictions!s}\n"
                f"worker_restrictions={self.worker_restrictions!s}\n"
                f"resource_restrictions={self.resource_restrictions!s}\n"
            )
    
    
    class NoWorkerError(Exception):
        def __init__(self, task: Key, timeout: float):
            super().__init__(task, timeout)
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def timeout(self) -> float:
            return self.args[1]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} but timed out after {format_time(self.timeout)} "
                "waiting without any running workers."
            )
    
    
    class WorkerStatusPlugin(SchedulerPlugin):
        """A plugin to share worker status with a remote observer
    
        This is used in cluster managers to keep updated about the status of the scheduler.
        """
    
        name: ClassVar[str] = "worker-status"
        bcomm: BatchedSend
    
        def __init__(self, scheduler: Scheduler, comm: Comm):
            self.bcomm = BatchedSend(interval="5ms")
            self.bcomm.start(comm)
            scheduler.add_plugin(self)
    
        def add_worker(self, scheduler: Scheduler, worker: str) -> None:
            ident = scheduler.workers[worker].identity()
            del ident["metrics"]
            del ident["last_seen"]
            try:
                self.bcomm.send(["add", {"workers": {worker: ident}}])
            except CommClosedError:
                scheduler.remove_plugin(name=self.name)
    
        def remove_worker(self, scheduler: Scheduler, worker: str, **kwargs: Any) -> None:
            try:
                self.bcomm.send(["remove", worker])
            except CommClosedError:
                scheduler.remove_plugin(name=self.name)
    
        def teardown(self) -> None:
            self.bcomm.close()
    
    
    class CollectTaskMetaDataPlugin(SchedulerPlugin):
        scheduler: Scheduler
        name: str
        keys: set[Key]
        metadata: dict[Key, Any]
        state: dict[Key, TaskStateState]
    
        def __init__(self, scheduler: Scheduler, name: str):
            self.scheduler = scheduler
            self.name = name
            self.keys = set()
            self.metadata = {}
            self.state = {}
    
        def update_graph(
            self,
            scheduler: Scheduler,
            *,
            keys: set[Key],
            **kwargs: Any,
        ) -> None:
            self.keys.update(keys)
    
        def transition(
            self,
            key: Key,
            start: TaskStateState,
            finish: TaskStateState,
            *args: Any,
            **kwargs: Any,
        ) -> None:
            if finish in ("memory", "erred"):
                ts = self.scheduler.tasks.get(key)
                if ts is not None and ts.key in self.keys:
                    self.metadata[key] = ts.metadata
                    self.state[key] = finish
                    self.keys.discard(key)
    
    
    def _materialize_graph(
        expr: Expr,
        global_annotations: dict[str, Any],
        validate: bool,
    ) -> tuple[dict[Key, T_runspec], dict[Key, set[Key]], dict[str, dict[Key, Any]]]:
>       dsk: dict = expr.__dask_graph__()
E       AttributeError: 'dict' object has no attribute '__dask_graph__'

distributed/scheduler.py:9383: AttributeError

Check warning on line 0 in distributed.cli.tests.test_dask_worker

github-actions / Unit Test Results

1 out of 5 runs failed: test_respect_host_listen_address[127.0.0.2---nanny] (distributed.cli.tests.test_dask_worker)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 1s]

Raw output


            AttributeError: 'dict' object has no attribute '__dask_graph__'
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:43649', workers: 0, cores: 0, tasks: 0>
nanny = '--nanny', host = '127.0.0.2'

    @pytest.mark.slow
    @pytest.mark.skipif(not LINUX, reason="Need 127.0.0.2 to mean localhost")
    @pytest.mark.parametrize("nanny", ["--nanny", "--no-nanny"])
    @pytest.mark.parametrize("host", ["127.0.0.2", "0.0.0.0"])
    @gen_cluster(client=True, nthreads=[])
    async def test_respect_host_listen_address(c, s, nanny, host):
        with popen(
            [
                sys.executable,
                "-m",
                "dask",
                "worker",
                s.address,
                nanny,
                "--no-dashboard",
                "--host",
                host,
            ]
        ):
            await c.wait_for_workers(1)
    
            # roundtrip works
>           assert await c.submit(lambda x: x + 1, 10) == 11

distributed/cli/tests/test_dask_worker.py:569: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
distributed/client.py:408: in _result
    raise exc.with_traceback(tb)
distributed/utils.py:1507: in run_in_executor_with_context
    return await loop.run_in_executor(
../../../miniconda3/envs/dask-distributed/lib/python3.10/concurrent/futures/thread.py:58: in run
    result = self.fn(*self.args, **self.kwargs)
distributed/utils.py:1508: in <lambda>
    executor, lambda: context.run(func, *args, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import dataclasses
    import heapq
    import inspect
    import itertools
    import json
    import logging
    import math
    import operator
    import os
    import pickle
    import random
    import textwrap
    import uuid
    import warnings
    import weakref
    from abc import abstractmethod
    from collections import defaultdict, deque
    from collections.abc import (
        Callable,
        Collection,
        Container,
        Hashable,
        Iterable,
        Iterator,
        Mapping,
        Sequence,
        Set,
    )
    from contextlib import suppress
    from functools import partial
    from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple, cast, overload
    
    import psutil
    import tornado.web
    from sortedcontainers import SortedDict, SortedSet
    from tlz import (
        concat,
        first,
        groupby,
        merge,
        merge_sorted,
        merge_with,
        partition,
        pluck,
        second,
        take,
        valmap,
    )
    from tornado.ioloop import IOLoop
    
    import dask
    import dask.utils
    from dask._task_spec import DependenciesMapping, GraphNode, convert_legacy_graph
    from dask.core import istask, validate_key
    from dask.typing import Key, no_default
    from dask.utils import (
        _deprecated,
        _deprecated_kwarg,
        format_bytes,
        format_time,
        key_split,
        parse_bytes,
        parse_timedelta,
        tmpfile,
    )
    from dask.widgets import get_template
    
    from distributed import cluster_dump, preloading, profile
    from distributed import versions as version_module
    from distributed._asyncio import RLock
    from distributed._stories import scheduler_story
    from distributed.active_memory_manager import ActiveMemoryManagerExtension, RetireWorker
    from distributed.batched import BatchedSend
    from distributed.broker import Broker
    from distributed.client import SourceCode
    from distributed.collections import HeapSet
    from distributed.comm import (
        Comm,
        CommClosedError,
        get_address_host,
        normalize_address,
        resolve_address,
        unparse_host_port,
    )
    from distributed.comm.addressing import addresses_from_user_args
    from distributed.compatibility import PeriodicCallback
    from distributed.core import (
        ErrorMessage,
        OKMessage,
        Status,
        clean_exception,
        error_message,
        rpc,
        send_recv,
    )
    from distributed.diagnostics.memory_sampler import MemorySamplerExtension
    from distributed.diagnostics.plugin import SchedulerPlugin, _get_plugin_name
    from distributed.event import EventExtension
    from distributed.gc import disable_gc_diagnosis, enable_gc_diagnosis
    from distributed.http import get_handlers
    from distributed.metrics import monotonic, time
    from distributed.multi_lock import MultiLockExtension
    from distributed.node import ServerNode
    from distributed.proctitle import setproctitle
    from distributed.protocol import deserialize
    from distributed.protocol.pickle import dumps, loads
    from distributed.protocol.serialize import Serialized, ToPickle, serialize
    from distributed.publish import PublishExtension
    from distributed.pubsub import PubSubSchedulerExtension
    from distributed.queues import QueueExtension
    from distributed.recreate_tasks import ReplayTaskScheduler
    from distributed.security import Security
    from distributed.semaphore import SemaphoreExtension
    from distributed.shuffle import ShuffleSchedulerPlugin
    from distributed.spans import SpanMetadata, SpansSchedulerExtension
    from distributed.stealing import WorkStealing
    from distributed.utils import (
        All,
        Deadline,
        TimeoutError,
        format_dashboard_link,
        get_fileno_limit,
        key_split_group,
        log_errors,
        offload,
        recursive_to_dict,
        wait_for,
    )
    from distributed.utils_comm import (
        gather_from_workers,
        retry_operation,
        scatter_to_workers,
    )
    from distributed.variable import VariableExtension
    
    if TYPE_CHECKING:
        # TODO import from typing (requires Python >=3.10)
        # TODO import from typing (requires Python >=3.11)
        from typing_extensions import Self, TypeAlias
    
        from dask._expr import Expr
    
    # Not to be confused with distributed.worker_state_machine.TaskStateState
    TaskStateState: TypeAlias = Literal[
        "released",
        "waiting",
        "no-worker",
        "queued",
        "processing",
        "memory",
        "erred",
        "forgotten",
    ]
    
    ALL_TASK_STATES: Set[TaskStateState] = set(TaskStateState.__args__)  # type: ignore
    
    # {task key -> finish state}
    # Not to be confused with distributed.worker_state_machine.Recs
    Recs: TypeAlias = dict[Key, TaskStateState]
    # {client or worker address: [{op: <key>, ...}, ...]}
    Msgs: TypeAlias = dict[str, list[dict[str, Any]]]
    # (recommendations, client messages, worker messages)
    RecsMsgs: TypeAlias = tuple[Recs, Msgs, Msgs]
    
    T_runspec: TypeAlias = GraphNode
    
    logger = logging.getLogger(__name__)
    LOG_PDB = dask.config.get("distributed.admin.pdb-on-err")
    DEFAULT_DATA_SIZE = parse_bytes(
        dask.config.get("distributed.scheduler.default-data-size")
    )
    STIMULUS_ID_UNSET = "<stimulus_id unset>"
    
    DEFAULT_EXTENSIONS = {
        "multi_locks": MultiLockExtension,
        "publish": PublishExtension,
        "replay-tasks": ReplayTaskScheduler,
        "queues": QueueExtension,
        "variables": VariableExtension,
        "pubsub": PubSubSchedulerExtension,
        "semaphores": SemaphoreExtension,
        "events": EventExtension,
        "amm": ActiveMemoryManagerExtension,
        "memory_sampler": MemorySamplerExtension,
        "shuffle": ShuffleSchedulerPlugin,
        "spans": SpansSchedulerExtension,
        "stealing": WorkStealing,
    }
    
    
    class ClientState:
        """A simple object holding information about a client."""
    
        #: A unique identifier for this client. This is generally an opaque
        #: string generated by the client itself.
        client_key: str
    
        #: Cached hash of :attr:`~ClientState.client_key`
        _hash: int
    
        #: A set of tasks this client wants to be kept in memory, so that it can download
        #: its result when desired. This is the reverse mapping of
        #: :class:`TaskState.who_wants`. Tasks are typically removed from this set when the
        #: corresponding object in the client's space (for example a ``Future`` or a Dask
        #: collection) gets garbage-collected.
        wants_what: set[TaskState]
    
        #: The last time we received a heartbeat from this client, in local scheduler time.
        last_seen: float
    
        #: Output of :func:`distributed.versions.get_versions` on the client
        versions: dict[str, Any]
    
        __slots__ = tuple(__annotations__)
    
        def __init__(self, client: str, *, versions: dict[str, Any] | None = None):
            self.client_key = client
            self._hash = hash(client)
            self.wants_what = set()
            self.last_seen = time()
            self.versions = versions or {}
    
        def __hash__(self) -> int:
            return self._hash
    
        def __eq__(self, other: object) -> bool:
            if not isinstance(other, ClientState):
                return False
            return self.client_key == other.client_key
    
        def __repr__(self) -> str:
            return f"<Client {self.client_key!r}>"
    
        def __str__(self) -> str:
            return self.client_key
    
        def _to_dict_no_nest(self, *, exclude: Container[str] = ()) -> dict:
            """Dictionary representation for debugging purposes.
            Not type stable and not intended for roundtrips.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            TaskState._to_dict
            """
            return recursive_to_dict(
                self,
                exclude=set(exclude) | {"versions"},  # type: ignore
                members=True,
            )
    
    
    class MemoryState:
        """Memory readings on a worker or on the whole cluster.
    
        See :doc:`worker-memory`.
    
        Attributes / properties:
    
        managed_total
            Sum of the output of sizeof() for all dask keys held by the worker in memory,
            plus number of bytes spilled to disk
    
        managed
            Sum of the output of sizeof() for the dask keys held in RAM. Note that this may
            be inaccurate, which may cause inaccurate unmanaged memory (see below).
    
        spilled
            Number of bytes  for the dask keys spilled to the hard drive.
            Note that this is the size on disk; size in memory may be different due to
            compression and inaccuracies in sizeof(). In other words, given the same keys,
            'managed' will change depending on the keys being in memory or spilled.
    
        process
            Total RSS memory measured by the OS on the worker process.
            This is always exactly equal to managed + unmanaged.
    
        unmanaged
            process - managed. This is the sum of
    
            - Python interpreter and modules
            - global variables
            - memory temporarily allocated by the dask tasks that are currently running
            - memory fragmentation
            - memory leaks
            - memory not yet garbage collected
            - memory not yet free()'d by the Python memory manager to the OS
    
        unmanaged_old
            Minimum of the 'unmanaged' measures over the last
            ``distributed.memory.recent-to-old-time`` seconds
    
        unmanaged_recent
            unmanaged - unmanaged_old; in other words process memory that has been recently
            allocated but is not accounted for by dask; hopefully it's mostly a temporary
            spike.
    
        optimistic
            managed + unmanaged_old; in other words the memory held long-term by
            the process under the hopeful assumption that all unmanaged_recent memory is a
            temporary spike
        """
    
        process: int
        unmanaged_old: int
        managed: int
        spilled: int
    
        __slots__ = tuple(__annotations__)
    
        def __init__(
            self,
            *,
            process: int,
            unmanaged_old: int,
            managed: int,
            spilled: int,
        ):
            # Some data arrives with the heartbeat, some other arrives in realtime as the
            # tasks progress. Also, sizeof() is not guaranteed to return correct results.
            # This can cause glitches where a partial measure is larger than the whole, so
            # we need to force all numbers to add up exactly by definition.
            self.process = process
            self.managed = min(self.process, managed)
            self.spilled = spilled
            # Subtractions between unsigned ints guaranteed by construction to be >= 0
            self.unmanaged_old = min(unmanaged_old, process - self.managed)
    
        @staticmethod
        def sum(*infos: MemoryState) -> MemoryState:
            process = 0
            unmanaged_old = 0
            managed = 0
            spilled = 0
            for ms in infos:
                process += ms.process
                unmanaged_old += ms.unmanaged_old
                spilled += ms.spilled
                managed += ms.managed
            return MemoryState(
                process=process,
                unmanaged_old=unmanaged_old,
                managed=managed,
                spilled=spilled,
            )
    
        @property
        def managed_total(self) -> int:
            return self.managed + self.spilled
    
        @property
        def unmanaged(self) -> int:
            # This is never negative thanks to __init__
            return self.process - self.managed
    
        @property
        def unmanaged_recent(self) -> int:
            # This is never negative thanks to __init__
            return self.process - self.managed - self.unmanaged_old
    
        @property
        def optimistic(self) -> int:
            return self.managed + self.unmanaged_old
    
        @property
        def managed_in_memory(self) -> int:
            warnings.warn("managed_in_memory has been renamed to managed", FutureWarning)
            return self.managed
    
        @property
        def managed_spilled(self) -> int:
            warnings.warn("managed_spilled has been renamed to spilled", FutureWarning)
            return self.spilled
    
        def __repr__(self) -> str:
            return (
                f"Process memory (RSS)  : {format_bytes(self.process)}\n"
                f"  - managed by Dask   : {format_bytes(self.managed)}\n"
                f"  - unmanaged (old)   : {format_bytes(self.unmanaged_old)}\n"
                f"  - unmanaged (recent): {format_bytes(self.unmanaged_recent)}\n"
                f"Spilled to disk       : {format_bytes(self.spilled)}\n"
            )
    
        def _to_dict(self, *, exclude: Container[str] = ()) -> dict:
            """Dictionary representation for debugging purposes.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            """
            return {
                k: getattr(self, k)
                for k in dir(self)
                if not k.startswith("_")
                and k not in {"sum", "managed_in_memory", "managed_spilled"}
            }
    
    
    class WorkerState:
        """A simple object holding information about a worker.
    
        Not to be confused with :class:`distributed.worker_state_machine.WorkerState`.
        """
    
        #: This worker's unique key. This can be its connected address
        #: (such as ``"tcp://127.0.0.1:8891"``) or an alias (such as ``"alice"``).
        address: str
    
        pid: int
        name: Hashable
    
        #: The number of CPU threads made available on this worker
        nthreads: int
    
        #: Memory available to the worker, in bytes
        memory_limit: int
    
        local_directory: str
        services: dict[str, int]
    
        #: Output of :meth:`distributed.versions.get_versions` on the worker
        versions: dict[str, Any]
    
        #: Address of the associated :class:`~distributed.nanny.Nanny`, if present
        nanny: str | None
    
        #: Read-only worker status, synced one way from the remote Worker object
        status: Status
    
        #: Cached hash of :attr:`~WorkerState.server_id`
        _hash: int
    
        #: The total memory size, in bytes, used by the tasks this worker holds in memory
        #: (i.e. the tasks in this worker's :attr:`~WorkerState.has_what`).
        nbytes: int
    
        #: Worker memory unknown to the worker, in bytes, which has been there for more than
        #: 30 seconds. See :class:`MemoryState`.
        _memory_unmanaged_old: int
    
        #: History of the last 30 seconds' worth of unmanaged memory. Used to differentiate
        #: between "old" and "new" unmanaged memory.
        #: Format: ``[(timestamp, bytes), (timestamp, bytes), ...]``
        _memory_unmanaged_history: deque[tuple[float, int]]
    
        metrics: dict[str, Any]
    
        #: The last time we received a heartbeat from this worker, in local scheduler time.
        last_seen: float
    
        time_delay: float
        bandwidth: float
    
        #: A set of all TaskStates on this worker that are actors. This only includes those
        #: actors whose state actually lives on this worker, not actors to which this worker
        #: has a reference.
        actors: set[TaskState]
    
        #: Underlying data of :meth:`WorkerState.has_what`
        _has_what: dict[TaskState, None]
    
        #: A set of tasks that have been submitted to this worker. Multiple tasks may be
        # submitted to a worker in advance and the worker will run them eventually,
        # depending on its execution resources (but see :doc:`work-stealing`).
        #:
        #: All the tasks here are in the "processing" state.
        #: This attribute is kept in sync with :attr:`TaskState.processing_on`.
        processing: set[TaskState]
    
        #: Running tasks that invoked :func:`distributed.secede`
        long_running: set[TaskState]
    
        #: A dictionary of tasks that are currently being run on this worker.
        #: Each task state is associated with the duration in seconds which the task has
        #: been running.
        executing: dict[TaskState, float]
    
        #: The available resources on this worker, e.g. ``{"GPU": 2}``.
        #: These are abstract quantities that constrain certain tasks from running at the
        #: same time on this worker.
        resources: dict[str, float]
    
        #: The sum of each resource used by all tasks allocated to this worker.
        #: The numbers in this dictionary can only be less or equal than those in this
        #: worker's :attr:`~WorkerState.resources`.
        used_resources: dict[str, float]
    
        #: Arbitrary additional metadata to be added to :meth:`~WorkerState.identity`
        extra: dict[str, Any]
    
        # The unique server ID this WorkerState is referencing
        server_id: str
    
        # Reference to scheduler task_groups
        scheduler_ref: weakref.ref[SchedulerState] | None
        task_prefix_count: defaultdict[str, int]
        _network_occ: int
        _occupancy_cache: float | None
    
        #: Keys that may need to be fetched to this worker, and the number of tasks that need them.
        #: All tasks are currently in `memory` on a worker other than this one.
        #: Much like `processing`, this does not exactly reflect worker state:
        #: keys here may be queued to fetch, in flight, or already in memory
        #: on the worker.
        needs_what: dict[TaskState, int]
    
        __slots__ = tuple(__annotations__)
    
        def __init__(
            self,
            *,
            address: str,
            status: Status,
            pid: int,
            name: object,
            nthreads: int = 0,
            memory_limit: int,
            local_directory: str,
            nanny: str | None,
            server_id: str,
            services: dict[str, int] | None = None,
            versions: dict[str, Any] | None = None,
            extra: dict[str, Any] | None = None,
            scheduler: SchedulerState | None = None,
        ):
            self.server_id = server_id
            self.address = address
            self.pid = pid
            self.name = name
            self.nthreads = nthreads
            self.memory_limit = memory_limit
            self.local_directory = local_directory
            self.services = services or {}
            self.versions = versions or {}
            self.nanny = nanny
            self.status = status
            self._hash = hash(self.server_id)
            self.nbytes = 0
            self._memory_unmanaged_old = 0
            self._memory_unmanaged_history = deque()
            self.metrics = {}
            self.last_seen = time()
            self.time_delay = 0
            self.bandwidth = parse_bytes(dask.config.get("distributed.scheduler.bandwidth"))
            self.actors = set()
            self._has_what = {}
            self.processing = set()
            self.long_running = set()
            self.executing = {}
            self.resources = {}
            self.used_resources = {}
            self.extra = extra or {}
            self.scheduler_ref = weakref.ref(scheduler) if scheduler else None
            self.task_prefix_count = defaultdict(int)
            self.needs_what = {}
            self._network_occ = 0
            self._occupancy_cache = None
    
        def __hash__(self) -> int:
            return self._hash
    
        def __eq__(self, other: object) -> bool:
            return self is other or (
                isinstance(other, WorkerState) and other.server_id == self.server_id
            )
    
        @property
        def has_what(self) -> Set[TaskState]:
            """An insertion-sorted set-like of tasks which currently reside on this worker.
            All the tasks here are in the "memory" state.
            This is the reverse mapping of :attr:`TaskState.who_has`.
    
            This is a read-only public accessor. The data is implemented as a dict without
            values, because rebalance() relies on dicts being insertion-sorted.
            """
            return self._has_what.keys()
    
        @property
        def host(self) -> str:
            return get_address_host(self.address)
    
        @property
        def memory(self) -> MemoryState:
            """Polished memory metrics for the worker.
    
            **Design note on managed memory**
    
            There are two measures available for managed memory:
    
            - ``self.nbytes``
            - ``self.metrics["managed_bytes"]``
    
            At rest, the two numbers must be identical. However, ``self.nbytes`` is
            immediately updated through the batched comms as soon as each task lands in
            memory on the worker; ``self.metrics["managed_bytes"]`` instead is updated by
            the heartbeat, which can lag several seconds behind.
    
            Below we are mixing likely newer managed memory info from ``self.nbytes`` with
            process and spilled memory from the heartbeat. This is deliberate, so that
            managed memory total is updated more frequently.
    
            Managed memory directly and immediately contributes to optimistic memory, which
            is in turn used in Active Memory Manager heuristics (at the moment of writing;
            more uses will likely be added in the future). So it's important to have it
            up to date; much more than it is for process memory.
    
            Having up-to-date managed memory info as soon as the scheduler learns about
            task completion also substantially simplifies unit tests.
    
            The flip side of this design is that it may cause some noise in the
            unmanaged_recent measure. e.g.:
    
            1. Delete 100MB of managed data
            2. The updated managed memory reaches the scheduler faster than the
               updated process memory
            3. There's a blip where the scheduler thinks that there's a sudden 100MB
               increase in unmanaged_recent, since process memory hasn't changed but managed
               memory has decreased by 100MB
            4. When the heartbeat arrives, process memory goes down and so does the
               unmanaged_recent.
    
            This is OK - one of the main reasons for the unmanaged_recent / unmanaged_old
            split is exactly to concentrate all the noise in unmanaged_recent and exclude it
            from optimistic memory, which is used for heuristics.
    
            Something that is less OK, but also less frequent, is that the sudden deletion
            of spilled keys will cause a negative blip in managed memory:
    
            1. Delete 100MB of spilled data
            2. The updated managed memory *total* reaches the scheduler faster than the
               updated spilled portion
            3. This causes the managed memory to temporarily plummet and be replaced by
               unmanaged_recent, while spilled memory remains unaltered
            4. When the heartbeat arrives, managed goes back up, unmanaged_recent
               goes back down, and spilled goes down by 100MB as it should have to
               begin with.
    
            :issue:`6002` will let us solve this.
            """
            return MemoryState(
                process=self.metrics["memory"],
                managed=max(0, self.nbytes - self.metrics["spilled_bytes"]["memory"]),
                spilled=self.metrics["spilled_bytes"]["disk"],
                unmanaged_old=self._memory_unmanaged_old,
            )
    
        def clean(self) -> WorkerState:
            """Return a version of this object that is appropriate for serialization"""
            ws = WorkerState(
                address=self.address,
                status=self.status,
                pid=self.pid,
                name=self.name,
                nthreads=self.nthreads,
                memory_limit=self.memory_limit,
                local_directory=self.local_directory,
                services=self.services,
                nanny=self.nanny,
                extra=self.extra,
                server_id=self.server_id,
            )
            ws._occupancy_cache = self.occupancy
    
            ws.executing = {ts.key: duration for ts, duration in self.executing.items()}  # type: ignore
            return ws
    
        def __repr__(self) -> str:
            name = f", name: {self.name}" if self.name != self.address else ""
            return (
                f"<WorkerState {self.address!r}{name}, "
                f"status: {self.status.name}, "
                f"memory: {len(self.has_what)}, "
                f"processing: {len(self.processing)}>"
            )
    
        def _repr_html_(self) -> str:
            return get_template("worker_state.html.j2").render(
                address=self.address,
                name=self.name,
                status=self.status.name,
                has_what=self.has_what,
                processing=self.processing,
            )
    
        def identity(self) -> dict[str, Any]:
            return {
                "type": "Worker",
                "id": self.name,
                "host": self.host,
                "resources": self.resources,
                "local_directory": self.local_directory,
                "name": self.name,
                "nthreads": self.nthreads,
                "memory_limit": self.memory_limit,
                "last_seen": self.last_seen,
                "services": self.services,
                "metrics": self.metrics,
                "status": self.status.name,
                "nanny": self.nanny,
                **self.extra,
            }
    
        def _to_dict_no_nest(self, *, exclude: Container[str] = ()) -> dict[str, Any]:
            """Dictionary representation for debugging purposes.
            Not type stable and not intended for roundtrips.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            TaskState._to_dict
            """
            return recursive_to_dict(
                self,
                exclude=set(exclude) | {"versions"},  # type: ignore
                members=True,
            )
    
        @property
        def scheduler(self) -> SchedulerState:
            assert self.scheduler_ref
            s = self.scheduler_ref()
            assert s
            return s
    
        def add_to_processing(self, ts: TaskState) -> None:
            """Assign a task to this worker for compute."""
            if self.scheduler.validate:
                assert ts not in self.processing
    
            tp = ts.prefix
            self.task_prefix_count[tp.name] += 1
            self.scheduler._task_prefix_count_global[tp.name] += 1
            self.processing.add(ts)
            for dts in ts.dependencies:
                assert dts.who_has
                if self not in dts.who_has:
                    self._inc_needs_replica(dts)
    
        def add_to_long_running(self, ts: TaskState) -> None:
            if self.scheduler.validate:
                assert ts in self.processing
                assert ts not in self.long_running
    
            self._remove_from_task_prefix_count(ts)
            # Cannot remove from processing since we're using this for things like
            # idleness detection. Idle workers are typically targeted for
            # downscaling but we should not downscale workers with long running
            # tasks
            self.long_running.add(ts)
    
        def remove_from_processing(self, ts: TaskState) -> None:
            """Remove a task from a workers processing"""
            if self.scheduler.validate:
                assert ts in self.processing
    
            if ts in self.long_running:
                self.long_running.discard(ts)
            else:
                self._remove_from_task_prefix_count(ts)
            self.processing.remove(ts)
            for dts in ts.dependencies:
                if dts in self.needs_what:
                    self._dec_needs_replica(dts)
    
        def _remove_from_task_prefix_count(self, ts: TaskState) -> None:
            prefix_name = ts.prefix.name
            count = self.task_prefix_count[prefix_name] - 1
            tp_count = self.task_prefix_count
            tp_count_global = self.scheduler._task_prefix_count_global
            if count:
                tp_count[prefix_name] = count
            else:
                del tp_count[prefix_name]
    
            count = tp_count_global[prefix_name] - 1
            if count:
                tp_count_global[prefix_name] = count
            else:
                del tp_count_global[prefix_name]
    
        def remove_replica(self, ts: TaskState) -> None:
            """The worker no longer has a task in memory"""
            if self.scheduler.validate:
                assert ts.who_has
                assert self in ts.who_has
                assert ts in self.has_what
                assert ts not in self.needs_what
    
            self.nbytes -= ts.get_nbytes()
            del self._has_what[ts]
            ts.who_has.remove(self)  # type: ignore
            if not ts.who_has:
                ts.who_has = None
    
        def _inc_needs_replica(self, ts: TaskState) -> None:
            """Assign a task fetch to this worker and …cheduler, title="Scheduler Profile (administrative)"
            )
            task_stream = TabPanel(child=task_stream, title="Task Stream")
            bandwidth_workers = TabPanel(
                child=bandwidth_workers.root, title="Bandwidth (Workers)"
            )
            bandwidth_types = TabPanel(
                child=bandwidth_types.root, title="Bandwidth (Types)"
            )
            system = TabPanel(child=sysmon.root, title="System")
            logs = TabPanel(child=logs.root, title="Scheduler Logs")
    
            tabs = Tabs(
                tabs=[
                    html,
                    task_stream,
                    system,
                    logs,
                    compute,
                    workers,
                    scheduler,
                    bandwidth_workers,
                    bandwidth_types,
                ],
                sizing_mode="stretch_both",
            )
    
            from bokeh.core.templates import get_env
            from bokeh.plotting import output_file, save
    
            with tmpfile(extension=".html") as fn:
                output_file(filename=fn, title="Dask Performance Report", mode=mode)
                template_directory = os.path.join(
                    os.path.dirname(os.path.abspath(__file__)), "dashboard", "templates"
                )
                template_environment = get_env()
                template_environment.loader.searchpath.append(template_directory)
                template = template_environment.get_template("performance_report.html")
                save(tabs, filename=fn, template=template)
    
                with open(fn) as f:
                    data = f.read()
    
            return data
    
        async def get_worker_logs(self, n=None, workers=None, nanny=False):
            results = await self.broadcast(
                msg={"op": "get_logs", "n": n}, workers=workers, nanny=nanny
            )
            return results
    
        def log_event(self, topic: str | Collection[str], msg: Any) -> None:
            """Log an event under a given topic
    
            Parameters
            ----------
            topic : str, list[str]
                Name of the topic under which to log an event. To log the same
                event under multiple topics, pass a list of topic names.
            msg
                Event message to log. Note this must be msgpack serializable.
    
            See also
            --------
            Client.log_event
            """
            self._broker.publish(topic, msg)
    
        def subscribe_topic(self, topic: str, client: str) -> None:
            self._broker.subscribe(topic, client)
    
        def unsubscribe_topic(self, topic: str, client: str) -> None:
            self._broker.unsubscribe(topic, client)
    
        @overload
        def get_events(self, topic: str) -> tuple[tuple[float, Any], ...]: ...
    
        @overload
        def get_events(self) -> dict[str, tuple[tuple[float, Any], ...]]: ...
    
        def get_events(
            self, topic: str | None = None
        ) -> tuple[tuple[float, Any], ...] | dict[str, tuple[tuple[float, Any], ...]]:
            return self._broker.get_events(topic)
    
        async def get_worker_monitor_info(self, recent=False, starts=None):
            if starts is None:
                starts = {}
            results = await asyncio.gather(
                *(
                    self.rpc(w).get_monitor_info(recent=recent, start=starts.get(w, 0))
                    for w in self.workers
                )
            )
            return dict(zip(self.workers, results))
    
        ###########
        # Cleanup #
        ###########
    
        @log_errors
        async def check_worker_ttl(self) -> None:
            now = time()
            stimulus_id = f"check-worker-ttl-{now}"
            assert self.worker_ttl
            ttl = max(self.worker_ttl, 10 * heartbeat_interval(len(self.workers)))
            to_restart = []
    
            for ws in self.workers.values():
                last_seen = now - ws.last_seen
                if last_seen > ttl:
                    to_restart.append(ws.address)
                    logger.warning(
                        f"Worker failed to heartbeat for {last_seen:.0f}s; "
                        f"{'attempting restart' if ws.nanny else 'removing'}: {ws}"
                    )
    
            if to_restart:
                self.log_event(
                    "scheduler",
                    {
                        "action": "worker-ttl-timed-out",
                        "workers": to_restart.copy(),
                        "ttl": ttl,
                    },
                )
                await self.restart_workers(
                    to_restart,
                    wait_for_workers=False,
                    stimulus_id=stimulus_id,
                )
    
        def check_idle(self) -> float | None:
            if self.status in (Status.closing, Status.closed):
                return None  # pragma: nocover
    
            if self.transition_counter != self._idle_transition_counter:
                self._idle_transition_counter = self.transition_counter
                self.idle_since = None
                return None
    
            if self._active_graph_updates > 0:
                self.idle_since = None
                return None
    
            if (
                self.queued
                or self.unrunnable
                or any(ws.processing for ws in self.workers.values())
            ):
                self.idle_since = None
                return None
    
            if not self.idle_since:
                self.idle_since = time()
                return self.idle_since
    
            if self.jupyter:
                last_activity = (
                    self._jupyter_server_application.web_app.last_activity().timestamp()
                )
                if last_activity > self.idle_since:
                    self.idle_since = last_activity
                    return self.idle_since
    
            if self.idle_timeout:
                if time() > self.idle_since + self.idle_timeout:
                    assert self.idle_since
                    logger.info(
                        "Scheduler closing after being idle for %s",
                        format_time(self.idle_timeout),
                    )
                    self._ongoing_background_tasks.call_soon(
                        self.close, reason="idle-timeout-exceeded"
                    )
            return self.idle_since
    
        def _check_no_workers(self) -> None:
            if (
                self.status in (Status.closing, Status.closed)
                or self.no_workers_timeout is None
            ):
                return
    
            now = monotonic()
            stimulus_id = f"check-no-workers-timeout-{time()}"
    
            recommendations: Recs = {}
    
            self._refresh_no_workers_since(now)
    
            affected = self._check_unrunnable_task_timeouts(
                now, recommendations=recommendations, stimulus_id=stimulus_id
            )
    
            affected.update(
                self._check_queued_task_timeouts(
                    now, recommendations=recommendations, stimulus_id=stimulus_id
                )
            )
            self.transitions(recommendations, stimulus_id=stimulus_id)
            if affected:
                self.log_event(
                    "scheduler",
                    {"action": "no-workers-timeout-exceeded", "keys": affected},
                )
    
        def _check_unrunnable_task_timeouts(
            self, timestamp: float, recommendations: Recs, stimulus_id: str
        ) -> set[Key]:
            assert self.no_workers_timeout
            unsatisfied = []
            no_workers = []
            for ts, unrunnable_since in self.unrunnable.items():
                if timestamp <= unrunnable_since + self.no_workers_timeout:
                    # unrunnable is insertion-ordered, which means that unrunnable_since will
                    # be monotonically increasing in this loop.
                    break
                if (
                    self._no_workers_since is None
                    or self._no_workers_since >= unrunnable_since
                ):
                    unsatisfied.append(ts)
                else:
                    no_workers.append(ts)
            if not unsatisfied and not no_workers:
                return set()
    
            for ts in unsatisfied:
                e = pickle.dumps(
                    NoValidWorkerError(
                        task=ts.key,
                        host_restrictions=(ts.host_restrictions or set()).copy(),
                        worker_restrictions=(ts.worker_restrictions or set()).copy(),
                        resource_restrictions=(ts.resource_restrictions or {}).copy(),
                        timeout=self.no_workers_timeout,
                    ),
                )
                r = self.transition(
                    ts.key,
                    "erred",
                    exception=e,
                    cause=ts.key,
                    stimulus_id=stimulus_id,
                )
                recommendations.update(r)
                logger.error(
                    "Task %s marked as failed because it timed out waiting "
                    "for its restrictions to become satisfied.",
                    ts.key,
                )
            self._fail_tasks_after_no_workers_timeout(
                no_workers, recommendations, stimulus_id
            )
            return {ts.key for ts in concat([unsatisfied, no_workers])}
    
        def _check_queued_task_timeouts(
            self, timestamp: float, recommendations: Recs, stimulus_id: str
        ) -> set[Key]:
            assert self.no_workers_timeout
    
            if self._no_workers_since is None:
                return set()
    
            if timestamp <= self._no_workers_since + self.no_workers_timeout:
                return set()
            affected = list(self.queued)
            self._fail_tasks_after_no_workers_timeout(
                affected, recommendations, stimulus_id
            )
            return {ts.key for ts in affected}
    
        def _fail_tasks_after_no_workers_timeout(
            self, timed_out: Iterable[TaskState], recommendations: Recs, stimulus_id: str
        ) -> None:
            assert self.no_workers_timeout
    
            for ts in timed_out:
                e = pickle.dumps(
                    NoWorkerError(
                        task=ts.key,
                        timeout=self.no_workers_timeout,
                    ),
                )
                r = self.transition(
                    ts.key,
                    "erred",
                    exception=e,
                    cause=ts.key,
                    stimulus_id=stimulus_id,
                )
                recommendations.update(r)
                logger.error(
                    "Task %s marked as failed because it timed out waiting "
                    "without any running workers.",
                    ts.key,
                )
    
        def _refresh_no_workers_since(self, timestamp: float | None = None) -> None:
            if self.running or not (self.queued or self.unrunnable):
                self._no_workers_since = None
                return
    
            if not self._no_workers_since:
                self._no_workers_since = timestamp or monotonic()
                return
    
        def adaptive_target(self, target_duration=None):
            """Desired number of workers based on the current workload
    
            This looks at the current running tasks and memory use, and returns a
            number of desired workers.  This is often used by adaptive scheduling.
    
            Parameters
            ----------
            target_duration : str
                A desired duration of time for computations to take.  This affects
                how rapidly the scheduler will ask to scale.
    
            See Also
            --------
            distributed.deploy.Adaptive
            """
            if target_duration is None:
                target_duration = dask.config.get("distributed.adaptive.target-duration")
            target_duration = parse_timedelta(target_duration)
    
            # CPU
            queued = take(100, concat([self.queued, self.unrunnable.keys()]))
            queued_occupancy = 0
            for ts in queued:
                queued_occupancy += self._get_prefix_duration(ts.prefix)
    
            tasks_ready = len(self.queued) + len(self.unrunnable)
            if tasks_ready > 100:
                queued_occupancy *= tasks_ready / 100
    
            cpu = math.ceil((self.total_occupancy + queued_occupancy) / target_duration)
    
            # Avoid a few long tasks from asking for many cores
            for ws in self.workers.values():
                if tasks_ready > cpu:
                    break
                tasks_ready += len(ws.processing)
            else:
                cpu = min(tasks_ready, cpu)
    
            # Divide by average nthreads per worker
            if self.workers:
                nthreads = sum(ws.nthreads for ws in self.workers.values())
                cpu = math.ceil(cpu / nthreads * len(self.workers))
    
            if (self.unrunnable or self.queued) and not self.workers:
                cpu = max(1, cpu)
    
            # add more workers if more than 60% of memory is used
            limit = sum(ws.memory_limit for ws in self.workers.values())
            used = sum(ws.nbytes for ws in self.workers.values())
            memory = 0
            if used > 0.6 * limit and limit > 0:
                memory = 2 * len(self.workers)
    
            target = max(memory, cpu)
            if target >= len(self.workers):
                return target
            else:  # Scale down?
                to_close = self.workers_to_close()
                return len(self.workers) - len(to_close)
    
        def request_acquire_replicas(
            self, addr: str, keys: Iterable[Key], *, stimulus_id: str
        ) -> None:
            """Asynchronously ask a worker to acquire a replica of the listed keys from
            other workers. This is a fire-and-forget operation which offers no feedback for
            success or failure, and is intended for housekeeping and not for computation.
            """
            who_has = {}
            nbytes = {}
            for key in keys:
                ts = self.tasks[key]
                assert ts.who_has
                who_has[key] = [ws.address for ws in ts.who_has or ()]
                nbytes[key] = ts.nbytes
    
            self.stream_comms[addr].send(
                {
                    "op": "acquire-replicas",
                    "who_has": who_has,
                    "nbytes": nbytes,
                    "stimulus_id": stimulus_id,
                },
            )
    
        def request_remove_replicas(
            self, addr: str, keys: list[Key], *, stimulus_id: str
        ) -> None:
            """Asynchronously ask a worker to discard its replica of the listed keys.
            This must never be used to destroy the last replica of a key. This is a
            fire-and-forget operation, intended for housekeeping and not for computation.
    
            The replica disappears immediately from TaskState.who_has on the Scheduler side;
            if the worker refuses to delete, e.g. because the task is a dependency of
            another task running on it, it will (also asynchronously) inform the scheduler
            to re-add itself to who_has. If the worker agrees to discard the task, there is
            no feedback.
            """
            ws = self.workers[addr]
    
            # The scheduler immediately forgets about the replica and suggests the worker to
            # drop it. The worker may refuse, at which point it will send back an add-keys
            # message to reinstate it.
            for key in keys:
                ts = self.tasks[key]
                if self.validate:
                    # Do not destroy the last copy
                    assert ts.who_has
                    assert len(ts.who_has) > 1
                self.remove_replica(ts, ws)
    
            self.stream_comms[addr].send(
                {
                    "op": "remove-replicas",
                    "keys": keys,
                    "stimulus_id": stimulus_id,
                }
            )
    
    
    def _task_to_report_msg(ts: TaskState) -> dict[str, Any] | None:
        if ts.state == "forgotten":
            return {"op": "cancelled-keys", "keys": [ts.key]}
        elif ts.state == "memory":
            return {"op": "key-in-memory", "key": ts.key}
        elif ts.state == "erred":
            failing_ts = ts.exception_blame
            assert failing_ts
            return {
                "op": "task-erred",
                "key": ts.key,
                "exception": failing_ts.exception,
                "traceback": failing_ts.traceback,
            }
        else:
            return None
    
    
    def _task_to_client_msgs(ts: TaskState) -> Msgs:
        if ts.who_wants:
            report_msg = _task_to_report_msg(ts)
            if report_msg is not None:
                return {cs.client_key: [report_msg] for cs in ts.who_wants}
        return {}
    
    
    def decide_worker(
        ts: TaskState,
        all_workers: set[WorkerState],
        valid_workers: set[WorkerState] | None,
        objective: Callable[[WorkerState], Any],
    ) -> WorkerState | None:
        """
        Decide which worker should take task *ts*.
    
        We choose the worker that has the data on which *ts* depends.
    
        If several workers have dependencies then we choose the less-busy worker.
    
        Optionally provide *valid_workers* of where jobs are allowed to occur
        (if all workers are allowed to take the task, pass None instead).
    
        If the task requires data communication because no eligible worker has
        all the dependencies already, then we choose to minimize the number
        of bytes sent between workers.  This is determined by calling the
        *objective* function.
        """
        assert all(dts.who_has for dts in ts.dependencies)
        if ts.actor:
            candidates = all_workers.copy()
        else:
            candidates = {wws for dts in ts.dependencies for wws in dts.who_has or ()}
            candidates &= all_workers
        if valid_workers is None:
            if not candidates:
                candidates = all_workers.copy()
        else:
            candidates &= valid_workers
            if not candidates:
                candidates = valid_workers
                if not candidates:
                    if ts.loose_restrictions:
                        return decide_worker(ts, all_workers, None, objective)
    
        if not candidates:
            return None
        elif len(candidates) == 1:
            return next(iter(candidates))
        else:
            return min(candidates, key=objective)
    
    
    def validate_task_state(ts: TaskState) -> None:
        """Validate the given TaskState"""
        assert ts.state in ALL_TASK_STATES, ts
    
        if ts.waiting_on:
            assert ts.waiting_on.issubset(ts.dependencies), (
                "waiting not subset of dependencies",
                str(ts.waiting_on),
                str(ts.dependencies),
            )
        if ts.waiters:
            assert ts.waiters.issubset(ts.dependents), (
                "waiters not subset of dependents",
                str(ts.waiters),
                str(ts.dependents),
            )
    
        for dts in ts.waiting_on or ():
            assert not dts.who_has, ("waiting on in-memory dep", str(ts), str(dts))
            assert dts.state != "released", ("waiting on released dep", str(ts), str(dts))
        for dts in ts.dependencies:
            assert ts in dts.dependents, (
                "not in dependency's dependents",
                str(ts),
                str(dts),
                str(dts.dependents),
            )
            if ts.state in ("waiting", "queued", "processing", "no-worker"):
                assert ts.waiting_on and dts in ts.waiting_on or dts.who_has, (
                    "dep missing",
                    str(ts),
                    str(dts),
                )
            assert dts.state != "forgotten"
    
        for dts in ts.waiters or ():
            assert dts.state in ("waiting", "queued", "processing", "no-worker"), (
                "waiter not in play",
                str(ts),
                str(dts),
            )
        for dts in ts.dependents:
            assert ts in dts.dependencies, (
                "not in dependent's dependencies",
                str(ts),
                str(dts),
                str(dts.dependencies),
            )
            assert dts.state != "forgotten"
    
        assert (ts.processing_on is not None) == (ts.state == "processing")
        assert bool(ts.who_has) == (ts.state == "memory"), (ts, ts.who_has, ts.state)
    
        if ts.state == "queued":
            assert not ts.processing_on
            assert not ts.who_has
            assert all(dts.who_has for dts in ts.dependencies), (
                "task queued without all deps",
                str(ts),
                str(ts.dependencies),
            )
    
        if ts.state == "processing":
            assert all(dts.who_has for dts in ts.dependencies), (
                "task processing without all deps",
                str(ts),
                str(ts.dependencies),
            )
            assert not ts.waiting_on
    
        if ts.who_has:
            assert ts.waiters or ts.who_wants, (
                "unneeded task in memory",
                str(ts),
                str(ts.who_has),
            )
            if ts.run_spec:  # was computed
                assert ts.type
                assert isinstance(ts.type, str)
            assert not any(
                [
                    ts in dts.waiting_on
                    for dts in ts.dependents
                    if dts.waiting_on is not None
                ]
            )
            for ws in ts.who_has:
                assert ts in ws.has_what, (
                    "not in who_has' has_what",
                    str(ts),
                    str(ws),
                    str(ws.has_what),
                )
    
        for cs in ts.who_wants or ():
            assert ts in cs.wants_what, (
                "not in who_wants' wants_what",
                str(ts),
                str(cs),
                str(cs.wants_what),
            )
    
        if ts.actor:
            if ts.state == "memory":
                assert ts.who_has
                assert sum(ts in ws.actors for ws in ts.who_has) == 1
            if ts.state == "processing":
                assert ts.processing_on
                assert ts in ts.processing_on.actors
            assert ts.state != "queued"
    
    
    def validate_unrunnable(unrunnable: dict[TaskState, float]) -> None:
        prev_unrunnable_since: float | None = None
        prev_ts: TaskState | None = None
        for ts, unrunnable_since in unrunnable.items():
            assert ts.state == "no-worker"
            if prev_ts is not None:
                assert prev_unrunnable_since is not None
                # Ensure that unrunnable_since is monotonically increasing when iterating over unrunnable.
                # _check_no_workers relies on this.
                assert prev_unrunnable_since <= unrunnable_since, (
                    prev_ts,
                    ts,
                    prev_unrunnable_since,
                    unrunnable_since,
                )
            prev_ts = ts
            prev_unrunnable_since = unrunnable_since
    
    
    def validate_worker_state(ws: WorkerState) -> None:
        for ts in ws.has_what or ():
            assert ts.who_has
            assert ws in ts.who_has, (
                "not in has_what' who_has",
                str(ws),
                str(ts),
                str(ts.who_has),
            )
    
        for ts in ws.actors:
            assert ts.state in ("memory", "processing")
    
    
    def validate_state(
        tasks: dict[Key, TaskState],
        workers: dict[str, WorkerState],
        clients: dict[str, ClientState],
    ) -> None:
        """Validate a current runtime state.
    
        This performs a sequence of checks on the entire graph, running in about linear
        time. This raises assert errors if anything doesn't check out.
        """
        for ts in tasks.values():
            validate_task_state(ts)
    
        for ws in workers.values():
            validate_worker_state(ws)
    
        for cs in clients.values():
            for ts in cs.wants_what or ():
                assert ts.who_wants
                assert cs in ts.who_wants, (
                    "not in wants_what' who_wants",
                    str(cs),
                    str(ts),
                    str(ts.who_wants),
                )
    
    
    def heartbeat_interval(n: int) -> float:
        """Interval in seconds that we desire heartbeats based on number of workers"""
        if n <= 10:
            return 0.5
        elif n < 50:
            return 1
        elif n < 200:
            return 2
        else:
            # No more than 200 heartbeats a second scaled by workers
            return n / 200 + 1
    
    
    def _task_slots_available(ws: WorkerState, saturation_factor: float) -> int:
        """Number of tasks that can be sent to this worker without oversaturating it"""
        assert not math.isinf(saturation_factor)
        return max(math.ceil(saturation_factor * ws.nthreads), 1) - (
            len(ws.processing) - len(ws.long_running)
        )
    
    
    def _worker_full(ws: WorkerState, saturation_factor: float) -> bool:
        if math.isinf(saturation_factor):
            return False
        return _task_slots_available(ws, saturation_factor) <= 0
    
    
    class KilledWorker(Exception):
        def __init__(self, task: Key, last_worker: WorkerState, allowed_failures: int):
            super().__init__(task, last_worker, allowed_failures)
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def last_worker(self) -> WorkerState:
            return self.args[1]
    
        @property
        def allowed_failures(self) -> int:
            return self.args[2]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} on {self.allowed_failures + 1} "
                "different workers, but all those workers died while running it. "
                f"The last worker that attempt to run the task was {self.last_worker.address}. "
                "Inspecting worker logs is often a good next step to diagnose what went wrong. "
                "For more information see https://distributed.dask.org/en/stable/killed.html."
            )
    
    
    class NoValidWorkerError(Exception):
        def __init__(
            self,
            task: Key,
            host_restrictions: set[str],
            worker_restrictions: set[str],
            resource_restrictions: dict[str, float],
            timeout: float,
        ):
            super().__init__(
                task, host_restrictions, worker_restrictions, resource_restrictions, timeout
            )
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def host_restrictions(self) -> Any:
            return self.args[1]
    
        @property
        def worker_restrictions(self) -> Any:
            return self.args[2]
    
        @property
        def resource_restrictions(self) -> Any:
            return self.args[3]
    
        @property
        def timeout(self) -> float:
            return self.args[4]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} but timed out after {format_time(self.timeout)} "
                "waiting for a valid worker matching all restrictions.\n\nRestrictions:\n"
                f"host_restrictions={self.host_restrictions!s}\n"
                f"worker_restrictions={self.worker_restrictions!s}\n"
                f"resource_restrictions={self.resource_restrictions!s}\n"
            )
    
    
    class NoWorkerError(Exception):
        def __init__(self, task: Key, timeout: float):
            super().__init__(task, timeout)
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def timeout(self) -> float:
            return self.args[1]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} but timed out after {format_time(self.timeout)} "
                "waiting without any running workers."
            )
    
    
    class WorkerStatusPlugin(SchedulerPlugin):
        """A plugin to share worker status with a remote observer
    
        This is used in cluster managers to keep updated about the status of the scheduler.
        """
    
        name: ClassVar[str] = "worker-status"
        bcomm: BatchedSend
    
        def __init__(self, scheduler: Scheduler, comm: Comm):
            self.bcomm = BatchedSend(interval="5ms")
            self.bcomm.start(comm)
            scheduler.add_plugin(self)
    
        def add_worker(self, scheduler: Scheduler, worker: str) -> None:
            ident = scheduler.workers[worker].identity()
            del ident["metrics"]
            del ident["last_seen"]
            try:
                self.bcomm.send(["add", {"workers": {worker: ident}}])
            except CommClosedError:
                scheduler.remove_plugin(name=self.name)
    
        def remove_worker(self, scheduler: Scheduler, worker: str, **kwargs: Any) -> None:
            try:
                self.bcomm.send(["remove", worker])
            except CommClosedError:
                scheduler.remove_plugin(name=self.name)
    
        def teardown(self) -> None:
            self.bcomm.close()
    
    
    class CollectTaskMetaDataPlugin(SchedulerPlugin):
        scheduler: Scheduler
        name: str
        keys: set[Key]
        metadata: dict[Key, Any]
        state: dict[Key, TaskStateState]
    
        def __init__(self, scheduler: Scheduler, name: str):
            self.scheduler = scheduler
            self.name = name
            self.keys = set()
            self.metadata = {}
            self.state = {}
    
        def update_graph(
            self,
            scheduler: Scheduler,
            *,
            keys: set[Key],
            **kwargs: Any,
        ) -> None:
            self.keys.update(keys)
    
        def transition(
            self,
            key: Key,
            start: TaskStateState,
            finish: TaskStateState,
            *args: Any,
            **kwargs: Any,
        ) -> None:
            if finish in ("memory", "erred"):
                ts = self.scheduler.tasks.get(key)
                if ts is not None and ts.key in self.keys:
                    self.metadata[key] = ts.metadata
                    self.state[key] = finish
                    self.keys.discard(key)
    
    
    def _materialize_graph(
        expr: Expr,
        global_annotations: dict[str, Any],
        validate: bool,
    ) -> tuple[dict[Key, T_runspec], dict[Key, set[Key]], dict[str, dict[Key, Any]]]:
>       dsk: dict = expr.__dask_graph__()
E       AttributeError: 'dict' object has no attribute '__dask_graph__'

distributed/scheduler.py:9383: AttributeError

Check warning on line 0 in distributed.cli.tests.test_dask_worker

github-actions / Unit Test Results

1 out of 5 runs failed: test_respect_host_listen_address[127.0.0.2---no-nanny] (distributed.cli.tests.test_dask_worker)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 0s]

Raw output


            AttributeError: 'dict' object has no attribute '__dask_graph__'
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:41191', workers: 0, cores: 0, tasks: 0>
nanny = '--no-nanny', host = '127.0.0.2'

    @pytest.mark.slow
    @pytest.mark.skipif(not LINUX, reason="Need 127.0.0.2 to mean localhost")
    @pytest.mark.parametrize("nanny", ["--nanny", "--no-nanny"])
    @pytest.mark.parametrize("host", ["127.0.0.2", "0.0.0.0"])
    @gen_cluster(client=True, nthreads=[])
    async def test_respect_host_listen_address(c, s, nanny, host):
        with popen(
            [
                sys.executable,
                "-m",
                "dask",
                "worker",
                s.address,
                nanny,
                "--no-dashboard",
                "--host",
                host,
            ]
        ):
            await c.wait_for_workers(1)
    
            # roundtrip works
>           assert await c.submit(lambda x: x + 1, 10) == 11

distributed/cli/tests/test_dask_worker.py:569: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
distributed/client.py:408: in _result
    raise exc.with_traceback(tb)
distributed/utils.py:1507: in run_in_executor_with_context
    return await loop.run_in_executor(
../../../miniconda3/envs/dask-distributed/lib/python3.10/concurrent/futures/thread.py:58: in run
    result = self.fn(*self.args, **self.kwargs)
distributed/utils.py:1508: in <lambda>
    executor, lambda: context.run(func, *args, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import dataclasses
    import heapq
    import inspect
    import itertools
    import json
    import logging
    import math
    import operator
    import os
    import pickle
    import random
    import textwrap
    import uuid
    import warnings
    import weakref
    from abc import abstractmethod
    from collections import defaultdict, deque
    from collections.abc import (
        Callable,
        Collection,
        Container,
        Hashable,
        Iterable,
        Iterator,
        Mapping,
        Sequence,
        Set,
    )
    from contextlib import suppress
    from functools import partial
    from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple, cast, overload
    
    import psutil
    import tornado.web
    from sortedcontainers import SortedDict, SortedSet
    from tlz import (
        concat,
        first,
        groupby,
        merge,
        merge_sorted,
        merge_with,
        partition,
        pluck,
        second,
        take,
        valmap,
    )
    from tornado.ioloop import IOLoop
    
    import dask
    import dask.utils
    from dask._task_spec import DependenciesMapping, GraphNode, convert_legacy_graph
    from dask.core import istask, validate_key
    from dask.typing import Key, no_default
    from dask.utils import (
        _deprecated,
        _deprecated_kwarg,
        format_bytes,
        format_time,
        key_split,
        parse_bytes,
        parse_timedelta,
        tmpfile,
    )
    from dask.widgets import get_template
    
    from distributed import cluster_dump, preloading, profile
    from distributed import versions as version_module
    from distributed._asyncio import RLock
    from distributed._stories import scheduler_story
    from distributed.active_memory_manager import ActiveMemoryManagerExtension, RetireWorker
    from distributed.batched import BatchedSend
    from distributed.broker import Broker
    from distributed.client import SourceCode
    from distributed.collections import HeapSet
    from distributed.comm import (
        Comm,
        CommClosedError,
        get_address_host,
        normalize_address,
        resolve_address,
        unparse_host_port,
    )
    from distributed.comm.addressing import addresses_from_user_args
    from distributed.compatibility import PeriodicCallback
    from distributed.core import (
        ErrorMessage,
        OKMessage,
        Status,
        clean_exception,
        error_message,
        rpc,
        send_recv,
    )
    from distributed.diagnostics.memory_sampler import MemorySamplerExtension
    from distributed.diagnostics.plugin import SchedulerPlugin, _get_plugin_name
    from distributed.event import EventExtension
    from distributed.gc import disable_gc_diagnosis, enable_gc_diagnosis
    from distributed.http import get_handlers
    from distributed.metrics import monotonic, time
    from distributed.multi_lock import MultiLockExtension
    from distributed.node import ServerNode
    from distributed.proctitle import setproctitle
    from distributed.protocol import deserialize
    from distributed.protocol.pickle import dumps, loads
    from distributed.protocol.serialize import Serialized, ToPickle, serialize
    from distributed.publish import PublishExtension
    from distributed.pubsub import PubSubSchedulerExtension
    from distributed.queues import QueueExtension
    from distributed.recreate_tasks import ReplayTaskScheduler
    from distributed.security import Security
    from distributed.semaphore import SemaphoreExtension
    from distributed.shuffle import ShuffleSchedulerPlugin
    from distributed.spans import SpanMetadata, SpansSchedulerExtension
    from distributed.stealing import WorkStealing
    from distributed.utils import (
        All,
        Deadline,
        TimeoutError,
        format_dashboard_link,
        get_fileno_limit,
        key_split_group,
        log_errors,
        offload,
        recursive_to_dict,
        wait_for,
    )
    from distributed.utils_comm import (
        gather_from_workers,
        retry_operation,
        scatter_to_workers,
    )
    from distributed.variable import VariableExtension
    
    if TYPE_CHECKING:
        # TODO import from typing (requires Python >=3.10)
        # TODO import from typing (requires Python >=3.11)
        from typing_extensions import Self, TypeAlias
    
        from dask._expr import Expr
    
    # Not to be confused with distributed.worker_state_machine.TaskStateState
    TaskStateState: TypeAlias = Literal[
        "released",
        "waiting",
        "no-worker",
        "queued",
        "processing",
        "memory",
        "erred",
        "forgotten",
    ]
    
    ALL_TASK_STATES: Set[TaskStateState] = set(TaskStateState.__args__)  # type: ignore
    
    # {task key -> finish state}
    # Not to be confused with distributed.worker_state_machine.Recs
    Recs: TypeAlias = dict[Key, TaskStateState]
    # {client or worker address: [{op: <key>, ...}, ...]}
    Msgs: TypeAlias = dict[str, list[dict[str, Any]]]
    # (recommendations, client messages, worker messages)
    RecsMsgs: TypeAlias = tuple[Recs, Msgs, Msgs]
    
    T_runspec: TypeAlias = GraphNode
    
    logger = logging.getLogger(__name__)
    LOG_PDB = dask.config.get("distributed.admin.pdb-on-err")
    DEFAULT_DATA_SIZE = parse_bytes(
        dask.config.get("distributed.scheduler.default-data-size")
    )
    STIMULUS_ID_UNSET = "<stimulus_id unset>"
    
    DEFAULT_EXTENSIONS = {
        "multi_locks": MultiLockExtension,
        "publish": PublishExtension,
        "replay-tasks": ReplayTaskScheduler,
        "queues": QueueExtension,
        "variables": VariableExtension,
        "pubsub": PubSubSchedulerExtension,
        "semaphores": SemaphoreExtension,
        "events": EventExtension,
        "amm": ActiveMemoryManagerExtension,
        "memory_sampler": MemorySamplerExtension,
        "shuffle": ShuffleSchedulerPlugin,
        "spans": SpansSchedulerExtension,
        "stealing": WorkStealing,
    }
    
    
    class ClientState:
        """A simple object holding information about a client."""
    
        #: A unique identifier for this client. This is generally an opaque
        #: string generated by the client itself.
        client_key: str
    
        #: Cached hash of :attr:`~ClientState.client_key`
        _hash: int
    
        #: A set of tasks this client wants to be kept in memory, so that it can download
        #: its result when desired. This is the reverse mapping of
        #: :class:`TaskState.who_wants`. Tasks are typically removed from this set when the
        #: corresponding object in the client's space (for example a ``Future`` or a Dask
        #: collection) gets garbage-collected.
        wants_what: set[TaskState]
    
        #: The last time we received a heartbeat from this client, in local scheduler time.
        last_seen: float
    
        #: Output of :func:`distributed.versions.get_versions` on the client
        versions: dict[str, Any]
    
        __slots__ = tuple(__annotations__)
    
        def __init__(self, client: str, *, versions: dict[str, Any] | None = None):
            self.client_key = client
            self._hash = hash(client)
            self.wants_what = set()
            self.last_seen = time()
            self.versions = versions or {}
    
        def __hash__(self) -> int:
            return self._hash
    
        def __eq__(self, other: object) -> bool:
            if not isinstance(other, ClientState):
                return False
            return self.client_key == other.client_key
    
        def __repr__(self) -> str:
            return f"<Client {self.client_key!r}>"
    
        def __str__(self) -> str:
            return self.client_key
    
        def _to_dict_no_nest(self, *, exclude: Container[str] = ()) -> dict:
            """Dictionary representation for debugging purposes.
            Not type stable and not intended for roundtrips.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            TaskState._to_dict
            """
            return recursive_to_dict(
                self,
                exclude=set(exclude) | {"versions"},  # type: ignore
                members=True,
            )
    
    
    class MemoryState:
        """Memory readings on a worker or on the whole cluster.
    
        See :doc:`worker-memory`.
    
        Attributes / properties:
    
        managed_total
            Sum of the output of sizeof() for all dask keys held by the worker in memory,
            plus number of bytes spilled to disk
    
        managed
            Sum of the output of sizeof() for the dask keys held in RAM. Note that this may
            be inaccurate, which may cause inaccurate unmanaged memory (see below).
    
        spilled
            Number of bytes  for the dask keys spilled to the hard drive.
            Note that this is the size on disk; size in memory may be different due to
            compression and inaccuracies in sizeof(). In other words, given the same keys,
            'managed' will change depending on the keys being in memory or spilled.
    
        process
            Total RSS memory measured by the OS on the worker process.
            This is always exactly equal to managed + unmanaged.
    
        unmanaged
            process - managed. This is the sum of
    
            - Python interpreter and modules
            - global variables
            - memory temporarily allocated by the dask tasks that are currently running
            - memory fragmentation
            - memory leaks
            - memory not yet garbage collected
            - memory not yet free()'d by the Python memory manager to the OS
    
        unmanaged_old
            Minimum of the 'unmanaged' measures over the last
            ``distributed.memory.recent-to-old-time`` seconds
    
        unmanaged_recent
            unmanaged - unmanaged_old; in other words process memory that has been recently
            allocated but is not accounted for by dask; hopefully it's mostly a temporary
            spike.
    
        optimistic
            managed + unmanaged_old; in other words the memory held long-term by
            the process under the hopeful assumption that all unmanaged_recent memory is a
            temporary spike
        """
    
        process: int
        unmanaged_old: int
        managed: int
        spilled: int
    
        __slots__ = tuple(__annotations__)
    
        def __init__(
            self,
            *,
            process: int,
            unmanaged_old: int,
            managed: int,
            spilled: int,
        ):
            # Some data arrives with the heartbeat, some other arrives in realtime as the
            # tasks progress. Also, sizeof() is not guaranteed to return correct results.
            # This can cause glitches where a partial measure is larger than the whole, so
            # we need to force all numbers to add up exactly by definition.
            self.process = process
            self.managed = min(self.process, managed)
            self.spilled = spilled
            # Subtractions between unsigned ints guaranteed by construction to be >= 0
            self.unmanaged_old = min(unmanaged_old, process - self.managed)
    
        @staticmethod
        def sum(*infos: MemoryState) -> MemoryState:
            process = 0
            unmanaged_old = 0
            managed = 0
            spilled = 0
            for ms in infos:
                process += ms.process
                unmanaged_old += ms.unmanaged_old
                spilled += ms.spilled
                managed += ms.managed
            return MemoryState(
                process=process,
                unmanaged_old=unmanaged_old,
                managed=managed,
                spilled=spilled,
            )
    
        @property
        def managed_total(self) -> int:
            return self.managed + self.spilled
    
        @property
        def unmanaged(self) -> int:
            # This is never negative thanks to __init__
            return self.process - self.managed
    
        @property
        def unmanaged_recent(self) -> int:
            # This is never negative thanks to __init__
            return self.process - self.managed - self.unmanaged_old
    
        @property
        def optimistic(self) -> int:
            return self.managed + self.unmanaged_old
    
        @property
        def managed_in_memory(self) -> int:
            warnings.warn("managed_in_memory has been renamed to managed", FutureWarning)
            return self.managed
    
        @property
        def managed_spilled(self) -> int:
            warnings.warn("managed_spilled has been renamed to spilled", FutureWarning)
            return self.spilled
    
        def __repr__(self) -> str:
            return (
                f"Process memory (RSS)  : {format_bytes(self.process)}\n"
                f"  - managed by Dask   : {format_bytes(self.managed)}\n"
                f"  - unmanaged (old)   : {format_bytes(self.unmanaged_old)}\n"
                f"  - unmanaged (recent): {format_bytes(self.unmanaged_recent)}\n"
                f"Spilled to disk       : {format_bytes(self.spilled)}\n"
            )
    
        def _to_dict(self, *, exclude: Container[str] = ()) -> dict:
            """Dictionary representation for debugging purposes.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            """
            return {
                k: getattr(self, k)
                for k in dir(self)
                if not k.startswith("_")
                and k not in {"sum", "managed_in_memory", "managed_spilled"}
            }
    
    
    class WorkerState:
        """A simple object holding information about a worker.
    
        Not to be confused with :class:`distributed.worker_state_machine.WorkerState`.
        """
    
        #: This worker's unique key. This can be its connected address
        #: (such as ``"tcp://127.0.0.1:8891"``) or an alias (such as ``"alice"``).
        address: str
    
        pid: int
        name: Hashable
    
        #: The number of CPU threads made available on this worker
        nthreads: int
    
        #: Memory available to the worker, in bytes
        memory_limit: int
    
        local_directory: str
        services: dict[str, int]
    
        #: Output of :meth:`distributed.versions.get_versions` on the worker
        versions: dict[str, Any]
    
        #: Address of the associated :class:`~distributed.nanny.Nanny`, if present
        nanny: str | None
    
        #: Read-only worker status, synced one way from the remote Worker object
        status: Status
    
        #: Cached hash of :attr:`~WorkerState.server_id`
        _hash: int
    
        #: The total memory size, in bytes, used by the tasks this worker holds in memory
        #: (i.e. the tasks in this worker's :attr:`~WorkerState.has_what`).
        nbytes: int
    
        #: Worker memory unknown to the worker, in bytes, which has been there for more than
        #: 30 seconds. See :class:`MemoryState`.
        _memory_unmanaged_old: int
    
        #: History of the last 30 seconds' worth of unmanaged memory. Used to differentiate
        #: between "old" and "new" unmanaged memory.
        #: Format: ``[(timestamp, bytes), (timestamp, bytes), ...]``
        _memory_unmanaged_history: deque[tuple[float, int]]
    
        metrics: dict[str, Any]
    
        #: The last time we received a heartbeat from this worker, in local scheduler time.
        last_seen: float
    
        time_delay: float
        bandwidth: float
    
        #: A set of all TaskStates on this worker that are actors. This only includes those
        #: actors whose state actually lives on this worker, not actors to which this worker
        #: has a reference.
        actors: set[TaskState]
    
        #: Underlying data of :meth:`WorkerState.has_what`
        _has_what: dict[TaskState, None]
    
        #: A set of tasks that have been submitted to this worker. Multiple tasks may be
        # submitted to a worker in advance and the worker will run them eventually,
        # depending on its execution resources (but see :doc:`work-stealing`).
        #:
        #: All the tasks here are in the "processing" state.
        #: This attribute is kept in sync with :attr:`TaskState.processing_on`.
        processing: set[TaskState]
    
        #: Running tasks that invoked :func:`distributed.secede`
        long_running: set[TaskState]
    
        #: A dictionary of tasks that are currently being run on this worker.
        #: Each task state is associated with the duration in seconds which the task has
        #: been running.
        executing: dict[TaskState, float]
    
        #: The available resources on this worker, e.g. ``{"GPU": 2}``.
        #: These are abstract quantities that constrain certain tasks from running at the
        #: same time on this worker.
        resources: dict[str, float]
    
        #: The sum of each resource used by all tasks allocated to this worker.
        #: The numbers in this dictionary can only be less or equal than those in this
        #: worker's :attr:`~WorkerState.resources`.
        used_resources: dict[str, float]
    
        #: Arbitrary additional metadata to be added to :meth:`~WorkerState.identity`
        extra: dict[str, Any]
    
        # The unique server ID this WorkerState is referencing
        server_id: str
    
        # Reference to scheduler task_groups
        scheduler_ref: weakref.ref[SchedulerState] | None
        task_prefix_count: defaultdict[str, int]
        _network_occ: int
        _occupancy_cache: float | None
    
        #: Keys that may need to be fetched to this worker, and the number of tasks that need them.
        #: All tasks are currently in `memory` on a worker other than this one.
        #: Much like `processing`, this does not exactly reflect worker state:
        #: keys here may be queued to fetch, in flight, or already in memory
        #: on the worker.
        needs_what: dict[TaskState, int]
    
        __slots__ = tuple(__annotations__)
    
        def __init__(
            self,
            *,
            address: str,
            status: Status,
            pid: int,
            name: object,
            nthreads: int = 0,
            memory_limit: int,
            local_directory: str,
            nanny: str | None,
            server_id: str,
            services: dict[str, int] | None = None,
            versions: dict[str, Any] | None = None,
            extra: dict[str, Any] | None = None,
            scheduler: SchedulerState | None = None,
        ):
            self.server_id = server_id
            self.address = address
            self.pid = pid
            self.name = name
            self.nthreads = nthreads
            self.memory_limit = memory_limit
            self.local_directory = local_directory
            self.services = services or {}
            self.versions = versions or {}
            self.nanny = nanny
            self.status = status
            self._hash = hash(self.server_id)
            self.nbytes = 0
            self._memory_unmanaged_old = 0
            self._memory_unmanaged_history = deque()
            self.metrics = {}
            self.last_seen = time()
            self.time_delay = 0
            self.bandwidth = parse_bytes(dask.config.get("distributed.scheduler.bandwidth"))
            self.actors = set()
            self._has_what = {}
            self.processing = set()
            self.long_running = set()
            self.executing = {}
            self.resources = {}
            self.used_resources = {}
            self.extra = extra or {}
            self.scheduler_ref = weakref.ref(scheduler) if scheduler else None
            self.task_prefix_count = defaultdict(int)
            self.needs_what = {}
            self._network_occ = 0
            self._occupancy_cache = None
    
        def __hash__(self) -> int:
            return self._hash
    
        def __eq__(self, other: object) -> bool:
            return self is other or (
                isinstance(other, WorkerState) and other.server_id == self.server_id
            )
    
        @property
        def has_what(self) -> Set[TaskState]:
            """An insertion-sorted set-like of tasks which currently reside on this worker.
            All the tasks here are in the "memory" state.
            This is the reverse mapping of :attr:`TaskState.who_has`.
    
            This is a read-only public accessor. The data is implemented as a dict without
            values, because rebalance() relies on dicts being insertion-sorted.
            """
            return self._has_what.keys()
    
        @property
        def host(self) -> str:
            return get_address_host(self.address)
    
        @property
        def memory(self) -> MemoryState:
            """Polished memory metrics for the worker.
    
            **Design note on managed memory**
    
            There are two measures available for managed memory:
    
            - ``self.nbytes``
            - ``self.metrics["managed_bytes"]``
    
            At rest, the two numbers must be identical. However, ``self.nbytes`` is
            immediately updated through the batched comms as soon as each task lands in
            memory on the worker; ``self.metrics["managed_bytes"]`` instead is updated by
            the heartbeat, which can lag several seconds behind.
    
            Below we are mixing likely newer managed memory info from ``self.nbytes`` with
            process and spilled memory from the heartbeat. This is deliberate, so that
            managed memory total is updated more frequently.
    
            Managed memory directly and immediately contributes to optimistic memory, which
            is in turn used in Active Memory Manager heuristics (at the moment of writing;
            more uses will likely be added in the future). So it's important to have it
            up to date; much more than it is for process memory.
    
            Having up-to-date managed memory info as soon as the scheduler learns about
            task completion also substantially simplifies unit tests.
    
            The flip side of this design is that it may cause some noise in the
            unmanaged_recent measure. e.g.:
    
            1. Delete 100MB of managed data
            2. The updated managed memory reaches the scheduler faster than the
               updated process memory
            3. There's a blip where the scheduler thinks that there's a sudden 100MB
               increase in unmanaged_recent, since process memory hasn't changed but managed
               memory has decreased by 100MB
            4. When the heartbeat arrives, process memory goes down and so does the
               unmanaged_recent.
    
            This is OK - one of the main reasons for the unmanaged_recent / unmanaged_old
            split is exactly to concentrate all the noise in unmanaged_recent and exclude it
            from optimistic memory, which is used for heuristics.
    
            Something that is less OK, but also less frequent, is that the sudden deletion
            of spilled keys will cause a negative blip in managed memory:
    
            1. Delete 100MB of spilled data
            2. The updated managed memory *total* reaches the scheduler faster than the
               updated spilled portion
            3. This causes the managed memory to temporarily plummet and be replaced by
               unmanaged_recent, while spilled memory remains unaltered
            4. When the heartbeat arrives, managed goes back up, unmanaged_recent
               goes back down, and spilled goes down by 100MB as it should have to
               begin with.
    
            :issue:`6002` will let us solve this.
            """
            return MemoryState(
                process=self.metrics["memory"],
                managed=max(0, self.nbytes - self.metrics["spilled_bytes"]["memory"]),
                spilled=self.metrics["spilled_bytes"]["disk"],
                unmanaged_old=self._memory_unmanaged_old,
            )
    
        def clean(self) -> WorkerState:
            """Return a version of this object that is appropriate for serialization"""
            ws = WorkerState(
                address=self.address,
                status=self.status,
                pid=self.pid,
                name=self.name,
                nthreads=self.nthreads,
                memory_limit=self.memory_limit,
                local_directory=self.local_directory,
                services=self.services,
                nanny=self.nanny,
                extra=self.extra,
                server_id=self.server_id,
            )
            ws._occupancy_cache = self.occupancy
    
            ws.executing = {ts.key: duration for ts, duration in self.executing.items()}  # type: ignore
            return ws
    
        def __repr__(self) -> str:
            name = f", name: {self.name}" if self.name != self.address else ""
            return (
                f"<WorkerState {self.address!r}{name}, "
                f"status: {self.status.name}, "
                f"memory: {len(self.has_what)}, "
                f"processing: {len(self.processing)}>"
            )
    
        def _repr_html_(self) -> str:
            return get_template("worker_state.html.j2").render(
                address=self.address,
                name=self.name,
                status=self.status.name,
                has_what=self.has_what,
                processing=self.processing,
            )
    
        def identity(self) -> dict[str, Any]:
            return {
                "type": "Worker",
                "id": self.name,
                "host": self.host,
                "resources": self.resources,
                "local_directory": self.local_directory,
                "name": self.name,
                "nthreads": self.nthreads,
                "memory_limit": self.memory_limit,
                "last_seen": self.last_seen,
                "services": self.services,
                "metrics": self.metrics,
                "status": self.status.name,
                "nanny": self.nanny,
                **self.extra,
            }
    
        def _to_dict_no_nest(self, *, exclude: Container[str] = ()) -> dict[str, Any]:
            """Dictionary representation for debugging purposes.
            Not type stable and not intended for roundtrips.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            TaskState._to_dict
            """
            return recursive_to_dict(
                self,
                exclude=set(exclude) | {"versions"},  # type: ignore
                members=True,
            )
    
        @property
        def scheduler(self) -> SchedulerState:
            assert self.scheduler_ref
            s = self.scheduler_ref()
            assert s
            return s
    
        def add_to_processing(self, ts: TaskState) -> None:
            """Assign a task to this worker for compute."""
            if self.scheduler.validate:
                assert ts not in self.processing
    
            tp = ts.prefix
            self.task_prefix_count[tp.name] += 1
            self.scheduler._task_prefix_count_global[tp.name] += 1
            self.processing.add(ts)
            for dts in ts.dependencies:
                assert dts.who_has
                if self not in dts.who_has:
                    self._inc_needs_replica(dts)
    
        def add_to_long_running(self, ts: TaskState) -> None:
            if self.scheduler.validate:
                assert ts in self.processing
                assert ts not in self.long_running
    
            self._remove_from_task_prefix_count(ts)
            # Cannot remove from processing since we're using this for things like
            # idleness detection. Idle workers are typically targeted for
            # downscaling but we should not downscale workers with long running
            # tasks
            self.long_running.add(ts)
    
        def remove_from_processing(self, ts: TaskState) -> None:
            """Remove a task from a workers processing"""
            if self.scheduler.validate:
                assert ts in self.processing
    
            if ts in self.long_running:
                self.long_running.discard(ts)
            else:
                self._remove_from_task_prefix_count(ts)
            self.processing.remove(ts)
            for dts in ts.dependencies:
                if dts in self.needs_what:
                    self._dec_needs_replica(dts)
    
        def _remove_from_task_prefix_count(self, ts: TaskState) -> None:
            prefix_name = ts.prefix.name
            count = self.task_prefix_count[prefix_name] - 1
            tp_count = self.task_prefix_count
            tp_count_global = self.scheduler._task_prefix_count_global
            if count:
                tp_count[prefix_name] = count
            else:
                del tp_count[prefix_name]
    
            count = tp_count_global[prefix_name] - 1
            if count:
                tp_count_global[prefix_name] = count
            else:
                del tp_count_global[prefix_name]
    
        def remove_replica(self, ts: TaskState) -> None:
            """The worker no longer has a task in memory"""
            if self.scheduler.validate:
                assert ts.who_has
                assert self in ts.who_has
                assert ts in self.has_what
                assert ts not in self.needs_what
    
            self.nbytes -= ts.get_nbytes()
            del self._has_what[ts]
            ts.who_has.remove(self)  # type: ignore
            if not ts.who_has:
                ts.who_has = None
    
        def _inc_needs_replica(self, ts: TaskState) -> None:
            """Assign a task fetch to this worker a…cheduler, title="Scheduler Profile (administrative)"
            )
            task_stream = TabPanel(child=task_stream, title="Task Stream")
            bandwidth_workers = TabPanel(
                child=bandwidth_workers.root, title="Bandwidth (Workers)"
            )
            bandwidth_types = TabPanel(
                child=bandwidth_types.root, title="Bandwidth (Types)"
            )
            system = TabPanel(child=sysmon.root, title="System")
            logs = TabPanel(child=logs.root, title="Scheduler Logs")
    
            tabs = Tabs(
                tabs=[
                    html,
                    task_stream,
                    system,
                    logs,
                    compute,
                    workers,
                    scheduler,
                    bandwidth_workers,
                    bandwidth_types,
                ],
                sizing_mode="stretch_both",
            )
    
            from bokeh.core.templates import get_env
            from bokeh.plotting import output_file, save
    
            with tmpfile(extension=".html") as fn:
                output_file(filename=fn, title="Dask Performance Report", mode=mode)
                template_directory = os.path.join(
                    os.path.dirname(os.path.abspath(__file__)), "dashboard", "templates"
                )
                template_environment = get_env()
                template_environment.loader.searchpath.append(template_directory)
                template = template_environment.get_template("performance_report.html")
                save(tabs, filename=fn, template=template)
    
                with open(fn) as f:
                    data = f.read()
    
            return data
    
        async def get_worker_logs(self, n=None, workers=None, nanny=False):
            results = await self.broadcast(
                msg={"op": "get_logs", "n": n}, workers=workers, nanny=nanny
            )
            return results
    
        def log_event(self, topic: str | Collection[str], msg: Any) -> None:
            """Log an event under a given topic
    
            Parameters
            ----------
            topic : str, list[str]
                Name of the topic under which to log an event. To log the same
                event under multiple topics, pass a list of topic names.
            msg
                Event message to log. Note this must be msgpack serializable.
    
            See also
            --------
            Client.log_event
            """
            self._broker.publish(topic, msg)
    
        def subscribe_topic(self, topic: str, client: str) -> None:
            self._broker.subscribe(topic, client)
    
        def unsubscribe_topic(self, topic: str, client: str) -> None:
            self._broker.unsubscribe(topic, client)
    
        @overload
        def get_events(self, topic: str) -> tuple[tuple[float, Any], ...]: ...
    
        @overload
        def get_events(self) -> dict[str, tuple[tuple[float, Any], ...]]: ...
    
        def get_events(
            self, topic: str | None = None
        ) -> tuple[tuple[float, Any], ...] | dict[str, tuple[tuple[float, Any], ...]]:
            return self._broker.get_events(topic)
    
        async def get_worker_monitor_info(self, recent=False, starts=None):
            if starts is None:
                starts = {}
            results = await asyncio.gather(
                *(
                    self.rpc(w).get_monitor_info(recent=recent, start=starts.get(w, 0))
                    for w in self.workers
                )
            )
            return dict(zip(self.workers, results))
    
        ###########
        # Cleanup #
        ###########
    
        @log_errors
        async def check_worker_ttl(self) -> None:
            now = time()
            stimulus_id = f"check-worker-ttl-{now}"
            assert self.worker_ttl
            ttl = max(self.worker_ttl, 10 * heartbeat_interval(len(self.workers)))
            to_restart = []
    
            for ws in self.workers.values():
                last_seen = now - ws.last_seen
                if last_seen > ttl:
                    to_restart.append(ws.address)
                    logger.warning(
                        f"Worker failed to heartbeat for {last_seen:.0f}s; "
                        f"{'attempting restart' if ws.nanny else 'removing'}: {ws}"
                    )
    
            if to_restart:
                self.log_event(
                    "scheduler",
                    {
                        "action": "worker-ttl-timed-out",
                        "workers": to_restart.copy(),
                        "ttl": ttl,
                    },
                )
                await self.restart_workers(
                    to_restart,
                    wait_for_workers=False,
                    stimulus_id=stimulus_id,
                )
    
        def check_idle(self) -> float | None:
            if self.status in (Status.closing, Status.closed):
                return None  # pragma: nocover
    
            if self.transition_counter != self._idle_transition_counter:
                self._idle_transition_counter = self.transition_counter
                self.idle_since = None
                return None
    
            if self._active_graph_updates > 0:
                self.idle_since = None
                return None
    
            if (
                self.queued
                or self.unrunnable
                or any(ws.processing for ws in self.workers.values())
            ):
                self.idle_since = None
                return None
    
            if not self.idle_since:
                self.idle_since = time()
                return self.idle_since
    
            if self.jupyter:
                last_activity = (
                    self._jupyter_server_application.web_app.last_activity().timestamp()
                )
                if last_activity > self.idle_since:
                    self.idle_since = last_activity
                    return self.idle_since
    
            if self.idle_timeout:
                if time() > self.idle_since + self.idle_timeout:
                    assert self.idle_since
                    logger.info(
                        "Scheduler closing after being idle for %s",
                        format_time(self.idle_timeout),
                    )
                    self._ongoing_background_tasks.call_soon(
                        self.close, reason="idle-timeout-exceeded"
                    )
            return self.idle_since
    
        def _check_no_workers(self) -> None:
            if (
                self.status in (Status.closing, Status.closed)
                or self.no_workers_timeout is None
            ):
                return
    
            now = monotonic()
            stimulus_id = f"check-no-workers-timeout-{time()}"
    
            recommendations: Recs = {}
    
            self._refresh_no_workers_since(now)
    
            affected = self._check_unrunnable_task_timeouts(
                now, recommendations=recommendations, stimulus_id=stimulus_id
            )
    
            affected.update(
                self._check_queued_task_timeouts(
                    now, recommendations=recommendations, stimulus_id=stimulus_id
                )
            )
            self.transitions(recommendations, stimulus_id=stimulus_id)
            if affected:
                self.log_event(
                    "scheduler",
                    {"action": "no-workers-timeout-exceeded", "keys": affected},
                )
    
        def _check_unrunnable_task_timeouts(
            self, timestamp: float, recommendations: Recs, stimulus_id: str
        ) -> set[Key]:
            assert self.no_workers_timeout
            unsatisfied = []
            no_workers = []
            for ts, unrunnable_since in self.unrunnable.items():
                if timestamp <= unrunnable_since + self.no_workers_timeout:
                    # unrunnable is insertion-ordered, which means that unrunnable_since will
                    # be monotonically increasing in this loop.
                    break
                if (
                    self._no_workers_since is None
                    or self._no_workers_since >= unrunnable_since
                ):
                    unsatisfied.append(ts)
                else:
                    no_workers.append(ts)
            if not unsatisfied and not no_workers:
                return set()
    
            for ts in unsatisfied:
                e = pickle.dumps(
                    NoValidWorkerError(
                        task=ts.key,
                        host_restrictions=(ts.host_restrictions or set()).copy(),
                        worker_restrictions=(ts.worker_restrictions or set()).copy(),
                        resource_restrictions=(ts.resource_restrictions or {}).copy(),
                        timeout=self.no_workers_timeout,
                    ),
                )
                r = self.transition(
                    ts.key,
                    "erred",
                    exception=e,
                    cause=ts.key,
                    stimulus_id=stimulus_id,
                )
                recommendations.update(r)
                logger.error(
                    "Task %s marked as failed because it timed out waiting "
                    "for its restrictions to become satisfied.",
                    ts.key,
                )
            self._fail_tasks_after_no_workers_timeout(
                no_workers, recommendations, stimulus_id
            )
            return {ts.key for ts in concat([unsatisfied, no_workers])}
    
        def _check_queued_task_timeouts(
            self, timestamp: float, recommendations: Recs, stimulus_id: str
        ) -> set[Key]:
            assert self.no_workers_timeout
    
            if self._no_workers_since is None:
                return set()
    
            if timestamp <= self._no_workers_since + self.no_workers_timeout:
                return set()
            affected = list(self.queued)
            self._fail_tasks_after_no_workers_timeout(
                affected, recommendations, stimulus_id
            )
            return {ts.key for ts in affected}
    
        def _fail_tasks_after_no_workers_timeout(
            self, timed_out: Iterable[TaskState], recommendations: Recs, stimulus_id: str
        ) -> None:
            assert self.no_workers_timeout
    
            for ts in timed_out:
                e = pickle.dumps(
                    NoWorkerError(
                        task=ts.key,
                        timeout=self.no_workers_timeout,
                    ),
                )
                r = self.transition(
                    ts.key,
                    "erred",
                    exception=e,
                    cause=ts.key,
                    stimulus_id=stimulus_id,
                )
                recommendations.update(r)
                logger.error(
                    "Task %s marked as failed because it timed out waiting "
                    "without any running workers.",
                    ts.key,
                )
    
        def _refresh_no_workers_since(self, timestamp: float | None = None) -> None:
            if self.running or not (self.queued or self.unrunnable):
                self._no_workers_since = None
                return
    
            if not self._no_workers_since:
                self._no_workers_since = timestamp or monotonic()
                return
    
        def adaptive_target(self, target_duration=None):
            """Desired number of workers based on the current workload
    
            This looks at the current running tasks and memory use, and returns a
            number of desired workers.  This is often used by adaptive scheduling.
    
            Parameters
            ----------
            target_duration : str
                A desired duration of time for computations to take.  This affects
                how rapidly the scheduler will ask to scale.
    
            See Also
            --------
            distributed.deploy.Adaptive
            """
            if target_duration is None:
                target_duration = dask.config.get("distributed.adaptive.target-duration")
            target_duration = parse_timedelta(target_duration)
    
            # CPU
            queued = take(100, concat([self.queued, self.unrunnable.keys()]))
            queued_occupancy = 0
            for ts in queued:
                queued_occupancy += self._get_prefix_duration(ts.prefix)
    
            tasks_ready = len(self.queued) + len(self.unrunnable)
            if tasks_ready > 100:
                queued_occupancy *= tasks_ready / 100
    
            cpu = math.ceil((self.total_occupancy + queued_occupancy) / target_duration)
    
            # Avoid a few long tasks from asking for many cores
            for ws in self.workers.values():
                if tasks_ready > cpu:
                    break
                tasks_ready += len(ws.processing)
            else:
                cpu = min(tasks_ready, cpu)
    
            # Divide by average nthreads per worker
            if self.workers:
                nthreads = sum(ws.nthreads for ws in self.workers.values())
                cpu = math.ceil(cpu / nthreads * len(self.workers))
    
            if (self.unrunnable or self.queued) and not self.workers:
                cpu = max(1, cpu)
    
            # add more workers if more than 60% of memory is used
            limit = sum(ws.memory_limit for ws in self.workers.values())
            used = sum(ws.nbytes for ws in self.workers.values())
            memory = 0
            if used > 0.6 * limit and limit > 0:
                memory = 2 * len(self.workers)
    
            target = max(memory, cpu)
            if target >= len(self.workers):
                return target
            else:  # Scale down?
                to_close = self.workers_to_close()
                return len(self.workers) - len(to_close)
    
        def request_acquire_replicas(
            self, addr: str, keys: Iterable[Key], *, stimulus_id: str
        ) -> None:
            """Asynchronously ask a worker to acquire a replica of the listed keys from
            other workers. This is a fire-and-forget operation which offers no feedback for
            success or failure, and is intended for housekeeping and not for computation.
            """
            who_has = {}
            nbytes = {}
            for key in keys:
                ts = self.tasks[key]
                assert ts.who_has
                who_has[key] = [ws.address for ws in ts.who_has or ()]
                nbytes[key] = ts.nbytes
    
            self.stream_comms[addr].send(
                {
                    "op": "acquire-replicas",
                    "who_has": who_has,
                    "nbytes": nbytes,
                    "stimulus_id": stimulus_id,
                },
            )
    
        def request_remove_replicas(
            self, addr: str, keys: list[Key], *, stimulus_id: str
        ) -> None:
            """Asynchronously ask a worker to discard its replica of the listed keys.
            This must never be used to destroy the last replica of a key. This is a
            fire-and-forget operation, intended for housekeeping and not for computation.
    
            The replica disappears immediately from TaskState.who_has on the Scheduler side;
            if the worker refuses to delete, e.g. because the task is a dependency of
            another task running on it, it will (also asynchronously) inform the scheduler
            to re-add itself to who_has. If the worker agrees to discard the task, there is
            no feedback.
            """
            ws = self.workers[addr]
    
            # The scheduler immediately forgets about the replica and suggests the worker to
            # drop it. The worker may refuse, at which point it will send back an add-keys
            # message to reinstate it.
            for key in keys:
                ts = self.tasks[key]
                if self.validate:
                    # Do not destroy the last copy
                    assert ts.who_has
                    assert len(ts.who_has) > 1
                self.remove_replica(ts, ws)
    
            self.stream_comms[addr].send(
                {
                    "op": "remove-replicas",
                    "keys": keys,
                    "stimulus_id": stimulus_id,
                }
            )
    
    
    def _task_to_report_msg(ts: TaskState) -> dict[str, Any] | None:
        if ts.state == "forgotten":
            return {"op": "cancelled-keys", "keys": [ts.key]}
        elif ts.state == "memory":
            return {"op": "key-in-memory", "key": ts.key}
        elif ts.state == "erred":
            failing_ts = ts.exception_blame
            assert failing_ts
            return {
                "op": "task-erred",
                "key": ts.key,
                "exception": failing_ts.exception,
                "traceback": failing_ts.traceback,
            }
        else:
            return None
    
    
    def _task_to_client_msgs(ts: TaskState) -> Msgs:
        if ts.who_wants:
            report_msg = _task_to_report_msg(ts)
            if report_msg is not None:
                return {cs.client_key: [report_msg] for cs in ts.who_wants}
        return {}
    
    
    def decide_worker(
        ts: TaskState,
        all_workers: set[WorkerState],
        valid_workers: set[WorkerState] | None,
        objective: Callable[[WorkerState], Any],
    ) -> WorkerState | None:
        """
        Decide which worker should take task *ts*.
    
        We choose the worker that has the data on which *ts* depends.
    
        If several workers have dependencies then we choose the less-busy worker.
    
        Optionally provide *valid_workers* of where jobs are allowed to occur
        (if all workers are allowed to take the task, pass None instead).
    
        If the task requires data communication because no eligible worker has
        all the dependencies already, then we choose to minimize the number
        of bytes sent between workers.  This is determined by calling the
        *objective* function.
        """
        assert all(dts.who_has for dts in ts.dependencies)
        if ts.actor:
            candidates = all_workers.copy()
        else:
            candidates = {wws for dts in ts.dependencies for wws in dts.who_has or ()}
            candidates &= all_workers
        if valid_workers is None:
            if not candidates:
                candidates = all_workers.copy()
        else:
            candidates &= valid_workers
            if not candidates:
                candidates = valid_workers
                if not candidates:
                    if ts.loose_restrictions:
                        return decide_worker(ts, all_workers, None, objective)
    
        if not candidates:
            return None
        elif len(candidates) == 1:
            return next(iter(candidates))
        else:
            return min(candidates, key=objective)
    
    
    def validate_task_state(ts: TaskState) -> None:
        """Validate the given TaskState"""
        assert ts.state in ALL_TASK_STATES, ts
    
        if ts.waiting_on:
            assert ts.waiting_on.issubset(ts.dependencies), (
                "waiting not subset of dependencies",
                str(ts.waiting_on),
                str(ts.dependencies),
            )
        if ts.waiters:
            assert ts.waiters.issubset(ts.dependents), (
                "waiters not subset of dependents",
                str(ts.waiters),
                str(ts.dependents),
            )
    
        for dts in ts.waiting_on or ():
            assert not dts.who_has, ("waiting on in-memory dep", str(ts), str(dts))
            assert dts.state != "released", ("waiting on released dep", str(ts), str(dts))
        for dts in ts.dependencies:
            assert ts in dts.dependents, (
                "not in dependency's dependents",
                str(ts),
                str(dts),
                str(dts.dependents),
            )
            if ts.state in ("waiting", "queued", "processing", "no-worker"):
                assert ts.waiting_on and dts in ts.waiting_on or dts.who_has, (
                    "dep missing",
                    str(ts),
                    str(dts),
                )
            assert dts.state != "forgotten"
    
        for dts in ts.waiters or ():
            assert dts.state in ("waiting", "queued", "processing", "no-worker"), (
                "waiter not in play",
                str(ts),
                str(dts),
            )
        for dts in ts.dependents:
            assert ts in dts.dependencies, (
                "not in dependent's dependencies",
                str(ts),
                str(dts),
                str(dts.dependencies),
            )
            assert dts.state != "forgotten"
    
        assert (ts.processing_on is not None) == (ts.state == "processing")
        assert bool(ts.who_has) == (ts.state == "memory"), (ts, ts.who_has, ts.state)
    
        if ts.state == "queued":
            assert not ts.processing_on
            assert not ts.who_has
            assert all(dts.who_has for dts in ts.dependencies), (
                "task queued without all deps",
                str(ts),
                str(ts.dependencies),
            )
    
        if ts.state == "processing":
            assert all(dts.who_has for dts in ts.dependencies), (
                "task processing without all deps",
                str(ts),
                str(ts.dependencies),
            )
            assert not ts.waiting_on
    
        if ts.who_has:
            assert ts.waiters or ts.who_wants, (
                "unneeded task in memory",
                str(ts),
                str(ts.who_has),
            )
            if ts.run_spec:  # was computed
                assert ts.type
                assert isinstance(ts.type, str)
            assert not any(
                [
                    ts in dts.waiting_on
                    for dts in ts.dependents
                    if dts.waiting_on is not None
                ]
            )
            for ws in ts.who_has:
                assert ts in ws.has_what, (
                    "not in who_has' has_what",
                    str(ts),
                    str(ws),
                    str(ws.has_what),
                )
    
        for cs in ts.who_wants or ():
            assert ts in cs.wants_what, (
                "not in who_wants' wants_what",
                str(ts),
                str(cs),
                str(cs.wants_what),
            )
    
        if ts.actor:
            if ts.state == "memory":
                assert ts.who_has
                assert sum(ts in ws.actors for ws in ts.who_has) == 1
            if ts.state == "processing":
                assert ts.processing_on
                assert ts in ts.processing_on.actors
            assert ts.state != "queued"
    
    
    def validate_unrunnable(unrunnable: dict[TaskState, float]) -> None:
        prev_unrunnable_since: float | None = None
        prev_ts: TaskState | None = None
        for ts, unrunnable_since in unrunnable.items():
            assert ts.state == "no-worker"
            if prev_ts is not None:
                assert prev_unrunnable_since is not None
                # Ensure that unrunnable_since is monotonically increasing when iterating over unrunnable.
                # _check_no_workers relies on this.
                assert prev_unrunnable_since <= unrunnable_since, (
                    prev_ts,
                    ts,
                    prev_unrunnable_since,
                    unrunnable_since,
                )
            prev_ts = ts
            prev_unrunnable_since = unrunnable_since
    
    
    def validate_worker_state(ws: WorkerState) -> None:
        for ts in ws.has_what or ():
            assert ts.who_has
            assert ws in ts.who_has, (
                "not in has_what' who_has",
                str(ws),
                str(ts),
                str(ts.who_has),
            )
    
        for ts in ws.actors:
            assert ts.state in ("memory", "processing")
    
    
    def validate_state(
        tasks: dict[Key, TaskState],
        workers: dict[str, WorkerState],
        clients: dict[str, ClientState],
    ) -> None:
        """Validate a current runtime state.
    
        This performs a sequence of checks on the entire graph, running in about linear
        time. This raises assert errors if anything doesn't check out.
        """
        for ts in tasks.values():
            validate_task_state(ts)
    
        for ws in workers.values():
            validate_worker_state(ws)
    
        for cs in clients.values():
            for ts in cs.wants_what or ():
                assert ts.who_wants
                assert cs in ts.who_wants, (
                    "not in wants_what' who_wants",
                    str(cs),
                    str(ts),
                    str(ts.who_wants),
                )
    
    
    def heartbeat_interval(n: int) -> float:
        """Interval in seconds that we desire heartbeats based on number of workers"""
        if n <= 10:
            return 0.5
        elif n < 50:
            return 1
        elif n < 200:
            return 2
        else:
            # No more than 200 heartbeats a second scaled by workers
            return n / 200 + 1
    
    
    def _task_slots_available(ws: WorkerState, saturation_factor: float) -> int:
        """Number of tasks that can be sent to this worker without oversaturating it"""
        assert not math.isinf(saturation_factor)
        return max(math.ceil(saturation_factor * ws.nthreads), 1) - (
            len(ws.processing) - len(ws.long_running)
        )
    
    
    def _worker_full(ws: WorkerState, saturation_factor: float) -> bool:
        if math.isinf(saturation_factor):
            return False
        return _task_slots_available(ws, saturation_factor) <= 0
    
    
    class KilledWorker(Exception):
        def __init__(self, task: Key, last_worker: WorkerState, allowed_failures: int):
            super().__init__(task, last_worker, allowed_failures)
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def last_worker(self) -> WorkerState:
            return self.args[1]
    
        @property
        def allowed_failures(self) -> int:
            return self.args[2]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} on {self.allowed_failures + 1} "
                "different workers, but all those workers died while running it. "
                f"The last worker that attempt to run the task was {self.last_worker.address}. "
                "Inspecting worker logs is often a good next step to diagnose what went wrong. "
                "For more information see https://distributed.dask.org/en/stable/killed.html."
            )
    
    
    class NoValidWorkerError(Exception):
        def __init__(
            self,
            task: Key,
            host_restrictions: set[str],
            worker_restrictions: set[str],
            resource_restrictions: dict[str, float],
            timeout: float,
        ):
            super().__init__(
                task, host_restrictions, worker_restrictions, resource_restrictions, timeout
            )
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def host_restrictions(self) -> Any:
            return self.args[1]
    
        @property
        def worker_restrictions(self) -> Any:
            return self.args[2]
    
        @property
        def resource_restrictions(self) -> Any:
            return self.args[3]
    
        @property
        def timeout(self) -> float:
            return self.args[4]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} but timed out after {format_time(self.timeout)} "
                "waiting for a valid worker matching all restrictions.\n\nRestrictions:\n"
                f"host_restrictions={self.host_restrictions!s}\n"
                f"worker_restrictions={self.worker_restrictions!s}\n"
                f"resource_restrictions={self.resource_restrictions!s}\n"
            )
    
    
    class NoWorkerError(Exception):
        def __init__(self, task: Key, timeout: float):
            super().__init__(task, timeout)
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def timeout(self) -> float:
            return self.args[1]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} but timed out after {format_time(self.timeout)} "
                "waiting without any running workers."
            )
    
    
    class WorkerStatusPlugin(SchedulerPlugin):
        """A plugin to share worker status with a remote observer
    
        This is used in cluster managers to keep updated about the status of the scheduler.
        """
    
        name: ClassVar[str] = "worker-status"
        bcomm: BatchedSend
    
        def __init__(self, scheduler: Scheduler, comm: Comm):
            self.bcomm = BatchedSend(interval="5ms")
            self.bcomm.start(comm)
            scheduler.add_plugin(self)
    
        def add_worker(self, scheduler: Scheduler, worker: str) -> None:
            ident = scheduler.workers[worker].identity()
            del ident["metrics"]
            del ident["last_seen"]
            try:
                self.bcomm.send(["add", {"workers": {worker: ident}}])
            except CommClosedError:
                scheduler.remove_plugin(name=self.name)
    
        def remove_worker(self, scheduler: Scheduler, worker: str, **kwargs: Any) -> None:
            try:
                self.bcomm.send(["remove", worker])
            except CommClosedError:
                scheduler.remove_plugin(name=self.name)
    
        def teardown(self) -> None:
            self.bcomm.close()
    
    
    class CollectTaskMetaDataPlugin(SchedulerPlugin):
        scheduler: Scheduler
        name: str
        keys: set[Key]
        metadata: dict[Key, Any]
        state: dict[Key, TaskStateState]
    
        def __init__(self, scheduler: Scheduler, name: str):
            self.scheduler = scheduler
            self.name = name
            self.keys = set()
            self.metadata = {}
            self.state = {}
    
        def update_graph(
            self,
            scheduler: Scheduler,
            *,
            keys: set[Key],
            **kwargs: Any,
        ) -> None:
            self.keys.update(keys)
    
        def transition(
            self,
            key: Key,
            start: TaskStateState,
            finish: TaskStateState,
            *args: Any,
            **kwargs: Any,
        ) -> None:
            if finish in ("memory", "erred"):
                ts = self.scheduler.tasks.get(key)
                if ts is not None and ts.key in self.keys:
                    self.metadata[key] = ts.metadata
                    self.state[key] = finish
                    self.keys.discard(key)
    
    
    def _materialize_graph(
        expr: Expr,
        global_annotations: dict[str, Any],
        validate: bool,
    ) -> tuple[dict[Key, T_runspec], dict[Key, set[Key]], dict[str, dict[Key, Any]]]:
>       dsk: dict = expr.__dask_graph__()
E       AttributeError: 'dict' object has no attribute '__dask_graph__'

distributed/scheduler.py:9383: AttributeError

Check warning on line 0 in distributed.cli.tests.test_dask_worker

github-actions / Unit Test Results

1 out of 5 runs failed: test_respect_host_listen_address[0.0.0.0---nanny] (distributed.cli.tests.test_dask_worker)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 1s]

Raw output


            AttributeError: 'dict' object has no attribute '__dask_graph__'
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:43723', workers: 0, cores: 0, tasks: 0>
nanny = '--nanny', host = '0.0.0.0'

    @pytest.mark.slow
    @pytest.mark.skipif(not LINUX, reason="Need 127.0.0.2 to mean localhost")
    @pytest.mark.parametrize("nanny", ["--nanny", "--no-nanny"])
    @pytest.mark.parametrize("host", ["127.0.0.2", "0.0.0.0"])
    @gen_cluster(client=True, nthreads=[])
    async def test_respect_host_listen_address(c, s, nanny, host):
        with popen(
            [
                sys.executable,
                "-m",
                "dask",
                "worker",
                s.address,
                nanny,
                "--no-dashboard",
                "--host",
                host,
            ]
        ):
            await c.wait_for_workers(1)
    
            # roundtrip works
>           assert await c.submit(lambda x: x + 1, 10) == 11

distributed/cli/tests/test_dask_worker.py:569: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
distributed/client.py:408: in _result
    raise exc.with_traceback(tb)
distributed/utils.py:1507: in run_in_executor_with_context
    return await loop.run_in_executor(
../../../miniconda3/envs/dask-distributed/lib/python3.10/concurrent/futures/thread.py:58: in run
    result = self.fn(*self.args, **self.kwargs)
distributed/utils.py:1508: in <lambda>
    executor, lambda: context.run(func, *args, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import dataclasses
    import heapq
    import inspect
    import itertools
    import json
    import logging
    import math
    import operator
    import os
    import pickle
    import random
    import textwrap
    import uuid
    import warnings
    import weakref
    from abc import abstractmethod
    from collections import defaultdict, deque
    from collections.abc import (
        Callable,
        Collection,
        Container,
        Hashable,
        Iterable,
        Iterator,
        Mapping,
        Sequence,
        Set,
    )
    from contextlib import suppress
    from functools import partial
    from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple, cast, overload
    
    import psutil
    import tornado.web
    from sortedcontainers import SortedDict, SortedSet
    from tlz import (
        concat,
        first,
        groupby,
        merge,
        merge_sorted,
        merge_with,
        partition,
        pluck,
        second,
        take,
        valmap,
    )
    from tornado.ioloop import IOLoop
    
    import dask
    import dask.utils
    from dask._task_spec import DependenciesMapping, GraphNode, convert_legacy_graph
    from dask.core import istask, validate_key
    from dask.typing import Key, no_default
    from dask.utils import (
        _deprecated,
        _deprecated_kwarg,
        format_bytes,
        format_time,
        key_split,
        parse_bytes,
        parse_timedelta,
        tmpfile,
    )
    from dask.widgets import get_template
    
    from distributed import cluster_dump, preloading, profile
    from distributed import versions as version_module
    from distributed._asyncio import RLock
    from distributed._stories import scheduler_story
    from distributed.active_memory_manager import ActiveMemoryManagerExtension, RetireWorker
    from distributed.batched import BatchedSend
    from distributed.broker import Broker
    from distributed.client import SourceCode
    from distributed.collections import HeapSet
    from distributed.comm import (
        Comm,
        CommClosedError,
        get_address_host,
        normalize_address,
        resolve_address,
        unparse_host_port,
    )
    from distributed.comm.addressing import addresses_from_user_args
    from distributed.compatibility import PeriodicCallback
    from distributed.core import (
        ErrorMessage,
        OKMessage,
        Status,
        clean_exception,
        error_message,
        rpc,
        send_recv,
    )
    from distributed.diagnostics.memory_sampler import MemorySamplerExtension
    from distributed.diagnostics.plugin import SchedulerPlugin, _get_plugin_name
    from distributed.event import EventExtension
    from distributed.gc import disable_gc_diagnosis, enable_gc_diagnosis
    from distributed.http import get_handlers
    from distributed.metrics import monotonic, time
    from distributed.multi_lock import MultiLockExtension
    from distributed.node import ServerNode
    from distributed.proctitle import setproctitle
    from distributed.protocol import deserialize
    from distributed.protocol.pickle import dumps, loads
    from distributed.protocol.serialize import Serialized, ToPickle, serialize
    from distributed.publish import PublishExtension
    from distributed.pubsub import PubSubSchedulerExtension
    from distributed.queues import QueueExtension
    from distributed.recreate_tasks import ReplayTaskScheduler
    from distributed.security import Security
    from distributed.semaphore import SemaphoreExtension
    from distributed.shuffle import ShuffleSchedulerPlugin
    from distributed.spans import SpanMetadata, SpansSchedulerExtension
    from distributed.stealing import WorkStealing
    from distributed.utils import (
        All,
        Deadline,
        TimeoutError,
        format_dashboard_link,
        get_fileno_limit,
        key_split_group,
        log_errors,
        offload,
        recursive_to_dict,
        wait_for,
    )
    from distributed.utils_comm import (
        gather_from_workers,
        retry_operation,
        scatter_to_workers,
    )
    from distributed.variable import VariableExtension
    
    if TYPE_CHECKING:
        # TODO import from typing (requires Python >=3.10)
        # TODO import from typing (requires Python >=3.11)
        from typing_extensions import Self, TypeAlias
    
        from dask._expr import Expr
    
    # Not to be confused with distributed.worker_state_machine.TaskStateState
    TaskStateState: TypeAlias = Literal[
        "released",
        "waiting",
        "no-worker",
        "queued",
        "processing",
        "memory",
        "erred",
        "forgotten",
    ]
    
    ALL_TASK_STATES: Set[TaskStateState] = set(TaskStateState.__args__)  # type: ignore
    
    # {task key -> finish state}
    # Not to be confused with distributed.worker_state_machine.Recs
    Recs: TypeAlias = dict[Key, TaskStateState]
    # {client or worker address: [{op: <key>, ...}, ...]}
    Msgs: TypeAlias = dict[str, list[dict[str, Any]]]
    # (recommendations, client messages, worker messages)
    RecsMsgs: TypeAlias = tuple[Recs, Msgs, Msgs]
    
    T_runspec: TypeAlias = GraphNode
    
    logger = logging.getLogger(__name__)
    LOG_PDB = dask.config.get("distributed.admin.pdb-on-err")
    DEFAULT_DATA_SIZE = parse_bytes(
        dask.config.get("distributed.scheduler.default-data-size")
    )
    STIMULUS_ID_UNSET = "<stimulus_id unset>"
    
    DEFAULT_EXTENSIONS = {
        "multi_locks": MultiLockExtension,
        "publish": PublishExtension,
        "replay-tasks": ReplayTaskScheduler,
        "queues": QueueExtension,
        "variables": VariableExtension,
        "pubsub": PubSubSchedulerExtension,
        "semaphores": SemaphoreExtension,
        "events": EventExtension,
        "amm": ActiveMemoryManagerExtension,
        "memory_sampler": MemorySamplerExtension,
        "shuffle": ShuffleSchedulerPlugin,
        "spans": SpansSchedulerExtension,
        "stealing": WorkStealing,
    }
    
    
    class ClientState:
        """A simple object holding information about a client."""
    
        #: A unique identifier for this client. This is generally an opaque
        #: string generated by the client itself.
        client_key: str
    
        #: Cached hash of :attr:`~ClientState.client_key`
        _hash: int
    
        #: A set of tasks this client wants to be kept in memory, so that it can download
        #: its result when desired. This is the reverse mapping of
        #: :class:`TaskState.who_wants`. Tasks are typically removed from this set when the
        #: corresponding object in the client's space (for example a ``Future`` or a Dask
        #: collection) gets garbage-collected.
        wants_what: set[TaskState]
    
        #: The last time we received a heartbeat from this client, in local scheduler time.
        last_seen: float
    
        #: Output of :func:`distributed.versions.get_versions` on the client
        versions: dict[str, Any]
    
        __slots__ = tuple(__annotations__)
    
        def __init__(self, client: str, *, versions: dict[str, Any] | None = None):
            self.client_key = client
            self._hash = hash(client)
            self.wants_what = set()
            self.last_seen = time()
            self.versions = versions or {}
    
        def __hash__(self) -> int:
            return self._hash
    
        def __eq__(self, other: object) -> bool:
            if not isinstance(other, ClientState):
                return False
            return self.client_key == other.client_key
    
        def __repr__(self) -> str:
            return f"<Client {self.client_key!r}>"
    
        def __str__(self) -> str:
            return self.client_key
    
        def _to_dict_no_nest(self, *, exclude: Container[str] = ()) -> dict:
            """Dictionary representation for debugging purposes.
            Not type stable and not intended for roundtrips.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            TaskState._to_dict
            """
            return recursive_to_dict(
                self,
                exclude=set(exclude) | {"versions"},  # type: ignore
                members=True,
            )
    
    
    class MemoryState:
        """Memory readings on a worker or on the whole cluster.
    
        See :doc:`worker-memory`.
    
        Attributes / properties:
    
        managed_total
            Sum of the output of sizeof() for all dask keys held by the worker in memory,
            plus number of bytes spilled to disk
    
        managed
            Sum of the output of sizeof() for the dask keys held in RAM. Note that this may
            be inaccurate, which may cause inaccurate unmanaged memory (see below).
    
        spilled
            Number of bytes  for the dask keys spilled to the hard drive.
            Note that this is the size on disk; size in memory may be different due to
            compression and inaccuracies in sizeof(). In other words, given the same keys,
            'managed' will change depending on the keys being in memory or spilled.
    
        process
            Total RSS memory measured by the OS on the worker process.
            This is always exactly equal to managed + unmanaged.
    
        unmanaged
            process - managed. This is the sum of
    
            - Python interpreter and modules
            - global variables
            - memory temporarily allocated by the dask tasks that are currently running
            - memory fragmentation
            - memory leaks
            - memory not yet garbage collected
            - memory not yet free()'d by the Python memory manager to the OS
    
        unmanaged_old
            Minimum of the 'unmanaged' measures over the last
            ``distributed.memory.recent-to-old-time`` seconds
    
        unmanaged_recent
            unmanaged - unmanaged_old; in other words process memory that has been recently
            allocated but is not accounted for by dask; hopefully it's mostly a temporary
            spike.
    
        optimistic
            managed + unmanaged_old; in other words the memory held long-term by
            the process under the hopeful assumption that all unmanaged_recent memory is a
            temporary spike
        """
    
        process: int
        unmanaged_old: int
        managed: int
        spilled: int
    
        __slots__ = tuple(__annotations__)
    
        def __init__(
            self,
            *,
            process: int,
            unmanaged_old: int,
            managed: int,
            spilled: int,
        ):
            # Some data arrives with the heartbeat, some other arrives in realtime as the
            # tasks progress. Also, sizeof() is not guaranteed to return correct results.
            # This can cause glitches where a partial measure is larger than the whole, so
            # we need to force all numbers to add up exactly by definition.
            self.process = process
            self.managed = min(self.process, managed)
            self.spilled = spilled
            # Subtractions between unsigned ints guaranteed by construction to be >= 0
            self.unmanaged_old = min(unmanaged_old, process - self.managed)
    
        @staticmethod
        def sum(*infos: MemoryState) -> MemoryState:
            process = 0
            unmanaged_old = 0
            managed = 0
            spilled = 0
            for ms in infos:
                process += ms.process
                unmanaged_old += ms.unmanaged_old
                spilled += ms.spilled
                managed += ms.managed
            return MemoryState(
                process=process,
                unmanaged_old=unmanaged_old,
                managed=managed,
                spilled=spilled,
            )
    
        @property
        def managed_total(self) -> int:
            return self.managed + self.spilled
    
        @property
        def unmanaged(self) -> int:
            # This is never negative thanks to __init__
            return self.process - self.managed
    
        @property
        def unmanaged_recent(self) -> int:
            # This is never negative thanks to __init__
            return self.process - self.managed - self.unmanaged_old
    
        @property
        def optimistic(self) -> int:
            return self.managed + self.unmanaged_old
    
        @property
        def managed_in_memory(self) -> int:
            warnings.warn("managed_in_memory has been renamed to managed", FutureWarning)
            return self.managed
    
        @property
        def managed_spilled(self) -> int:
            warnings.warn("managed_spilled has been renamed to spilled", FutureWarning)
            return self.spilled
    
        def __repr__(self) -> str:
            return (
                f"Process memory (RSS)  : {format_bytes(self.process)}\n"
                f"  - managed by Dask   : {format_bytes(self.managed)}\n"
                f"  - unmanaged (old)   : {format_bytes(self.unmanaged_old)}\n"
                f"  - unmanaged (recent): {format_bytes(self.unmanaged_recent)}\n"
                f"Spilled to disk       : {format_bytes(self.spilled)}\n"
            )
    
        def _to_dict(self, *, exclude: Container[str] = ()) -> dict:
            """Dictionary representation for debugging purposes.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            """
            return {
                k: getattr(self, k)
                for k in dir(self)
                if not k.startswith("_")
                and k not in {"sum", "managed_in_memory", "managed_spilled"}
            }
    
    
    class WorkerState:
        """A simple object holding information about a worker.
    
        Not to be confused with :class:`distributed.worker_state_machine.WorkerState`.
        """
    
        #: This worker's unique key. This can be its connected address
        #: (such as ``"tcp://127.0.0.1:8891"``) or an alias (such as ``"alice"``).
        address: str
    
        pid: int
        name: Hashable
    
        #: The number of CPU threads made available on this worker
        nthreads: int
    
        #: Memory available to the worker, in bytes
        memory_limit: int
    
        local_directory: str
        services: dict[str, int]
    
        #: Output of :meth:`distributed.versions.get_versions` on the worker
        versions: dict[str, Any]
    
        #: Address of the associated :class:`~distributed.nanny.Nanny`, if present
        nanny: str | None
    
        #: Read-only worker status, synced one way from the remote Worker object
        status: Status
    
        #: Cached hash of :attr:`~WorkerState.server_id`
        _hash: int
    
        #: The total memory size, in bytes, used by the tasks this worker holds in memory
        #: (i.e. the tasks in this worker's :attr:`~WorkerState.has_what`).
        nbytes: int
    
        #: Worker memory unknown to the worker, in bytes, which has been there for more than
        #: 30 seconds. See :class:`MemoryState`.
        _memory_unmanaged_old: int
    
        #: History of the last 30 seconds' worth of unmanaged memory. Used to differentiate
        #: between "old" and "new" unmanaged memory.
        #: Format: ``[(timestamp, bytes), (timestamp, bytes), ...]``
        _memory_unmanaged_history: deque[tuple[float, int]]
    
        metrics: dict[str, Any]
    
        #: The last time we received a heartbeat from this worker, in local scheduler time.
        last_seen: float
    
        time_delay: float
        bandwidth: float
    
        #: A set of all TaskStates on this worker that are actors. This only includes those
        #: actors whose state actually lives on this worker, not actors to which this worker
        #: has a reference.
        actors: set[TaskState]
    
        #: Underlying data of :meth:`WorkerState.has_what`
        _has_what: dict[TaskState, None]
    
        #: A set of tasks that have been submitted to this worker. Multiple tasks may be
        # submitted to a worker in advance and the worker will run them eventually,
        # depending on its execution resources (but see :doc:`work-stealing`).
        #:
        #: All the tasks here are in the "processing" state.
        #: This attribute is kept in sync with :attr:`TaskState.processing_on`.
        processing: set[TaskState]
    
        #: Running tasks that invoked :func:`distributed.secede`
        long_running: set[TaskState]
    
        #: A dictionary of tasks that are currently being run on this worker.
        #: Each task state is associated with the duration in seconds which the task has
        #: been running.
        executing: dict[TaskState, float]
    
        #: The available resources on this worker, e.g. ``{"GPU": 2}``.
        #: These are abstract quantities that constrain certain tasks from running at the
        #: same time on this worker.
        resources: dict[str, float]
    
        #: The sum of each resource used by all tasks allocated to this worker.
        #: The numbers in this dictionary can only be less or equal than those in this
        #: worker's :attr:`~WorkerState.resources`.
        used_resources: dict[str, float]
    
        #: Arbitrary additional metadata to be added to :meth:`~WorkerState.identity`
        extra: dict[str, Any]
    
        # The unique server ID this WorkerState is referencing
        server_id: str
    
        # Reference to scheduler task_groups
        scheduler_ref: weakref.ref[SchedulerState] | None
        task_prefix_count: defaultdict[str, int]
        _network_occ: int
        _occupancy_cache: float | None
    
        #: Keys that may need to be fetched to this worker, and the number of tasks that need them.
        #: All tasks are currently in `memory` on a worker other than this one.
        #: Much like `processing`, this does not exactly reflect worker state:
        #: keys here may be queued to fetch, in flight, or already in memory
        #: on the worker.
        needs_what: dict[TaskState, int]
    
        __slots__ = tuple(__annotations__)
    
        def __init__(
            self,
            *,
            address: str,
            status: Status,
            pid: int,
            name: object,
            nthreads: int = 0,
            memory_limit: int,
            local_directory: str,
            nanny: str | None,
            server_id: str,
            services: dict[str, int] | None = None,
            versions: dict[str, Any] | None = None,
            extra: dict[str, Any] | None = None,
            scheduler: SchedulerState | None = None,
        ):
            self.server_id = server_id
            self.address = address
            self.pid = pid
            self.name = name
            self.nthreads = nthreads
            self.memory_limit = memory_limit
            self.local_directory = local_directory
            self.services = services or {}
            self.versions = versions or {}
            self.nanny = nanny
            self.status = status
            self._hash = hash(self.server_id)
            self.nbytes = 0
            self._memory_unmanaged_old = 0
            self._memory_unmanaged_history = deque()
            self.metrics = {}
            self.last_seen = time()
            self.time_delay = 0
            self.bandwidth = parse_bytes(dask.config.get("distributed.scheduler.bandwidth"))
            self.actors = set()
            self._has_what = {}
            self.processing = set()
            self.long_running = set()
            self.executing = {}
            self.resources = {}
            self.used_resources = {}
            self.extra = extra or {}
            self.scheduler_ref = weakref.ref(scheduler) if scheduler else None
            self.task_prefix_count = defaultdict(int)
            self.needs_what = {}
            self._network_occ = 0
            self._occupancy_cache = None
    
        def __hash__(self) -> int:
            return self._hash
    
        def __eq__(self, other: object) -> bool:
            return self is other or (
                isinstance(other, WorkerState) and other.server_id == self.server_id
            )
    
        @property
        def has_what(self) -> Set[TaskState]:
            """An insertion-sorted set-like of tasks which currently reside on this worker.
            All the tasks here are in the "memory" state.
            This is the reverse mapping of :attr:`TaskState.who_has`.
    
            This is a read-only public accessor. The data is implemented as a dict without
            values, because rebalance() relies on dicts being insertion-sorted.
            """
            return self._has_what.keys()
    
        @property
        def host(self) -> str:
            return get_address_host(self.address)
    
        @property
        def memory(self) -> MemoryState:
            """Polished memory metrics for the worker.
    
            **Design note on managed memory**
    
            There are two measures available for managed memory:
    
            - ``self.nbytes``
            - ``self.metrics["managed_bytes"]``
    
            At rest, the two numbers must be identical. However, ``self.nbytes`` is
            immediately updated through the batched comms as soon as each task lands in
            memory on the worker; ``self.metrics["managed_bytes"]`` instead is updated by
            the heartbeat, which can lag several seconds behind.
    
            Below we are mixing likely newer managed memory info from ``self.nbytes`` with
            process and spilled memory from the heartbeat. This is deliberate, so that
            managed memory total is updated more frequently.
    
            Managed memory directly and immediately contributes to optimistic memory, which
            is in turn used in Active Memory Manager heuristics (at the moment of writing;
            more uses will likely be added in the future). So it's important to have it
            up to date; much more than it is for process memory.
    
            Having up-to-date managed memory info as soon as the scheduler learns about
            task completion also substantially simplifies unit tests.
    
            The flip side of this design is that it may cause some noise in the
            unmanaged_recent measure. e.g.:
    
            1. Delete 100MB of managed data
            2. The updated managed memory reaches the scheduler faster than the
               updated process memory
            3. There's a blip where the scheduler thinks that there's a sudden 100MB
               increase in unmanaged_recent, since process memory hasn't changed but managed
               memory has decreased by 100MB
            4. When the heartbeat arrives, process memory goes down and so does the
               unmanaged_recent.
    
            This is OK - one of the main reasons for the unmanaged_recent / unmanaged_old
            split is exactly to concentrate all the noise in unmanaged_recent and exclude it
            from optimistic memory, which is used for heuristics.
    
            Something that is less OK, but also less frequent, is that the sudden deletion
            of spilled keys will cause a negative blip in managed memory:
    
            1. Delete 100MB of spilled data
            2. The updated managed memory *total* reaches the scheduler faster than the
               updated spilled portion
            3. This causes the managed memory to temporarily plummet and be replaced by
               unmanaged_recent, while spilled memory remains unaltered
            4. When the heartbeat arrives, managed goes back up, unmanaged_recent
               goes back down, and spilled goes down by 100MB as it should have to
               begin with.
    
            :issue:`6002` will let us solve this.
            """
            return MemoryState(
                process=self.metrics["memory"],
                managed=max(0, self.nbytes - self.metrics["spilled_bytes"]["memory"]),
                spilled=self.metrics["spilled_bytes"]["disk"],
                unmanaged_old=self._memory_unmanaged_old,
            )
    
        def clean(self) -> WorkerState:
            """Return a version of this object that is appropriate for serialization"""
            ws = WorkerState(
                address=self.address,
                status=self.status,
                pid=self.pid,
                name=self.name,
                nthreads=self.nthreads,
                memory_limit=self.memory_limit,
                local_directory=self.local_directory,
                services=self.services,
                nanny=self.nanny,
                extra=self.extra,
                server_id=self.server_id,
            )
            ws._occupancy_cache = self.occupancy
    
            ws.executing = {ts.key: duration for ts, duration in self.executing.items()}  # type: ignore
            return ws
    
        def __repr__(self) -> str:
            name = f", name: {self.name}" if self.name != self.address else ""
            return (
                f"<WorkerState {self.address!r}{name}, "
                f"status: {self.status.name}, "
                f"memory: {len(self.has_what)}, "
                f"processing: {len(self.processing)}>"
            )
    
        def _repr_html_(self) -> str:
            return get_template("worker_state.html.j2").render(
                address=self.address,
                name=self.name,
                status=self.status.name,
                has_what=self.has_what,
                processing=self.processing,
            )
    
        def identity(self) -> dict[str, Any]:
            return {
                "type": "Worker",
                "id": self.name,
                "host": self.host,
                "resources": self.resources,
                "local_directory": self.local_directory,
                "name": self.name,
                "nthreads": self.nthreads,
                "memory_limit": self.memory_limit,
                "last_seen": self.last_seen,
                "services": self.services,
                "metrics": self.metrics,
                "status": self.status.name,
                "nanny": self.nanny,
                **self.extra,
            }
    
        def _to_dict_no_nest(self, *, exclude: Container[str] = ()) -> dict[str, Any]:
            """Dictionary representation for debugging purposes.
            Not type stable and not intended for roundtrips.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            TaskState._to_dict
            """
            return recursive_to_dict(
                self,
                exclude=set(exclude) | {"versions"},  # type: ignore
                members=True,
            )
    
        @property
        def scheduler(self) -> SchedulerState:
            assert self.scheduler_ref
            s = self.scheduler_ref()
            assert s
            return s
    
        def add_to_processing(self, ts: TaskState) -> None:
            """Assign a task to this worker for compute."""
            if self.scheduler.validate:
                assert ts not in self.processing
    
            tp = ts.prefix
            self.task_prefix_count[tp.name] += 1
            self.scheduler._task_prefix_count_global[tp.name] += 1
            self.processing.add(ts)
            for dts in ts.dependencies:
                assert dts.who_has
                if self not in dts.who_has:
                    self._inc_needs_replica(dts)
    
        def add_to_long_running(self, ts: TaskState) -> None:
            if self.scheduler.validate:
                assert ts in self.processing
                assert ts not in self.long_running
    
            self._remove_from_task_prefix_count(ts)
            # Cannot remove from processing since we're using this for things like
            # idleness detection. Idle workers are typically targeted for
            # downscaling but we should not downscale workers with long running
            # tasks
            self.long_running.add(ts)
    
        def remove_from_processing(self, ts: TaskState) -> None:
            """Remove a task from a workers processing"""
            if self.scheduler.validate:
                assert ts in self.processing
    
            if ts in self.long_running:
                self.long_running.discard(ts)
            else:
                self._remove_from_task_prefix_count(ts)
            self.processing.remove(ts)
            for dts in ts.dependencies:
                if dts in self.needs_what:
                    self._dec_needs_replica(dts)
    
        def _remove_from_task_prefix_count(self, ts: TaskState) -> None:
            prefix_name = ts.prefix.name
            count = self.task_prefix_count[prefix_name] - 1
            tp_count = self.task_prefix_count
            tp_count_global = self.scheduler._task_prefix_count_global
            if count:
                tp_count[prefix_name] = count
            else:
                del tp_count[prefix_name]
    
            count = tp_count_global[prefix_name] - 1
            if count:
                tp_count_global[prefix_name] = count
            else:
                del tp_count_global[prefix_name]
    
        def remove_replica(self, ts: TaskState) -> None:
            """The worker no longer has a task in memory"""
            if self.scheduler.validate:
                assert ts.who_has
                assert self in ts.who_has
                assert ts in self.has_what
                assert ts not in self.needs_what
    
            self.nbytes -= ts.get_nbytes()
            del self._has_what[ts]
            ts.who_has.remove(self)  # type: ignore
            if not ts.who_has:
                ts.who_has = None
    
        def _inc_needs_replica(self, ts: TaskState) -> None:
            """Assign a task fetch to this worker and up…cheduler, title="Scheduler Profile (administrative)"
            )
            task_stream = TabPanel(child=task_stream, title="Task Stream")
            bandwidth_workers = TabPanel(
                child=bandwidth_workers.root, title="Bandwidth (Workers)"
            )
            bandwidth_types = TabPanel(
                child=bandwidth_types.root, title="Bandwidth (Types)"
            )
            system = TabPanel(child=sysmon.root, title="System")
            logs = TabPanel(child=logs.root, title="Scheduler Logs")
    
            tabs = Tabs(
                tabs=[
                    html,
                    task_stream,
                    system,
                    logs,
                    compute,
                    workers,
                    scheduler,
                    bandwidth_workers,
                    bandwidth_types,
                ],
                sizing_mode="stretch_both",
            )
    
            from bokeh.core.templates import get_env
            from bokeh.plotting import output_file, save
    
            with tmpfile(extension=".html") as fn:
                output_file(filename=fn, title="Dask Performance Report", mode=mode)
                template_directory = os.path.join(
                    os.path.dirname(os.path.abspath(__file__)), "dashboard", "templates"
                )
                template_environment = get_env()
                template_environment.loader.searchpath.append(template_directory)
                template = template_environment.get_template("performance_report.html")
                save(tabs, filename=fn, template=template)
    
                with open(fn) as f:
                    data = f.read()
    
            return data
    
        async def get_worker_logs(self, n=None, workers=None, nanny=False):
            results = await self.broadcast(
                msg={"op": "get_logs", "n": n}, workers=workers, nanny=nanny
            )
            return results
    
        def log_event(self, topic: str | Collection[str], msg: Any) -> None:
            """Log an event under a given topic
    
            Parameters
            ----------
            topic : str, list[str]
                Name of the topic under which to log an event. To log the same
                event under multiple topics, pass a list of topic names.
            msg
                Event message to log. Note this must be msgpack serializable.
    
            See also
            --------
            Client.log_event
            """
            self._broker.publish(topic, msg)
    
        def subscribe_topic(self, topic: str, client: str) -> None:
            self._broker.subscribe(topic, client)
    
        def unsubscribe_topic(self, topic: str, client: str) -> None:
            self._broker.unsubscribe(topic, client)
    
        @overload
        def get_events(self, topic: str) -> tuple[tuple[float, Any], ...]: ...
    
        @overload
        def get_events(self) -> dict[str, tuple[tuple[float, Any], ...]]: ...
    
        def get_events(
            self, topic: str | None = None
        ) -> tuple[tuple[float, Any], ...] | dict[str, tuple[tuple[float, Any], ...]]:
            return self._broker.get_events(topic)
    
        async def get_worker_monitor_info(self, recent=False, starts=None):
            if starts is None:
                starts = {}
            results = await asyncio.gather(
                *(
                    self.rpc(w).get_monitor_info(recent=recent, start=starts.get(w, 0))
                    for w in self.workers
                )
            )
            return dict(zip(self.workers, results))
    
        ###########
        # Cleanup #
        ###########
    
        @log_errors
        async def check_worker_ttl(self) -> None:
            now = time()
            stimulus_id = f"check-worker-ttl-{now}"
            assert self.worker_ttl
            ttl = max(self.worker_ttl, 10 * heartbeat_interval(len(self.workers)))
            to_restart = []
    
            for ws in self.workers.values():
                last_seen = now - ws.last_seen
                if last_seen > ttl:
                    to_restart.append(ws.address)
                    logger.warning(
                        f"Worker failed to heartbeat for {last_seen:.0f}s; "
                        f"{'attempting restart' if ws.nanny else 'removing'}: {ws}"
                    )
    
            if to_restart:
                self.log_event(
                    "scheduler",
                    {
                        "action": "worker-ttl-timed-out",
                        "workers": to_restart.copy(),
                        "ttl": ttl,
                    },
                )
                await self.restart_workers(
                    to_restart,
                    wait_for_workers=False,
                    stimulus_id=stimulus_id,
                )
    
        def check_idle(self) -> float | None:
            if self.status in (Status.closing, Status.closed):
                return None  # pragma: nocover
    
            if self.transition_counter != self._idle_transition_counter:
                self._idle_transition_counter = self.transition_counter
                self.idle_since = None
                return None
    
            if self._active_graph_updates > 0:
                self.idle_since = None
                return None
    
            if (
                self.queued
                or self.unrunnable
                or any(ws.processing for ws in self.workers.values())
            ):
                self.idle_since = None
                return None
    
            if not self.idle_since:
                self.idle_since = time()
                return self.idle_since
    
            if self.jupyter:
                last_activity = (
                    self._jupyter_server_application.web_app.last_activity().timestamp()
                )
                if last_activity > self.idle_since:
                    self.idle_since = last_activity
                    return self.idle_since
    
            if self.idle_timeout:
                if time() > self.idle_since + self.idle_timeout:
                    assert self.idle_since
                    logger.info(
                        "Scheduler closing after being idle for %s",
                        format_time(self.idle_timeout),
                    )
                    self._ongoing_background_tasks.call_soon(
                        self.close, reason="idle-timeout-exceeded"
                    )
            return self.idle_since
    
        def _check_no_workers(self) -> None:
            if (
                self.status in (Status.closing, Status.closed)
                or self.no_workers_timeout is None
            ):
                return
    
            now = monotonic()
            stimulus_id = f"check-no-workers-timeout-{time()}"
    
            recommendations: Recs = {}
    
            self._refresh_no_workers_since(now)
    
            affected = self._check_unrunnable_task_timeouts(
                now, recommendations=recommendations, stimulus_id=stimulus_id
            )
    
            affected.update(
                self._check_queued_task_timeouts(
                    now, recommendations=recommendations, stimulus_id=stimulus_id
                )
            )
            self.transitions(recommendations, stimulus_id=stimulus_id)
            if affected:
                self.log_event(
                    "scheduler",
                    {"action": "no-workers-timeout-exceeded", "keys": affected},
                )
    
        def _check_unrunnable_task_timeouts(
            self, timestamp: float, recommendations: Recs, stimulus_id: str
        ) -> set[Key]:
            assert self.no_workers_timeout
            unsatisfied = []
            no_workers = []
            for ts, unrunnable_since in self.unrunnable.items():
                if timestamp <= unrunnable_since + self.no_workers_timeout:
                    # unrunnable is insertion-ordered, which means that unrunnable_since will
                    # be monotonically increasing in this loop.
                    break
                if (
                    self._no_workers_since is None
                    or self._no_workers_since >= unrunnable_since
                ):
                    unsatisfied.append(ts)
                else:
                    no_workers.append(ts)
            if not unsatisfied and not no_workers:
                return set()
    
            for ts in unsatisfied:
                e = pickle.dumps(
                    NoValidWorkerError(
                        task=ts.key,
                        host_restrictions=(ts.host_restrictions or set()).copy(),
                        worker_restrictions=(ts.worker_restrictions or set()).copy(),
                        resource_restrictions=(ts.resource_restrictions or {}).copy(),
                        timeout=self.no_workers_timeout,
                    ),
                )
                r = self.transition(
                    ts.key,
                    "erred",
                    exception=e,
                    cause=ts.key,
                    stimulus_id=stimulus_id,
                )
                recommendations.update(r)
                logger.error(
                    "Task %s marked as failed because it timed out waiting "
                    "for its restrictions to become satisfied.",
                    ts.key,
                )
            self._fail_tasks_after_no_workers_timeout(
                no_workers, recommendations, stimulus_id
            )
            return {ts.key for ts in concat([unsatisfied, no_workers])}
    
        def _check_queued_task_timeouts(
            self, timestamp: float, recommendations: Recs, stimulus_id: str
        ) -> set[Key]:
            assert self.no_workers_timeout
    
            if self._no_workers_since is None:
                return set()
    
            if timestamp <= self._no_workers_since + self.no_workers_timeout:
                return set()
            affected = list(self.queued)
            self._fail_tasks_after_no_workers_timeout(
                affected, recommendations, stimulus_id
            )
            return {ts.key for ts in affected}
    
        def _fail_tasks_after_no_workers_timeout(
            self, timed_out: Iterable[TaskState], recommendations: Recs, stimulus_id: str
        ) -> None:
            assert self.no_workers_timeout
    
            for ts in timed_out:
                e = pickle.dumps(
                    NoWorkerError(
                        task=ts.key,
                        timeout=self.no_workers_timeout,
                    ),
                )
                r = self.transition(
                    ts.key,
                    "erred",
                    exception=e,
                    cause=ts.key,
                    stimulus_id=stimulus_id,
                )
                recommendations.update(r)
                logger.error(
                    "Task %s marked as failed because it timed out waiting "
                    "without any running workers.",
                    ts.key,
                )
    
        def _refresh_no_workers_since(self, timestamp: float | None = None) -> None:
            if self.running or not (self.queued or self.unrunnable):
                self._no_workers_since = None
                return
    
            if not self._no_workers_since:
                self._no_workers_since = timestamp or monotonic()
                return
    
        def adaptive_target(self, target_duration=None):
            """Desired number of workers based on the current workload
    
            This looks at the current running tasks and memory use, and returns a
            number of desired workers.  This is often used by adaptive scheduling.
    
            Parameters
            ----------
            target_duration : str
                A desired duration of time for computations to take.  This affects
                how rapidly the scheduler will ask to scale.
    
            See Also
            --------
            distributed.deploy.Adaptive
            """
            if target_duration is None:
                target_duration = dask.config.get("distributed.adaptive.target-duration")
            target_duration = parse_timedelta(target_duration)
    
            # CPU
            queued = take(100, concat([self.queued, self.unrunnable.keys()]))
            queued_occupancy = 0
            for ts in queued:
                queued_occupancy += self._get_prefix_duration(ts.prefix)
    
            tasks_ready = len(self.queued) + len(self.unrunnable)
            if tasks_ready > 100:
                queued_occupancy *= tasks_ready / 100
    
            cpu = math.ceil((self.total_occupancy + queued_occupancy) / target_duration)
    
            # Avoid a few long tasks from asking for many cores
            for ws in self.workers.values():
                if tasks_ready > cpu:
                    break
                tasks_ready += len(ws.processing)
            else:
                cpu = min(tasks_ready, cpu)
    
            # Divide by average nthreads per worker
            if self.workers:
                nthreads = sum(ws.nthreads for ws in self.workers.values())
                cpu = math.ceil(cpu / nthreads * len(self.workers))
    
            if (self.unrunnable or self.queued) and not self.workers:
                cpu = max(1, cpu)
    
            # add more workers if more than 60% of memory is used
            limit = sum(ws.memory_limit for ws in self.workers.values())
            used = sum(ws.nbytes for ws in self.workers.values())
            memory = 0
            if used > 0.6 * limit and limit > 0:
                memory = 2 * len(self.workers)
    
            target = max(memory, cpu)
            if target >= len(self.workers):
                return target
            else:  # Scale down?
                to_close = self.workers_to_close()
                return len(self.workers) - len(to_close)
    
        def request_acquire_replicas(
            self, addr: str, keys: Iterable[Key], *, stimulus_id: str
        ) -> None:
            """Asynchronously ask a worker to acquire a replica of the listed keys from
            other workers. This is a fire-and-forget operation which offers no feedback for
            success or failure, and is intended for housekeeping and not for computation.
            """
            who_has = {}
            nbytes = {}
            for key in keys:
                ts = self.tasks[key]
                assert ts.who_has
                who_has[key] = [ws.address for ws in ts.who_has or ()]
                nbytes[key] = ts.nbytes
    
            self.stream_comms[addr].send(
                {
                    "op": "acquire-replicas",
                    "who_has": who_has,
                    "nbytes": nbytes,
                    "stimulus_id": stimulus_id,
                },
            )
    
        def request_remove_replicas(
            self, addr: str, keys: list[Key], *, stimulus_id: str
        ) -> None:
            """Asynchronously ask a worker to discard its replica of the listed keys.
            This must never be used to destroy the last replica of a key. This is a
            fire-and-forget operation, intended for housekeeping and not for computation.
    
            The replica disappears immediately from TaskState.who_has on the Scheduler side;
            if the worker refuses to delete, e.g. because the task is a dependency of
            another task running on it, it will (also asynchronously) inform the scheduler
            to re-add itself to who_has. If the worker agrees to discard the task, there is
            no feedback.
            """
            ws = self.workers[addr]
    
            # The scheduler immediately forgets about the replica and suggests the worker to
            # drop it. The worker may refuse, at which point it will send back an add-keys
            # message to reinstate it.
            for key in keys:
                ts = self.tasks[key]
                if self.validate:
                    # Do not destroy the last copy
                    assert ts.who_has
                    assert len(ts.who_has) > 1
                self.remove_replica(ts, ws)
    
            self.stream_comms[addr].send(
                {
                    "op": "remove-replicas",
                    "keys": keys,
                    "stimulus_id": stimulus_id,
                }
            )
    
    
    def _task_to_report_msg(ts: TaskState) -> dict[str, Any] | None:
        if ts.state == "forgotten":
            return {"op": "cancelled-keys", "keys": [ts.key]}
        elif ts.state == "memory":
            return {"op": "key-in-memory", "key": ts.key}
        elif ts.state == "erred":
            failing_ts = ts.exception_blame
            assert failing_ts
            return {
                "op": "task-erred",
                "key": ts.key,
                "exception": failing_ts.exception,
                "traceback": failing_ts.traceback,
            }
        else:
            return None
    
    
    def _task_to_client_msgs(ts: TaskState) -> Msgs:
        if ts.who_wants:
            report_msg = _task_to_report_msg(ts)
            if report_msg is not None:
                return {cs.client_key: [report_msg] for cs in ts.who_wants}
        return {}
    
    
    def decide_worker(
        ts: TaskState,
        all_workers: set[WorkerState],
        valid_workers: set[WorkerState] | None,
        objective: Callable[[WorkerState], Any],
    ) -> WorkerState | None:
        """
        Decide which worker should take task *ts*.
    
        We choose the worker that has the data on which *ts* depends.
    
        If several workers have dependencies then we choose the less-busy worker.
    
        Optionally provide *valid_workers* of where jobs are allowed to occur
        (if all workers are allowed to take the task, pass None instead).
    
        If the task requires data communication because no eligible worker has
        all the dependencies already, then we choose to minimize the number
        of bytes sent between workers.  This is determined by calling the
        *objective* function.
        """
        assert all(dts.who_has for dts in ts.dependencies)
        if ts.actor:
            candidates = all_workers.copy()
        else:
            candidates = {wws for dts in ts.dependencies for wws in dts.who_has or ()}
            candidates &= all_workers
        if valid_workers is None:
            if not candidates:
                candidates = all_workers.copy()
        else:
            candidates &= valid_workers
            if not candidates:
                candidates = valid_workers
                if not candidates:
                    if ts.loose_restrictions:
                        return decide_worker(ts, all_workers, None, objective)
    
        if not candidates:
            return None
        elif len(candidates) == 1:
            return next(iter(candidates))
        else:
            return min(candidates, key=objective)
    
    
    def validate_task_state(ts: TaskState) -> None:
        """Validate the given TaskState"""
        assert ts.state in ALL_TASK_STATES, ts
    
        if ts.waiting_on:
            assert ts.waiting_on.issubset(ts.dependencies), (
                "waiting not subset of dependencies",
                str(ts.waiting_on),
                str(ts.dependencies),
            )
        if ts.waiters:
            assert ts.waiters.issubset(ts.dependents), (
                "waiters not subset of dependents",
                str(ts.waiters),
                str(ts.dependents),
            )
    
        for dts in ts.waiting_on or ():
            assert not dts.who_has, ("waiting on in-memory dep", str(ts), str(dts))
            assert dts.state != "released", ("waiting on released dep", str(ts), str(dts))
        for dts in ts.dependencies:
            assert ts in dts.dependents, (
                "not in dependency's dependents",
                str(ts),
                str(dts),
                str(dts.dependents),
            )
            if ts.state in ("waiting", "queued", "processing", "no-worker"):
                assert ts.waiting_on and dts in ts.waiting_on or dts.who_has, (
                    "dep missing",
                    str(ts),
                    str(dts),
                )
            assert dts.state != "forgotten"
    
        for dts in ts.waiters or ():
            assert dts.state in ("waiting", "queued", "processing", "no-worker"), (
                "waiter not in play",
                str(ts),
                str(dts),
            )
        for dts in ts.dependents:
            assert ts in dts.dependencies, (
                "not in dependent's dependencies",
                str(ts),
                str(dts),
                str(dts.dependencies),
            )
            assert dts.state != "forgotten"
    
        assert (ts.processing_on is not None) == (ts.state == "processing")
        assert bool(ts.who_has) == (ts.state == "memory"), (ts, ts.who_has, ts.state)
    
        if ts.state == "queued":
            assert not ts.processing_on
            assert not ts.who_has
            assert all(dts.who_has for dts in ts.dependencies), (
                "task queued without all deps",
                str(ts),
                str(ts.dependencies),
            )
    
        if ts.state == "processing":
            assert all(dts.who_has for dts in ts.dependencies), (
                "task processing without all deps",
                str(ts),
                str(ts.dependencies),
            )
            assert not ts.waiting_on
    
        if ts.who_has:
            assert ts.waiters or ts.who_wants, (
                "unneeded task in memory",
                str(ts),
                str(ts.who_has),
            )
            if ts.run_spec:  # was computed
                assert ts.type
                assert isinstance(ts.type, str)
            assert not any(
                [
                    ts in dts.waiting_on
                    for dts in ts.dependents
                    if dts.waiting_on is not None
                ]
            )
            for ws in ts.who_has:
                assert ts in ws.has_what, (
                    "not in who_has' has_what",
                    str(ts),
                    str(ws),
                    str(ws.has_what),
                )
    
        for cs in ts.who_wants or ():
            assert ts in cs.wants_what, (
                "not in who_wants' wants_what",
                str(ts),
                str(cs),
                str(cs.wants_what),
            )
    
        if ts.actor:
            if ts.state == "memory":
                assert ts.who_has
                assert sum(ts in ws.actors for ws in ts.who_has) == 1
            if ts.state == "processing":
                assert ts.processing_on
                assert ts in ts.processing_on.actors
            assert ts.state != "queued"
    
    
    def validate_unrunnable(unrunnable: dict[TaskState, float]) -> None:
        prev_unrunnable_since: float | None = None
        prev_ts: TaskState | None = None
        for ts, unrunnable_since in unrunnable.items():
            assert ts.state == "no-worker"
            if prev_ts is not None:
                assert prev_unrunnable_since is not None
                # Ensure that unrunnable_since is monotonically increasing when iterating over unrunnable.
                # _check_no_workers relies on this.
                assert prev_unrunnable_since <= unrunnable_since, (
                    prev_ts,
                    ts,
                    prev_unrunnable_since,
                    unrunnable_since,
                )
            prev_ts = ts
            prev_unrunnable_since = unrunnable_since
    
    
    def validate_worker_state(ws: WorkerState) -> None:
        for ts in ws.has_what or ():
            assert ts.who_has
            assert ws in ts.who_has, (
                "not in has_what' who_has",
                str(ws),
                str(ts),
                str(ts.who_has),
            )
    
        for ts in ws.actors:
            assert ts.state in ("memory", "processing")
    
    
    def validate_state(
        tasks: dict[Key, TaskState],
        workers: dict[str, WorkerState],
        clients: dict[str, ClientState],
    ) -> None:
        """Validate a current runtime state.
    
        This performs a sequence of checks on the entire graph, running in about linear
        time. This raises assert errors if anything doesn't check out.
        """
        for ts in tasks.values():
            validate_task_state(ts)
    
        for ws in workers.values():
            validate_worker_state(ws)
    
        for cs in clients.values():
            for ts in cs.wants_what or ():
                assert ts.who_wants
                assert cs in ts.who_wants, (
                    "not in wants_what' who_wants",
                    str(cs),
                    str(ts),
                    str(ts.who_wants),
                )
    
    
    def heartbeat_interval(n: int) -> float:
        """Interval in seconds that we desire heartbeats based on number of workers"""
        if n <= 10:
            return 0.5
        elif n < 50:
            return 1
        elif n < 200:
            return 2
        else:
            # No more than 200 heartbeats a second scaled by workers
            return n / 200 + 1
    
    
    def _task_slots_available(ws: WorkerState, saturation_factor: float) -> int:
        """Number of tasks that can be sent to this worker without oversaturating it"""
        assert not math.isinf(saturation_factor)
        return max(math.ceil(saturation_factor * ws.nthreads), 1) - (
            len(ws.processing) - len(ws.long_running)
        )
    
    
    def _worker_full(ws: WorkerState, saturation_factor: float) -> bool:
        if math.isinf(saturation_factor):
            return False
        return _task_slots_available(ws, saturation_factor) <= 0
    
    
    class KilledWorker(Exception):
        def __init__(self, task: Key, last_worker: WorkerState, allowed_failures: int):
            super().__init__(task, last_worker, allowed_failures)
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def last_worker(self) -> WorkerState:
            return self.args[1]
    
        @property
        def allowed_failures(self) -> int:
            return self.args[2]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} on {self.allowed_failures + 1} "
                "different workers, but all those workers died while running it. "
                f"The last worker that attempt to run the task was {self.last_worker.address}. "
                "Inspecting worker logs is often a good next step to diagnose what went wrong. "
                "For more information see https://distributed.dask.org/en/stable/killed.html."
            )
    
    
    class NoValidWorkerError(Exception):
        def __init__(
            self,
            task: Key,
            host_restrictions: set[str],
            worker_restrictions: set[str],
            resource_restrictions: dict[str, float],
            timeout: float,
        ):
            super().__init__(
                task, host_restrictions, worker_restrictions, resource_restrictions, timeout
            )
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def host_restrictions(self) -> Any:
            return self.args[1]
    
        @property
        def worker_restrictions(self) -> Any:
            return self.args[2]
    
        @property
        def resource_restrictions(self) -> Any:
            return self.args[3]
    
        @property
        def timeout(self) -> float:
            return self.args[4]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} but timed out after {format_time(self.timeout)} "
                "waiting for a valid worker matching all restrictions.\n\nRestrictions:\n"
                f"host_restrictions={self.host_restrictions!s}\n"
                f"worker_restrictions={self.worker_restrictions!s}\n"
                f"resource_restrictions={self.resource_restrictions!s}\n"
            )
    
    
    class NoWorkerError(Exception):
        def __init__(self, task: Key, timeout: float):
            super().__init__(task, timeout)
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def timeout(self) -> float:
            return self.args[1]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} but timed out after {format_time(self.timeout)} "
                "waiting without any running workers."
            )
    
    
    class WorkerStatusPlugin(SchedulerPlugin):
        """A plugin to share worker status with a remote observer
    
        This is used in cluster managers to keep updated about the status of the scheduler.
        """
    
        name: ClassVar[str] = "worker-status"
        bcomm: BatchedSend
    
        def __init__(self, scheduler: Scheduler, comm: Comm):
            self.bcomm = BatchedSend(interval="5ms")
            self.bcomm.start(comm)
            scheduler.add_plugin(self)
    
        def add_worker(self, scheduler: Scheduler, worker: str) -> None:
            ident = scheduler.workers[worker].identity()
            del ident["metrics"]
            del ident["last_seen"]
            try:
                self.bcomm.send(["add", {"workers": {worker: ident}}])
            except CommClosedError:
                scheduler.remove_plugin(name=self.name)
    
        def remove_worker(self, scheduler: Scheduler, worker: str, **kwargs: Any) -> None:
            try:
                self.bcomm.send(["remove", worker])
            except CommClosedError:
                scheduler.remove_plugin(name=self.name)
    
        def teardown(self) -> None:
            self.bcomm.close()
    
    
    class CollectTaskMetaDataPlugin(SchedulerPlugin):
        scheduler: Scheduler
        name: str
        keys: set[Key]
        metadata: dict[Key, Any]
        state: dict[Key, TaskStateState]
    
        def __init__(self, scheduler: Scheduler, name: str):
            self.scheduler = scheduler
            self.name = name
            self.keys = set()
            self.metadata = {}
            self.state = {}
    
        def update_graph(
            self,
            scheduler: Scheduler,
            *,
            keys: set[Key],
            **kwargs: Any,
        ) -> None:
            self.keys.update(keys)
    
        def transition(
            self,
            key: Key,
            start: TaskStateState,
            finish: TaskStateState,
            *args: Any,
            **kwargs: Any,
        ) -> None:
            if finish in ("memory", "erred"):
                ts = self.scheduler.tasks.get(key)
                if ts is not None and ts.key in self.keys:
                    self.metadata[key] = ts.metadata
                    self.state[key] = finish
                    self.keys.discard(key)
    
    
    def _materialize_graph(
        expr: Expr,
        global_annotations: dict[str, Any],
        validate: bool,
    ) -> tuple[dict[Key, T_runspec], dict[Key, set[Key]], dict[str, dict[Key, Any]]]:
>       dsk: dict = expr.__dask_graph__()
E       AttributeError: 'dict' object has no attribute '__dask_graph__'

distributed/scheduler.py:9383: AttributeError

Check warning on line 0 in distributed.cli.tests.test_dask_worker

github-actions / Unit Test Results

1 out of 5 runs failed: test_respect_host_listen_address[0.0.0.0---no-nanny] (distributed.cli.tests.test_dask_worker)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 0s]

Raw output


            AttributeError: 'dict' object has no attribute '__dask_graph__'
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:42257', workers: 0, cores: 0, tasks: 0>
nanny = '--no-nanny', host = '0.0.0.0'

    @pytest.mark.slow
    @pytest.mark.skipif(not LINUX, reason="Need 127.0.0.2 to mean localhost")
    @pytest.mark.parametrize("nanny", ["--nanny", "--no-nanny"])
    @pytest.mark.parametrize("host", ["127.0.0.2", "0.0.0.0"])
    @gen_cluster(client=True, nthreads=[])
    async def test_respect_host_listen_address(c, s, nanny, host):
        with popen(
            [
                sys.executable,
                "-m",
                "dask",
                "worker",
                s.address,
                nanny,
                "--no-dashboard",
                "--host",
                host,
            ]
        ):
            await c.wait_for_workers(1)
    
            # roundtrip works
>           assert await c.submit(lambda x: x + 1, 10) == 11

distributed/cli/tests/test_dask_worker.py:569: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
distributed/client.py:408: in _result
    raise exc.with_traceback(tb)
distributed/utils.py:1507: in run_in_executor_with_context
    return await loop.run_in_executor(
../../../miniconda3/envs/dask-distributed/lib/python3.10/concurrent/futures/thread.py:58: in run
    result = self.fn(*self.args, **self.kwargs)
distributed/utils.py:1508: in <lambda>
    executor, lambda: context.run(func, *args, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import dataclasses
    import heapq
    import inspect
    import itertools
    import json
    import logging
    import math
    import operator
    import os
    import pickle
    import random
    import textwrap
    import uuid
    import warnings
    import weakref
    from abc import abstractmethod
    from collections import defaultdict, deque
    from collections.abc import (
        Callable,
        Collection,
        Container,
        Hashable,
        Iterable,
        Iterator,
        Mapping,
        Sequence,
        Set,
    )
    from contextlib import suppress
    from functools import partial
    from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple, cast, overload
    
    import psutil
    import tornado.web
    from sortedcontainers import SortedDict, SortedSet
    from tlz import (
        concat,
        first,
        groupby,
        merge,
        merge_sorted,
        merge_with,
        partition,
        pluck,
        second,
        take,
        valmap,
    )
    from tornado.ioloop import IOLoop
    
    import dask
    import dask.utils
    from dask._task_spec import DependenciesMapping, GraphNode, convert_legacy_graph
    from dask.core import istask, validate_key
    from dask.typing import Key, no_default
    from dask.utils import (
        _deprecated,
        _deprecated_kwarg,
        format_bytes,
        format_time,
        key_split,
        parse_bytes,
        parse_timedelta,
        tmpfile,
    )
    from dask.widgets import get_template
    
    from distributed import cluster_dump, preloading, profile
    from distributed import versions as version_module
    from distributed._asyncio import RLock
    from distributed._stories import scheduler_story
    from distributed.active_memory_manager import ActiveMemoryManagerExtension, RetireWorker
    from distributed.batched import BatchedSend
    from distributed.broker import Broker
    from distributed.client import SourceCode
    from distributed.collections import HeapSet
    from distributed.comm import (
        Comm,
        CommClosedError,
        get_address_host,
        normalize_address,
        resolve_address,
        unparse_host_port,
    )
    from distributed.comm.addressing import addresses_from_user_args
    from distributed.compatibility import PeriodicCallback
    from distributed.core import (
        ErrorMessage,
        OKMessage,
        Status,
        clean_exception,
        error_message,
        rpc,
        send_recv,
    )
    from distributed.diagnostics.memory_sampler import MemorySamplerExtension
    from distributed.diagnostics.plugin import SchedulerPlugin, _get_plugin_name
    from distributed.event import EventExtension
    from distributed.gc import disable_gc_diagnosis, enable_gc_diagnosis
    from distributed.http import get_handlers
    from distributed.metrics import monotonic, time
    from distributed.multi_lock import MultiLockExtension
    from distributed.node import ServerNode
    from distributed.proctitle import setproctitle
    from distributed.protocol import deserialize
    from distributed.protocol.pickle import dumps, loads
    from distributed.protocol.serialize import Serialized, ToPickle, serialize
    from distributed.publish import PublishExtension
    from distributed.pubsub import PubSubSchedulerExtension
    from distributed.queues import QueueExtension
    from distributed.recreate_tasks import ReplayTaskScheduler
    from distributed.security import Security
    from distributed.semaphore import SemaphoreExtension
    from distributed.shuffle import ShuffleSchedulerPlugin
    from distributed.spans import SpanMetadata, SpansSchedulerExtension
    from distributed.stealing import WorkStealing
    from distributed.utils import (
        All,
        Deadline,
        TimeoutError,
        format_dashboard_link,
        get_fileno_limit,
        key_split_group,
        log_errors,
        offload,
        recursive_to_dict,
        wait_for,
    )
    from distributed.utils_comm import (
        gather_from_workers,
        retry_operation,
        scatter_to_workers,
    )
    from distributed.variable import VariableExtension
    
    if TYPE_CHECKING:
        # TODO import from typing (requires Python >=3.10)
        # TODO import from typing (requires Python >=3.11)
        from typing_extensions import Self, TypeAlias
    
        from dask._expr import Expr
    
    # Not to be confused with distributed.worker_state_machine.TaskStateState
    TaskStateState: TypeAlias = Literal[
        "released",
        "waiting",
        "no-worker",
        "queued",
        "processing",
        "memory",
        "erred",
        "forgotten",
    ]
    
    ALL_TASK_STATES: Set[TaskStateState] = set(TaskStateState.__args__)  # type: ignore
    
    # {task key -> finish state}
    # Not to be confused with distributed.worker_state_machine.Recs
    Recs: TypeAlias = dict[Key, TaskStateState]
    # {client or worker address: [{op: <key>, ...}, ...]}
    Msgs: TypeAlias = dict[str, list[dict[str, Any]]]
    # (recommendations, client messages, worker messages)
    RecsMsgs: TypeAlias = tuple[Recs, Msgs, Msgs]
    
    T_runspec: TypeAlias = GraphNode
    
    logger = logging.getLogger(__name__)
    LOG_PDB = dask.config.get("distributed.admin.pdb-on-err")
    DEFAULT_DATA_SIZE = parse_bytes(
        dask.config.get("distributed.scheduler.default-data-size")
    )
    STIMULUS_ID_UNSET = "<stimulus_id unset>"
    
    DEFAULT_EXTENSIONS = {
        "multi_locks": MultiLockExtension,
        "publish": PublishExtension,
        "replay-tasks": ReplayTaskScheduler,
        "queues": QueueExtension,
        "variables": VariableExtension,
        "pubsub": PubSubSchedulerExtension,
        "semaphores": SemaphoreExtension,
        "events": EventExtension,
        "amm": ActiveMemoryManagerExtension,
        "memory_sampler": MemorySamplerExtension,
        "shuffle": ShuffleSchedulerPlugin,
        "spans": SpansSchedulerExtension,
        "stealing": WorkStealing,
    }
    
    
    class ClientState:
        """A simple object holding information about a client."""
    
        #: A unique identifier for this client. This is generally an opaque
        #: string generated by the client itself.
        client_key: str
    
        #: Cached hash of :attr:`~ClientState.client_key`
        _hash: int
    
        #: A set of tasks this client wants to be kept in memory, so that it can download
        #: its result when desired. This is the reverse mapping of
        #: :class:`TaskState.who_wants`. Tasks are typically removed from this set when the
        #: corresponding object in the client's space (for example a ``Future`` or a Dask
        #: collection) gets garbage-collected.
        wants_what: set[TaskState]
    
        #: The last time we received a heartbeat from this client, in local scheduler time.
        last_seen: float
    
        #: Output of :func:`distributed.versions.get_versions` on the client
        versions: dict[str, Any]
    
        __slots__ = tuple(__annotations__)
    
        def __init__(self, client: str, *, versions: dict[str, Any] | None = None):
            self.client_key = client
            self._hash = hash(client)
            self.wants_what = set()
            self.last_seen = time()
            self.versions = versions or {}
    
        def __hash__(self) -> int:
            return self._hash
    
        def __eq__(self, other: object) -> bool:
            if not isinstance(other, ClientState):
                return False
            return self.client_key == other.client_key
    
        def __repr__(self) -> str:
            return f"<Client {self.client_key!r}>"
    
        def __str__(self) -> str:
            return self.client_key
    
        def _to_dict_no_nest(self, *, exclude: Container[str] = ()) -> dict:
            """Dictionary representation for debugging purposes.
            Not type stable and not intended for roundtrips.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            TaskState._to_dict
            """
            return recursive_to_dict(
                self,
                exclude=set(exclude) | {"versions"},  # type: ignore
                members=True,
            )
    
    
    class MemoryState:
        """Memory readings on a worker or on the whole cluster.
    
        See :doc:`worker-memory`.
    
        Attributes / properties:
    
        managed_total
            Sum of the output of sizeof() for all dask keys held by the worker in memory,
            plus number of bytes spilled to disk
    
        managed
            Sum of the output of sizeof() for the dask keys held in RAM. Note that this may
            be inaccurate, which may cause inaccurate unmanaged memory (see below).
    
        spilled
            Number of bytes  for the dask keys spilled to the hard drive.
            Note that this is the size on disk; size in memory may be different due to
            compression and inaccuracies in sizeof(). In other words, given the same keys,
            'managed' will change depending on the keys being in memory or spilled.
    
        process
            Total RSS memory measured by the OS on the worker process.
            This is always exactly equal to managed + unmanaged.
    
        unmanaged
            process - managed. This is the sum of
    
            - Python interpreter and modules
            - global variables
            - memory temporarily allocated by the dask tasks that are currently running
            - memory fragmentation
            - memory leaks
            - memory not yet garbage collected
            - memory not yet free()'d by the Python memory manager to the OS
    
        unmanaged_old
            Minimum of the 'unmanaged' measures over the last
            ``distributed.memory.recent-to-old-time`` seconds
    
        unmanaged_recent
            unmanaged - unmanaged_old; in other words process memory that has been recently
            allocated but is not accounted for by dask; hopefully it's mostly a temporary
            spike.
    
        optimistic
            managed + unmanaged_old; in other words the memory held long-term by
            the process under the hopeful assumption that all unmanaged_recent memory is a
            temporary spike
        """
    
        process: int
        unmanaged_old: int
        managed: int
        spilled: int
    
        __slots__ = tuple(__annotations__)
    
        def __init__(
            self,
            *,
            process: int,
            unmanaged_old: int,
            managed: int,
            spilled: int,
        ):
            # Some data arrives with the heartbeat, some other arrives in realtime as the
            # tasks progress. Also, sizeof() is not guaranteed to return correct results.
            # This can cause glitches where a partial measure is larger than the whole, so
            # we need to force all numbers to add up exactly by definition.
            self.process = process
            self.managed = min(self.process, managed)
            self.spilled = spilled
            # Subtractions between unsigned ints guaranteed by construction to be >= 0
            self.unmanaged_old = min(unmanaged_old, process - self.managed)
    
        @staticmethod
        def sum(*infos: MemoryState) -> MemoryState:
            process = 0
            unmanaged_old = 0
            managed = 0
            spilled = 0
            for ms in infos:
                process += ms.process
                unmanaged_old += ms.unmanaged_old
                spilled += ms.spilled
                managed += ms.managed
            return MemoryState(
                process=process,
                unmanaged_old=unmanaged_old,
                managed=managed,
                spilled=spilled,
            )
    
        @property
        def managed_total(self) -> int:
            return self.managed + self.spilled
    
        @property
        def unmanaged(self) -> int:
            # This is never negative thanks to __init__
            return self.process - self.managed
    
        @property
        def unmanaged_recent(self) -> int:
            # This is never negative thanks to __init__
            return self.process - self.managed - self.unmanaged_old
    
        @property
        def optimistic(self) -> int:
            return self.managed + self.unmanaged_old
    
        @property
        def managed_in_memory(self) -> int:
            warnings.warn("managed_in_memory has been renamed to managed", FutureWarning)
            return self.managed
    
        @property
        def managed_spilled(self) -> int:
            warnings.warn("managed_spilled has been renamed to spilled", FutureWarning)
            return self.spilled
    
        def __repr__(self) -> str:
            return (
                f"Process memory (RSS)  : {format_bytes(self.process)}\n"
                f"  - managed by Dask   : {format_bytes(self.managed)}\n"
                f"  - unmanaged (old)   : {format_bytes(self.unmanaged_old)}\n"
                f"  - unmanaged (recent): {format_bytes(self.unmanaged_recent)}\n"
                f"Spilled to disk       : {format_bytes(self.spilled)}\n"
            )
    
        def _to_dict(self, *, exclude: Container[str] = ()) -> dict:
            """Dictionary representation for debugging purposes.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            """
            return {
                k: getattr(self, k)
                for k in dir(self)
                if not k.startswith("_")
                and k not in {"sum", "managed_in_memory", "managed_spilled"}
            }
    
    
    class WorkerState:
        """A simple object holding information about a worker.
    
        Not to be confused with :class:`distributed.worker_state_machine.WorkerState`.
        """
    
        #: This worker's unique key. This can be its connected address
        #: (such as ``"tcp://127.0.0.1:8891"``) or an alias (such as ``"alice"``).
        address: str
    
        pid: int
        name: Hashable
    
        #: The number of CPU threads made available on this worker
        nthreads: int
    
        #: Memory available to the worker, in bytes
        memory_limit: int
    
        local_directory: str
        services: dict[str, int]
    
        #: Output of :meth:`distributed.versions.get_versions` on the worker
        versions: dict[str, Any]
    
        #: Address of the associated :class:`~distributed.nanny.Nanny`, if present
        nanny: str | None
    
        #: Read-only worker status, synced one way from the remote Worker object
        status: Status
    
        #: Cached hash of :attr:`~WorkerState.server_id`
        _hash: int
    
        #: The total memory size, in bytes, used by the tasks this worker holds in memory
        #: (i.e. the tasks in this worker's :attr:`~WorkerState.has_what`).
        nbytes: int
    
        #: Worker memory unknown to the worker, in bytes, which has been there for more than
        #: 30 seconds. See :class:`MemoryState`.
        _memory_unmanaged_old: int
    
        #: History of the last 30 seconds' worth of unmanaged memory. Used to differentiate
        #: between "old" and "new" unmanaged memory.
        #: Format: ``[(timestamp, bytes), (timestamp, bytes), ...]``
        _memory_unmanaged_history: deque[tuple[float, int]]
    
        metrics: dict[str, Any]
    
        #: The last time we received a heartbeat from this worker, in local scheduler time.
        last_seen: float
    
        time_delay: float
        bandwidth: float
    
        #: A set of all TaskStates on this worker that are actors. This only includes those
        #: actors whose state actually lives on this worker, not actors to which this worker
        #: has a reference.
        actors: set[TaskState]
    
        #: Underlying data of :meth:`WorkerState.has_what`
        _has_what: dict[TaskState, None]
    
        #: A set of tasks that have been submitted to this worker. Multiple tasks may be
        # submitted to a worker in advance and the worker will run them eventually,
        # depending on its execution resources (but see :doc:`work-stealing`).
        #:
        #: All the tasks here are in the "processing" state.
        #: This attribute is kept in sync with :attr:`TaskState.processing_on`.
        processing: set[TaskState]
    
        #: Running tasks that invoked :func:`distributed.secede`
        long_running: set[TaskState]
    
        #: A dictionary of tasks that are currently being run on this worker.
        #: Each task state is associated with the duration in seconds which the task has
        #: been running.
        executing: dict[TaskState, float]
    
        #: The available resources on this worker, e.g. ``{"GPU": 2}``.
        #: These are abstract quantities that constrain certain tasks from running at the
        #: same time on this worker.
        resources: dict[str, float]
    
        #: The sum of each resource used by all tasks allocated to this worker.
        #: The numbers in this dictionary can only be less or equal than those in this
        #: worker's :attr:`~WorkerState.resources`.
        used_resources: dict[str, float]
    
        #: Arbitrary additional metadata to be added to :meth:`~WorkerState.identity`
        extra: dict[str, Any]
    
        # The unique server ID this WorkerState is referencing
        server_id: str
    
        # Reference to scheduler task_groups
        scheduler_ref: weakref.ref[SchedulerState] | None
        task_prefix_count: defaultdict[str, int]
        _network_occ: int
        _occupancy_cache: float | None
    
        #: Keys that may need to be fetched to this worker, and the number of tasks that need them.
        #: All tasks are currently in `memory` on a worker other than this one.
        #: Much like `processing`, this does not exactly reflect worker state:
        #: keys here may be queued to fetch, in flight, or already in memory
        #: on the worker.
        needs_what: dict[TaskState, int]
    
        __slots__ = tuple(__annotations__)
    
        def __init__(
            self,
            *,
            address: str,
            status: Status,
            pid: int,
            name: object,
            nthreads: int = 0,
            memory_limit: int,
            local_directory: str,
            nanny: str | None,
            server_id: str,
            services: dict[str, int] | None = None,
            versions: dict[str, Any] | None = None,
            extra: dict[str, Any] | None = None,
            scheduler: SchedulerState | None = None,
        ):
            self.server_id = server_id
            self.address = address
            self.pid = pid
            self.name = name
            self.nthreads = nthreads
            self.memory_limit = memory_limit
            self.local_directory = local_directory
            self.services = services or {}
            self.versions = versions or {}
            self.nanny = nanny
            self.status = status
            self._hash = hash(self.server_id)
            self.nbytes = 0
            self._memory_unmanaged_old = 0
            self._memory_unmanaged_history = deque()
            self.metrics = {}
            self.last_seen = time()
            self.time_delay = 0
            self.bandwidth = parse_bytes(dask.config.get("distributed.scheduler.bandwidth"))
            self.actors = set()
            self._has_what = {}
            self.processing = set()
            self.long_running = set()
            self.executing = {}
            self.resources = {}
            self.used_resources = {}
            self.extra = extra or {}
            self.scheduler_ref = weakref.ref(scheduler) if scheduler else None
            self.task_prefix_count = defaultdict(int)
            self.needs_what = {}
            self._network_occ = 0
            self._occupancy_cache = None
    
        def __hash__(self) -> int:
            return self._hash
    
        def __eq__(self, other: object) -> bool:
            return self is other or (
                isinstance(other, WorkerState) and other.server_id == self.server_id
            )
    
        @property
        def has_what(self) -> Set[TaskState]:
            """An insertion-sorted set-like of tasks which currently reside on this worker.
            All the tasks here are in the "memory" state.
            This is the reverse mapping of :attr:`TaskState.who_has`.
    
            This is a read-only public accessor. The data is implemented as a dict without
            values, because rebalance() relies on dicts being insertion-sorted.
            """
            return self._has_what.keys()
    
        @property
        def host(self) -> str:
            return get_address_host(self.address)
    
        @property
        def memory(self) -> MemoryState:
            """Polished memory metrics for the worker.
    
            **Design note on managed memory**
    
            There are two measures available for managed memory:
    
            - ``self.nbytes``
            - ``self.metrics["managed_bytes"]``
    
            At rest, the two numbers must be identical. However, ``self.nbytes`` is
            immediately updated through the batched comms as soon as each task lands in
            memory on the worker; ``self.metrics["managed_bytes"]`` instead is updated by
            the heartbeat, which can lag several seconds behind.
    
            Below we are mixing likely newer managed memory info from ``self.nbytes`` with
            process and spilled memory from the heartbeat. This is deliberate, so that
            managed memory total is updated more frequently.
    
            Managed memory directly and immediately contributes to optimistic memory, which
            is in turn used in Active Memory Manager heuristics (at the moment of writing;
            more uses will likely be added in the future). So it's important to have it
            up to date; much more than it is for process memory.
    
            Having up-to-date managed memory info as soon as the scheduler learns about
            task completion also substantially simplifies unit tests.
    
            The flip side of this design is that it may cause some noise in the
            unmanaged_recent measure. e.g.:
    
            1. Delete 100MB of managed data
            2. The updated managed memory reaches the scheduler faster than the
               updated process memory
            3. There's a blip where the scheduler thinks that there's a sudden 100MB
               increase in unmanaged_recent, since process memory hasn't changed but managed
               memory has decreased by 100MB
            4. When the heartbeat arrives, process memory goes down and so does the
               unmanaged_recent.
    
            This is OK - one of the main reasons for the unmanaged_recent / unmanaged_old
            split is exactly to concentrate all the noise in unmanaged_recent and exclude it
            from optimistic memory, which is used for heuristics.
    
            Something that is less OK, but also less frequent, is that the sudden deletion
            of spilled keys will cause a negative blip in managed memory:
    
            1. Delete 100MB of spilled data
            2. The updated managed memory *total* reaches the scheduler faster than the
               updated spilled portion
            3. This causes the managed memory to temporarily plummet and be replaced by
               unmanaged_recent, while spilled memory remains unaltered
            4. When the heartbeat arrives, managed goes back up, unmanaged_recent
               goes back down, and spilled goes down by 100MB as it should have to
               begin with.
    
            :issue:`6002` will let us solve this.
            """
            return MemoryState(
                process=self.metrics["memory"],
                managed=max(0, self.nbytes - self.metrics["spilled_bytes"]["memory"]),
                spilled=self.metrics["spilled_bytes"]["disk"],
                unmanaged_old=self._memory_unmanaged_old,
            )
    
        def clean(self) -> WorkerState:
            """Return a version of this object that is appropriate for serialization"""
            ws = WorkerState(
                address=self.address,
                status=self.status,
                pid=self.pid,
                name=self.name,
                nthreads=self.nthreads,
                memory_limit=self.memory_limit,
                local_directory=self.local_directory,
                services=self.services,
                nanny=self.nanny,
                extra=self.extra,
                server_id=self.server_id,
            )
            ws._occupancy_cache = self.occupancy
    
            ws.executing = {ts.key: duration for ts, duration in self.executing.items()}  # type: ignore
            return ws
    
        def __repr__(self) -> str:
            name = f", name: {self.name}" if self.name != self.address else ""
            return (
                f"<WorkerState {self.address!r}{name}, "
                f"status: {self.status.name}, "
                f"memory: {len(self.has_what)}, "
                f"processing: {len(self.processing)}>"
            )
    
        def _repr_html_(self) -> str:
            return get_template("worker_state.html.j2").render(
                address=self.address,
                name=self.name,
                status=self.status.name,
                has_what=self.has_what,
                processing=self.processing,
            )
    
        def identity(self) -> dict[str, Any]:
            return {
                "type": "Worker",
                "id": self.name,
                "host": self.host,
                "resources": self.resources,
                "local_directory": self.local_directory,
                "name": self.name,
                "nthreads": self.nthreads,
                "memory_limit": self.memory_limit,
                "last_seen": self.last_seen,
                "services": self.services,
                "metrics": self.metrics,
                "status": self.status.name,
                "nanny": self.nanny,
                **self.extra,
            }
    
        def _to_dict_no_nest(self, *, exclude: Container[str] = ()) -> dict[str, Any]:
            """Dictionary representation for debugging purposes.
            Not type stable and not intended for roundtrips.
    
            See also
            --------
            Client.dump_cluster_state
            distributed.utils.recursive_to_dict
            TaskState._to_dict
            """
            return recursive_to_dict(
                self,
                exclude=set(exclude) | {"versions"},  # type: ignore
                members=True,
            )
    
        @property
        def scheduler(self) -> SchedulerState:
            assert self.scheduler_ref
            s = self.scheduler_ref()
            assert s
            return s
    
        def add_to_processing(self, ts: TaskState) -> None:
            """Assign a task to this worker for compute."""
            if self.scheduler.validate:
                assert ts not in self.processing
    
            tp = ts.prefix
            self.task_prefix_count[tp.name] += 1
            self.scheduler._task_prefix_count_global[tp.name] += 1
            self.processing.add(ts)
            for dts in ts.dependencies:
                assert dts.who_has
                if self not in dts.who_has:
                    self._inc_needs_replica(dts)
    
        def add_to_long_running(self, ts: TaskState) -> None:
            if self.scheduler.validate:
                assert ts in self.processing
                assert ts not in self.long_running
    
            self._remove_from_task_prefix_count(ts)
            # Cannot remove from processing since we're using this for things like
            # idleness detection. Idle workers are typically targeted for
            # downscaling but we should not downscale workers with long running
            # tasks
            self.long_running.add(ts)
    
        def remove_from_processing(self, ts: TaskState) -> None:
            """Remove a task from a workers processing"""
            if self.scheduler.validate:
                assert ts in self.processing
    
            if ts in self.long_running:
                self.long_running.discard(ts)
            else:
                self._remove_from_task_prefix_count(ts)
            self.processing.remove(ts)
            for dts in ts.dependencies:
                if dts in self.needs_what:
                    self._dec_needs_replica(dts)
    
        def _remove_from_task_prefix_count(self, ts: TaskState) -> None:
            prefix_name = ts.prefix.name
            count = self.task_prefix_count[prefix_name] - 1
            tp_count = self.task_prefix_count
            tp_count_global = self.scheduler._task_prefix_count_global
            if count:
                tp_count[prefix_name] = count
            else:
                del tp_count[prefix_name]
    
            count = tp_count_global[prefix_name] - 1
            if count:
                tp_count_global[prefix_name] = count
            else:
                del tp_count_global[prefix_name]
    
        def remove_replica(self, ts: TaskState) -> None:
            """The worker no longer has a task in memory"""
            if self.scheduler.validate:
                assert ts.who_has
                assert self in ts.who_has
                assert ts in self.has_what
                assert ts not in self.needs_what
    
            self.nbytes -= ts.get_nbytes()
            del self._has_what[ts]
            ts.who_has.remove(self)  # type: ignore
            if not ts.who_has:
                ts.who_has = None
    
        def _inc_needs_replica(self, ts: TaskState) -> None:
            """Assign a task fetch to this worker and…cheduler, title="Scheduler Profile (administrative)"
            )
            task_stream = TabPanel(child=task_stream, title="Task Stream")
            bandwidth_workers = TabPanel(
                child=bandwidth_workers.root, title="Bandwidth (Workers)"
            )
            bandwidth_types = TabPanel(
                child=bandwidth_types.root, title="Bandwidth (Types)"
            )
            system = TabPanel(child=sysmon.root, title="System")
            logs = TabPanel(child=logs.root, title="Scheduler Logs")
    
            tabs = Tabs(
                tabs=[
                    html,
                    task_stream,
                    system,
                    logs,
                    compute,
                    workers,
                    scheduler,
                    bandwidth_workers,
                    bandwidth_types,
                ],
                sizing_mode="stretch_both",
            )
    
            from bokeh.core.templates import get_env
            from bokeh.plotting import output_file, save
    
            with tmpfile(extension=".html") as fn:
                output_file(filename=fn, title="Dask Performance Report", mode=mode)
                template_directory = os.path.join(
                    os.path.dirname(os.path.abspath(__file__)), "dashboard", "templates"
                )
                template_environment = get_env()
                template_environment.loader.searchpath.append(template_directory)
                template = template_environment.get_template("performance_report.html")
                save(tabs, filename=fn, template=template)
    
                with open(fn) as f:
                    data = f.read()
    
            return data
    
        async def get_worker_logs(self, n=None, workers=None, nanny=False):
            results = await self.broadcast(
                msg={"op": "get_logs", "n": n}, workers=workers, nanny=nanny
            )
            return results
    
        def log_event(self, topic: str | Collection[str], msg: Any) -> None:
            """Log an event under a given topic
    
            Parameters
            ----------
            topic : str, list[str]
                Name of the topic under which to log an event. To log the same
                event under multiple topics, pass a list of topic names.
            msg
                Event message to log. Note this must be msgpack serializable.
    
            See also
            --------
            Client.log_event
            """
            self._broker.publish(topic, msg)
    
        def subscribe_topic(self, topic: str, client: str) -> None:
            self._broker.subscribe(topic, client)
    
        def unsubscribe_topic(self, topic: str, client: str) -> None:
            self._broker.unsubscribe(topic, client)
    
        @overload
        def get_events(self, topic: str) -> tuple[tuple[float, Any], ...]: ...
    
        @overload
        def get_events(self) -> dict[str, tuple[tuple[float, Any], ...]]: ...
    
        def get_events(
            self, topic: str | None = None
        ) -> tuple[tuple[float, Any], ...] | dict[str, tuple[tuple[float, Any], ...]]:
            return self._broker.get_events(topic)
    
        async def get_worker_monitor_info(self, recent=False, starts=None):
            if starts is None:
                starts = {}
            results = await asyncio.gather(
                *(
                    self.rpc(w).get_monitor_info(recent=recent, start=starts.get(w, 0))
                    for w in self.workers
                )
            )
            return dict(zip(self.workers, results))
    
        ###########
        # Cleanup #
        ###########
    
        @log_errors
        async def check_worker_ttl(self) -> None:
            now = time()
            stimulus_id = f"check-worker-ttl-{now}"
            assert self.worker_ttl
            ttl = max(self.worker_ttl, 10 * heartbeat_interval(len(self.workers)))
            to_restart = []
    
            for ws in self.workers.values():
                last_seen = now - ws.last_seen
                if last_seen > ttl:
                    to_restart.append(ws.address)
                    logger.warning(
                        f"Worker failed to heartbeat for {last_seen:.0f}s; "
                        f"{'attempting restart' if ws.nanny else 'removing'}: {ws}"
                    )
    
            if to_restart:
                self.log_event(
                    "scheduler",
                    {
                        "action": "worker-ttl-timed-out",
                        "workers": to_restart.copy(),
                        "ttl": ttl,
                    },
                )
                await self.restart_workers(
                    to_restart,
                    wait_for_workers=False,
                    stimulus_id=stimulus_id,
                )
    
        def check_idle(self) -> float | None:
            if self.status in (Status.closing, Status.closed):
                return None  # pragma: nocover
    
            if self.transition_counter != self._idle_transition_counter:
                self._idle_transition_counter = self.transition_counter
                self.idle_since = None
                return None
    
            if self._active_graph_updates > 0:
                self.idle_since = None
                return None
    
            if (
                self.queued
                or self.unrunnable
                or any(ws.processing for ws in self.workers.values())
            ):
                self.idle_since = None
                return None
    
            if not self.idle_since:
                self.idle_since = time()
                return self.idle_since
    
            if self.jupyter:
                last_activity = (
                    self._jupyter_server_application.web_app.last_activity().timestamp()
                )
                if last_activity > self.idle_since:
                    self.idle_since = last_activity
                    return self.idle_since
    
            if self.idle_timeout:
                if time() > self.idle_since + self.idle_timeout:
                    assert self.idle_since
                    logger.info(
                        "Scheduler closing after being idle for %s",
                        format_time(self.idle_timeout),
                    )
                    self._ongoing_background_tasks.call_soon(
                        self.close, reason="idle-timeout-exceeded"
                    )
            return self.idle_since
    
        def _check_no_workers(self) -> None:
            if (
                self.status in (Status.closing, Status.closed)
                or self.no_workers_timeout is None
            ):
                return
    
            now = monotonic()
            stimulus_id = f"check-no-workers-timeout-{time()}"
    
            recommendations: Recs = {}
    
            self._refresh_no_workers_since(now)
    
            affected = self._check_unrunnable_task_timeouts(
                now, recommendations=recommendations, stimulus_id=stimulus_id
            )
    
            affected.update(
                self._check_queued_task_timeouts(
                    now, recommendations=recommendations, stimulus_id=stimulus_id
                )
            )
            self.transitions(recommendations, stimulus_id=stimulus_id)
            if affected:
                self.log_event(
                    "scheduler",
                    {"action": "no-workers-timeout-exceeded", "keys": affected},
                )
    
        def _check_unrunnable_task_timeouts(
            self, timestamp: float, recommendations: Recs, stimulus_id: str
        ) -> set[Key]:
            assert self.no_workers_timeout
            unsatisfied = []
            no_workers = []
            for ts, unrunnable_since in self.unrunnable.items():
                if timestamp <= unrunnable_since + self.no_workers_timeout:
                    # unrunnable is insertion-ordered, which means that unrunnable_since will
                    # be monotonically increasing in this loop.
                    break
                if (
                    self._no_workers_since is None
                    or self._no_workers_since >= unrunnable_since
                ):
                    unsatisfied.append(ts)
                else:
                    no_workers.append(ts)
            if not unsatisfied and not no_workers:
                return set()
    
            for ts in unsatisfied:
                e = pickle.dumps(
                    NoValidWorkerError(
                        task=ts.key,
                        host_restrictions=(ts.host_restrictions or set()).copy(),
                        worker_restrictions=(ts.worker_restrictions or set()).copy(),
                        resource_restrictions=(ts.resource_restrictions or {}).copy(),
                        timeout=self.no_workers_timeout,
                    ),
                )
                r = self.transition(
                    ts.key,
                    "erred",
                    exception=e,
                    cause=ts.key,
                    stimulus_id=stimulus_id,
                )
                recommendations.update(r)
                logger.error(
                    "Task %s marked as failed because it timed out waiting "
                    "for its restrictions to become satisfied.",
                    ts.key,
                )
            self._fail_tasks_after_no_workers_timeout(
                no_workers, recommendations, stimulus_id
            )
            return {ts.key for ts in concat([unsatisfied, no_workers])}
    
        def _check_queued_task_timeouts(
            self, timestamp: float, recommendations: Recs, stimulus_id: str
        ) -> set[Key]:
            assert self.no_workers_timeout
    
            if self._no_workers_since is None:
                return set()
    
            if timestamp <= self._no_workers_since + self.no_workers_timeout:
                return set()
            affected = list(self.queued)
            self._fail_tasks_after_no_workers_timeout(
                affected, recommendations, stimulus_id
            )
            return {ts.key for ts in affected}
    
        def _fail_tasks_after_no_workers_timeout(
            self, timed_out: Iterable[TaskState], recommendations: Recs, stimulus_id: str
        ) -> None:
            assert self.no_workers_timeout
    
            for ts in timed_out:
                e = pickle.dumps(
                    NoWorkerError(
                        task=ts.key,
                        timeout=self.no_workers_timeout,
                    ),
                )
                r = self.transition(
                    ts.key,
                    "erred",
                    exception=e,
                    cause=ts.key,
                    stimulus_id=stimulus_id,
                )
                recommendations.update(r)
                logger.error(
                    "Task %s marked as failed because it timed out waiting "
                    "without any running workers.",
                    ts.key,
                )
    
        def _refresh_no_workers_since(self, timestamp: float | None = None) -> None:
            if self.running or not (self.queued or self.unrunnable):
                self._no_workers_since = None
                return
    
            if not self._no_workers_since:
                self._no_workers_since = timestamp or monotonic()
                return
    
        def adaptive_target(self, target_duration=None):
            """Desired number of workers based on the current workload
    
            This looks at the current running tasks and memory use, and returns a
            number of desired workers.  This is often used by adaptive scheduling.
    
            Parameters
            ----------
            target_duration : str
                A desired duration of time for computations to take.  This affects
                how rapidly the scheduler will ask to scale.
    
            See Also
            --------
            distributed.deploy.Adaptive
            """
            if target_duration is None:
                target_duration = dask.config.get("distributed.adaptive.target-duration")
            target_duration = parse_timedelta(target_duration)
    
            # CPU
            queued = take(100, concat([self.queued, self.unrunnable.keys()]))
            queued_occupancy = 0
            for ts in queued:
                queued_occupancy += self._get_prefix_duration(ts.prefix)
    
            tasks_ready = len(self.queued) + len(self.unrunnable)
            if tasks_ready > 100:
                queued_occupancy *= tasks_ready / 100
    
            cpu = math.ceil((self.total_occupancy + queued_occupancy) / target_duration)
    
            # Avoid a few long tasks from asking for many cores
            for ws in self.workers.values():
                if tasks_ready > cpu:
                    break
                tasks_ready += len(ws.processing)
            else:
                cpu = min(tasks_ready, cpu)
    
            # Divide by average nthreads per worker
            if self.workers:
                nthreads = sum(ws.nthreads for ws in self.workers.values())
                cpu = math.ceil(cpu / nthreads * len(self.workers))
    
            if (self.unrunnable or self.queued) and not self.workers:
                cpu = max(1, cpu)
    
            # add more workers if more than 60% of memory is used
            limit = sum(ws.memory_limit for ws in self.workers.values())
            used = sum(ws.nbytes for ws in self.workers.values())
            memory = 0
            if used > 0.6 * limit and limit > 0:
                memory = 2 * len(self.workers)
    
            target = max(memory, cpu)
            if target >= len(self.workers):
                return target
            else:  # Scale down?
                to_close = self.workers_to_close()
                return len(self.workers) - len(to_close)
    
        def request_acquire_replicas(
            self, addr: str, keys: Iterable[Key], *, stimulus_id: str
        ) -> None:
            """Asynchronously ask a worker to acquire a replica of the listed keys from
            other workers. This is a fire-and-forget operation which offers no feedback for
            success or failure, and is intended for housekeeping and not for computation.
            """
            who_has = {}
            nbytes = {}
            for key in keys:
                ts = self.tasks[key]
                assert ts.who_has
                who_has[key] = [ws.address for ws in ts.who_has or ()]
                nbytes[key] = ts.nbytes
    
            self.stream_comms[addr].send(
                {
                    "op": "acquire-replicas",
                    "who_has": who_has,
                    "nbytes": nbytes,
                    "stimulus_id": stimulus_id,
                },
            )
    
        def request_remove_replicas(
            self, addr: str, keys: list[Key], *, stimulus_id: str
        ) -> None:
            """Asynchronously ask a worker to discard its replica of the listed keys.
            This must never be used to destroy the last replica of a key. This is a
            fire-and-forget operation, intended for housekeeping and not for computation.
    
            The replica disappears immediately from TaskState.who_has on the Scheduler side;
            if the worker refuses to delete, e.g. because the task is a dependency of
            another task running on it, it will (also asynchronously) inform the scheduler
            to re-add itself to who_has. If the worker agrees to discard the task, there is
            no feedback.
            """
            ws = self.workers[addr]
    
            # The scheduler immediately forgets about the replica and suggests the worker to
            # drop it. The worker may refuse, at which point it will send back an add-keys
            # message to reinstate it.
            for key in keys:
                ts = self.tasks[key]
                if self.validate:
                    # Do not destroy the last copy
                    assert ts.who_has
                    assert len(ts.who_has) > 1
                self.remove_replica(ts, ws)
    
            self.stream_comms[addr].send(
                {
                    "op": "remove-replicas",
                    "keys": keys,
                    "stimulus_id": stimulus_id,
                }
            )
    
    
    def _task_to_report_msg(ts: TaskState) -> dict[str, Any] | None:
        if ts.state == "forgotten":
            return {"op": "cancelled-keys", "keys": [ts.key]}
        elif ts.state == "memory":
            return {"op": "key-in-memory", "key": ts.key}
        elif ts.state == "erred":
            failing_ts = ts.exception_blame
            assert failing_ts
            return {
                "op": "task-erred",
                "key": ts.key,
                "exception": failing_ts.exception,
                "traceback": failing_ts.traceback,
            }
        else:
            return None
    
    
    def _task_to_client_msgs(ts: TaskState) -> Msgs:
        if ts.who_wants:
            report_msg = _task_to_report_msg(ts)
            if report_msg is not None:
                return {cs.client_key: [report_msg] for cs in ts.who_wants}
        return {}
    
    
    def decide_worker(
        ts: TaskState,
        all_workers: set[WorkerState],
        valid_workers: set[WorkerState] | None,
        objective: Callable[[WorkerState], Any],
    ) -> WorkerState | None:
        """
        Decide which worker should take task *ts*.
    
        We choose the worker that has the data on which *ts* depends.
    
        If several workers have dependencies then we choose the less-busy worker.
    
        Optionally provide *valid_workers* of where jobs are allowed to occur
        (if all workers are allowed to take the task, pass None instead).
    
        If the task requires data communication because no eligible worker has
        all the dependencies already, then we choose to minimize the number
        of bytes sent between workers.  This is determined by calling the
        *objective* function.
        """
        assert all(dts.who_has for dts in ts.dependencies)
        if ts.actor:
            candidates = all_workers.copy()
        else:
            candidates = {wws for dts in ts.dependencies for wws in dts.who_has or ()}
            candidates &= all_workers
        if valid_workers is None:
            if not candidates:
                candidates = all_workers.copy()
        else:
            candidates &= valid_workers
            if not candidates:
                candidates = valid_workers
                if not candidates:
                    if ts.loose_restrictions:
                        return decide_worker(ts, all_workers, None, objective)
    
        if not candidates:
            return None
        elif len(candidates) == 1:
            return next(iter(candidates))
        else:
            return min(candidates, key=objective)
    
    
    def validate_task_state(ts: TaskState) -> None:
        """Validate the given TaskState"""
        assert ts.state in ALL_TASK_STATES, ts
    
        if ts.waiting_on:
            assert ts.waiting_on.issubset(ts.dependencies), (
                "waiting not subset of dependencies",
                str(ts.waiting_on),
                str(ts.dependencies),
            )
        if ts.waiters:
            assert ts.waiters.issubset(ts.dependents), (
                "waiters not subset of dependents",
                str(ts.waiters),
                str(ts.dependents),
            )
    
        for dts in ts.waiting_on or ():
            assert not dts.who_has, ("waiting on in-memory dep", str(ts), str(dts))
            assert dts.state != "released", ("waiting on released dep", str(ts), str(dts))
        for dts in ts.dependencies:
            assert ts in dts.dependents, (
                "not in dependency's dependents",
                str(ts),
                str(dts),
                str(dts.dependents),
            )
            if ts.state in ("waiting", "queued", "processing", "no-worker"):
                assert ts.waiting_on and dts in ts.waiting_on or dts.who_has, (
                    "dep missing",
                    str(ts),
                    str(dts),
                )
            assert dts.state != "forgotten"
    
        for dts in ts.waiters or ():
            assert dts.state in ("waiting", "queued", "processing", "no-worker"), (
                "waiter not in play",
                str(ts),
                str(dts),
            )
        for dts in ts.dependents:
            assert ts in dts.dependencies, (
                "not in dependent's dependencies",
                str(ts),
                str(dts),
                str(dts.dependencies),
            )
            assert dts.state != "forgotten"
    
        assert (ts.processing_on is not None) == (ts.state == "processing")
        assert bool(ts.who_has) == (ts.state == "memory"), (ts, ts.who_has, ts.state)
    
        if ts.state == "queued":
            assert not ts.processing_on
            assert not ts.who_has
            assert all(dts.who_has for dts in ts.dependencies), (
                "task queued without all deps",
                str(ts),
                str(ts.dependencies),
            )
    
        if ts.state == "processing":
            assert all(dts.who_has for dts in ts.dependencies), (
                "task processing without all deps",
                str(ts),
                str(ts.dependencies),
            )
            assert not ts.waiting_on
    
        if ts.who_has:
            assert ts.waiters or ts.who_wants, (
                "unneeded task in memory",
                str(ts),
                str(ts.who_has),
            )
            if ts.run_spec:  # was computed
                assert ts.type
                assert isinstance(ts.type, str)
            assert not any(
                [
                    ts in dts.waiting_on
                    for dts in ts.dependents
                    if dts.waiting_on is not None
                ]
            )
            for ws in ts.who_has:
                assert ts in ws.has_what, (
                    "not in who_has' has_what",
                    str(ts),
                    str(ws),
                    str(ws.has_what),
                )
    
        for cs in ts.who_wants or ():
            assert ts in cs.wants_what, (
                "not in who_wants' wants_what",
                str(ts),
                str(cs),
                str(cs.wants_what),
            )
    
        if ts.actor:
            if ts.state == "memory":
                assert ts.who_has
                assert sum(ts in ws.actors for ws in ts.who_has) == 1
            if ts.state == "processing":
                assert ts.processing_on
                assert ts in ts.processing_on.actors
            assert ts.state != "queued"
    
    
    def validate_unrunnable(unrunnable: dict[TaskState, float]) -> None:
        prev_unrunnable_since: float | None = None
        prev_ts: TaskState | None = None
        for ts, unrunnable_since in unrunnable.items():
            assert ts.state == "no-worker"
            if prev_ts is not None:
                assert prev_unrunnable_since is not None
                # Ensure that unrunnable_since is monotonically increasing when iterating over unrunnable.
                # _check_no_workers relies on this.
                assert prev_unrunnable_since <= unrunnable_since, (
                    prev_ts,
                    ts,
                    prev_unrunnable_since,
                    unrunnable_since,
                )
            prev_ts = ts
            prev_unrunnable_since = unrunnable_since
    
    
    def validate_worker_state(ws: WorkerState) -> None:
        for ts in ws.has_what or ():
            assert ts.who_has
            assert ws in ts.who_has, (
                "not in has_what' who_has",
                str(ws),
                str(ts),
                str(ts.who_has),
            )
    
        for ts in ws.actors:
            assert ts.state in ("memory", "processing")
    
    
    def validate_state(
        tasks: dict[Key, TaskState],
        workers: dict[str, WorkerState],
        clients: dict[str, ClientState],
    ) -> None:
        """Validate a current runtime state.
    
        This performs a sequence of checks on the entire graph, running in about linear
        time. This raises assert errors if anything doesn't check out.
        """
        for ts in tasks.values():
            validate_task_state(ts)
    
        for ws in workers.values():
            validate_worker_state(ws)
    
        for cs in clients.values():
            for ts in cs.wants_what or ():
                assert ts.who_wants
                assert cs in ts.who_wants, (
                    "not in wants_what' who_wants",
                    str(cs),
                    str(ts),
                    str(ts.who_wants),
                )
    
    
    def heartbeat_interval(n: int) -> float:
        """Interval in seconds that we desire heartbeats based on number of workers"""
        if n <= 10:
            return 0.5
        elif n < 50:
            return 1
        elif n < 200:
            return 2
        else:
            # No more than 200 heartbeats a second scaled by workers
            return n / 200 + 1
    
    
    def _task_slots_available(ws: WorkerState, saturation_factor: float) -> int:
        """Number of tasks that can be sent to this worker without oversaturating it"""
        assert not math.isinf(saturation_factor)
        return max(math.ceil(saturation_factor * ws.nthreads), 1) - (
            len(ws.processing) - len(ws.long_running)
        )
    
    
    def _worker_full(ws: WorkerState, saturation_factor: float) -> bool:
        if math.isinf(saturation_factor):
            return False
        return _task_slots_available(ws, saturation_factor) <= 0
    
    
    class KilledWorker(Exception):
        def __init__(self, task: Key, last_worker: WorkerState, allowed_failures: int):
            super().__init__(task, last_worker, allowed_failures)
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def last_worker(self) -> WorkerState:
            return self.args[1]
    
        @property
        def allowed_failures(self) -> int:
            return self.args[2]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} on {self.allowed_failures + 1} "
                "different workers, but all those workers died while running it. "
                f"The last worker that attempt to run the task was {self.last_worker.address}. "
                "Inspecting worker logs is often a good next step to diagnose what went wrong. "
                "For more information see https://distributed.dask.org/en/stable/killed.html."
            )
    
    
    class NoValidWorkerError(Exception):
        def __init__(
            self,
            task: Key,
            host_restrictions: set[str],
            worker_restrictions: set[str],
            resource_restrictions: dict[str, float],
            timeout: float,
        ):
            super().__init__(
                task, host_restrictions, worker_restrictions, resource_restrictions, timeout
            )
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def host_restrictions(self) -> Any:
            return self.args[1]
    
        @property
        def worker_restrictions(self) -> Any:
            return self.args[2]
    
        @property
        def resource_restrictions(self) -> Any:
            return self.args[3]
    
        @property
        def timeout(self) -> float:
            return self.args[4]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} but timed out after {format_time(self.timeout)} "
                "waiting for a valid worker matching all restrictions.\n\nRestrictions:\n"
                f"host_restrictions={self.host_restrictions!s}\n"
                f"worker_restrictions={self.worker_restrictions!s}\n"
                f"resource_restrictions={self.resource_restrictions!s}\n"
            )
    
    
    class NoWorkerError(Exception):
        def __init__(self, task: Key, timeout: float):
            super().__init__(task, timeout)
    
        @property
        def task(self) -> Key:
            return self.args[0]
    
        @property
        def timeout(self) -> float:
            return self.args[1]
    
        def __str__(self) -> str:
            return (
                f"Attempted to run task {self.task!r} but timed out after {format_time(self.timeout)} "
                "waiting without any running workers."
            )
    
    
    class WorkerStatusPlugin(SchedulerPlugin):
        """A plugin to share worker status with a remote observer
    
        This is used in cluster managers to keep updated about the status of the scheduler.
        """
    
        name: ClassVar[str] = "worker-status"
        bcomm: BatchedSend
    
        def __init__(self, scheduler: Scheduler, comm: Comm):
            self.bcomm = BatchedSend(interval="5ms")
            self.bcomm.start(comm)
            scheduler.add_plugin(self)
    
        def add_worker(self, scheduler: Scheduler, worker: str) -> None:
            ident = scheduler.workers[worker].identity()
            del ident["metrics"]
            del ident["last_seen"]
            try:
                self.bcomm.send(["add", {"workers": {worker: ident}}])
            except CommClosedError:
                scheduler.remove_plugin(name=self.name)
    
        def remove_worker(self, scheduler: Scheduler, worker: str, **kwargs: Any) -> None:
            try:
                self.bcomm.send(["remove", worker])
            except CommClosedError:
                scheduler.remove_plugin(name=self.name)
    
        def teardown(self) -> None:
            self.bcomm.close()
    
    
    class CollectTaskMetaDataPlugin(SchedulerPlugin):
        scheduler: Scheduler
        name: str
        keys: set[Key]
        metadata: dict[Key, Any]
        state: dict[Key, TaskStateState]
    
        def __init__(self, scheduler: Scheduler, name: str):
            self.scheduler = scheduler
            self.name = name
            self.keys = set()
            self.metadata = {}
            self.state = {}
    
        def update_graph(
            self,
            scheduler: Scheduler,
            *,
            keys: set[Key],
            **kwargs: Any,
        ) -> None:
            self.keys.update(keys)
    
        def transition(
            self,
            key: Key,
            start: TaskStateState,
            finish: TaskStateState,
            *args: Any,
            **kwargs: Any,
        ) -> None:
            if finish in ("memory", "erred"):
                ts = self.scheduler.tasks.get(key)
                if ts is not None and ts.key in self.keys:
                    self.metadata[key] = ts.metadata
                    self.state[key] = finish
                    self.keys.discard(key)
    
    
    def _materialize_graph(
        expr: Expr,
        global_annotations: dict[str, Any],
        validate: bool,
    ) -> tuple[dict[Key, T_runspec], dict[Key, set[Key]], dict[str, dict[Key, Any]]]:
>       dsk: dict = expr.__dask_graph__()
E       AttributeError: 'dict' object has no attribute '__dask_graph__'

distributed/scheduler.py:9383: AttributeError

Check warning on line 0 in distributed.cli.tests.test_dask_worker

github-actions / Unit Test Results

All 5 runs failed: test_single_executable_works (distributed.cli.tests.test_dask_worker)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.13-default-notci1/pytest.xml [took 0s]

Check warning on line 0 in distributed.comm.tests.test_ws

github-actions / Unit Test Results

All 5 runs failed: test_roundtrip (distributed.comm.tests.test_ws)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.13-default-notci1/pytest.xml [took 0s]

Check warning on line 0 in distributed.comm.tests.test_ws

github-actions / Unit Test Results

All 5 runs failed: test_http_and_comm_server[True-ws://-None-8787] (distributed.comm.tests.test_ws)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.13-default-notci1/pytest.xml [took 0s]

Check warning on line 0 in distributed.comm.tests.test_ws

github-actions / Unit Test Results

All 5 runs failed: test_http_and_comm_server[True-wss://-True-8787] (distributed.comm.tests.test_ws)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.13-default-notci1/pytest.xml [took 0s]

Check warning on line 0 in distributed.comm.tests.test_ws

github-actions / Unit Test Results

All 5 runs failed: test_http_and_comm_server[False-ws://-None-8787] (distributed.comm.tests.test_ws)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.13-default-notci1/pytest.xml [took 0s]

Check warning on line 0 in distributed.comm.tests.test_ws

github-actions / Unit Test Results

All 5 runs failed: test_http_and_comm_server[False-wss://-True-8787] (distributed.comm.tests.test_ws)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.13-default-notci1/pytest.xml [took 0s]

Check warning on line 0 in distributed.comm.tests.test_ws

github-actions / Unit Test Results

All 5 runs failed: test_http_and_comm_server[True-ws://-None-8786] (distributed.comm.tests.test_ws)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.13-default-notci1/pytest.xml [took 0s]

Check warning on line 0 in distributed.comm.tests.test_ws

github-actions / Unit Test Results

All 5 runs failed: test_http_and_comm_server[True-wss://-True-8786] (distributed.comm.tests.test_ws)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.13-default-notci1/pytest.xml [took 0s]

Check warning on line 0 in distributed.comm.tests.test_ws

github-actions / Unit Test Results

All 5 runs failed: test_http_and_comm_server[False-ws://-None-8786] (distributed.comm.tests.test_ws)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.13-default-notci1/pytest.xml [took 0s]

Check warning on line 0 in distributed.comm.tests.test_ws

github-actions / Unit Test Results

All 5 runs failed: test_http_and_comm_server[False-wss://-True-8786] (distributed.comm.tests.test_ws)

artifacts/ubuntu-latest-mindeps-default-notci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.13-default-notci1/pytest.xml [took 0s]

Check warning on line 0 in distributed.dashboard.tests.test_components

github-actions / Unit Test Results

3 out of 4 runs failed: test_profile_plot (distributed.dashboard.tests.test_components)

artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.13-default-notci1/pytest.xml [took 0s]

Check warning on line 0 in distributed.dashboard.tests.test_components

github-actions / Unit Test Results

3 out of 4 runs failed: test_profile_time_plot (distributed.dashboard.tests.test_components)

artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.13-default-notci1/pytest.xml [took 0s]

Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh

github-actions / Unit Test Results

All 4 runs failed: test_simple (distributed.dashboard.tests.test_scheduler_bokeh)

artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.13-default-notci1/pytest.xml [took 0s]

Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh

github-actions / Unit Test Results

All 4 runs failed: test_stealing_events (distributed.dashboard.tests.test_scheduler_bokeh)

artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.13-default-notci1/pytest.xml [took 0s]

Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh

github-actions / Unit Test Results

All 4 runs failed: test_events (distributed.dashboard.tests.test_scheduler_bokeh)

artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.13-default-notci1/pytest.xml [took 0s]

Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh

github-actions / Unit Test Results

All 4 runs failed: test_task_stream (distributed.dashboard.tests.test_scheduler_bokeh)

artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.13-default-notci1/pytest.xml [took 0s]

Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh

github-actions / Unit Test Results

All 4 runs failed: test_task_stream_n_rectangles (distributed.dashboard.tests.test_scheduler_bokeh)

artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.13-default-notci1/pytest.xml [took 0s]

View more details on GitHub Actions

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Use Expr instead of HLG #14610

Unit Test Results