diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 33fc8b8717024..abea5cb4e617c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -42,12 +42,24 @@ repos:
           ?^ci/docker/python-.*-wheel-windows-test-vs2022.*\.dockerfile$|
           )
         types: []
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.9.10
+    hooks:
+      # Run the linter.
+      - id: ruff
+        args: [--fix]
+        files: >-
+          ^dev/archery/
+      # Run the formatter.
+      - id: ruff-format
+        files: >-
+          ^dev/
   - repo: https://github.com/pycqa/flake8
     rev: 6.1.0
     hooks:
       - id: flake8
         name: Python Format
-        files: ^(python|dev|c_glib|integration)/
+        files: ^(python|c_glib|integration)/
         types:
           - file
           - python
diff --git a/c_glib/tool/generate-version-header.py b/c_glib/tool/generate-version-header.py
index 6a8976204c05a..94fe0ed514ede 100755
--- a/c_glib/tool/generate-version-header.py
+++ b/c_glib/tool/generate-version-header.py
@@ -26,39 +26,31 @@ def main():
     parser = argparse.ArgumentParser(
-        description="Generate C header with version macros")
+        description="Generate C header with version macros"
+    )
     parser.add_argument(
-        "--library",
-        required=True,
-        help="The library name to use in macro prefixes")
+        "--library", required=True, help="The library name to use in macro prefixes"
+    )
+    parser.add_argument("--version", required=True, help="The library version number")
     parser.add_argument(
-        "--version",
-        required=True,
-        help="The library version number")
+        "--input", type=Path, required=True, help="Path to the input template file"
+    )
     parser.add_argument(
-        "--input",
-        type=Path,
-        required=True,
-        help="Path to the input template file")
-    parser.add_argument(
-        "--output",
-        type=Path,
-        required=True,
-        help="Path to the output file to generate")
+        "--output", type=Path, required=True, help="Path to the output file to generate"
+    )
     args = parser.parse_args()

-    with open(args.input, "r", encoding="utf-8") as input_file, \
-            open(args.output, "w", encoding="utf-8") as output_file:
-        write_header(
-            input_file, output_file, args.library, args.version)
+    with (
+        open(args.input, "r", encoding="utf-8") as input_file,
+        open(args.output, "w", encoding="utf-8") as output_file,
+    ):
+        write_header(input_file, output_file, args.library, args.version)


 def write_header(
-        input_file: TextIOBase,
-        output_file: TextIOBase,
-        library_name: str,
-        version: str):
+    input_file: TextIOBase, output_file: TextIOBase, library_name: str, version: str
+):
     if "-" in version:
         version, version_tag = version.split("-")
     else:
@@ -70,17 +62,18 @@ def write_header(
     availability_macros = generate_availability_macros(library_name)

     replacements = {
-        "VERSION_MAJOR": str(version_major),
-        "VERSION_MINOR": str(version_minor),
-        "VERSION_MICRO": str(version_micro),
-        "VERSION_TAG": version_tag,
-        "ENCODED_VERSIONS": encoded_versions,
-        "VISIBILITY_MACROS": visibility_macros,
-        "AVAILABILITY_MACROS": availability_macros,
+        "VERSION_MAJOR": str(version_major),
+        "VERSION_MINOR": str(version_minor),
+        "VERSION_MICRO": str(version_micro),
+        "VERSION_TAG": version_tag,
+        "ENCODED_VERSIONS": encoded_versions,
+        "VISIBILITY_MACROS": visibility_macros,
+        "AVAILABILITY_MACROS": availability_macros,
     }

-    output_file.write(re.sub(
-        r"@([A-Z_]+)@", lambda match: replacements[match[1]], input_file.read()))
+    output_file.write(
+        re.sub(r"@([A-Z_]+)@", lambda match: replacements[match[1]], input_file.read())
+    )


 def generate_visibility_macros(library: str) -> str:
@@ -140,36 +133,36 @@ def generate_availability_macros(library: str) -> str:
 ALL_VERSIONS = [
-    (20, 0),
-    (19, 0),
-    (18, 0),
-    (17, 0),
-    (16, 0),
-    (15, 0),
-    (14, 0),
-    (13, 0),
-    (12, 0),
-    (11, 0),
-    (10, 0),
-    (9, 0),
-    (8, 0),
-    (7, 0),
-    (6, 0),
-    (5, 0),
-    (4, 0),
-    (3, 0),
-    (2, 0),
-    (1, 0),
-    (0, 17),
-    (0, 16),
-    (0, 15),
-    (0, 14),
-    (0, 13),
-    (0, 12),
-    (0, 11),
-    (0, 10),
+    (20, 0),
+    (19, 0),
+    (18, 0),
+    (17, 0),
+    (16, 0),
+    (15, 0),
+    (14, 0),
+    (13, 0),
+    (12, 0),
+    (11, 0),
+    (10, 0),
+    (9, 0),
+    (8, 0),
+    (7, 0),
+    (6, 0),
+    (5, 0),
+    (4, 0),
+    (3, 0),
+    (2, 0),
+    (1, 0),
+    (0, 17),
+    (0, 16),
+    (0, 15),
+    (0, 14),
+    (0, 13),
+    (0, 12),
+    (0, 11),
+    (0, 10),
 ]


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/dev/archery/archery/benchmark/codec.py b/dev/archery/archery/benchmark/codec.py
index 4157890d13d0e..4f8e1d3b9114c 100644
--- a/dev/archery/archery/benchmark/codec.py
+++ b/dev/archery/archery/benchmark/codec.py
@@ -14,13 +14,13 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
+from __future__ import annotations

 import json

+from ..benchmark.compare import BenchmarkComparator
 from ..benchmark.core import Benchmark, BenchmarkSuite
 from ..benchmark.runner import BenchmarkRunner, StaticBenchmarkRunner
-from ..benchmark.compare import BenchmarkComparator


 class JsonEncoder(json.JSONEncoder):
@@ -63,13 +63,12 @@ class BenchmarkSuiteCodec:
     def encode(bs):
         return {
             "name": bs.name,
-            "benchmarks": [BenchmarkCodec.encode(b) for b in bs.benchmarks]
+            "benchmarks": [BenchmarkCodec.encode(b) for b in bs.benchmarks],
         }

     @staticmethod
     def decode(dct, **kwargs):
-        benchmarks = [BenchmarkCodec.decode(b)
-                      for b in dct.pop("benchmarks", [])]
+        benchmarks = [BenchmarkCodec.decode(b) for b in dct.pop("benchmarks", [])]
         return BenchmarkSuite(benchmarks=benchmarks, **dct, **kwargs)

@@ -80,8 +79,7 @@ def encode(br):

     @staticmethod
     def decode(dct, **kwargs):
-        suites = [BenchmarkSuiteCodec.decode(s)
-                  for s in dct.pop("suites", [])]
+        suites = [BenchmarkSuiteCodec.decode(s) for s in dct.pop("suites", [])]
         return StaticBenchmarkRunner(suites=suites, **dct, **kwargs)
diff --git a/dev/archery/archery/benchmark/compare.py b/dev/archery/archery/benchmark/compare.py
index 622b80179178b..b34d23b4ea269 100644
--- a/dev/archery/archery/benchmark/compare.py
+++ b/dev/archery/archery/benchmark/compare.py
@@ -18,35 +18,37 @@
 # Define a global regression threshold as 5%. This is purely subjective and
 # flawed. This does not track cumulative regression.
+from __future__ import annotations
+
 DEFAULT_THRESHOLD = 0.05


 def items_per_seconds_fmt(value):
     if value < 1000:
-        return "{} items/sec".format(value)
+        return f"{value} items/sec"
     if value < 1000**2:
-        return "{:.3f}K items/sec".format(value / 1000)
+        return f"{value / 1000:.3f}K items/sec"
     if value < 1000**3:
-        return "{:.3f}M items/sec".format(value / 1000**2)
+        return f"{value / 1000**2:.3f}M items/sec"
     else:
-        return "{:.3f}G items/sec".format(value / 1000**3)
+        return f"{value / 1000**3:.3f}G items/sec"


 def bytes_per_seconds_fmt(value):
     if value < 1024:
-        return "{} bytes/sec".format(value)
+        return f"{value} bytes/sec"
     if value < 1024**2:
-        return "{:.3f} KiB/sec".format(value / 1024)
+        return f"{value / 1024:.3f} KiB/sec"
     if value < 1024**3:
-        return "{:.3f} MiB/sec".format(value / 1024**2)
+        return f"{value / 1024**2:.3f} MiB/sec"
     if value < 1024**4:
-        return "{:.3f} GiB/sec".format(value / 1024**3)
+        return f"{value / 1024**3:.3f} GiB/sec"
     else:
-        return "{:.3f} TiB/sec".format(value / 1024**4)
+        return f"{value / 1024**4:.3f} TiB/sec"


 def change_fmt(value):
-    return "{:.3%}".format(value)
+    return f"{value:.3%}"


 def formatter_for_unit(unit):
@@ -59,14 +61,15 @@ class BenchmarkComparator:
-    """ Compares two benchmarks.
+    """Compares two benchmarks.

     Encodes the logic of comparing two benchmarks and taking a decision on
     if it induce a regression.
     """

-    def __init__(self, contender, baseline, threshold=DEFAULT_THRESHOLD,
-                 suite_name=None):
+    def __init__(
+        self, contender, baseline, threshold=DEFAULT_THRESHOLD, suite_name=None
+    ):
         self.contender = contender
         self.baseline = baseline
         self.threshold = threshold
@@ -98,14 +101,14 @@ def change(self):
     @property
     def confidence(self):
-        """ Indicate if a comparison of benchmarks should be trusted. """
+        """Indicate if a comparison of benchmarks should be trusted."""
         return True

     @property
     def regression(self):
         change = self.change
         adjusted_change = change if self.less_is_better else -change
-        return (self.confidence and adjusted_change > self.threshold)
+        return self.confidence and adjusted_change > self.threshold

     @property
     def formatted(self):
@@ -118,7 +121,7 @@ def formatted(self):
             "contender": fmt(self.contender.value),
             "unit": self.unit,
             "less_is_better": self.less_is_better,
-            "counters": str(self.baseline.counters)
+            "counters": str(self.baseline.counters),
         }

     def compare(self, comparator=None):
@@ -130,7 +133,7 @@ def compare(self, comparator=None):
             "contender": self.contender.value,
             "unit": self.unit,
             "less_is_better": self.less_is_better,
-            "counters": self.baseline.counters
+            "counters": self.baseline.counters,
         }

     def __call__(self, **kwargs):
@@ -141,12 +144,12 @@ def pairwise_compare(contender, baseline):
     dict_contender = {e.name: e for e in contender}
     dict_baseline = {e.name: e for e in baseline}

-    for name in (dict_contender.keys() & dict_baseline.keys()):
+    for name in dict_contender.keys() & dict_baseline.keys():
         yield name, (dict_contender[name], dict_baseline[name])


 class RunnerComparator:
-    """ Compares suites/benchmarks from runners.
+    """Compares suites/benchmarks from runners.

     It is up to the caller that ensure that runners are compatible (both
     from the same language implementation).
@@ -164,10 +167,12 @@ def comparisons(self):
         suites = pairwise_compare(contender, baseline)

         for suite_name, (suite_cont, suite_base) in suites:
-            benchmarks = pairwise_compare(
-                suite_cont.benchmarks, suite_base.benchmarks)
+            benchmarks = pairwise_compare(suite_cont.benchmarks, suite_base.benchmarks)

             for _, (bench_cont, bench_base) in benchmarks:
-                yield BenchmarkComparator(bench_cont, bench_base,
-                                          threshold=self.threshold,
-                                          suite_name=suite_name)
+                yield BenchmarkComparator(
+                    bench_cont,
+                    bench_base,
+                    threshold=self.threshold,
+                    suite_name=suite_name,
+                )
diff --git a/dev/archery/archery/benchmark/core.py b/dev/archery/archery/benchmark/core.py
index 5a92271a35391..3be7a95f73834 100644
--- a/dev/archery/archery/benchmark/core.py
+++ b/dev/archery/archery/benchmark/core.py
@@ -14,6 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+from __future__ import annotations


 def median(values):
@@ -27,8 +28,9 @@ def median(values):


 class Benchmark:
-    def __init__(self, name, unit, less_is_better, values, time_unit,
-                 times, counters=None):
+    def __init__(
+        self, name, unit, less_is_better, values, time_unit, times, counters=None
+    ):
         self.name = name
         self.unit = unit
         self.less_is_better = less_is_better
@@ -43,7 +45,7 @@ def value(self):
         return self.median

     def __repr__(self):
-        return "Benchmark[name={},value={}]".format(self.name, self.value)
+        return f"Benchmark[name={self.name},value={self.value}]"


 class BenchmarkSuite:
@@ -52,6 +54,4 @@ def __init__(self, name, benchmarks):
         self.benchmarks = benchmarks

     def __repr__(self):
-        return "BenchmarkSuite[name={}, benchmarks={}]".format(
-            self.name, self.benchmarks
-        )
+        return f"BenchmarkSuite[name={self.name}, benchmarks={self.benchmarks}]"
diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py
index b07dc8cb30f84..76796cd685f9d 100644
--- a/dev/archery/archery/benchmark/google.py
+++ b/dev/archery/archery/benchmark/google.py
@@ -14,14 +14,15 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+from __future__ import annotations

-from itertools import filterfalse, groupby, tee
 import json
 import subprocess
+from itertools import filterfalse, groupby, tee
 from tempfile import NamedTemporaryFile

-from .core import Benchmark
 from ..utils.command import Command
+from .core import Benchmark


 def partition(pred, iterable):
@@ -31,7 +32,7 @@ def partition(pred, iterable):
 class GoogleBenchmarkCommand(Command):
-    """ Run a google benchmark binary.
+    """Run a google benchmark binary.

     This assumes the binary supports the standard command line options,
     notably `--benchmark_filter`, `--benchmark_format`, etc...
@@ -45,16 +46,17 @@ def __init__(self, benchmark_bin, benchmark_filter=None, benchmark_extras=None): def list_benchmarks(self): argv = ["--benchmark_list_tests"] if self.benchmark_filter: - argv.append("--benchmark_filter={}".format(self.benchmark_filter)) - result = self.run(*argv, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + argv.append(f"--benchmark_filter={self.benchmark_filter}") + result = self.run(*argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE) return str.splitlines(result.stdout.decode("utf-8")) def results(self, repetitions=1, repetition_min_time=None): with NamedTemporaryFile() as out: - argv = [f"--benchmark_repetitions={repetitions}", - f"--benchmark_out={out.name}", - "--benchmark_out_format=json"] + argv = [ + f"--benchmark_repetitions={repetitions}", + f"--benchmark_out={out.name}", + "--benchmark_out_format=json", + ] if repetition_min_time is not None: argv.append(f"--benchmark_min_time={repetition_min_time:.6f}s") @@ -69,7 +71,7 @@ def results(self, repetitions=1, repetition_min_time=None): class GoogleBenchmarkObservation: - """ Represents one run of a single (google c++) benchmark. + """Represents one run of a single (google c++) benchmark. Aggregates are reported by Google Benchmark executables alongside other observations whenever repetitions are specified (with @@ -87,9 +89,18 @@ class GoogleBenchmarkObservation: RegressionSumKernel/32768/0_stddev 0 us 0 us 288.046MB/s """ - def __init__(self, name, real_time, cpu_time, time_unit, run_type, - size=None, bytes_per_second=None, items_per_second=None, - **counters): + def __init__( + self, + name, + real_time, + cpu_time, + time_unit, + run_type, + size=None, + bytes_per_second=None, + items_per_second=None, + **counters, + ): self._name = name self.real_time = real_time self.cpu_time = cpu_time @@ -102,12 +113,12 @@ def __init__(self, name, real_time, cpu_time, time_unit, run_type, @property def is_aggregate(self): - """ Indicate if the observation is a run or an aggregate. """ + """Indicate if the observation is a run or an aggregate.""" return self.run_type == "aggregate" @property def is_realtime(self): - """ Indicate if the preferred value is realtime instead of cputime. """ + """Indicate if the preferred value is realtime instead of cputime.""" return self.name.find("/real_time") != -1 @property @@ -121,7 +132,7 @@ def time(self): @property def value(self): - """ Return the benchmark value.""" + """Return the benchmark value.""" return self.bytes_per_second or self.items_per_second or self.time @property @@ -138,10 +149,10 @@ def __repr__(self): class GoogleBenchmark(Benchmark): - """ A set of GoogleBenchmarkObservations. """ + """A set of GoogleBenchmarkObservations.""" def __init__(self, name, runs): - """ Initialize a GoogleBenchmark. + """Initialize a GoogleBenchmark. 
Parameters ---------- @@ -162,17 +173,16 @@ def __init__(self, name, runs): times = [b.real_time for b in self.runs] # Slight kludge to extract the UserCounters for each benchmark counters = self.runs[0].counters - super().__init__(name, unit, less_is_better, values, time_unit, times, - counters) + super().__init__(name, unit, less_is_better, values, time_unit, times, counters) def __repr__(self): - return "GoogleBenchmark[name={},runs={}]".format(self.names, self.runs) + return f"GoogleBenchmark[name={self.names},runs={self.runs}]" @classmethod def from_json(cls, payload): def group_key(x): return x.name - benchmarks = map(lambda x: GoogleBenchmarkObservation(**x), payload) + benchmarks = (GoogleBenchmarkObservation(**x) for x in payload) groups = groupby(sorted(benchmarks, key=group_key), group_key) return [cls(k, list(bs)) for k, bs in groups] diff --git a/dev/archery/archery/benchmark/jmh.py b/dev/archery/archery/benchmark/jmh.py index f531b6de1638f..b1a97afda7976 100644 --- a/dev/archery/archery/benchmark/jmh.py +++ b/dev/archery/archery/benchmark/jmh.py @@ -14,15 +14,16 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations -from itertools import filterfalse, groupby, tee import json import subprocess +from itertools import filterfalse, groupby, tee from tempfile import NamedTemporaryFile -from .core import Benchmark from ..utils.command import Command from ..utils.maven import Maven +from .core import Benchmark def partition(pred, iterable): @@ -32,7 +33,7 @@ def partition(pred, iterable): class JavaMicrobenchmarkHarnessCommand(Command): - """ Run a Java Micro Benchmark Harness + """Run a Java Micro Benchmark Harness This assumes the binary supports the standard command line options, notably `-Dbenchmark_filter` @@ -58,9 +59,8 @@ def __init__(self, build, benchmark_filter=None): def list_benchmarks(self): argv = [] if self.benchmark_filter: - argv.append("-Dbenchmark.filter={}".format(self.benchmark_filter)) - result = self.build.list( - *argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + argv.append(f"-Dbenchmark.filter={self.benchmark_filter}") + result = self.build.list(*argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE) lists = [] benchmarks = False @@ -77,24 +77,30 @@ def list_benchmarks(self): def results(self, repetitions): with NamedTemporaryFile(suffix=".json") as out: - argv = ["-Dbenchmark.runs={}".format(repetitions), - "-Dbenchmark.resultfile={}".format(out.name), - "-Dbenchmark.resultformat=json"] + argv = [ + f"-Dbenchmark.runs={repetitions}", + f"-Dbenchmark.resultfile={out.name}", + "-Dbenchmark.resultformat=json", + ] if self.benchmark_filter: - argv.append( - "-Dbenchmark.filter={}".format(self.benchmark_filter) - ) + argv.append(f"-Dbenchmark.filter={self.benchmark_filter}") self.build.benchmark(*argv, check=True) return json.load(out) class JavaMicrobenchmarkHarnessObservation: - """ Represents one run of a single Java Microbenchmark Harness - """ - - def __init__(self, benchmark, primaryMetric, - forks, warmupIterations, measurementIterations, **counters): + """Represents one run of a single Java Microbenchmark Harness""" + + def __init__( + self, + benchmark, + primaryMetric, + forks, + warmupIterations, + measurementIterations, + **counters, + ): self.name = benchmark self.primaryMetric = primaryMetric self.score = primaryMetric["score"] @@ -109,13 +115,12 @@ def __init__(self, benchmark, primaryMetric, "warmupTime": 
counters["warmupTime"], "measurements": measurementIterations, "measurementTime": counters["measurementTime"], - "jvmArgs": counters["jvmArgs"] + "jvmArgs": counters["jvmArgs"], } - self.reciprocal_value = True if self.score_unit.endswith( - "/op") else False + self.reciprocal_value = bool(self.score_unit.endswith("/op")) if self.score_unit.startswith("ops/"): idx = self.score_unit.find("/") - self.normalizePerSec(self.score_unit[idx+1:]) + self.normalizePerSec(self.score_unit[(idx + 1) :]) elif self.score_unit.endswith("/op"): idx = self.score_unit.find("/") self.normalizePerSec(self.score_unit[:idx]) @@ -124,7 +129,7 @@ def __init__(self, benchmark, primaryMetric, @property def value(self): - """ Return the benchmark value.""" + """Return the benchmark value.""" val = 1 / self.score if self.reciprocal_value else self.score return val * self.normalizeFactor @@ -146,9 +151,7 @@ def normalizePerSec(self, unit): @property def unit(self): - if self.score_unit.startswith("ops/"): - return "items_per_second" - elif self.score_unit.endswith("/op"): + if self.score_unit.startswith("ops/") or self.score_unit.endswith("/op"): return "items_per_second" else: return "?" @@ -158,10 +161,10 @@ def __repr__(self): class JavaMicrobenchmarkHarness(Benchmark): - """ A set of JavaMicrobenchmarkHarnessObservations. """ + """A set of JavaMicrobenchmarkHarnessObservations.""" def __init__(self, name, runs): - """ Initialize a JavaMicrobenchmarkHarness. + """Initialize a JavaMicrobenchmarkHarness. Parameters ---------- @@ -183,19 +186,16 @@ def __init__(self, name, runs): times = [] # Slight kludge to extract the UserCounters for each benchmark counters = self.runs[0].counters - super().__init__(name, unit, less_is_better, values, time_unit, times, - counters) + super().__init__(name, unit, less_is_better, values, time_unit, times, counters) def __repr__(self): - return "JavaMicrobenchmark[name={},runs={}]".format( - self.name, self.runs) + return f"JavaMicrobenchmark[name={self.name},runs={self.runs}]" @classmethod def from_json(cls, payload): def group_key(x): return x.name - benchmarks = map( - lambda x: JavaMicrobenchmarkHarnessObservation(**x), payload) + benchmarks = (JavaMicrobenchmarkHarnessObservation(**x) for x in payload) groups = groupby(sorted(benchmarks, key=group_key), group_key) return [cls(k, list(bs)) for k, bs in groups] diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py index 9ebb9226e3743..0f1264bfdca49 100644 --- a/dev/archery/archery/benchmark/runner.py +++ b/dev/archery/archery/benchmark/runner.py @@ -14,20 +14,22 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+from __future__ import annotations +import contextlib import glob import json import os import re -from .core import BenchmarkSuite -from .google import GoogleBenchmarkCommand, GoogleBenchmark -from .jmh import JavaMicrobenchmarkHarnessCommand, JavaMicrobenchmarkHarness from ..lang.cpp import CppCMakeDefinition, CppConfiguration -from ..lang.java import JavaMavenDefinition, JavaConfiguration +from ..lang.java import JavaConfiguration, JavaMavenDefinition from ..utils.cmake import CMakeBuild -from ..utils.maven import MavenBuild from ..utils.logger import logger +from ..utils.maven import MavenBuild +from .core import BenchmarkSuite +from .google import GoogleBenchmark, GoogleBenchmarkCommand +from .jmh import JavaMicrobenchmarkHarness, JavaMicrobenchmarkHarnessCommand def regex_filter(re_expr): @@ -41,8 +43,13 @@ def regex_filter(re_expr): class BenchmarkRunner: - def __init__(self, suite_filter=None, benchmark_filter=None, - repetitions=DEFAULT_REPETITIONS, repetition_min_time=None): + def __init__( + self, + suite_filter=None, + benchmark_filter=None, + repetitions=DEFAULT_REPETITIONS, + repetition_min_time=None, + ): self.suite_filter = suite_filter self.benchmark_filter = benchmark_filter self.repetitions = repetitions @@ -54,12 +61,11 @@ def suites(self): @staticmethod def from_rev_or_path(src, root, rev_or_path, cmake_conf, **kwargs): - raise NotImplementedError( - "BenchmarkRunner must implement from_rev_or_path") + raise NotImplementedError("BenchmarkRunner must implement from_rev_or_path") class StaticBenchmarkRunner(BenchmarkRunner): - """ Run suites from a (static) set of suites. """ + """Run suites from a (static) set of suites.""" def __init__(self, suites, **kwargs): self._suites = suites @@ -69,7 +75,7 @@ def __init__(self, suites, **kwargs): def list_benchmarks(self): for suite in self._suites: for benchmark in suite.benchmarks: - yield "{}.{}".format(suite.name, benchmark.name) + yield f"{suite.name}.{benchmark.name}" @property def suites(self): @@ -83,10 +89,8 @@ def suites(self): @classmethod def is_json_result(cls, path_or_str): builder = None - try: + with contextlib.suppress(BaseException): builder = cls.from_json(path_or_str) - except BaseException: - pass return builder is not None @@ -94,6 +98,7 @@ def is_json_result(cls, path_or_str): def from_json(path_or_str, **kwargs): # .codec imported here to break recursive imports from .codec import BenchmarkRunnerCodec + if os.path.isfile(path_or_str): with open(path_or_str) as f: loaded = json.load(f) @@ -102,23 +107,25 @@ def from_json(path_or_str, **kwargs): return BenchmarkRunnerCodec.decode(loaded, **kwargs) def __repr__(self): - return "BenchmarkRunner[suites={}]".format(list(self.suites)) + return f"BenchmarkRunner[suites={list(self.suites)}]" class CppBenchmarkRunner(BenchmarkRunner): - """ Run suites from a CMakeBuild. """ + """Run suites from a CMakeBuild.""" def __init__(self, build, benchmark_extras, **kwargs): - """ Initialize a CppBenchmarkRunner. """ + """Initialize a CppBenchmarkRunner.""" self.build = build self.benchmark_extras = benchmark_extras super().__init__(**kwargs) @staticmethod def default_configuration(**kwargs): - """ Returns the default benchmark configuration. 
""" + """Returns the default benchmark configuration.""" return CppConfiguration( - build_type="release", with_tests=False, with_benchmarks=True, + build_type="release", + with_tests=False, + with_benchmarks=True, with_compute=True, with_csv=True, with_dataset=True, @@ -133,11 +140,12 @@ def default_configuration(**kwargs): with_snappy=True, with_zlib=True, with_zstd=True, - **kwargs) + **kwargs, + ) @property def suites_binaries(self): - """ Returns a list of benchmark binaries for this build. """ + """Returns a list of benchmark binaries for this build.""" # Ensure build is up-to-date to run benchmarks self.build() # Not the best method, but works for now @@ -145,9 +153,10 @@ def suites_binaries(self): return {os.path.basename(b): b for b in glob.glob(glob_expr)} def suite(self, name, suite_bin): - """ Returns the resulting benchmarks for a given suite. """ - suite_cmd = GoogleBenchmarkCommand(suite_bin, self.benchmark_filter, - self.benchmark_extras) + """Returns the resulting benchmarks for a given suite.""" + suite_cmd = GoogleBenchmarkCommand( + suite_bin, self.benchmark_filter, self.benchmark_extras + ) # Ensure there will be data benchmark_names = suite_cmd.list_benchmarks() @@ -155,8 +164,8 @@ def suite(self, name, suite_bin): return None results = suite_cmd.results( - repetitions=self.repetitions, - repetition_min_time=self.repetition_min_time) + repetitions=self.repetitions, repetition_min_time=self.repetition_min_time + ) benchmarks = GoogleBenchmark.from_json(results.get("benchmarks")) return BenchmarkSuite(name, benchmarks) @@ -165,18 +174,18 @@ def list_benchmarks(self): for suite_name, suite_bin in self.suites_binaries.items(): suite_cmd = GoogleBenchmarkCommand(suite_bin) for benchmark_name in suite_cmd.list_benchmarks(): - yield "{}.{}".format(suite_name, benchmark_name) + yield f"{suite_name}.{benchmark_name}" @property def suites(self): - """ Returns all suite for a runner. """ + """Returns all suite for a runner.""" suite_matcher = regex_filter(self.suite_filter) suite_found = False suite_and_binaries = self.suites_binaries for suite_name in suite_and_binaries: if not suite_matcher(suite_name): - logger.debug("Ignoring suite {}".format(suite_name)) + logger.debug(f"Ignoring suite {suite_name}") continue suite_bin = suite_and_binaries[suite_name] @@ -184,8 +193,7 @@ def suites(self): # Filter may exclude all benchmarks if not suite: - logger.debug("Suite {} executed but no results" - .format(suite_name)) + logger.debug(f"Suite {suite_name} executed but no results") continue suite_found = True @@ -196,7 +204,7 @@ def suites(self): @staticmethod def from_rev_or_path(src, root, rev_or_path, cmake_conf, **kwargs): - """ Returns a BenchmarkRunner from a path or a git revision. + """Returns a BenchmarkRunner from a path or a git revision. First, it checks if `rev_or_path` is a valid path (or string) of a json object that can deserialize to a BenchmarkRunner. If so, it initialize @@ -212,7 +220,7 @@ def from_rev_or_path(src, root, rev_or_path, cmake_conf, **kwargs): """ build = None if StaticBenchmarkRunner.is_json_result(rev_or_path): - kwargs.pop('benchmark_extras', None) + kwargs.pop("benchmark_extras", None) return StaticBenchmarkRunner.from_json(rev_or_path, **kwargs) elif CMakeBuild.is_build_dir(rev_or_path): build = CMakeBuild.from_path(rev_or_path) @@ -234,26 +242,25 @@ def from_rev_or_path(src, root, rev_or_path, cmake_conf, **kwargs): class JavaBenchmarkRunner(BenchmarkRunner): - """ Run suites for Java. 
""" + """Run suites for Java.""" # default repetitions is 5 for Java microbenchmark harness def __init__(self, build, **kwargs): - """ Initialize a JavaBenchmarkRunner. """ + """Initialize a JavaBenchmarkRunner.""" self.build = build super().__init__(**kwargs) @staticmethod def default_configuration(**kwargs): - """ Returns the default benchmark configuration. """ + """Returns the default benchmark configuration.""" return JavaConfiguration(**kwargs) def suite(self, name): - """ Returns the resulting benchmarks for a given suite. """ + """Returns the resulting benchmarks for a given suite.""" # update .m2 directory, which installs target jars self.build.build() - suite_cmd = JavaMicrobenchmarkHarnessCommand( - self.build, self.benchmark_filter) + suite_cmd = JavaMicrobenchmarkHarnessCommand(self.build, self.benchmark_filter) # Ensure there will be data benchmark_names = suite_cmd.list_benchmarks() @@ -267,32 +274,31 @@ def suite(self, name): @property def list_benchmarks(self): - """ Returns all suite names """ + """Returns all suite names""" # Ensure build is up-to-date to run benchmarks self.build.build() suite_cmd = JavaMicrobenchmarkHarnessCommand(self.build) benchmark_names = suite_cmd.list_benchmarks() for benchmark_name in benchmark_names: - yield "{}".format(benchmark_name) + yield f"{benchmark_name}" @property def suites(self): - """ Returns all suite for a runner. """ + """Returns all suite for a runner.""" suite_name = "JavaBenchmark" suite = self.suite(suite_name) # Filter may exclude all benchmarks if not suite: - logger.debug("Suite {} executed but no results" - .format(suite_name)) + logger.debug(f"Suite {suite_name} executed but no results") return yield suite @staticmethod def from_rev_or_path(src, root, rev_or_path, maven_conf, **kwargs): - """ Returns a BenchmarkRunner from a path or a git revision. + """Returns a BenchmarkRunner from a path or a git revision. First, it checks if `rev_or_path` is a valid path (or string) of a json object that can deserialize to a BenchmarkRunner. If so, it initialize diff --git a/dev/archery/archery/bot.py b/dev/archery/archery/bot.py index c361ed8711ec1..a25cf52be4eff 100644 --- a/dev/archery/archery/bot.py +++ b/dev/archery/archery/bot.py @@ -14,20 +14,21 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+from __future__ import annotations import enum import os import shlex -from pathlib import Path -from functools import lru_cache, partial import tempfile +from functools import lru_cache, partial +from pathlib import Path import click import github +from .crossbow import CommentReport, Config, Job, Queue, Repo, Target from .utils.git import git from .utils.logger import logger -from .crossbow import Repo, Queue, Config, Target, Job, CommentReport def cached_property(fn): @@ -39,17 +40,16 @@ class EventError(Exception): class CommandError(Exception): - def __init__(self, message): self.message = message class _CommandMixin: - def get_help_option(self, ctx): def show_help(ctx, param, value): if value and not ctx.resilient_parsing: raise click.UsageError(ctx.get_help()) + option = super().get_help_option(ctx) option.callback = show_help return option @@ -68,13 +68,12 @@ class Command(_CommandMixin, click.Command): class Group(_CommandMixin, click.Group): - def command(self, *args, **kwargs): - kwargs.setdefault('cls', Command) + kwargs.setdefault("cls", Command) return super().command(*args, **kwargs) def group(self, *args, **kwargs): - kwargs.setdefault('cls', Group) + kwargs.setdefault("cls", Group) return super().group(*args, **kwargs) def parse_args(self, ctx, args): @@ -101,11 +100,10 @@ class PullRequestState(enum.Enum): merge = f"{LABEL_PREFIX} merge" -COMMITTER_ROLES = {'OWNER', 'MEMBER'} +COMMITTER_ROLES = {"OWNER", "MEMBER"} class PullRequestWorkflowBot: - def __init__(self, event_name, event_payload, token=None, committers=None): kwargs = {} if token is not None: @@ -117,26 +115,21 @@ def __init__(self, event_name, event_payload, token=None, committers=None): @cached_property def pull(self): - """ - Returns a github.PullRequest object associated with the event. - """ - return self.repo.get_pull(self.event_payload['pull_request']['number']) + """Returns a github.PullRequest object associated with the event.""" + return self.repo.get_pull(self.event_payload["pull_request"]["number"]) @cached_property def repo(self): - return self.github.get_repo(self.event_payload['repository']['id'], lazy=True) + return self.github.get_repo(self.event_payload["repository"]["id"], lazy=True) def is_committer(self, action): - """ - Returns whether the author of the action is a committer or not. + """Returns whether the author of the action is a committer or not. If the list of committer usernames is not available it will use the author_association as a fallback mechanism. """ if self.committers: - return (self.event_payload[action]['user']['login'] in - self.committers) - return (self.event_payload[action]['author_association'] in - COMMITTER_ROLES) + return self.event_payload[action]["user"]["login"] in self.committers + return self.event_payload[action]["author_association"] in COMMITTER_ROLES def handle(self): current_state = None @@ -153,58 +146,64 @@ def handle(self): self.set_state(next_state) def get_current_state(self): - """ - Returns a PullRequestState with the current PR state label + """Returns a PullRequestState with the current PR state label based on label starting with LABEL_PREFIX. If more than one label is found raises EventError. If no label is found returns None. 
""" - states = [label.name for label in self.pull.get_labels() - if label.name.startswith(LABEL_PREFIX)] + states = [ + label.name + for label in self.pull.get_labels() + if label.name.startswith(LABEL_PREFIX) + ] if len(states) > 1: raise EventError(f"PR cannot be on more than one states - {states}") elif states: return PullRequestState(states[0]) def clear_current_state(self): - """ - Removes all existing labels starting with LABEL_PREFIX - """ + """Removes all existing labels starting with LABEL_PREFIX""" for label in self.pull.get_labels(): if label.name.startswith(LABEL_PREFIX): self.pull.remove_from_labels(label) def compute_next_state(self, current_state): - """ - Returns the expected next state based on the event and + """Returns the expected next state based on the event and the current state. """ - if (self.event_name == "pull_request_target" and - self.event_payload['action'] == 'opened'): - if self.is_committer('pull_request'): + if ( + self.event_name == "pull_request_target" + and self.event_payload["action"] == "opened" + ): + if self.is_committer("pull_request"): return PullRequestState.committer_review else: return PullRequestState.review - elif (self.event_name == "pull_request_review" and - self.event_payload["action"] == "submitted"): + elif ( + self.event_name == "pull_request_review" + and self.event_payload["action"] == "submitted" + ): review_state = self.event_payload["review"]["state"].lower() - if not self.is_committer('review'): + if not self.is_committer("review"): # Non-committer reviews cannot change state once committer has already # reviewed, requested changes or approved if current_state in ( - PullRequestState.change_review, - PullRequestState.changes, - PullRequestState.merge): + PullRequestState.change_review, + PullRequestState.changes, + PullRequestState.merge, + ): return current_state else: return PullRequestState.committer_review - if review_state == 'approved': + if review_state == "approved": return PullRequestState.merge else: return PullRequestState.changes - elif (self.event_name == "pull_request_target" and - self.event_payload['action'] == 'synchronize' and - current_state == PullRequestState.changes): + elif ( + self.event_name == "pull_request_target" + and self.event_payload["action"] == "synchronize" + and current_state == PullRequestState.changes + ): return PullRequestState.change_review # Default already opened PRs to Review state. 
if current_state is None: @@ -217,7 +216,6 @@ def set_state(self, state): class CommentBot: - def __init__(self, name, handler, token=None): # TODO(kszucs): validate assert isinstance(name, str) @@ -230,19 +228,19 @@ def __init__(self, name, handler, token=None): self.github = github.Github(**kwargs) def parse_command(self, payload): - mention = '@{}'.format(self.name) - comment = payload['comment'] + mention = f"@{self.name}" + comment = payload["comment"] - if payload['sender']['login'] == self.name: + if payload["sender"]["login"] == self.name: raise EventError("Don't respond to itself") - elif payload['action'] not in {'created', 'edited'}: + elif payload["action"] not in {"created", "edited"}: raise EventError("Don't respond to comment deletion") - elif not comment['body'].lstrip().startswith(mention): + elif not comment["body"].lstrip().startswith(mention): raise EventError("The bot is not mentioned") # Parse the comment, removing the bot mentioned (and everything # before it) - command = payload['comment']['body'].split(mention)[-1] + command = payload["comment"]["body"].split(mention)[-1] # then split on newlines and keep only the first line # (ignoring all other lines) @@ -254,18 +252,18 @@ def handle(self, event, payload): except EventError as e: logger.error(e) # see the possible reasons in the validate method - return + return None - if event == 'issue_comment': + if event == "issue_comment": return self.handle_issue_comment(command, payload) - elif event == 'pull_request_review_comment': + elif event == "pull_request_review_comment": return self.handle_review_comment(command, payload) else: - raise ValueError("Unexpected event type {}".format(event)) + raise ValueError(f"Unexpected event type {event}") def handle_issue_comment(self, command, payload): - repo = self.github.get_repo(payload['repository']['id'], lazy=True) - issue = repo.get_issue(payload['issue']['number']) + repo = self.github.get_repo(payload["repository"]["id"], lazy=True) + issue = repo.get_issue(payload["issue"]["number"]) try: pull = issue.as_pull_request() @@ -274,21 +272,20 @@ def handle_issue_comment(self, command, payload): "The comment bot only listens to pull request comments!" ) - comment = pull.get_issue_comment(payload['comment']['id']) + comment = pull.get_issue_comment(payload["comment"]["id"]) try: # Only allow users of apache org to submit commands, for more see # https://developer.github.com/v4/enum/commentauthorassociation/ # Checking privileges here enables the bot to respond # without relying on the handler. - allowed_roles = {'OWNER', 'MEMBER', 'COLLABORATOR'} - if payload['comment']['author_association'] not in allowed_roles: + allowed_roles = {"OWNER", "MEMBER", "COLLABORATOR"} + if payload["comment"]["author_association"] not in allowed_roles: raise EventError( "Only contributors can submit requests to this bot. " "Please ask someone from the community for help with " "getting the first commit in." 
) - self.handler(command, issue=issue, pull_request=pull, - comment=comment) + self.handler(command, issue=issue, pull_request=pull, comment=comment) except Exception as e: logger.exception(e) url = "{server}/{repo}/actions/runs/{run_id}".format( @@ -297,16 +294,17 @@ def handle_issue_comment(self, command, payload): run_id=os.environ["GITHUB_RUN_ID"], ) pull.create_issue_comment( - f"```\n{e}\nThe Archery job run can be found at: {url}\n```") - comment.create_reaction('-1') + f"```\n{e}\nThe Archery job run can be found at: {url}\n```" + ) + comment.create_reaction("-1") else: - comment.create_reaction('+1') + comment.create_reaction("+1") def handle_review_comment(self, payload): raise NotImplementedError() -@group(name='@github-actions') +@group(name="@github-actions") @click.pass_context def actions(ctx): """Ursabot""" @@ -314,19 +312,20 @@ def actions(ctx): @actions.group() -@click.option('--crossbow', '-c', default='ursacomputing/crossbow', - help='Crossbow repository on github to use') +@click.option( + "--crossbow", + "-c", + default="ursacomputing/crossbow", + help="Crossbow repository on github to use", +) @click.pass_obj def crossbow(obj, crossbow): - """ - Trigger crossbow builds for this pull request - """ - obj['crossbow_repo'] = crossbow + """Trigger crossbow builds for this pull request""" + obj["crossbow_repo"] = crossbow def _clone_arrow_and_crossbow(dest, crossbow_repo, arrow_repo_url, pr_number): - """ - Clone the repositories and initialize crossbow objects. + """Clone the repositories and initialize crossbow objects. Parameters ---------- @@ -339,27 +338,28 @@ def _clone_arrow_and_crossbow(dest, crossbow_repo, arrow_repo_url, pr_number): "https://github.com/apache/arrow.git". pr_number : int Target PR number. + """ - arrow_path = dest / 'arrow' - queue_path = dest / 'crossbow' + arrow_path = dest / "arrow" + queue_path = dest / "crossbow" # we use unique branch name instead of fork's branch name to avoid # branch name conflict such as 'main' (GH-39996) - local_branch = f'archery/pr-{pr_number}' + local_branch = f"archery/pr-{pr_number}" # 1. clone arrow and checkout the PR's branch - pr_ref = f'pull/{pr_number}/head:{local_branch}' - git.clone('--no-checkout', arrow_repo_url, str(arrow_path)) + pr_ref = f"pull/{pr_number}/head:{local_branch}" + git.clone("--no-checkout", arrow_repo_url, str(arrow_path)) # fetch the PR's branch into the clone - git.fetch('origin', pr_ref, git_dir=arrow_path) + git.fetch("origin", pr_ref, git_dir=arrow_path) # checkout the PR's branch into the clone git.checkout(local_branch, git_dir=arrow_path) # 2. clone crossbow repository - crossbow_url = 'https://github.com/{}'.format(crossbow_repo) + crossbow_url = f"https://github.com/{crossbow_repo}" git.clone(crossbow_url, str(queue_path)) # 3. 
initialize crossbow objects - github_token = os.environ['CROSSBOW_GITHUB_TOKEN'] + github_token = os.environ["CROSSBOW_GITHUB_TOKEN"] arrow = Repo(arrow_path) queue = Queue(queue_path, github_token=github_token, require_https=True) @@ -367,26 +367,36 @@ def _clone_arrow_and_crossbow(dest, crossbow_repo, arrow_repo_url, pr_number): @crossbow.command() -@click.argument('tasks', nargs=-1, required=False) -@click.option('--group', '-g', 'groups', multiple=True, - help='Submit task groups as defined in tests.yml') -@click.option('--param', '-p', 'params', multiple=True, - help='Additional task parameters for rendering the CI templates') -@click.option('--arrow-version', '-v', default=None, - help='Set target version explicitly.') -@click.option('--wait', default=60, - help='Wait the specified seconds before generating a report.') -@click.option('--prefix', default='actions', - help='Prefix for job IDs.') +@click.argument("tasks", nargs=-1, required=False) +@click.option( + "--group", + "-g", + "groups", + multiple=True, + help="Submit task groups as defined in tests.yml", +) +@click.option( + "--param", + "-p", + "params", + multiple=True, + help="Additional task parameters for rendering the CI templates", +) +@click.option( + "--arrow-version", "-v", default=None, help="Set target version explicitly." +) +@click.option( + "--wait", default=60, help="Wait the specified seconds before generating a report." +) +@click.option("--prefix", default="actions", help="Prefix for job IDs.") @click.pass_obj def submit(obj, tasks, groups, params, arrow_version, wait, prefix): - """ - Submit crossbow testing tasks. + """Submit crossbow testing tasks. See groups defined in arrow/dev/tasks/tasks.yml """ - crossbow_repo = obj['crossbow_repo'] - pull_request = obj['pull_request'] + crossbow_repo = obj["crossbow_repo"] + pull_request = obj["pull_request"] with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) arrow, queue = _clone_arrow_and_crossbow( @@ -400,25 +410,28 @@ def submit(obj, tasks, groups, params, arrow_version, wait, prefix): config.validate() # initialize the crossbow build's target repository - target = Target.from_repo(arrow, version=arrow_version, - remote=pull_request.head.repo.clone_url, - branch=pull_request.head.ref) + target = Target.from_repo( + arrow, + version=arrow_version, + remote=pull_request.head.repo.clone_url, + branch=pull_request.head.ref, + ) # parse additional job parameters params = dict([p.split("=") for p in params]) - params['pr_number'] = pull_request.number + params["pr_number"] = pull_request.number # instantiate the job object - job = Job.from_config(config=config, target=target, tasks=tasks, - groups=groups, params=params) + job = Job.from_config( + config=config, target=target, tasks=tasks, groups=groups, params=params + ) # add the job to the crossbow queue and push to the remote repository queue.put(job, prefix=prefix, increment_job_id=False) queue.push() # render the response comment's content - report = CommentReport(job, crossbow_repo=crossbow_repo, - wait_for_task=wait) + report = CommentReport(job, crossbow_repo=crossbow_repo, wait_for_task=wait) # send the response pull_request.create_issue_comment(report.show()) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 3aa6d8a0733ff..18135137e5630 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -14,24 +14,27 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+from __future__ import annotations -from collections import namedtuple -from io import StringIO -import click import json import logging import os import pathlib import sys +from collections import namedtuple +from io import StringIO + +import click from .benchmark.codec import JsonEncoder -from .benchmark.compare import RunnerComparator, DEFAULT_THRESHOLD +from .benchmark.compare import DEFAULT_THRESHOLD, RunnerComparator from .benchmark.runner import CppBenchmarkRunner, JavaBenchmarkRunner from .compat import _import_pandas from .lang.cpp import CppCMakeDefinition, CppConfiguration -from .utils.cli import ArrowBool, validate_arrow_sources, add_optional_command -from .utils.lint import linter, python_numpydoc, LintValidationException -from .utils.logger import logger, ctx as log_ctx +from .utils.cli import ArrowBool, add_optional_command, validate_arrow_sources +from .utils.lint import LintValidationException, linter, python_numpydoc +from .utils.logger import ctx as log_ctx +from .utils.logger import logger from .utils.source import ArrowSources from .utils.tmpdir import tmpdir @@ -43,16 +46,32 @@ @click.group(context_settings={"help_option_names": ["-h", "--help"]}) -@click.option("--debug", type=BOOL, is_flag=True, default=False, - envvar='ARCHERY_DEBUG', - help="Increase logging with debugging output.") -@click.option("--pdb", type=BOOL, is_flag=True, default=False, - help="Invoke pdb on uncaught exception.") -@click.option("-q", "--quiet", type=BOOL, is_flag=True, default=False, - help="Silence executed commands.") +@click.option( + "--debug", + type=BOOL, + is_flag=True, + default=False, + envvar="ARCHERY_DEBUG", + help="Increase logging with debugging output.", +) +@click.option( + "--pdb", + type=BOOL, + is_flag=True, + default=False, + help="Invoke pdb on uncaught exception.", +) +@click.option( + "-q", + "--quiet", + type=BOOL, + is_flag=True, + default=False, + help="Silence executed commands.", +) @click.pass_context def archery(ctx, debug, pdb, quiet): - """ Apache Arrow developer utilities. + """Apache Arrow developer utilities. See sub-commands help with `archery <cmd> --help`. 
@@ -64,23 +83,23 @@ def archery(ctx, debug, pdb, quiet): if debug: logger.setLevel(logging.DEBUG) - ctx.obj['debug'] = debug + ctx.obj["debug"] = debug if pdb: - import pdb + import pdb # noqa: T100 + sys.excepthook = lambda t, v, e: pdb.pm() build_dir_type = click.Path(dir_okay=True, file_okay=False, resolve_path=True) # Supported build types -build_type = click.Choice(["debug", "relwithdebinfo", "release"], - case_sensitive=False) +build_type = click.Choice(["debug", "relwithdebinfo", "release"], case_sensitive=False) # Supported warn levels -warn_level_type = click.Choice(["everything", "checkin", "production"], - case_sensitive=False) +warn_level_type = click.Choice( + ["everything", "checkin", "production"], case_sensitive=False +) -simd_level = click.Choice(["NONE", "SSE4_2", "AVX2", "AVX512"], - case_sensitive=True) +simd_level = click.Choice(["NONE", "SSE4_2", "AVX2", "AVX512"], case_sensitive=True) def cpp_toolchain_options(cmd): @@ -88,17 +107,22 @@ def cpp_toolchain_options(cmd): click.option("--cc", metavar="<compiler>", help="C compiler."), click.option("--cxx", metavar="<compiler>", help="C++ compiler."), click.option("--cxx-flags", help="C++ compiler flags."), - click.option("--cpp-package-prefix", - help=("Value to pass for ARROW_PACKAGE_PREFIX and " - "use ARROW_DEPENDENCY_SOURCE=SYSTEM")) + click.option( + "--cpp-package-prefix", + help=( + "Value to pass for ARROW_PACKAGE_PREFIX and " + "use ARROW_DEPENDENCY_SOURCE=SYSTEM" + ), + ), ] return _apply_options(cmd, options) def java_toolchain_options(cmd): options = [ - click.option("--java-home", metavar="<java_home>", - help="Path to Java Developers Kit."), + click.option( + "--java-home", metavar="<java_home>", help="Path to Java Developers Kit." + ), click.option("--java-options", help="java compiler options."), ] return _apply_options(cmd, options) @@ -111,109 +135,189 @@ def _apply_options(cmd, options): @archery.command(short_help="Initialize an Arrow C++ build") -@click.option("--src", metavar="<arrow_src>", default=None, - callback=validate_arrow_sources, - help="Specify Arrow source directory") +@click.option( + "--src", + metavar="<arrow_src>", + default=None, + callback=validate_arrow_sources, + help="Specify Arrow source directory", +) # toolchain @cpp_toolchain_options -@click.option("--build-type", default=None, type=build_type, - help="CMake's CMAKE_BUILD_TYPE") -@click.option("--build-static", default=True, type=BOOL, - help="Build static libraries") -@click.option("--build-shared", default=True, type=BOOL, - help="Build shared libraries") -@click.option("--build-unity", default=True, type=BOOL, - help="Use CMAKE_UNITY_BUILD") -@click.option("--warn-level", default="production", type=warn_level_type, - help="Controls compiler warnings -W(no-)error.") -@click.option("--use-gold-linker", default=True, type=BOOL, - help="Toggles ARROW_USE_LD_GOLD option.") -@click.option("--simd-level", default="DEFAULT", type=simd_level, - help="Toggles ARROW_SIMD_LEVEL option.") +@click.option( + "--build-type", default=None, type=build_type, help="CMake's CMAKE_BUILD_TYPE" +) +@click.option("--build-static", default=True, type=BOOL, help="Build static libraries") +@click.option("--build-shared", default=True, type=BOOL, help="Build shared libraries") +@click.option("--build-unity", default=True, type=BOOL, help="Use CMAKE_UNITY_BUILD") +@click.option( + "--warn-level", + default="production", + type=warn_level_type, + help="Controls compiler warnings -W(no-)error.", +) +@click.option( + "--use-gold-linker", + default=True, + 
type=BOOL, + help="Toggles ARROW_USE_LD_GOLD option.", +) +@click.option( + "--simd-level", + default="DEFAULT", + type=simd_level, + help="Toggles ARROW_SIMD_LEVEL option.", +) # Tests and benchmarks -@click.option("--with-tests", default=True, type=BOOL, - help="Build with tests.") -@click.option("--with-benchmarks", default=None, type=BOOL, - help="Build with benchmarks.") -@click.option("--with-examples", default=None, type=BOOL, - help="Build with examples.") -@click.option("--with-integration", default=None, type=BOOL, - help="Build with integration test executables.") +@click.option("--with-tests", default=True, type=BOOL, help="Build with tests.") +@click.option( + "--with-benchmarks", default=None, type=BOOL, help="Build with benchmarks." +) +@click.option("--with-examples", default=None, type=BOOL, help="Build with examples.") +@click.option( + "--with-integration", + default=None, + type=BOOL, + help="Build with integration test executables.", +) # Static checks -@click.option("--use-asan", default=None, type=BOOL, - help="Toggle ARROW_USE_ASAN sanitizer.") -@click.option("--use-tsan", default=None, type=BOOL, - help="Toggle ARROW_USE_TSAN sanitizer.") -@click.option("--use-ubsan", default=None, type=BOOL, - help="Toggle ARROW_USE_UBSAN sanitizer.") -@click.option("--with-fuzzing", default=None, type=BOOL, - help="Toggle ARROW_FUZZING.") +@click.option( + "--use-asan", default=None, type=BOOL, help="Toggle ARROW_USE_ASAN sanitizer." +) +@click.option( + "--use-tsan", default=None, type=BOOL, help="Toggle ARROW_USE_TSAN sanitizer." +) +@click.option( + "--use-ubsan", default=None, type=BOOL, help="Toggle ARROW_USE_UBSAN sanitizer." +) +@click.option("--with-fuzzing", default=None, type=BOOL, help="Toggle ARROW_FUZZING.") # Components -@click.option("--with-compute", default=None, type=BOOL, - help="Build the Arrow compute module.") -@click.option("--with-csv", default=None, type=BOOL, - help="Build the Arrow CSV parser module.") -@click.option("--with-cuda", default=None, type=BOOL, - help="Build the Arrow CUDA extensions.") -@click.option("--with-dataset", default=None, type=BOOL, - help="Build the Arrow dataset module.") -@click.option("--with-filesystem", default=None, type=BOOL, - help="Build the Arrow filesystem layer.") -@click.option("--with-flight", default=None, type=BOOL, - help="Build with Flight rpc support.") -@click.option("--with-gandiva", default=None, type=BOOL, - help="Build with Gandiva expression compiler support.") -@click.option("--with-gcs", default=None, type=BOOL, - help="Build Arrow with Google Cloud Storage (GCS) support.") -@click.option("--with-hdfs", default=None, type=BOOL, - help="Build the Arrow HDFS bridge.") -@click.option("--with-hiveserver2", default=None, type=BOOL, - help="Build the HiveServer2 client and arrow adapter.") -@click.option("--with-ipc", default=None, type=BOOL, - help="Build the Arrow IPC extensions.") -@click.option("--with-json", default=None, type=BOOL, - help="Build the Arrow JSON parser module.") -@click.option("--with-mimalloc", default=None, type=BOOL, - help="Build the Arrow mimalloc based allocator.") -@click.option("--with-parquet", default=None, type=BOOL, - help="Build with Parquet file support.") -@click.option("--with-python", default=None, type=BOOL, - help="Build the Arrow CPython extensions.") -@click.option("--with-r", default=None, type=BOOL, - help="Build the Arrow R extensions. 
This is not a CMake option, " - "it will toggle required options") -@click.option("--with-s3", default=None, type=BOOL, - help="Build Arrow with S3 support.") +@click.option( + "--with-compute", default=None, type=BOOL, help="Build the Arrow compute module." +) +@click.option( + "--with-csv", default=None, type=BOOL, help="Build the Arrow CSV parser module." +) +@click.option( + "--with-cuda", default=None, type=BOOL, help="Build the Arrow CUDA extensions." +) +@click.option( + "--with-dataset", default=None, type=BOOL, help="Build the Arrow dataset module." +) +@click.option( + "--with-filesystem", + default=None, + type=BOOL, + help="Build the Arrow filesystem layer.", +) +@click.option( + "--with-flight", default=None, type=BOOL, help="Build with Flight rpc support." +) +@click.option( + "--with-gandiva", + default=None, + type=BOOL, + help="Build with Gandiva expression compiler support.", +) +@click.option( + "--with-gcs", + default=None, + type=BOOL, + help="Build Arrow with Google Cloud Storage (GCS) support.", +) +@click.option( + "--with-hdfs", default=None, type=BOOL, help="Build the Arrow HDFS bridge." +) +@click.option( + "--with-hiveserver2", + default=None, + type=BOOL, + help="Build the HiveServer2 client and arrow adapter.", +) +@click.option( + "--with-ipc", default=None, type=BOOL, help="Build the Arrow IPC extensions." +) +@click.option( + "--with-json", default=None, type=BOOL, help="Build the Arrow JSON parser module." +) +@click.option( + "--with-mimalloc", + default=None, + type=BOOL, + help="Build the Arrow mimalloc based allocator.", +) +@click.option( + "--with-parquet", default=None, type=BOOL, help="Build with Parquet file support." +) +@click.option( + "--with-python", default=None, type=BOOL, help="Build the Arrow CPython extensions." +) +@click.option( + "--with-r", + default=None, + type=BOOL, + help="Build the Arrow R extensions. This is not a CMake option, " + "it will toggle required options", +) +@click.option("--with-s3", default=None, type=BOOL, help="Build Arrow with S3 support.") # Compressions -@click.option("--with-brotli", default=None, type=BOOL, - help="Build Arrow with brotli compression.") -@click.option("--with-bz2", default=None, type=BOOL, - help="Build Arrow with bz2 compression.") -@click.option("--with-lz4", default=None, type=BOOL, - help="Build Arrow with lz4 compression.") -@click.option("--with-snappy", default=None, type=BOOL, - help="Build Arrow with snappy compression.") -@click.option("--with-zlib", default=None, type=BOOL, - help="Build Arrow with zlib compression.") -@click.option("--with-zstd", default=None, type=BOOL, - help="Build Arrow with zstd compression.") +@click.option( + "--with-brotli", + default=None, + type=BOOL, + help="Build Arrow with brotli compression.", +) +@click.option( + "--with-bz2", default=None, type=BOOL, help="Build Arrow with bz2 compression." +) +@click.option( + "--with-lz4", default=None, type=BOOL, help="Build Arrow with lz4 compression." +) +@click.option( + "--with-snappy", + default=None, + type=BOOL, + help="Build Arrow with snappy compression.", +) +@click.option( + "--with-zlib", default=None, type=BOOL, help="Build Arrow with zlib compression." +) +@click.option( + "--with-zstd", default=None, type=BOOL, help="Build Arrow with zstd compression." +) # CMake extra feature -@click.option("--cmake-extras", type=str, multiple=True, - help="Extra flags/options to pass to cmake invocation. 
" - "Can be stacked") -@click.option("--install-prefix", type=str, - help="Destination directory where files are installed. Expand to" - "CMAKE_INSTALL_PREFIX. Defaults to to $CONDA_PREFIX if the" - "variable exists.") +@click.option( + "--cmake-extras", + type=str, + multiple=True, + help="Extra flags/options to pass to cmake invocation. Can be stacked", +) +@click.option( + "--install-prefix", + type=str, + help="Destination directory where files are installed. Expand to" + "CMAKE_INSTALL_PREFIX. Defaults to to $CONDA_PREFIX if the" + "variable exists.", +) # misc -@click.option("-f", "--force", type=BOOL, is_flag=True, default=False, - help="Delete existing build directory if found.") -@click.option("--targets", type=str, multiple=True, - help="Generator targets to run. Can be stacked.") +@click.option( + "-f", + "--force", + type=BOOL, + is_flag=True, + default=False, + help="Delete existing build directory if found.", +) +@click.option( + "--targets", + type=str, + multiple=True, + help="Generator targets to run. Can be stacked.", +) @click.argument("build_dir", type=build_dir_type) @click.pass_context def build(ctx, src, build_dir, force, targets, **kwargs): - """ Initialize a C++ build directory. + """Initialize a C++ build directory. The build command creates a directory initialized with Arrow's cpp source cmake and configuration. It can also optionally invoke the generator to @@ -224,7 +328,6 @@ def build(ctx, src, build_dir, force, targets, **kwargs): existing directory. Examples: - \b # Initialize build with clang8 and avx2 support in directory `clang8-build` \b @@ -233,6 +336,7 @@ def build(ctx, src, build_dir, force, targets, **kwargs): \b # Builds and run test archery build --targets=all --targets=test build + """ # Arrow's cpp cmake configuration conf = CppConfiguration(**kwargs) @@ -246,58 +350,69 @@ def build(ctx, src, build_dir, force, targets, **kwargs): build.run(target) -LintCheck = namedtuple('LintCheck', ('option_name', 'help')) +LintCheck = namedtuple("LintCheck", ("option_name", "help")) lint_checks = [ - LintCheck('clang-format', "Format C++ files with clang-format."), - LintCheck('clang-tidy', "Lint C++ files with clang-tidy."), - LintCheck('cpplint', "Lint C++ files with cpplint."), - LintCheck('iwyu', "Lint changed C++ files with Include-What-You-Use."), - LintCheck('python', - "Format and lint Python files with autopep8 and flake8."), - LintCheck('numpydoc', "Lint Python files with numpydoc."), - LintCheck('cmake-format', "Format CMake files with cmake-format.py."), - LintCheck('rat', - "Check all sources files for license texts via Apache RAT."), - LintCheck('r', "Lint R files."), - LintCheck('docker', "Lint Dockerfiles with hadolint."), - LintCheck('docs', "Lint docs with sphinx-lint."), + LintCheck("clang-format", "Format C++ files with clang-format."), + LintCheck("clang-tidy", "Lint C++ files with clang-tidy."), + LintCheck("cpplint", "Lint C++ files with cpplint."), + LintCheck("iwyu", "Lint changed C++ files with Include-What-You-Use."), + LintCheck("python", "Format and lint Python files with autopep8 and flake8."), + LintCheck("numpydoc", "Lint Python files with numpydoc."), + LintCheck("cmake-format", "Format CMake files with cmake-format.py."), + LintCheck("rat", "Check all sources files for license texts via Apache RAT."), + LintCheck("r", "Lint R files."), + LintCheck("docker", "Lint Dockerfiles with hadolint."), + LintCheck("docs", "Lint docs with sphinx-lint."), ] def decorate_lint_command(cmd): - """ - Decorate the lint() command function to 
add individual per-check options. - """ + """Decorate the lint() command function to add individual per-check options.""" for check in lint_checks: - option = click.option("--{0}/--no-{0}".format(check.option_name), - default=None, help=check.help) + option = click.option( + f"--{check.option_name}/--no-{check.option_name}", + default=None, + help=check.help, + ) cmd = option(cmd) return cmd @archery.command(short_help="Check Arrow source tree for errors") -@click.option("--src", metavar="<arrow_src>", default=None, - callback=validate_arrow_sources, - help="Specify Arrow source directory") -@click.option("--fix", is_flag=True, type=BOOL, default=False, - help="Toggle fixing the lint errors if the linter supports it.") -@click.option("--iwyu_all", is_flag=True, type=BOOL, default=False, - help="Run IWYU on all C++ files if enabled") -@click.option("-a", "--all", is_flag=True, default=False, - help="Enable all checks.") +@click.option( + "--src", + metavar="<arrow_src>", + default=None, + callback=validate_arrow_sources, + help="Specify Arrow source directory", +) +@click.option( + "--fix", + is_flag=True, + type=BOOL, + default=False, + help="Toggle fixing the lint errors if the linter supports it.", +) +@click.option( + "--iwyu_all", + is_flag=True, + type=BOOL, + default=False, + help="Run IWYU on all C++ files if enabled", +) +@click.option("-a", "--all", is_flag=True, default=False, help="Enable all checks.") @click.argument("path", required=False) @decorate_lint_command @click.pass_context def lint(ctx, src, fix, iwyu_all, path, **checks): - if checks.pop('all'): + if checks.pop("all"): # "--all" is given => enable all non-selected checks for k, v in checks.items(): if v is None: checks[k] = True if not any(checks.values()): - raise click.UsageError( - "Need to enable at least one lint check (try --help)") + raise click.UsageError("Need to enable at least one lint check (try --help)") try: linter(src, fix, iwyu_all=iwyu_all, path=path, **checks) except LintValidationException: @@ -307,22 +422,33 @@ def lint(ctx, src, fix, iwyu_all, path, **checks): def _flatten_numpydoc_rules(rules): flattened = [] for rule in rules: - flattened.extend(filter(None, rule.split(','))) + flattened.extend(filter(None, rule.split(","))) return flattened @archery.command(short_help="Lint python docstring with NumpyDoc") -@click.argument('symbols', nargs=-1) -@click.option("--src", metavar="<arrow_src>", default=None, - callback=validate_arrow_sources, - help="Specify Arrow source directory") -@click.option("--allow-rule", "-a", multiple=True, - help="Allow only these rules (can be comma-separated)") -@click.option("--disallow-rule", "-d", multiple=True, - help="Disallow these rules (can be comma-separated)") +@click.argument("symbols", nargs=-1) +@click.option( + "--src", + metavar="<arrow_src>", + default=None, + callback=validate_arrow_sources, + help="Specify Arrow source directory", +) +@click.option( + "--allow-rule", + "-a", + multiple=True, + help="Allow only these rules (can be comma-separated)", +) +@click.option( + "--disallow-rule", + "-d", + multiple=True, + help="Disallow these rules (can be comma-separated)", +) def numpydoc(src, symbols, allow_rule, disallow_rule): - """ - Pass list of modules or symbols as arguments to restrict the validation. + """Pass list of modules or symbols as arguments to restrict the validation. By default all modules of pyarrow are tried to be validated. 
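The per-check lint options above, and the shared benchmark options below, are built by applying click.option decorators in a loop (decorate_lint_command, and later _apply_options). A minimal, self-contained sketch of that pattern, with hypothetical check names standing in for archery's real lint checks:

import click

# Hypothetical checks standing in for archery's lint_checks list.
CHECKS = [("spelling", "Check spelling."), ("whitespace", "Check whitespace.")]

def with_check_options(cmd):
    # click.option(...) returns a decorator; applying it in a loop attaches
    # one --<name>/--no-<name> boolean flag per check to the wrapped function.
    for name, help_text in CHECKS:
        cmd = click.option(f"--{name}/--no-{name}", default=None, help=help_text)(cmd)
    return cmd

@click.command()
@with_check_options
def demo(**checks):
    # Unset flags arrive as None, which is how lint() above distinguishes
    # "not selected" from an explicit --no-<check>.
    click.echo(checks)

if __name__ == "__main__":
    demo()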
@@ -331,12 +457,15 @@ def numpydoc(src, symbols, allow_rule, disallow_rule): archery numpydoc pyarrow.dataset archery numpydoc pyarrow.csv pyarrow.json pyarrow.parquet archery numpydoc pyarrow.array + """ - disallow_rule = disallow_rule or {'GL01', 'SA01', 'EX01', 'ES01'} + disallow_rule = disallow_rule or {"GL01", "SA01", "EX01", "ES01"} try: results = python_numpydoc( - symbols, allow_rules=_flatten_numpydoc_rules(allow_rule), - disallow_rules=_flatten_numpydoc_rules(disallow_rule)) + symbols, + allow_rules=_flatten_numpydoc_rules(allow_rule), + disallow_rules=_flatten_numpydoc_rules(disallow_rule), + ) for result in results: result.ok() except LintValidationException: @@ -346,11 +475,10 @@ def numpydoc(src, symbols, allow_rule, disallow_rule): @archery.group() @click.pass_context def benchmark(ctx): - """ Arrow benchmarking. + """Arrow benchmarking. Use the diff sub-command to benchmark revisions, and/or build directories. """ - pass def benchmark_common_options(cmd): @@ -360,30 +488,66 @@ def check_language(ctx, param, value): return value options = [ - click.option("--src", metavar="<arrow_src>", show_default=True, - default=None, callback=validate_arrow_sources, - help="Specify Arrow source directory"), - click.option("--preserve", type=BOOL, default=False, show_default=True, - is_flag=True, - help="Preserve workspace for investigation."), - click.option("--output", metavar="<output>", - type=click.File("w", encoding="utf8"), default=None, - help="Capture output result into file."), - click.option("--language", metavar="<lang>", type=str, default="cpp", - show_default=True, callback=check_language, - help="Specify target language for the benchmark"), - click.option("--build-extras", type=str, multiple=True, - help="Extra flags/options to pass to mvn build. " - "Can be stacked. For language=java"), - click.option("--benchmark-extras", type=str, multiple=True, - help="Extra flags/options to pass to mvn benchmark. " - "Can be stacked. For language=java"), - click.option("--cmake-extras", type=str, multiple=True, - help="Extra flags/options to pass to cmake invocation. " - "Can be stacked. For language=cpp"), - click.option("--cpp-benchmark-extras", type=str, multiple=True, - help="Extra flags/options to pass to C++ benchmark executables. " - "Can be stacked. For language=cpp"), + click.option( + "--src", + metavar="<arrow_src>", + show_default=True, + default=None, + callback=validate_arrow_sources, + help="Specify Arrow source directory", + ), + click.option( + "--preserve", + type=BOOL, + default=False, + show_default=True, + is_flag=True, + help="Preserve workspace for investigation.", + ), + click.option( + "--output", + metavar="<output>", + type=click.File("w", encoding="utf8"), + default=None, + help="Capture output result into file.", + ), + click.option( + "--language", + metavar="<lang>", + type=str, + default="cpp", + show_default=True, + callback=check_language, + help="Specify target language for the benchmark", + ), + click.option( + "--build-extras", + type=str, + multiple=True, + help="Extra flags/options to pass to mvn build. " + "Can be stacked. For language=java", + ), + click.option( + "--benchmark-extras", + type=str, + multiple=True, + help="Extra flags/options to pass to mvn benchmark. " + "Can be stacked. For language=java", + ), + click.option( + "--cmake-extras", + type=str, + multiple=True, + help="Extra flags/options to pass to cmake invocation. " + "Can be stacked. 
For language=cpp", + ), + click.option( + "--cpp-benchmark-extras", + type=str, + multiple=True, + help="Extra flags/options to pass to C++ benchmark executables. " + "Can be stacked. For language=cpp", + ), ] cmd = java_toolchain_options(cmd) @@ -393,71 +557,126 @@ def check_language(ctx, param, value): def benchmark_filter_options(cmd): options = [ - click.option("--suite-filter", metavar="<regex>", show_default=True, - type=str, default=None, - help="Regex filtering benchmark suites."), - click.option("--benchmark-filter", metavar="<regex>", - show_default=True, type=str, default=None, - help="Regex filtering benchmarks.") + click.option( + "--suite-filter", + metavar="<regex>", + show_default=True, + type=str, + default=None, + help="Regex filtering benchmark suites.", + ), + click.option( + "--benchmark-filter", + metavar="<regex>", + show_default=True, + type=str, + default=None, + help="Regex filtering benchmarks.", + ), ] return _apply_options(cmd, options) @benchmark.command(name="list", short_help="List benchmark suite") -@click.argument("rev_or_path", metavar="[<rev_or_path>]", - default="WORKSPACE", required=False) +@click.argument( + "rev_or_path", metavar="[<rev_or_path>]", default="WORKSPACE", required=False +) @benchmark_common_options @click.pass_context -def benchmark_list(ctx, rev_or_path, src, preserve, output, cmake_extras, - java_home, java_options, build_extras, benchmark_extras, - cpp_benchmark_extras, language, **kwargs): - """ List benchmark suite. - """ +def benchmark_list( + ctx, + rev_or_path, + src, + preserve, + output, + cmake_extras, + java_home, + java_options, + build_extras, + benchmark_extras, + cpp_benchmark_extras, + language, + **kwargs, +): + """List benchmark suite.""" with tmpdir(preserve=preserve) as root: - logger.debug("Running benchmark {}".format(rev_or_path)) + logger.debug(f"Running benchmark {rev_or_path}") if language == "cpp": conf = CppBenchmarkRunner.default_configuration( - cmake_extras=cmake_extras, **kwargs) + cmake_extras=cmake_extras, **kwargs + ) runner_base = CppBenchmarkRunner.from_rev_or_path( - src, root, rev_or_path, conf, - benchmark_extras=cpp_benchmark_extras) + src, root, rev_or_path, conf, benchmark_extras=cpp_benchmark_extras + ) elif language == "java": - for key in {'cpp_package_prefix', 'cxx_flags', 'cxx', 'cc'}: + for key in ("cpp_package_prefix", "cxx_flags", "cxx", "cc"): del kwargs[key] conf = JavaBenchmarkRunner.default_configuration( - java_home=java_home, java_options=java_options, - build_extras=build_extras, benchmark_extras=benchmark_extras, - **kwargs) + java_home=java_home, + java_options=java_options, + build_extras=build_extras, + benchmark_extras=benchmark_extras, + **kwargs, + ) runner_base = JavaBenchmarkRunner.from_rev_or_path( - src, root, rev_or_path, conf) + src, root, rev_or_path, conf + ) for b in runner_base.list_benchmarks: click.echo(b, file=output or sys.stdout) @benchmark.command(name="run", short_help="Run benchmark suite") -@click.argument("rev_or_path", metavar="[<rev_or_path>]", - default="WORKSPACE", required=False) +@click.argument( + "rev_or_path", metavar="[<rev_or_path>]", default="WORKSPACE", required=False +) @benchmark_common_options @benchmark_filter_options -@click.option("--repetitions", type=int, default=-1, - help=("Number of repetitions of each benchmark. Increasing " - "may improve result precision. " - "[default: 1 for cpp, 5 for java]")) -@click.option("--repetition-min-time", type=float, default=None, - help=("Minimum duration of each repetition in seconds. 
" - "Currently only supported for language=cpp. " - "[default: use runner-specific defaults]")) +@click.option( + "--repetitions", + type=int, + default=-1, + help=( + "Number of repetitions of each benchmark. Increasing " + "may improve result precision. " + "[default: 1 for cpp, 5 for java]" + ), +) +@click.option( + "--repetition-min-time", + type=float, + default=None, + help=( + "Minimum duration of each repetition in seconds. " + "Currently only supported for language=cpp. " + "[default: use runner-specific defaults]" + ), +) @click.pass_context -def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras, - java_home, java_options, build_extras, benchmark_extras, - language, suite_filter, benchmark_filter, repetitions, - repetition_min_time, cpp_benchmark_extras, **kwargs): - """ Run benchmark suite. +def benchmark_run( + ctx, + rev_or_path, + src, + preserve, + output, + cmake_extras, + java_home, + java_options, + build_extras, + benchmark_extras, + language, + suite_filter, + benchmark_filter, + repetitions, + repetition_min_time, + cpp_benchmark_extras, + **kwargs, +): + """Run benchmark suite. This command will run the benchmark suite for a single build. This is used to capture (and/or publish) the results. @@ -473,7 +692,6 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras, workspace. This imply that no clone will be performed. Examples: - \b # Run the benchmarks on current git workspace \b @@ -493,34 +711,49 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras, # Run the benchmarks on current git workspace and output results as a JSON file. \b archery benchmark run --output=run.json + """ with tmpdir(preserve=preserve) as root: - logger.debug("Running benchmark {}".format(rev_or_path)) + logger.debug(f"Running benchmark {rev_or_path}") if language == "cpp": conf = CppBenchmarkRunner.default_configuration( - cmake_extras=cmake_extras, **kwargs) + cmake_extras=cmake_extras, **kwargs + ) repetitions = repetitions if repetitions != -1 else 1 runner_base = CppBenchmarkRunner.from_rev_or_path( - src, root, rev_or_path, conf, - repetitions=repetitions, repetition_min_time=repetition_min_time, - suite_filter=suite_filter, benchmark_filter=benchmark_filter, - benchmark_extras=cpp_benchmark_extras) + src, + root, + rev_or_path, + conf, + repetitions=repetitions, + repetition_min_time=repetition_min_time, + suite_filter=suite_filter, + benchmark_filter=benchmark_filter, + benchmark_extras=cpp_benchmark_extras, + ) elif language == "java": - for key in {'cpp_package_prefix', 'cxx_flags', 'cxx', 'cc'}: + for key in ("cpp_package_prefix", "cxx_flags", "cxx", "cc"): del kwargs[key] conf = JavaBenchmarkRunner.default_configuration( - java_home=java_home, java_options=java_options, - build_extras=build_extras, benchmark_extras=benchmark_extras, - **kwargs) + java_home=java_home, + java_options=java_options, + build_extras=build_extras, + benchmark_extras=benchmark_extras, + **kwargs, + ) repetitions = repetitions if repetitions != -1 else 5 runner_base = JavaBenchmarkRunner.from_rev_or_path( - src, root, rev_or_path, conf, + src, + root, + rev_or_path, + conf, repetitions=repetitions, - benchmark_filter=benchmark_filter) + benchmark_filter=benchmark_filter, + ) # XXX for some reason, the benchmark runner only does its work # when asked to JSON-serialize the results, so produce a JSON @@ -533,25 +766,59 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras, @benchmark.command(name="diff", short_help="Compare 
benchmark suites") @benchmark_common_options @benchmark_filter_options -@click.option("--threshold", type=float, default=DEFAULT_THRESHOLD, - show_default=True, - help="Regression failure threshold in percentage.") -@click.option("--repetitions", type=int, default=1, show_default=True, - help=("Number of repetitions of each benchmark. Increasing " - "may improve result precision. " - "[default: 1 for cpp, 5 for java")) -@click.option("--no-counters", type=BOOL, default=False, is_flag=True, - help="Hide counters field in diff report.") -@click.argument("contender", metavar="[<contender>", - default=ArrowSources.WORKSPACE, required=False) -@click.argument("baseline", metavar="[<baseline>]]", default="origin/HEAD", - required=False) +@click.option( + "--threshold", + type=float, + default=DEFAULT_THRESHOLD, + show_default=True, + help="Regression failure threshold in percentage.", +) +@click.option( + "--repetitions", + type=int, + default=1, + show_default=True, + help=( + "Number of repetitions of each benchmark. Increasing " + "may improve result precision. " + "[default: 1 for cpp, 5 for java" + ), +) +@click.option( + "--no-counters", + type=BOOL, + default=False, + is_flag=True, + help="Hide counters field in diff report.", +) +@click.argument( + "contender", metavar="[<contender>", default=ArrowSources.WORKSPACE, required=False +) +@click.argument( + "baseline", metavar="[<baseline>]]", default="origin/HEAD", required=False +) @click.pass_context -def benchmark_diff(ctx, src, preserve, output, language, cmake_extras, - suite_filter, benchmark_filter, repetitions, no_counters, - java_home, java_options, build_extras, benchmark_extras, - cpp_benchmark_extras, threshold, contender, baseline, - **kwargs): +def benchmark_diff( + ctx, + src, + preserve, + output, + language, + cmake_extras, + suite_filter, + benchmark_filter, + repetitions, + no_counters, + java_home, + java_options, + build_extras, + benchmark_extras, + cpp_benchmark_extras, + threshold, + contender, + baseline, + **kwargs, +): """Compare (diff) benchmark runs. This command acts like git-diff but for benchmark results. @@ -574,7 +841,6 @@ def benchmark_diff(ctx, src, preserve, output, language, cmake_extras, workspace. This imply that no clone will be performed. 
Examples: - \b # Compare workspace (contender) against the mainline development branch # (baseline) @@ -624,54 +890,75 @@ def benchmark_diff(ctx, src, preserve, output, language, cmake_extras, \b # This should not recompute the benchmark from run.json archery --quiet benchmark diff WORKSPACE run.json > result.json + """ with tmpdir(preserve=preserve) as root: - logger.debug("Comparing {} (contender) with {} (baseline)" - .format(contender, baseline)) + logger.debug(f"Comparing {contender} (contender) with {baseline} (baseline)") if language == "cpp": conf = CppBenchmarkRunner.default_configuration( - cmake_extras=cmake_extras, **kwargs) + cmake_extras=cmake_extras, **kwargs + ) repetitions = repetitions if repetitions != -1 else 1 runner_cont = CppBenchmarkRunner.from_rev_or_path( - src, root, contender, conf, + src, + root, + contender, + conf, repetitions=repetitions, suite_filter=suite_filter, benchmark_filter=benchmark_filter, - benchmark_extras=cpp_benchmark_extras) + benchmark_extras=cpp_benchmark_extras, + ) runner_base = CppBenchmarkRunner.from_rev_or_path( - src, root, baseline, conf, + src, + root, + baseline, + conf, repetitions=repetitions, suite_filter=suite_filter, benchmark_filter=benchmark_filter, - benchmark_extras=cpp_benchmark_extras) + benchmark_extras=cpp_benchmark_extras, + ) elif language == "java": - for key in {'cpp_package_prefix', 'cxx_flags', 'cxx', 'cc'}: + for key in ("cpp_package_prefix", "cxx_flags", "cxx", "cc"): del kwargs[key] conf = JavaBenchmarkRunner.default_configuration( - java_home=java_home, java_options=java_options, - build_extras=build_extras, benchmark_extras=benchmark_extras, - **kwargs) + java_home=java_home, + java_options=java_options, + build_extras=build_extras, + benchmark_extras=benchmark_extras, + **kwargs, + ) repetitions = repetitions if repetitions != -1 else 5 runner_cont = JavaBenchmarkRunner.from_rev_or_path( - src, root, contender, conf, + src, + root, + contender, + conf, repetitions=repetitions, - benchmark_filter=benchmark_filter) + benchmark_filter=benchmark_filter, + ) runner_base = JavaBenchmarkRunner.from_rev_or_path( - src, root, baseline, conf, + src, + root, + baseline, + conf, repetitions=repetitions, - benchmark_filter=benchmark_filter) + benchmark_filter=benchmark_filter, + ) runner_comp = RunnerComparator(runner_cont, runner_base, threshold) # TODO(kszucs): test that the output is properly formatted jsonlines comparisons_json = _get_comparisons_as_json(runner_comp.comparisons) ren_counters = language == "java" - formatted = _format_comparisons_with_pandas(comparisons_json, - no_counters, ren_counters) + formatted = _format_comparisons_with_pandas( + comparisons_json, no_counters, ren_counters + ) print(formatted, file=output or sys.stdout) @@ -684,38 +971,42 @@ def _get_comparisons_as_json(comparisons): return buf.getvalue() -def _format_comparisons_with_pandas(comparisons_json, no_counters, - ren_counters): +def _format_comparisons_with_pandas(comparisons_json, no_counters, ren_counters): pd = _import_pandas() df = pd.read_json(StringIO(comparisons_json), lines=True) # parse change % so we can sort by it - df['change %'] = df.pop('change').str[:-1].map(float) - first_regression = len(df) - df['regression'].sum() + df["change %"] = df.pop("change").str[:-1].map(float) + first_regression = len(df) - df["regression"].sum() - fields = ['benchmark', 'baseline', 'contender', 'change %'] + fields = ["benchmark", "baseline", "contender", "change %"] if not no_counters: - fields += ['counters'] + fields += ["counters"] df 
= df[fields] if ren_counters: - df = df.rename(columns={'counters': 'configurations'}) - df = df.sort_values(by='change %', ascending=False) + df = df.rename(columns={"counters": "configurations"}) + df = df.sort_values(by="change %", ascending=False) def labelled(title, df): if len(df) == 0: - return '' - title += ': ({})'.format(len(df)) + return "" + title += f": ({len(df)})" df_str = df.to_string(index=False) - bar = '-' * df_str.index('\n') - return '\n'.join([bar, title, bar, df_str]) + bar = "-" * df_str.index("\n") + return "\n".join([bar, title, bar, df_str]) - return '\n\n'.join([labelled('Non-regressions', df[:first_regression]), - labelled('Regressions', df[first_regression:])]) + return "\n\n".join( + [ + labelled("Non-regressions", df[:first_regression]), + labelled("Regressions", df[first_regression:]), + ] + ) # ---------------------------------------------------------------------- # Integration testing + def _set_default(opt, default): if opt is None: return default @@ -723,55 +1014,110 @@ def _set_default(opt, default): @archery.command(short_help="Execute protocol and Flight integration tests") -@click.option('--with-all', is_flag=True, default=False, - help=('Include all known implementations by default ' - 'in integration tests')) -@click.option('--random-seed', type=int, default=12345, - help="Seed for PRNG when generating test data") -@click.option('--with-cpp', type=bool, default=False, - help='Include C++ in integration tests') -@click.option('--with-csharp', type=bool, default=False, - help='Include C# in integration tests') -@click.option('--with-java', type=bool, default=False, - help='Include Java in integration tests', - envvar="ARCHERY_INTEGRATION_WITH_JAVA") -@click.option('--with-js', type=bool, default=False, - help='Include JavaScript in integration tests') -@click.option('--with-go', type=bool, default=False, - help='Include Go in integration tests', - envvar="ARCHERY_INTEGRATION_WITH_GO") -@click.option('--with-nanoarrow', type=bool, default=False, - help='Include nanoarrow in integration tests', - envvar="ARCHERY_INTEGRATION_WITH_NANOARROW") -@click.option('--with-rust', type=bool, default=False, - help='Include Rust in integration tests', - envvar="ARCHERY_INTEGRATION_WITH_RUST") -@click.option('--target-implementations', default='', - help=('Target implementations in this integration tests'), - envvar="ARCHERY_INTEGRATION_TARGET_IMPLEMENTATIONS") -@click.option('--write_generated_json', default="", - help='Generate test JSON to indicated path') -@click.option('--run-ipc', is_flag=True, default=False, - help='Run IPC integration tests') -@click.option('--run-flight', is_flag=True, default=False, - help='Run Flight integration tests') -@click.option('--run-c-data', is_flag=True, default=False, - help='Run C Data Interface integration tests') -@click.option('--debug', is_flag=True, default=False, - help='Run executables in debug mode as relevant') -@click.option('--serial', is_flag=True, default=False, - help='Run tests serially, rather than in parallel') -@click.option('--tempdir', default=None, - help=('Directory to use for writing ' - 'integration test temporary files')) -@click.option('stop_on_error', '-x', '--stop-on-error', - is_flag=True, default=False, - help='Stop on first error') -@click.option('--gold-dirs', multiple=True, - help="gold integration test file paths") -@click.option('-k', '--match', - help=("Substring for test names to include in run, " - "e.g. 
-k primitive")) +@click.option( + "--with-all", + is_flag=True, + default=False, + help=("Include all known implementations by default in integration tests"), +) +@click.option( + "--random-seed", + type=int, + default=12345, + help="Seed for PRNG when generating test data", +) +@click.option( + "--with-cpp", type=bool, default=False, help="Include C++ in integration tests" +) +@click.option( + "--with-csharp", type=bool, default=False, help="Include C# in integration tests" +) +@click.option( + "--with-java", + type=bool, + default=False, + help="Include Java in integration tests", + envvar="ARCHERY_INTEGRATION_WITH_JAVA", +) +@click.option( + "--with-js", + type=bool, + default=False, + help="Include JavaScript in integration tests", +) +@click.option( + "--with-go", + type=bool, + default=False, + help="Include Go in integration tests", + envvar="ARCHERY_INTEGRATION_WITH_GO", +) +@click.option( + "--with-nanoarrow", + type=bool, + default=False, + help="Include nanoarrow in integration tests", + envvar="ARCHERY_INTEGRATION_WITH_NANOARROW", +) +@click.option( + "--with-rust", + type=bool, + default=False, + help="Include Rust in integration tests", + envvar="ARCHERY_INTEGRATION_WITH_RUST", +) +@click.option( + "--target-implementations", + default="", + help=("Target implementations in this integration tests"), + envvar="ARCHERY_INTEGRATION_TARGET_IMPLEMENTATIONS", +) +@click.option( + "--write_generated_json", default="", help="Generate test JSON to indicated path" +) +@click.option( + "--run-ipc", is_flag=True, default=False, help="Run IPC integration tests" +) +@click.option( + "--run-flight", is_flag=True, default=False, help="Run Flight integration tests" +) +@click.option( + "--run-c-data", + is_flag=True, + default=False, + help="Run C Data Interface integration tests", +) +@click.option( + "--debug", + is_flag=True, + default=False, + help="Run executables in debug mode as relevant", +) +@click.option( + "--serial", + is_flag=True, + default=False, + help="Run tests serially, rather than in parallel", +) +@click.option( + "--tempdir", + default=None, + help=("Directory to use for writing integration test temporary files"), +) +@click.option( + "stop_on_error", + "-x", + "--stop-on-error", + is_flag=True, + default=False, + help="Stop on first error", +) +@click.option("--gold-dirs", multiple=True, help="gold integration test file paths") +@click.option( + "-k", + "--match", + help=("Substring for test names to include in run, e.g. -k primitive"), +) def integration(with_all=False, random_seed=12345, **args): """If you don't specify the "--target-implementations" option nor the "ARCHERY_INTEGRATION_TARGET_IMPLEMENTATIONS" environment @@ -830,31 +1176,31 @@ def integration(with_all=False, random_seed=12345, **args): | Java | Rust | """ - - from .integration.runner import write_js_test_json, run_all_tests import numpy as np + from .integration.runner import run_all_tests, write_js_test_json + # FIXME(bkietz) Include help strings for individual testers. # For example, CPPTester's ARROW_CPP_EXE_PATH environment variable. 
# Make runs involving data generation deterministic np.random.seed(random_seed) - gen_path = args['write_generated_json'] + gen_path = args["write_generated_json"] - implementations = ['cpp', 'csharp', 'java', 'js', 'go', 'nanoarrow', 'rust'] - formats = ['ipc', 'flight', 'c_data'] + implementations = ["cpp", "csharp", "java", "js", "go", "nanoarrow", "rust"] + formats = ["ipc", "flight", "c_data"] enabled_implementations = 0 for lang in implementations: - param = f'with_{lang}' + param = f"with_{lang}" if with_all: args[param] = with_all enabled_implementations += args[param] enabled_formats = 0 for fmt in formats: - param = f'run_{fmt}' + param = f"run_{fmt}" enabled_formats += args[param] if gen_path: @@ -866,72 +1212,90 @@ def integration(with_all=False, random_seed=12345, **args): if enabled_formats == 0: raise click.UsageError( "Need to enable at least one format to test " - "(IPC, Flight, C Data Interface); try --help") + "(IPC, Flight, C Data Interface); try --help" + ) if enabled_implementations == 0: raise click.UsageError( - "Need to enable at least one implementation to test; try --help") + "Need to enable at least one implementation to test; try --help" + ) run_all_tests(**args) @archery.command() -@click.option('--arrow-token', envvar='ARROW_GITHUB_TOKEN', - help='OAuth token for responding comment in the arrow repo') -@click.option('--committers-file', '-c', type=click.File('r', encoding='utf8')) -@click.option('--event-name', '-n', required=True) -@click.option('--event-payload', '-p', type=click.File('r', encoding='utf8'), - default='-', required=True) +@click.option( + "--arrow-token", + envvar="ARROW_GITHUB_TOKEN", + help="OAuth token for responding comment in the arrow repo", +) +@click.option("--committers-file", "-c", type=click.File("r", encoding="utf8")) +@click.option("--event-name", "-n", required=True) +@click.option( + "--event-payload", + "-p", + type=click.File("r", encoding="utf8"), + default="-", + required=True, +) def trigger_bot(arrow_token, committers_file, event_name, event_payload): - from .bot import CommentBot, PullRequestWorkflowBot, actions from ruamel.yaml import YAML + from .bot import CommentBot, PullRequestWorkflowBot, actions + event_payload = json.loads(event_payload.read()) - if 'comment' in event_name: - bot = CommentBot(name='github-actions', handler=actions, token=arrow_token) + if "comment" in event_name: + bot = CommentBot(name="github-actions", handler=actions, token=arrow_token) bot.handle(event_name, event_payload) else: committers = None if committers_file: - committers = [committer['alias'] - for committer in YAML().load(committers_file)] - bot = PullRequestWorkflowBot(event_name, event_payload, token=arrow_token, - committers=committers) + committers = [ + committer["alias"] for committer in YAML().load(committers_file) + ] + bot = PullRequestWorkflowBot( + event_name, event_payload, token=arrow_token, committers=committers + ) bot.handle() @archery.group("linking") @click.pass_obj def linking(obj): - """ - Quick and dirty utilities for checking library linkage. 
- """ - pass + """Quick and dirty utilities for checking library linkage.""" @linking.command("check-dependencies") @click.argument("paths", nargs=-1) -@click.option("--allow", "-a", "allowed", multiple=True, - help="Name of the allowed libraries") -@click.option("--disallow", "-d", "disallowed", multiple=True, - help="Name of the disallowed libraries") +@click.option( + "--allow", "-a", "allowed", multiple=True, help="Name of the allowed libraries" +) +@click.option( + "--disallow", + "-d", + "disallowed", + multiple=True, + help="Name of the disallowed libraries", +) @click.pass_obj def linking_check_dependencies(obj, allowed, disallowed, paths): - from .linking import check_dynamic_library_dependencies, DependencyError + from .linking import DependencyError, check_dynamic_library_dependencies allowed, disallowed = set(allowed), set(disallowed) try: for path in map(pathlib.Path, paths): - check_dynamic_library_dependencies(path, allowed=allowed, - disallowed=disallowed) + check_dynamic_library_dependencies( + path, allowed=allowed, disallowed=disallowed + ) except DependencyError as e: raise click.ClickException(str(e)) -add_optional_command("docker", module=".docker.cli", function="docker", - parent=archery) -add_optional_command("release", module=".release.cli", function="release", - parent=archery) -add_optional_command("crossbow", module=".crossbow.cli", function="crossbow", - parent=archery) +add_optional_command("docker", module=".docker.cli", function="docker", parent=archery) +add_optional_command( + "release", module=".release.cli", function="release", parent=archery +) +add_optional_command( + "crossbow", module=".crossbow.cli", function="crossbow", parent=archery +) if __name__ == "__main__": diff --git a/dev/archery/archery/compat.py b/dev/archery/archery/compat.py index 33ff869668d29..7279c33edfa7f 100644 --- a/dev/archery/archery/compat.py +++ b/dev/archery/archery/compat.py @@ -14,13 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import pathlib import sys def _is_path_like(path): - return isinstance(path, str) or hasattr(path, '__fspath__') + return isinstance(path, str) or hasattr(path, "__fspath__") def _ensure_path(path): @@ -31,9 +32,7 @@ def _ensure_path(path): def _stringify_path(path): - """ - Convert *path* to a string or unicode path if possible. - """ + """Convert *path* to a string or unicode path if possible.""" if isinstance(path, str): return path @@ -48,15 +47,14 @@ def _stringify_path(path): def _import_pandas(): # ARROW-13425: avoid importing PyArrow from Pandas - sys.modules['pyarrow'] = None + sys.modules["pyarrow"] = None import pandas as pd + return pd def _get_module(obj, *, default=None): - """ - Try to find the name of the module *obj* is defined on. - """ + """Try to find the name of the module *obj* is defined on.""" try: return obj.__module__ except AttributeError: diff --git a/dev/archery/archery/crossbow/__init__.py b/dev/archery/archery/crossbow/__init__.py index bc72e81f05054..5f09fc4ff28e1 100644 --- a/dev/archery/archery/crossbow/__init__.py +++ b/dev/archery/archery/crossbow/__init__.py @@ -14,6 +14,18 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+from __future__ import annotations -from .core import Config, Repo, Queue, Target, Job # noqa -from .reports import CommentReport, ConsoleReport, EmailReport # noqa +from .core import Config, Job, Queue, Repo, Target +from .reports import CommentReport, ConsoleReport, EmailReport + +__all__ = [ + "CommentReport", + "Config", + "ConsoleReport", + "EmailReport", + "Job", + "Queue", + "Repo", + "Target", +] diff --git a/dev/archery/archery/crossbow/cli.py b/dev/archery/archery/crossbow/cli.py index 5e31b37dd441a..64b878d96f3ad 100644 --- a/dev/archery/archery/crossbow/cli.py +++ b/dev/archery/archery/crossbow/cli.py @@ -14,19 +14,25 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations +import sys +import time from datetime import date from pathlib import Path -import time -import sys import click -from .core import Config, Repo, Queue, Target, Job, CrossbowError -from .reports import (ChatReport, Report, ReportUtils, ConsoleReport, - EmailReport, CommentReport) from ..utils.source import ArrowSources - +from .core import Config, CrossbowError, Job, Queue, Repo, Target +from .reports import ( + ChatReport, + CommentReport, + ConsoleReport, + EmailReport, + Report, + ReportUtils, +) _default_arrow_path = ArrowSources.find().path _default_queue_path = _default_arrow_path.parent / "crossbow" @@ -34,85 +40,155 @@ @click.group() -@click.option('--github-token', '-t', default=None, - envvar="CROSSBOW_GITHUB_TOKEN", - help='OAuth token for GitHub authentication') -@click.option('--arrow-path', '-a', - type=click.Path(), default=_default_arrow_path, - help='Arrow\'s repository path. Defaults to the repository of ' - 'this script') -@click.option('--queue-path', '-q', - envvar="CROSSBOW_QUEUE_PATH", - type=click.Path(), default=_default_queue_path, - help='The repository path used for scheduling the tasks. ' - 'Defaults to crossbow directory placed next to arrow') -@click.option('--queue-remote', '-qr', default=None, - help='Force to use this remote URL for the Queue repository') -@click.option('--output-file', metavar='<output>', - type=click.File('w', encoding='utf8'), default='-', - help='Capture output result into file.') +@click.option( + "--github-token", + "-t", + default=None, + envvar="CROSSBOW_GITHUB_TOKEN", + help="OAuth token for GitHub authentication", +) +@click.option( + "--arrow-path", + "-a", + type=click.Path(), + default=_default_arrow_path, + help="Arrow's repository path. Defaults to the repository of this script", +) +@click.option( + "--queue-path", + "-q", + envvar="CROSSBOW_QUEUE_PATH", + type=click.Path(), + default=_default_queue_path, + help="The repository path used for scheduling the tasks. " + "Defaults to crossbow directory placed next to arrow", +) +@click.option( + "--queue-remote", + "-qr", + default=None, + help="Force to use this remote URL for the Queue repository", +) +@click.option( + "--output-file", + metavar="<output>", + type=click.File("w", encoding="utf8"), + default="-", + help="Capture output result into file.", +) @click.pass_context -def crossbow(ctx, github_token, arrow_path, queue_path, queue_remote, - output_file): - """ - Schedule packaging tasks or nightly builds on CI services. 
- """ +def crossbow(ctx, github_token, arrow_path, queue_path, queue_remote, output_file): + """Schedule packaging tasks or nightly builds on CI services.""" ctx.ensure_object(dict) - ctx.obj['output'] = output_file - ctx.obj['arrow'] = Repo(arrow_path) - ctx.obj['queue'] = Queue(queue_path, remote_url=queue_remote, - github_token=github_token, require_https=True) + ctx.obj["output"] = output_file + ctx.obj["arrow"] = Repo(arrow_path) + ctx.obj["queue"] = Queue( + queue_path, + remote_url=queue_remote, + github_token=github_token, + require_https=True, + ) @crossbow.command() -@click.option('--config-path', '-c', - type=click.Path(exists=True), default=_default_config_path, - help='Task configuration yml. Defaults to tasks.yml') +@click.option( + "--config-path", + "-c", + type=click.Path(exists=True), + default=_default_config_path, + help="Task configuration yml. Defaults to tasks.yml", +) @click.pass_obj def check_config(obj, config_path): # load available tasks configuration and groups from yaml config = Config.load_yaml(config_path) config.validate() - output = obj['output'] + output = obj["output"] config.show(output) @crossbow.command() -@click.argument('tasks', nargs=-1, required=False) -@click.option('--group', '-g', 'groups', multiple=True, - help='Submit task groups as defined in task.yml') -@click.option('--param', '-p', 'params', multiple=True, - help='Additional task parameters for rendering the CI templates') -@click.option('--job-prefix', default='build', - help='Arbitrary prefix for branch names, e.g. nightly') -@click.option('--config-path', '-c', - type=click.Path(exists=True), default=_default_config_path, - help='Task configuration yml. Defaults to tasks.yml') -@click.option('--arrow-version', '-v', default=None, - help='Set target version explicitly.') -@click.option('--arrow-remote', '-r', default=None, - help='Set GitHub remote explicitly, which is going to be cloned ' - 'on the CI services. Note, that no validation happens ' - 'locally. Examples: https://github.com/apache/arrow or ' - 'https://github.com/kszucs/arrow.') -@click.option('--arrow-branch', '-b', default=None, - help='Give the branch name explicitly, e.g. ARROW-1949.') -@click.option('--arrow-sha', '-t', default=None, - help='Set commit SHA or Tag name explicitly, e.g. f67a515, ' - 'apache-arrow-0.11.1.') -@click.option('--fetch/--no-fetch', default=True, - help='Fetch references (branches and tags) from the remote') -@click.option('--dry-run/--commit', default=False, - help='Just display the rendered CI configurations without ' - 'committing them') -@click.option('--no-push/--push', default=False, - help='Don\'t push the changes') +@click.argument("tasks", nargs=-1, required=False) +@click.option( + "--group", + "-g", + "groups", + multiple=True, + help="Submit task groups as defined in task.yml", +) +@click.option( + "--param", + "-p", + "params", + multiple=True, + help="Additional task parameters for rendering the CI templates", +) +@click.option( + "--job-prefix", + default="build", + help="Arbitrary prefix for branch names, e.g. nightly", +) +@click.option( + "--config-path", + "-c", + type=click.Path(exists=True), + default=_default_config_path, + help="Task configuration yml. Defaults to tasks.yml", +) +@click.option( + "--arrow-version", "-v", default=None, help="Set target version explicitly." +) +@click.option( + "--arrow-remote", + "-r", + default=None, + help="Set GitHub remote explicitly, which is going to be cloned " + "on the CI services. Note, that no validation happens " + "locally. 
Examples: https://github.com/apache/arrow or " + "https://github.com/kszucs/arrow.", +) +@click.option( + "--arrow-branch", + "-b", + default=None, + help="Give the branch name explicitly, e.g. ARROW-1949.", +) +@click.option( + "--arrow-sha", + "-t", + default=None, + help="Set commit SHA or Tag name explicitly, e.g. f67a515, apache-arrow-0.11.1.", +) +@click.option( + "--fetch/--no-fetch", + default=True, + help="Fetch references (branches and tags) from the remote", +) +@click.option( + "--dry-run/--commit", + default=False, + help="Just display the rendered CI configurations without committing them", +) +@click.option("--no-push/--push", default=False, help="Don't push the changes") @click.pass_obj -def submit(obj, tasks, groups, params, job_prefix, config_path, arrow_version, - arrow_remote, arrow_branch, arrow_sha, fetch, dry_run, no_push): - output = obj['output'] - queue, arrow = obj['queue'], obj['arrow'] +def submit( + obj, + tasks, + groups, + params, + job_prefix, + config_path, + arrow_version, + arrow_remote, + arrow_branch, + arrow_sha, + fetch, + dry_run, + no_push, +): + output = obj["output"] + queue, arrow = obj["queue"], obj["arrow"] # load available tasks configuration and groups from yaml config = Config.load_yaml(config_path) @@ -128,16 +204,22 @@ def submit(obj, tasks, groups, params, job_prefix, config_path, arrow_version, # in case of the release procedure), because the templates still # contain some business logic (dependency installation, deployments) # which will be reduced to a single command in the future. - target = Target.from_repo(arrow, remote=arrow_remote, branch=arrow_branch, - head=arrow_sha, version=arrow_version) + target = Target.from_repo( + arrow, + remote=arrow_remote, + branch=arrow_branch, + head=arrow_sha, + version=arrow_version, + ) # parse additional job parameters params = dict([p.split("=") for p in params]) # instantiate the job object try: - job = Job.from_config(config=config, target=target, tasks=tasks, - groups=groups, params=params) + job = Job.from_config( + config=config, target=target, tasks=tasks, groups=groups, params=params + ) except CrossbowError as e: raise click.ClickException(str(e)) @@ -150,137 +232,198 @@ def submit(obj, tasks, groups, params, job_prefix, config_path, arrow_version, queue.put(job, prefix=job_prefix) if no_push: - click.echo('Branches and commits created but not pushed: `{}`' - .format(job.branch)) + click.echo(f"Branches and commits created but not pushed: `{job.branch}`") else: queue.push() - click.echo('Pushed job identifier is: `{}`'.format(job.branch)) + click.echo(f"Pushed job identifier is: `{job.branch}`") @crossbow.command() -@click.option('--base-branch', default=None, - help='Set base branch for the PR.') -@click.option('--create-pr', is_flag=True, default=False, - help='Create GitHub Pull Request') -@click.option('--head-branch', default=None, - help='Give the branch name explicitly, e.g. release-9.0.0-rc0') -@click.option('--pr-body', default=None, - help='Set body for the PR.') -@click.option('--pr-title', default=None, - help='Set title for the PR.') -@click.option('--remote', default=None, - help='Set GitHub remote explicitly, which is going to be used ' - 'for the PR. Note, that no validation happens ' - 'locally. 
Examples: https://github.com/apache/arrow or ' - 'https://github.com/raulcd/arrow.') -@click.option('--rc', default=None, - help='Release Candidate number.') -@click.option('--version', default=None, - help='Release version.') -@click.option('--verify-binaries', is_flag=True, default=False, - help='Trigger the verify binaries jobs') -@click.option('--verify-source', is_flag=True, default=False, - help='Trigger the verify source jobs') -@click.option('--verify-wheels', is_flag=True, default=False, - help='Trigger the verify wheels jobs') +@click.option("--base-branch", default=None, help="Set base branch for the PR.") +@click.option( + "--create-pr", is_flag=True, default=False, help="Create GitHub Pull Request" +) +@click.option( + "--head-branch", + default=None, + help="Give the branch name explicitly, e.g. release-9.0.0-rc0", +) +@click.option("--pr-body", default=None, help="Set body for the PR.") +@click.option("--pr-title", default=None, help="Set title for the PR.") +@click.option( + "--remote", + default=None, + help="Set GitHub remote explicitly, which is going to be used " + "for the PR. Note, that no validation happens " + "locally. Examples: https://github.com/apache/arrow or " + "https://github.com/raulcd/arrow.", +) +@click.option("--rc", default=None, help="Release Candidate number.") +@click.option("--version", default=None, help="Release version.") +@click.option( + "--verify-binaries", + is_flag=True, + default=False, + help="Trigger the verify binaries jobs", +) +@click.option( + "--verify-source", + is_flag=True, + default=False, + help="Trigger the verify source jobs", +) +@click.option( + "--verify-wheels", + is_flag=True, + default=False, + help="Trigger the verify wheels jobs", +) @click.pass_obj -def verify_release_candidate(obj, base_branch, create_pr, - head_branch, pr_body, pr_title, remote, - rc, version, verify_binaries, verify_source, - verify_wheels): +def verify_release_candidate( + obj, + base_branch, + create_pr, + head_branch, + pr_body, + pr_title, + remote, + rc, + version, + verify_binaries, + verify_source, + verify_wheels, +): # The verify-release-candidate command will create a PR (or find one) # and add the verify-rc* comment to trigger the verify tasks # Redefine Arrow repo to use the correct arrow remote. - arrow = Repo(path=obj['arrow'].path, remote_url=remote) - - response = arrow.github_pr(title=pr_title, head=head_branch, - base=base_branch, body=pr_body, - github_token=obj['queue'].github_token, - create=create_pr) + arrow = Repo(path=obj["arrow"].path, remote_url=remote) + + response = arrow.github_pr( + title=pr_title, + head=head_branch, + base=base_branch, + body=pr_body, + github_token=obj["queue"].github_token, + create=create_pr, + ) # If we want to trigger any verification job we add a comment to the PR. 
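# For illustration only, with hypothetical values: passing --verify-source and
# --verify-wheels together with --version 20.0.0 --rc 0 results in a PR comment
# roughly of the form:
#   @github-actions crossbow submit --group verify-rc-source --group verify-rc-wheels --param release=20.0.0 --param rc=0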
verify_flags = [verify_source, verify_binaries, verify_wheels] if any(verify_flags): command = "@github-actions crossbow submit" - verify_groups = ["verify-rc-source", - "verify-rc-binaries", "verify-rc-wheels"] + verify_groups = ["verify-rc-source", "verify-rc-binaries", "verify-rc-wheels"] job_groups = "" for flag, group in zip(verify_flags, verify_groups): if flag: job_groups += f" --group {group}" response.create_comment( - f"{command} {job_groups} --param " + - f"release={version} --param rc={rc}") + f"{command} {job_groups} --param " + f"release={version} --param rc={rc}" + ) @crossbow.command() -@click.argument('task', required=True) -@click.option('--config-path', '-c', - type=click.Path(exists=True), default=_default_config_path, - help='Task configuration yml. Defaults to tasks.yml') -@click.option('--arrow-version', '-v', default=None, - help='Set target version explicitly.') -@click.option('--arrow-remote', '-r', default=None, - help='Set GitHub remote explicitly, which is going to be cloned ' - 'on the CI services. Note, that no validation happens ' - 'locally. Examples: https://github.com/apache/arrow or ' - 'https://github.com/kszucs/arrow.') -@click.option('--arrow-branch', '-b', default=None, - help='Give the branch name explicitly, e.g. ARROW-1949.') -@click.option('--arrow-sha', '-t', default=None, - help='Set commit SHA or Tag name explicitly, e.g. f67a515, ' - 'apache-arrow-0.11.1.') -@click.option('--param', '-p', 'params', multiple=True, - help='Additional task parameters for rendering the CI templates') +@click.argument("task", required=True) +@click.option( + "--config-path", + "-c", + type=click.Path(exists=True), + default=_default_config_path, + help="Task configuration yml. Defaults to tasks.yml", +) +@click.option( + "--arrow-version", "-v", default=None, help="Set target version explicitly." +) +@click.option( + "--arrow-remote", + "-r", + default=None, + help="Set GitHub remote explicitly, which is going to be cloned " + "on the CI services. Note, that no validation happens " + "locally. Examples: https://github.com/apache/arrow or " + "https://github.com/kszucs/arrow.", +) +@click.option( + "--arrow-branch", + "-b", + default=None, + help="Give the branch name explicitly, e.g. ARROW-1949.", +) +@click.option( + "--arrow-sha", + "-t", + default=None, + help="Set commit SHA or Tag name explicitly, e.g. f67a515, apache-arrow-0.11.1.", +) +@click.option( + "--param", + "-p", + "params", + multiple=True, + help="Additional task parameters for rendering the CI templates", +) @click.pass_obj -def render(obj, task, config_path, arrow_version, arrow_remote, arrow_branch, - arrow_sha, params): - """ - Utility command to check the rendered CI templates. 
- """ +def render( + obj, task, config_path, arrow_version, arrow_remote, arrow_branch, arrow_sha, params +): + """Utility command to check the rendered CI templates.""" from .core import _flatten def highlight(code): try: from pygments import highlight - from pygments.lexers import YamlLexer from pygments.formatters import TerminalFormatter + from pygments.lexers import YamlLexer + return highlight(code, YamlLexer(), TerminalFormatter()) except ImportError: return code - arrow = obj['arrow'] + arrow = obj["arrow"] - target = Target.from_repo(arrow, remote=arrow_remote, branch=arrow_branch, - head=arrow_sha, version=arrow_version) + target = Target.from_repo( + arrow, + remote=arrow_remote, + branch=arrow_branch, + head=arrow_sha, + version=arrow_version, + ) config = Config.load_yaml(config_path) params = dict([p.split("=") for p in params]) params["queue_remote_url"] = "https://github.com/org/crossbow" - job = Job.from_config(config=config, target=target, tasks=[task], - params=params) + job = Job.from_config(config=config, target=target, tasks=[task], params=params) - for task_name, rendered_files in job.render_tasks().items(): + for _task_name, rendered_files in job.render_tasks().items(): for path, content in _flatten(rendered_files).items(): - click.echo('#' * 80) - click.echo('### {:^72} ###'.format("/".join(path))) - click.echo('#' * 80) + click.echo("#" * 80) + click.echo("### {:^72} ###".format("/".join(path))) + click.echo("#" * 80) click.echo(highlight(content)) @crossbow.command() -@click.argument('job-name', required=True) -@click.option('--fetch/--no-fetch', default=True, - help='Fetch references (branches and tags) from the remote') -@click.option('--task-filter', '-f', 'task_filters', multiple=True, - help='Glob pattern for filtering relevant tasks') -@click.option('--validate/--no-validate', default=False, - help='Return non-zero exit code ' - 'if there is any non-success task') +@click.argument("job-name", required=True) +@click.option( + "--fetch/--no-fetch", + default=True, + help="Fetch references (branches and tags) from the remote", +) +@click.option( + "--task-filter", + "-f", + "task_filters", + multiple=True, + help="Glob pattern for filtering relevant tasks", +) +@click.option( + "--validate/--no-validate", + default=False, + help="Return non-zero exit code if there is any non-success task", +) @click.pass_obj def status(obj, job_name, fetch, task_filters, validate): - output = obj['output'] - queue = obj['queue'] + output = obj["output"] + queue = obj["queue"] if fetch: queue.fetch() job = queue.get(job_name) @@ -289,7 +432,7 @@ def status(obj, job_name, fetch, task_filters, validate): def asset_callback(task_name, task, asset): nonlocal success - if task.status().combined_state in {'error', 'failure'}: + if task.status().combined_state in {"error", "failure"}: success = False if asset is None: success = False @@ -301,43 +444,58 @@ def asset_callback(task_name, task, asset): @crossbow.command() -@click.option('--arrow-remote', '-r', default=None, - help='Set GitHub remote explicitly, which is going to be cloned ' - 'on the CI services. Note, that no validation happens ' - 'locally. 
Examples: "https://github.com/apache/arrow" or ' - '"raulcd/arrow".') -@click.option('--crossbow', '-c', default='ursacomputing/crossbow', - help='Crossbow repository on github to use') -@click.option('--fetch/--no-fetch', default=True, - help='Fetch references (branches and tags) from the remote') -@click.option('--job-name', required=True) -@click.option('--pr-title', required=True, - help='Track the job submitted on PR with given title') +@click.option( + "--arrow-remote", + "-r", + default=None, + help="Set GitHub remote explicitly, which is going to be cloned " + "on the CI services. Note, that no validation happens " + 'locally. Examples: "https://github.com/apache/arrow" or ' + '"raulcd/arrow".', +) +@click.option( + "--crossbow", + "-c", + default="ursacomputing/crossbow", + help="Crossbow repository on github to use", +) +@click.option( + "--fetch/--no-fetch", + default=True, + help="Fetch references (branches and tags) from the remote", +) +@click.option("--job-name", required=True) +@click.option( + "--pr-title", required=True, help="Track the job submitted on PR with given title" +) @click.pass_obj def report_pr(obj, arrow_remote, crossbow, fetch, job_name, pr_title): - arrow = obj['arrow'] - queue = obj['queue'] + arrow = obj["arrow"] + queue = obj["queue"] if fetch: queue.fetch() job = queue.get(job_name) report = CommentReport(job, crossbow_repo=crossbow) target_arrow = Repo(path=arrow.path, remote_url=arrow_remote) - pull_request = target_arrow.github_pr(title=pr_title, - github_token=queue.github_token, - create=False) + pull_request = target_arrow.github_pr( + title=pr_title, github_token=queue.github_token, create=False + ) # render the response comment's content on the PR pull_request.create_comment(report.show()) - click.echo(f'Job is tracked on PR {pull_request.html_url}') + click.echo(f"Job is tracked on PR {pull_request.html_url}") @crossbow.command() -@click.argument('prefix', required=True) -@click.option('--fetch/--no-fetch', default=True, - help='Fetch references (branches and tags) from the remote') +@click.argument("prefix", required=True) +@click.option( + "--fetch/--no-fetch", + default=True, + help="Fetch references (branches and tags) from the remote", +) @click.pass_obj def latest_prefix(obj, prefix, fetch): - queue = obj['queue'] + queue = obj["queue"] if fetch: queue.fetch() latest = queue.latest_for_prefix(prefix) @@ -345,40 +503,64 @@ def latest_prefix(obj, prefix, fetch): @crossbow.command() -@click.argument('job-name', required=True) -@click.option('--sender-name', '-n', - help='Name to use for report e-mail.') -@click.option('--sender-email', '-e', - help='E-mail to use for report e-mail.') -@click.option('--recipient-email', '-r', - help='Where to send the e-mail report') -@click.option('--smtp-user', '-u', - help='E-mail address to use for SMTP login') -@click.option('--smtp-password', '-P', - help='SMTP password to use for report e-mail.') -@click.option('--smtp-server', '-s', default='smtp.gmail.com', - help='SMTP server to use for report e-mail.') -@click.option('--smtp-port', '-p', default=465, - help='SMTP port to use for report e-mail.') -@click.option('--poll/--no-poll', default=False, - help='Wait for completion if there are tasks pending') -@click.option('--poll-max-minutes', default=180, - help='Maximum amount of time waiting for job completion') -@click.option('--poll-interval-minutes', default=10, - help='Number of minutes to wait to check job status again') -@click.option('--send/--dry-run', default=False, - help='Just display the 
report, don\'t send it') -@click.option('--fetch/--no-fetch', default=True, - help='Fetch references (branches and tags) from the remote') +@click.argument("job-name", required=True) +@click.option("--sender-name", "-n", help="Name to use for report e-mail.") +@click.option("--sender-email", "-e", help="E-mail to use for report e-mail.") +@click.option("--recipient-email", "-r", help="Where to send the e-mail report") +@click.option("--smtp-user", "-u", help="E-mail address to use for SMTP login") +@click.option("--smtp-password", "-P", help="SMTP password to use for report e-mail.") +@click.option( + "--smtp-server", + "-s", + default="smtp.gmail.com", + help="SMTP server to use for report e-mail.", +) +@click.option( + "--smtp-port", "-p", default=465, help="SMTP port to use for report e-mail." +) +@click.option( + "--poll/--no-poll", + default=False, + help="Wait for completion if there are tasks pending", +) +@click.option( + "--poll-max-minutes", + default=180, + help="Maximum amount of time waiting for job completion", +) +@click.option( + "--poll-interval-minutes", + default=10, + help="Number of minutes to wait to check job status again", +) +@click.option( + "--send/--dry-run", default=False, help="Just display the report, don't send it" +) +@click.option( + "--fetch/--no-fetch", + default=True, + help="Fetch references (branches and tags) from the remote", +) @click.pass_obj -def report(obj, job_name, sender_name, sender_email, recipient_email, - smtp_user, smtp_password, smtp_server, smtp_port, poll, - poll_max_minutes, poll_interval_minutes, send, fetch): - """ - Send an e-mail report showing success/failure of tasks in a Crossbow run - """ - output = obj['output'] - queue = obj['queue'] +def report( + obj, + job_name, + sender_name, + sender_email, + recipient_email, + smtp_user, + smtp_password, + smtp_server, + smtp_port, + poll, + poll_max_minutes, + poll_interval_minutes, + send, + fetch, +): + """Send an e-mail report showing success/failure of tasks in a Crossbow run""" + output = obj["output"] + queue = obj["queue"] if fetch: queue.fetch() @@ -387,13 +569,13 @@ def report(obj, job_name, sender_name, sender_email, recipient_email, report=Report(job), sender_name=sender_name, sender_email=sender_email, - recipient_email=recipient_email + recipient_email=recipient_email, ) if poll: job.wait_until_finished( poll_max_minutes=poll_max_minutes, - poll_interval_minutes=poll_interval_minutes + poll_interval_minutes=poll_interval_minutes, ) if send: @@ -403,40 +585,55 @@ def report(obj, job_name, sender_name, sender_email, recipient_email, smtp_server=smtp_server, smtp_port=smtp_port, recipient_email=recipient_email, - message=email_report.render("nightly_report") + message=email_report.render("nightly_report"), ) else: output.write(email_report.render("nightly_report")) @crossbow.command() -@click.argument('job-name', required=True) -@click.option('--send/--dry-run', default=False, - help='Just display the report, don\'t send it') -@click.option('--webhook', '-w', - help='Zulip/Slack Webhook address to send the report to') -@click.option('--extra-message-success', '-s', default=None, - help='Extra message, will be appended if no failures.') -@click.option('--extra-message-failure', '-f', default=None, - help='Extra message, will be appended if there are failures.') -@click.option('--fetch/--no-fetch', default=True, - help='Fetch references (branches and tags) from the remote') +@click.argument("job-name", required=True) +@click.option( + "--send/--dry-run", default=False, 
help="Just display the report, don't send it" +) +@click.option( + "--webhook", "-w", help="Zulip/Slack Webhook address to send the report to" +) +@click.option( + "--extra-message-success", + "-s", + default=None, + help="Extra message, will be appended if no failures.", +) +@click.option( + "--extra-message-failure", + "-f", + default=None, + help="Extra message, will be appended if there are failures.", +) +@click.option( + "--fetch/--no-fetch", + default=True, + help="Fetch references (branches and tags) from the remote", +) @click.pass_obj -def report_chat(obj, job_name, send, webhook, extra_message_success, - extra_message_failure, fetch): - """ - Send a chat report to a webhook showing success/failure +def report_chat( + obj, job_name, send, webhook, extra_message_success, extra_message_failure, fetch +): + """Send a chat report to a webhook showing success/failure of tasks in a Crossbow run. """ - output = obj['output'] - queue = obj['queue'] + output = obj["output"] + queue = obj["queue"] if fetch: queue.fetch() job = queue.get(job_name) - report_chat = ChatReport(report=Report(job), - extra_message_success=extra_message_success, - extra_message_failure=extra_message_failure) + report_chat = ChatReport( + report=Report(job), + extra_message_success=extra_message_success, + extra_message_failure=extra_message_failure, + ) if send: ReportUtils.send_message(webhook, report_chat.render("text")) else: @@ -444,19 +641,22 @@ def report_chat(obj, job_name, send, webhook, extra_message_success, @crossbow.command() -@click.argument('job-name', required=True) -@click.option('--save/--dry-run', default=False, - help='Just display the report, don\'t save it') -@click.option('--fetch/--no-fetch', default=True, - help='Fetch references (branches and tags) from the remote') +@click.argument("job-name", required=True) +@click.option( + "--save/--dry-run", default=False, help="Just display the report, don't save it" +) +@click.option( + "--fetch/--no-fetch", + default=True, + help="Fetch references (branches and tags) from the remote", +) @click.pass_obj def report_csv(obj, job_name, save, fetch): - """ - Generates a CSV report with the different tasks information + """Generates a CSV report with the different tasks information from a Crossbow run. 
""" - output = obj['output'] - queue = obj['queue'] + output = obj["output"] + queue = obj["queue"] if fetch: queue.fetch() @@ -469,27 +669,45 @@ def report_csv(obj, job_name, save, fetch): @crossbow.command() -@click.argument('job-name', required=True) -@click.option('-t', '--target-dir', - default=_default_arrow_path / 'packages', - type=click.Path(file_okay=False, dir_okay=True), - help='Directory to download the build artifacts') -@click.option('--dry-run/--execute', default=False, - help='Just display process, don\'t download anything') -@click.option('--fetch/--no-fetch', default=True, - help='Fetch references (branches and tags) from the remote') -@click.option('--task-filter', '-f', 'task_filters', multiple=True, - help='Glob pattern for filtering relevant tasks') -@click.option('--validate-patterns/--skip-pattern-validation', default=True, - help='Whether to validate artifact name patterns or not') +@click.argument("job-name", required=True) +@click.option( + "-t", + "--target-dir", + default=_default_arrow_path / "packages", + type=click.Path(file_okay=False, dir_okay=True), + help="Directory to download the build artifacts", +) +@click.option( + "--dry-run/--execute", + default=False, + help="Just display process, don't download anything", +) +@click.option( + "--fetch/--no-fetch", + default=True, + help="Fetch references (branches and tags) from the remote", +) +@click.option( + "--task-filter", + "-f", + "task_filters", + multiple=True, + help="Glob pattern for filtering relevant tasks", +) +@click.option( + "--validate-patterns/--skip-pattern-validation", + default=True, + help="Whether to validate artifact name patterns or not", +) @click.pass_obj -def download_artifacts(obj, job_name, target_dir, dry_run, fetch, - validate_patterns, task_filters): +def download_artifacts( + obj, job_name, target_dir, dry_run, fetch, validate_patterns, task_filters +): """Download build artifacts from GitHub releases""" - output = obj['output'] + output = obj["output"] # fetch the queue repository - queue = obj['queue'] + queue = obj["queue"] if fetch: queue.fetch() @@ -511,12 +729,11 @@ def need_download(): return False if not path.exists(): return True - if path.stat().st_size != asset.size: - return True - return False + return path.stat().st_size != asset.size if need_download(): import github3 + max_n_retries = 5 n_retries = 0 while True: @@ -527,60 +744,65 @@ def need_download(): if n_retries == max_n_retries: raise wait_seconds = 60 - click.echo(f'Failed to download {path}') - click.echo(f'Retry #{n_retries} after {wait_seconds}s') + click.echo(f"Failed to download {path}") + click.echo(f"Retry #{n_retries} after {wait_seconds}s") click.echo(error) time.sleep(wait_seconds) else: break - click.echo('Downloading {}\'s artifacts.'.format(job_name)) - click.echo('Destination directory is {}'.format(target_dir)) + click.echo(f"Downloading {job_name}'s artifacts.") + click.echo(f"Destination directory is {target_dir}") click.echo() report = ConsoleReport(job, task_filters=task_filters) report.show( - output, - asset_callback=asset_callback, - validate_patterns=validate_patterns + output, asset_callback=asset_callback, validate_patterns=validate_patterns ) @crossbow.command() -@click.argument('patterns', nargs=-1, required=True) -@click.option('--sha', required=True, help='Target committish') -@click.option('--tag', required=True, help='Target tag') -@click.option('--method', default='curl', help='Use cURL to upload') +@click.argument("patterns", nargs=-1, required=True) 
+@click.option("--sha", required=True, help="Target committish") +@click.option("--tag", required=True, help="Target tag") +@click.option("--method", default="curl", help="Use cURL to upload") @click.pass_obj def upload_artifacts(obj, tag, sha, patterns, method): - queue = obj['queue'] + queue = obj["queue"] queue.github_overwrite_release_assets( tag_name=tag, target_commitish=sha, method=method, patterns=patterns ) @crossbow.command() -@click.option('--dry-run/--execute', default=False, - help='Just display process, don\'t download anything') -@click.option('--days', default=90, - help='Branches older than this amount of days will be deleted') -@click.option('--maximum', default=1000, - help='Maximum limit of branches to delete for a single run') +@click.option( + "--dry-run/--execute", + default=False, + help="Just display process, don't download anything", +) +@click.option( + "--days", default=90, help="Branches older than this amount of days will be deleted" +) +@click.option( + "--maximum", + default=1000, + help="Maximum limit of branches to delete for a single run", +) @click.pass_obj def delete_old_branches(obj, dry_run, days, maximum): - """ - Deletes branches on queue repository (crossbow) that are older than number + """Deletes branches on queue repository (crossbow) that are older than number of days. With a maximum number of branches to be deleted. This is required to avoid triggering GitHub protection limits. """ - queue = obj['queue'] + queue = obj["queue"] ts = time.time() - days * 24 * 3600 refs = [] for ref in queue.repo.listall_reference_objects(): commit = ref.peel() if commit.commit_time < ts and not ref.name.startswith( - "refs/remotes/origin/pr/"): + "refs/remotes/origin/pr/" + ): # Check if reference is a remote reference to point # to the remote head. 
ref_name = ref.name @@ -593,7 +815,7 @@ def batch_gen(iterable, step): to_delete = min(total_length, maximum) print(f"Total number of references to be deleted: {to_delete}") for index in range(0, to_delete, step): - yield iterable[index:min(index + step, to_delete)] + yield iterable[index : min(index + step, to_delete)] for batch in batch_gen(refs, 50): if not dry_run: @@ -603,42 +825,54 @@ def batch_gen(iterable, step): @crossbow.command() -@click.option('--days', default=30, - help='Notification will be sent if expiration date is ' - 'closer than the number of days.') -@click.option('--sender-name', '-n', - help='Name to use for report e-mail.') -@click.option('--sender-email', '-e', - help='E-mail to use for report e-mail.') -@click.option('--recipient-email', '-r', - help='Where to send the e-mail report') -@click.option('--smtp-user', '-u', - help='E-mail address to use for SMTP login') -@click.option('--smtp-password', '-P', - help='SMTP password to use for report e-mail.') -@click.option('--smtp-server', '-s', default='smtp.gmail.com', - help='SMTP server to use for report e-mail.') -@click.option('--smtp-port', '-p', default=465, - help='SMTP port to use for report e-mail.') -@click.option('--send/--dry-run', default=False, - help='Just display the report, don\'t send it') +@click.option( + "--days", + default=30, + help="Notification will be sent if expiration date is " + "closer than the number of days.", +) +@click.option("--sender-name", "-n", help="Name to use for report e-mail.") +@click.option("--sender-email", "-e", help="E-mail to use for report e-mail.") +@click.option("--recipient-email", "-r", help="Where to send the e-mail report") +@click.option("--smtp-user", "-u", help="E-mail address to use for SMTP login") +@click.option("--smtp-password", "-P", help="SMTP password to use for report e-mail.") +@click.option( + "--smtp-server", + "-s", + default="smtp.gmail.com", + help="SMTP server to use for report e-mail.", +) +@click.option( + "--smtp-port", "-p", default=465, help="SMTP port to use for report e-mail." +) +@click.option( + "--send/--dry-run", default=False, help="Just display the report, don't send it" +) @click.pass_obj -def notify_token_expiration(obj, days, sender_name, sender_email, - recipient_email, smtp_user, smtp_password, - smtp_server, smtp_port, send): - """ - Check if token is close to expiration and send email notifying. - """ - output = obj['output'] - queue = obj['queue'] +def notify_token_expiration( + obj, + days, + sender_name, + sender_email, + recipient_email, + smtp_user, + smtp_password, + smtp_server, + smtp_port, + send, +): + """Check if token is close to expiration and send email notifying.""" + output = obj["output"] + queue = obj["queue"] token_expiration_date = queue.token_expiration_date() days_left = 0 if token_expiration_date: days_left = (token_expiration_date - date.today()).days if days_left > days: - output.write("Notification not sent. " + - f"Token will expire in {days_left} days.") + output.write( + "Notification not sent. " + f"Token will expire in {days_left} days." 
+ ) return class TokenExpirationReport: @@ -648,10 +882,11 @@ def __init__(self, token_expiration_date, days_left): email_report = EmailReport( report=TokenExpirationReport( - token_expiration_date or "ALREADY_EXPIRED", days_left), + token_expiration_date or "ALREADY_EXPIRED", days_left + ), sender_name=sender_name, sender_email=sender_email, - recipient_email=recipient_email + recipient_email=recipient_email, ) message = email_report.render("token_expiration").strip() @@ -662,7 +897,7 @@ def __init__(self, token_expiration_date, days_left): smtp_server=smtp_server, smtp_port=smtp_port, recipient_email=recipient_email, - message=message + message=message, ) else: output.write(message) diff --git a/dev/archery/archery/crossbow/core.py b/dev/archery/archery/crossbow/core.py index d4ca6cbc5b5f0..e7d56b62bdad8 100644 --- a/dev/archery/archery/crossbow/core.py +++ b/dev/archery/archery/crossbow/core.py @@ -14,27 +14,29 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations -import os -import re import fnmatch import glob -import time import logging import mimetypes +import os +import re import subprocess import textwrap +import time import uuid +import warnings +from datetime import date from io import StringIO from pathlib import Path -from datetime import date -import warnings import jinja2 from ruamel.yaml import YAML try: import github3 + _have_github3 = True except ImportError: github3 = object @@ -51,7 +53,6 @@ from ..utils.source import ArrowSources - for pkg in ["requests", "urllib3", "github3"]: logging.getLogger(pkg).setLevel(logging.WARNING) @@ -115,7 +116,7 @@ def _unflatten_tree(files): } } """ - files = {tuple(k.split('/')): v for k, v in files.items()} + files = {tuple(k.split("/")): v for k, v in files.items()} return _unflatten(files) @@ -124,10 +125,13 @@ def format_all(items, pattern): return [pattern.format(item) for item in items] loader = jinja2.FileSystemLoader(searchpath) - env = jinja2.Environment(loader=loader, trim_blocks=True, - lstrip_blocks=True, - undefined=jinja2.StrictUndefined) - env.filters['format_all'] = format_all + env = jinja2.Environment( + loader=loader, + trim_blocks=True, + lstrip_blocks=True, + undefined=jinja2.StrictUndefined, + ) + env.filters["format_all"] = format_all template = env.get_template(template) return template.render(**params) @@ -170,13 +174,12 @@ def format_all(items, pattern): """ _default_tree = { - '.travis.yml': _default_travis_yml, - '.circleci/config.yml': _default_circle_yml + ".travis.yml": _default_travis_yml, + ".circleci/config.yml": _default_circle_yml, } class GitRemoteCallbacks(PygitRemoteCallbacks): - def __init__(self, token): self.token = token self.attempts = 0 @@ -194,27 +197,26 @@ def credentials(self, url, username_from_url, allowed_types): if self.attempts >= 5: # pygit2 doesn't propagate the exception properly - msg = 'Wrong oauth personal access token' + msg = "Wrong oauth personal access token" print(msg) raise CrossbowError(msg) - if (allowed_types & - pygit2.credentials.CredentialType.USERPASS_PLAINTEXT): - return pygit2.UserPass('x-oauth-basic', self.token) + if allowed_types & pygit2.credentials.CredentialType.USERPASS_PLAINTEXT: + return pygit2.UserPass("x-oauth-basic", self.token) else: return None def _git_ssh_to_https(url): - return url.replace('git@github.com:', 'https://github.com/') + return url.replace("git@github.com:", "https://github.com/") def 
_parse_github_user_repo(remote_url): # TODO: use a proper URL parser instead? - m = re.match(r'.*\/([^\/]+)\/([^\/\.]+)(\.git|/)?$', remote_url) + m = re.match(r".*\/([^\/]+)\/([^\/\.]+)(\.git|/)?$", remote_url) if m is None: # Perhaps it's simply "username/reponame"? - m = re.match(r'^(\w+)/(\w+)$', remote_url) + m = re.match(r"^(\w+)/(\w+)$", remote_url) if m is None: raise CrossbowError( f"Unable to parse the github owner and repository from the " @@ -225,8 +227,7 @@ def _parse_github_user_repo(remote_url): class Repo: - """ - Base class for interaction with local git repositories + """Base class for interaction with local git repositories A high level wrapper used for both reading revision information from arrow's repository and pushing continuous integration tasks to the queue @@ -236,10 +237,10 @@ class Repo: ---------- require_https : boolean, default False Raise exception for SSH origin URLs + """ - def __init__(self, path, github_token=None, remote_url=None, - require_https=False): + def __init__(self, path, github_token=None, remote_url=None, require_https=False): self.path = Path(path) self.github_token = github_token self.require_https = require_https @@ -249,14 +250,12 @@ def __init__(self, path, github_token=None, remote_url=None, self._updated_refs = [] def __str__(self): - tpl = textwrap.dedent(''' + tpl = textwrap.dedent(""" Repo: {remote}@{branch} Commit: {head} - ''') + """) return tpl.format( - remote=self.remote_url, - branch=self.branch.branch_name, - head=self.head + remote=self.remote_url, branch=self.branch.branch_name, head=self.head ) @property @@ -267,14 +266,15 @@ def repo(self): @property def origin(self): - remote = self.repo.remotes['origin'] - if self.require_https and remote.url.startswith('git@github.com'): - raise CrossbowError("Change SSH origin URL to HTTPS to use " - "Crossbow: {}".format(remote.url)) + remote = self.repo.remotes["origin"] + if self.require_https and remote.url.startswith("git@github.com"): + raise CrossbowError( + f"Change SSH origin URL to HTTPS to use Crossbow: {remote.url}" + ) return remote def fetch(self, retry=3): - refspec = '+refs/heads/*:refs/remotes/origin/*' + refspec = "+refs/heads/*:refs/remotes/origin/*" attempt = 1 while True: try: @@ -290,18 +290,19 @@ def push(self, refs=None, github_token=None): github_token = github_token or self.github_token if github_token is None: raise RuntimeError( - 'Could not determine GitHub token. Please set the ' - 'CROSSBOW_GITHUB_TOKEN environment variable to a ' - 'valid GitHub access token or pass one to --github-token.' + "Could not determine GitHub token. Please set the " + "CROSSBOW_GITHUB_TOKEN environment variable to a " + "valid GitHub access token or pass one to --github-token." ) callbacks = GitRemoteCallbacks(github_token) refs = refs or [] try: self.origin.push(refs + self._updated_refs, callbacks=callbacks) except pygit2.GitError: - raise RuntimeError('Failed to push updated references, ' - 'potentially because of credential issues: {}' - .format(self._updated_refs)) + raise RuntimeError( + "Failed to push updated references, " + f"potentially because of credential issues: {self._updated_refs}" + ) else: self.updated_refs = [] @@ -317,9 +318,9 @@ def branch(self): return self.repo.branches[self.repo.head.shorthand] except KeyError: raise CrossbowError( - 'Cannot determine the current branch of the Arrow repository ' - 'to clone or push to, perhaps it is in detached HEAD state. ' - 'Please checkout a branch.' 
+ "Cannot determine the current branch of the Arrow repository " + "to clone or push to, perhaps it is in detached HEAD state. " + "Please checkout a branch." ) @property @@ -329,9 +330,9 @@ def remote(self): return self.repo.remotes[self.branch.upstream.remote_name] except (AttributeError, KeyError): raise CrossbowError( - 'Cannot determine git remote for the Arrow repository to ' - 'clone or push to, try to push the `{}` branch first to have ' - 'a remote tracking counterpart.'.format(self.branch.name) + "Cannot determine git remote for the Arrow repository to " + f"clone or push to, try to push the `{self.branch.name}` " + "branch first to have a remote tracking counterpart." ) @property @@ -346,21 +347,20 @@ def remote_url(self): @property def user_name(self): try: - return next(self.repo.config.get_multivar('user.name')) + return next(self.repo.config.get_multivar("user.name")) except StopIteration: - return os.environ.get('GIT_COMMITTER_NAME', 'unknown') + return os.environ.get("GIT_COMMITTER_NAME", "unknown") @property def user_email(self): try: - return next(self.repo.config.get_multivar('user.email')) + return next(self.repo.config.get_multivar("user.email")) except StopIteration: - return os.environ.get('GIT_COMMITTER_EMAIL', 'unknown') + return os.environ.get("GIT_COMMITTER_EMAIL", "unknown") @property def signature(self): - return pygit2.Signature(self.user_name, self.user_email, - int(time.time())) + return pygit2.Signature(self.user_name, self.user_email, int(time.time())) @property def default_branch_name(self): @@ -374,12 +374,15 @@ def default_branch_name(self): default_branch_name = target_name_tokenized[-1] except KeyError: default_branch_name = "main" - warnings.warn('Unable to determine default branch name: ' - 'ARCHERY_DEFAULT_BRANCH environment variable is ' - 'not set. Git repository does not contain a ' - '\'refs/remotes/origin/HEAD\'reference. Setting ' - 'the default branch name to ' + - default_branch_name, RuntimeWarning) + warnings.warn( + "Unable to determine default branch name: " + "ARCHERY_DEFAULT_BRANCH environment variable is " + "not set. Git repository does not contain a " + "'refs/remotes/origin/HEAD'reference. 
Setting " + "the default branch name to " + default_branch_name, + RuntimeWarning, + stacklevel=2, + ) return default_branch_name @@ -399,8 +402,7 @@ def create_tree(self, files): tree_id = builder.write() return tree_id - def create_commit(self, files, parents=None, message='', - reference_name=None): + def create_commit(self, files, parents=None, message="", reference_name=None): if parents is None: # by default use the main branch as the base of the new branch # required to reuse github actions cache across crossbow tasks @@ -409,12 +411,14 @@ def create_commit(self, files, parents=None, message='', tree_id = self.create_tree(files) author = committer = self.signature - commit_id = self.repo.create_commit(reference_name, author, committer, - message, tree_id, parents) + commit_id = self.repo.create_commit( + reference_name, author, committer, message, tree_id, parents + ) return self.repo[commit_id] - def create_branch(self, branch_name, files, parents=None, message='', - signature=None): + def create_branch( + self, branch_name, files, parents=None, message="", signature=None + ): # create commit with the passed tree commit = self.create_commit(files, parents=parents, message=message) @@ -422,23 +426,22 @@ def create_branch(self, branch_name, files, parents=None, message='', branch = self.repo.create_branch(branch_name, commit) # append to the pushable references - self._updated_refs.append('refs/heads/{}'.format(branch_name)) + self._updated_refs.append(f"refs/heads/{branch_name}") return branch - def create_tag(self, tag_name, commit_id, message=''): + def create_tag(self, tag_name, commit_id, message=""): git_object_commit = ( pygit2.GIT_OBJECT_COMMIT - if getattr(pygit2, 'GIT_OBJECT_COMMIT') + if pygit2.GIT_OBJECT_COMMIT else pygit2.GIT_OBJ_COMMIT ) - tag_id = self.repo.create_tag(tag_name, commit_id, - git_object_commit, - self.signature, - message) + tag_id = self.repo.create_tag( + tag_name, commit_id, git_object_commit, self.signature, message + ) # append to the pushable references - self._updated_refs.append('refs/tags/{}'.format(tag_name)) + self._updated_refs.append(f"refs/tags/{tag_name}") return self.repo[tag_id] @@ -451,11 +454,10 @@ def file_contents(self, commit_id, file): def _github_login(self, github_token): """Returns a logged in github3.GitHub instance""" if not _have_github3: - raise ImportError('Must install github3.py') + raise ImportError("Must install github3.py") github_token = github_token or self.github_token session = github3.session.GitHubSession( - default_connect_timeout=10, - default_read_timeout=30 + default_connect_timeout=10, default_read_timeout=30 ) github = github3.GitHub(session=session) github.login(token=github_token) @@ -476,8 +478,7 @@ def token_expiration_date(self, github_token=None): # to access the response headers. 
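        # For illustration (the value is made up): a response header of
        #   github-authentication-token-expiration: 2023-01-23 10:40:28 UTC
        # is reduced below to date(2023, 1, 23) by splitting on whitespace and
        # passing the first field to date.fromisoformat().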
resp = github._get(github.session.base_url) # Response in the form '2023-01-23 10:40:28 UTC' - date_string = resp.headers.get( - 'github-authentication-token-expiration') + date_string = resp.headers.get("github-authentication-token-expiration") if date_string: return date.fromisoformat(date_string.split()[0]) @@ -492,25 +493,26 @@ def github_release(self, tag): except github3.exceptions.NotFoundError: return None - def github_upload_asset_requests(self, release, path, name, mime, - max_retries=None, retry_backoff=None): + def github_upload_asset_requests( + self, release, path, name, mime, max_retries=None, retry_backoff=None + ): if max_retries is None: - max_retries = int(os.environ.get('CROSSBOW_MAX_RETRIES', 8)) + max_retries = int(os.environ.get("CROSSBOW_MAX_RETRIES", 8)) if retry_backoff is None: - retry_backoff = int(os.environ.get('CROSSBOW_RETRY_BACKOFF', 5)) + retry_backoff = int(os.environ.get("CROSSBOW_RETRY_BACKOFF", 5)) for i in range(max_retries): try: - with open(path, 'rb') as fp: - result = release.upload_asset(name=name, asset=fp, - content_type=mime) + with open(path, "rb") as fp: + result = release.upload_asset( + name=name, asset=fp, content_type=mime + ) except github3.exceptions.ResponseError as e: - logger.error('Attempt {} has failed with message: {}.' - .format(i + 1, str(e))) - logger.error('Error message {}'.format(e.msg)) - logger.error('List of errors provided by GitHub:') + logger.error(f"Attempt {i + 1} has failed with message: {e!s}.") + logger.error(f"Error message {e.msg}") + logger.error("List of errors provided by GitHub:") for err in e.errors: - logger.error(' - {}'.format(err)) + logger.error(f" - {err}") if e.code == 422: # 422 Validation Failed, probably raised because @@ -518,47 +520,51 @@ def github_upload_asset_requests(self, release, path, name, mime, # reattempting the asset upload for asset in release.assets(): if asset.name == name: - logger.info('Release asset {} already exists, ' - 'removing it...'.format(name)) + logger.info( + f"Release asset {name} already exists, removing it..." + ) asset.delete() - logger.info('Asset {} removed.'.format(name)) + logger.info(f"Asset {name} removed.") break except github3.exceptions.ConnectionError as e: - logger.error('Attempt {} has failed with message: {}.' 
- .format(i + 1, str(e))) + logger.error(f"Attempt {i + 1} has failed with message: {e!s}.") else: - logger.info('Attempt {} has finished.'.format(i + 1)) + logger.info(f"Attempt {i + 1} has finished.") return result time.sleep(retry_backoff) - raise RuntimeError('GitHub asset uploading has failed!') + raise RuntimeError("GitHub asset uploading has failed!") def github_upload_asset_curl(self, release, path, name, mime): - upload_url, _ = release.upload_url.split('{?') - upload_url += '?name={}'.format(name) + upload_url, _ = release.upload_url.split("{?") + upload_url += f"?name={name}" command = [ - 'curl', - '--fail', - '-H', "Authorization: token {}".format(self.github_token), - '-H', "Content-Type: {}".format(mime), - '--data-binary', '@{}'.format(path), - upload_url + "curl", + "--fail", + "-H", + f"Authorization: token {self.github_token}", + "-H", + f"Content-Type: {mime}", + "--data-binary", + f"@{path}", + upload_url, ] return subprocess.run(command, shell=False, check=True) - def github_overwrite_release_assets(self, tag_name, target_commitish, - patterns, method='requests'): + def github_overwrite_release_assets( + self, tag_name, target_commitish, patterns, method="requests" + ): # Since github has changed something the asset uploading via requests # got instable, so prefer the cURL alternative. # Potential cause: # sigmavirus24/github3.py/issues/779#issuecomment-379470626 repo = self.as_github_repo() if not tag_name: - raise CrossbowError('Empty tag name') + raise CrossbowError("Empty tag name") if not target_commitish: - raise CrossbowError('Empty target commit for the release tag') + raise CrossbowError("Empty target commit for the release tag") # remove the whole release if it already exists try: @@ -573,39 +579,35 @@ def github_overwrite_release_assets(self, tag_name, target_commitish, for path in glob.glob(pattern, recursive=True): name = os.path.basename(path) size = os.path.getsize(path) - mime = mimetypes.guess_type(name)[0] or 'application/zip' + mime = mimetypes.guess_type(name)[0] or "application/zip" logger.info( - 'Uploading asset `{}` with mimetype {} and size {}...' - .format(name, mime, size) + f"Uploading asset `{name}` with mimetype {mime} and size {size}..." ) - if method == 'requests': - self.github_upload_asset_requests(release, path, name=name, - mime=mime) - elif method == 'curl': - self.github_upload_asset_curl(release, path, name=name, - mime=mime) - else: - raise CrossbowError( - 'Unsupported upload method {}'.format(method) + if method == "requests": + self.github_upload_asset_requests( + release, path, name=name, mime=mime ) + elif method == "curl": + self.github_upload_asset_curl(release, path, name=name, mime=mime) + else: + raise CrossbowError(f"Unsupported upload method {method}") - def github_pr(self, title, head=None, base=None, body=None, - github_token=None, create=False): + def github_pr( + self, title, head=None, base=None, body=None, github_token=None, create=False + ): if create: # Default value for base is the default_branch_name base = self.default_branch_name if base is None else base github_token = github_token or self.github_token repo = self.as_github_repo(github_token=github_token) if create: - return repo.create_pull(title=title, base=base, head=head, - body=body) + return repo.create_pull(title=title, base=base, head=head, body=body) else: # Retrieve open PR for base and head. # There should be a single open one with that title. 
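            # Note that the title comparison below is a substring check; with
            # made-up titles, a pr_title of "Nightly 2024-01-01" would also
            # match an open pull request named "WIP: Nightly 2024-01-01".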
- for pull in repo.pull_requests(state="open", head=head, - base=base): + for pull in repo.pull_requests(state="open", head=head, base=base): if title in pull.title: return pull raise CrossbowError( @@ -615,9 +617,8 @@ def github_pr(self, title, head=None, base=None, body=None, class Queue(Repo): - def _latest_prefix_id(self, prefix): - pattern = re.compile(r'[\w\/-]*{}-(\d+)'.format(prefix)) + pattern = re.compile(rf"[\w\/-]*{prefix}-(\d+)") matches = list(filter(None, map(pattern.match, self.repo.branches))) if matches: latest = max(int(m.group(1)) for m in matches) @@ -626,13 +627,13 @@ def _latest_prefix_id(self, prefix): return latest def _prefix_contains_date(self, prefix): - prefix_date_pattern = re.compile(r'[\w\/-]*-(\d+)-(\d+)-(\d+)') + prefix_date_pattern = re.compile(r"[\w\/-]*-(\d+)-(\d+)-(\d+)") match_prefix = prefix_date_pattern.match(prefix) if match_prefix: return match_prefix.group(0)[-10:] def _latest_prefix_date(self, prefix): - pattern = re.compile(r'[\w\/-]*{}-(\d+)-(\d+)-(\d+)'.format(prefix)) + pattern = re.compile(rf"[\w\/-]*{prefix}-(\d+)-(\d+)-(\d+)") matches = list(filter(None, map(pattern.match, self.repo.branches))) if matches: latest = sorted([m.group(0) for m in matches])[-1] @@ -645,12 +646,12 @@ def _latest_prefix_date(self, prefix): def _next_job_id(self, prefix): """Auto increments the branch's identifier based on the prefix""" latest_id = self._latest_prefix_id(prefix) - return '{}-{}'.format(prefix, latest_id + 1) + return f"{prefix}-{latest_id + 1}" def _new_hex_id(self, prefix): """Append a new id to branch's identifier based on the prefix""" hex_id = uuid.uuid4().hex[:10] - return '{}-{}'.format(prefix, hex_id) + return f"{prefix}-{hex_id}" def latest_for_prefix(self, prefix): prefix_date = self._prefix_contains_date(prefix) @@ -667,13 +668,13 @@ def latest_for_prefix(self, prefix): raise RuntimeError( f"No job has been submitted with prefix '{prefix}' yet" ) - job_name = '{}-{}'.format(prefix, latest_id) + job_name = f"{prefix}-{latest_id}" return self.get(job_name) def date_of(self, job): # it'd be better to bound to the queue repository on deserialization # and reorganize these methods to Job - branch_name = 'origin/{}'.format(job.branch) + branch_name = f"origin/{job.branch}" branch = self.repo.branches[branch_name] commit = self.repo[branch.target] return date.fromtimestamp(commit.commit_time) @@ -682,7 +683,7 @@ def jobs(self, pattern): """Return jobs sorted by its identifier in reverse order""" job_names = [] for name in self.repo.branches.remote: - origin, name = name.split('/', 1) + origin, name = name.split("/", 1) result = re.match(pattern, name) if result: job_names.append(name) @@ -691,26 +692,25 @@ def jobs(self, pattern): yield self.get(name) def get(self, job_name): - branch_name = 'origin/{}'.format(job_name) + branch_name = f"origin/{job_name}" branch = self.repo.branches[branch_name] try: - content = self.file_contents(branch.target, 'job.yml') + content = self.file_contents(branch.target, "job.yml") except KeyError: - raise CrossbowError( - 'No job is found with name: {}'.format(job_name) - ) + raise CrossbowError(f"No job is found with name: {job_name}") - buffer = StringIO(content.decode('utf-8')) + buffer = StringIO(content.decode("utf-8")) job = yaml.load(buffer) job.queue = self return job - def put(self, job, prefix='build', increment_job_id=True): + def put(self, job, prefix="build", increment_job_id=True): if not isinstance(job, Job): - raise CrossbowError('`job` must be an instance of Job') + raise CrossbowError("`job` 
must be an instance of Job") if job.branch is not None: - raise CrossbowError('`job.branch` is automatically generated, ' - 'thus it must be blank') + raise CrossbowError( + "`job.branch` is automatically generated, thus it must be blank" + ) job.queue = self if increment_job_id: @@ -724,12 +724,12 @@ def put(self, job, prefix='build', increment_job_id=True): for task_name, task in job.tasks.items(): # adding CI's name to the end of the branch in order to use skip # patterns on travis and circleci - task.branch = '{}-{}-{}'.format(job.branch, task.ci, task_name) + task.branch = f"{job.branch}-{task.ci}-{task_name}" params = { **job.params, "arrow": job.target, "job": job, - "queue_remote_url": self.remote_url + "queue_remote_url": self.remote_url, } files = task.render_files(job.template_searchpath, params=params) branch = self.create_branch(task.branch, files=files) @@ -741,22 +741,19 @@ def put(self, job, prefix='build', increment_job_id=True): def get_version(root, **kwargs): - """ - Parse function for setuptools_scm that ignores tags for non-C++ + """Parse function for setuptools_scm that ignores tags for non-C++ subprojects, e.g. apache-arrow-js-XXX tags. """ - from setuptools_scm.git import parse as parse_git_version from setuptools_scm import Configuration + from setuptools_scm.git import parse as parse_git_version # query the calculated version based on the git tags - kwargs['describe_command'] = ( + kwargs["describe_command"] = ( 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' ) # Create a Configuration object with necessary parameters - config = Configuration( - git_describe_command=kwargs['describe_command'] - ) + config = Configuration(git_describe_command=kwargs["describe_command"]) version = parse_git_version(root, config=config, **kwargs) tag = str(version.tag) @@ -769,24 +766,22 @@ def get_version(root, **kwargs): pattern = r"^(\d+)\.(\d+)\.(\d+)" match = re.match(pattern, tag) major, minor, patch = map(int, match.groups()) - if 'dev' not in tag: + if "dev" not in tag: major += 1 - return "{}.{}.{}.dev{}".format(major, minor, patch, version.distance or 0) + return f"{major}.{minor}.{patch}.dev{version.distance or 0}" class Serializable: - @classmethod def to_yaml(cls, representer, data): - tag = '!{}'.format(cls.__name__) - dct = {k: v for k, v in data.__dict__.items() if not k.startswith('_')} + tag = f"!{cls.__name__}" + dct = {k: v for k, v in data.__dict__.items() if not k.startswith("_")} return representer.represent_mapping(tag, dct) class Target(Serializable): - """ - Describes target repository and revision the builds run against + """Describes target repository and revision the builds run against This serializable data container holding information about arrow's git remote, branch, sha and version number as well as some metadata @@ -801,8 +796,8 @@ def __init__(self, head, branch, remote, version, r_version, email=None): self.github_repo = "/".join(_parse_github_user_repo(remote)) self.version = version self.r_version = r_version - self.no_rc_version = re.sub(r'-rc\d+\Z', '', version) - self.no_rc_r_version = re.sub(r'-rc\d+\Z', '', r_version) + self.no_rc_version = re.sub(r"-rc\d+\Z", "", version) + self.no_rc_r_version = re.sub(r"-rc\d+\Z", "", r_version) # MAJOR.MINOR.PATCH Versioning # # Excludes: @@ -814,8 +809,7 @@ def __init__(self, head, branch, remote, version, r_version, email=None): # # '19.0.0.dev66' -> # '19.0.0' - self.no_rc_no_dev_version = \ - re.sub(r'\.dev\d+\Z', '', self.no_rc_version) + self.no_rc_no_dev_version = 
re.sub(r"\.dev\d+\Z", "", self.no_rc_version) # Semantic Versioning 1.0.0: https://semver.org/spec/v1.0.0.html # # > A pre-release version number MAY be denoted by appending an @@ -827,8 +821,7 @@ def __init__(self, head, branch, remote, version, r_version, email=None): # # '0.16.1.dev10' -> # '0.16.1-dev10' - self.no_rc_semver_version = \ - re.sub(r'\.(dev\d+)\Z', r'-\1', self.no_rc_version) + self.no_rc_semver_version = re.sub(r"\.(dev\d+)\Z", r"-\1", self.no_rc_version) # Substitute dev version for SNAPSHOT # # Example: @@ -836,7 +829,8 @@ def __init__(self, head, branch, remote, version, r_version, email=None): # '10.0.0.dev235' -> # '10.0.0-SNAPSHOT' self.no_rc_snapshot_version = re.sub( - r'\.(dev\d+)$', '-SNAPSHOT', self.no_rc_version) + r"\.(dev\d+)$", "-SNAPSHOT", self.no_rc_version + ) # SO (shared object) version for C++/C GLib # # Example: @@ -847,8 +841,9 @@ def __init__(self, head, branch, remote, version, r_version, email=None): self.so_version = f"{major * 100 + minor}" @classmethod - def from_repo(cls, repo, head=None, branch=None, remote=None, version=None, - email=None): + def from_repo( + cls, repo, head=None, branch=None, remote=None, version=None, email=None + ): """Initialize from a repository Optionally override detected remote, branch, head, and/or version. @@ -870,8 +865,7 @@ def from_repo(cls, repo, head=None, branch=None, remote=None, version=None, if version_dev_match: with open(f"{repo.path}/r/DESCRIPTION") as description_file: description = description_file.read() - r_version_pattern = re.compile(r"^Version:\s*(.*)$", - re.MULTILINE) + r_version_pattern = re.compile(r"^Version:\s*(.*)$", re.MULTILINE) r_version = re.findall(r_version_pattern, description)[0] if r_version: version_dev = int(version_dev_match[1]) @@ -897,16 +891,21 @@ def from_repo(cls, repo, head=None, branch=None, remote=None, version=None, else: r_version = version - return cls(head=head, email=email, branch=branch, remote=remote, - version=version, r_version=r_version) + return cls( + head=head, + email=email, + branch=branch, + remote=remote, + version=version, + r_version=r_version, + ) def is_default_branch(self): - return self.branch == 'main' + return self.branch == "main" class Task(Serializable): - """ - Describes a build task and metadata required to render CI templates + """Describes a build task and metadata required to render CI templates A task is represented as a single git commit and branch containing jinja2 rendered files (currently appveyor.yml or .travis.yml configurations). 
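# A quick, self-contained illustration of the Target version substitutions in
# the hunk above; the input version string is made up purely to exercise each
# rule and is not taken from the repository:
import re

version = "21.0.0.dev123"
no_rc_version = re.sub(r"-rc\d+\Z", "", version)                       # "21.0.0.dev123"
no_rc_no_dev_version = re.sub(r"\.dev\d+\Z", "", no_rc_version)        # "21.0.0"
no_rc_semver_version = re.sub(r"\.(dev\d+)\Z", r"-\1", no_rc_version)  # "21.0.0-dev123"
no_rc_snapshot_version = re.sub(r"\.(dev\d+)$", "-SNAPSHOT", no_rc_version)  # "21.0.0-SNAPSHOT"
major, minor = 21, 0
so_version = f"{major * 100 + minor}"                                  # "2100"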
@@ -917,14 +916,7 @@ class Task(Serializable): """ def __init__(self, name, ci, template, artifacts=None, params=None): - assert ci in { - 'circle', - 'travis', - 'appveyor', - 'azure', - 'github', - 'drone', - } + assert ci in {"circle", "travis", "appveyor", "azure", "github", "drone"} self.name = name self.ci = ci self.template = template @@ -939,13 +931,11 @@ def __init__(self, name, ci, template, artifacts=None, params=None): def render_files(self, searchpath, params=None): params = {**self.params, **(params or {}), "task": self} try: - rendered = _render_jinja_template(searchpath, self.template, - params=params) + rendered = _render_jinja_template(searchpath, self.template, params=params) except jinja2.TemplateError as e: raise RuntimeError( - 'Failed to render template `{}` with {}: {}'.format( - self.template, e.__class__.__name__, str(e) - ) + f"Failed to render template `{self.template}` with " + f"{e.__class__.__name__}: {e!s}" ) tree = {**_default_tree, self.filename: rendered} @@ -958,35 +948,36 @@ def tag(self): @property def filename(self): config_files = { - 'circle': '.circleci/config.yml', - 'travis': '.travis.yml', - 'appveyor': 'appveyor.yml', - 'azure': 'azure-pipelines.yml', - 'github': '.github/workflows/crossbow.yml', - 'drone': '.drone.yml', + "circle": ".circleci/config.yml", + "travis": ".travis.yml", + "appveyor": "appveyor.yml", + "azure": "azure-pipelines.yml", + "github": ".github/workflows/crossbow.yml", + "drone": ".drone.yml", } return config_files[self.ci] def status(self, force_query=False): - _status = getattr(self, '_status', None) + _status = getattr(self, "_status", None) if force_query or _status is None: github_commit = self._queue.github_commit(self.commit) self._status = TaskStatus(github_commit) return self._status def assets(self, force_query=False, validate_patterns=True): - _assets = getattr(self, '_assets', None) + _assets = getattr(self, "_assets", None) if force_query or _assets is None: github_release = self._queue.github_release(self.tag) - self._assets = TaskAssets(github_release, - artifact_patterns=self.artifacts, - validate_patterns=validate_patterns) + self._assets = TaskAssets( + github_release, + artifact_patterns=self.artifacts, + validate_patterns=validate_patterns, + ) return self._assets class TaskStatus: - """ - Combine the results from status and checks API to a single state. + """Combine the results from status and checks API to a single state. 
Azure pipelines uses checks API which doesn't provide a combined interface like status API does, so we need to manually combine @@ -1019,6 +1010,7 @@ class TaskStatus: github_check_runs='github checks associated with the commit', total_count='number of statuses and checks' ) + """ def __init__(self, commit): @@ -1027,25 +1019,24 @@ def __init__(self, commit): states = [s.state for s in status.statuses] for check in check_runs: - if check.status == 'completed': - if check.conclusion in {'success', 'failure'}: + if check.status == "completed": + if check.conclusion in {"success", "failure"}: states.append(check.conclusion) - elif check.conclusion in {'cancelled', 'timed_out', - 'action_required'}: - states.append('error') + elif check.conclusion in {"cancelled", "timed_out", "action_required"}: + states.append("error") # omit `neutral` conclusion else: - states.append('pending') + states.append("pending") # it could be more effective, but the following is more descriptive - combined_state = 'error' + combined_state = "error" if len(states): - if any(state in {'error', 'failure'} for state in states): - combined_state = 'failure' - elif any(state == 'pending' for state in states): - combined_state = 'pending' - elif all(state == 'success' for state in states): - combined_state = 'success' + if any(state in {"error", "failure"} for state in states): + combined_state = "failure" + elif any(state == "pending" for state in states): + combined_state = "pending" + elif all(state == "success" for state in states): + combined_state = "success" # show link to the actual build, some of the CI providers implement # the statuses API others implement the checks API, so display both @@ -1060,9 +1051,7 @@ def __init__(self, commit): class TaskAssets(dict): - - def __init__(self, github_release, artifact_patterns, - validate_patterns=True): + def __init__(self, github_release, artifact_patterns, validate_patterns=True): # HACK(kszucs): don't expect uploaded assets of no artifacts were # defined for the tasks in order to spare a bit of github rate limit if not artifact_patterns: @@ -1073,28 +1062,26 @@ def __init__(self, github_release, artifact_patterns, else: github_assets = {a.name: a for a in github_release.assets()} - if not validate_patterns: + if validate_patterns: + for pattern in artifact_patterns: + # artifact can be a regex pattern + compiled = re.compile(f"^{pattern}$") + matches = list(filter(None, map(compiled.match, github_assets.keys()))) + num_matches = len(matches) + + # validate artifact pattern matches single asset + if num_matches == 0: + self[pattern] = None + elif num_matches == 1: + self[pattern] = github_assets[matches[0].group(0)] + else: + raise CrossbowError( + "Only a single asset should match pattern `{}`, there are " + "multiple ones: {}".format(pattern, ", ".join(matches)) + ) + else: # shortcut to avoid pattern validation and just set all artifacts - return self.update(github_assets) - - for pattern in artifact_patterns: - # artifact can be a regex pattern - compiled = re.compile(f"^{pattern}$") - matches = list( - filter(None, map(compiled.match, github_assets.keys())) - ) - num_matches = len(matches) - - # validate artifact pattern matches single asset - if num_matches == 0: - self[pattern] = None - elif num_matches == 1: - self[pattern] = github_assets[matches[0].group(0)] - else: - raise CrossbowError( - 'Only a single asset should match pattern `{}`, there are ' - 'multiple ones: {}'.format(pattern, ', '.join(matches)) - ) + self.update(github_assets) def 
missing_patterns(self): return [pattern for pattern, asset in self.items() if asset is None] @@ -1108,13 +1095,13 @@ class Job(Serializable): def __init__(self, target, tasks, params=None, template_searchpath=None): if not tasks: - raise ValueError('no tasks were provided for the job') + raise ValueError("no tasks were provided for the job") if not all(isinstance(task, Task) for task in tasks.values()): - raise ValueError('each `tasks` mus be an instance of Task') + raise ValueError("each `tasks` mus be an instance of Task") if not isinstance(target, Target): - raise ValueError('`target` must be an instance of Target') + raise ValueError("`target` must be an instance of Target") if not isinstance(params, dict): - raise ValueError('`params` must be an instance of dict') + raise ValueError("`params` must be an instance of dict") self.target = target self.tasks = tasks @@ -1135,12 +1122,7 @@ def render_files(self): def render_tasks(self, params=None): result = {} - params = { - **self.params, - "arrow": self.target, - "job": self, - **(params or {}) - } + params = {**self.params, "arrow": self.target, "job": self, **(params or {})} for task_name, task in self.tasks.items(): files = task.render_files(self._template_searchpath, params) result[task_name] = files @@ -1164,7 +1146,7 @@ def queue(self, queue): @property def email(self): - return os.environ.get('CROSSBOW_EMAIL', self.target.email) + return os.environ.get("CROSSBOW_EMAIL", self.target.email) @property def date(self): @@ -1175,8 +1157,7 @@ def show(self, stream=None): @classmethod def from_config(cls, config, target, tasks=None, groups=None, params=None): - """ - Instantiate a job from based on a config. + """Instantiate a job from based on a config. Parameters ---------- @@ -1200,38 +1181,42 @@ def from_config(cls, config, target, tasks=None, groups=None, params=None): ------ Exception: If invalid groups or tasks has been passed. 
+ """ task_definitions = config.select(tasks, groups=groups) # instantiate the tasks tasks = {} versions = { - 'version': target.version, - 'no_rc_version': target.no_rc_version, - 'no_rc_no_dev_version': target.no_rc_no_dev_version, - 'no_rc_semver_version': target.no_rc_semver_version, - 'no_rc_snapshot_version': target.no_rc_snapshot_version, - 'r_version': target.r_version, - 'no_rc_r_version': target.no_rc_r_version, - 'so_version': target.so_version, + "version": target.version, + "no_rc_version": target.no_rc_version, + "no_rc_no_dev_version": target.no_rc_no_dev_version, + "no_rc_semver_version": target.no_rc_semver_version, + "no_rc_snapshot_version": target.no_rc_snapshot_version, + "r_version": target.r_version, + "no_rc_r_version": target.no_rc_r_version, + "so_version": target.so_version, } for task_name, task in task_definitions.items(): task = task.copy() - artifacts = task.pop('artifacts', None) or [] # because of yaml + artifacts = task.pop("artifacts", None) or [] # because of yaml artifacts = [fn.format(**versions) for fn in artifacts] tasks[task_name] = Task(task_name, artifacts=artifacts, **task) - return cls(target=target, tasks=tasks, params=params, - template_searchpath=config.template_searchpath) + return cls( + target=target, + tasks=tasks, + params=params, + template_searchpath=config.template_searchpath, + ) def is_finished(self): for task in self.tasks.values(): status = task.status(force_query=True) - if status.combined_state == 'pending': + if status.combined_state == "pending": return False return True - def wait_until_finished(self, poll_max_minutes=120, - poll_interval_minutes=10): + def wait_until_finished(self, poll_max_minutes=120, poll_interval_minutes=10): started_at = time.time() while True: if self.is_finished(): @@ -1239,17 +1224,19 @@ def wait_until_finished(self, poll_max_minutes=120, waited_for_minutes = (time.time() - started_at) / 60 if waited_for_minutes > poll_max_minutes: - msg = ('Exceeded the maximum amount of time waiting for job ' - 'to finish, waited for {} minutes.') + msg = ( + "Exceeded the maximum amount of time waiting for job " + "to finish, waited for {} minutes." 
+ ) raise RuntimeError(msg.format(waited_for_minutes)) - logger.info('Waiting {} minutes and then checking again' - .format(poll_interval_minutes)) + logger.info( + f"Waiting {poll_interval_minutes} minutes and then checking again" + ) time.sleep(poll_interval_minutes * 60) class Config(dict): - def __init__(self, tasks, template_searchpath): super().__init__(tasks) self.template_searchpath = template_searchpath @@ -1258,8 +1245,7 @@ def __init__(self, tasks, template_searchpath): def load_yaml(cls, path): path = Path(path) searchpath = path.parent - rendered = _render_jinja_template(searchpath, template=path.name, - params={}) + rendered = _render_jinja_template(searchpath, template=path.name, params={}) config = yaml.load(rendered) return cls(config, template_searchpath=searchpath) @@ -1267,8 +1253,8 @@ def show(self, stream=None): return yaml.dump(dict(self), stream=stream) def select(self, tasks=None, groups=None): - config_groups = dict(self['groups']) - config_tasks = dict(self['tasks']) + config_groups = dict(self["groups"]) + config_tasks = dict(self["tasks"]) valid_groups = set(config_groups.keys()) valid_tasks = set(config_tasks.keys()) group_allowlist = list(groups or []) @@ -1278,8 +1264,8 @@ def select(self, tasks=None, groups=None): requested_groups = set(group_allowlist) invalid_groups = requested_groups - valid_groups if invalid_groups: - msg = 'Invalid group(s) {!r}. Must be one of {!r}'.format( - invalid_groups, valid_groups + msg = ( + f"Invalid group(s) {invalid_groups!r}. Must be one of {valid_groups!r}" ) raise CrossbowError(msg) @@ -1290,16 +1276,15 @@ def select(self, tasks=None, groups=None): if len(matches): requested_tasks.update(matches) else: - raise CrossbowError( - "Unable to match any tasks for `{}`".format(pattern) - ) + raise CrossbowError(f"Unable to match any tasks for `{pattern}`") requested_group_tasks = set() for group in group_allowlist: # separate the patterns from the blocklist patterns task_patterns = list(config_groups[group]) task_blocklist_patterns = [ - x.strip("~") for x in task_patterns if x.startswith("~")] + x.strip("~") for x in task_patterns if x.startswith("~") + ] task_patterns = [x for x in task_patterns if not x.startswith("~")] # treat the task names as glob patterns to select tasks more easily @@ -1308,58 +1293,49 @@ def select(self, tasks=None, groups=None): if len(matches): requested_group_tasks.update(matches) else: - raise CrossbowError( - "Unable to match any tasks for `{}`".format(pattern) - ) + raise CrossbowError(f"Unable to match any tasks for `{pattern}`") # remove any tasks that are negated with ~task-name for block_pattern in task_blocklist_patterns: matches = fnmatch.filter(valid_tasks, block_pattern) if len(matches): - requested_group_tasks = requested_group_tasks.difference( - matches) + requested_group_tasks = requested_group_tasks.difference(matches) else: - raise CrossbowError( - "Unable to match any tasks for `{}`".format(pattern) - ) + raise CrossbowError(f"Unable to match any tasks for `{pattern}`") requested_tasks = requested_tasks.union(requested_group_tasks) # validate that the passed and matched tasks are defined in the config invalid_tasks = requested_tasks - valid_tasks if invalid_tasks: - msg = 'Invalid task(s) {!r}. Must be one of {!r}'.format( - invalid_tasks, valid_tasks - ) + msg = f"Invalid task(s) {invalid_tasks!r}. 
Must be one of {valid_tasks!r}" raise CrossbowError(msg) - return { - task_name: config_tasks[task_name] for task_name in requested_tasks - } + return {task_name: config_tasks[task_name] for task_name in requested_tasks} def validate(self): # validate that the task groups are properly referring to the tasks - for group_name, group in self['groups'].items(): + for group_name, group in self["groups"].items(): for pattern in group: # remove the negation character for blocklisted tasks pattern = pattern.strip("~") tasks = self.select(tasks=[pattern]) if not tasks: raise CrossbowError( - "The pattern `{}` defined for task group `{}` is not " - "matching any of the tasks defined in the " - "configuration file.".format(pattern, group_name) + f"The pattern `{pattern}` defined for task group " + f"`{group_name}` is not matching any of the tasks " + "defined in the configuration file." ) # validate that the tasks are constructible - for task_name, task in self['tasks'].items(): + for task_name, task in self["tasks"].items(): try: Task(task_name, **task) except Exception as e: raise CrossbowError( - 'Unable to construct a task object from the ' - 'definition of task `{}`. The original error message ' - 'is: `{}`'.format(task_name, str(e)) + "Unable to construct a task object from the " + f"definition of task `{task_name}`. The original error message " + f"is: `{e!s}`" ) # Get the default branch name from the repository @@ -1369,32 +1345,35 @@ def validate(self): # validate that the defined tasks are renderable, in order to to that # define the required object with dummy data target = Target( - head='e279a7e06e61c14868ca7d71dea795420aea6539', + head="e279a7e06e61c14868ca7d71dea795420aea6539", branch=repo.default_branch_name, - remote='https://github.com/apache/arrow', - version='1.0.0dev123', - r_version='0.13.0.100000123', - email='dummy@example.ltd' + remote="https://github.com/apache/arrow", + version="1.0.0dev123", + r_version="0.13.0.100000123", + email="dummy@example.ltd", + ) + job = Job.from_config( + config=self, + target=target, + tasks=self["tasks"], + groups=self["groups"], + params={}, ) - job = Job.from_config(config=self, - target=target, - tasks=self['tasks'], - groups=self['groups'], - params={}) - for task_name, task in self['tasks'].items(): + for task_name, task in self["tasks"].items(): task = Task(task_name, **task) files = task.render_files( self.template_searchpath, params=dict( arrow=target, job=job, - queue_remote_url='https://github.com/org/crossbow' - ) + queue_remote_url="https://github.com/org/crossbow", + ), ) if not files: - raise CrossbowError('No files have been rendered for task `{}`' - .format(task_name)) + raise CrossbowError( + f"No files have been rendered for task `{task_name}`" + ) # configure yaml serializer diff --git a/dev/archery/archery/crossbow/reports.py b/dev/archery/archery/crossbow/reports.py index d8efa42341ce6..0ef0009dba03c 100644 --- a/dev/archery/archery/crossbow/reports.py +++ b/dev/archery/archery/crossbow/reports.py @@ -14,12 +14,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
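# To illustrate Config.select() above with a hypothetical group definition
# (task and group names are made up):
#
#     groups:
#       nightly-wheels:
#         - wheel-*
#         - ~wheel-windows-*
#
# "wheel-*" is expanded with fnmatch against the task names defined in the
# configuration, and every task matched by the "~"-prefixed blocklist pattern
# is then removed from the selection.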
+from __future__ import annotations import collections import csv -import operator import fnmatch import functools +import operator import time import click @@ -30,7 +31,6 @@ # TODO(kszucs): use archery.report.JinjaReport instead class Report: - ROW_HEADERS = [ "task_name", "task_status", @@ -59,13 +59,13 @@ def __init__(self, job, task_filters=None, wait_for_task=None): @property def repo_url(self): url = self.job.queue.remote_url - return url[:-4] if url.endswith('.git') else url + return url[:-4] if url.endswith(".git") else url def url(self, query): - return '{}/branches/all?query={}'.format(self.repo_url, query) + return f"{self.repo_url}/branches/all?query={query}" def branch_url(self, branch): - return '{}/tree/{}'.format(self.repo_url, branch) + return f"{self.repo_url}/tree/{branch}" def task_url(self, task): build_links = task.status().build_links @@ -93,8 +93,7 @@ def tasks_by_state(self): @property def contains_failures(self): - return any(self.tasks_by_state[state] for state in ( - "error", "failure")) + return any(self.tasks_by_state[state] for state in ("error", "failure")) @property def tasks(self): @@ -105,8 +104,7 @@ def show(self): @property def rows(self): - """ - Produces a generator that allow us to iterate over + """Produces a generator that allow us to iterate over the job tasks as a list of rows. Row headers are defined at Report.ROW_HEADERS. """ @@ -120,10 +118,10 @@ def rows(self): task.ci, # We want this to be serialized as a dict instead # of an orderedict. - {k: v for k, v in task.params.items()}, + dict(task.params.items()), task.template, # Arrow repository commit - self.job.target.head + self.job.target.head, ] yield row @@ -132,54 +130,51 @@ class ConsoleReport(Report): """Report the status of a Job to the console using click""" # output table's header template - HEADER = '[{state:>7}] {branch:<52} {content:>16}' - DETAILS = ' └ {url}' + HEADER = "[{state:>7}] {branch:<52} {content:>16}" + DETAILS = " └ {url}" # output table's row template for assets - ARTIFACT_NAME = '{artifact:>69} ' - ARTIFACT_STATE = '[{state:>7}]' + ARTIFACT_NAME = "{artifact:>69} " + ARTIFACT_STATE = "[{state:>7}]" # state color mapping to highlight console output COLORS = { # from CombinedStatus - 'error': 'red', - 'failure': 'red', - 'pending': 'yellow', - 'success': 'green', + "error": "red", + "failure": "red", + "pending": "yellow", + "success": "green", # custom state messages - 'ok': 'green', - 'missing': 'red' + "ok": "green", + "missing": "red", } def lead(self, state, branch, n_uploaded, n_expected): line = self.HEADER.format( state=state.upper(), branch=branch, - content='uploaded {} / {}'.format(n_uploaded, n_expected) + content=f"uploaded {n_uploaded} / {n_expected}", ) return click.style(line, fg=self.COLORS[state.lower()]) def header(self): header = self.HEADER.format( - state='state', - branch='Task / Branch', - content='Artifacts' + state="state", branch="Task / Branch", content="Artifacts" ) - delimiter = '-' * len(header) - return '{}\n{}'.format(header, delimiter) + delimiter = "-" * len(header) + return f"{header}\n{delimiter}" def artifact(self, state, pattern, asset): if asset is None: artifact = pattern - state = 'pending' if state == 'pending' else 'missing' + state = "pending" if state == "pending" else "missing" else: artifact = asset.name - state = 'ok' + state = "ok" name_ = self.ARTIFACT_NAME.format(artifact=artifact) state_ = click.style( - self.ARTIFACT_STATE.format(state=state.upper()), - self.COLORS[state] +
self.ARTIFACT_STATE.format(state=state.upper()), self.COLORS[state] ) return name_ + state_ @@ -198,8 +193,7 @@ def show(self, outstream, asset_callback=None, validate_patterns=True): # mapping of artifact pattern to asset or None of not uploaded n_expected = len(task.artifacts) n_uploaded = len(assets.uploaded_assets()) - echo(self.lead(status.combined_state, task_name, n_uploaded, - n_expected)) + echo(self.lead(status.combined_state, task_name, n_uploaded, n_expected)) # show link to the actual build, some of the CI providers implement # the statuses API others implement the checks API, so display both @@ -210,42 +204,31 @@ def show(self, outstream, asset_callback=None, validate_patterns=True): for artifact_pattern, asset in assets.items(): if asset_callback is not None: asset_callback(task_name, task, asset) - echo(self.artifact(status.combined_state, artifact_pattern, - asset)) + echo(self.artifact(status.combined_state, artifact_pattern, asset)) class ChatReport(JinjaReport): - templates = { - 'text': 'chat_nightly_report.txt.j2', - } - fields = [ - 'report', - 'extra_message_success', - 'extra_message_failure', - ] + templates = {"text": "chat_nightly_report.txt.j2"} + fields = ["report", "extra_message_success", "extra_message_failure"] class ReportUtils: - @classmethod def send_message(cls, webhook, message): - resp = requests.post(webhook, json={ - "blocks": [ - { - "type": "section", - "text": { - "type": "mrkdwn", - "text": message - } - } - ] - } + resp = requests.post( + webhook, + json={ + "blocks": [ + {"type": "section", "text": {"type": "mrkdwn", "text": message}} + ] + }, ) return resp @classmethod - def send_email(cls, smtp_user, smtp_password, smtp_server, smtp_port, - recipient_email, message): + def send_email( + cls, smtp_user, smtp_password, smtp_server, smtp_port, recipient_email, message + ): import smtplib server = smtplib.SMTP_SSL(smtp_server, smtp_port) @@ -256,7 +239,7 @@ def send_email(cls, smtp_user, smtp_password, smtp_server, smtp_port, @classmethod def write_csv(cls, report, add_headers=True): - with open(f'{report.job.branch}.csv', 'w') as csvfile: + with open(f"{report.job.branch}.csv", "w") as csvfile: task_writer = csv.writer(csvfile) if add_headers: task_writer.writerow(report.ROW_HEADERS) @@ -265,54 +248,44 @@ def write_csv(cls, report, add_headers=True): class EmailReport(JinjaReport): templates = { - 'nightly_report': 'email_nightly_report.txt.j2', - 'token_expiration': 'email_token_expiration.txt.j2', + "nightly_report": "email_nightly_report.txt.j2", + "token_expiration": "email_token_expiration.txt.j2", } - fields = [ - 'report', - 'sender_name', - 'sender_email', - 'recipient_email', - ] + fields = ["report", "sender_name", "sender_email", "recipient_email"] class CommentReport(Report): - - _markdown_badge = '[]({{url}})' + _markdown_badge = "[]({{url}})" badges = { - 'github': _markdown_badge.format( - title='GitHub Actions', + "github": _markdown_badge.format( + title="GitHub Actions", badge=( - 'https://github.com/{repo}/actions/workflows/crossbow.yml/' - 'badge.svg?branch={branch}' + "https://github.com/{repo}/actions/workflows/crossbow.yml/" + "badge.svg?branch={branch}" ), ), - 'azure': _markdown_badge.format( - title='Azure', + "azure": _markdown_badge.format( + title="Azure", badge=( - 'https://dev.azure.com/{repo}/_apis/build/status/' - '{repo_dotted}?branchName={branch}' - ) + "https://dev.azure.com/{repo}/_apis/build/status/" + "{repo_dotted}?branchName={branch}" + ), ), - 'travis': _markdown_badge.format( - title='Travis CI', - 
badge='https://img.shields.io/travis/{repo}/{branch}.svg' + "travis": _markdown_badge.format( + title="Travis CI", badge="https://img.shields.io/travis/{repo}/{branch}.svg" ), - 'circle': _markdown_badge.format( - title='CircleCI', - badge=( - 'https://img.shields.io/circleci/build/github' - '/{repo}/{branch}.svg' - ) + "circle": _markdown_badge.format( + title="CircleCI", + badge=("https://img.shields.io/circleci/build/github/{repo}/{branch}.svg"), ), - 'appveyor': _markdown_badge.format( - title='AppVeyor', - badge='https://img.shields.io/appveyor/ci/{repo}/{branch}.svg' + "appveyor": _markdown_badge.format( + title="AppVeyor", + badge="https://img.shields.io/appveyor/ci/{repo}/{branch}.svg", ), - 'drone': _markdown_badge.format( - title='Drone', - badge='https://img.shields.io/drone/build/{repo}/{branch}.svg' + "drone": _markdown_badge.format( + title="Drone", + badge="https://img.shields.io/drone/build/{repo}/{branch}.svg", ), } @@ -321,13 +294,13 @@ def __init__(self, job, crossbow_repo, wait_for_task=None): super().__init__(job, wait_for_task=wait_for_task) def show(self): - url = 'https://github.com/{repo}/branches/all?query={branch}' + url = "https://github.com/{repo}/branches/all?query={branch}" sha = self.job.target.head - msg = 'Revision: {}\n\n'.format(sha) - msg += 'Submitted crossbow builds: [{repo} @ {branch}]' - msg += '({})\n'.format(url) - msg += '\n|Task|Status|\n|----|------|' + msg = f"Revision: {sha}\n\n" + msg += "Submitted crossbow builds: [{repo} @ {branch}]" + msg += f"({url})\n" + msg += "\n|Task|Status|\n|----|------|" tasks = sorted(self.job.tasks.items(), key=operator.itemgetter(0)) for key, task in tasks: @@ -337,13 +310,13 @@ def show(self): template = self.badges[task.ci] badge = template.format( repo=self.crossbow_repo, - repo_dotted=self.crossbow_repo.replace('/', '.'), + repo_dotted=self.crossbow_repo.replace("/", "."), branch=branch, - url=self.task_url(task) + url=self.task_url(task), ) except KeyError: - badge = 'unsupported CI service `{}`'.format(task.ci) + badge = f"unsupported CI service `{task.ci}`" - msg += '\n|{}|{}|'.format(key, badge) + msg += f"\n|{key}|{badge}|" return msg.format(repo=self.crossbow_repo, branch=self.job.branch) diff --git a/dev/archery/archery/crossbow/tests/test_core.py b/dev/archery/archery/crossbow/tests/test_core.py index 3d538b89b262a..71ce58eb491cd 100644 --- a/dev/archery/archery/crossbow/tests/test_core.py +++ b/dev/archery/archery/crossbow/tests/test_core.py @@ -14,13 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- -from archery.utils.source import ArrowSources -from archery.crossbow import Config, Queue +from __future__ import annotations import pathlib from unittest import mock +from archery.crossbow import Config, Queue +from archery.utils.source import ArrowSources + def test_config(): src = ArrowSources.find() @@ -29,8 +30,9 @@ def test_config(): def test_task_select(request): - conf = Config.load_yaml(pathlib.Path( - request.node.fspath).parent / "fixtures" / "tasks.yaml") + conf = Config.load_yaml( + pathlib.Path(request.node.fspath).parent / "fixtures" / "tasks.yaml" + ) conf.validate() test_out = conf.select(tasks=["test-a-test-two"]) @@ -38,8 +40,9 @@ def test_task_select(request): def test_group_select(request): - conf = Config.load_yaml(pathlib.Path( - request.node.fspath).parent / "fixtures" / "tasks.yaml") + conf = Config.load_yaml( + pathlib.Path(request.node.fspath).parent / "fixtures" / "tasks.yaml" + ) conf.validate() test_out = conf.select(groups=["test"]) @@ -47,8 +50,9 @@ def test_group_select(request): def test_group_select_blocklist(request): - conf = Config.load_yaml(pathlib.Path( - request.node.fspath).parent / "fixtures" / "tasks.yaml") + conf = Config.load_yaml( + pathlib.Path(request.node.fspath).parent / "fixtures" / "tasks.yaml" + ) conf.validate() # we respect the nightly blocklist @@ -57,19 +61,22 @@ def test_group_select_blocklist(request): # but if a task is not blocked in both groups, it shows up at least once test_nightly_out = conf.select(groups=["nightly", "test"]) - assert test_nightly_out.keys() >= { - "test-a-test-two", "test-a-test", "nightly-fine"} + assert test_nightly_out.keys() >= {"test-a-test-two", "test-a-test", "nightly-fine"} # but can then over-ride by requesting the task test_nightly_out = conf.select( - tasks=["nightly-not-fine", "nightly-fine"], groups=["nightly", "test"]) + tasks=["nightly-not-fine", "nightly-fine"], groups=["nightly", "test"] + ) assert test_nightly_out.keys() >= { - "test-a-test-two", "test-a-test", "nightly-fine", "nightly-not-fine"} + "test-a-test-two", + "test-a-test", + "nightly-fine", + "nightly-not-fine", + } # and we can glob with the blocklist too! test_nightly_no_test_out = conf.select(groups=["nightly-no-test"]) - assert test_nightly_no_test_out.keys( - ) >= {"nightly-fine", "nightly-not-fine"} + assert test_nightly_no_test_out.keys() >= {"nightly-fine", "nightly-not-fine"} def test_latest_for_prefix(request): @@ -81,8 +88,7 @@ def test_latest_for_prefix(request): ] with mock.patch("archery.crossbow.core.Queue.get") as mocked_get: queue.latest_for_prefix("nightly-packaging-2022-04-10") - mocked_get.assert_called_once_with( - "nightly-packaging-2022-04-10-0") + mocked_get.assert_called_once_with("nightly-packaging-2022-04-10-0") with mock.patch("archery.crossbow.core.Repo.repo") as mocked_repo: mocked_repo.branches = [ @@ -91,5 +97,4 @@ def test_latest_for_prefix(request): ] with mock.patch("archery.crossbow.core.Queue.get") as mocked_get: queue.latest_for_prefix("nightly-packaging") - mocked_get.assert_called_once_with( - "nightly-packaging-2022-04-11-0") + mocked_get.assert_called_once_with("nightly-packaging-2022-04-11-0") diff --git a/dev/archery/archery/crossbow/tests/test_crossbow_cli.py b/dev/archery/archery/crossbow/tests/test_crossbow_cli.py index ee9ba1ee2fc83..e61a98307eb82 100644 --- a/dev/archery/archery/crossbow/tests/test_crossbow_cli.py +++ b/dev/archery/archery/crossbow/tests/test_crossbow_cli.py @@ -14,9 +14,10 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations -from click.testing import CliRunner import pytest +from click.testing import CliRunner from archery.crossbow.cli import crossbow from archery.utils.git import git @@ -27,17 +28,17 @@ def test_crossbow_submit(tmp_path): runner = CliRunner() def invoke(*args): - return runner.invoke(crossbow, ['--queue-path', str(tmp_path), *args]) + return runner.invoke(crossbow, ["--queue-path", str(tmp_path), *args]) # initialize an empty crossbow repository git.run_cmd("init", str(tmp_path)) - git.run_cmd("-C", str(tmp_path), "remote", "add", "origin", - "https://github.com/dummy/repo") - git.run_cmd("-C", str(tmp_path), "commit", "-m", "initial", - "--allow-empty") + git.run_cmd( + "-C", str(tmp_path), "remote", "add", "origin", "https://github.com/dummy/repo" + ) + git.run_cmd("-C", str(tmp_path), "commit", "-m", "initial", "--allow-empty") - result = invoke('check-config') + result = invoke("check-config") assert result.exit_code == 0 - result = invoke('submit', '--no-fetch', '--no-push', '-g', 'wheel') + result = invoke("submit", "--no-fetch", "--no-push", "-g", "wheel") assert result.exit_code == 0 diff --git a/dev/archery/archery/crossbow/tests/test_reports.py b/dev/archery/archery/crossbow/tests/test_reports.py index 620b4c78bbc71..07fb3b547583d 100644 --- a/dev/archery/archery/crossbow/tests/test_reports.py +++ b/dev/archery/archery/crossbow/tests/test_reports.py @@ -14,91 +14,100 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import textwrap from archery.crossbow.core import yaml -from archery.crossbow.reports import (ChatReport, CommentReport, EmailReport, - Report) +from archery.crossbow.reports import ChatReport, CommentReport, EmailReport, Report def test_crossbow_comment_formatter(load_fixture): - msg = load_fixture('crossbow-success-message.md') - job = load_fixture('crossbow-job.yaml', decoder=yaml.load) + msg = load_fixture("crossbow-success-message.md") + job = load_fixture("crossbow-job.yaml", decoder=yaml.load) - report = CommentReport(job, crossbow_repo='ursa-labs/crossbow') + report = CommentReport(job, crossbow_repo="ursa-labs/crossbow") expected = msg.format( - repo='ursa-labs/crossbow', - branch='ursabot-1', - revision='f766a1d615dd1b7ee706d05102e579195951a61c', - status='has been succeeded.' 
+ repo="ursa-labs/crossbow", + branch="ursabot-1", + revision="f766a1d615dd1b7ee706d05102e579195951a61c", + status="has been succeeded.", ) assert report.show() == textwrap.dedent(expected).strip() def test_crossbow_chat_report(load_fixture): - expected_msg = load_fixture('chat-report.txt') - job = load_fixture('crossbow-job.yaml', decoder=yaml.load) + expected_msg = load_fixture("chat-report.txt") + job = load_fixture("crossbow-job.yaml", decoder=yaml.load) report = Report(job) assert report.tasks_by_state is not None - report_chat = ChatReport(report=report, extra_message_success=None, - extra_message_failure=None) + report_chat = ChatReport( + report=report, extra_message_success=None, extra_message_failure=None + ) assert report_chat.render("text") == textwrap.dedent(expected_msg) def test_crossbow_chat_report_extra_message_failure(load_fixture): - expected_msg = load_fixture('chat-report-extra-message-failure.txt') - job = load_fixture('crossbow-job.yaml', decoder=yaml.load) + expected_msg = load_fixture("chat-report-extra-message-failure.txt") + job = load_fixture("crossbow-job.yaml", decoder=yaml.load) report = Report(job) assert report.tasks_by_state is not None - report_chat = ChatReport(report=report, - extra_message_success="Should not be present", - extra_message_failure="Failure present") + report_chat = ChatReport( + report=report, + extra_message_success="Should not be present", + extra_message_failure="Failure present", + ) assert report_chat.render("text") == textwrap.dedent(expected_msg) def test_crossbow_chat_report_extra_message_success(load_fixture): - expected_msg = load_fixture('chat-report-extra-message-success.txt') - job = load_fixture('crossbow-job-no-failure.yaml', decoder=yaml.load) + expected_msg = load_fixture("chat-report-extra-message-success.txt") + job = load_fixture("crossbow-job-no-failure.yaml", decoder=yaml.load) report = Report(job) assert report.tasks_by_state is not None - report_chat = ChatReport(report=report, - extra_message_success="Success present", - extra_message_failure="Should not be present") + report_chat = ChatReport( + report=report, + extra_message_success="Success present", + extra_message_failure="Should not be present", + ) assert report_chat.render("text") == textwrap.dedent(expected_msg) def test_crossbow_email_report(load_fixture): - expected_msg = load_fixture('email-report.txt') - job = load_fixture('crossbow-job.yaml', decoder=yaml.load) + expected_msg = load_fixture("email-report.txt") + job = load_fixture("crossbow-job.yaml", decoder=yaml.load) report = Report(job) assert report.tasks_by_state is not None - email_report = EmailReport(report=report, sender_name="Sender Reporter", - sender_email="sender@arrow.com", - recipient_email="recipient@arrow.com") - - assert ( - email_report.render("nightly_report") == textwrap.dedent(expected_msg) + email_report = EmailReport( + report=report, + sender_name="Sender Reporter", + sender_email="sender@arrow.com", + recipient_email="recipient@arrow.com", ) + assert email_report.render("nightly_report") == textwrap.dedent(expected_msg) + def test_crossbow_export_report(load_fixture): - job = load_fixture('crossbow-job.yaml', decoder=yaml.load) + job = load_fixture("crossbow-job.yaml", decoder=yaml.load) report = Report(job) assert len(list(report.rows)) == 4 expected_first_row = [ - 'docker-cpp-cmake32', - 'success', - ['https://github.com/apache/crossbow/runs/1'], - 'https://github.com/apache/crossbow/tree/' - 'ursabot-1-circle-docker-cpp-cmake32', - 'circle', - {'commands': ['docker 
compose build cpp-cmake32', - 'docker compose run cpp-cmake32']}, - 'docker-tests/circle.linux.yml', - 'f766a1d615dd1b7ee706d05102e579195951a61c' + "docker-cpp-cmake32", + "success", + ["https://github.com/apache/crossbow/runs/1"], + "https://github.com/apache/crossbow/tree/ursabot-1-circle-docker-cpp-cmake32", + "circle", + { + "commands": [ + "docker compose build cpp-cmake32", + "docker compose run cpp-cmake32", + ] + }, + "docker-tests/circle.linux.yml", + "f766a1d615dd1b7ee706d05102e579195951a61c", ] assert next(report.rows) == expected_first_row diff --git a/dev/archery/archery/docker/__init__.py b/dev/archery/archery/docker/__init__.py index 6be29c91638db..4dc141c0ba1c6 100644 --- a/dev/archery/archery/docker/__init__.py +++ b/dev/archery/archery/docker/__init__.py @@ -14,5 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations -from .core import DockerCompose, UndefinedImage # noqa +from .core import DockerCompose, UndefinedImage + +__all__ = ["DockerCompose", "UndefinedImage"] diff --git a/dev/archery/archery/docker/cli.py b/dev/archery/archery/docker/cli.py index 5f9cea872a2a8..b619779632c79 100644 --- a/dev/archery/archery/docker/cli.py +++ b/dev/archery/archery/docker/cli.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import os import sys @@ -26,181 +27,243 @@ def _mock_compose_calls(compose): - from types import MethodType from subprocess import CompletedProcess + from types import MethodType def _mock(compose, command_tuple): def _execute(self, *args, **kwargs): - params = [f'{k}={v}' - for k, v in self.config.params.items()] - command = ' '.join(params + command_tuple + args) + params = [f"{k}={v}" for k, v in self.config.params.items()] + command = " ".join(params + command_tuple + args) click.echo(command) return CompletedProcess([], 0) + return MethodType(_execute, compose) - compose._execute_docker = _mock(compose, command_tuple=('docker',)) - compose._execute_compose = _mock(compose, command_tuple=('docker', 'compose')) + compose._execute_docker = _mock(compose, command_tuple=("docker",)) + compose._execute_compose = _mock(compose, command_tuple=("docker", "compose")) @click.group() -@click.option("--src", metavar="<arrow_src>", default=None, - callback=validate_arrow_sources, - help="Specify Arrow source directory.") -@click.option('--dry-run/--execute', default=False, - help="Display the docker commands instead of executing them.") -@click.option('--using-legacy-docker-compose', default=False, is_flag=True, - envvar='ARCHERY_USE_LEGACY_DOCKER_COMPOSE', - help="Use legacy docker-compose utility instead of the built-in " - "`docker compose` subcommand. This may be necessary if the " - "Docker client is too old for some options.") -@click.option('--using-docker-cli', default=False, is_flag=True, - envvar='ARCHERY_USE_DOCKER_CLI', - help="Use docker CLI directly for building instead of calling " - "`docker compose`. This may help to reuse cached layers.") -@click.option('--using-docker-buildx', default=False, is_flag=True, - envvar='ARCHERY_USE_DOCKER_BUILDX', - help="Use buildx with docker CLI directly for building instead " - "of calling `docker compose` or the plain docker build " - "command. 
This option makes the build cache reusable " - "across hosts.") +@click.option( + "--src", + metavar="<arrow_src>", + default=None, + callback=validate_arrow_sources, + help="Specify Arrow source directory.", +) +@click.option( + "--dry-run/--execute", + default=False, + help="Display the docker commands instead of executing them.", +) +@click.option( + "--using-legacy-docker-compose", + default=False, + is_flag=True, + envvar="ARCHERY_USE_LEGACY_DOCKER_COMPOSE", + help="Use legacy docker-compose utility instead of the built-in " + "`docker compose` subcommand. This may be necessary if the " + "Docker client is too old for some options.", +) +@click.option( + "--using-docker-cli", + default=False, + is_flag=True, + envvar="ARCHERY_USE_DOCKER_CLI", + help="Use docker CLI directly for building instead of calling " + "`docker compose`. This may help to reuse cached layers.", +) +@click.option( + "--using-docker-buildx", + default=False, + is_flag=True, + envvar="ARCHERY_USE_DOCKER_BUILDX", + help="Use buildx with docker CLI directly for building instead " + "of calling `docker compose` or the plain docker build " + "command. This option makes the build cache reusable " + "across hosts.", +) @click.pass_context -def docker(ctx, src, dry_run, using_legacy_docker_compose, using_docker_cli, - using_docker_buildx): - """ - Interact with Docker Compose based builds. - """ +def docker( + ctx, + src, + dry_run, + using_legacy_docker_compose, + using_docker_cli, + using_docker_buildx, +): + """Interact with Docker Compose based builds.""" ctx.ensure_object(dict) - config_path = src.path / 'docker-compose.yml' + config_path = src.path / "docker-compose.yml" if not config_path.exists(): raise click.ClickException( - "Docker compose configuration cannot be found in directory {}, " - "try to pass the arrow source directory explicitly.".format(src) + f"Docker compose configuration cannot be found in directory {src}, " + "try to pass the arrow source directory explicitly." ) # take the Docker Compose parameters like PYTHON, PANDAS, UBUNTU from the # environment variables to keep the usage similar to docker compose using_docker_cli |= using_docker_buildx - compose_bin = ("docker-compose" if using_legacy_docker_compose - else "docker compose") + compose_bin = "docker-compose" if using_legacy_docker_compose else "docker compose" with group("Docker: Prepare"): - compose = DockerCompose(config_path, params=os.environ, - using_docker=using_docker_cli, - using_buildx=using_docker_buildx, - debug=ctx.obj.get('debug', False), - compose_bin=compose_bin) + compose = DockerCompose( + config_path, + params=os.environ, + using_docker=using_docker_cli, + using_buildx=using_docker_buildx, + debug=ctx.obj.get("debug", False), + compose_bin=compose_bin, + ) if dry_run: _mock_compose_calls(compose) - ctx.obj['compose'] = compose + ctx.obj["compose"] = compose @docker.command("check-config") @click.pass_obj def check_config(obj): - """ - Validate Docker Compose configuration. 
- """ + """Validate Docker Compose configuration.""" # executes the body of the docker function above which does the validation # during the configuration loading -@docker.command('pull') -@click.argument('image') -@click.option('--pull-leaf/--no-leaf', default=True, - help="Whether to pull leaf images too.") -@click.option('--ignore-pull-failures/--no-ignore-pull-failures', default=True, - help="Whether to ignore pull failures.") +@docker.command("pull") +@click.argument("image") +@click.option( + "--pull-leaf/--no-leaf", default=True, help="Whether to pull leaf images too." +) +@click.option( + "--ignore-pull-failures/--no-ignore-pull-failures", + default=True, + help="Whether to ignore pull failures.", +) @click.pass_obj def docker_pull(obj, image, *, pull_leaf, ignore_pull_failures): - """ - Execute docker compose pull. - """ - compose = obj['compose'] + """Execute docker compose pull.""" + compose = obj["compose"] try: - compose.pull(image, pull_leaf=pull_leaf, - ignore_pull_failures=ignore_pull_failures) + compose.pull( + image, pull_leaf=pull_leaf, ignore_pull_failures=ignore_pull_failures + ) except UndefinedImage as e: raise click.ClickException( - "There is no service/image defined in docker-compose.yml with " - "name: {}".format(str(e)) + f"There is no service/image defined in docker-compose.yml with name: {e!s}" ) except RuntimeError as e: raise click.ClickException(str(e)) -@docker.command('build') -@click.argument('image') -@click.option('--force-pull/--no-pull', default=True, - help="Whether to force pull the image and its ancestor images") -@click.option('--use-cache/--no-cache', default=True, - help="Whether to use cache when building the image and its " - "ancestor images") -@click.option('--use-leaf-cache/--no-leaf-cache', default=True, - help="Whether to use cache when building only the (leaf) image " - "passed as the argument. To disable caching for both the " - "image and its ancestors use --no-cache option.") +@docker.command("build") +@click.argument("image") +@click.option( + "--force-pull/--no-pull", + default=True, + help="Whether to force pull the image and its ancestor images", +) +@click.option( + "--use-cache/--no-cache", + default=True, + help="Whether to use cache when building the image and its ancestor images", +) +@click.option( + "--use-leaf-cache/--no-leaf-cache", + default=True, + help="Whether to use cache when building only the (leaf) image " + "passed as the argument. To disable caching for both the " + "image and its ancestors use --no-cache option.", +) @click.pass_obj def docker_build(obj, image, *, force_pull, use_cache, use_leaf_cache): - """ - Execute Docker Compose builds. 
- """ - compose = obj['compose'] + """Execute Docker Compose builds.""" + compose = obj["compose"] try: if force_pull: compose.pull(image, pull_leaf=use_leaf_cache) - compose.build(image, use_cache=use_cache, - use_leaf_cache=use_leaf_cache, - pull_parents=force_pull) + compose.build( + image, + use_cache=use_cache, + use_leaf_cache=use_leaf_cache, + pull_parents=force_pull, + ) except UndefinedImage as e: raise click.ClickException( - "There is no service/image defined in docker-compose.yml with " - "name: {}".format(str(e)) + f"There is no service/image defined in docker-compose.yml with name: {e!s}" ) except RuntimeError as e: raise click.ClickException(str(e)) -@docker.command('run') -@click.argument('image') -@click.argument('command', required=False, default=None) -@click.option('--env', '-e', multiple=True, - help="Set environment variable within the container") -@click.option('--user', '-u', default=None, - help="Username or UID to run the container with") -@click.option('--force-pull/--no-pull', default=True, - help="Whether to force pull the image and its ancestor images") -@click.option('--force-build/--no-build', default=True, - help="Whether to force build the image and its ancestor images") -@click.option('--build-only', default=False, is_flag=True, - help="Pull and/or build the image, but do not run it") -@click.option('--use-cache/--no-cache', default=True, - help="Whether to use cache when building the image and its " - "ancestor images") -@click.option('--use-leaf-cache/--no-leaf-cache', default=True, - help="Whether to use cache when building only the (leaf) image " - "passed as the argument. To disable caching for both the " - "image and its ancestors use --no-cache option.") -@click.option('--resource-limit', default=None, - help="A CPU/memory limit preset to mimic CI environments like " - "GitHub Actions. Mandates --using-docker-cli. Note that " - "exporting ARCHERY_DOCKER_BIN=\"sudo docker\" is likely " - "required, unless Docker is configured with cgroups v2 " - "(else Docker will silently ignore the limits).") -@click.option('--volume', '-v', multiple=True, - help="Set volume within the container") +@docker.command("run") +@click.argument("image") +@click.argument("command", required=False, default=None) +@click.option( + "--env", "-e", multiple=True, help="Set environment variable within the container" +) +@click.option( + "--user", "-u", default=None, help="Username or UID to run the container with" +) +@click.option( + "--force-pull/--no-pull", + default=True, + help="Whether to force pull the image and its ancestor images", +) +@click.option( + "--force-build/--no-build", + default=True, + help="Whether to force build the image and its ancestor images", +) +@click.option( + "--build-only", + default=False, + is_flag=True, + help="Pull and/or build the image, but do not run it", +) +@click.option( + "--use-cache/--no-cache", + default=True, + help="Whether to use cache when building the image and its ancestor images", +) +@click.option( + "--use-leaf-cache/--no-leaf-cache", + default=True, + help="Whether to use cache when building only the (leaf) image " + "passed as the argument. To disable caching for both the " + "image and its ancestors use --no-cache option.", +) +@click.option( + "--resource-limit", + default=None, + help="A CPU/memory limit preset to mimic CI environments like " + "GitHub Actions. Mandates --using-docker-cli. 
Note that " + 'exporting ARCHERY_DOCKER_BIN="sudo docker" is likely ' + "required, unless Docker is configured with cgroups v2 " + "(else Docker will silently ignore the limits).", +) +@click.option("--volume", "-v", multiple=True, help="Set volume within the container") @click.pass_obj -def docker_run(obj, image, command, *, env, user, force_pull, force_build, - build_only, use_cache, use_leaf_cache, resource_limit, - volume): - """ - Execute Docker Compose builds. +def docker_run( + obj, + image, + command, + *, + env, + user, + force_pull, + force_build, + build_only, + use_cache, + use_leaf_cache, + resource_limit, + volume, +): + """Execute Docker Compose builds. To see the available builds run `archery docker images`. Examples: - # execute a single build archery docker run conda-python @@ -225,18 +288,18 @@ def docker_run(obj, image, command, *, env, user, force_pull, force_build, # starting an interactive bash session for debugging archery docker run ubuntu-cpp bash + """ - compose = obj['compose'] + compose = obj["compose"] - env = dict(kv.split('=', 1) for kv in env) + env = dict(kv.split("=", 1) for kv in env) try: if force_pull: with group("Docker: Pull"): compose.pull(image, pull_leaf=use_leaf_cache) if force_build: with group("Docker: Build"): - compose.build(image, use_cache=use_cache, - use_leaf_cache=use_leaf_cache) + compose.build(image, use_cache=use_cache, use_leaf_cache=use_leaf_cache) if build_only: return compose.run( @@ -245,46 +308,58 @@ def docker_run(obj, image, command, *, env, user, force_pull, force_build, env=env, user=user, resource_limit=resource_limit, - volumes=volume + volumes=volume, ) except UndefinedImage as e: raise click.ClickException( - "There is no service/image defined in docker-compose.yml with " - "name: {}".format(str(e)) + f"There is no service/image defined in docker-compose.yml with name: {e!s}" ) except RuntimeError as e: raise click.ClickException(str(e)) -@docker.command('push') -@click.argument('image') -@click.option('--user', '-u', required=False, envvar='ARCHERY_DOCKER_USER', - help='Docker repository username') -@click.option('--password', '-p', required=False, - envvar='ARCHERY_DOCKER_PASSWORD', - help='Docker repository password') +@docker.command("push") +@click.argument("image") +@click.option( + "--user", + "-u", + required=False, + envvar="ARCHERY_DOCKER_USER", + help="Docker repository username", +) +@click.option( + "--password", + "-p", + required=False, + envvar="ARCHERY_DOCKER_PASSWORD", + help="Docker repository password", +) @click.pass_obj def docker_compose_push(obj, image, user, password): """Push the generated Docker Compose image.""" - compose = obj['compose'] + compose = obj["compose"] compose.push(image, user=user, password=password) -@docker.command('images') +@docker.command("images") @click.pass_obj def docker_compose_images(obj): """List the available Docker Compose images.""" - compose = obj['compose'] - click.echo('Available images:') + compose = obj["compose"] + click.echo("Available images:") for image in compose.images(): - click.echo(f' - {image}') - - -@docker.command('info') -@click.argument('service_name') -@click.option('--show', '-s', required=False, - help="Show only specific docker compose key. Examples of keys:" - " command, environment, build, dockerfile") + click.echo(f" - {image}") + + +@docker.command("info") +@click.argument("service_name") +@click.option( + "--show", + "-s", + required=False, + help="Show only specific docker compose key. 
Examples of keys:" + " command, environment, build, dockerfile", +) @click.pass_obj def docker_compose_info(obj, service_name, show): """Show Docker Compose definition info for service_name. @@ -292,13 +367,13 @@ def docker_compose_info(obj, service_name, show): SERVICE_NAME is the name of the docker service defined in docker-compose.yml. Look at `archery docker images` output for names. """ - compose = obj['compose'] + compose = obj["compose"] try: service = compose.config.raw_config["services"][service_name] except KeyError: - click.echo(f'Service name {service_name} could not be found', err=True) + click.echo(f"Service name {service_name} could not be found", err=True) sys.exit(1) else: - click.echo(f'Service {service_name} Docker Compose config:') + click.echo(f"Service {service_name} Docker Compose config:") output = "\n".join(compose.info(service, show)) click.echo(output) diff --git a/dev/archery/archery/docker/core.py b/dev/archery/archery/docker/core.py index faf5c29744522..eb7cd0c700d8c 100644 --- a/dev/archery/archery/docker/core.py +++ b/dev/archery/archery/docker/core.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import os import shlex @@ -23,10 +24,10 @@ from dotenv import dotenv_values from ruamel.yaml import YAML +from ..compat import _ensure_path from ..utils.command import Command, default_bin from ..utils.logger import running_in_ci from ..utils.source import arrow_path -from ..compat import _ensure_path def flatten(node, parents=None): @@ -44,13 +45,8 @@ def flatten(node, parents=None): raise TypeError(node) -_arch_short_mapping = { - 'arm64v8': 'arm64', -} -_arch_alias_mapping = { - 'amd64': 'x86_64', - 'arm64v8': 'aarch64', -} +_arch_short_mapping = {"arm64v8": "arm64"} +_arch_alias_mapping = {"amd64": "x86_64", "arm64v8": "aarch64"} class UndefinedImage(Exception): @@ -58,10 +54,16 @@ class UndefinedImage(Exception): class ComposeConfig: - - def __init__(self, config_path, dotenv_path, compose_bin, - using_docker=False, using_buildx=False, - params=None, debug=False): + def __init__( + self, + config_path, + dotenv_path, + compose_bin, + using_docker=False, + using_buildx=False, + params=None, + debug=False, + ): self.using_docker = using_docker self.using_buildx = using_buildx self.debug = debug @@ -69,18 +71,16 @@ def __init__(self, config_path, dotenv_path, compose_bin, if dotenv_path: dotenv_path = _ensure_path(dotenv_path) else: - dotenv_path = config_path.parent / '.env' + dotenv_path = config_path.parent / ".env" if self.debug: # Log docker version - Docker().run('version') + Docker().run("version") self._read_env(dotenv_path, params) self._read_config(config_path, compose_bin) def _read_env(self, dotenv_path, params): - """ - Read .env and merge it with explicitly passed parameters. 
- """ + """Read .env and merge it with explicitly passed parameters.""" self.dotenv = dotenv_values(str(dotenv_path)) if params is None: self.params = {} @@ -95,62 +95,60 @@ def _read_env(self, dotenv_path, params): self.env.update(self.params) # translate docker's architecture notation to a more widely used one - arch = self.env.get('ARCH', 'amd64') - self.env['ARCH_ALIAS'] = _arch_alias_mapping.get(arch, arch) - self.env['ARCH_SHORT'] = _arch_short_mapping.get(arch, arch) + arch = self.env.get("ARCH", "amd64") + self.env["ARCH_ALIAS"] = _arch_alias_mapping.get(arch, arch) + self.env["ARCH_SHORT"] = _arch_short_mapping.get(arch, arch) def _read_config(self, config_path, compose_bin): - """ - Validate and read the docker-compose.yml - """ + """Validate and read the docker-compose.yml""" yaml = YAML() with config_path.open() as fp: self.raw_config = yaml.load(fp) - services = self.raw_config['services'].keys() - self.hierarchy = dict(flatten(self.raw_config.get('x-hierarchy', {}))) - self.limit_presets = self.raw_config.get('x-limit-presets', {}) - self.with_gpus = self.raw_config.get('x-with-gpus', []) + services = self.raw_config["services"].keys() + self.hierarchy = dict(flatten(self.raw_config.get("x-hierarchy", {}))) + self.limit_presets = self.raw_config.get("x-limit-presets", {}) + self.with_gpus = self.raw_config.get("x-with-gpus", []) nodes = self.hierarchy.keys() errors = [] for name in self.with_gpus: if name not in services: errors.append( - 'Service `{}` defined in `x-with-gpus` bot not in ' - '`services`'.format(name) + f"Service `{name}` defined in `x-with-gpus` bot not in `services`" ) for name in nodes - services: errors.append( - 'Service `{}` is defined in `x-hierarchy` bot not in ' - '`services`'.format(name) + f"Service `{name}` is defined in `x-hierarchy` bot not in `services`" ) for name in services - nodes: errors.append( - 'Service `{}` is defined in `services` but not in ' - '`x-hierarchy`'.format(name) + f"Service `{name}` is defined in `services` but not in `x-hierarchy`" ) # trigger Docker Compose's own validation if self.using_docker: compose = Docker() - args = ['compose'] + args = ["compose"] else: compose = Command(compose_bin) args = [] - args += [f'--file={config_path}', 'config'] - result = compose.run(*args, env=self.env, check=False, - stderr=subprocess.PIPE, stdout=subprocess.PIPE) + args += [f"--file={config_path}", "config"] + result = compose.run( + *args, + env=self.env, + check=False, + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + ) if result.returncode != 0: # strip the intro line of docker compose errors errors += result.stderr.decode().splitlines() if errors: - msg = '\n'.join([' - {}'.format(msg) for msg in errors]) - raise ValueError( - 'Found errors with docker-compose:\n{}'.format(msg) - ) + msg = "\n".join([f" - {msg}" for msg in errors]) + raise ValueError(f"Found errors with docker-compose:\n{msg}") rendered_config = StringIO(result.stdout.decode()) self.path = config_path @@ -158,36 +156,47 @@ def _read_config(self, config_path, compose_bin): def get(self, service_name): try: - service = self.config['services'][service_name] + service = self.config["services"][service_name] except KeyError: raise UndefinedImage(service_name) - service['name'] = service_name - service['need_gpu'] = service_name in self.with_gpus - service['ancestors'] = self.hierarchy[service_name] + service["name"] = service_name + service["need_gpu"] = service_name in self.with_gpus + service["ancestors"] = self.hierarchy[service_name] return service def 
__getitem__(self, service_name): return self.get(service_name) def verbosity_args(self): - return ['--quiet'] if running_in_ci() else [] + return ["--quiet"] if running_in_ci() else [] class Docker(Command): - def __init__(self, docker_bin=None): self.bin = default_bin(docker_bin, "docker") class DockerCompose(Command): - - def __init__(self, config_path, dotenv_path=None, compose_bin=None, - using_docker=False, using_buildx=False, params=None, - debug=False): - compose_bin = default_bin(compose_bin, 'docker compose') - self.config = ComposeConfig(config_path, dotenv_path, compose_bin, - params=params, using_docker=using_docker, - using_buildx=using_buildx, debug=debug) + def __init__( + self, + config_path, + dotenv_path=None, + compose_bin=None, + using_docker=False, + using_buildx=False, + params=None, + debug=False, + ): + compose_bin = default_bin(compose_bin, "docker compose") + self.config = ComposeConfig( + config_path, + dotenv_path, + compose_bin, + params=params, + using_docker=using_docker, + using_buildx=using_buildx, + debug=debug, + ) self.bin = compose_bin self.pull_memory = set() @@ -197,14 +206,15 @@ def clear_pull_memory(self): def _execute_compose(self, *args, **kwargs): # execute as a docker compose command try: - result = super().run(f'--file={self.config.path}', *args, - env=self.config.env, **kwargs) + result = super().run( + f"--file={self.config.path}", *args, env=self.config.env, **kwargs + ) result.check_returncode() except subprocess.CalledProcessError as e: + def formatdict(d, template): - return '\n'.join( - template.format(k, v) for k, v in sorted(d.items()) - ) + return "\n".join(template.format(k, v) for k, v in sorted(d.items())) + msg = ( "`{cmd}` exited with a non-zero exit code {code}, see the " "process log above.\n\nThe {bin} command was " @@ -213,13 +223,11 @@ def formatdict(d, template): ) raise RuntimeError( msg.format( - cmd=' '.join(e.cmd), + cmd=" ".join(e.cmd), code=e.returncode, bin=self.bin, - dotenv=formatdict(self.config.dotenv, template=' {}: {}'), - params=formatdict( - self.config.params, template=' export {}={}' - ) + dotenv=formatdict(self.config.dotenv, template=" {}: {}"), + params=formatdict(self.config.params, template=" export {}={}"), ) ) @@ -231,19 +239,19 @@ def _execute_docker(self, *args, **kwargs): except subprocess.CalledProcessError as e: raise RuntimeError( "{} exited with non-zero exit code {}".format( - ' '.join(e.cmd), e.returncode + " ".join(e.cmd), e.returncode ) ) def pull(self, service_name, pull_leaf=True, ignore_pull_failures=True): def _pull(service): - args = ['pull'] + self.config.verbosity_args() - if service['image'] in self.pull_memory: + args = ["pull"] + self.config.verbosity_args() + if service["image"] in self.pull_memory: return if self.config.using_docker: try: - self._execute_docker(*args, service['image']) + self._execute_docker(*args, service["image"]) except Exception as e: if ignore_pull_failures: # better --ignore-pull-failures handling @@ -252,163 +260,177 @@ def _pull(service): raise else: if ignore_pull_failures: - args.append('--ignore-pull-failures') - self._execute_compose(*args, service['name']) + args.append("--ignore-pull-failures") + self._execute_compose(*args, service["name"]) - self.pull_memory.add(service['image']) + self.pull_memory.add(service["image"]) service = self.config.get(service_name) - for ancestor in service['ancestors']: + for ancestor in service["ancestors"]: _pull(self.config.get(ancestor)) if pull_leaf: _pull(service) - def build(self, service_name, use_cache=True, 
use_leaf_cache=True, - pull_parents=True): + def build( + self, service_name, use_cache=True, use_leaf_cache=True, pull_parents=True + ): def _build(service, use_cache): - if 'build' not in service: + if "build" not in service: # nothing to do return args = [] - cache_from = list(service.get('build', {}).get('cache_from', [])) + cache_from = list(service.get("build", {}).get("cache_from", [])) if pull_parents: for image in cache_from: if image not in self.pull_memory: try: - self._execute_docker('pull', image) + self._execute_docker("pull", image) except Exception as e: print(e) finally: self.pull_memory.add(image) if not use_cache: - args.append('--no-cache') + args.append("--no-cache") # turn on inline build cache, this is a docker buildx feature # used to bundle the image build cache to the pushed image manifest # so the build cache can be reused across hosts, documented at # https://github.com/docker/buildx#--cache-tonametypetypekeyvalue - if self.config.env.get('BUILDKIT_INLINE_CACHE') == '1': - args.extend(['--build-arg', 'BUILDKIT_INLINE_CACHE=1']) + if self.config.env.get("BUILDKIT_INLINE_CACHE") == "1": + args.extend(["--build-arg", "BUILDKIT_INLINE_CACHE=1"]) if self.config.using_buildx: - for k, v in service['build'].get('args', {}).items(): - args.extend(['--build-arg', '{}={}'.format(k, v)]) + for k, v in service["build"].get("args", {}).items(): + args.extend(["--build-arg", f"{k}={v}"]) if use_cache: - cache_ref = '{}-cache'.format(service['image']) - cache_from = 'type=registry,ref={}'.format(cache_ref) - cache_to = ( - 'type=registry,ref={},mode=max'.format(cache_ref) - ) - args.extend([ - '--cache-from', cache_from, - '--cache-to', cache_to, - ]) - - args.extend([ - '--output', 'type=docker', - '-f', arrow_path(service['build']['dockerfile']), - '-t', service['image'], - service['build'].get('context', '.') - ]) + cache_ref = "{}-cache".format(service["image"]) + cache_from = f"type=registry,ref={cache_ref}" + cache_to = f"type=registry,ref={cache_ref},mode=max" + args.extend(["--cache-from", cache_from, "--cache-to", cache_to]) + + args.extend( + [ + "--output", + "type=docker", + "-f", + arrow_path(service["build"]["dockerfile"]), + "-t", + service["image"], + service["build"].get("context", "."), + ] + ) self._execute_docker("buildx", "build", *args) elif self.config.using_docker: # better for caching if self.config.debug and os.name != "nt": args.append("--progress=plain") - for k, v in service['build'].get('args', {}).items(): - args.extend(['--build-arg', '{}={}'.format(k, v)]) + for k, v in service["build"].get("args", {}).items(): + args.extend(["--build-arg", f"{k}={v}"]) for img in cache_from: - args.append('--cache-from="{}"'.format(img)) - args.extend([ - '-f', arrow_path(service['build']['dockerfile']), - '-t', service['image'], - service['build'].get('context', '.') - ]) + args.append(f'--cache-from="{img}"') + args.extend( + [ + "-f", + arrow_path(service["build"]["dockerfile"]), + "-t", + service["image"], + service["build"].get("context", "."), + ] + ) self._execute_docker("build", *args) else: if self.config.debug and os.name != "nt": args.append("--progress=plain") - self._execute_compose("build", *args, service['name']) + self._execute_compose("build", *args, service["name"]) service = self.config.get(service_name) # build ancestor services - for ancestor in service['ancestors']: + for ancestor in service["ancestors"]: _build(self.config.get(ancestor), use_cache=use_cache) # build the leaf/target service _build(service, use_cache=use_cache and 
use_leaf_cache) - def run(self, service_name, command=None, *, env=None, volumes=None, - user=None, resource_limit=None): + def run( + self, + service_name, + command=None, + *, + env=None, + volumes=None, + user=None, + resource_limit=None, + ): service = self.config.get(service_name) args = [] - use_docker = self.config.using_docker or service['need_gpu'] or resource_limit + use_docker = self.config.using_docker or service["need_gpu"] or resource_limit if use_docker: # use gpus, requires docker>=19.03 - if service['need_gpu']: - args.extend(['--gpus', 'all']) + if service["need_gpu"]: + args.extend(["--gpus", "all"]) - if service.get('shm_size'): - args.extend(['--shm-size', service['shm_size']]) + if service.get("shm_size"): + args.extend(["--shm-size", service["shm_size"]]) # append env variables from the compose conf - for k, v in service.get('environment', {}).items(): + for k, v in service.get("environment", {}).items(): if v is not None: - args.extend(['-e', '{}={}'.format(k, v)]) + args.extend(["-e", f"{k}={v}"]) # append volumes from the compose conf - for v in service.get('volumes', []): + for v in service.get("volumes", []): if not isinstance(v, str): # if not the compact string volume definition - v = "{}:{}".format(v['source'], v['target']) - args.extend(['-v', v]) + v = "{}:{}".format(v["source"], v["target"]) + args.extend(["-v", v]) # append capabilities from the compose conf - for c in service.get('cap_add', []): - args.extend([f'--cap-add={c}']) + for c in service.get("cap_add", []): + args.extend([f"--cap-add={c}"]) # infer whether an interactive shell is desired or not - if command in ['cmd.exe', 'bash', 'sh', 'powershell']: - args.append('-it') + if command in ["cmd.exe", "bash", "sh", "powershell"]: + args.append("-it") if resource_limit: limits = self.config.limit_presets.get(resource_limit) if not limits: raise ValueError( - f"Unknown resource limit preset '{resource_limit}'") - cpuset = limits.get('cpuset_cpus', []) + f"Unknown resource limit preset '{resource_limit}'" + ) + cpuset = limits.get("cpuset_cpus", []) if cpuset: - args.append(f'--cpuset-cpus={",".join(map(str, cpuset))}') - memory = limits.get('memory') + args.append(f"--cpuset-cpus={','.join(map(str, cpuset))}") + memory = limits.get("memory") if memory: - args.append(f'--memory={memory}') - args.append(f'--memory-swap={memory}') + args.append(f"--memory={memory}") + args.append(f"--memory-swap={memory}") if user is not None: - args.extend(['-u', user]) + args.extend(["-u", user]) if env is not None: for k, v in env.items(): - args.extend(['-e', '{}={}'.format(k, v)]) + args.extend(["-e", f"{k}={v}"]) if volumes is not None: for volume in volumes: - args.extend(['--volume', volume]) + args.extend(["--volume", volume]) if use_docker: # get the actual docker image name instead of the compose service # name which we refer as image in general - args.append(service['image']) + args.append(service["image"]) # add command from compose if it wasn't overridden if command is not None: args.append(command) else: - cmd = service.get('command', '') + cmd = service.get("command", "") if cmd: # service command might be already defined as a list # in docker-compose.yml. 
@@ -421,55 +443,52 @@ def run(self, service_name, command=None, *, env=None, volumes=None, args.extend(shlex.split(cmd)) # execute as a plain docker cli command - self._execute_docker('run', '--rm', *args) + self._execute_docker("run", "--rm", *args) else: # execute as a docker compose command args.append(service_name) if command is not None: args.append(command) - self._execute_compose('run', '--rm', *args) + self._execute_compose("run", "--rm", *args) def push(self, service_name, user=None, password=None): def _push(service): - args = ['push'] + self.config.verbosity_args() + args = ["push"] + self.config.verbosity_args() if self.config.using_docker: - return self._execute_docker(*args, service['image']) + return self._execute_docker(*args, service["image"]) else: - return self._execute_compose(*args, service['name']) + return self._execute_compose(*args, service["name"]) if user is not None: try: # TODO(kszucs): have an option for a prompt - self._execute_docker('login', '-u', user, '-p', password) + self._execute_docker("login", "-u", user, "-p", password) except subprocess.CalledProcessError: # hide credentials - msg = ('Failed to push `{}`, check the passed credentials' - .format(service_name)) + msg = f"Failed to push `{service_name}`, check the passed credentials" raise RuntimeError(msg) from None service = self.config.get(service_name) - for ancestor in service['ancestors']: + for ancestor in service["ancestors"]: _push(self.config.get(ancestor)) _push(service) def images(self): return sorted(self.config.hierarchy.keys()) - def info(self, key_name, filters=None, prefix=' '): + def info(self, key_name, filters=None, prefix=" "): output = [] for key, value in key_name.items(): - if hasattr(value, 'items'): + if hasattr(value, "items"): temp_filters = filters if key == filters or filters is None: - output.append(f'{prefix} {key}') + output.append(f"{prefix} {key}") # Keep showing this specific key # as parent matched filter temp_filters = None output.extend(self.info(value, temp_filters, prefix + " ")) - else: - if key == filters or filters is None: - output.append( - f'{prefix} {key}: ' + - f'{value if value is not None else "<inherited>"}' - ) + elif key == filters or filters is None: + output.append( + f"{prefix} {key}: {value if value is not None else '<inherited>'}" + ) return output diff --git a/dev/archery/archery/docker/tests/test_docker.py b/dev/archery/archery/docker/tests/test_docker.py index 432d1c0a35202..6dfc2a7e79397 100644 --- a/dev/archery/archery/docker/tests/test_docker.py +++ b/dev/archery/archery/docker/tests/test_docker.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+from __future__ import annotations import collections import os @@ -24,8 +25,7 @@ import pytest from archery.docker import DockerCompose -from archery.testing import assert_subprocess_calls, override_env, PartialEnv - +from archery.testing import PartialEnv, assert_subprocess_calls, override_env missing_service_compose_yml = """ version: '3.5' @@ -182,24 +182,24 @@ """ arrow_compose_env = { - 'UBUNTU': '20.04', # overridden below - 'PYTHON': '3.8', - 'PANDAS': 'latest', - 'DASK': 'latest', # overridden below + "UBUNTU": "20.04", # overridden below + "PYTHON": "3.8", + "PANDAS": "latest", + "DASK": "latest", # overridden below } def create_config(directory, yml_content, env_content=None): - env_path = directory / '.env' - config_path = directory / 'docker-compose.yml' + env_path = directory / ".env" + config_path = directory / "docker-compose.yml" - with config_path.open('w') as fp: + with config_path.open("w") as fp: fp.write(yml_content) if env_content is not None: - with env_path.open('w') as fp: + with env_path.open("w") as fp: for k, v in env_content.items(): - fp.write("{}={}\n".format(k, v)) + fp.write(f"{k}={v}\n") return config_path @@ -241,7 +241,7 @@ def test_config_validation(tmpdir): def assert_docker_calls(compose, expected_args): - base_command = ['docker'] + base_command = ["docker"] expected_commands = [] for args in expected_args: if isinstance(args, str): @@ -251,7 +251,7 @@ def assert_docker_calls(compose, expected_args): def assert_compose_calls(compose, expected_args, env=mock.ANY): - base_command = ['docker', 'compose', f'--file={compose.config.path}'] + base_command = ["docker", "compose", f"--file={compose.config.path}"] expected_commands = [] for args in expected_args: if isinstance(args, str): @@ -265,53 +265,41 @@ def test_arrow_example_validation_passes(arrow_compose_path): def test_compose_default_params_and_env(arrow_compose_path): - compose = DockerCompose(arrow_compose_path, params=dict( - UBUNTU='18.04', - DASK='upstream_devel' - )) + compose = DockerCompose( + arrow_compose_path, params=dict(UBUNTU="18.04", DASK="upstream_devel") + ) assert compose.config.dotenv == arrow_compose_env - assert compose.config.params == { - 'UBUNTU': '18.04', - 'DASK': 'upstream_devel', - } + assert compose.config.params == {"UBUNTU": "18.04", "DASK": "upstream_devel"} def test_forwarding_env_variables(arrow_compose_path): - expected_calls = [ - "pull --ignore-pull-failures conda-cpp", - "build conda-cpp", - ] - expected_env = PartialEnv( - MY_CUSTOM_VAR_A='a', - MY_CUSTOM_VAR_B='b' - ) - with override_env({'MY_CUSTOM_VAR_A': 'a', 'MY_CUSTOM_VAR_B': 'b'}): + expected_calls = ["pull --ignore-pull-failures conda-cpp", "build conda-cpp"] + expected_env = PartialEnv(MY_CUSTOM_VAR_A="a", MY_CUSTOM_VAR_B="b") + with override_env({"MY_CUSTOM_VAR_A": "a", "MY_CUSTOM_VAR_B": "b"}): compose = DockerCompose(arrow_compose_path) with assert_compose_calls(compose, expected_calls, env=expected_env): - assert os.environ['MY_CUSTOM_VAR_A'] == 'a' - assert os.environ['MY_CUSTOM_VAR_B'] == 'b' - compose.pull('conda-cpp') - compose.build('conda-cpp') + assert os.environ["MY_CUSTOM_VAR_A"] == "a" + assert os.environ["MY_CUSTOM_VAR_B"] == "b" + compose.pull("conda-cpp") + compose.build("conda-cpp") def test_compose_pull(arrow_compose_path, monkeypatch): compose = DockerCompose(arrow_compose_path) - expected_calls = [ - "pull --ignore-pull-failures conda-cpp", - ] + expected_calls = ["pull --ignore-pull-failures conda-cpp"] with assert_compose_calls(compose, expected_calls): 
compose.clear_pull_memory() - compose.pull('conda-cpp') + compose.pull("conda-cpp") expected_calls = [ "pull --ignore-pull-failures conda-cpp", "pull --ignore-pull-failures conda-python", - "pull --ignore-pull-failures conda-python-pandas" + "pull --ignore-pull-failures conda-python-pandas", ] with assert_compose_calls(compose, expected_calls): compose.clear_pull_memory() - compose.pull('conda-python-pandas') + compose.pull("conda-python-pandas") expected_calls = [ "pull --ignore-pull-failures conda-cpp", @@ -319,17 +307,15 @@ def test_compose_pull(arrow_compose_path, monkeypatch): ] with assert_compose_calls(compose, expected_calls): compose.clear_pull_memory() - compose.pull('conda-python-pandas', pull_leaf=False) + compose.pull("conda-python-pandas", pull_leaf=False) with monkeypatch.context() as m: # `--quiet` is passed to `docker` on CI m.setenv("GITHUB_ACTIONS", "true") - expected_calls = [ - "pull --quiet --ignore-pull-failures conda-cpp", - ] + expected_calls = ["pull --quiet --ignore-pull-failures conda-cpp"] with assert_compose_calls(compose, expected_calls): compose.clear_pull_memory() - compose.pull('conda-cpp') + compose.pull("conda-cpp") def test_compose_pull_params(arrow_compose_path): @@ -337,35 +323,31 @@ def test_compose_pull_params(arrow_compose_path): "pull --ignore-pull-failures conda-cpp", "pull --ignore-pull-failures conda-python", ] - compose = DockerCompose(arrow_compose_path, params=dict(UBUNTU='18.04')) - expected_env = PartialEnv(PYTHON='3.8', PANDAS='latest') + compose = DockerCompose(arrow_compose_path, params=dict(UBUNTU="18.04")) + expected_env = PartialEnv(PYTHON="3.8", PANDAS="latest") with assert_compose_calls(compose, expected_calls, env=expected_env): compose.clear_pull_memory() - compose.pull('conda-python-pandas', pull_leaf=False) + compose.pull("conda-python-pandas", pull_leaf=False) def test_compose_build(arrow_compose_path): compose = DockerCompose(arrow_compose_path) - expected_calls = [ - "build conda-cpp", - ] + expected_calls = ["build conda-cpp"] with assert_compose_calls(compose, expected_calls): - compose.build('conda-cpp') + compose.build("conda-cpp") - expected_calls = [ - "build --no-cache conda-cpp" - ] + expected_calls = ["build --no-cache conda-cpp"] with assert_compose_calls(compose, expected_calls): - compose.build('conda-cpp', use_cache=False) + compose.build("conda-cpp", use_cache=False) expected_calls = [ "build conda-cpp", "build conda-python", - "build conda-python-pandas" + "build conda-python-pandas", ] with assert_compose_calls(compose, expected_calls): - compose.build('conda-python-pandas') + compose.build("conda-python-pandas") expected_calls = [ "build --no-cache conda-cpp", @@ -373,7 +355,7 @@ def test_compose_build(arrow_compose_path): "build --no-cache conda-python-pandas", ] with assert_compose_calls(compose, expected_calls): - compose.build('conda-python-pandas', use_cache=False) + compose.build("conda-python-pandas", use_cache=False) expected_calls = [ "build conda-cpp", @@ -381,142 +363,128 @@ def test_compose_build(arrow_compose_path): "build --no-cache conda-python-pandas", ] with assert_compose_calls(compose, expected_calls): - compose.build('conda-python-pandas', use_cache=True, - use_leaf_cache=False) + compose.build("conda-python-pandas", use_cache=True, use_leaf_cache=False) @mock.patch.dict(os.environ, {"BUILDKIT_INLINE_CACHE": "1"}) def test_compose_buildkit_inline_cache(arrow_compose_path): compose = DockerCompose(arrow_compose_path) - expected_calls = [ - "build --build-arg BUILDKIT_INLINE_CACHE=1 
conda-cpp", - ] + expected_calls = ["build --build-arg BUILDKIT_INLINE_CACHE=1 conda-cpp"] with assert_compose_calls(compose, expected_calls): - compose.build('conda-cpp') + compose.build("conda-cpp") def test_compose_build_params(arrow_compose_path): - expected_calls = [ - "build ubuntu-cpp", - ] + expected_calls = ["build ubuntu-cpp"] - compose = DockerCompose(arrow_compose_path, params=dict(UBUNTU='18.04')) + compose = DockerCompose(arrow_compose_path, params=dict(UBUNTU="18.04")) expected_env = PartialEnv(UBUNTU="18.04") with assert_compose_calls(compose, expected_calls, env=expected_env): - compose.build('ubuntu-cpp') + compose.build("ubuntu-cpp") - compose = DockerCompose(arrow_compose_path, params=dict(UBUNTU='16.04')) + compose = DockerCompose(arrow_compose_path, params=dict(UBUNTU="16.04")) expected_env = PartialEnv(UBUNTU="16.04") with assert_compose_calls(compose, expected_calls, env=expected_env): - compose.build('ubuntu-cpp') + compose.build("ubuntu-cpp") expected_calls = [ "build --no-cache conda-cpp", "build --no-cache conda-python", "build --no-cache conda-python-pandas", ] - compose = DockerCompose(arrow_compose_path, params=dict(UBUNTU='18.04')) - expected_env = PartialEnv(PYTHON='3.8', PANDAS='latest') + compose = DockerCompose(arrow_compose_path, params=dict(UBUNTU="18.04")) + expected_env = PartialEnv(PYTHON="3.8", PANDAS="latest") with assert_compose_calls(compose, expected_calls, env=expected_env): - compose.build('conda-python-pandas', use_cache=False) + compose.build("conda-python-pandas", use_cache=False) def test_compose_run(arrow_compose_path): - expected_calls = [ - format_run("conda-cpp"), - ] + expected_calls = [format_run("conda-cpp")] compose = DockerCompose(arrow_compose_path) with assert_compose_calls(compose, expected_calls): - compose.run('conda-cpp') + compose.run("conda-cpp") - expected_calls = [ - format_run("conda-python") - ] - expected_env = PartialEnv(PYTHON='3.8') + expected_calls = [format_run("conda-python")] + expected_env = PartialEnv(PYTHON="3.8") with assert_compose_calls(compose, expected_calls, env=expected_env): - compose.run('conda-python') + compose.run("conda-python") - compose = DockerCompose(arrow_compose_path, params=dict(PYTHON='3.9')) - expected_env = PartialEnv(PYTHON='3.9') + compose = DockerCompose(arrow_compose_path, params=dict(PYTHON="3.9")) + expected_env = PartialEnv(PYTHON="3.9") with assert_compose_calls(compose, expected_calls, env=expected_env): - compose.run('conda-python') + compose.run("conda-python") - compose = DockerCompose(arrow_compose_path, params=dict(PYTHON='3.9')) + compose = DockerCompose(arrow_compose_path, params=dict(PYTHON="3.9")) for command in ["bash", "echo 1"]: - expected_calls = [ - format_run(["conda-python", command]), - ] - expected_env = PartialEnv(PYTHON='3.9') + expected_calls = [format_run(["conda-python", command])] + expected_env = PartialEnv(PYTHON="3.9") with assert_compose_calls(compose, expected_calls, env=expected_env): - compose.run('conda-python', command) + compose.run("conda-python", command) expected_calls = [ - ( - format_run("-e CONTAINER_ENV_VAR_A=a -e CONTAINER_ENV_VAR_B=b " - "conda-python") - ) + (format_run("-e CONTAINER_ENV_VAR_A=a -e CONTAINER_ENV_VAR_B=b conda-python")) ] compose = DockerCompose(arrow_compose_path) - expected_env = PartialEnv(PYTHON='3.8') + expected_env = PartialEnv(PYTHON="3.8") with assert_compose_calls(compose, expected_calls, env=expected_env): - env = collections.OrderedDict([ - ("CONTAINER_ENV_VAR_A", "a"), - ("CONTAINER_ENV_VAR_B", "b") - ]) 
- compose.run('conda-python', env=env) + env = collections.OrderedDict( + [("CONTAINER_ENV_VAR_A", "a"), ("CONTAINER_ENV_VAR_B", "b")] + ) + compose.run("conda-python", env=env) expected_calls = [ ( - format_run("--volume /host/build:/build --volume " - "/host/ccache:/ccache:delegated conda-python") + format_run( + "--volume /host/build:/build --volume " + "/host/ccache:/ccache:delegated conda-python" + ) ) ] compose = DockerCompose(arrow_compose_path) with assert_compose_calls(compose, expected_calls): volumes = ("/host/build:/build", "/host/ccache:/ccache:delegated") - compose.run('conda-python', volumes=volumes) + compose.run("conda-python", volumes=volumes) def test_compose_run_with_resource_limits(arrow_compose_path): expected_calls = [ - format_run([ - "--cpuset-cpus=0,1", - "--memory=7g", - "--memory-swap=7g", - "org/conda-cpp" - ]), + format_run( + ["--cpuset-cpus=0,1", "--memory=7g", "--memory-swap=7g", "org/conda-cpp"] + ) ] compose = DockerCompose(arrow_compose_path) with assert_docker_calls(compose, expected_calls): - compose.run('conda-cpp', resource_limit="github") + compose.run("conda-cpp", resource_limit="github") def test_compose_push(arrow_compose_path): - compose = DockerCompose(arrow_compose_path, params=dict(PYTHON='3.9')) + compose = DockerCompose(arrow_compose_path, params=dict(PYTHON="3.9")) expected_env = PartialEnv(PYTHON="3.9") expected_calls = [ - mock.call(["docker", "login", "-u", "user", "-p", "pass"], check=True), + mock.call(["docker", "login", "-u", "user", "-p", "pass"], check=True) ] for image in ["conda-cpp", "conda-python", "conda-python-pandas"]: expected_calls.append( - mock.call(["docker", "compose", f"--file={compose.config.path}", - "push", image], check=True, env=expected_env) + mock.call( + ["docker", "compose", f"--file={compose.config.path}", "push", image], + check=True, + env=expected_env, + ) ) with assert_subprocess_calls(expected_calls): - compose.push('conda-python-pandas', user='user', password='pass') + compose.push("conda-python-pandas", user="user", password="pass") # noqa: S106 def test_compose_error(arrow_compose_path): - compose = DockerCompose(arrow_compose_path, params=dict( - PYTHON='3.8', - PANDAS='upstream_devel' - )) + compose = DockerCompose( + arrow_compose_path, params=dict(PYTHON="3.8", PANDAS="upstream_devel") + ) error = subprocess.CalledProcessError(99, []) - with mock.patch('subprocess.run', side_effect=error): + with mock.patch("subprocess.run", side_effect=error): with pytest.raises(RuntimeError) as exc: - compose.run('conda-cpp') + compose.run("conda-cpp") exception_message = str(exc.value) assert "exited with a non-zero exit code 99" in exception_message @@ -529,30 +497,38 @@ def test_image_with_gpu(arrow_compose_path): expected_calls = [ [ - "run", "--rm", "--gpus", "all", - "-e", "CUDA_ENV=1", - "-e", "OTHER_ENV=2", - "-v", "/host:/container", + "run", + "--rm", + "--gpus", + "all", + "-e", + "CUDA_ENV=1", + "-e", + "OTHER_ENV=2", + "-v", + "/host:/container", "org/ubuntu-cuda", - "/bin/bash", "-c", "echo 1 > /tmp/dummy && cat /tmp/dummy", + "/bin/bash", + "-c", + "echo 1 > /tmp/dummy && cat /tmp/dummy", ] ] with assert_docker_calls(compose, expected_calls): - compose.run('ubuntu-cuda') + compose.run("ubuntu-cuda") def test_listing_images(arrow_compose_path): compose = DockerCompose(arrow_compose_path) assert sorted(compose.images()) == [ - 'conda-cpp', - 'conda-python', - 'conda-python-dask', - 'conda-python-pandas', - 'ubuntu-c-glib', - 'ubuntu-cpp', - 'ubuntu-cpp-cmake32', - 'ubuntu-cuda', - 'ubuntu-ruby', + 
"conda-cpp", + "conda-python", + "conda-python-dask", + "conda-python-pandas", + "ubuntu-c-glib", + "ubuntu-cpp", + "ubuntu-cpp-cmake32", + "ubuntu-cuda", + "ubuntu-ruby", ] @@ -563,7 +539,7 @@ def test_service_info(arrow_compose_path): " image: org/conda-cpp", " build", " context: .", - " dockerfile: ci/docker/conda-cpp.dockerfile" + " dockerfile: ci/docker/conda-cpp.dockerfile", ] @@ -588,5 +564,5 @@ def test_service_info_inherited_env(arrow_compose_path): " environment", " AWS_ACCESS_KEY_ID: <inherited>", " AWS_SECRET_ACCESS_KEY: <inherited>", - " SCCACHE_BUCKET: <inherited>" + " SCCACHE_BUCKET: <inherited>", ] diff --git a/dev/archery/archery/docker/tests/test_docker_cli.py b/dev/archery/archery/docker/tests/test_docker_cli.py index c117a3edfff65..e2d4c62123010 100644 --- a/dev/archery/archery/docker/tests/test_docker_cli.py +++ b/dev/archery/archery/docker/tests/test_docker_cli.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations from unittest.mock import patch @@ -32,21 +33,10 @@ def test_docker_run_with_custom_command(run, build, pull): result = CliRunner().invoke(docker, args) assert result.exit_code == 0 - pull.assert_called_once_with( - "ubuntu-cpp", pull_leaf=True, - ) - build.assert_called_once_with( - "ubuntu-cpp", - use_cache=True, - use_leaf_cache=True, - ) + pull.assert_called_once_with("ubuntu-cpp", pull_leaf=True) + build.assert_called_once_with("ubuntu-cpp", use_cache=True, use_leaf_cache=True) run.assert_called_once_with( - "ubuntu-cpp", - command="bash", - env={}, - resource_limit=None, - user=None, - volumes=(), + "ubuntu-cpp", command="bash", env={}, resource_limit=None, user=None, volumes=() ) @@ -71,24 +61,15 @@ def test_docker_run_options(run, build, pull): ] result = CliRunner().invoke(docker, args) assert result.exit_code == 0 - pull.assert_called_once_with( - "ubuntu-cpp", pull_leaf=True, - ) - build.assert_called_once_with( - "ubuntu-cpp", - use_cache=True, - use_leaf_cache=True, - ) + pull.assert_called_once_with("ubuntu-cpp", pull_leaf=True) + build.assert_called_once_with("ubuntu-cpp", use_cache=True, use_leaf_cache=True) run.assert_called_once_with( "ubuntu-cpp", command=None, env={"ARROW_GANDIVA": "OFF", "ARROW_FLIGHT": "ON"}, resource_limit=None, user="root", - volumes=( - "./build:/build", - "./ccache:/ccache:delegated", - ), + volumes=("./build:/build", "./ccache:/ccache:delegated"), ) @@ -120,10 +101,7 @@ def test_docker_limit_options(run): env={"ARROW_GANDIVA": "OFF", "ARROW_FLIGHT": "ON"}, resource_limit="github", user="root", - volumes=( - "./build:/build", - "./ccache:/ccache:delegated", - ), + volumes=("./build:/build", "./ccache:/ccache:delegated"), ) @@ -133,12 +111,7 @@ def test_docker_run_without_pulling_or_building(run): result = CliRunner().invoke(docker, args) assert result.exit_code == 0 run.assert_called_once_with( - "ubuntu-cpp", - command=None, - env={}, - resource_limit=None, - user=None, - volumes=(), + "ubuntu-cpp", command=None, env={}, resource_limit=None, user=None, volumes=() ) @@ -148,14 +121,8 @@ def test_docker_run_only_pulling_and_building(build, pull): args = ["run", "ubuntu-cpp", "--build-only"] result = CliRunner().invoke(docker, args) assert result.exit_code == 0 - pull.assert_called_once_with( - "ubuntu-cpp", pull_leaf=True, - ) - build.assert_called_once_with( - "ubuntu-cpp", - use_cache=True, - use_leaf_cache=True, - ) + pull.assert_called_once_with("ubuntu-cpp", pull_leaf=True) 
+ build.assert_called_once_with("ubuntu-cpp", use_cache=True, use_leaf_cache=True) @patch.object(DockerCompose, "build") @@ -173,16 +140,7 @@ def test_docker_run_without_build_cache(run, build): ] result = CliRunner().invoke(docker, args) assert result.exit_code == 0 - build.assert_called_once_with( - "ubuntu-cpp", - use_cache=False, - use_leaf_cache=False, - ) + build.assert_called_once_with("ubuntu-cpp", use_cache=False, use_leaf_cache=False) run.assert_called_once_with( - "ubuntu-cpp", - command=None, - env={}, - resource_limit=None, - user="me", - volumes=(), + "ubuntu-cpp", command=None, env={}, resource_limit=None, user="me", volumes=() ) diff --git a/dev/archery/archery/integration/cdata.py b/dev/archery/archery/integration/cdata.py index a5dbbe29d8aba..18e26e3ba982a 100644 --- a/dev/archery/archery/integration/cdata.py +++ b/dev/archery/archery/integration/cdata.py @@ -14,15 +14,18 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations -import cffi -from contextlib import contextmanager import functools import os import sys +from contextlib import contextmanager +from typing import TYPE_CHECKING -from .tester import CDataExporter, CDataImporter +import cffi +if TYPE_CHECKING: + from .tester import CDataExporter, CDataImporter if sys.platform == "darwin": dll_suffix = ".dylib" @@ -81,9 +84,7 @@ @functools.lru_cache def ffi() -> cffi.FFI: - """ - Return a FFI object supporting C Data Interface types. - """ + """Return a FFI object supporting C Data Interface types.""" ffi = cffi.FFI() ffi.cdef(_c_data_decls) return ffi @@ -91,7 +92,7 @@ def ffi() -> cffi.FFI: def _release_memory_steps(exporter: CDataExporter, importer: CDataImporter): yield - for i in range(max(exporter.required_gc_runs, importer.required_gc_runs)): + for _ in range(max(exporter.required_gc_runs, importer.required_gc_runs)): importer.run_gc() yield exporter.run_gc() @@ -100,8 +101,7 @@ def _release_memory_steps(exporter: CDataExporter, importer: CDataImporter): @contextmanager def check_memory_released(exporter: CDataExporter, importer: CDataImporter): - """ - A context manager for memory release checks. + """A context manager for memory release checks. The context manager arranges cooperation between the exporter and importer to try and release memory at the end of the enclosed block. @@ -109,8 +109,7 @@ def check_memory_released(exporter: CDataExporter, importer: CDataImporter): However, if either the exporter or importer doesn't support deterministic memory release, no memory check is performed. """ - do_check = (exporter.supports_releasing_memory and - importer.supports_releasing_memory) + do_check = exporter.supports_releasing_memory and importer.supports_releasing_memory if do_check: before = exporter.record_allocation_state() yield @@ -123,4 +122,5 @@ def check_memory_released(exporter: CDataExporter, importer: CDataImporter): if after != before: raise RuntimeError( f"Memory was not released correctly after roundtrip: " - f"before = {before}, after = {after} (should have been equal)") + f"before = {before}, after = {after} (should have been equal)" + ) diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 027e675792dbe..0a9b300427fb7 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -14,26 +14,33 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations -from collections import namedtuple, OrderedDict import binascii import json import os import random import tempfile +from collections import OrderedDict, namedtuple import numpy as np -from .util import frombytes, tobytes, random_bytes, random_utf8 -from .util import SKIP_C_SCHEMA, SKIP_C_ARRAY, SKIP_FLIGHT +from .util import ( + SKIP_C_ARRAY, + SKIP_C_SCHEMA, + SKIP_FLIGHT, + frombytes, + random_bytes, + random_utf8, + tobytes, +) def metadata_key_values(pairs): - return [{'key': k, 'value': v} for k, v in pairs] + return [{"key": k, "value": v} for k, v in pairs] -class Field(object): - +class Field: def __init__(self, name, *, nullable=True, metadata=None): self.name = name self.nullable = nullable @@ -41,18 +48,18 @@ def __init__(self, name, *, nullable=True, metadata=None): def get_json(self): entries = [ - ('name', self.name), - ('type', self._get_type()), - ('nullable', self.nullable), - ('children', self._get_children()), + ("name", self.name), + ("type", self._get_type()), + ("nullable", self.nullable), + ("children", self._get_children()), ] dct = self._get_dictionary() if dct: - entries.append(('dictionary', dct)) + entries.append(("dictionary", dct)) if self.metadata is not None and len(self.metadata) > 0: - entries.append(('metadata', metadata_key_values(self.metadata))) + entries.append(("metadata", metadata_key_values(self.metadata))) return OrderedDict(entries) @@ -61,14 +68,12 @@ def _get_dictionary(self): def _make_is_valid(self, size, null_probability=0.4): if self.nullable: - return (np.random.random_sample(size) > null_probability - ).astype(np.int8) + return (np.random.random_sample(size) > null_probability).astype(np.int8) else: return np.ones(size, dtype=np.int8) -class Column(object): - +class Column: def __init__(self, name, count): self.name = name self.count = count @@ -83,29 +88,24 @@ def _get_buffers(self): return [] def get_json(self): - entries = [ - ('name', self.name), - ('count', self.count) - ] + entries = [("name", self.name), ("count", self.count)] buffers = self._get_buffers() entries.extend(buffers) children = self._get_children() if len(children) > 0: - entries.append(('children', children)) + entries.append(("children", children)) return OrderedDict(entries) class PrimitiveField(Field): - def _get_children(self): return [] class PrimitiveColumn(Column): - def __init__(self, name, count, is_valid, values): super().__init__(name, count) self.is_valid = is_valid @@ -116,8 +116,8 @@ def _encode_value(self, x): def _get_buffers(self): return [ - ('VALIDITY', [int(v) for v in self.is_valid]), - ('DATA', list([self._encode_value(x) for x in self.values])) + ("VALIDITY", [int(v) for v in self.is_valid]), + ("DATA", [self._encode_value(x) for x in self.values]), ] @@ -127,30 +127,33 @@ class NullColumn(Column): class NullField(PrimitiveField): - def __init__(self, name, metadata=None): - super().__init__(name, nullable=True, - metadata=metadata) + super().__init__(name, nullable=True, metadata=metadata) def _get_type(self): - return OrderedDict([('name', 'null')]) + return OrderedDict([("name", "null")]) def generate_column(self, size, name=None): return NullColumn(name or self.name, size) -TEST_INT_MAX = 2 ** 31 - 1 +TEST_INT_MAX = 2**31 - 1 TEST_INT_MIN = ~TEST_INT_MAX class IntegerField(PrimitiveField): - - def __init__(self, name, is_signed, bit_width, *, nullable=True, - metadata=None, - min_value=TEST_INT_MIN, - 
max_value=TEST_INT_MAX): - super().__init__(name, nullable=nullable, - metadata=metadata) + def __init__( + self, + name, + is_signed, + bit_width, + *, + nullable=True, + metadata=None, + min_value=TEST_INT_MIN, + max_value=TEST_INT_MAX, + ): + super().__init__(name, nullable=nullable, metadata=metadata) self.is_signed = is_signed self.bit_width = bit_width self.min_value = min_value @@ -158,10 +161,10 @@ def __init__(self, name, is_signed, bit_width, *, nullable=True, def _get_generated_data_bounds(self): if self.is_signed: - signed_iinfo = np.iinfo('int' + str(self.bit_width)) + signed_iinfo = np.iinfo("int" + str(self.bit_width)) min_value, max_value = signed_iinfo.min, signed_iinfo.max else: - unsigned_iinfo = np.iinfo('uint' + str(self.bit_width)) + unsigned_iinfo = np.iinfo("uint" + str(self.bit_width)) min_value, max_value = 0, unsigned_iinfo.max lower_bound = max(min_value, self.min_value) @@ -169,19 +172,21 @@ def _get_generated_data_bounds(self): return lower_bound, upper_bound def _get_type(self): - return OrderedDict([ - ('name', 'int'), - ('isSigned', self.is_signed), - ('bitWidth', self.bit_width) - ]) + return OrderedDict( + [ + ("name", "int"), + ("isSigned", self.is_signed), + ("bitWidth", self.bit_width), + ] + ) def generate_column(self, size, name=None): lower_bound, upper_bound = self._get_generated_data_bounds() - return self.generate_range(size, lower_bound, upper_bound, - name=name, include_extremes=True) + return self.generate_range( + size, lower_bound, upper_bound, name=name, include_extremes=True + ) - def generate_range(self, size, lower, upper, name=None, - include_extremes=False): + def generate_range(self, size, lower, upper, name=None, include_extremes=False): values = np.random.randint(lower, upper, size=size, dtype=np.int64) if include_extremes and size >= 2: values[:2] = [lower, upper] @@ -199,11 +204,16 @@ def generate_range(self, size, lower, upper, name=None, class RunEndsField(IntegerField): # bit_width should only be one of 16/32/64 def __init__(self, name, bit_width, *, metadata=None): - super().__init__(name, is_signed=True, bit_width=bit_width, - nullable=False, metadata=metadata, min_value=1) + super().__init__( + name, + is_signed=True, + bit_width=bit_width, + nullable=False, + metadata=metadata, + min_value=1, + ) - def generate_range(self, size, lower, upper, name=None, - include_extremes=False): + def generate_range(self, size, lower, upper, name=None, include_extremes=False): rng = np.random.default_rng() # generate values that are strictly increasing with a min-value of # 1, but don't go higher than the max signed value for the given @@ -224,35 +234,36 @@ def generate_range(self, size, lower, upper, name=None, class DateField(IntegerField): - DAY = 0 MILLISECOND = 1 # 1/1/1 to 12/31/9999 - _ranges = { - DAY: [-719162, 2932896], - MILLISECOND: [-62135596800000, 253402214400000] - } + _ranges = {DAY: [-719162, 2932896], MILLISECOND: [-62135596800000, 253402214400000]} def __init__(self, name, unit, *, nullable=True, metadata=None): bit_width = 32 if unit == self.DAY else 64 min_value, max_value = self._ranges[unit] super().__init__( - name, True, bit_width, - nullable=nullable, metadata=metadata, - min_value=min_value, max_value=max_value + name, + True, + bit_width, + nullable=nullable, + metadata=metadata, + min_value=min_value, + max_value=max_value, ) self.unit = unit def _get_type(self): - return OrderedDict([ - ('name', 'date'), - ('unit', 'DAY' if self.unit == self.DAY else 'MILLISECOND') - ]) + return OrderedDict( + [ + ("name", 
"date"), + ("unit", "DAY" if self.unit == self.DAY else "MILLISECOND"), + ] + ) - def generate_range(self, size, lower, upper, name=None, - include_extremes=False): + def generate_range(self, size, lower, upper, name=None, include_extremes=False): if self.unit == self.DAY: return super().generate_range(size, lower, upper, name) @@ -260,8 +271,10 @@ def generate_range(self, size, lower, upper, name=None, lower = -1 * (abs(lower) // full_day_millis) upper //= full_day_millis - values = [val * full_day_millis for val in np.random.randint( - lower, upper, size=size, dtype=np.int64)] + values = [ + val * full_day_millis + for val in np.random.randint(lower, upper, size=size, dtype=np.int64) + ] lower *= full_day_millis upper *= full_day_millis @@ -277,144 +290,142 @@ def generate_range(self, size, lower, upper, name=None, TIMEUNIT_NAMES = { - 's': 'SECOND', - 'ms': 'MILLISECOND', - 'us': 'MICROSECOND', - 'ns': 'NANOSECOND' + "s": "SECOND", + "ms": "MILLISECOND", + "us": "MICROSECOND", + "ns": "NANOSECOND", } class TimeField(IntegerField): - - BIT_WIDTHS = { - 's': 32, - 'ms': 32, - 'us': 64, - 'ns': 64 - } + BIT_WIDTHS = {"s": 32, "ms": 32, "us": 64, "ns": 64} _ranges = { - 's': [0, 86400], - 'ms': [0, 86400000], - 'us': [0, 86400000000], - 'ns': [0, 86400000000000] + "s": [0, 86400], + "ms": [0, 86400000], + "us": [0, 86400000000], + "ns": [0, 86400000000000], } - def __init__(self, name, unit='s', *, nullable=True, - metadata=None): + def __init__(self, name, unit="s", *, nullable=True, metadata=None): min_val, max_val = self._ranges[unit] - super().__init__(name, True, self.BIT_WIDTHS[unit], - nullable=nullable, metadata=metadata, - min_value=min_val, max_value=max_val) + super().__init__( + name, + True, + self.BIT_WIDTHS[unit], + nullable=nullable, + metadata=metadata, + min_value=min_val, + max_value=max_val, + ) self.unit = unit def _get_type(self): - return OrderedDict([ - ('name', 'time'), - ('unit', TIMEUNIT_NAMES[self.unit]), - ('bitWidth', self.bit_width) - ]) + return OrderedDict( + [ + ("name", "time"), + ("unit", TIMEUNIT_NAMES[self.unit]), + ("bitWidth", self.bit_width), + ] + ) def generate_column(self, size, name=None): lower_bound, upper_bound = self._get_generated_data_bounds() - return self.generate_range(size, lower_bound, upper_bound, - name=name) + return self.generate_range(size, lower_bound, upper_bound, name=name) class TimestampField(IntegerField): - # 1/1/1 to 12/31/9999 _ranges = { - 's': [-62135596800, 253402214400], - 'ms': [-62135596800000, 253402214400000], - 'us': [-62135596800000000, 253402214400000000], - + "s": [-62135596800, 253402214400], + "ms": [-62135596800000, 253402214400000], + "us": [-62135596800000000, 253402214400000000], # Physical range for int64, ~584 years and change - 'ns': [np.iinfo('int64').min, np.iinfo('int64').max] + "ns": [np.iinfo("int64").min, np.iinfo("int64").max], } - def __init__(self, name, unit='s', tz=None, *, nullable=True, - metadata=None): + def __init__(self, name, unit="s", tz=None, *, nullable=True, metadata=None): min_val, max_val = self._ranges[unit] - super().__init__(name, True, 64, - nullable=nullable, - metadata=metadata, - min_value=min_val, - max_value=max_val) + super().__init__( + name, + True, + 64, + nullable=nullable, + metadata=metadata, + min_value=min_val, + max_value=max_val, + ) self.unit = unit self.tz = tz def _get_type(self): - fields = [ - ('name', 'timestamp'), - ('unit', TIMEUNIT_NAMES[self.unit]) - ] + fields = [("name", "timestamp"), ("unit", TIMEUNIT_NAMES[self.unit])] if self.tz is not None: - 
fields.append(('timezone', self.tz)) + fields.append(("timezone", self.tz)) return OrderedDict(fields) class DurationIntervalField(IntegerField): - - def __init__(self, name, unit='s', *, nullable=True, - metadata=None): - min_val, max_val = np.iinfo('int64').min, np.iinfo('int64').max, + def __init__(self, name, unit="s", *, nullable=True, metadata=None): + min_val, max_val = (np.iinfo("int64").min, np.iinfo("int64").max) super().__init__( - name, True, 64, - nullable=nullable, metadata=metadata, - min_value=min_val, max_value=max_val) + name, + True, + 64, + nullable=nullable, + metadata=metadata, + min_value=min_val, + max_value=max_val, + ) self.unit = unit def _get_type(self): - fields = [ - ('name', 'duration'), - ('unit', TIMEUNIT_NAMES[self.unit]) - ] + fields = [("name", "duration"), ("unit", TIMEUNIT_NAMES[self.unit])] return OrderedDict(fields) class YearMonthIntervalField(IntegerField): def __init__(self, name, *, nullable=True, metadata=None): - min_val, max_val = [-10000*12, 10000*12] # +/- 10000 years. + min_val, max_val = [-10000 * 12, 10000 * 12] # +/- 10000 years. super().__init__( - name, True, 32, - nullable=nullable, metadata=metadata, - min_value=min_val, max_value=max_val) + name, + True, + 32, + nullable=nullable, + metadata=metadata, + min_value=min_val, + max_value=max_val, + ) def _get_type(self): - fields = [ - ('name', 'interval'), - ('unit', 'YEAR_MONTH'), - ] + fields = [("name", "interval"), ("unit", "YEAR_MONTH")] return OrderedDict(fields) class DayTimeIntervalField(PrimitiveField): def __init__(self, name, *, nullable=True, metadata=None): - super().__init__(name, - nullable=True, - metadata=metadata) + super().__init__(name, nullable=True, metadata=metadata) @property def numpy_type(self): return object def _get_type(self): - - return OrderedDict([ - ('name', 'interval'), - ('unit', 'DAY_TIME'), - ]) + return OrderedDict([("name", "interval"), ("unit", "DAY_TIME")]) def generate_column(self, size, name=None): - min_day_value, max_day_value = -10000*366, 10000*366 - values = [{'days': random.randint(min_day_value, max_day_value), - 'milliseconds': random.randint(-86400000, +86400000)} - for _ in range(size)] + min_day_value, max_day_value = -10000 * 366, 10000 * 366 + values = [ + { + "days": random.randint(min_day_value, max_day_value), + "milliseconds": random.randint(-86400000, +86400000), + } + for _ in range(size) + ] is_valid = self._make_is_valid(size) if name is None: @@ -424,30 +435,28 @@ def generate_column(self, size, name=None): class MonthDayNanoIntervalField(PrimitiveField): def __init__(self, name, *, nullable=True, metadata=None): - super().__init__(name, - nullable=True, - metadata=metadata) + super().__init__(name, nullable=True, metadata=metadata) @property def numpy_type(self): return object def _get_type(self): - - return OrderedDict([ - ('name', 'interval'), - ('unit', 'MONTH_DAY_NANO'), - ]) + return OrderedDict([("name", "interval"), ("unit", "MONTH_DAY_NANO")]) def generate_column(self, size, name=None): - I32 = 'int32' + I32 = "int32" min_int_value, max_int_value = np.iinfo(I32).min, np.iinfo(I32).max - I64 = 'int64' - min_nano_val, max_nano_val = np.iinfo(I64).min, np.iinfo(I64).max, - values = [{'months': random.randint(min_int_value, max_int_value), - 'days': random.randint(min_int_value, max_int_value), - 'nanoseconds': random.randint(min_nano_val, max_nano_val)} - for _ in range(size)] + I64 = "int64" + min_nano_val, max_nano_val = (np.iinfo(I64).min, np.iinfo(I64).max) + values = [ + { + "months": 
random.randint(min_int_value, max_int_value), + "days": random.randint(min_int_value, max_int_value), + "nanoseconds": random.randint(min_nano_val, max_nano_val), + } + for _ in range(size) + ] is_valid = self._make_is_valid(size) if name is None: @@ -456,29 +465,18 @@ def generate_column(self, size, name=None): class FloatingPointField(PrimitiveField): - - def __init__(self, name, bit_width, *, nullable=True, - metadata=None): - super().__init__(name, - nullable=nullable, - metadata=metadata) + def __init__(self, name, bit_width, *, nullable=True, metadata=None): + super().__init__(name, nullable=nullable, metadata=metadata) self.bit_width = bit_width - self.precision = { - 16: 'HALF', - 32: 'SINGLE', - 64: 'DOUBLE' - }[self.bit_width] + self.precision = {16: "HALF", 32: "SINGLE", 64: "DOUBLE"}[self.bit_width] @property def numpy_type(self): - return 'float' + str(self.bit_width) + return "float" + str(self.bit_width) def _get_type(self): - return OrderedDict([ - ('name', 'floatingpoint'), - ('precision', self.precision) - ]) + return OrderedDict([("name", "floatingpoint"), ("precision", self.precision)]) def generate_column(self, size, name=None): values = np.random.randn(size) * 1000 @@ -492,15 +490,15 @@ def generate_column(self, size, name=None): def decimal_range_from_precision(precision): assert 1 <= precision <= 76 - max_value = (10 ** precision) - 1 + max_value = (10**precision) - 1 return -max_value, max_value class DecimalField(PrimitiveField): - def __init__(self, name, precision, scale, bit_width, *, - nullable=True, metadata=None): - super().__init__(name, nullable=True, - metadata=metadata) + def __init__( + self, name, precision, scale, bit_width, *, nullable=True, metadata=None + ): + super().__init__(name, nullable=True, metadata=metadata) self.precision = precision self.scale = scale self.bit_width = bit_width @@ -510,12 +508,14 @@ def numpy_type(self): return object def _get_type(self): - return OrderedDict([ - ('name', 'decimal'), - ('precision', self.precision), - ('scale', self.scale), - ('bitWidth', self.bit_width), - ]) + return OrderedDict( + [ + ("name", "decimal"), + ("precision", self.precision), + ("scale", self.scale), + ("bitWidth", self.bit_width), + ] + ) def generate_column(self, size, name=None): min_value, max_value = decimal_range_from_precision(self.precision) @@ -528,7 +528,6 @@ def generate_column(self, size, name=None): class DecimalColumn(PrimitiveColumn): - def __init__(self, name, count, is_valid, values, bit_width): super().__init__(name, count, is_valid, values) self.bit_width = bit_width @@ -541,11 +540,11 @@ class BooleanField(PrimitiveField): bit_width = 1 def _get_type(self): - return OrderedDict([('name', 'bool')]) + return OrderedDict([("name", "bool")]) @property def numpy_type(self): - return 'bool' + return "bool" def generate_column(self, size, name=None): values = list(map(bool, np.random.randint(0, 2, size=size))) @@ -556,11 +555,8 @@ def generate_column(self, size, name=None): class FixedSizeBinaryField(PrimitiveField): - - def __init__(self, name, byte_width, *, nullable=True, - metadata=None): - super().__init__(name, nullable=nullable, - metadata=metadata) + def __init__(self, name, byte_width, *, nullable=True, metadata=None): + super().__init__(name, nullable=nullable, metadata=metadata) self.byte_width = byte_width @property @@ -572,14 +568,15 @@ def column_class(self): return FixedSizeBinaryColumn def _get_type(self): - return OrderedDict([('name', 'fixedsizebinary'), - ('byteWidth', self.byte_width)]) + return OrderedDict( 
+ [("name", "fixedsizebinary"), ("byteWidth", self.byte_width)] + ) def generate_column(self, size, name=None): is_valid = self._make_is_valid(size) values = [] - for i in range(size): + for _ in range(size): values.append(random_bytes(self.byte_width)) if name is None: @@ -588,7 +585,6 @@ def generate_column(self, size, name=None): class BinaryField(PrimitiveField): - @property def numpy_type(self): return object @@ -598,7 +594,7 @@ def column_class(self): return BinaryColumn def _get_type(self): - return OrderedDict([('name', 'binary')]) + return OrderedDict([("name", "binary")]) def _random_sizes(self, size): return np.random.exponential(scale=4, size=size).astype(np.int32) @@ -621,13 +617,12 @@ def generate_column(self, size, name=None): class StringField(BinaryField): - @property def column_class(self): return StringColumn def _get_type(self): - return OrderedDict([('name', 'utf8')]) + return OrderedDict([("name", "utf8")]) def generate_column(self, size, name=None): K = 7 @@ -646,70 +641,61 @@ def generate_column(self, size, name=None): class LargeBinaryField(BinaryField): - @property def column_class(self): return LargeBinaryColumn def _get_type(self): - return OrderedDict([('name', 'largebinary')]) + return OrderedDict([("name", "largebinary")]) class LargeStringField(StringField): - @property def column_class(self): return LargeStringColumn def _get_type(self): - return OrderedDict([('name', 'largeutf8')]) + return OrderedDict([("name", "largeutf8")]) class BinaryViewField(BinaryField): - @property def column_class(self): return BinaryViewColumn def _get_type(self): - return OrderedDict([('name', 'binaryview')]) + return OrderedDict([("name", "binaryview")]) class StringViewField(StringField): - @property def column_class(self): return StringViewColumn def _get_type(self): - return OrderedDict([('name', 'utf8view')]) - + return OrderedDict([("name", "utf8view")]) -class Schema(object): +class Schema: def __init__(self, fields, metadata=None): self.fields = fields self.metadata = metadata def get_json(self): - entries = [ - ('fields', [field.get_json() for field in self.fields]) - ] + entries = [("fields", [field.get_json() for field in self.fields])] if self.metadata is not None and len(self.metadata) > 0: - entries.append(('metadata', metadata_key_values(self.metadata))) + entries.append(("metadata", metadata_key_values(self.metadata))) return OrderedDict(entries) class _NarrowOffsetsMixin: - def _encode_offsets(self, offsets): return list(map(int, offsets)) class _LargeOffsetsMixin: - def _encode_offsets(self, offsets): # 64-bit offsets have to be represented as strings to roundtrip # through JSON. 
@@ -717,7 +703,6 @@ def _encode_offsets(self, offsets): class _BaseBinaryColumn(PrimitiveColumn): - def _encode_value(self, x): return frombytes(binascii.hexlify(x).upper()) @@ -736,14 +721,13 @@ def _get_buffers(self): data.append(self._encode_value(v)) return [ - ('VALIDITY', [int(x) for x in self.is_valid]), - ('OFFSET', self._encode_offsets(offsets)), - ('DATA', data) + ("VALIDITY", [int(x) for x in self.is_valid]), + ("OFFSET", self._encode_offsets(offsets)), + ("DATA", data), ] class _BaseStringColumn(_BaseBinaryColumn): - def _encode_value(self, x): return frombytes(x) @@ -765,7 +749,6 @@ class LargeStringColumn(_BaseStringColumn, _LargeOffsetsMixin): class BinaryViewColumn(PrimitiveColumn): - def _encode_value(self, x): return frombytes(binascii.hexlify(x).upper()) @@ -779,15 +762,14 @@ def _get_buffers(self): for i, v in enumerate(self.values): if not self.is_valid[i]: - v = b'' + v = b"" assert isinstance(v, bytes) if len(v) <= INLINE_SIZE: # Append an inline view, skip data buffer management. - views.append(OrderedDict([ - ('SIZE', len(v)), - ('INLINED', self._encode_value(v)), - ])) + views.append( + OrderedDict([("SIZE", len(v)), ("INLINED", self._encode_value(v))]) + ) continue if len(data_buffers) == 0: @@ -809,51 +791,47 @@ def _get_buffers(self): # even if the whole string view is prefix = frombytes(binascii.hexlify(v[:4]).upper()) - views.append(OrderedDict([ - ('SIZE', len(v)), - ('PREFIX_HEX', prefix), - ('BUFFER_INDEX', len(data_buffers) - 1), - ('OFFSET', offset), - ])) + views.append( + OrderedDict( + [ + ("SIZE", len(v)), + ("PREFIX_HEX", prefix), + ("BUFFER_INDEX", len(data_buffers) - 1), + ("OFFSET", offset), + ] + ) + ) return [ - ('VALIDITY', [int(x) for x in self.is_valid]), - ('VIEWS', views), - ('VARIADIC_DATA_BUFFERS', [ - frombytes(binascii.hexlify(b).upper()) - for b in data_buffers - ]), + ("VALIDITY", [int(x) for x in self.is_valid]), + ("VIEWS", views), + ( + "VARIADIC_DATA_BUFFERS", + [frombytes(binascii.hexlify(b).upper()) for b in data_buffers], + ), ] class StringViewColumn(BinaryViewColumn): - def _encode_value(self, x): return frombytes(x) class FixedSizeBinaryColumn(PrimitiveColumn): - def _encode_value(self, x): return frombytes(binascii.hexlify(x).upper()) def _get_buffers(self): data = [] - for i, v in enumerate(self.values): + for v in self.values: data.append(self._encode_value(v)) - return [ - ('VALIDITY', [int(x) for x in self.is_valid]), - ('DATA', data) - ] + return [("VALIDITY", [int(x) for x in self.is_valid]), ("DATA", data)] class ListField(Field): - - def __init__(self, name, value_field, *, nullable=True, - metadata=None): - super().__init__(name, nullable=nullable, - metadata=metadata) + def __init__(self, name, value_field, *, nullable=True, metadata=None): + super().__init__(name, nullable=nullable, metadata=metadata) self.value_field = value_field @property @@ -861,9 +839,7 @@ def column_class(self): return ListColumn def _get_type(self): - return OrderedDict([ - ('name', 'list') - ]) + return OrderedDict([("name", "list")]) def _get_children(self): return [self.value_field.get_json()] @@ -890,19 +866,15 @@ def generate_column(self, size, name=None): class LargeListField(ListField): - @property def column_class(self): return LargeListColumn def _get_type(self): - return OrderedDict([ - ('name', 'largelist') - ]) + return OrderedDict([("name", "largelist")]) class _BaseListColumn(Column): - def __init__(self, name, count, is_valid, offsets, values): super().__init__(name, count) self.is_valid = is_valid @@ -911,8 +883,8 @@ def 
__init__(self, name, count, is_valid, offsets, values): def _get_buffers(self): return [ - ('VALIDITY', [int(v) for v in self.is_valid]), - ('OFFSET', self._encode_offsets(self.offsets)) + ("VALIDITY", [int(v) for v in self.is_valid]), + ("OFFSET", self._encode_offsets(self.offsets)), ] def _get_children(self): @@ -928,11 +900,8 @@ class LargeListColumn(_BaseListColumn, _LargeOffsetsMixin): class ListViewField(Field): - - def __init__(self, name, value_field, *, nullable=True, - metadata=None): - super().__init__(name, nullable=nullable, - metadata=metadata) + def __init__(self, name, value_field, *, nullable=True, metadata=None): + super().__init__(name, nullable=nullable, metadata=metadata) self.value_field = value_field @property @@ -940,9 +909,7 @@ def column_class(self): return ListViewColumn def _get_type(self): - return OrderedDict([ - ('name', 'listview') - ]) + return OrderedDict([("name", "listview")]) def _get_children(self): return [self.value_field.get_json()] @@ -965,19 +932,15 @@ def generate_column(self, size, name=None): class LargeListViewField(ListViewField): - @property def column_class(self): return LargeListViewColumn def _get_type(self): - return OrderedDict([ - ('name', 'largelistview') - ]) + return OrderedDict([("name", "largelistview")]) class _BaseListViewColumn(Column): - def __init__(self, name, count, is_valid, offsets, sizes, values): super().__init__(name, count) self.is_valid = is_valid @@ -987,9 +950,9 @@ def __init__(self, name, count, is_valid, offsets, sizes, values): def _get_buffers(self): return [ - ('VALIDITY', [int(v) for v in self.is_valid]), - ('OFFSET', self._encode_offsets(self.offsets)), - ('SIZE', self._encode_offsets(self.sizes)), + ("VALIDITY", [int(v) for v in self.is_valid]), + ("OFFSET", self._encode_offsets(self.offsets)), + ("SIZE", self._encode_offsets(self.sizes)), ] def _get_children(self): @@ -1005,24 +968,29 @@ class LargeListViewColumn(_BaseListViewColumn, _LargeOffsetsMixin): class MapField(Field): - - def __init__(self, name, key_field, item_field, *, nullable=True, - metadata=None, keys_sorted=False, entries_name='entries'): - super().__init__(name, nullable=nullable, - metadata=metadata) + def __init__( + self, + name, + key_field, + item_field, + *, + nullable=True, + metadata=None, + keys_sorted=False, + entries_name="entries", + ): + super().__init__(name, nullable=nullable, metadata=metadata) assert not key_field.nullable self.key_field = key_field self.item_field = item_field - self.pair_field = StructField(entries_name, [key_field, item_field], - nullable=False) + self.pair_field = StructField( + entries_name, [key_field, item_field], nullable=False + ) self.keys_sorted = keys_sorted def _get_type(self): - return OrderedDict([ - ('name', 'map'), - ('keysSorted', self.keys_sorted) - ]) + return OrderedDict([("name", "map"), ("keysSorted", self.keys_sorted)]) def _get_children(self): return [self.pair_field.get_json()] @@ -1049,7 +1017,6 @@ def generate_column(self, size, name=None): class MapColumn(Column): - def __init__(self, name, count, is_valid, offsets, pairs): super().__init__(name, count) self.is_valid = is_valid @@ -1058,8 +1025,8 @@ def __init__(self, name, count, is_valid, offsets, pairs): def _get_buffers(self): return [ - ('VALIDITY', [int(v) for v in self.is_valid]), - ('OFFSET', list(self.offsets)) + ("VALIDITY", [int(v) for v in self.is_valid]), + ("OFFSET", list(self.offsets)), ] def _get_children(self): @@ -1067,19 +1034,13 @@ def _get_children(self): class FixedSizeListField(Field): - - def 
__init__(self, name, value_field, list_size, *, nullable=True, - metadata=None): - super().__init__(name, nullable=nullable, - metadata=metadata) + def __init__(self, name, value_field, list_size, *, nullable=True, metadata=None): + super().__init__(name, nullable=nullable, metadata=metadata) self.value_field = value_field self.list_size = list_size def _get_type(self): - return OrderedDict([ - ('name', 'fixedsizelist'), - ('listSize', self.list_size) - ]) + return OrderedDict([("name", "fixedsizelist"), ("listSize", self.list_size)]) def _get_children(self): return [self.value_field.get_json()] @@ -1094,33 +1055,25 @@ def generate_column(self, size, name=None): class FixedSizeListColumn(Column): - def __init__(self, name, count, is_valid, values): super().__init__(name, count) self.is_valid = is_valid self.values = values def _get_buffers(self): - return [ - ('VALIDITY', [int(v) for v in self.is_valid]) - ] + return [("VALIDITY", [int(v) for v in self.is_valid])] def _get_children(self): return [self.values.get_json()] class StructField(Field): - - def __init__(self, name, fields, *, nullable=True, - metadata=None): - super().__init__(name, nullable=nullable, - metadata=metadata) + def __init__(self, name, fields, *, nullable=True, metadata=None): + super().__init__(name, nullable=nullable, metadata=metadata) self.fields = fields def _get_type(self): - return OrderedDict([ - ('name', 'struct') - ]) + return OrderedDict([("name", "struct")]) def _get_children(self): return [field.get_json() for field in self.fields] @@ -1135,23 +1088,18 @@ def generate_column(self, size, name=None): class RunEndEncodedField(Field): - - def __init__(self, name, run_ends_bitwidth, values_field, *, nullable=True, - metadata=None): + def __init__( + self, name, run_ends_bitwidth, values_field, *, nullable=True, metadata=None + ): super().__init__(name, nullable=nullable, metadata=metadata) - self.run_ends_field = RunEndsField('run_ends', run_ends_bitwidth) + self.run_ends_field = RunEndsField("run_ends", run_ends_bitwidth) self.values_field = values_field def _get_type(self): - return OrderedDict([ - ('name', 'runendencoded') - ]) + return OrderedDict([("name", "runendencoded")]) def _get_children(self): - return [ - self.run_ends_field.get_json(), - self.values_field.get_json() - ] + return [self.run_ends_field.get_json(), self.values_field.get_json()] def generate_column(self, size, name=None): values = self.values_field.generate_column(size) @@ -1162,9 +1110,7 @@ def generate_column(self, size, name=None): class _BaseUnionField(Field): - - def __init__(self, name, fields, type_ids=None, *, nullable=True, - metadata=None): + def __init__(self, name, fields, type_ids=None, *, nullable=True, metadata=None): super().__init__(name, nullable=nullable, metadata=metadata) if type_ids is None: type_ids = list(range(fields)) @@ -1175,11 +1121,9 @@ def __init__(self, name, fields, type_ids=None, *, nullable=True, assert all(x >= 0 for x in self.type_ids) def _get_type(self): - return OrderedDict([ - ('name', 'union'), - ('mode', self.mode), - ('typeIds', self.type_ids), - ]) + return OrderedDict( + [("name", "union"), ("mode", self.mode), ("typeIds", self.type_ids)] + ) def _get_children(self): return [field.get_json() for field in self.fields] @@ -1189,7 +1133,7 @@ def _make_type_ids(self, size): class SparseUnionField(_BaseUnionField): - mode = 'SPARSE' + mode = "SPARSE" def generate_column(self, size, name=None): array_type_ids = self._make_type_ids(size) @@ -1201,7 +1145,7 @@ def generate_column(self, size, 
name=None): class DenseUnionField(_BaseUnionField): - mode = 'DENSE' + mode = "DENSE" def generate_column(self, size, name=None): # Reverse mapping {logical type id => physical child id} @@ -1221,16 +1165,15 @@ def generate_column(self, size, name=None): field_values = [ field.generate_column(child_size) - for field, child_size in zip(self.fields, child_sizes)] + for field, child_size in zip(self.fields, child_sizes) + ] if name is None: name = self.name - return DenseUnionColumn(name, size, array_type_ids, offsets, - field_values) + return DenseUnionColumn(name, size, array_type_ids, offsets, field_values) -class Dictionary(object): - +class Dictionary: def __init__(self, id_, field, size, name=None, ordered=False): self.id_ = id_ self.field = field @@ -1242,19 +1185,13 @@ def __len__(self): def get_json(self): dummy_batch = RecordBatch(len(self.values), [self.values]) - return OrderedDict([ - ('id', self.id_), - ('data', dummy_batch.get_json()) - ]) + return OrderedDict([("id", self.id_), ("data", dummy_batch.get_json())]) class DictionaryField(Field): - - def __init__(self, name, index_field, dictionary, *, nullable=True, - metadata=None): - super().__init__(name, nullable=nullable, - metadata=metadata) - assert index_field.name == '' + def __init__(self, name, index_field, dictionary, *, nullable=True, metadata=None): + super().__init__(name, nullable=nullable, metadata=metadata) + assert index_field.name == "" assert isinstance(index_field, IntegerField) assert isinstance(dictionary, Dictionary) @@ -1268,29 +1205,30 @@ def _get_children(self): return self.dictionary.field._get_children() def _get_dictionary(self): - return OrderedDict([ - ('id', self.dictionary.id_), - ('indexType', self.index_field._get_type()), - ('isOrdered', self.dictionary.ordered) - ]) + return OrderedDict( + [ + ("id", self.dictionary.id_), + ("indexType", self.index_field._get_type()), + ("isOrdered", self.dictionary.ordered), + ] + ) def generate_column(self, size, name=None): if name is None: name = self.name - return self.index_field.generate_range(size, 0, len(self.dictionary), - name=name) + return self.index_field.generate_range(size, 0, len(self.dictionary), name=name) ExtensionType = namedtuple( - 'ExtensionType', ['extension_name', 'serialized', 'storage_field']) + "ExtensionType", ["extension_name", "serialized", "storage_field"] +) class ExtensionField(Field): - def __init__(self, name, extension_type, *, nullable=True, metadata=None): metadata = (metadata or []) + [ - ('ARROW:extension:name', extension_type.extension_name), - ('ARROW:extension:metadata', extension_type.serialized), + ("ARROW:extension:name", extension_type.extension_name), + ("ARROW:extension:metadata", extension_type.serialized), ] super().__init__(name, nullable=nullable, metadata=metadata) self.extension_type = extension_type @@ -1311,23 +1249,19 @@ def generate_column(self, size, name=None): class StructColumn(Column): - def __init__(self, name, count, is_valid, field_values): super().__init__(name, count) self.is_valid = is_valid self.field_values = field_values def _get_buffers(self): - return [ - ('VALIDITY', [int(v) for v in self.is_valid]) - ] + return [("VALIDITY", [int(v) for v in self.is_valid])] def _get_children(self): return [field.get_json() for field in self.field_values] class RunEndEncodedColumn(Column): - def __init__(self, name, count, run_ends_field, values_field): super().__init__(name, count) self.run_ends = run_ends_field @@ -1341,23 +1275,19 @@ def _get_children(self): class SparseUnionColumn(Column): - 
def __init__(self, name, count, type_ids, field_values): super().__init__(name, count) self.type_ids = type_ids self.field_values = field_values def _get_buffers(self): - return [ - ('TYPE_ID', [int(v) for v in self.type_ids]) - ] + return [("TYPE_ID", [int(v) for v in self.type_ids])] def _get_children(self): return [field.get_json() for field in self.field_values] class DenseUnionColumn(Column): - def __init__(self, name, count, type_ids, offsets, field_values): super().__init__(name, count) self.type_ids = type_ids @@ -1366,31 +1296,39 @@ def __init__(self, name, count, type_ids, offsets, field_values): def _get_buffers(self): return [ - ('TYPE_ID', [int(v) for v in self.type_ids]), - ('OFFSET', [int(v) for v in self.offsets]), + ("TYPE_ID", [int(v) for v in self.type_ids]), + ("OFFSET", [int(v) for v in self.offsets]), ] def _get_children(self): return [field.get_json() for field in self.field_values] -class RecordBatch(object): - +class RecordBatch: def __init__(self, count, columns): self.count = count self.columns = columns def get_json(self): - return OrderedDict([ - ('count', self.count), - ('columns', [col.get_json() for col in self.columns]) - ]) - + return OrderedDict( + [ + ("count", self.count), + ("columns", [col.get_json() for col in self.columns]), + ] + ) -class File(object): - def __init__(self, name, schema, batches, dictionaries=None, - skip_testers=None, path=None, quirks=None): +class File: + def __init__( + self, + name, + schema, + batches, + dictionaries=None, + skip_testers=None, + path=None, + quirks=None, + ): self.name = name self.schema = schema self.dictionaries = dictionaries or [] @@ -1407,88 +1345,82 @@ def __init__(self, name, schema, batches, dictionaries=None, self.quirks.update(quirks) def get_json(self): - entries = [ - ('schema', self.schema.get_json()) - ] + entries = [("schema", self.schema.get_json())] if len(self.dictionaries) > 0: - entries.append(('dictionaries', - [dictionary.get_json() - for dictionary in self.dictionaries])) - - entries.append(('batches', [batch.get_json() - for batch in self.batches])) + entries.append( + ( + "dictionaries", + [dictionary.get_json() for dictionary in self.dictionaries], + ) + ) + + entries.append(("batches", [batch.get_json() for batch in self.batches])) return OrderedDict(entries) def write(self, path): - with open(path, 'wb') as f: - f.write(json.dumps(self.get_json(), indent=2).encode('utf-8')) + with open(path, "wb") as f: + f.write(json.dumps(self.get_json(), indent=2).encode("utf-8")) self.path = path def skip_tester(self, tester): - """Skip this test for the given tester (such as 'C#'). - """ + """Skip this test for the given tester (such as 'C#').""" self.skipped_testers.add(tester) return self - def skip_format(self, format, tester='all'): - """Skip this test for the given format, and optionally tester. - """ + def skip_format(self, format, tester="all"): + """Skip this test for the given format, and optionally tester.""" self.skipped_formats.setdefault(format, set()).add(tester) return self def add_skips_from(self, other_file): - """Add skips from another File object. - """ + """Add skips from another File object.""" self.skipped_testers.update(other_file.skipped_testers) for format, testers in other_file.skipped_formats.items(): self.skipped_formats.setdefault(format, set()).update(testers) def should_skip(self, tester, format): - """Whether this (tester, format) combination should be skipped. 
- """ + """Whether this (tester, format) combination should be skipped.""" if tester in self.skipped_testers: return True testers = self.skipped_formats.get(format, ()) - return 'all' in testers or tester in testers + return "all" in testers or tester in testers @property def num_batches(self): - """The number of record batches in this file. - """ + """The number of record batches in this file.""" return len(self.batches) def get_field(name, type_, **kwargs): - if type_ == 'binary': + if type_ == "binary": return BinaryField(name, **kwargs) - elif type_ == 'utf8': + elif type_ == "utf8": return StringField(name, **kwargs) - elif type_ == 'largebinary': + elif type_ == "largebinary": return LargeBinaryField(name, **kwargs) - elif type_ == 'largeutf8': + elif type_ == "largeutf8": return LargeStringField(name, **kwargs) - elif type_.startswith('fixedsizebinary_'): - byte_width = int(type_.split('_')[1]) + elif type_.startswith("fixedsizebinary_"): + byte_width = int(type_.split("_")[1]) return FixedSizeBinaryField(name, byte_width=byte_width, **kwargs) dtype = np.dtype(type_) - if dtype.kind in ('i', 'u'): - signed = dtype.kind == 'i' + if dtype.kind in ("i", "u"): + signed = dtype.kind == "i" bit_width = dtype.itemsize * 8 return IntegerField(name, signed, bit_width, **kwargs) - elif dtype.kind == 'f': + elif dtype.kind == "f": bit_width = dtype.itemsize * 8 return FloatingPointField(name, bit_width, **kwargs) - elif dtype.kind == 'b': + elif dtype.kind == "b": return BooleanField(name, **kwargs) else: raise TypeError(dtype) -def _generate_file(name, fields, batch_sizes, *, - dictionaries=None, metadata=None): +def _generate_file(name, fields, batch_sizes, *, dictionaries=None, metadata=None): schema = Schema(fields, metadata=metadata) batches = [] for size in batch_sizes: @@ -1506,47 +1438,64 @@ def generate_custom_metadata_case(): def meta(items): # Generate a simple block of metadata where each value is '{}'. # Keys are delimited by whitespace in `items`. - return [(k, '{}') for k in items.split()] + return [(k, "{}") for k in items.split()] fields = [ - get_field('sort_of_pandas', 'int8', metadata=meta('pandas')), - - get_field('lots_of_meta', 'int8', metadata=meta('a b c d .. w x y z')), - + get_field("sort_of_pandas", "int8", metadata=meta("pandas")), + get_field("lots_of_meta", "int8", metadata=meta("a b c d .. 
w x y z")), get_field( - 'unregistered_extension', 'int8', + "unregistered_extension", + "int8", metadata=[ - ('ARROW:extension:name', '!nonexistent'), - ('ARROW:extension:metadata', ''), - ('ARROW:integration:allow_unregistered_extension', 'true'), - ]), - - ListField('list_with_odd_values', - get_field('item', 'int32', metadata=meta('odd_values'))), + ("ARROW:extension:name", "!nonexistent"), + ("ARROW:extension:metadata", ""), + ("ARROW:integration:allow_unregistered_extension", "true"), + ], + ), + ListField( + "list_with_odd_values", + get_field("item", "int32", metadata=meta("odd_values")), + ), ] batch_sizes = [1] - return _generate_file('custom_metadata', fields, batch_sizes, - metadata=meta('schema_custom_0 schema_custom_1')) + return _generate_file( + "custom_metadata", + fields, + batch_sizes, + metadata=meta("schema_custom_0 schema_custom_1"), + ) def generate_duplicate_fieldnames_case(): fields = [ - get_field('ints', 'int8'), - get_field('ints', 'int32'), - - StructField('struct', [get_field('', 'int32'), get_field('', 'utf8')]), + get_field("ints", "int8"), + get_field("ints", "int32"), + StructField("struct", [get_field("", "int32"), get_field("", "utf8")]), ] batch_sizes = [1] - return _generate_file('duplicate_fieldnames', fields, batch_sizes) - - -def generate_primitive_case(batch_sizes, name='primitive'): - types = ['bool', 'int8', 'int16', 'int32', 'int64', - 'uint8', 'uint16', 'uint32', 'uint64', - 'float32', 'float64', 'binary', 'utf8', - 'fixedsizebinary_19', 'fixedsizebinary_120'] + return _generate_file("duplicate_fieldnames", fields, batch_sizes) + + +def generate_primitive_case(batch_sizes, name="primitive"): + types = [ + "bool", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "float32", + "float64", + "binary", + "utf8", + "fixedsizebinary_19", + "fixedsizebinary_120", + ] fields = [] @@ -1558,7 +1507,7 @@ def generate_primitive_case(batch_sizes, name='primitive'): def generate_primitive_large_offsets_case(batch_sizes): - types = ['largebinary', 'largeutf8'] + types = ["largebinary", "largeutf8"] fields = [] @@ -1566,56 +1515,51 @@ def generate_primitive_large_offsets_case(batch_sizes): fields.append(get_field(type_ + "_nullable", type_, nullable=True)) fields.append(get_field(type_ + "_nonnullable", type_, nullable=False)) - return _generate_file('primitive_large_offsets', fields, batch_sizes) + return _generate_file("primitive_large_offsets", fields, batch_sizes) def generate_null_case(batch_sizes): # Interleave null with non-null types to ensure the appropriate number of # buffers (0) is read and written fields = [ - NullField(name='f0'), - get_field('f1', 'int32'), - NullField(name='f2'), - get_field('f3', 'float64'), - NullField(name='f4') + NullField(name="f0"), + get_field("f1", "int32"), + NullField(name="f2"), + get_field("f3", "float64"), + NullField(name="f4"), ] - return _generate_file('null', fields, batch_sizes) + return _generate_file("null", fields, batch_sizes) def generate_null_trivial_case(batch_sizes): # Generate a case with no buffers - fields = [ - NullField(name='f0'), - ] - return _generate_file('null_trivial', fields, batch_sizes) + fields = [NullField(name="f0")] + return _generate_file("null_trivial", fields, batch_sizes) def generate_decimal32_case(): fields = [ - DecimalField(name='f{}'.format(i), precision=precision, scale=2, - bit_width=32) + DecimalField(name=f"f{i}", precision=precision, scale=2, bit_width=32) for i, precision in enumerate(range(3, 10)) ] batch_sizes = [7, 10] - 
return _generate_file('decimal32', fields, batch_sizes) + return _generate_file("decimal32", fields, batch_sizes) def generate_decimal64_case(): fields = [ - DecimalField(name='f{}'.format(i), precision=precision, scale=2, - bit_width=64) + DecimalField(name=f"f{i}", precision=precision, scale=2, bit_width=64) for i, precision in enumerate(range(3, 19)) ] batch_sizes = [7, 10] - return _generate_file('decimal64', fields, batch_sizes) + return _generate_file("decimal64", fields, batch_sizes) def generate_decimal128_case(): fields = [ - DecimalField(name='f{}'.format(i), precision=precision, scale=2, - bit_width=128) + DecimalField(name=f"f{i}", precision=precision, scale=2, bit_width=128) for i, precision in enumerate(range(3, 39)) ] @@ -1623,37 +1567,36 @@ def generate_decimal128_case(): # 'decimal' is the original name for the test, and it must match # provide "gold" files that test backwards compatibility, so they # can be appropriately skipped. - return _generate_file('decimal', fields, batch_sizes) + return _generate_file("decimal", fields, batch_sizes) def generate_decimal256_case(): fields = [ - DecimalField(name='f{}'.format(i), precision=precision, scale=5, - bit_width=256) + DecimalField(name=f"f{i}", precision=precision, scale=5, bit_width=256) for i, precision in enumerate(range(37, 70)) ] batch_sizes = [7, 10] - return _generate_file('decimal256', fields, batch_sizes) + return _generate_file("decimal256", fields, batch_sizes) def generate_datetime_case(): fields = [ - DateField('f0', DateField.DAY), - DateField('f1', DateField.MILLISECOND), - TimeField('f2', 's'), - TimeField('f3', 'ms'), - TimeField('f4', 'us'), - TimeField('f5', 'ns'), - TimestampField('f6', 's'), - TimestampField('f7', 'ms'), - TimestampField('f8', 'us'), - TimestampField('f9', 'ns'), - TimestampField('f10', 'ms', tz=None), - TimestampField('f11', 's', tz='UTC'), - TimestampField('f12', 'ms', tz='US/Eastern'), - TimestampField('f13', 'us', tz='Europe/Paris'), - TimestampField('f14', 'ns', tz='US/Pacific'), + DateField("f0", DateField.DAY), + DateField("f1", DateField.MILLISECOND), + TimeField("f2", "s"), + TimeField("f3", "ms"), + TimeField("f4", "us"), + TimeField("f5", "ns"), + TimestampField("f6", "s"), + TimestampField("f7", "ms"), + TimestampField("f8", "us"), + TimestampField("f9", "ns"), + TimestampField("f10", "ms", tz=None), + TimestampField("f11", "s", tz="UTC"), + TimestampField("f12", "ms", tz="US/Eastern"), + TimestampField("f13", "us", tz="Europe/Paris"), + TimestampField("f14", "ns", tz="US/Pacific"), ] batch_sizes = [7, 10] @@ -1662,10 +1605,10 @@ def generate_datetime_case(): def generate_duration_case(): fields = [ - DurationIntervalField('f1', 's'), - DurationIntervalField('f2', 'ms'), - DurationIntervalField('f3', 'us'), - DurationIntervalField('f4', 'ns'), + DurationIntervalField("f1", "s"), + DurationIntervalField("f2", "ms"), + DurationIntervalField("f3", "us"), + DurationIntervalField("f4", "ns"), ] batch_sizes = [7, 10] @@ -1673,19 +1616,14 @@ def generate_duration_case(): def generate_interval_case(): - fields = [ - YearMonthIntervalField('f5'), - DayTimeIntervalField('f6'), - ] + fields = [YearMonthIntervalField("f5"), DayTimeIntervalField("f6")] batch_sizes = [7, 10] return _generate_file("interval", fields, batch_sizes) def generate_month_day_nano_interval_case(): - fields = [ - MonthDayNanoIntervalField('f1'), - ] + fields = [MonthDayNanoIntervalField("f1")] batch_sizes = [7, 10] return _generate_file("interval_mdn", fields, batch_sizes) @@ -1693,8 +1631,11 @@ def 
generate_month_day_nano_interval_case(): def generate_map_case(): fields = [ - MapField('map_nullable', get_field('key', 'utf8', nullable=False), - get_field('value', 'int32')), + MapField( + "map_nullable", + get_field("key", "utf8", nullable=False), + get_field("value", "int32"), + ) ] batch_sizes = [7, 10] @@ -1703,10 +1644,12 @@ def generate_map_case(): def generate_non_canonical_map_case(): fields = [ - MapField('map_other_names', - get_field('some_key', 'utf8', nullable=False), - get_field('some_value', 'int32'), - entries_name='some_entries'), + MapField( + "map_other_names", + get_field("some_key", "utf8", nullable=False), + get_field("some_value", "int32"), + entries_name="some_entries", + ) ] batch_sizes = [7] @@ -1715,11 +1658,11 @@ def generate_non_canonical_map_case(): def generate_nested_case(): fields = [ - ListField('list_nullable', get_field('item', 'int32')), - FixedSizeListField('fixedsizelist_nullable', - get_field('item', 'int32'), 4), - StructField('struct_nullable', [get_field('f1', 'int32'), - get_field('f2', 'utf8')]), + ListField("list_nullable", get_field("item", "int32")), + FixedSizeListField("fixedsizelist_nullable", get_field("item", "int32"), 4), + StructField( + "struct_nullable", [get_field("f1", "int32"), get_field("f2", "utf8")] + ), # Fails on Go (ARROW-8452) # ListField('list_nonnullable', get_field('item', 'int32'), # nullable=False), @@ -1731,12 +1674,13 @@ def generate_nested_case(): def generate_recursive_nested_case(): fields = [ - ListField('lists_list', - ListField('inner_list', get_field('item', 'int16'))), - ListField('structs_list', - StructField('inner_struct', - [get_field('f1', 'int32'), - get_field('f2', 'utf8')])), + ListField("lists_list", ListField("inner_list", get_field("item", "int16"))), + ListField( + "structs_list", + StructField( + "inner_struct", [get_field("f1", "int32"), get_field("f2", "utf8")] + ), + ), ] batch_sizes = [7, 10] @@ -1745,27 +1689,24 @@ def generate_recursive_nested_case(): def generate_run_end_encoded_case(): fields = [ - RunEndEncodedField('ree16', 16, get_field('values', 'int32')), - RunEndEncodedField('ree32', 32, get_field('values', 'utf8')), - RunEndEncodedField('ree64', 64, get_field('values', 'float32')), + RunEndEncodedField("ree16", 16, get_field("values", "int32")), + RunEndEncodedField("ree32", 32, get_field("values", "utf8")), + RunEndEncodedField("ree64", 64, get_field("values", "float32")), ] batch_sizes = [0, 7, 10] return _generate_file("run_end_encoded", fields, batch_sizes) def generate_binary_view_case(): - fields = [ - BinaryViewField('bv'), - StringViewField('sv'), - ] + fields = [BinaryViewField("bv"), StringViewField("sv")] batch_sizes = [0, 7, 256] return _generate_file("binary_view", fields, batch_sizes) def generate_list_view_case(): fields = [ - ListViewField('lv', get_field('item', 'float32')), - LargeListViewField('llv', get_field('item', 'float32')), + ListViewField("lv", get_field("item", "float32")), + LargeListViewField("llv", get_field("item", "float32")), ] batch_sizes = [0, 7, 256] return _generate_file("list_view", fields, batch_sizes) @@ -1773,11 +1714,13 @@ def generate_list_view_case(): def generate_nested_large_offsets_case(): fields = [ - LargeListField('large_list_nullable', get_field('item', 'int32')), - LargeListField('large_list_nonnullable', - get_field('item', 'int32'), nullable=False), - LargeListField('large_list_nested', - ListField('inner_list', get_field('item', 'int16'))), + LargeListField("large_list_nullable", get_field("item", "int32")), + 
LargeListField( + "large_list_nonnullable", get_field("item", "int32"), nullable=False + ), + LargeListField( + "large_list_nested", ListField("inner_list", get_field("item", "int16")) + ), ] batch_sizes = [0, 13] @@ -1786,19 +1729,32 @@ def generate_nested_large_offsets_case(): def generate_unions_case(): fields = [ - SparseUnionField('sparse_1', [get_field('f1', 'int32'), - get_field('f2', 'utf8')], - type_ids=[5, 7]), - DenseUnionField('dense_1', [get_field('f1', 'int16'), - get_field('f2', 'binary')], - type_ids=[10, 20]), - SparseUnionField('sparse_2', [get_field('f1', 'float32', nullable=False), - get_field('f2', 'bool')], - type_ids=[5, 7], nullable=False), - DenseUnionField('dense_2', [get_field('f1', 'uint8', nullable=False), - get_field('f2', 'uint16'), - NullField('f3')], - type_ids=[42, 43, 44], nullable=False), + SparseUnionField( + "sparse_1", + [get_field("f1", "int32"), get_field("f2", "utf8")], + type_ids=[5, 7], + ), + DenseUnionField( + "dense_1", + [get_field("f1", "int16"), get_field("f2", "binary")], + type_ids=[10, 20], + ), + SparseUnionField( + "sparse_2", + [get_field("f1", "float32", nullable=False), get_field("f2", "bool")], + type_ids=[5, 7], + nullable=False, + ), + DenseUnionField( + "dense_2", + [ + get_field("f1", "uint8", nullable=False), + get_field("f2", "uint16"), + NullField("f3"), + ], + type_ids=[42, 43, 44], + nullable=False, + ), ] batch_sizes = [0, 11] @@ -1806,202 +1762,175 @@ def generate_unions_case(): def generate_dictionary_case(): - dict0 = Dictionary(0, StringField('dictionary1'), size=10, name='DICT0') - dict1 = Dictionary(1, StringField('dictionary1'), size=5, name='DICT1') - dict2 = Dictionary(2, get_field('dictionary2', 'int64'), - size=50, name='DICT2') + dict0 = Dictionary(0, StringField("dictionary1"), size=10, name="DICT0") + dict1 = Dictionary(1, StringField("dictionary1"), size=5, name="DICT1") + dict2 = Dictionary(2, get_field("dictionary2", "int64"), size=50, name="DICT2") fields = [ - DictionaryField('dict0', get_field('', 'int8'), dict0), - DictionaryField('dict1', get_field('', 'int32'), dict1), - DictionaryField('dict2', get_field('', 'int16'), dict2) + DictionaryField("dict0", get_field("", "int8"), dict0), + DictionaryField("dict1", get_field("", "int32"), dict1), + DictionaryField("dict2", get_field("", "int16"), dict2), ] batch_sizes = [7, 10] - return _generate_file("dictionary", fields, batch_sizes, - dictionaries=[dict0, dict1, dict2]) + return _generate_file( + "dictionary", fields, batch_sizes, dictionaries=[dict0, dict1, dict2] + ) def generate_dictionary_unsigned_case(): - dict0 = Dictionary(0, StringField('dictionary0'), size=5, name='DICT0') - dict1 = Dictionary(1, StringField('dictionary1'), size=5, name='DICT1') - dict2 = Dictionary(2, StringField('dictionary2'), size=5, name='DICT2') + dict0 = Dictionary(0, StringField("dictionary0"), size=5, name="DICT0") + dict1 = Dictionary(1, StringField("dictionary1"), size=5, name="DICT1") + dict2 = Dictionary(2, StringField("dictionary2"), size=5, name="DICT2") # TODO: JavaScript does not support uint64 dictionary indices, so disabled # for now # dict3 = Dictionary(3, StringField('dictionary3'), size=5, name='DICT3') fields = [ - DictionaryField('f0', get_field('', 'uint8'), dict0), - DictionaryField('f1', get_field('', 'uint16'), dict1), - DictionaryField('f2', get_field('', 'uint32'), dict2), + DictionaryField("f0", get_field("", "uint8"), dict0), + DictionaryField("f1", get_field("", "uint16"), dict1), + DictionaryField("f2", get_field("", "uint32"), dict2), # 
DictionaryField('f3', get_field('', 'uint64'), dict3) ] batch_sizes = [7, 10] - return _generate_file("dictionary_unsigned", fields, batch_sizes, - dictionaries=[dict0, dict1, dict2]) + return _generate_file( + "dictionary_unsigned", fields, batch_sizes, dictionaries=[dict0, dict1, dict2] + ) def generate_nested_dictionary_case(): - dict0 = Dictionary(0, StringField('str'), size=10, name='DICT0') + dict0 = Dictionary(0, StringField("str"), size=10, name="DICT0") list_of_dict = ListField( - 'list', - DictionaryField('str_dict', get_field('', 'int8'), dict0)) - dict1 = Dictionary(1, list_of_dict, size=30, name='DICT1') - - struct_of_dict = StructField('struct', [ - DictionaryField('str_dict_a', get_field('', 'int8'), dict0), - DictionaryField('str_dict_b', get_field('', 'int8'), dict0) - ]) - dict2 = Dictionary(2, struct_of_dict, size=30, name='DICT2') + "list", DictionaryField("str_dict", get_field("", "int8"), dict0) + ) + dict1 = Dictionary(1, list_of_dict, size=30, name="DICT1") + + struct_of_dict = StructField( + "struct", + [ + DictionaryField("str_dict_a", get_field("", "int8"), dict0), + DictionaryField("str_dict_b", get_field("", "int8"), dict0), + ], + ) + dict2 = Dictionary(2, struct_of_dict, size=30, name="DICT2") fields = [ - DictionaryField('list_dict', get_field('', 'int8'), dict1), - DictionaryField('struct_dict', get_field('', 'int8'), dict2) + DictionaryField("list_dict", get_field("", "int8"), dict1), + DictionaryField("struct_dict", get_field("", "int8"), dict2), ] batch_sizes = [10, 13] - return _generate_file("nested_dictionary", fields, batch_sizes, - dictionaries=[dict0, dict1, dict2]) + return _generate_file( + "nested_dictionary", fields, batch_sizes, dictionaries=[dict0, dict1, dict2] + ) def generate_extension_case(): - dict0 = Dictionary(0, StringField('dictionary0'), size=5, name='DICT0') + dict0 = Dictionary(0, StringField("dictionary0"), size=5, name="DICT0") - uuid_type = ExtensionType('arrow.uuid', '', - FixedSizeBinaryField('', 16)) + uuid_type = ExtensionType("arrow.uuid", "", FixedSizeBinaryField("", 16)) dict_ext_type = ExtensionType( - 'dict-extension', 'dict-extension-serialized', - DictionaryField('str_dict', get_field('', 'int8'), dict0)) + "dict-extension", + "dict-extension-serialized", + DictionaryField("str_dict", get_field("", "int8"), dict0), + ) fields = [ - ExtensionField('uuids', uuid_type), - ExtensionField('dict_exts', dict_ext_type), + ExtensionField("uuids", uuid_type), + ExtensionField("dict_exts", dict_ext_type), ] batch_sizes = [0, 13] - return _generate_file("extension", fields, batch_sizes, - dictionaries=[dict0]) + return _generate_file("extension", fields, batch_sizes, dictionaries=[dict0]) def get_generated_json_files(tempdir=None): - tempdir = tempdir or tempfile.mkdtemp(prefix='arrow-integration-') + tempdir = tempdir or tempfile.mkdtemp(prefix="arrow-integration-") def _temp_path(): return file_objs = [ - generate_primitive_case([], name='primitive_no_batches'), - generate_primitive_case([17, 20], name='primitive'), - generate_primitive_case([0, 0, 0], name='primitive_zerolength'), - + generate_primitive_case([], name="primitive_no_batches"), + generate_primitive_case([17, 20], name="primitive"), + generate_primitive_case([0, 0, 0], name="primitive_zerolength"), generate_primitive_large_offsets_case([17, 20]), - generate_null_case([10, 0]), - generate_null_trivial_case([0, 0]), - generate_decimal128_case(), - - generate_decimal256_case() - .skip_tester('JS'), - + generate_decimal256_case().skip_tester("JS"), 
generate_decimal32_case() - .skip_tester('Java') - .skip_tester('JS') - .skip_tester('nanoarrow') - .skip_tester('Rust') - .skip_tester('Go'), - + .skip_tester("Java") + .skip_tester("JS") + .skip_tester("nanoarrow") + .skip_tester("Rust") + .skip_tester("Go"), generate_decimal64_case() - .skip_tester('Java') - .skip_tester('JS') - .skip_tester('nanoarrow') - .skip_tester('Rust') - .skip_tester('Go'), - + .skip_tester("Java") + .skip_tester("JS") + .skip_tester("nanoarrow") + .skip_tester("Rust") + .skip_tester("Go"), generate_datetime_case(), - generate_duration_case(), - - generate_interval_case() - .skip_tester('JS'), # TODO(ARROW-5239): Intervals + JS - - generate_month_day_nano_interval_case() - .skip_tester('JS'), - + generate_interval_case().skip_tester("JS"), # TODO(ARROW-5239): Intervals + JS + generate_month_day_nano_interval_case().skip_tester("JS"), generate_map_case(), - generate_non_canonical_map_case() - .skip_tester('Java') # TODO(ARROW-8715) + .skip_tester("Java") # TODO(ARROW-8715) # Canonical map names are restored on import, so the schemas are unequal - .skip_format(SKIP_C_SCHEMA, 'C++'), - + .skip_format(SKIP_C_SCHEMA, "C++"), generate_nested_case(), - generate_recursive_nested_case(), - - generate_nested_large_offsets_case() - .skip_tester('JS'), - + generate_nested_large_offsets_case().skip_tester("JS"), generate_unions_case(), - generate_custom_metadata_case(), - - generate_duplicate_fieldnames_case() - .skip_tester('JS'), - + generate_duplicate_fieldnames_case().skip_tester("JS"), generate_dictionary_case() # TODO(https://github.com/apache/arrow-nanoarrow/issues/622) - .skip_tester('nanoarrow') + .skip_tester("nanoarrow") # TODO(https://github.com/apache/arrow/issues/38045) - .skip_format(SKIP_FLIGHT, 'C#'), - + .skip_format(SKIP_FLIGHT, "C#"), generate_dictionary_unsigned_case() - .skip_tester('nanoarrow') - .skip_tester('Java') # TODO(ARROW-9377) + .skip_tester("nanoarrow") + .skip_tester("Java") # TODO(ARROW-9377) # TODO(https://github.com/apache/arrow/issues/38045) - .skip_format(SKIP_FLIGHT, 'C#'), - + .skip_format(SKIP_FLIGHT, "C#"), generate_nested_dictionary_case() # TODO(https://github.com/apache/arrow-nanoarrow/issues/622) - .skip_tester('nanoarrow') - .skip_tester('Java') # TODO(ARROW-7779) + .skip_tester("nanoarrow") + .skip_tester("Java") # TODO(ARROW-7779) # TODO(https://github.com/apache/arrow/issues/38045) - .skip_format(SKIP_FLIGHT, 'C#'), - + .skip_format(SKIP_FLIGHT, "C#"), generate_run_end_encoded_case() - .skip_tester('C#') - .skip_tester('JS') + .skip_tester("C#") + .skip_tester("JS") # TODO(https://github.com/apache/arrow-nanoarrow/issues/618) - .skip_tester('nanoarrow') - .skip_tester('Rust'), - + .skip_tester("nanoarrow") + .skip_tester("Rust"), generate_binary_view_case() - .skip_tester('JS') + .skip_tester("JS") # TODO(https://github.com/apache/arrow-nanoarrow/issues/618) - .skip_tester('nanoarrow') - .skip_tester('Rust'), - + .skip_tester("nanoarrow") + .skip_tester("Rust"), generate_list_view_case() - .skip_tester('C#') # Doesn't support large list views - .skip_tester('JS') + .skip_tester("C#") # Doesn't support large list views + .skip_tester("JS") # TODO(https://github.com/apache/arrow-nanoarrow/issues/618) - .skip_tester('nanoarrow') - .skip_tester('Rust'), - + .skip_tester("nanoarrow") + .skip_tester("Rust"), generate_extension_case() - .skip_tester('nanoarrow') + .skip_tester("nanoarrow") # TODO: ensure the extension is registered in the C++ entrypoint - .skip_format(SKIP_C_SCHEMA, 'C++') - .skip_format(SKIP_C_ARRAY, 'C++') + 
.skip_format(SKIP_C_SCHEMA, "C++") + .skip_format(SKIP_C_ARRAY, "C++") # TODO(https://github.com/apache/arrow/issues/38045) - .skip_format(SKIP_FLIGHT, 'C#'), + .skip_format(SKIP_FLIGHT, "C#"), ] generated_paths = [] for file_obj in file_objs: - out_path = os.path.join(tempdir, 'generated_' + - file_obj.name + '.json') + out_path = os.path.join(tempdir, "generated_" + file_obj.name + ".json") file_obj.write(out_path) generated_paths.append(file_obj) diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py index 4a323291c9bb6..652618e5ef34a 100644 --- a/dev/archery/archery/integration/runner.py +++ b/dev/archery/archery/integration/runner.py @@ -14,11 +14,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations -from collections import namedtuple -from concurrent.futures import ThreadPoolExecutor import contextlib -from functools import partial import glob import gzip import itertools @@ -26,20 +24,21 @@ import sys import tempfile import traceback -from typing import Callable, List, Optional +from collections import namedtuple +from concurrent.futures import ThreadPoolExecutor +from functools import partial +from typing import TYPE_CHECKING, Callable, List, Optional -from . import cdata -from .scenario import Scenario -from .tester import Tester, CDataExporter, CDataImporter -from .util import guid, printer -from .util import SKIP_C_ARRAY, SKIP_C_SCHEMA, SKIP_FLIGHT, SKIP_IPC from ..utils.logger import group as group_raw from ..utils.source import ARROW_ROOT_DEFAULT -from . import datagen +from . import cdata, datagen +from .scenario import Scenario +from .util import SKIP_C_ARRAY, SKIP_C_SCHEMA, SKIP_FLIGHT, SKIP_IPC, guid, printer +if TYPE_CHECKING: + from .tester import CDataExporter, CDataImporter, Tester -Failure = namedtuple('Failure', - ('test_case', 'producer', 'consumer', 'exc_info')) +Failure = namedtuple("Failure", ("test_case", "producer", "consumer", "exc_info")) log = printer.print @@ -56,13 +55,21 @@ def __init__(self): self.skipped = False -class IntegrationRunner(object): - - def __init__(self, json_files, - flight_scenarios: List[Scenario], - testers: List[Tester], other_testers: List[Tester], - tempdir=None, debug=False, stop_on_error=True, gold_dirs=None, - serial=False, match=None, **unused_kwargs): +class IntegrationRunner: + def __init__( + self, + json_files, + flight_scenarios: List[Scenario], + testers: List[Tester], + other_testers: List[Tester], + tempdir=None, + debug=False, + stop_on_error=True, + gold_dirs=None, + serial=False, + match=None, + **unused_kwargs, + ): self.json_files = json_files self.flight_scenarios = flight_scenarios self.testers = testers @@ -77,57 +84,63 @@ def __init__(self, json_files, self.match = match if self.match is not None: - print("-- Only running tests with {} in their name" - .format(self.match)) - self.json_files = [json_file for json_file in self.json_files - if self.match in json_file.name] + print(f"-- Only running tests with {self.match} in their name") + self.json_files = [ + json_file + for json_file in self.json_files + if self.match in json_file.name + ] def run_ipc(self): - """ - Run Arrow IPC integration tests for the matrix of enabled + """Run Arrow IPC integration tests for the matrix of enabled implementations. 
""" for producer, consumer in itertools.product( - filter(lambda t: t.PRODUCER, self.testers), - filter(lambda t: t.CONSUMER, self.testers)): + filter(lambda t: t.PRODUCER, self.testers), + filter(lambda t: t.CONSUMER, self.testers), + ): self._compare_ipc_implementations( - producer, consumer, self._produce_consume, - self.json_files) + producer, consumer, self._produce_consume, self.json_files + ) for producer, consumer in itertools.product( - filter(lambda t: t.PRODUCER, self.testers), - filter(lambda t: t.CONSUMER, self.other_testers)): + filter(lambda t: t.PRODUCER, self.testers), + filter(lambda t: t.CONSUMER, self.other_testers), + ): self._compare_ipc_implementations( - producer, consumer, self._produce_consume, - self.json_files) + producer, consumer, self._produce_consume, self.json_files + ) for producer, consumer in itertools.product( - filter(lambda t: t.PRODUCER, self.other_testers), - filter(lambda t: t.CONSUMER, self.testers)): + filter(lambda t: t.PRODUCER, self.other_testers), + filter(lambda t: t.CONSUMER, self.testers), + ): self._compare_ipc_implementations( - producer, consumer, self._produce_consume, - self.json_files) + producer, consumer, self._produce_consume, self.json_files + ) if self.gold_dirs: for gold_dir, consumer in itertools.product( - self.gold_dirs, - filter(lambda t: t.CONSUMER, self.testers)): + self.gold_dirs, filter(lambda t: t.CONSUMER, self.testers) + ): with group(f"Integration: Test: IPC: Gold: {consumer.name}"): - log('\n') - log('******************************************************') - log('Tests against golden files in {}'.format(gold_dir)) - log('******************************************************') - - def run_gold(_, consumer, test_case: datagen.File): + log("\n") + log("******************************************************") + log(f"Tests against golden files in {gold_dir}") + log("******************************************************") + + def run_gold( + _, consumer, test_case: datagen.File, gold_dir=gold_dir + ): return self._run_gold(gold_dir, consumer, test_case) + self._compare_ipc_implementations( - consumer, consumer, run_gold, - self._gold_tests(gold_dir)) - log('\n') + consumer, consumer, run_gold, self._gold_tests(gold_dir) + ) + log("\n") def run_flight(self): - """ - Run Arrow Flight integration tests for the matrix of enabled + """Run Arrow Flight integration tests for the matrix of enabled implementations. """ @@ -138,44 +151,46 @@ def is_client(t): return t.FLIGHT_CLIENT and t.CONSUMER for server, client in itertools.product( - filter(is_server, self.testers), - filter(is_client, self.testers)): + filter(is_server, self.testers), filter(is_client, self.testers) + ): self._compare_flight_implementations(server, client) for server, client in itertools.product( - filter(is_server, self.testers), - filter(is_client, self.other_testers)): + filter(is_server, self.testers), filter(is_client, self.other_testers) + ): self._compare_flight_implementations(server, client) for server, client in itertools.product( - filter(is_server, self.other_testers), - filter(is_client, self.testers)): + filter(is_server, self.other_testers), filter(is_client, self.testers) + ): self._compare_flight_implementations(server, client) - log('\n') + log("\n") def run_c_data(self): - """ - Run Arrow C Data interface integration tests for the matrix of + """Run Arrow C Data interface integration tests for the matrix of enabled implementations. 
""" for producer, consumer in itertools.product( - filter(lambda t: t.C_DATA_SCHEMA_EXPORTER, self.testers), - filter(lambda t: t.C_DATA_SCHEMA_IMPORTER, self.testers)): + filter(lambda t: t.C_DATA_SCHEMA_EXPORTER, self.testers), + filter(lambda t: t.C_DATA_SCHEMA_IMPORTER, self.testers), + ): self._compare_c_data_implementations(producer, consumer) for producer, consumer in itertools.product( - filter(lambda t: t.C_DATA_SCHEMA_EXPORTER, self.testers), - filter(lambda t: t.C_DATA_SCHEMA_IMPORTER, self.other_testers)): + filter(lambda t: t.C_DATA_SCHEMA_EXPORTER, self.testers), + filter(lambda t: t.C_DATA_SCHEMA_IMPORTER, self.other_testers), + ): self._compare_c_data_implementations(producer, consumer) for producer, consumer in itertools.product( - filter(lambda t: t.C_DATA_SCHEMA_EXPORTER, self.other_testers), - filter(lambda t: t.C_DATA_SCHEMA_IMPORTER, self.testers)): + filter(lambda t: t.C_DATA_SCHEMA_EXPORTER, self.other_testers), + filter(lambda t: t.C_DATA_SCHEMA_IMPORTER, self.testers), + ): self._compare_c_data_implementations(producer, consumer) - log('\n') + log("\n") def _gold_tests(self, gold_dir): prefix = os.path.basename(os.path.normpath(gold_dir)) SUFFIX = ".json.gz" golds = [jf for jf in os.listdir(gold_dir) if jf.endswith(SUFFIX)] for json_path in golds: - name = json_path[json_path.index('_')+1: -len(SUFFIX)] + name = json_path[json_path.index("_") + 1 : -len(SUFFIX)] base_name = prefix + "_" + name + ".gold.json" out_path = os.path.join(self.temp_dir, base_name) with gzip.open(os.path.join(gold_dir, json_path)) as i: @@ -184,59 +199,64 @@ def _gold_tests(self, gold_dir): # Find the generated file with the same name as this gold file try: - equiv_json_file = next(f for f in self.json_files - if f.name == name) + equiv_json_file = next(f for f in self.json_files if f.name == name) except StopIteration: equiv_json_file = None skip_testers = set() - if name == 'union' and prefix == '0.17.1': + if name == "union" and prefix == "0.17.1": skip_testers.add("Java") skip_testers.add("JS") - if prefix == '1.0.0-bigendian' or prefix == '1.0.0-littleendian': + if prefix in ("1.0.0-bigendian", "1.0.0-littleendian"): skip_testers.add("C#") skip_testers.add("Java") skip_testers.add("JS") skip_testers.add("Rust") - if prefix == '2.0.0-compression': + if prefix == "2.0.0-compression": skip_testers.add("JS") - if prefix == '2.0.0-compression' and 'lz4' in name: + if prefix == "2.0.0-compression" and "lz4" in name: # https://github.com/apache/arrow-nanoarrow/issues/621 skip_testers.add("nanoarrow") # See https://github.com/apache/arrow/pull/9822 for how to # disable specific compression type tests. - if prefix == '4.0.0-shareddict': + if prefix == "4.0.0-shareddict": skip_testers.add("C#") # https://github.com/apache/arrow-nanoarrow/issues/622 skip_testers.add("nanoarrow") quirks = set() - if prefix in {'0.14.1', '0.17.1', - '1.0.0-bigendian', '1.0.0-littleendian'}: + if prefix in {"0.14.1", "0.17.1", "1.0.0-bigendian", "1.0.0-littleendian"}: # ARROW-13558: older versions generated decimal values that # were out of range for the given precision. 
quirks.add("no_decimal_validate") quirks.add("no_date64_validate") quirks.add("no_times_validate") - json_file = datagen.File(name, schema=None, batches=None, - path=out_path, - skip_testers=skip_testers, - quirks=quirks) + json_file = datagen.File( + name, + schema=None, + batches=None, + path=out_path, + skip_testers=skip_testers, + quirks=quirks, + ) if equiv_json_file is not None: json_file.add_skips_from(equiv_json_file) yield json_file - def _run_test_cases(self, - case_runner: Callable[[datagen.File], Outcome], - test_cases: List[datagen.File], - *, serial: Optional[bool] = None) -> None: - """ - Populate self.failures with the outcomes of the + def _run_test_cases( + self, + case_runner: Callable[[datagen.File], Outcome], + test_cases: List[datagen.File], + *, + serial: Optional[bool] = None, + ) -> None: + """Populate self.failures with the outcomes of the ``case_runner`` ran against ``test_cases`` """ + def case_wrapper(test_case): if serial: return case_runner(test_case) @@ -273,19 +293,17 @@ def _compare_ipc_implementations( producer: Tester, consumer: Tester, run_binaries: Callable[[Tester, Tester, datagen.File], None], - test_cases: List[datagen.File] + test_cases: List[datagen.File], ): - """ - Compare Arrow IPC for two implementations (one producer, one consumer). - """ + """Compare Arrow IPC for two implementations (one producer, one consumer).""" with group(f"Integration: Test: IPC: {producer.name} -> {consumer.name}"): - log('##########################################################') - log('IPC: {0} producing, {1} consuming' - .format(producer.name, consumer.name)) - log('##########################################################') + log("##########################################################") + log(f"IPC: {producer.name} producing, {consumer.name} consuming") + log("##########################################################") - case_runner = partial(self._run_ipc_test_case, - producer, consumer, run_binaries) + case_runner = partial( + self._run_ipc_test_case, producer, consumer, run_binaries + ) self._run_test_cases(case_runner, test_cases) def _run_ipc_test_case( @@ -295,23 +313,25 @@ def _run_ipc_test_case( run_binaries: Callable[[Tester, Tester, datagen.File], None], test_case: datagen.File, ) -> Outcome: - """ - Run one IPC test case. 
- """ + """Run one IPC test case.""" outcome = Outcome() json_path = test_case.path - log('=' * 70) - log('Testing file {0}'.format(json_path)) + log("=" * 70) + log(f"Testing file {json_path}") if test_case.should_skip(producer.name, SKIP_IPC): - log(f'-- Skipping test because producer {producer.name} does ' - f'not support IPC') + log( + f"-- Skipping test because producer {producer.name} does " + f"not support IPC" + ) outcome.skipped = True elif test_case.should_skip(consumer.name, SKIP_IPC): - log(f'-- Skipping test because consumer {consumer.name} does ' - f'not support IPC') + log( + f"-- Skipping test because consumer {consumer.name} does " + f"not support IPC" + ) outcome.skipped = True else: @@ -319,20 +339,16 @@ def _run_ipc_test_case( run_binaries(producer, consumer, test_case) except Exception: traceback.print_exc(file=printer.stdout) - outcome.failure = Failure(test_case, producer, consumer, - sys.exc_info()) + outcome.failure = Failure(test_case, producer, consumer, sys.exc_info()) - log('=' * 70) + log("=" * 70) return outcome - def _produce_consume(self, - producer: Tester, - consumer: Tester, - test_case: datagen.File - ) -> None: - """ - Given a producer and a consumer, run different combination of + def _produce_consume( + self, producer: Tester, consumer: Tester, test_case: datagen.File + ) -> None: + """Given a producer and a consumer, run different combination of tests for the ``test_case`` * read and write are consistent * stream to file is consistent @@ -342,31 +358,32 @@ def _produce_consume(self, file_id = guid()[:8] name = os.path.splitext(os.path.basename(json_path))[0] - producer_file_path = os.path.join(self.temp_dir, file_id + '_' + - name + '.json_as_file') - producer_stream_path = os.path.join(self.temp_dir, file_id + '_' + - name + '.producer_file_as_stream') - consumer_file_path = os.path.join(self.temp_dir, file_id + '_' + - name + '.consumer_stream_as_file') - - log('-- Creating binary inputs') + producer_file_path = os.path.join( + self.temp_dir, file_id + "_" + name + ".json_as_file" + ) + producer_stream_path = os.path.join( + self.temp_dir, file_id + "_" + name + ".producer_file_as_stream" + ) + consumer_file_path = os.path.join( + self.temp_dir, file_id + "_" + name + ".consumer_stream_as_file" + ) + + log("-- Creating binary inputs") producer.json_to_file(json_path, producer_file_path) # Validate the file - log('-- Validating file') + log("-- Validating file") consumer.validate(json_path, producer_file_path) - log('-- Validating stream') + log("-- Validating stream") producer.file_to_stream(producer_file_path, producer_stream_path) consumer.stream_to_file(producer_stream_path, consumer_file_path) consumer.validate(json_path, consumer_file_path) - def _run_gold(self, - gold_dir: str, - consumer: Tester, - test_case: datagen.File) -> None: - """ - Given a directory with: + def _run_gold( + self, gold_dir: str, consumer: Tester, test_case: datagen.File + ) -> None: + """Given a directory with: * an ``.arrow_file`` * a ``.stream`` associated to the json integration file at ``test_case.path`` @@ -379,70 +396,66 @@ def _run_gold(self, json_path = test_case.path # Validate the file - log('-- Validating file') + log("-- Validating file") producer_file_path = os.path.join( - gold_dir, "generated_" + test_case.name + ".arrow_file") - consumer.validate(json_path, producer_file_path, - quirks=test_case.quirks) + gold_dir, "generated_" + test_case.name + ".arrow_file" + ) + consumer.validate(json_path, producer_file_path, quirks=test_case.quirks) - log('-- 
Validating stream') + log("-- Validating stream") consumer_stream_path = os.path.join( - gold_dir, "generated_" + test_case.name + ".stream") + gold_dir, "generated_" + test_case.name + ".stream" + ) file_id = guid()[:8] name = os.path.splitext(os.path.basename(json_path))[0] - consumer_file_path = os.path.join(self.temp_dir, file_id + '_' + - name + '.consumer_stream_as_file') + consumer_file_path = os.path.join( + self.temp_dir, file_id + "_" + name + ".consumer_stream_as_file" + ) consumer.stream_to_file(consumer_stream_path, consumer_file_path) - consumer.validate(json_path, consumer_file_path, - quirks=test_case.quirks) + consumer.validate(json_path, consumer_file_path, quirks=test_case.quirks) - def _compare_flight_implementations( - self, - producer: Tester, - consumer: Tester - ): + def _compare_flight_implementations(self, producer: Tester, consumer: Tester): with group(f"Integration: Test: Flight: {producer.name} -> {consumer.name}"): - log('##########################################################') - log('Flight: {0} serving, {1} requesting' - .format(producer.name, consumer.name)) - log('##########################################################') + log("##########################################################") + log(f"Flight: {producer.name} serving, {consumer.name} requesting") + log("##########################################################") case_runner = partial(self._run_flight_test_case, producer, consumer) - self._run_test_cases( - case_runner, self.json_files + self.flight_scenarios) + self._run_test_cases(case_runner, self.json_files + self.flight_scenarios) - def _run_flight_test_case(self, - producer: Tester, - consumer: Tester, - test_case: datagen.File) -> Outcome: - """ - Run one Flight test case. - """ + def _run_flight_test_case( + self, producer: Tester, consumer: Tester, test_case: datagen.File + ) -> Outcome: + """Run one Flight test case.""" outcome = Outcome() - log('=' * 70) - log('Testing file {0}'.format(test_case.name)) + log("=" * 70) + log(f"Testing file {test_case.name}") if test_case.should_skip(producer.name, SKIP_FLIGHT): - log(f'-- Skipping test because producer {producer.name} does ' - f'not support Flight') + log( + f"-- Skipping test because producer {producer.name} does " + f"not support Flight" + ) outcome.skipped = True elif test_case.should_skip(consumer.name, SKIP_FLIGHT): - log(f'-- Skipping test because consumer {consumer.name} does ' - f'not support Flight') + log( + f"-- Skipping test because consumer {consumer.name} does " + f"not support Flight" + ) outcome.skipped = True else: try: if isinstance(test_case, Scenario): server = producer.flight_server(test_case.name) - client_args = {'scenario_name': test_case.name} + client_args = {"scenario_name": test_case.name} else: server = producer.flight_server() - client_args = {'json_path': test_case.path} + client_args = {"json_path": test_case.path} with server as port: # Have the client upload the file, then download and @@ -450,52 +463,61 @@ def _run_flight_test_case(self, consumer.flight_request(port, **client_args) except Exception: traceback.print_exc(file=printer.stdout) - outcome.failure = Failure(test_case, producer, consumer, - sys.exc_info()) + outcome.failure = Failure(test_case, producer, consumer, sys.exc_info()) - log('=' * 70) + log("=" * 70) return outcome - def _compare_c_data_implementations( - self, - producer: Tester, - consumer: Tester - ): - with group("Integration: Test: C Data Interface: " - f"{producer.name} -> {consumer.name}"): - 
log('##########################################################') - log(f'C Data Interface: ' - f'{producer.name} exporting, {consumer.name} importing') - log('##########################################################') + def _compare_c_data_implementations(self, producer: Tester, consumer: Tester): + with group( + f"Integration: Test: C Data Interface: {producer.name} -> {consumer.name}" + ): + log("##########################################################") + log( + f"C Data Interface: " + f"{producer.name} exporting, {consumer.name} importing" + ) + log("##########################################################") # Serial execution is required for proper memory accounting serial = True with producer.make_c_data_exporter() as exporter: with consumer.make_c_data_importer() as importer: - case_runner = partial(self._run_c_schema_test_case, - producer, consumer, - exporter, importer) + case_runner = partial( + self._run_c_schema_test_case, + producer, + consumer, + exporter, + importer, + ) self._run_test_cases(case_runner, self.json_files, serial=serial) - if producer.C_DATA_ARRAY_EXPORTER and \ - consumer.C_DATA_ARRAY_IMPORTER: - case_runner = partial(self._run_c_array_test_cases, - producer, consumer, - exporter, importer) - self._run_test_cases(case_runner, - self.json_files, - serial=serial) - - def _run_c_schema_test_case(self, - producer: Tester, consumer: Tester, - exporter: CDataExporter, - importer: CDataImporter, - test_case: datagen.File) -> Outcome: - """ - Run one C ArrowSchema test case. - """ + if ( + producer.C_DATA_ARRAY_EXPORTER + and consumer.C_DATA_ARRAY_IMPORTER + ): + case_runner = partial( + self._run_c_array_test_cases, + producer, + consumer, + exporter, + importer, + ) + self._run_test_cases( + case_runner, self.json_files, serial=serial + ) + + def _run_c_schema_test_case( + self, + producer: Tester, + consumer: Tester, + exporter: CDataExporter, + importer: CDataImporter, + test_case: datagen.File, + ) -> Outcome: + """Run one C ArrowSchema test case.""" outcome = Outcome() def do_run(): @@ -506,17 +528,21 @@ def do_run(): exporter.export_schema_from_json(json_path, c_schema_ptr) importer.import_schema_and_compare_to_json(json_path, c_schema_ptr) - log('=' * 70) - log(f'Testing C ArrowSchema from file {test_case.name!r}') + log("=" * 70) + log(f"Testing C ArrowSchema from file {test_case.name!r}") if test_case.should_skip(producer.name, SKIP_C_SCHEMA): - log(f'-- Skipping test because producer {producer.name} does ' - f'not support C ArrowSchema') + log( + f"-- Skipping test because producer {producer.name} does " + f"not support C ArrowSchema" + ) outcome.skipped = True elif test_case.should_skip(consumer.name, SKIP_C_SCHEMA): - log(f'-- Skipping test because consumer {consumer.name} does ' - f'not support C ArrowSchema') + log( + f"-- Skipping test because consumer {consumer.name} does " + f"not support C ArrowSchema" + ) outcome.skipped = True else: @@ -524,21 +550,21 @@ def do_run(): do_run() except Exception: traceback.print_exc(file=printer.stdout) - outcome.failure = Failure(test_case, producer, consumer, - sys.exc_info()) + outcome.failure = Failure(test_case, producer, consumer, sys.exc_info()) - log('=' * 70) + log("=" * 70) return outcome - def _run_c_array_test_cases(self, - producer: Tester, consumer: Tester, - exporter: CDataExporter, - importer: CDataImporter, - test_case: datagen.File) -> Outcome: - """ - Run one set C ArrowArray test cases. 
- """ + def _run_c_array_test_cases( + self, + producer: Tester, + consumer: Tester, + exporter: CDataExporter, + importer: CDataImporter, + test_case: datagen.File, + ) -> Outcome: + """Run one set C ArrowArray test cases.""" outcome = Outcome() def do_run(): @@ -546,27 +572,28 @@ def do_run(): ffi = cdata.ffi() c_array_ptr = ffi.new("struct ArrowArray*") for num_batch in range(test_case.num_batches): - log(f'... with record batch #{num_batch}') + log(f"... with record batch #{num_batch}") with cdata.check_memory_released(exporter, importer): - exporter.export_batch_from_json(json_path, - num_batch, - c_array_ptr) - importer.import_batch_and_compare_to_json(json_path, - num_batch, - c_array_ptr) + exporter.export_batch_from_json(json_path, num_batch, c_array_ptr) + importer.import_batch_and_compare_to_json( + json_path, num_batch, c_array_ptr + ) - log('=' * 70) - log(f'Testing C ArrowArray ' - f'from file {test_case.name!r}') + log("=" * 70) + log(f"Testing C ArrowArray from file {test_case.name!r}") if test_case.should_skip(producer.name, SKIP_C_ARRAY): - log(f'-- Skipping test because producer {producer.name} does ' - f'not support C ArrowArray') + log( + f"-- Skipping test because producer {producer.name} does " + f"not support C ArrowArray" + ) outcome.skipped = True elif test_case.should_skip(consumer.name, SKIP_C_ARRAY): - log(f'-- Skipping test because consumer {consumer.name} does ' - f'not support C ArrowArray') + log( + f"-- Skipping test because consumer {consumer.name} does " + f"not support C ArrowArray" + ) outcome.skipped = True else: @@ -574,32 +601,40 @@ def do_run(): do_run() except Exception: traceback.print_exc(file=printer.stdout) - outcome.failure = Failure(test_case, producer, consumer, - sys.exc_info()) + outcome.failure = Failure(test_case, producer, consumer, sys.exc_info()) - log('=' * 70) + log("=" * 70) return outcome def get_static_json_files(): - glob_pattern = os.path.join(ARROW_ROOT_DEFAULT, - 'integration', 'data', '*.json') + glob_pattern = os.path.join(ARROW_ROOT_DEFAULT, "integration", "data", "*.json") return [ - datagen.File(name=os.path.basename(p), path=p, - schema=None, batches=None) + datagen.File(name=os.path.basename(p), path=p, schema=None, batches=None) for p in glob.glob(glob_pattern) ] -def run_all_tests(with_cpp=True, with_java=True, with_js=True, - with_csharp=True, with_go=True, with_rust=False, - with_nanoarrow=False, run_ipc=False, run_flight=False, - run_c_data=False, tempdir=None, target_implementations="", - **kwargs): - tempdir = tempdir or tempfile.mkdtemp(prefix='arrow-integration-') - target_implementations = \ +def run_all_tests( + with_cpp=True, + with_java=True, + with_js=True, + with_csharp=True, + with_go=True, + with_rust=False, + with_nanoarrow=False, + run_ipc=False, + run_flight=False, + run_c_data=False, + tempdir=None, + target_implementations="", + **kwargs, +): + tempdir = tempdir or tempfile.mkdtemp(prefix="arrow-integration-") + target_implementations = ( target_implementations.split(",") if target_implementations else [] + ) testers: List[Tester] = [] other_testers: List[Tester] = [] @@ -612,30 +647,37 @@ def append_tester(implementation, tester): if with_cpp: from .tester_cpp import CppTester + append_tester("cpp", CppTester(**kwargs)) if with_java: from .tester_java import JavaTester + append_tester("java", JavaTester(**kwargs)) if with_js: from .tester_js import JSTester + append_tester("js", JSTester(**kwargs)) if with_csharp: from .tester_csharp import CSharpTester + append_tester("csharp", 
CSharpTester(**kwargs)) if with_go: from .tester_go import GoTester + append_tester("go", GoTester(**kwargs)) if with_nanoarrow: from .tester_nanoarrow import NanoarrowTester + append_tester("nanoarrow", NanoarrowTester(**kwargs)) if with_rust: from .tester_rust import RustTester + append_tester("rust", RustTester(**kwargs)) static_json_files = get_static_json_files() @@ -661,33 +703,42 @@ def append_tester(implementation, tester): ), Scenario( "expiration_time:do_get", - description=("Ensure FlightEndpoint.expiration_time with " - "DoGet is working as expected."), + description=( + "Ensure FlightEndpoint.expiration_time with " + "DoGet is working as expected." + ), skip_testers={"JS", "C#", "Rust"}, ), Scenario( "expiration_time:list_actions", - description=("Ensure FlightEndpoint.expiration_time related " - "pre-defined actions is working with ListActions " - "as expected."), + description=( + "Ensure FlightEndpoint.expiration_time related " + "pre-defined actions is working with ListActions " + "as expected." + ), skip_testers={"JS", "C#", "Rust"}, ), Scenario( "expiration_time:cancel_flight_info", - description=("Ensure FlightEndpoint.expiration_time and " - "CancelFlightInfo are working as expected."), + description=( + "Ensure FlightEndpoint.expiration_time and " + "CancelFlightInfo are working as expected." + ), skip_testers={"JS", "C#", "Rust"}, ), Scenario( "expiration_time:renew_flight_endpoint", - description=("Ensure FlightEndpoint.expiration_time and " - "RenewFlightEndpoint are working as expected."), + description=( + "Ensure FlightEndpoint.expiration_time and " + "RenewFlightEndpoint are working as expected." + ), skip_testers={"JS", "C#", "Rust"}, ), Scenario( "do_exchange:echo", - description=("Test the do_exchange method by " - "echoing data back to the client."), + description=( + "Test the do_exchange method by echoing data back to the client." 
+ ), skip_testers={"Go", "JS", "Rust"}, ), Scenario( @@ -698,37 +749,38 @@ def append_tester(implementation, tester): Scenario( "session_options", description="Ensure Flight SQL Sessions work as expected.", - skip_testers={"JS", "C#", "Rust"} + skip_testers={"JS", "C#", "Rust"}, ), Scenario( "poll_flight_info", description="Ensure PollFlightInfo is supported.", - skip_testers={"JS", "C#", "Rust"} + skip_testers={"JS", "C#", "Rust"}, ), Scenario( "app_metadata_flight_info_endpoint", description="Ensure support FlightInfo and Endpoint app_metadata", - skip_testers={"JS", "C#", "Rust"} + skip_testers={"JS", "C#", "Rust"}, ), Scenario( "flight_sql", description="Ensure Flight SQL protocol is working as expected.", - skip_testers={"Rust", "C#"} + skip_testers={"Rust", "C#"}, ), Scenario( "flight_sql:extension", description="Ensure Flight SQL extensions work as expected.", - skip_testers={"Rust", "C#"} + skip_testers={"Rust", "C#"}, ), Scenario( "flight_sql:ingestion", description="Ensure Flight SQL ingestion works as expected.", - skip_testers={"JS", "C#", "Rust"} + skip_testers={"JS", "C#", "Rust"}, ), ] - runner = IntegrationRunner(json_files, flight_scenarios, testers, - other_testers, **kwargs) + runner = IntegrationRunner( + json_files, flight_scenarios, testers, other_testers, **kwargs + ) if run_ipc: runner.run_ipc() if run_flight: @@ -743,11 +795,16 @@ def append_tester(implementation, tester): for test_case, producer, consumer, exc_info in runner.failures: fail_count += 1 log("FAILED TEST:", end=" ") - log(test_case.name, producer.name, "producing, ", - consumer.name, "consuming") + log( + test_case.name, + producer.name, + "producing, ", + consumer.name, + "consuming", + ) if exc_info: exc_type, exc_value, exc_tb = exc_info - log(f'{exc_type}: {exc_value}') + log(f"{exc_type}: {exc_value}") log() log(f"{fail_count} failures, {len(runner.skips)} skips") @@ -756,33 +813,27 @@ def append_tester(implementation, tester): def write_js_test_json(directory): - datagen.generate_primitive_case([], name='primitive_no_batches').write( - os.path.join(directory, 'primitive-no-batches.json') + datagen.generate_primitive_case([], name="primitive_no_batches").write( + os.path.join(directory, "primitive-no-batches.json") ) - datagen.generate_primitive_case([17, 20], name='primitive').write( - os.path.join(directory, 'primitive.json') + datagen.generate_primitive_case([17, 20], name="primitive").write( + os.path.join(directory, "primitive.json") ) - datagen.generate_primitive_case([0, 0, 0], name='primitive_zerolength').write( - os.path.join(directory, 'primitive-empty.json') + datagen.generate_primitive_case([0, 0, 0], name="primitive_zerolength").write( + os.path.join(directory, "primitive-empty.json") ) # datagen.generate_primitive_large_offsets_case([17, 20]).write( # os.path.join(directory, 'primitive-large-offsets.json') # ) - datagen.generate_null_case([10, 0]).write( - os.path.join(directory, 'null.json') - ) + datagen.generate_null_case([10, 0]).write(os.path.join(directory, "null.json")) datagen.generate_null_trivial_case([0, 0]).write( - os.path.join(directory, 'null-trivial.json') - ) - datagen.generate_decimal128_case().write( - os.path.join(directory, 'decimal128.json') + os.path.join(directory, "null-trivial.json") ) + datagen.generate_decimal128_case().write(os.path.join(directory, "decimal128.json")) # datagen.generate_decimal256_case().write( # os.path.join(directory, 'decimal256.json') # ) - datagen.generate_datetime_case().write( - os.path.join(directory, 'datetime.json') - ) + 
datagen.generate_datetime_case().write(os.path.join(directory, "datetime.json")) # datagen.generate_duration_case().write( # os.path.join(directory, 'duration.json') # ) @@ -792,42 +843,32 @@ def write_js_test_json(directory): # datagen.generate_month_day_nano_interval_case().write( # os.path.join(directory, 'month_day_nano_interval.json') # ) - datagen.generate_map_case().write( - os.path.join(directory, 'map.json') - ) + datagen.generate_map_case().write(os.path.join(directory, "map.json")) datagen.generate_non_canonical_map_case().write( - os.path.join(directory, 'non_canonical_map.json') - ) - datagen.generate_nested_case().write( - os.path.join(directory, 'nested.json') + os.path.join(directory, "non_canonical_map.json") ) + datagen.generate_nested_case().write(os.path.join(directory, "nested.json")) datagen.generate_recursive_nested_case().write( - os.path.join(directory, 'recursive-nested.json') + os.path.join(directory, "recursive-nested.json") ) # datagen.generate_nested_large_offsets_case().write( # os.path.join(directory, 'nested-large-offsets.json') # ) - datagen.generate_unions_case().write( - os.path.join(directory, 'unions.json') - ) + datagen.generate_unions_case().write(os.path.join(directory, "unions.json")) datagen.generate_custom_metadata_case().write( - os.path.join(directory, 'custom-metadata.json') + os.path.join(directory, "custom-metadata.json") ) # datagen.generate_duplicate_fieldnames_case().write( # os.path.join(directory, 'duplicate-fieldnames.json') # ) - datagen.generate_dictionary_case().write( - os.path.join(directory, 'dictionary.json') - ) + datagen.generate_dictionary_case().write(os.path.join(directory, "dictionary.json")) datagen.generate_dictionary_unsigned_case().write( - os.path.join(directory, 'dictionary-unsigned.json') + os.path.join(directory, "dictionary-unsigned.json") ) datagen.generate_nested_dictionary_case().write( - os.path.join(directory, 'dictionary-nested.json') + os.path.join(directory, "dictionary-nested.json") ) # datagen.generate_run_end_encoded_case().write( # os.path.join(directory, 'run_end_encoded.json') # ) - datagen.generate_extension_case().write( - os.path.join(directory, 'extension.json') - ) + datagen.generate_extension_case().write(os.path.join(directory, "extension.json")) diff --git a/dev/archery/archery/integration/scenario.py b/dev/archery/archery/integration/scenario.py index 89c64452e5fc5..cf608b3b7089e 100644 --- a/dev/archery/archery/integration/scenario.py +++ b/dev/archery/archery/integration/scenario.py @@ -14,11 +14,11 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations class Scenario: - """ - An integration test scenario for Arrow Flight. + """An integration test scenario for Arrow Flight. Does not correspond to a particular IPC JSON file. """ diff --git a/dev/archery/archery/integration/tester.py b/dev/archery/archery/integration/tester.py index 7de8f73c1398c..3004cedbd499d 100644 --- a/dev/archery/archery/integration/tester.py +++ b/dev/archery/archery/integration/tester.py @@ -16,25 +16,24 @@ # under the License. 
# Base class for language-specific integration test harnesses +from __future__ import annotations -from abc import ABC, abstractmethod -import os import subprocess import typing +from abc import ABC, abstractmethod from .util import log +if typing.TYPE_CHECKING: + import os _Predicate = typing.Callable[[], bool] class CDataExporter(ABC): - @abstractmethod - def export_schema_from_json(self, json_path: os.PathLike, - c_schema_ptr: object): - """ - Read a JSON integration file and export its schema. + def export_schema_from_json(self, json_path: os.PathLike, c_schema_ptr: object): + """Read a JSON integration file and export its schema. Parameters ---------- @@ -42,14 +41,14 @@ def export_schema_from_json(self, json_path: os.PathLike, Path to the JSON file c_schema_ptr : cffi pointer value Pointer to the ``ArrowSchema`` struct to export to. + """ @abstractmethod - def export_batch_from_json(self, json_path: os.PathLike, - num_batch: int, - c_array_ptr: object): - """ - Read a JSON integration file and export one of its batches. + def export_batch_from_json( + self, json_path: os.PathLike, num_batch: int, c_array_ptr: object + ): + """Read a JSON integration file and export one of its batches. Parameters ---------- @@ -59,13 +58,13 @@ def export_batch_from_json(self, json_path: os.PathLike, Number of the record batch in the JSON file c_schema_ptr : cffi pointer value Pointer to the ``ArrowArray`` struct to export to. + """ @property @abstractmethod def supports_releasing_memory(self) -> bool: - """ - Whether the implementation is able to release memory deterministically. + """Whether the implementation is able to release memory deterministically. Here, "release memory" means that, after the `release` callback of a C Data Interface export is called, `run_gc` is able to trigger @@ -76,20 +75,19 @@ def supports_releasing_memory(self) -> bool: """ def record_allocation_state(self) -> object: - """ - Return the current memory allocation state. + """Return the current memory allocation state. Returns ------- state : object Equality-comparable object representing the allocation state, for example the number of allocated or exported bytes. + """ raise NotImplementedError def run_gc(self): - """ - Run the GC if necessary. + """Run the GC if necessary. This should ensure that any temporary objects and data created by previous exporter calls are collected. @@ -97,17 +95,14 @@ def run_gc(self): @property def required_gc_runs(self): - """ - The maximum number of calls to `run_gc` that need to be issued to + """The maximum number of calls to `run_gc` that need to be issued to ensure proper deallocation. Some implementations may require this to be greater than one. """ return 1 def close(self): - """ - Final cleanup after usage. - """ + """Final cleanup after usage.""" def __enter__(self): return self @@ -117,12 +112,11 @@ def __exit__(self, *exc): class CDataImporter(ABC): - @abstractmethod - def import_schema_and_compare_to_json(self, json_path: os.PathLike, - c_schema_ptr: object): - """ - Import schema and compare it to the schema of a JSON integration file. + def import_schema_and_compare_to_json( + self, json_path: os.PathLike, c_schema_ptr: object + ): + """Import schema and compare it to the schema of a JSON integration file. An error is raised if importing fails or the schemas differ. @@ -132,14 +126,14 @@ def import_schema_and_compare_to_json(self, json_path: os.PathLike, The path to the JSON file c_schema_ptr : cffi pointer value Pointer to the ``ArrowSchema`` struct to import from. 
+ """ @abstractmethod - def import_batch_and_compare_to_json(self, json_path: os.PathLike, - num_batch: int, - c_array_ptr: object): - """ - Import record batch and compare it to one of the batches + def import_batch_and_compare_to_json( + self, json_path: os.PathLike, num_batch: int, c_array_ptr: object + ): + """Import record batch and compare it to one of the batches from a JSON integration file. The schema used for importing the record batch is the one from @@ -155,13 +149,13 @@ def import_batch_and_compare_to_json(self, json_path: os.PathLike, Number of the record batch in the JSON file c_array_ptr : cffi pointer value Pointer to the ``ArrowArray`` struct to import from. + """ @property @abstractmethod def supports_releasing_memory(self) -> bool: - """ - Whether the implementation is able to release memory deterministically. + """Whether the implementation is able to release memory deterministically. Here, "release memory" means `run_gc()` is able to trigger the `release` callback of a C Data Interface export (which would then @@ -169,40 +163,35 @@ def supports_releasing_memory(self) -> bool: """ def run_gc(self): - """ - Run the GC if necessary. + """Run the GC if necessary. This should ensure that any imported data has its release callback called. """ @property def required_gc_runs(self): - """ - The maximum number of calls to `run_gc` that need to be issued to + """The maximum number of calls to `run_gc` that need to be issued to ensure release callbacks are triggered. Some implementations may require this to be greater than one. """ return 1 def close(self): - """ - Final cleanup after usage. - """ + """Final cleanup after usage.""" def __enter__(self): return self def __exit__(self, *exc): # Make sure any exported data is released. - for i in range(self.required_gc_runs): + for _ in range(self.required_gc_runs): self.run_gc() self.close() class Tester: - """ - The interface to declare a tester to run integration tests against. 
- """ + """The interface to declare a tester to run integration tests against.""" + # whether the language supports producing / writing IPC PRODUCER = False # whether the language supports consuming / reading IPC @@ -226,35 +215,30 @@ def __init__(self, debug=False, **args): self.debug = debug def run_shell_command(self, cmd, **kwargs): - cmd = ' '.join(cmd) + cmd = " ".join(cmd) if self.debug: log(cmd) - kwargs.update(shell=True) + kwargs.update(shell=True) # noqa: S604 subprocess.check_call(cmd, **kwargs) def json_to_file(self, json_path, arrow_path): - """ - Run the conversion of an Arrow JSON integration file + """Run the conversion of an Arrow JSON integration file to an Arrow IPC file """ raise NotImplementedError def stream_to_file(self, stream_path, file_path): - """ - Run the conversion of an Arrow IPC stream to an + """Run the conversion of an Arrow IPC stream to an Arrow IPC file """ raise NotImplementedError def file_to_stream(self, file_path, stream_path): - """ - Run the conversion of an Arrow IPC file to an Arrow IPC stream - """ + """Run the conversion of an Arrow IPC file to an Arrow IPC stream""" raise NotImplementedError def validate(self, json_path, arrow_path, quirks=None): - """ - Validate that the Arrow IPC file is equal to the corresponding + """Validate that the Arrow IPC file is equal to the corresponding Arrow JSON integration file """ raise NotImplementedError diff --git a/dev/archery/archery/integration/tester_cpp.py b/dev/archery/archery/integration/tester_cpp.py index 2a47bc830886a..cbb291d791252 100644 --- a/dev/archery/archery/integration/tester_cpp.py +++ b/dev/archery/archery/integration/tester_cpp.py @@ -14,17 +14,17 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import contextlib import functools import os import subprocess -from . import cdata -from .tester import Tester, CDataExporter, CDataImporter -from .util import run_cmd, log from ..utils.source import ARROW_ROOT_DEFAULT - +from . 
import cdata +from .tester import CDataExporter, CDataImporter, Tester +from .util import log, run_cmd _EXE_PATH = os.environ.get( "ARROW_CPP_EXE_PATH", os.path.join(ARROW_ROOT_DEFAULT, "cpp/build/debug") @@ -33,8 +33,7 @@ _STREAM_TO_FILE = os.path.join(_EXE_PATH, "arrow-stream-to-file") _FILE_TO_STREAM = os.path.join(_EXE_PATH, "arrow-file-to-stream") -_FLIGHT_SERVER_CMD = [os.path.join( - _EXE_PATH, "flight-test-integration-server")] +_FLIGHT_SERVER_CMD = [os.path.join(_EXE_PATH, "flight-test-integration-server")] _FLIGHT_CLIENT_CMD = [ os.path.join(_EXE_PATH, "flight-test-integration-client"), "-host", @@ -55,24 +54,18 @@ class CppTester(Tester): C_DATA_SCHEMA_IMPORTER = True C_DATA_ARRAY_IMPORTER = True - name = 'C++' + name = "C++" - def _run( - self, - arrow_path=None, - json_path=None, - command='VALIDATE', - quirks=None - ): - cmd = [_INTEGRATION_EXE, '--integration'] + def _run(self, arrow_path=None, json_path=None, command="VALIDATE", quirks=None): + cmd = [_INTEGRATION_EXE, "--integration"] if arrow_path is not None: - cmd.append('--arrow=' + arrow_path) + cmd.append("--arrow=" + arrow_path) if json_path is not None: - cmd.append('--json=' + json_path) + cmd.append("--json=" + json_path) - cmd.append('--mode=' + command) + cmd.append("--mode=" + command) if quirks: if "no_decimal_validate" in quirks: @@ -83,36 +76,32 @@ def _run( cmd.append("--validate_times=false") if self.debug: - log(' '.join(cmd)) + log(" ".join(cmd)) run_cmd(cmd) def validate(self, json_path, arrow_path, quirks=None): - return self._run(arrow_path, json_path, 'VALIDATE', quirks=quirks) + return self._run(arrow_path, json_path, "VALIDATE", quirks=quirks) def json_to_file(self, json_path, arrow_path): - return self._run(arrow_path, json_path, 'JSON_TO_ARROW') + return self._run(arrow_path, json_path, "JSON_TO_ARROW") def stream_to_file(self, stream_path, file_path): - cmd = [_STREAM_TO_FILE, '<', stream_path, '>', file_path] + cmd = [_STREAM_TO_FILE, "<", stream_path, ">", file_path] self.run_shell_command(cmd) def file_to_stream(self, file_path, stream_path): - cmd = [_FILE_TO_STREAM, file_path, '>', stream_path] + cmd = [_FILE_TO_STREAM, file_path, ">", stream_path] self.run_shell_command(cmd) @contextlib.contextmanager def flight_server(self, scenario_name=None): - cmd = _FLIGHT_SERVER_CMD + ['-port=0'] + cmd = _FLIGHT_SERVER_CMD + ["-port=0"] if scenario_name: cmd = cmd + ["-scenario", scenario_name] if self.debug: log(" ".join(cmd)) - server = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) + server = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) try: output = server.stdout.readline().decode() if not output.startswith("Server listening on localhost:"): @@ -120,9 +109,7 @@ def flight_server(self, scenario_name=None): out, err = server.communicate() raise RuntimeError( "Flight-C++ server did not start properly, " - "stdout:\n{}\n\nstderr:\n{}\n".format( - output + out.decode(), err.decode() - ) + f"stdout:\n{output + out.decode()}\n\nstderr:\n{err.decode()}\n" ) port = int(output.split(":")[1]) yield port @@ -131,16 +118,16 @@ def flight_server(self, scenario_name=None): server.wait(5) def flight_request(self, port, json_path=None, scenario_name=None): - cmd = _FLIGHT_CLIENT_CMD + [f'-port={port}'] + cmd = _FLIGHT_CLIENT_CMD + [f"-port={port}"] if json_path: - cmd.extend(('-path', json_path)) + cmd.extend(("-path", json_path)) elif scenario_name: - cmd.extend(('-scenario', scenario_name)) + cmd.extend(("-scenario", scenario_name)) else: raise 
TypeError("Must provide one of json_path or scenario_name") if self.debug: - log(' '.join(cmd)) + log(" ".join(cmd)) run_cmd(cmd) def make_c_data_exporter(self): @@ -167,15 +154,14 @@ def make_c_data_importer(self): @functools.lru_cache def _load_ffi(ffi, lib_path=_ARROW_DLL): - os.environ['ARROW_DEBUG_MEMORY_POOL'] = 'trap' + os.environ["ARROW_DEBUG_MEMORY_POOL"] = "trap" ffi.cdef(_cpp_c_data_entrypoints) dll = ffi.dlopen(lib_path) - dll.ArrowCpp_CDataIntegration_ExportSchemaFromJson + dll.ArrowCpp_CDataIntegration_ExportSchemaFromJson # noqa: B018 return dll class _CDataBase: - def __init__(self, debug, args): self.debug = debug self.args = args @@ -183,30 +169,28 @@ def __init__(self, debug, args): self.dll = _load_ffi(self.ffi) def _check_c_error(self, c_error): - """ - Check a `const char*` error return from an integration entrypoint. + """Check a `const char*` error return from an integration entrypoint. A null means success, a non-empty string is an error message. The string is statically allocated on the C++ side. """ assert self.ffi.typeof(c_error) is self.ffi.typeof("const char*") if c_error != self.ffi.NULL: - error = self.ffi.string(c_error).decode('utf8', - errors='replace') - raise RuntimeError( - f"C++ C Data Integration call failed: {error}") + error = self.ffi.string(c_error).decode("utf8", errors="replace") + raise RuntimeError(f"C++ C Data Integration call failed: {error}") class CppCDataExporter(CDataExporter, _CDataBase): - def export_schema_from_json(self, json_path, c_schema_ptr): c_error = self.dll.ArrowCpp_CDataIntegration_ExportSchemaFromJson( - str(json_path).encode(), c_schema_ptr) + str(json_path).encode(), c_schema_ptr + ) self._check_c_error(c_error) def export_batch_from_json(self, json_path, num_batch, c_array_ptr): c_error = self.dll.ArrowCpp_CDataIntegration_ExportBatchFromJson( - str(json_path).encode(), num_batch, c_array_ptr) + str(json_path).encode(), num_batch, c_array_ptr + ) self._check_c_error(c_error) @property @@ -218,16 +202,16 @@ def record_allocation_state(self): class CppCDataImporter(CDataImporter, _CDataBase): - def import_schema_and_compare_to_json(self, json_path, c_schema_ptr): c_error = self.dll.ArrowCpp_CDataIntegration_ImportSchemaAndCompareToJson( - str(json_path).encode(), c_schema_ptr) + str(json_path).encode(), c_schema_ptr + ) self._check_c_error(c_error) - def import_batch_and_compare_to_json(self, json_path, num_batch, - c_array_ptr): + def import_batch_and_compare_to_json(self, json_path, num_batch, c_array_ptr): c_error = self.dll.ArrowCpp_CDataIntegration_ImportBatchAndCompareToJson( - str(json_path).encode(), num_batch, c_array_ptr) + str(json_path).encode(), num_batch, c_array_ptr + ) self._check_c_error(c_error) @property diff --git a/dev/archery/archery/integration/tester_csharp.py b/dev/archery/archery/integration/tester_csharp.py index 50b3499fbf285..c4cdf588647cc 100644 --- a/dev/archery/archery/integration/tester_csharp.py +++ b/dev/archery/archery/integration/tester_csharp.py @@ -14,31 +14,33 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations -from contextlib import contextmanager import os import subprocess +from contextlib import contextmanager -from . import cdata -from .tester import Tester, CDataExporter, CDataImporter -from .util import run_cmd, log from ..utils.source import ARROW_ROOT_DEFAULT - +from . 
import cdata +from .tester import CDataExporter, CDataImporter, Tester +from .util import log, run_cmd _ARTIFACTS_PATH = os.path.join(ARROW_ROOT_DEFAULT, "csharp/artifacts") _BUILD_SUBDIR = "Debug/net8.0" -_EXE_PATH = os.path.join(_ARTIFACTS_PATH, - "Apache.Arrow.IntegrationTest", - _BUILD_SUBDIR, - "Apache.Arrow.IntegrationTest", - ) +_EXE_PATH = os.path.join( + _ARTIFACTS_PATH, + "Apache.Arrow.IntegrationTest", + _BUILD_SUBDIR, + "Apache.Arrow.IntegrationTest", +) -_FLIGHT_EXE_PATH = os.path.join(_ARTIFACTS_PATH, - "Apache.Arrow.Flight.IntegrationTest", - _BUILD_SUBDIR, - "Apache.Arrow.Flight.IntegrationTest", - ) +_FLIGHT_EXE_PATH = os.path.join( + _ARTIFACTS_PATH, + "Apache.Arrow.Flight.IntegrationTest", + _BUILD_SUBDIR, + "Apache.Arrow.Flight.IntegrationTest", +) _clr_loaded = False @@ -47,26 +49,29 @@ def _load_clr(): global _clr_loaded if not _clr_loaded: _clr_loaded = True - os.environ['DOTNET_GCHeapHardLimit'] = '0xC800000' # 200 MiB + os.environ["DOTNET_GCHeapHardLimit"] = "0xC800000" # 200 MiB import pythonnet + pythonnet.load("coreclr") import clr + clr.AddReference( f"{_ARTIFACTS_PATH}/Apache.Arrow.IntegrationTest/" - f"{_BUILD_SUBDIR}/Apache.Arrow.IntegrationTest.dll") + f"{_BUILD_SUBDIR}/Apache.Arrow.IntegrationTest.dll" + ) clr.AddReference( f"{_ARTIFACTS_PATH}/Apache.Arrow.Tests/" - f"{_BUILD_SUBDIR}/Apache.Arrow.Tests.dll") + f"{_BUILD_SUBDIR}/Apache.Arrow.Tests.dll" + ) from Apache.Arrow.IntegrationTest import CDataInterface + CDataInterface.Initialize() @contextmanager def _disposing(disposable): - """ - Ensure the IDisposable object is disposed of when the enclosed block exits. - """ + """Ensure the IDisposable object is disposed of when the enclosed block exits.""" try: yield disposable finally: @@ -74,7 +79,6 @@ def _disposing(disposable): class _CDataBase: - def __init__(self, debug, args): self.debug = debug self.args = args @@ -82,7 +86,7 @@ def __init__(self, debug, args): _load_clr() def _pointer_to_int(self, c_ptr): - return int(self.ffi.cast('uintptr_t', c_ptr)) + return int(self.ffi.cast("uintptr_t", c_ptr)) def _read_batch_from_json(self, json_path, num_batch): from Apache.Arrow.IntegrationTest import CDataInterface @@ -91,25 +95,25 @@ def _read_batch_from_json(self, json_path, num_batch): def _run_gc(self): from Apache.Arrow.IntegrationTest import CDataInterface + CDataInterface.RunGC() class CSharpCDataExporter(CDataExporter, _CDataBase): - def export_schema_from_json(self, json_path, c_schema_ptr): from Apache.Arrow.IntegrationTest import CDataInterface jf = CDataInterface.ParseJsonFile(json_path) - CDataInterface.ExportSchema(jf.Schema.ToArrow(), - self._pointer_to_int(c_schema_ptr)) + CDataInterface.ExportSchema( + jf.Schema.ToArrow(), self._pointer_to_int(c_schema_ptr) + ) def export_batch_from_json(self, json_path, num_batch, c_array_ptr): from Apache.Arrow.IntegrationTest import CDataInterface _, batch = self._read_batch_from_json(json_path, num_batch) with _disposing(batch): - CDataInterface.ExportRecordBatch(batch, - self._pointer_to_int(c_array_ptr)) + CDataInterface.ExportRecordBatch(batch, self._pointer_to_int(c_array_ptr)) @property def supports_releasing_memory(self): @@ -121,28 +125,29 @@ def run_gc(self): class CSharpCDataImporter(CDataImporter, _CDataBase): - def import_schema_and_compare_to_json(self, json_path, c_schema_ptr): from Apache.Arrow.IntegrationTest import CDataInterface from Apache.Arrow.Tests import SchemaComparer jf = CDataInterface.ParseJsonFile(json_path) imported_schema = CDataInterface.ImportSchema( - 
self._pointer_to_int(c_schema_ptr)) + self._pointer_to_int(c_schema_ptr) + ) SchemaComparer.Compare(jf.Schema.ToArrow(), imported_schema) - def import_batch_and_compare_to_json(self, json_path, num_batch, - c_array_ptr): + def import_batch_and_compare_to_json(self, json_path, num_batch, c_array_ptr): from Apache.Arrow.IntegrationTest import CDataInterface from Apache.Arrow.Tests import ArrowReaderVerifier schema, batch = self._read_batch_from_json(json_path, num_batch) with _disposing(batch): imported_batch = CDataInterface.ImportRecordBatch( - self._pointer_to_int(c_array_ptr), schema) + self._pointer_to_int(c_array_ptr), schema + ) with _disposing(imported_batch): - ArrowReaderVerifier.CompareBatches(batch, imported_batch, - strictCompare=False) + ArrowReaderVerifier.CompareBatches( + batch, imported_batch, strictCompare=False + ) @property def supports_releasing_memory(self): @@ -162,40 +167,40 @@ class CSharpTester(Tester): C_DATA_ARRAY_EXPORTER = True C_DATA_ARRAY_IMPORTER = True - name = 'C#' + name = "C#" - def _run(self, json_path=None, arrow_path=None, command='validate'): + def _run(self, json_path=None, arrow_path=None, command="validate"): cmd = [_EXE_PATH] - cmd.extend(['--mode', command]) + cmd.extend(["--mode", command]) if json_path is not None: - cmd.extend(['-j', json_path]) + cmd.extend(["-j", json_path]) if arrow_path is not None: - cmd.extend(['-a', arrow_path]) + cmd.extend(["-a", arrow_path]) if self.debug: - log(' '.join(cmd)) + log(" ".join(cmd)) run_cmd(cmd) def validate(self, json_path, arrow_path, quirks=None): - return self._run(json_path, arrow_path, 'validate') + return self._run(json_path, arrow_path, "validate") def json_to_file(self, json_path, arrow_path): - return self._run(json_path, arrow_path, 'json-to-arrow') + return self._run(json_path, arrow_path, "json-to-arrow") def stream_to_file(self, stream_path, file_path): cmd = [_EXE_PATH] - cmd.extend(['--mode', 'stream-to-file', '-a', file_path]) - cmd.extend(['<', stream_path]) + cmd.extend(["--mode", "stream-to-file", "-a", file_path]) + cmd.extend(["<", stream_path]) self.run_shell_command(cmd) def file_to_stream(self, file_path, stream_path): cmd = [_EXE_PATH] - cmd.extend(['--mode', 'file-to-stream']) - cmd.extend(['-a', file_path, '>', stream_path]) + cmd.extend(["--mode", "file-to-stream"]) + cmd.extend(["-a", file_path, ">", stream_path]) self.run_shell_command(cmd) def make_c_data_exporter(self): @@ -205,27 +210,26 @@ def make_c_data_importer(self): return CSharpCDataImporter(self.debug, self.args) def flight_request(self, port, json_path=None, scenario_name=None): - cmd = [_FLIGHT_EXE_PATH, 'client', '--port', f'{port}'] + cmd = [_FLIGHT_EXE_PATH, "client", "--port", f"{port}"] if json_path: - cmd.extend(['--path', json_path]) + cmd.extend(["--path", json_path]) elif scenario_name: - cmd.extend(['--scenario', scenario_name]) + cmd.extend(["--scenario", scenario_name]) else: raise TypeError("Must provide one of json_path or scenario_name") if self.debug: - log(' '.join(cmd)) + log(" ".join(cmd)) run_cmd(cmd) @contextmanager def flight_server(self, scenario_name=None): - cmd = [_FLIGHT_EXE_PATH, 'server'] + cmd = [_FLIGHT_EXE_PATH, "server"] if scenario_name: - cmd.extend(['--scenario', scenario_name]) + cmd.extend(["--scenario", scenario_name]) if self.debug: - log(' '.join(cmd)) - server = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + log(" ".join(cmd)) + server = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) try: output = 
server.stdout.readline().decode() @@ -233,12 +237,10 @@ def flight_server(self, scenario_name=None): server.kill() out, err = server.communicate() raise RuntimeError( - '.NET Flight server did not start properly, ' - 'stdout: \n{}\n\nstderr:\n{}\n'.format( - output + out.decode(), err.decode() - ) + ".NET Flight server did not start properly, " + f"stdout: \n{output + out.decode()}\n\nstderr:\n{err.decode()}\n" ) - port = int(output.split(':')[-1]) + port = int(output.split(":")[-1]) yield port finally: server.kill() diff --git a/dev/archery/archery/integration/tester_go.py b/dev/archery/archery/integration/tester_go.py index b59cd9d113291..5ffaeb05ac5b2 100644 --- a/dev/archery/archery/integration/tester_go.py +++ b/dev/archery/archery/integration/tester_go.py @@ -14,17 +14,17 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import contextlib import functools import os import subprocess -from . import cdata -from .tester import Tester, CDataExporter, CDataImporter -from .util import run_cmd, log from ..utils.source import ARROW_ROOT_DEFAULT - +from . import cdata +from .tester import CDataExporter, CDataImporter, Tester +from .util import log, run_cmd # FIXME(sbinet): revisit for Go modules _HOME = os.getenv("HOME", "~") @@ -42,9 +42,7 @@ "localhost", ] -_DLL_PATH = os.path.join( - ARROW_ROOT_DEFAULT, - "go/arrow/internal/cdata_integration") +_DLL_PATH = os.path.join(ARROW_ROOT_DEFAULT, "go/arrow/internal/cdata_integration") _INTEGRATION_DLL = os.path.join(_DLL_PATH, "arrow_go_integration" + cdata.dll_suffix) @@ -58,78 +56,73 @@ class GoTester(Tester): C_DATA_SCHEMA_IMPORTER = True C_DATA_ARRAY_IMPORTER = True - name = 'Go' + name = "Go" - def _run(self, arrow_path=None, json_path=None, command='VALIDATE'): + def _run(self, arrow_path=None, json_path=None, command="VALIDATE"): cmd = [_GO_INTEGRATION_EXE] if arrow_path is not None: - cmd.extend(['-arrow', arrow_path]) + cmd.extend(["-arrow", arrow_path]) if json_path is not None: - cmd.extend(['-json', json_path]) + cmd.extend(["-json", json_path]) - cmd.extend(['-mode', command]) + cmd.extend(["-mode", command]) if self.debug: - log(' '.join(cmd)) + log(" ".join(cmd)) run_cmd(cmd) def validate(self, json_path, arrow_path, quirks=None): - return self._run(arrow_path, json_path, 'VALIDATE') + return self._run(arrow_path, json_path, "VALIDATE") def json_to_file(self, json_path, arrow_path): - return self._run(arrow_path, json_path, 'JSON_TO_ARROW') + return self._run(arrow_path, json_path, "JSON_TO_ARROW") def stream_to_file(self, stream_path, file_path): - cmd = [_STREAM_TO_FILE, '<', stream_path, '>', file_path] + cmd = [_STREAM_TO_FILE, "<", stream_path, ">", file_path] self.run_shell_command(cmd) def file_to_stream(self, file_path, stream_path): - cmd = [_FILE_TO_STREAM, file_path, '>', stream_path] + cmd = [_FILE_TO_STREAM, file_path, ">", stream_path] self.run_shell_command(cmd) @contextlib.contextmanager def flight_server(self, scenario_name=None): - cmd = _FLIGHT_SERVER_CMD + ['-port=0'] + cmd = _FLIGHT_SERVER_CMD + ["-port=0"] if scenario_name: - cmd = cmd + ['-scenario', scenario_name] + cmd = cmd + ["-scenario", scenario_name] if self.debug: - log(' '.join(cmd)) - server = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + log(" ".join(cmd)) + server = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) try: output = server.stdout.readline().decode() - if not 
output.startswith('Server listening on localhost:'): + if not output.startswith("Server listening on localhost:"): server.kill() out, err = server.communicate() raise RuntimeError( - 'Flight-Go server did not start properly, ' - 'stdout: \n{}\n\nstderr:\n{}\n'.format( - output + out.decode(), err.decode() - ) + "Flight-Go server did not start properly, " + f"stdout: \n{output + out.decode()}\n\nstderr:\n{err.decode()}\n" ) - port = int(output.split(':')[1]) + port = int(output.split(":")[1]) yield port finally: server.kill() server.wait(5) def flight_request(self, port, json_path=None, scenario_name=None): - cmd = _FLIGHT_CLIENT_CMD + [ - '-port=' + str(port), - ] + cmd = _FLIGHT_CLIENT_CMD + ["-port=" + str(port)] if json_path: - cmd.extend(('-path', json_path)) + cmd.extend(("-path", json_path)) elif scenario_name: - cmd.extend(('-scenario', scenario_name)) + cmd.extend(("-scenario", scenario_name)) else: - raise TypeError('Must provide one of json_path or scenario_name') + raise TypeError("Must provide one of json_path or scenario_name") if self.debug: - log(' '.join(cmd)) + log(" ".join(cmd)) run_cmd(cmd) def make_c_data_exporter(self): @@ -168,7 +161,6 @@ def _load_ffi(ffi, lib_path=_INTEGRATION_DLL): class _CDataBase: - def __init__(self, debug, args): self.debug = debug self.args = args @@ -176,11 +168,10 @@ def __init__(self, debug, args): self.dll = _load_ffi(self.ffi) def _pointer_to_int(self, c_ptr): - return self.ffi.cast('uintptr_t', c_ptr) + return self.ffi.cast("uintptr_t", c_ptr) def _check_go_error(self, go_error): - """ - Check a `const char*` error return from an integration entrypoint. + """Check a `const char*` error return from an integration entrypoint. A null means success, a non-empty string is an error message. The string is dynamically allocated on the Go side. 
@@ -188,10 +179,8 @@ def _check_go_error(self, go_error): assert self.ffi.typeof(go_error) is self.ffi.typeof("const char*") if go_error != self.ffi.NULL: try: - error = self.ffi.string(go_error).decode('utf8', - errors='replace') - raise RuntimeError( - f"Go C Data Integration call failed: {error}") + error = self.ffi.string(go_error).decode("utf8", errors="replace") + raise RuntimeError(f"Go C Data Integration call failed: {error}") finally: self.dll.ArrowGo_FreeError(go_error) @@ -203,13 +192,14 @@ class GoCDataExporter(CDataExporter, _CDataBase): def export_schema_from_json(self, json_path, c_schema_ptr): go_error = self.dll.ArrowGo_ExportSchemaFromJson( - str(json_path).encode(), self._pointer_to_int(c_schema_ptr)) + str(json_path).encode(), self._pointer_to_int(c_schema_ptr) + ) self._check_go_error(go_error) def export_batch_from_json(self, json_path, num_batch, c_array_ptr): go_error = self.dll.ArrowGo_ExportBatchFromJson( - str(json_path).encode(), num_batch, - self._pointer_to_int(c_array_ptr)) + str(json_path).encode(), num_batch, self._pointer_to_int(c_array_ptr) + ) self._check_go_error(go_error) @property @@ -224,17 +214,16 @@ def record_allocation_state(self): class GoCDataImporter(CDataImporter, _CDataBase): - def import_schema_and_compare_to_json(self, json_path, c_schema_ptr): go_error = self.dll.ArrowGo_ImportSchemaAndCompareToJson( - str(json_path).encode(), self._pointer_to_int(c_schema_ptr)) + str(json_path).encode(), self._pointer_to_int(c_schema_ptr) + ) self._check_go_error(go_error) - def import_batch_and_compare_to_json(self, json_path, num_batch, - c_array_ptr): + def import_batch_and_compare_to_json(self, json_path, num_batch, c_array_ptr): go_error = self.dll.ArrowGo_ImportBatchAndCompareToJson( - str(json_path).encode(), num_batch, - self._pointer_to_int(c_array_ptr)) + str(json_path).encode(), num_batch, self._pointer_to_int(c_array_ptr) + ) self._check_go_error(go_error) @property diff --git a/dev/archery/archery/integration/tester_java.py b/dev/archery/archery/integration/tester_java.py index cbc76a1825a0e..fcfd7abcc8b01 100644 --- a/dev/archery/archery/integration/tester_java.py +++ b/dev/archery/archery/integration/tester_java.py @@ -14,30 +14,30 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import contextlib import functools import os -from pathlib import Path import subprocess +from pathlib import Path -from . import cdata -from .tester import Tester, CDataExporter, CDataImporter -from .util import run_cmd, log from ..utils.source import ARROW_ROOT_DEFAULT - +from . 
import cdata +from .tester import CDataExporter, CDataImporter, Tester +from .util import log, run_cmd ARROW_BUILD_ROOT = os.environ.get( - 'ARROW_BUILD_ROOT', - Path(__file__).resolve().parents[4] + "ARROW_BUILD_ROOT", Path(__file__).resolve().parents[4] ) def load_version_from_pom(): import xml.etree.ElementTree as ET - tree = ET.parse(os.path.join(ARROW_ROOT_DEFAULT, 'java', 'pom.xml')) - tag_pattern = '{http://maven.apache.org/POM/4.0.0}version' - version_tag = list(tree.getroot().findall(tag_pattern))[0] + + tree = ET.parse(os.path.join(ARROW_ROOT_DEFAULT, "java", "pom.xml")) + tag_pattern = "{http://maven.apache.org/POM/4.0.0}version" + version_tag = next(iter(tree.getroot().findall(tag_pattern))) return version_tag.text @@ -58,51 +58,48 @@ def load_version_from_pom(): os.path.join( ARROW_BUILD_ROOT, "java/tools/target", - f"arrow-tools-{_arrow_version}-jar-with-dependencies.jar" - ) + f"arrow-tools-{_arrow_version}-jar-with-dependencies.jar", + ), ) _ARROW_C_DATA_JAR = os.environ.get( "ARROW_C_DATA_JAVA_INTEGRATION_JAR", os.path.join( - ARROW_BUILD_ROOT, - "java/c/target", - f"arrow-c-data-{_arrow_version}.jar" - ) + ARROW_BUILD_ROOT, "java/c/target", f"arrow-c-data-{_arrow_version}.jar" + ), ) _ARROW_FLIGHT_JAR = os.environ.get( "ARROW_FLIGHT_JAVA_INTEGRATION_JAR", os.path.join( ARROW_BUILD_ROOT, "java/flight/flight-integration-tests/target", - f"flight-integration-tests-{_arrow_version}-jar-with-dependencies.jar" - ) -) -_ARROW_FLIGHT_SERVER = ( - "org.apache.arrow.flight.integration.tests.IntegrationTestServer" -) -_ARROW_FLIGHT_CLIENT = ( - "org.apache.arrow.flight.integration.tests.IntegrationTestClient" + f"flight-integration-tests-{_arrow_version}-jar-with-dependencies.jar", + ), ) +_ARROW_FLIGHT_SERVER = "org.apache.arrow.flight.integration.tests.IntegrationTestServer" +_ARROW_FLIGHT_CLIENT = "org.apache.arrow.flight.integration.tests.IntegrationTestClient" @functools.lru_cache def setup_jpype(): import jpype + jar_path = f"{_ARROW_TOOLS_JAR}:{_ARROW_C_DATA_JAR}" # XXX Didn't manage to tone down the logging level here (DEBUG -> INFO) - jpype.startJVM(jpype.getDefaultJVMPath(), - "-Djava.class.path=" + jar_path, - # This flag is too heavy for IPC and Flight tests - "-Darrow.memory.debug.allocator=true", - # Reduce internal use of signals by the JVM - "-Xrs", - *_JAVA_OPTS) + jpype.startJVM( + jpype.getDefaultJVMPath(), + "-Djava.class.path=" + jar_path, + # This flag is too heavy for IPC and Flight tests + "-Darrow.memory.debug.allocator=true", + # Reduce internal use of signals by the JVM + "-Xrs", + *_JAVA_OPTS, + ) class _CDataBase: - def __init__(self, debug, args): import jpype + self.debug = debug self.args = args self.ffi = cdata.ffi() @@ -113,15 +110,13 @@ def __init__(self, debug, args): self.java_allocator = self._make_java_allocator() def _pointer_to_int(self, c_ptr): - return int(self.ffi.cast('uintptr_t', c_ptr)) + return int(self.ffi.cast("uintptr_t", c_ptr)) def _wrap_c_schema_ptr(self, c_schema_ptr): - return self.java_arrow.c.ArrowSchema.wrap( - self._pointer_to_int(c_schema_ptr)) + return self.java_arrow.c.ArrowSchema.wrap(self._pointer_to_int(c_schema_ptr)) def _wrap_c_array_ptr(self, c_array_ptr): - return self.java_arrow.c.ArrowArray.wrap( - self._pointer_to_int(c_array_ptr)) + return self.java_arrow.c.ArrowArray.wrap(self._pointer_to_int(c_array_ptr)) def _make_java_allocator(self): # Return a new allocator @@ -130,37 +125,39 @@ def _make_java_allocator(self): def _assert_schemas_equal(self, expected, actual): # XXX This is fragile for dictionaries, as 
Schema.equals compares # dictionary ids. - self.java_arrow.vector.util.Validator.compareSchemas( - expected, actual) + self.java_arrow.vector.util.Validator.compareSchemas(expected, actual) def _assert_batches_equal(self, expected, actual): - self.java_arrow.vector.util.Validator.compareVectorSchemaRoot( - expected, actual) + self.java_arrow.vector.util.Validator.compareVectorSchemaRoot(expected, actual) def _assert_dict_providers_equal(self, expected, actual): self.java_arrow.vector.util.Validator.compareDictionaryProviders( - expected, actual) + expected, actual + ) # Note: no need to call the Java GC anywhere thanks to AutoCloseable class JavaCDataExporter(CDataExporter, _CDataBase): - def export_schema_from_json(self, json_path, c_schema_ptr): json_file = self.java_io.File(json_path) with self.java_arrow.vector.ipc.JsonFileReader( - json_file, self.java_allocator) as json_reader: + json_file, self.java_allocator + ) as json_reader: schema = json_reader.start() dict_provider = json_reader self.java_arrow.c.Data.exportSchema( - self.java_allocator, schema, dict_provider, - self._wrap_c_schema_ptr(c_schema_ptr) + self.java_allocator, + schema, + dict_provider, + self._wrap_c_schema_ptr(c_schema_ptr), ) def export_batch_from_json(self, json_path, num_batch, c_array_ptr): json_file = self.java_io.File(json_path) with self.java_arrow.vector.ipc.JsonFileReader( - json_file, self.java_allocator) as json_reader: + json_file, self.java_allocator + ) as json_reader: json_reader.start() if num_batch > 0: actually_skipped = json_reader.skip(num_batch) @@ -168,8 +165,11 @@ def export_batch_from_json(self, json_path, num_batch, c_array_ptr): with json_reader.read() as batch: dict_provider = json_reader self.java_arrow.c.Data.exportVectorSchemaRoot( - self.java_allocator, batch, dict_provider, - self._wrap_c_array_ptr(c_array_ptr)) + self.java_allocator, + batch, + dict_provider, + self._wrap_c_array_ptr(c_array_ptr), + ) @property def supports_releasing_memory(self): @@ -183,42 +183,45 @@ def close(self): class JavaCDataImporter(CDataImporter, _CDataBase): - def import_schema_and_compare_to_json(self, json_path, c_schema_ptr): json_file = self.java_io.File(json_path) with self.java_arrow.vector.ipc.JsonFileReader( - json_file, self.java_allocator) as json_reader: + json_file, self.java_allocator + ) as json_reader: json_schema = json_reader.start() with self.java_arrow.c.CDataDictionaryProvider() as dict_provider: imported_schema = self.java_arrow.c.Data.importSchema( self.java_allocator, self._wrap_c_schema_ptr(c_schema_ptr), - dict_provider) + dict_provider, + ) self._assert_schemas_equal(json_schema, imported_schema) - def import_batch_and_compare_to_json(self, json_path, num_batch, - c_array_ptr): + def import_batch_and_compare_to_json(self, json_path, num_batch, c_array_ptr): json_file = self.java_io.File(json_path) with self.java_arrow.vector.ipc.JsonFileReader( - json_file, self.java_allocator) as json_reader: + json_file, self.java_allocator + ) as json_reader: schema = json_reader.start() if num_batch > 0: actually_skipped = json_reader.skip(num_batch) assert actually_skipped == num_batch with json_reader.read() as batch: with self.java_arrow.vector.VectorSchemaRoot.create( - schema, self.java_allocator) as imported_batch: + schema, self.java_allocator + ) as imported_batch: # We need to pass a dict provider primed with dictionary ids # matching those in the schema, hence an empty # CDataDictionaryProvider would not work here. 
- dict_provider = (self.java_arrow.vector.dictionary - .DictionaryProvider.MapDictionaryProvider()) + dict_provider = self.java_arrow.vector.dictionary.DictionaryProvider.MapDictionaryProvider() # noqa: E501 dict_provider.copyStructureFrom(json_reader, self.java_allocator) with dict_provider: self.java_arrow.c.Data.importIntoVectorSchemaRoot( self.java_allocator, self._wrap_c_array_ptr(c_array_ptr), - imported_batch, dict_provider) + imported_batch, + dict_provider, + ) self._assert_batches_equal(batch, imported_batch) self._assert_dict_providers_equal(json_reader, dict_provider) @@ -240,111 +243,111 @@ class JavaTester(Tester): C_DATA_ARRAY_EXPORTER = True C_DATA_ARRAY_IMPORTER = True - name = 'Java' + name = "Java" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._java_opts = _JAVA_OPTS[:] - self._java_opts.append( - '--add-reads=org.apache.arrow.flight.core=ALL-UNNAMED') + self._java_opts.append("--add-reads=org.apache.arrow.flight.core=ALL-UNNAMED") - def _run(self, arrow_path=None, json_path=None, command='VALIDATE'): + def _run(self, arrow_path=None, json_path=None, command="VALIDATE"): cmd = ( - ['java'] + - self._java_opts + - ['-cp', _ARROW_TOOLS_JAR, 'org.apache.arrow.tools.Integration'] + ["java"] + + self._java_opts + + ["-cp", _ARROW_TOOLS_JAR, "org.apache.arrow.tools.Integration"] ) if arrow_path is not None: - cmd.extend(['-a', arrow_path]) + cmd.extend(["-a", arrow_path]) if json_path is not None: - cmd.extend(['-j', json_path]) + cmd.extend(["-j", json_path]) - cmd.extend(['-c', command]) + cmd.extend(["-c", command]) if self.debug: - log(' '.join(cmd)) + log(" ".join(cmd)) run_cmd(cmd) def validate(self, json_path, arrow_path, quirks=None): - return self._run(arrow_path, json_path, 'VALIDATE') + return self._run(arrow_path, json_path, "VALIDATE") def json_to_file(self, json_path, arrow_path): - return self._run(arrow_path, json_path, 'JSON_TO_ARROW') + return self._run(arrow_path, json_path, "JSON_TO_ARROW") def stream_to_file(self, stream_path, file_path): cmd = ( - ['java'] + self._java_opts + [ - '-cp', + ["java"] + + self._java_opts + + [ + "-cp", _ARROW_TOOLS_JAR, - 'org.apache.arrow.tools.StreamToFile', + "org.apache.arrow.tools.StreamToFile", stream_path, file_path, ] ) if self.debug: - log(' '.join(cmd)) + log(" ".join(cmd)) run_cmd(cmd) def file_to_stream(self, file_path, stream_path): cmd = ( - ['java'] + self._java_opts + [ - '-cp', + ["java"] + + self._java_opts + + [ + "-cp", _ARROW_TOOLS_JAR, - 'org.apache.arrow.tools.FileToStream', + "org.apache.arrow.tools.FileToStream", file_path, stream_path, ] ) if self.debug: - log(' '.join(cmd)) + log(" ".join(cmd)) run_cmd(cmd) def flight_request(self, port, json_path=None, scenario_name=None): cmd = ( - ['java'] + self._java_opts + [ - '-cp', _ARROW_FLIGHT_JAR, _ARROW_FLIGHT_CLIENT, '-port', str( - port) - ]) + ["java"] + + self._java_opts + + ["-cp", _ARROW_FLIGHT_JAR, _ARROW_FLIGHT_CLIENT, "-port", str(port)] + ) if json_path: - cmd.extend(('-j', json_path)) + cmd.extend(("-j", json_path)) elif scenario_name: - cmd.extend(('-scenario', scenario_name)) + cmd.extend(("-scenario", scenario_name)) else: - raise TypeError('Must provide one of json_path or scenario_name') + raise TypeError("Must provide one of json_path or scenario_name") if self.debug: - log(' '.join(cmd)) + log(" ".join(cmd)) run_cmd(cmd) @contextlib.contextmanager def flight_server(self, scenario_name=None): cmd = ( - ['java'] + - self._java_opts + - ['-cp', _ARROW_FLIGHT_JAR, _ARROW_FLIGHT_SERVER, '-port', '0'] + ["java"] 
+ + self._java_opts + + ["-cp", _ARROW_FLIGHT_JAR, _ARROW_FLIGHT_SERVER, "-port", "0"] ) if scenario_name: - cmd.extend(('-scenario', scenario_name)) + cmd.extend(("-scenario", scenario_name)) if self.debug: - log(' '.join(cmd)) - server = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + log(" ".join(cmd)) + server = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) try: output = server.stdout.readline().decode() - if not output.startswith('Server listening on localhost:'): + if not output.startswith("Server listening on localhost:"): server.kill() out, err = server.communicate() raise RuntimeError( - 'Flight-Java server did not start properly, ' - 'stdout:\n{}\n\nstderr:\n{}\n'.format( - output + out.decode(), err.decode() - ) + "Flight-Java server did not start properly, " + f"stdout:\n{output + out.decode()}\n\nstderr:\n{err.decode()}\n" ) - port = int(output.split(':')[1]) + port = int(output.split(":")[1]) yield port finally: server.kill() diff --git a/dev/archery/archery/integration/tester_js.py b/dev/archery/archery/integration/tester_js.py index dcf56f9a5ab6b..10d44da188af4 100644 --- a/dev/archery/archery/integration/tester_js.py +++ b/dev/archery/archery/integration/tester_js.py @@ -14,66 +14,58 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import os from pathlib import Path from .tester import Tester -from .util import run_cmd, log - +from .util import log, run_cmd ARROW_BUILD_ROOT = os.environ.get( - 'ARROW_BUILD_ROOT', - Path(__file__).resolve().parents[4] + "ARROW_BUILD_ROOT", Path(__file__).resolve().parents[4] ) -ARROW_JS_ROOT = os.path.join(ARROW_BUILD_ROOT, 'js') -_EXE_PATH = os.path.join(ARROW_JS_ROOT, 'bin') -_VALIDATE = os.path.join(_EXE_PATH, 'integration.ts') -_JSON_TO_ARROW = os.path.join(_EXE_PATH, 'json-to-arrow.ts') -_STREAM_TO_FILE = os.path.join(_EXE_PATH, 'stream-to-file.ts') -_FILE_TO_STREAM = os.path.join(_EXE_PATH, 'file-to-stream.ts') +ARROW_JS_ROOT = os.path.join(ARROW_BUILD_ROOT, "js") +_EXE_PATH = os.path.join(ARROW_JS_ROOT, "bin") +_VALIDATE = os.path.join(_EXE_PATH, "integration.ts") +_JSON_TO_ARROW = os.path.join(_EXE_PATH, "json-to-arrow.ts") +_STREAM_TO_FILE = os.path.join(_EXE_PATH, "stream-to-file.ts") +_FILE_TO_STREAM = os.path.join(_EXE_PATH, "file-to-stream.ts") class JSTester(Tester): PRODUCER = True CONSUMER = True - name = 'JS' + name = "JS" - def _run(self, exe_cmd, arrow_path=None, json_path=None, - command='VALIDATE'): + def _run(self, exe_cmd, arrow_path=None, json_path=None, command="VALIDATE"): cmd = [exe_cmd] if arrow_path is not None: - cmd.extend(['-a', arrow_path]) + cmd.extend(["-a", arrow_path]) if json_path is not None: - cmd.extend(['-j', json_path]) + cmd.extend(["-j", json_path]) - cmd.extend(['--mode', command]) + cmd.extend(["--mode", command]) if self.debug: - log(' '.join(cmd)) + log(" ".join(cmd)) run_cmd(cmd, cwd=ARROW_JS_ROOT) def validate(self, json_path, arrow_path, quirks=None): - return self._run(_VALIDATE, arrow_path, json_path, 'VALIDATE') + return self._run(_VALIDATE, arrow_path, json_path, "VALIDATE") def json_to_file(self, json_path, arrow_path): - cmd = [_JSON_TO_ARROW, - '-a', arrow_path, - '-j', json_path] + cmd = [_JSON_TO_ARROW, "-a", arrow_path, "-j", json_path] self.run_shell_command(cmd, cwd=ARROW_JS_ROOT) def stream_to_file(self, stream_path, file_path): - cmd = [_STREAM_TO_FILE, - '<', stream_path, - '>', file_path] + cmd = 
[_STREAM_TO_FILE, "<", stream_path, ">", file_path] self.run_shell_command(cmd, cwd=ARROW_JS_ROOT) def file_to_stream(self, file_path, stream_path): - cmd = [_FILE_TO_STREAM, - '<', file_path, - '>', stream_path] + cmd = [_FILE_TO_STREAM, "<", file_path, ">", stream_path] self.run_shell_command(cmd, cwd=ARROW_JS_ROOT) diff --git a/dev/archery/archery/integration/tester_nanoarrow.py b/dev/archery/archery/integration/tester_nanoarrow.py index 5af469d7a151a..108b02c57961f 100644 --- a/dev/archery/archery/integration/tester_nanoarrow.py +++ b/dev/archery/archery/integration/tester_nanoarrow.py @@ -14,28 +14,25 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import functools import os -from . import cdata -from .tester import Tester, CDataExporter, CDataImporter -from .util import run_cmd, log from ..utils.source import ARROW_ROOT_DEFAULT - +from . import cdata +from .tester import CDataExporter, CDataImporter, Tester +from .util import log, run_cmd _NANOARROW_PATH = os.environ.get( - "ARROW_NANOARROW_PATH", - os.path.join(ARROW_ROOT_DEFAULT, "nanoarrow/cdata"), + "ARROW_NANOARROW_PATH", os.path.join(ARROW_ROOT_DEFAULT, "nanoarrow/cdata") ) _INTEGRATION_DLL = os.path.join( _NANOARROW_PATH, "libnanoarrow_c_data_integration" + cdata.dll_suffix ) -_INTEGRATION_EXE = os.path.join( - _NANOARROW_PATH, "nanoarrow_ipc_integration" -) +_INTEGRATION_EXE = os.path.join(_NANOARROW_PATH, "nanoarrow_ipc_integration") class NanoarrowTester(Tester): @@ -52,37 +49,34 @@ class NanoarrowTester(Tester): def _run(self, arrow_path, json_path, command, quirks): env = { - 'ARROW_PATH': arrow_path, - 'JSON_PATH': json_path, - 'COMMAND': command, - **{ - f'QUIRK_{q}': "1" - for q in quirks or () - }, + "ARROW_PATH": arrow_path, + "JSON_PATH": json_path, + "COMMAND": command, + **{f"QUIRK_{q}": "1" for q in quirks or ()}, } if self.debug: - log(f'{_INTEGRATION_EXE} {env}') + log(f"{_INTEGRATION_EXE} {env}") run_cmd([_INTEGRATION_EXE], env=env) def validate(self, json_path, arrow_path, quirks=None): - return self._run(arrow_path, json_path, 'VALIDATE', quirks) + return self._run(arrow_path, json_path, "VALIDATE", quirks) def json_to_file(self, json_path, arrow_path): - return self._run(arrow_path, json_path, 'JSON_TO_ARROW', quirks=None) + return self._run(arrow_path, json_path, "JSON_TO_ARROW", quirks=None) def stream_to_file(self, stream_path, file_path): - self.run_shell_command([_INTEGRATION_EXE, '<', stream_path], env={ - 'COMMAND': 'STREAM_TO_FILE', - 'ARROW_PATH': file_path, - }) + self.run_shell_command( + [_INTEGRATION_EXE, "<", stream_path], + env={"COMMAND": "STREAM_TO_FILE", "ARROW_PATH": file_path}, + ) def file_to_stream(self, file_path, stream_path): - self.run_shell_command([_INTEGRATION_EXE, '>', stream_path], env={ - 'COMMAND': 'FILE_TO_STREAM', - 'ARROW_PATH': file_path, - }) + self.run_shell_command( + [_INTEGRATION_EXE, ">", stream_path], + env={"COMMAND": "FILE_TO_STREAM", "ARROW_PATH": file_path}, + ) def make_c_data_exporter(self): return NanoarrowCDataExporter(self.debug, self.args) @@ -123,8 +117,7 @@ def __init__(self, debug, args): self.dll = _load_ffi(self.ffi) def _check_nanoarrow_error(self, na_error): - """ - Check a `const char*` error return from an integration entrypoint. + """Check a `const char*` error return from an integration entrypoint. A null means success, a non-empty string is an error message. 
The string is statically allocated on the nanoarrow side and does not diff --git a/dev/archery/archery/integration/tester_rust.py b/dev/archery/archery/integration/tester_rust.py index 56b07859dc82a..ed575409e4264 100644 --- a/dev/archery/archery/integration/tester_rust.py +++ b/dev/archery/archery/integration/tester_rust.py @@ -14,17 +14,17 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import contextlib import functools import os import subprocess -from . import cdata -from .tester import Tester, CDataExporter, CDataImporter -from .util import run_cmd, log from ..utils.source import ARROW_ROOT_DEFAULT - +from . import cdata +from .tester import CDataExporter, CDataImporter, Tester +from .util import log, run_cmd _EXE_PATH = os.environ.get( "ARROW_RUST_EXE_PATH", os.path.join(ARROW_ROOT_DEFAULT, "rust/target/debug") @@ -33,16 +33,16 @@ _STREAM_TO_FILE = os.path.join(_EXE_PATH, "arrow-stream-to-file") _FILE_TO_STREAM = os.path.join(_EXE_PATH, "arrow-file-to-stream") -_FLIGHT_SERVER_CMD = [os.path.join( - _EXE_PATH, "flight-test-integration-server")] +_FLIGHT_SERVER_CMD = [os.path.join(_EXE_PATH, "flight-test-integration-server")] _FLIGHT_CLIENT_CMD = [ os.path.join(_EXE_PATH, "flight-test-integration-client"), "--host", "localhost", ] -_INTEGRATION_DLL = os.path.join(_EXE_PATH, - "libarrow_integration_testing" + cdata.dll_suffix) +_INTEGRATION_DLL = os.path.join( + _EXE_PATH, "libarrow_integration_testing" + cdata.dll_suffix +) class RustTester(Tester): @@ -55,78 +55,72 @@ class RustTester(Tester): C_DATA_SCHEMA_IMPORTER = True C_DATA_ARRAY_IMPORTER = True - name = 'Rust' + name = "Rust" - def _run(self, arrow_path=None, json_path=None, command='VALIDATE'): - cmd = [_INTEGRATION_EXE, '--integration'] + def _run(self, arrow_path=None, json_path=None, command="VALIDATE"): + cmd = [_INTEGRATION_EXE, "--integration"] if arrow_path is not None: - cmd.append('--arrow=' + arrow_path) + cmd.append("--arrow=" + arrow_path) if json_path is not None: - cmd.append('--json=' + json_path) + cmd.append("--json=" + json_path) - cmd.append('--mode=' + command) + cmd.append("--mode=" + command) if self.debug: - log(' '.join(cmd)) + log(" ".join(cmd)) run_cmd(cmd) def validate(self, json_path, arrow_path, quirks=None): - return self._run(arrow_path, json_path, 'VALIDATE') + return self._run(arrow_path, json_path, "VALIDATE") def json_to_file(self, json_path, arrow_path): - return self._run(arrow_path, json_path, 'JSON_TO_ARROW') + return self._run(arrow_path, json_path, "JSON_TO_ARROW") def stream_to_file(self, stream_path, file_path): - cmd = [_STREAM_TO_FILE, '<', stream_path, '>', file_path] + cmd = [_STREAM_TO_FILE, "<", stream_path, ">", file_path] self.run_shell_command(cmd) def file_to_stream(self, file_path, stream_path): - cmd = [_FILE_TO_STREAM, file_path, '>', stream_path] + cmd = [_FILE_TO_STREAM, file_path, ">", stream_path] self.run_shell_command(cmd) @contextlib.contextmanager def flight_server(self, scenario_name=None): - cmd = _FLIGHT_SERVER_CMD + ['--port=0'] + cmd = _FLIGHT_SERVER_CMD + ["--port=0"] if scenario_name: - cmd = cmd + ['--scenario', scenario_name] + cmd = cmd + ["--scenario", scenario_name] if self.debug: - log(' '.join(cmd)) - server = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) + log(" ".join(cmd)) + server = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) try: output = 
server.stdout.readline().decode() - if not output.startswith('Server listening on localhost:'): + if not output.startswith("Server listening on localhost:"): server.kill() out, err = server.communicate() raise RuntimeError( - 'Flight-Rust server did not start properly, ' - 'stdout:\n{}\n\nstderr:\n{}\n'.format( - output + out.decode(), err.decode() - ) + "Flight-Rust server did not start properly, " + f"stdout:\n{output + out.decode()}\n\nstderr:\n{err.decode()}\n" ) - port = int(output.split(':')[1]) + port = int(output.split(":")[1]) yield port finally: server.kill() server.wait(5) def flight_request(self, port, json_path=None, scenario_name=None): - cmd = _FLIGHT_CLIENT_CMD + [f'--port={port}'] + cmd = _FLIGHT_CLIENT_CMD + [f"--port={port}"] if json_path: - cmd.extend(('--path', json_path)) + cmd.extend(("--path", json_path)) elif scenario_name: - cmd.extend(('--scenario', scenario_name)) + cmd.extend(("--scenario", scenario_name)) else: - raise TypeError('Must provide one of json_path or scenario_name') + raise TypeError("Must provide one of json_path or scenario_name") if self.debug: - log(' '.join(cmd)) + log(" ".join(cmd)) run_cmd(cmd) def make_c_data_exporter(self): @@ -159,7 +153,6 @@ def _load_ffi(ffi, lib_path=_INTEGRATION_DLL): class _CDataBase: - def __init__(self, debug, args): self.debug = debug self.args = args @@ -167,11 +160,10 @@ def __init__(self, debug, args): self.dll = _load_ffi(self.ffi) def _pointer_to_int(self, c_ptr): - return self.ffi.cast('uintptr_t', c_ptr) + return self.ffi.cast("uintptr_t", c_ptr) def _check_rust_error(self, rs_error): - """ - Check a `const char*` error return from an integration entrypoint. + """Check a `const char*` error return from an integration entrypoint. A null means success, a non-empty string is an error message. The string is dynamically allocated on the Rust side. 
@@ -179,25 +171,23 @@ def _check_rust_error(self, rs_error): assert self.ffi.typeof(rs_error) is self.ffi.typeof("const char*") if rs_error != self.ffi.NULL: try: - error = self.ffi.string(rs_error).decode( - 'utf8', errors='replace') - raise RuntimeError( - f"Rust C Data Integration call failed: {error}") + error = self.ffi.string(rs_error).decode("utf8", errors="replace") + raise RuntimeError(f"Rust C Data Integration call failed: {error}") finally: self.dll.arrow_rs_free_error(rs_error) class RustCDataExporter(CDataExporter, _CDataBase): - def export_schema_from_json(self, json_path, c_schema_ptr): rs_error = self.dll.arrow_rs_cdata_integration_export_schema_from_json( - str(json_path).encode(), self._pointer_to_int(c_schema_ptr)) + str(json_path).encode(), self._pointer_to_int(c_schema_ptr) + ) self._check_rust_error(rs_error) def export_batch_from_json(self, json_path, num_batch, c_array_ptr): rs_error = self.dll.arrow_rs_cdata_integration_export_batch_from_json( - str(json_path).encode(), num_batch, - self._pointer_to_int(c_array_ptr)) + str(json_path).encode(), num_batch, self._pointer_to_int(c_array_ptr) + ) self._check_rust_error(rs_error) @property @@ -210,18 +200,18 @@ def record_allocation_state(self): class RustCDataImporter(CDataImporter, _CDataBase): - def import_schema_and_compare_to_json(self, json_path, c_schema_ptr): - rs_error = \ + rs_error = ( self.dll.arrow_rs_cdata_integration_import_schema_and_compare_to_json( - str(json_path).encode(), self._pointer_to_int(c_schema_ptr)) + str(json_path).encode(), self._pointer_to_int(c_schema_ptr) + ) + ) self._check_rust_error(rs_error) - def import_batch_and_compare_to_json(self, json_path, num_batch, - c_array_ptr): - rs_error = \ - self.dll.arrow_rs_cdata_integration_import_batch_and_compare_to_json( - str(json_path).encode(), num_batch, self._pointer_to_int(c_array_ptr)) + def import_batch_and_compare_to_json(self, json_path, num_batch, c_array_ptr): + rs_error = self.dll.arrow_rs_cdata_integration_import_batch_and_compare_to_json( + str(json_path).encode(), num_batch, self._pointer_to_int(c_array_ptr) + ) self._check_rust_error(rs_error) @property diff --git a/dev/archery/archery/integration/util.py b/dev/archery/archery/integration/util.py index 1b1eb95a1d296..e96ab9a7d83f6 100644 --- a/dev/archery/archery/integration/util.py +++ b/dev/archery/archery/integration/util.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import contextlib import io @@ -32,15 +33,14 @@ def guid(): # SKIP categories -SKIP_C_ARRAY = 'c_array' -SKIP_C_SCHEMA = 'c_schema' -SKIP_FLIGHT = 'flight' -SKIP_IPC = 'ipc' +SKIP_C_ARRAY = "c_array" +SKIP_C_SCHEMA = "c_schema" +SKIP_FLIGHT = "flight" +SKIP_IPC = "ipc" class _Printer: - """ - A print()-providing object that can override the stream output on + """A print()-providing object that can override the stream output on a per-thread basis. """ @@ -56,23 +56,19 @@ def _get_stdout(self): return self._tls.stdout def print(self, *args, **kwargs): - """ - A variant of print() that writes to a thread-local stream. - """ + """A variant of print() that writes to a thread-local stream.""" print(*args, file=self._get_stdout(), **kwargs) @property def stdout(self): - """ - A thread-local stdout wrapper that may be temporarily buffered + """A thread-local stdout wrapper that may be temporarily buffered using `cork()`. 
""" return self._get_stdout() @contextlib.contextmanager def cork(self): - """ - Temporarily buffer this thread's stream and write out its contents + """Temporarily buffer this thread's stream and write out its contents at the end of the context manager. Useful to avoid interleaved output when multiple threads output progress information. """ @@ -97,39 +93,34 @@ def cork(self): def random_utf8(nchars): - """ - Generate one random UTF8 string. - """ - return ''.join(np.random.choice(_RAND_CHARS, nchars)) + """Generate one random UTF8 string.""" + return "".join(np.random.choice(_RAND_CHARS, nchars)) def random_bytes(nbytes): - """ - Generate one random binary string. - """ + """Generate one random binary string.""" # NOTE getrandbits(0) fails if nbytes > 0: - return random.getrandbits(nbytes * 8).to_bytes(nbytes, - byteorder='little') + return random.getrandbits(nbytes * 8).to_bytes(nbytes, byteorder="little") else: return b"" def tobytes(o): if isinstance(o, str): - return o.encode('utf8') + return o.encode("utf8") return o def frombytes(o): if isinstance(o, bytes): - return o.decode('utf8') + return o.decode("utf8") return o def run_cmd(cmd, **kwargs): if isinstance(cmd, str): - cmd = cmd.split(' ') + cmd = cmd.split(" ") try: kwargs.update(stderr=subprocess.STDOUT) @@ -137,11 +128,11 @@ def run_cmd(cmd, **kwargs): except subprocess.CalledProcessError as e: # this avoids hiding the stdout / stderr of failed processes sio = io.StringIO() - print('Command failed:', " ".join(cmd), file=sio) - print('With output:', file=sio) - print('--------------', file=sio) + print("Command failed:", " ".join(cmd), file=sio) + print("With output:", file=sio) + print("--------------", file=sio) print(frombytes(e.output), file=sio) - print('--------------', file=sio) + print("--------------", file=sio) raise RuntimeError(sio.getvalue()) return frombytes(output) @@ -157,7 +148,7 @@ def find_unused_port(family=socket.AF_INET, socktype=socket.SOCK_STREAM): then closed and deleted, and the ephemeral port is returned. """ with socket.socket(family, socktype) as tempsock: - tempsock.bind(('', 0)) + tempsock.bind(("", 0)) port = tempsock.getsockname()[1] del tempsock return port diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py index 0b48ca2f97766..af1d10d503485 100644 --- a/dev/archery/archery/lang/cpp.py +++ b/dev/archery/archery/lang/cpp.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+from __future__ import annotations import os @@ -36,36 +37,62 @@ def coalesce(value, fallback): class CppConfiguration: - def __init__(self, - - # toolchain - cc=None, cxx=None, cxx_flags=None, - build_type=None, warn_level=None, - cpp_package_prefix=None, install_prefix=None, use_conda=None, - build_static=True, build_shared=True, build_unity=True, - # tests & examples - with_tests=None, with_benchmarks=None, with_examples=None, - with_integration=None, - # static checks - use_asan=None, use_tsan=None, use_ubsan=None, - with_fuzzing=None, - # Components - with_compute=None, with_csv=None, with_cuda=None, - with_dataset=None, with_filesystem=None, with_flight=None, - with_gandiva=None, with_gcs=None, with_hdfs=None, - with_hiveserver2=None, - with_ipc=True, with_json=None, - with_mimalloc=None, with_jemalloc=None, - with_parquet=None, with_python=True, - with_r=None, with_s3=None, - # Compressions - with_brotli=None, with_bz2=None, with_lz4=None, - with_snappy=None, with_zlib=None, with_zstd=None, - # extras - with_lint_only=False, - use_gold_linker=True, - simd_level="DEFAULT", - cmake_extras=None): + def __init__( + self, + # toolchain + cc=None, + cxx=None, + cxx_flags=None, + build_type=None, + warn_level=None, + cpp_package_prefix=None, + install_prefix=None, + use_conda=None, + build_static=True, + build_shared=True, + build_unity=True, + # tests & examples + with_tests=None, + with_benchmarks=None, + with_examples=None, + with_integration=None, + # static checks + use_asan=None, + use_tsan=None, + use_ubsan=None, + with_fuzzing=None, + # Components + with_compute=None, + with_csv=None, + with_cuda=None, + with_dataset=None, + with_filesystem=None, + with_flight=None, + with_gandiva=None, + with_gcs=None, + with_hdfs=None, + with_hiveserver2=None, + with_ipc=True, + with_json=None, + with_mimalloc=None, + with_jemalloc=None, + with_parquet=None, + with_python=True, + with_r=None, + with_s3=None, + # Compressions + with_brotli=None, + with_bz2=None, + with_lz4=None, + with_snappy=None, + with_zlib=None, + with_zstd=None, + # extras + with_lint_only=False, + use_gold_linker=True, + simd_level="DEFAULT", + cmake_extras=None, + ): self._cc = cc self._cxx = cxx self.cxx_flags = cxx_flags @@ -164,7 +191,7 @@ def cc(self): return self._cc if self.with_fuzzing: - return "clang-{}".format(LLVM_VERSION) + return f"clang-{LLVM_VERSION}" return None @@ -174,7 +201,7 @@ def cxx(self): return self._cxx if self.with_fuzzing: - return "clang++-{}".format(LLVM_VERSION) + return f"clang++-{LLVM_VERSION}" return None @@ -186,8 +213,7 @@ def _gen_defs(self): yield ("CMAKE_BUILD_TYPE", self.build_type) if not self.with_lint_only: - yield ("BUILD_WARNING_LEVEL", - or_else(self.warn_level, "production")) + yield ("BUILD_WARNING_LEVEL", or_else(self.warn_level, "production")) # if not ctx.quiet: # yield ("ARROW_VERBOSE_THIRDPARTY_BUILD", "ON") @@ -251,7 +277,7 @@ def _gen_defs(self): # Detect custom conda toolchain if self.use_conda: - for d, v in [('CMAKE_AR', 'AR'), ('CMAKE_RANLIB', 'RANLIB')]: + for d, v in [("CMAKE_AR", "AR"), ("CMAKE_RANLIB", "RANLIB")]: v = os.environ.get(v) if v: yield (d, v) @@ -277,7 +303,7 @@ def use_conda(self): @property def definitions(self): extras = list(self.cmake_extras) if self.cmake_extras else [] - definitions = ["-D{}={}".format(d[0], d[1]) for d in self._gen_defs()] + definitions = [f"-D{d[0]}={d[1]}" for d in self._gen_defs()] return definitions + extras @property @@ -296,6 +322,10 @@ def environment(self): class CppCMakeDefinition(CMakeDefinition): def 
__init__(self, source, conf, **kwargs): self.configuration = conf - super().__init__(source, **kwargs, - definitions=conf.definitions, env=conf.environment, - build_type=conf.build_type) + super().__init__( + source, + **kwargs, + definitions=conf.definitions, + env=conf.environment, + build_type=conf.build_type, + ) diff --git a/dev/archery/archery/lang/java.py b/dev/archery/archery/lang/java.py index f447b352e6a6c..8681dfd153441 100644 --- a/dev/archery/archery/lang/java.py +++ b/dev/archery/archery/lang/java.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import os @@ -35,14 +36,18 @@ def __init__(self, jar, *args, **kwargs): class JavaConfiguration: REQUIRED_JAVA_OPTIONS = [ - "--add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED", + "--add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED" ] - def __init__(self, - # toolchain - java_home=None, java_options=None, - # build & benchmark - build_extras=None, benchmark_extras=None): + def __init__( + self, + # toolchain + java_home=None, + java_options=None, + # build & benchmark + build_extras=None, + benchmark_extras=None, + ): self.java_home = java_home self.java_options = java_options @@ -54,8 +59,7 @@ def __init__(self, self.java_options += " " + option self.build_extras = list(build_extras) if build_extras else [] - self.benchmark_extras = list( - benchmark_extras) if benchmark_extras else [] + self.benchmark_extras = list(benchmark_extras) if benchmark_extras else [] @property def build_definitions(self): @@ -81,7 +85,10 @@ def environment(self): class JavaMavenDefinition(MavenDefinition): def __init__(self, source, conf, **kwargs): self.configuration = conf - super().__init__(source, **kwargs, - build_definitions=conf.build_definitions, - benchmark_definitions=conf.benchmark_definitions, - env=conf.environment) + super().__init__( + source, + **kwargs, + build_definitions=conf.build_definitions, + benchmark_definitions=conf.benchmark_definitions, + env=conf.environment, + ) diff --git a/dev/archery/archery/lang/python.py b/dev/archery/archery/lang/python.py index d4c1853d097b2..a0cb6b49749a1 100644 --- a/dev/archery/archery/lang/python.py +++ b/dev/archery/archery/lang/python.py @@ -14,11 +14,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+from __future__ import annotations -from contextlib import contextmanager -from enum import EnumMeta import inspect import tokenize +from contextlib import contextmanager +from enum import EnumMeta try: from numpydoc.validate import Docstring, validate @@ -28,8 +29,8 @@ have_numpydoc = True from ..compat import _get_module -from ..utils.logger import logger from ..utils.command import Command, capture_stdout, default_bin +from ..utils.logger import logger class PythonCommand(Command): @@ -57,7 +58,7 @@ def run_captured(self, *args, **kwargs): def _tokenize_signature(s): - lines = s.encode('ascii').splitlines() + lines = s.encode("ascii").splitlines() generator = iter(lines).__next__ return tokenize.tokenize(generator) @@ -68,7 +69,7 @@ def _convert_typehint(tokens): for token in tokens: # omit the tokens before the opening bracket if not opening_bracket_reached: - if token.string == '(': + if token.string == "(": opening_bracket_reached = True else: continue @@ -85,14 +86,13 @@ def _convert_typehint(tokens): # are not supported by _signature_fromstr yield (names[1].type, names[1].string) elif len(names) > 2: - raise ValueError('More than two NAME tokens follow each other') + raise ValueError("More than two NAME tokens follow each other") names = [] yield (token.type, token.string) def inspect_signature(obj): - """ - Custom signature inspection primarily for cython generated callables. + """Custom signature inspection primarily for cython generated callables. Cython puts the signatures to the first line of the docstrings, which we can reuse to parse the python signature from, but some gymnastics are @@ -121,10 +121,10 @@ class NumpyDoc: def __init__(self, symbols=None): if not have_numpydoc: raise RuntimeError( - 'Numpydoc is not available, install with command: ' - 'pip install numpydoc==1.1.0' + "Numpydoc is not available, install with command: " + "pip install numpydoc==1.1.0" ) - self.symbols = set(symbols or {'pyarrow'}) + self.symbols = set(symbols or {"pyarrow"}) def traverse(self, fn, obj, from_package): """Apply a function on publicly exposed API components. @@ -140,6 +140,7 @@ def traverse(self, fn, obj, from_package): The object to start from. from_package : string Predicate to only consider objects from this package. + """ todo = [obj] seen = set() @@ -154,7 +155,7 @@ def traverse(self, fn, obj, from_package): fn(obj) for name in dir(obj): - if name.startswith('_'): + if name.startswith("_"): continue member = getattr(obj, name) @@ -166,19 +167,17 @@ def traverse(self, fn, obj, from_package): # and no user-defined docstring following it. # The generated docstring would lack description of method # parameters and therefore fail Numpydoc validation. - if hasattr(member, '__objclass__'): - doc = getattr(member, '__doc__', None) + if hasattr(member, "__objclass__"): + doc = getattr(member, "__doc__", None) # The Cython-generated docstring would be a one-liner, # such as "ReadOptions.equals(self, ReadOptions other)". - if (doc and '\n' not in doc and f'.{name}(' in doc): + if doc and "\n" not in doc and f".{name}(" in doc: continue todo.append(member) @contextmanager def _apply_patches(self): - """ - Patch Docstring class to bypass loading already loaded python objects. 
- """ + """Patch Docstring class to bypass loading already loaded python objects.""" orig_load_obj = Docstring._load_obj orig_signature = inspect.signature @@ -216,8 +215,13 @@ def signature(obj): Docstring._load_obj = orig_load_obj inspect.signature = orig_signature - def validate(self, from_package='', allow_rules=None, - disallow_rules=None): + def _should_ignore_error(self, obj, errcode): + for obj_type, errcode_list in self.IGNORE_VALIDATION_ERRORS_FOR_TYPE.items(): + if isinstance(obj, obj_type) and errcode in errcode_list: + return True + return False + + def validate(self, from_package="", allow_rules=None, disallow_rules=None): results = [] def callback(obj): @@ -229,19 +233,17 @@ def callback(obj): return errors = [] - for errcode, errmsg in result.get('errors', []): + for errcode, errmsg in result.get("errors", []): if allow_rules and errcode not in allow_rules: continue if disallow_rules and errcode in disallow_rules: continue - if any(isinstance(obj, obj_type) and errcode in errcode_list - for obj_type, errcode_list - in NumpyDoc.IGNORE_VALIDATION_ERRORS_FOR_TYPE.items()): + if self._should_ignore_error(obj, errcode): continue errors.append((errcode, errmsg)) if len(errors): - result['errors'] = errors + result["errors"] = errors results.append((obj, result)) with self._apply_patches(): @@ -249,7 +251,7 @@ def callback(obj): try: obj = Docstring._load_obj(symbol) except (ImportError, AttributeError): - print('{} is not available for import'.format(symbol)) + print(f"{symbol} is not available for import") else: self.traverse(callback, obj, from_package=from_package) diff --git a/dev/archery/archery/linking.py b/dev/archery/archery/linking.py index c2e6f1772fad6..718c8645c66be 100644 --- a/dev/archery/archery/linking.py +++ b/dev/archery/archery/linking.py @@ -14,13 +14,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import platform import subprocess from .utils.command import Command - _ldd = Command("ldd") _otool = Command("otool") @@ -30,14 +30,11 @@ class DependencyError(Exception): class DynamicLibrary: - def __init__(self, path): self.path = path def list_dependencies(self): - """ - List the full name of the library dependencies. - """ + """List the full name of the library dependencies.""" system = platform.system() if system == "Linux": result = _ldd.run(self.path, stdout=subprocess.PIPE) @@ -51,9 +48,7 @@ def list_dependencies(self): raise ValueError(f"{platform} is not supported") def list_dependency_names(self): - """ - List the truncated names of the dynamic library dependencies. - """ + """List the truncated names of the dynamic library dependencies.""" names = [] for dependency in self.list_dependencies(): *_, library = dependency.rsplit("/", 1) diff --git a/dev/archery/archery/release/__init__.py b/dev/archery/archery/release/__init__.py index a902c99088efe..a384a7a3924db 100644 --- a/dev/archery/archery/release/__init__.py +++ b/dev/archery/archery/release/__init__.py @@ -14,5 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+from __future__ import annotations -from .core import Release, MajorRelease, MinorRelease, PatchRelease # noqa +from .core import MajorRelease, MinorRelease, PatchRelease, Release + +__all__ = ["MajorRelease", "MinorRelease", "PatchRelease", "Release"] diff --git a/dev/archery/archery/release/cli.py b/dev/archery/archery/release/cli.py index 92fdbb801f357..6919439cfeb67 100644 --- a/dev/archery/archery/release/cli.py +++ b/dev/archery/archery/release/cli.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import pathlib @@ -23,120 +24,127 @@ from .core import IssueTracker, Release -@click.group('release') -@click.option("--src", metavar="<arrow_src>", default=None, - callback=validate_arrow_sources, - help="Specify Arrow source directory.") -@click.option('--github-token', '-t', default=None, - envvar="CROSSBOW_GITHUB_TOKEN", - help='OAuth token for GitHub authentication') +@click.group("release") +@click.option( + "--src", + metavar="<arrow_src>", + default=None, + callback=validate_arrow_sources, + help="Specify Arrow source directory.", +) +@click.option( + "--github-token", + "-t", + default=None, + envvar="CROSSBOW_GITHUB_TOKEN", + help="OAuth token for GitHub authentication", +) @click.pass_obj def release(obj, src, github_token): """Release related commands.""" + obj["issue_tracker"] = IssueTracker(github_token=github_token) + obj["repo"] = src.path - obj['issue_tracker'] = IssueTracker(github_token=github_token) - obj['repo'] = src.path - -@release.command('curate', help="Lists release related issues.") -@click.argument('version') -@click.option('--minimal/--full', '-m/-f', - help="Only show actionable issues.", default=False) +@release.command("curate", help="Lists release related issues.") +@click.argument("version") +@click.option( + "--minimal/--full", "-m/-f", help="Only show actionable issues.", default=False +) @click.pass_obj def release_curate(obj, version, minimal): """Release curation.""" - release = Release(version, repo=obj['repo'], - issue_tracker=obj['issue_tracker']) + release = Release(version, repo=obj["repo"], issue_tracker=obj["issue_tracker"]) curation = release.curate(minimal) - click.echo(curation.render('console')) + click.echo(curation.render("console")) -@release.group('changelog') +@release.group("changelog") def release_changelog(): """Release changelog.""" - pass -@release_changelog.command('add') -@click.argument('version') +@release_changelog.command("add") +@click.argument("version") @click.pass_obj def release_changelog_add(obj, version): """Prepend the changelog with the current release""" - repo, issue_tracker = obj['repo'], obj['issue_tracker'] + repo, issue_tracker = obj["repo"], obj["issue_tracker"] # just handle the current version release = Release(version, repo=repo, issue_tracker=issue_tracker) if release.is_released: - raise ValueError('This version has been already released!') + raise ValueError("This version has been already released!") changelog = release.changelog() - changelog_path = pathlib.Path(repo) / 'CHANGELOG.md' + changelog_path = pathlib.Path(repo) / "CHANGELOG.md" current_content = changelog_path.read_text() - new_content = changelog.render('markdown') + current_content + new_content = changelog.render("markdown") + current_content changelog_path.write_text(new_content) click.echo("CHANGELOG.md is updated!") -@release_changelog.command('generate') -@click.argument('version') 
-@click.argument('output', type=click.File('w', encoding='utf8'), default='-') +@release_changelog.command("generate") +@click.argument("version") +@click.argument("output", type=click.File("w", encoding="utf8"), default="-") @click.pass_obj def release_changelog_generate(obj, version, output): """Generate the changelog of a specific release.""" - repo, issue_tracker = obj['repo'], obj['issue_tracker'] + repo, issue_tracker = obj["repo"], obj["issue_tracker"] # just handle the current version release = Release(version, repo=repo, issue_tracker=issue_tracker) changelog = release.changelog() - output.write(changelog.render('markdown')) + output.write(changelog.render("markdown")) -@release_changelog.command('regenerate') +@release_changelog.command("regenerate") @click.pass_obj def release_changelog_regenerate(obj): """Regenerate the whole CHANGELOG.md file""" - issue_tracker, repo = obj['issue_tracker'], obj['repo'] + issue_tracker, repo = obj["issue_tracker"], obj["repo"] changelogs = [] issue_tracker = IssueTracker(issue_tracker=issue_tracker) for version in issue_tracker.project_versions(): if not version.released: continue - release = Release(version, repo=repo, - issue_tracker=issue_tracker) - click.echo('Querying changelog for version: {}'.format(version)) + release = Release(version, repo=repo, issue_tracker=issue_tracker) + click.echo(f"Querying changelog for version: {version}") changelogs.append(release.changelog()) - click.echo('Rendering new CHANGELOG.md file...') - changelog_path = pathlib.Path(repo) / 'CHANGELOG.md' - with changelog_path.open('w') as fp: + click.echo("Rendering new CHANGELOG.md file...") + changelog_path = pathlib.Path(repo) / "CHANGELOG.md" + with changelog_path.open("w") as fp: for cl in changelogs: - fp.write(cl.render('markdown')) - - -@release.command('cherry-pick') -@click.argument('version') -@click.option('--dry-run/--execute', default=True, - help="Display the git commands instead of executing them.") -@click.option('--recreate/--continue', default=True, - help="Recreate the maintenance branch or only apply unapplied " - "patches.") + fp.write(cl.render("markdown")) + + +@release.command("cherry-pick") +@click.argument("version") +@click.option( + "--dry-run/--execute", + default=True, + help="Display the git commands instead of executing them.", +) +@click.option( + "--recreate/--continue", + default=True, + help="Recreate the maintenance branch or only apply unapplied patches.", +) @click.pass_obj def release_cherry_pick(obj, version, dry_run, recreate): - """ - Cherry pick commits. - """ - issue_tracker = obj['issue_tracker'] - release = Release(version, - repo=obj['repo'], issue_tracker=issue_tracker) + """Cherry pick commits.""" + issue_tracker = obj["issue_tracker"] + release = Release(version, repo=obj["repo"], issue_tracker=issue_tracker) if not dry_run: release.cherry_pick_commits(recreate_branch=recreate) else: - click.echo(f'git checkout -b {release.branch} {release.base_branch}') + click.echo(f"git checkout -b {release.branch} {release.base_branch}") for commit in release.commits_to_pick(): - click.echo('git cherry-pick {}'.format(commit.hexsha)) + click.echo(f"git cherry-pick {commit.hexsha}") diff --git a/dev/archery/archery/release/core.py b/dev/archery/archery/release/core.py index bbaba2f648f29..122bebf40b835 100644 --- a/dev/archery/archery/release/core.py +++ b/dev/archery/archery/release/core.py @@ -14,22 +14,23 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations -from abc import abstractmethod -from collections import defaultdict import functools import os import pathlib import re import warnings +from abc import abstractmethod +from collections import defaultdict from git import Repo from github import Github from semver import VersionInfo as SemVer -from ..utils.source import ArrowSources from ..utils.logger import logger -from .reports import ReleaseCuration, ReleaseChangelog +from ..utils.source import ArrowSources +from .reports import ReleaseChangelog, ReleaseCuration def cached_property(fn): @@ -37,8 +38,7 @@ def cached_property(fn): class Version(SemVer): - - __slots__ = ('released', 'release_date') + __slots__ = ("release_date", "released") def __init__(self, released=False, release_date=None, **kwargs): super().__init__(**kwargs) @@ -54,12 +54,11 @@ def from_milestone(cls, milestone): return cls.parse( milestone.title, released=milestone.state == "closed", - release_date=milestone.due_on + release_date=milestone.due_on, ) class Issue: - def __init__(self, key, type, summary, github_issue=None): self.key = key self.type = type @@ -74,24 +73,27 @@ def from_github(cls, github_issue): type=next( iter( [ - label.name for label in github_issue.labels + label.name + for label in github_issue.labels if label.name.startswith("Type:") ] - ), None), + ), + None, + ), summary=github_issue.title, - github_issue=github_issue + github_issue=github_issue, ) @property def project(self): if isinstance(self.key, int): - return 'GH' - return self.key.split('-')[0] + return "GH" + return self.key.split("-")[0] @property def number(self): if isinstance(self.key, str): - return int(self.key.split('-')[1]) + return int(self.key.split("-")[1]) else: return self.key @@ -101,10 +103,9 @@ def is_pr(self): class IssueTracker: - def __init__(self, github_token=None): github = Github(github_token) - self.github_repo = github.get_repo('apache/arrow') + self.github_repo = github.get_repo("apache/arrow") def project_version(self, version_string): for milestone in self.project_versions(): @@ -134,8 +135,8 @@ def _milestone_from_semver(self, semver): def project_issues(self, version): issues = self.github_repo.get_issues( - milestone=self._milestone_from_semver(version), - state="all") + milestone=self._milestone_from_semver(version), state="all" + ) return list(map(Issue.from_github, issues)) def issue(self, key): @@ -151,9 +152,15 @@ def issue(self, key): class CommitTitle: - - def __init__(self, summary, project=None, issue=None, minor=None, - components=None, issue_id=None): + def __init__( + self, + summary, + project=None, + issue=None, + minor=None, + components=None, + issue_id=None, + ): self.project = project self.issue = issue self.issue_id = issue_id @@ -166,54 +173,49 @@ def __str__(self): def __eq__(self, other): return ( - self.summary == other.summary and - self.project == other.project and - self.issue == other.issue and - self.minor == other.minor and - self.components == other.components + self.summary == other.summary + and self.project == other.project + and self.issue == other.issue + and self.minor == other.minor + and self.components == other.components ) def __hash__(self): - return hash( - (self.summary, self.project, self.issue, tuple(self.components)) - ) + return hash((self.summary, self.project, self.issue, tuple(self.components))) @classmethod def parse(cls, headline): matches = 
_TITLE_REGEX.match(headline) if matches is None: - warnings.warn( - "Unable to parse commit message `{}`".format(headline) - ) + warnings.warn(f"Unable to parse commit message `{headline}`", stacklevel=2) return CommitTitle(headline) values = matches.groupdict() - components = values.get('components') or '' + components = values.get("components") or "" components = _COMPONENT_REGEX.findall(components) return CommitTitle( - values['summary'], - project=values.get('project'), - issue=values.get('issue'), - issue_id=values.get('issue_id'), - minor=values.get('minor'), - components=components + values["summary"], + project=values.get("project"), + issue=values.get("issue"), + issue_id=values.get("issue_id"), + minor=values.get("minor"), + components=components, ) def to_string(self, with_issue=True, with_components=True): out = "" if with_issue and self.issue: - out += "{}: ".format(self.issue) + out += f"{self.issue}: " if with_components and self.components: for component in self.components: - out += "[{}]".format(component) + out += f"[{component}]" out += " " out += self.summary return out class Commit: - def __init__(self, wrapped): self._title = CommitTitle.parse(wrapped.summary) self._wrapped = wrapped @@ -225,13 +227,12 @@ def __getattr__(self, attr): return getattr(self._wrapped, attr) def __repr__(self): - template = '<Commit sha={!r} issue={!r} components={!r} summary={!r}>' - return template.format(self.hexsha, self.issue, self.components, - self.summary) + template = "<Commit sha={!r} issue={!r} components={!r} summary={!r}>" + return template.format(self.hexsha, self.issue, self.components, self.summary) @property def url(self): - return 'https://github.com/apache/arrow/commit/{}'.format(self.hexsha) + return f"https://github.com/apache/arrow/commit/{self.hexsha}" @property def title(self): @@ -239,9 +240,7 @@ def title(self): class Release: - - def __new__(self, version, repo=None, github_token=None, - issue_tracker=None): + def __new__(self, version, repo=None, github_token=None, issue_tracker=None): if isinstance(version, str): version = Version.parse(version) elif not isinstance(version, Version): @@ -268,8 +267,7 @@ def __init__(self, version, repo, issue_tracker): elif isinstance(repo, (str, pathlib.Path)): repo = Repo(repo) elif not isinstance(repo, Repo): - raise TypeError("`repo` argument must be a path or a valid Repo " - "instance") + raise TypeError("`repo` argument must be a path or a valid Repo instance") if isinstance(version, str): version = issue_tracker.project_version(version) @@ -299,17 +297,13 @@ def tag(self): @property @abstractmethod def branch(self): - """ - Target branch that serves as the base for the release. - """ + """Target branch that serves as the base for the release.""" ... @property @abstractmethod def siblings(self): - """ - Releases to consider when calculating previous and next releases. - """ + """Releases to consider when calculating previous and next releases.""" ... 
@cached_property @@ -322,40 +316,34 @@ def previous(self): # first release doesn't have a previous one return None else: - return Release(previous, repo=self.repo, - issue_tracker=self.issue_tracker) + return Release(previous, repo=self.repo, issue_tracker=self.issue_tracker) @cached_property def next(self): # select all non-patch releases position = self.siblings.index(self.version) if position <= 0: - raise ValueError("There is no upcoming release set in JIRA after " - f"version {self.version}") + raise ValueError( + f"There is no upcoming release set in JIRA after version {self.version}" + ) upcoming = self.siblings[position - 1] - return Release(upcoming, repo=self.repo, - issue_tracker=self.issue_tracker) + return Release(upcoming, repo=self.repo, issue_tracker=self.issue_tracker) @cached_property def issues(self): - issues = self.issue_tracker.project_issues( - self.version - ) + issues = self.issue_tracker.project_issues(self.version) return {i.key: i for i in issues} @cached_property def github_issue_ids(self): - return {v.github_issue_id for v in self.issues.values() - if v.github_issue_id} + return {v.github_issue_id for v in self.issues.values() if v.github_issue_id} @cached_property def commits(self): - """ - All commits applied between two versions. - """ + """All commits applied between two versions.""" if self.previous is None: # first release - lower = '' + lower = "" else: lower = self.repo.tags[self.previous.tag] @@ -363,13 +351,15 @@ def commits(self): try: upper = self.repo.tags[self.tag] except IndexError: - warnings.warn(f"Release tag `{self.tag}` doesn't exist.") + warnings.warn(f"Release tag `{self.tag}` doesn't exist.", stacklevel=2) return [] else: try: upper = self.repo.branches[self.branch] except IndexError: - warnings.warn(f"Release branch `{self.branch}` doesn't exist.") + warnings.warn( + f"Release branch `{self.branch}` doesn't exist.", stacklevel=2 + ) return [] commit_range = f"{lower}..{upper}" @@ -404,12 +394,15 @@ def default_branch(self): except (KeyError, IndexError): # Use a hard-coded default value to set default_branch_name default_branch_name = "main" - warnings.warn('Unable to determine default branch name: ' - 'ARCHERY_DEFAULT_BRANCH environment variable is ' - 'not set. Git repository does not contain a ' - '\'refs/remotes/origin/HEAD\'reference. Setting ' - 'the default branch name to ' + - default_branch_name, RuntimeWarning) + warnings.warn( + "Unable to determine default branch name: " + "ARCHERY_DEFAULT_BRANCH environment variable is " + "not set. Git repository does not contain a " + "'refs/remotes/origin/HEAD'reference. 
Setting " + "the default branch name to " + default_branch_name, + RuntimeWarning, + stacklevel=2, + ) return default_branch_name @@ -423,26 +416,34 @@ def curate(self, minimal=False): minor.append(c) else: noissue.append(c) - elif c.project == 'GH': + elif c.project == "GH": if int(c.issue_id) in release_issues: within.append((release_issues[int(c.issue_id)], c)) else: - outside.append( - (self.issue_tracker.issue(int(c.issue_id)), c)) + outside.append((self.issue_tracker.issue(int(c.issue_id)), c)) else: - warnings.warn( - f'Issue {c.issue} does not pertain to GH') + warnings.warn(f"Issue {c.issue} does not pertain to GH", stacklevel=2) outside.append((c.issue, c)) # remaining tickets within_keys = {i.key for i, c in within} # Take into account that some issues milestoned are prs - nopatch = [issue for key, issue in release_issues.items() - if key not in within_keys and issue.is_pr is False] - - return ReleaseCuration(release=self, within=within, outside=outside, - noissue=noissue, parquet=parquet, - nopatch=nopatch, minimal=minimal, minor=minor) + nopatch = [ + issue + for key, issue in release_issues.items() + if key not in within_keys and issue.is_pr is False + ] + + return ReleaseCuration( + release=self, + within=within, + outside=outside, + noissue=noissue, + parquet=parquet, + nopatch=nopatch, + minimal=minimal, + minor=minor, + ) def changelog(self): issue_commit_pairs = [] @@ -459,18 +460,18 @@ def changelog(self): # organize issues into categories issue_types = { - 'Bug': 'Bug Fixes', - 'Improvement': 'New Features and Improvements', - 'New Feature': 'New Features and Improvements', - 'Sub-task': 'New Features and Improvements', - 'Task': 'New Features and Improvements', - 'Test': 'Bug Fixes', - 'Wish': 'New Features and Improvements', - 'Type: bug': 'Bug Fixes', - 'Type: enhancement': 'New Features and Improvements', - 'Type: task': 'New Features and Improvements', - 'Type: test': 'Bug Fixes', - 'Type: usage': 'New Features and Improvements', + "Bug": "Bug Fixes", + "Improvement": "New Features and Improvements", + "New Feature": "New Features and Improvements", + "Sub-task": "New Features and Improvements", + "Task": "New Features and Improvements", + "Test": "Bug Fixes", + "Wish": "New Features and Improvements", + "Type: bug": "Bug Fixes", + "Type: enhancement": "New Features and Improvements", + "Type: task": "New Features and Improvements", + "Type: test": "Bug Fixes", + "Type: usage": "New Features and Improvements", } categories = defaultdict(list) for issue, commit in issue_commit_pairs: @@ -479,7 +480,7 @@ def changelog(self): except KeyError: # If issue or pr don't have a type assume task. # Currently the label for type is not mandatory on GitHub. - categories[issue_types['Type: task']].append((issue, commit)) + categories[issue_types["Type: task"]].append((issue, commit)) # sort issues by the issue key in ascending order for issues in categories.values(): @@ -515,8 +516,9 @@ def commits_to_pick(self, exclude_already_applied=True): # issues. This is only to correct the mapping for migrated issues. 
if c.issue and c.issue.startswith("GH-"): key = int(c.issue_id) - if ((key in self.github_issue_ids or key in self.issues) and - c.title not in already_applied): + if ( + key in self.github_issue_ids or key in self.issues + ) and c.title not in already_applied: patches_to_pick.append(c) return reversed(patches_to_pick) @@ -526,10 +528,8 @@ def cherry_pick_commits(self, recreate_branch=True): # the previous tag if self.branch in self.repo.branches: logger.info(f"Deleting branch {self.branch}") - self.repo.git.branch('-D', self.branch) - logger.info( - f"Creating branch {self.branch} from {self.base_branch} branch" - ) + self.repo.git.branch("-D", self.branch) + logger.info(f"Creating branch {self.branch} from {self.base_branch} branch") self.repo.git.checkout(self.base_branch, b=self.branch) else: # just checkout the already existing maintenance branch @@ -543,7 +543,6 @@ def cherry_pick_commits(self, recreate_branch=True): class MajorRelease(Release): - @property def branch(self): return f"maint-{self.version}" @@ -554,16 +553,16 @@ def base_branch(self): @cached_property def siblings(self): - """ - Filter only the major releases. - """ + """Filter only the major releases.""" # handle minor releases before 1.0 as major releases - return [v for v in self.issue_tracker.project_versions() - if v.patch == 0 and (v.major == 0 or v.minor == 0)] + return [ + v + for v in self.issue_tracker.project_versions() + if v.patch == 0 and (v.major == 0 or v.minor == 0) + ] class MinorRelease(Release): - @property def branch(self): return f"maint-{self.version.major}.x.x" @@ -574,15 +573,11 @@ def base_branch(self): @cached_property def siblings(self): - """ - Filter the major and minor releases. - """ - return [v for v in self.issue_tracker.project_versions() - if v.patch == 0] + """Filter the major and minor releases.""" + return [v for v in self.issue_tracker.project_versions() if v.patch == 0] class PatchRelease(Release): - @property def branch(self): return f"maint-{self.version.major}.{self.version.minor}.x" @@ -593,7 +588,5 @@ def base_branch(self): @cached_property def siblings(self): - """ - No filtering, consider all releases. - """ + """No filtering, consider all releases.""" return self.issue_tracker.project_versions() diff --git a/dev/archery/archery/release/reports.py b/dev/archery/archery/release/reports.py index 4299eaa7ede48..4bb019c1f4569 100644 --- a/dev/archery/archery/release/reports.py +++ b/dev/archery/archery/release/reports.py @@ -14,33 +14,28 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- +from __future__ import annotations from ..utils.report import JinjaReport class ReleaseCuration(JinjaReport): - templates = { - 'console': 'release_curation.txt.j2' - } + templates = {"console": "release_curation.txt.j2"} fields = [ - 'release', - 'within', - 'outside', - 'noissue', - 'parquet', - 'nopatch', - 'minimal', - 'minor' + "release", + "within", + "outside", + "noissue", + "parquet", + "nopatch", + "minimal", + "minor", ] class ReleaseChangelog(JinjaReport): templates = { - 'markdown': 'release_changelog.md.j2', - 'html': 'release_changelog.html.j2' + "markdown": "release_changelog.md.j2", + "html": "release_changelog.html.j2", } - fields = [ - 'release', - 'categories' - ] + fields = ["release", "categories"] diff --git a/dev/archery/archery/release/tests/test_release.py b/dev/archery/archery/release/tests/test_release.py index fae2bdcea04a0..920fbcd328aff 100644 --- a/dev/archery/archery/release/tests/test_release.py +++ b/dev/archery/archery/release/tests/test_release.py @@ -14,15 +14,22 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import pytest from archery.release.core import ( - Release, MajorRelease, MinorRelease, PatchRelease, - IssueTracker, Version, Issue, CommitTitle, Commit + Commit, + CommitTitle, + Issue, + IssueTracker, + MajorRelease, + MinorRelease, + PatchRelease, + Release, + Version, ) - # subset of issues per revision _issues = { "3.0.0": [ @@ -30,7 +37,7 @@ Issue("GH-9767", type="New Feature", summary="[Crossbow] Title"), Issue("GH-1231", type="Bug", summary="[Java] Title"), Issue("GH-1244", type="Bug", summary="[C++] Title"), - Issue("GH-1301", type="Bug", summary="[Python][Archery] Title") + Issue("GH-1301", type="Bug", summary="[Python][Archery] Title"), ], "2.0.0": [ Issue("ARROW-9784", type="Bug", summary="[Java] Title"), @@ -39,7 +46,7 @@ Issue("ARROW-9694", type="Bug", summary="[Release] Title"), Issue("ARROW-5643", type="Bug", summary="[Go] Title"), Issue("GH-1243", type="Bug", summary="[Python] Title"), - Issue("GH-1300", type="Bug", summary="[CI][Archery] Title") + Issue("GH-1300", type="Bug", summary="[CI][Archery] Title"), ], "1.0.1": [ Issue("ARROW-9684", type="Bug", summary="[C++] Title"), @@ -48,7 +55,7 @@ Issue("ARROW-9644", type="Bug", summary="[C++][Dataset] Title"), Issue("ARROW-9643", type="Bug", summary="[C++] Title"), Issue("ARROW-9609", type="Bug", summary="[C++] Title"), - Issue("ARROW-9606", type="Bug", summary="[C++][Dataset] Title") + Issue("ARROW-9606", type="Bug", summary="[C++][Dataset] Title"), ], "1.0.0": [ Issue("ARROW-300", type="New Feature", summary="[Format] Title"), @@ -58,7 +65,7 @@ Issue("ARROW-8472", type="Bug", summary="[Go][Integration] Title"), Issue("ARROW-8471", type="Bug", summary="[C++][Integration] Title"), Issue("ARROW-8974", type="Improvement", summary="[C++] Title"), - Issue("ARROW-8973", type="New Feature", summary="[Java] Title") + Issue("ARROW-8973", type="New Feature", summary="[Java] Title"), ], "0.17.1": [ Issue("ARROW-8684", type="Bug", summary="[Python] Title"), @@ -72,13 +79,12 @@ Issue("ARROW-2447", type="Improvement", summary="[C++] Title"), Issue("ARROW-2255", type="Bug", summary="[Integration] Title"), Issue("ARROW-1907", type="Bug", summary="[C++/Python] Title"), - Issue("ARROW-1636", type="New Feature", summary="[Format] Title") - ] + Issue("ARROW-1636", type="New Feature", summary="[Format] Title"), + ], } class FakeIssueTracker(IssueTracker): - def 
__init__(self): pass @@ -126,14 +132,14 @@ def test_version(fake_issue_tracker): def test_issue(fake_issue_tracker): - i = Issue("ARROW-1234", type='Bug', summary="title") + i = Issue("ARROW-1234", type="Bug", summary="title") assert i.key == "ARROW-1234" assert i.type == "Bug" assert i.summary == "title" assert i.project == "ARROW" assert i.number == 1234 - i = Issue("PARQUET-1111", type='Improvement', summary="another title") + i = Issue("PARQUET-1111", type="Improvement", summary="another title") assert i.key == "PARQUET-1111" assert i.type == "Improvement" assert i.summary == "another title" @@ -142,9 +148,7 @@ def test_issue(fake_issue_tracker): def test_commit_title(): - t = CommitTitle.parse( - "ARROW-9598: [C++][Parquet] Fix writing nullable structs" - ) + t = CommitTitle.parse("ARROW-9598: [C++][Parquet] Fix writing nullable structs") assert t.project == "ARROW" assert t.issue == "ARROW-9598" assert t.components == ["C++", "Parquet"] @@ -161,8 +165,7 @@ def test_commit_title(): assert t.minor is False t = CommitTitle.parse( - "ARROW-9600: [Rust][Arrow] pin older version of proc-macro2 during " - "build" + "ARROW-9600: [Rust][Arrow] pin older version of proc-macro2 during build" ) assert t.project == "ARROW" assert t.issue == "ARROW-9600" @@ -194,8 +197,8 @@ def test_commit_title(): t = CommitTitle.parse( "PARQUET-1882: [C++] Buffered Reads should allow for 0 length" ) - assert t.project == 'PARQUET' - assert t.issue == 'PARQUET-1882' + assert t.project == "PARQUET" + assert t.issue == "PARQUET-1882" assert t.components == ["C++"] assert t.summary == "Buffered Reads should allow for 0 length" assert t.minor is False @@ -205,8 +208,8 @@ def test_commit_title(): "\nsomething else\n" "\nwhich should be truncated" ) - assert t.project == 'ARROW' - assert t.issue == 'ARROW-9340' + assert t.project == "ARROW" + assert t.issue == "ARROW-9340" assert t.components == ["R"] assert t.summary == "Use CRAN version of decor package " assert t.minor is False @@ -216,27 +219,27 @@ def test_release_basics(fake_issue_tracker): r = Release("1.0.0", repo=None, issue_tracker=fake_issue_tracker) assert isinstance(r, MajorRelease) assert r.is_released is True - assert r.branch == 'maint-1.0.0' - assert r.tag == 'apache-arrow-1.0.0' + assert r.branch == "maint-1.0.0" + assert r.tag == "apache-arrow-1.0.0" r = Release("1.1.0", repo=None, issue_tracker=fake_issue_tracker) assert isinstance(r, MinorRelease) assert r.is_released is False - assert r.branch == 'maint-1.x.x' - assert r.tag == 'apache-arrow-1.1.0' + assert r.branch == "maint-1.x.x" + assert r.tag == "apache-arrow-1.1.0" # minor releases before 1.0 are treated as major releases r = Release("0.17.0", repo=None, issue_tracker=fake_issue_tracker) assert isinstance(r, MajorRelease) assert r.is_released is True - assert r.branch == 'maint-0.17.0' - assert r.tag == 'apache-arrow-0.17.0' + assert r.branch == "maint-0.17.0" + assert r.tag == "apache-arrow-0.17.0" r = Release("0.17.1", repo=None, issue_tracker=fake_issue_tracker) assert isinstance(r, PatchRelease) assert r.is_released is True - assert r.branch == 'maint-0.17.x' - assert r.tag == 'apache-arrow-0.17.1' + assert r.branch == "maint-0.17.x" + assert r.tag == "apache-arrow-0.17.1" def test_previous_and_next_release(fake_issue_tracker): @@ -284,7 +287,7 @@ def test_previous_and_next_release(fake_issue_tracker): def test_release_issues(fake_issue_tracker): # major release issues r = Release("1.0.0", repo=None, issue_tracker=fake_issue_tracker) - assert r.issues.keys() == set([ + assert 
r.issues.keys() == { "ARROW-300", "ARROW-4427", "ARROW-5035", @@ -292,47 +295,45 @@ def test_release_issues(fake_issue_tracker): "ARROW-8472", "ARROW-8471", "ARROW-8974", - "ARROW-8973" - ]) + "ARROW-8973", + } # minor release issues r = Release("0.17.0", repo=None, issue_tracker=fake_issue_tracker) - assert r.issues.keys() == set([ + assert r.issues.keys() == { "ARROW-2882", "ARROW-2587", "ARROW-2447", "ARROW-2255", "ARROW-1907", "ARROW-1636", - ]) + } # patch release issues r = Release("1.0.1", repo=None, issue_tracker=fake_issue_tracker) - assert r.issues.keys() == set([ + assert r.issues.keys() == { "ARROW-9684", "ARROW-9667", "ARROW-9659", "ARROW-9644", "ARROW-9643", "ARROW-9609", - "ARROW-9606" - ]) + "ARROW-9606", + } r = Release("2.0.0", repo=None, issue_tracker=fake_issue_tracker) - assert r.issues.keys() == set([ + assert r.issues.keys() == { "ARROW-9784", "ARROW-9767", "GH-1230", "ARROW-9694", "ARROW-5643", "GH-1243", - "GH-1300" - ]) + "GH-1300", + } -@pytest.mark.parametrize(('version', 'ncommits'), [ - ("1.0.0", 771), - ("0.17.1", 27), - ("0.17.0", 569), - ("0.15.1", 41) -]) +@pytest.mark.parametrize( + ("version", "ncommits"), + [("1.0.0", 771), ("0.17.1", 27), ("0.17.0", 569), ("0.15.1", 41)], +) def test_release_commits(fake_issue_tracker, version, ncommits): r = Release(version, repo=None, issue_tracker=fake_issue_tracker) assert len(r.commits) == ncommits @@ -345,13 +346,11 @@ def test_release_commits(fake_issue_tracker, version, ncommits): def test_maintenance_patch_selection(fake_issue_tracker): r = Release("0.17.1", repo=None, issue_tracker=fake_issue_tracker) - shas_to_pick = [ - c.hexsha for c in r.commits_to_pick(exclude_already_applied=False) - ] + shas_to_pick = [c.hexsha for c in r.commits_to_pick(exclude_already_applied=False)] expected = [ - '8939b4bd446ee406d5225c79d563a27d30fd7d6d', - 'bcef6c95a324417e85e0140f9745d342cd8784b3', - '6002ec388840de5622e39af85abdc57a2cccc9b2', - '9123dadfd123bca7af4eaa9455f5b0d1ca8b929d', + "8939b4bd446ee406d5225c79d563a27d30fd7d6d", + "bcef6c95a324417e85e0140f9745d342cd8784b3", + "6002ec388840de5622e39af85abdc57a2cccc9b2", + "9123dadfd123bca7af4eaa9455f5b0d1ca8b929d", ] assert shas_to_pick == expected diff --git a/dev/archery/archery/testing.py b/dev/archery/archery/testing.py index 3b1061ac85fa4..e4d82f9d33dcc 100644 --- a/dev/archery/archery/testing.py +++ b/dev/archery/archery/testing.py @@ -14,15 +14,15 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+from __future__ import annotations -from contextlib import contextmanager import os -from unittest import mock import re +from contextlib import contextmanager +from unittest import mock class PartialEnv(dict): - def __eq__(self, other): return self.items() <= other.items() @@ -43,7 +43,6 @@ def _ensure_mock_call_object(obj, **kwargs): class SuccessfulSubprocessResult: - def check_returncode(self): return @@ -51,10 +50,9 @@ def check_returncode(self): @contextmanager def assert_subprocess_calls(expected_commands_or_calls, **kwargs): calls = [ - _ensure_mock_call_object(obj, **kwargs) - for obj in expected_commands_or_calls + _ensure_mock_call_object(obj, **kwargs) for obj in expected_commands_or_calls ] - with mock.patch('subprocess.run', autospec=True) as run: + with mock.patch("subprocess.run", autospec=True) as run: run.return_value = SuccessfulSubprocessResult() yield run run.assert_has_calls(calls) diff --git a/dev/archery/archery/tests/test_benchmarks.py b/dev/archery/archery/tests/test_benchmarks.py index e5af2b3b02794..6e41a8b8235d9 100644 --- a/dev/archery/archery/tests/test_benchmarks.py +++ b/dev/archery/archery/tests/test_benchmarks.py @@ -14,17 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import json from archery.benchmark.codec import JsonEncoder +from archery.benchmark.compare import BenchmarkComparator, RunnerComparator from archery.benchmark.core import Benchmark, median -from archery.benchmark.compare import ( - BenchmarkComparator, RunnerComparator -) -from archery.benchmark.google import ( - GoogleBenchmark, GoogleBenchmarkObservation -) +from archery.benchmark.google import GoogleBenchmark, GoogleBenchmarkObservation from archery.benchmark.runner import StaticBenchmarkRunner @@ -62,15 +59,11 @@ def test_static_runner_from_json_not_a_regression(): "name": "FloatParsing<DoubleType>", "unit": "items_per_second", "less_is_better": False, - "values": [ - 109941112.87296811 - ], + "values": [109941112.87296811], "time_unit": "ns", - "times": [ - 9095.800104330105 - ] - }, - ] + "times": [9095.800104330105], + } + ], } ] } @@ -97,7 +90,7 @@ def test_static_runner_from_json_multiple_values_not_a_regression(): 94873831.3818328, 95593675.20810866, 95797325.6543961, - 96134728.05794072 + 96134728.05794072, ], "time_unit": "ns", "times": [ @@ -105,7 +98,7 @@ def test_static_runner_from_json_multiple_values_not_a_regression(): 10575.162068480413, 10599.271208720838, 10679.028059166194, - 10827.995119861762 + 10827.995119861762, ], "counters": { "family_index": 0, @@ -114,10 +107,10 @@ def test_static_runner_from_json_multiple_values_not_a_regression(): "repetitions": 5, "repetition_index": 0, "threads": 1, - "iterations": 10656 - } + "iterations": 10656, + }, } - ] + ], } ] } @@ -138,15 +131,11 @@ def test_static_runner_from_json_regression(): "name": "FloatParsing<DoubleType>", "unit": "items_per_second", "less_is_better": False, - "values": [ - 109941112.87296811 - ], + "values": [109941112.87296811], "time_unit": "ns", - "times": [ - 9095.800104330105 - ] - }, - ] + "times": [9095.800104330105], + } + ], } ] } @@ -154,7 +143,7 @@ def test_static_runner_from_json_regression(): contender = StaticBenchmarkRunner.from_json(json.dumps(archery_result)) # introduce artificial regression - archery_result['suites'][0]['benchmarks'][0]['values'][0] *= 2 + archery_result["suites"][0]["benchmarks"][0]["values"][0] *= 2 baseline = 
StaticBenchmarkRunner.from_json(json.dumps(archery_result)) [comparison] = RunnerComparator(contender, baseline).comparisons @@ -177,7 +166,7 @@ def test_static_runner_from_json_multiple_values_regression(): 94873831.3818328, 95593675.20810866, 95797325.6543961, - 96134728.05794072 + 96134728.05794072, ], "time_unit": "ns", "times": [ @@ -185,7 +174,7 @@ def test_static_runner_from_json_multiple_values_regression(): 10575.162068480413, 10599.271208720838, 10679.028059166194, - 10827.995119861762 + 10827.995119861762, ], "counters": { "family_index": 0, @@ -194,10 +183,10 @@ def test_static_runner_from_json_multiple_values_regression(): "repetitions": 5, "repetition_index": 0, "threads": 1, - "iterations": 10656 - } + "iterations": 10656, + }, } - ] + ], } ] } @@ -205,7 +194,7 @@ def test_static_runner_from_json_multiple_values_regression(): contender = StaticBenchmarkRunner.from_json(json.dumps(archery_result)) # introduce artificial regression - values = archery_result['suites'][0]['benchmarks'][0]['values'] + values = archery_result["suites"][0]["benchmarks"][0]["values"] values[:] = [v * 2 for v in values] baseline = StaticBenchmarkRunner.from_json(json.dumps(archery_result)) @@ -221,7 +210,7 @@ def test_benchmark_median(): assert median([1, 1, 1, 1]) == 1 try: median([]) - assert False + raise AssertionError() except ValueError: pass @@ -251,12 +240,14 @@ def test_items_per_second(): "time_unit": "ns", } archery_result = { - "counters": {"iterations": 5964, - "null_percent": 0.0, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "threads": 1}, + "counters": { + "iterations": 5964, + "null_percent": 0.0, + "repetition_index": 0, + "repetitions": 0, + "run_name": name, + "threads": 1, + }, "name": name, "unit": "items_per_second", "less_is_better": False, @@ -285,11 +276,13 @@ def test_bytes_per_second(): "time_unit": "ns", } archery_result = { - "counters": {"iterations": 47, - "repetition_index": 1, - "repetitions": 0, - "run_name": name, - "threads": 1}, + "counters": { + "iterations": 47, + "repetition_index": 1, + "repetitions": 0, + "run_name": name, + "threads": 1, + }, "name": name, "unit": "bytes_per_second", "less_is_better": False, @@ -322,12 +315,14 @@ def test_both_items_and_bytes_per_second(): } # Note that bytes_per_second trumps items_per_second archery_result = { - "counters": {"iterations": 5964, - "null_percent": 0.0, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "threads": 1}, + "counters": { + "iterations": 5964, + "null_percent": 0.0, + "repetition_index": 0, + "repetitions": 0, + "run_name": name, + "threads": 1, + }, "name": name, "unit": "bytes_per_second", "less_is_better": False, @@ -355,11 +350,13 @@ def test_neither_items_nor_bytes_per_second(): "time_unit": "ns", } archery_result = { - "counters": {"iterations": 352765, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "threads": 1}, + "counters": { + "iterations": 352765, + "repetition_index": 0, + "repetitions": 0, + "run_name": name, + "threads": 1, + }, "name": name, "unit": "ns", "less_is_better": True, @@ -387,11 +384,13 @@ def test_prefer_real_time(): "time_unit": "ns", } archery_result = { - "counters": {"iterations": 352765, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "threads": 1}, + "counters": { + "iterations": 352765, + "repetition_index": 0, + "repetitions": 0, + "run_name": name, + "threads": 1, + }, "name": name, "unit": "ns", "less_is_better": True, @@ -418,11 +417,13 @@ def test_prefer_cpu_time(): "time_unit": 
"ns", } archery_result = { - "counters": {"iterations": 352765, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "threads": 1}, + "counters": { + "iterations": 352765, + "repetition_index": 0, + "repetitions": 0, + "run_name": name, + "threads": 1, + }, "name": name, "unit": "ns", "less_is_better": True, @@ -461,11 +462,13 @@ def test_omits_aggregates(): "time_unit": "ns", } archery_result = { - "counters": {"iterations": 352765, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "threads": 1}, + "counters": { + "iterations": 352765, + "repetition_index": 0, + "repetitions": 0, + "run_name": name, + "threads": 1, + }, "name": name, "unit": "ns", "less_is_better": True, @@ -486,68 +489,68 @@ def test_multiple_observations(): name = "FloatParsing<DoubleType>" google_results = [ { - 'cpu_time': 10627.38199641615, - 'family_index': 0, - 'items_per_second': 94096551.75067839, - 'iterations': 9487, - 'name': 'FloatParsing<DoubleType>', - 'per_family_instance_index': 0, - 'real_time': 10628.84905663701, - 'repetition_index': 0, - 'repetitions': 3, - 'run_name': 'FloatParsing<DoubleType>', - 'run_type': 'iteration', - 'threads': 1, - 'time_unit': 'ns' + "cpu_time": 10627.38199641615, + "family_index": 0, + "items_per_second": 94096551.75067839, + "iterations": 9487, + "name": "FloatParsing<DoubleType>", + "per_family_instance_index": 0, + "real_time": 10628.84905663701, + "repetition_index": 0, + "repetitions": 3, + "run_name": "FloatParsing<DoubleType>", + "run_type": "iteration", + "threads": 1, + "time_unit": "ns", }, { - 'cpu_time': 10633.318014124594, - 'family_index': 0, - 'items_per_second': 94044022.63448404, - 'iterations': 9487, - 'name': 'FloatParsing<DoubleType>', - 'per_family_instance_index': 0, - 'real_time': 10634.858754122948, - 'repetition_index': 1, - 'repetitions': 3, - 'run_name': 'FloatParsing<DoubleType>', - 'run_type': 'iteration', - 'threads': 1, - 'time_unit': 'ns' + "cpu_time": 10633.318014124594, + "family_index": 0, + "items_per_second": 94044022.63448404, + "iterations": 9487, + "name": "FloatParsing<DoubleType>", + "per_family_instance_index": 0, + "real_time": 10634.858754122948, + "repetition_index": 1, + "repetitions": 3, + "run_name": "FloatParsing<DoubleType>", + "run_type": "iteration", + "threads": 1, + "time_unit": "ns", }, { - 'cpu_time': 10664.315484347, - 'family_index': 0, - 'items_per_second': 93770669.24434038, - 'iterations': 9487, - 'name': 'FloatParsing<DoubleType>', - 'per_family_instance_index': 0, - 'real_time': 10665.584589337563, - 'repetition_index': 2, - 'repetitions': 3, - 'run_name': 'FloatParsing<DoubleType>', - 'run_type': 'iteration', - 'threads': 1, - 'time_unit': 'ns' - } + "cpu_time": 10664.315484347, + "family_index": 0, + "items_per_second": 93770669.24434038, + "iterations": 9487, + "name": "FloatParsing<DoubleType>", + "per_family_instance_index": 0, + "real_time": 10665.584589337563, + "repetition_index": 2, + "repetitions": 3, + "run_name": "FloatParsing<DoubleType>", + "run_type": "iteration", + "threads": 1, + "time_unit": "ns", + }, ] archery_result = { - 'counters': { - 'family_index': 0, - 'iterations': 9487, - 'per_family_instance_index': 0, - 'repetition_index': 2, - 'repetitions': 3, - 'run_name': 'FloatParsing<DoubleType>', - 'threads': 1 + "counters": { + "family_index": 0, + "iterations": 9487, + "per_family_instance_index": 0, + "repetition_index": 2, + "repetitions": 3, + "run_name": "FloatParsing<DoubleType>", + "threads": 1, }, - 'less_is_better': False, - 'name': 
'FloatParsing<DoubleType>', - 'time_unit': 'ns', - 'times': [10628.84905663701, 10634.858754122948, 10665.584589337563], - 'unit': 'items_per_second', - 'values': [93770669.24434038, 94044022.63448404, 94096551.75067839] + "less_is_better": False, + "name": "FloatParsing<DoubleType>", + "time_unit": "ns", + "times": [10628.84905663701, 10634.858754122948, 10665.584589337563], + "unit": "items_per_second", + "values": [93770669.24434038, 94044022.63448404, 94096551.75067839], } observations = [GoogleBenchmarkObservation(**g) for g in google_results] diff --git a/dev/archery/archery/tests/test_bot.py b/dev/archery/archery/tests/test_bot.py index 5d32cdfd9a59a..1fbac5cea261b 100644 --- a/dev/archery/archery/tests/test_bot.py +++ b/dev/archery/archery/tests/test_bot.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import json import os @@ -25,11 +26,11 @@ import responses as rsps from archery.bot import ( - CommentBot, CommandError, + CommentBot, PullRequestState, PullRequestWorkflowBot, - group + group, ) @@ -41,16 +42,19 @@ def responses(): @pytest.fixture(autouse=True) def set_env_vars(): - with mock.patch.dict(os.environ, { - "GITHUB_SERVER_URL": "https://github.com", - "GITHUB_REPOSITORY": "apache/arrow", - "GITHUB_RUN_ID": "1463784188" - }): + with mock.patch.dict( + os.environ, + { + "GITHUB_SERVER_URL": "https://github.com", + "GITHUB_REPOSITORY": "apache/arrow", + "GITHUB_RUN_ID": "1463784188", + }, + ): yield def github_url(path): - return 'https://api.github.com:443/{}'.format(path.strip('/')) + return "https://api.github.com:443/{}".format(path.strip("/")) @group() @@ -65,40 +69,43 @@ def extra(obj): @custom_handler.command() -@click.option('--force', '-f', is_flag=True) +@click.option("--force", "-f", is_flag=True) def build(force): return force @custom_handler.command() -@click.option('--name', required=True) +@click.option("--name", required=True) def benchmark(name): return name def test_click_based_commands(): - assert custom_handler('build') is False - assert custom_handler('build -f') is True + assert custom_handler("build") is False + assert custom_handler("build -f") is True - assert custom_handler('benchmark --name strings') == 'strings' + assert custom_handler("benchmark --name strings") == "strings" with pytest.raises(CommandError): - assert custom_handler('benchmark') + assert custom_handler("benchmark") - assert custom_handler('extra', extra='data') == {'extra': 'data'} + assert custom_handler("extra", extra="data") == {"extra": "data"} -@pytest.mark.parametrize('fixture_name', [ - # the bot is not mentioned, nothing to do - 'event-issue-comment-not-mentioning-ursabot.json', - # don't respond to itself, it prevents recursive comment storms! - 'event-issue-comment-by-ursabot.json', -]) +@pytest.mark.parametrize( + "fixture_name", + [ + # the bot is not mentioned, nothing to do + "event-issue-comment-not-mentioning-ursabot.json", + # don't respond to itself, it prevents recursive comment storms! 
+ "event-issue-comment-by-ursabot.json", + ], +) def test_noop_events(load_fixture, fixture_name): payload = load_fixture(fixture_name) handler = Mock() - bot = CommentBot(name='ursabot', handler=handler) - bot.handle('issue_comment', payload) + bot = CommentBot(name="ursabot", handler=handler) + bot.handle("issue_comment", payload) handler.assert_not_called() @@ -106,276 +113,273 @@ def test_noop_events(load_fixture, fixture_name): def test_unauthorized_user_comment(load_fixture, responses): responses.add( responses.GET, - github_url('/repositories/169101701/issues/26'), - json=load_fixture('issue-26.json'), - status=200 + github_url("/repositories/169101701/issues/26"), + json=load_fixture("issue-26.json"), + status=200, ) responses.add( responses.GET, - github_url('/repos/ursa-labs/ursabot/pulls/26'), - json=load_fixture('pull-request-26.json'), - status=200 + github_url("/repos/ursa-labs/ursabot/pulls/26"), + json=load_fixture("pull-request-26.json"), + status=200, ) responses.add( responses.GET, - github_url('/repos/ursa-labs/ursabot/issues/comments/480243815'), - json=load_fixture('pull-request-26.json'), - status=200 + github_url("/repos/ursa-labs/ursabot/issues/comments/480243815"), + json=load_fixture("pull-request-26.json"), + status=200, ) responses.add( responses.POST, - github_url('/repos/ursa-labs/ursabot/issues/26/comments'), - json={} + github_url("/repos/ursa-labs/ursabot/issues/26/comments"), + json={}, ) responses.add( responses.POST, - github_url( - '/repos/ursa-labs/ursabot/pulls/26/reactions'), - json=() + github_url("/repos/ursa-labs/ursabot/pulls/26/reactions"), + json=(), ) def handler(command, **kwargs): pass - payload = load_fixture('event-issue-comment-by-non-authorized-user.json') - payload["comment"]["body"] = '@ursabot crossbow submit -g nightly' - bot = CommentBot(name='ursabot', handler=handler) - bot.handle('issue_comment', payload) + payload = load_fixture("event-issue-comment-by-non-authorized-user.json") + payload["comment"]["body"] = "@ursabot crossbow submit -g nightly" + bot = CommentBot(name="ursabot", handler=handler) + bot.handle("issue_comment", payload) print([c.request.body for c in responses.calls]) post = responses.calls[-2] reaction = responses.calls[-1] - comment = ("```\nOnly contributors can submit requests to this bot. " - "Please ask someone from the community for help with getting " - "the first commit in.\n" - "The Archery job run can be found at: " - "https://github.com/apache/arrow/actions/runs/1463784188\n" - "```") - assert json.loads(post.request.body) == { - "body": f'{comment}'} - assert json.loads(reaction.request.body) == {'content': '-1'} + comment = ( + "```\nOnly contributors can submit requests to this bot. 
" + "Please ask someone from the community for help with getting " + "the first commit in.\n" + "The Archery job run can be found at: " + "https://github.com/apache/arrow/actions/runs/1463784188\n" + "```" + ) + assert json.loads(post.request.body) == {"body": f"{comment}"} + assert json.loads(reaction.request.body) == {"content": "-1"} def test_issue_comment_without_pull_request(load_fixture, responses): responses.add( responses.GET, - github_url('/repositories/169101701/issues/19'), - json=load_fixture('issue-19.json'), - status=200 + github_url("/repositories/169101701/issues/19"), + json=load_fixture("issue-19.json"), + status=200, ) responses.add( responses.GET, - github_url('repos/ursa-labs/ursabot/pulls/19'), + github_url("repos/ursa-labs/ursabot/pulls/19"), json={}, - status=404 + status=404, ) responses.add( responses.POST, - github_url('/repos/ursa-labs/ursabot/issues/19/comments'), - json={} + github_url("/repos/ursa-labs/ursabot/issues/19/comments"), + json={}, ) def handler(command, **kwargs): pass - payload = load_fixture('event-issue-comment-without-pull-request.json') - bot = CommentBot(name='ursabot', handler=handler) - bot.handle('issue_comment', payload) + payload = load_fixture("event-issue-comment-without-pull-request.json") + bot = CommentBot(name="ursabot", handler=handler) + bot.handle("issue_comment", payload) post = responses.calls[2] assert json.loads(post.request.body) == { - 'body': "The comment bot only listens to pull request comments!" + "body": "The comment bot only listens to pull request comments!" } def test_respond_with_usage(load_fixture, responses): responses.add( responses.GET, - github_url('/repositories/169101701/issues/26'), - json=load_fixture('issue-26.json'), - status=200 + github_url("/repositories/169101701/issues/26"), + json=load_fixture("issue-26.json"), + status=200, ) responses.add( responses.GET, - github_url('/repos/ursa-labs/ursabot/pulls/26'), - json=load_fixture('pull-request-26.json'), - status=200 + github_url("/repos/ursa-labs/ursabot/pulls/26"), + json=load_fixture("pull-request-26.json"), + status=200, ) responses.add( responses.GET, - github_url('/repos/ursa-labs/ursabot/issues/comments/480243811'), - json=load_fixture('issue-comment-480243811.json') + github_url("/repos/ursa-labs/ursabot/issues/comments/480243811"), + json=load_fixture("issue-comment-480243811.json"), ) responses.add( responses.POST, - github_url('/repos/ursa-labs/ursabot/issues/26/comments'), - json={} + github_url("/repos/ursa-labs/ursabot/issues/26/comments"), + json={}, ) responses.add( responses.POST, - github_url( - '/repos/ursa-labs/ursabot/issues/comments/479081273/reactions'), - json=() + github_url("/repos/ursa-labs/ursabot/issues/comments/479081273/reactions"), + json=(), ) def handler(command, **kwargs): - raise CommandError('test-usage') + raise CommandError("test-usage") - payload = load_fixture('event-issue-comment-with-empty-command.json') - bot = CommentBot(name='ursabot', handler=handler) - bot.handle('issue_comment', payload) + payload = load_fixture("event-issue-comment-with-empty-command.json") + bot = CommentBot(name="ursabot", handler=handler) + bot.handle("issue_comment", payload) post = responses.calls[3] - assert json.loads(post.request.body) == \ - {'body': - ("```\ntest-usage\n" - "The Archery job run can be found at: " - "https://github.com/apache/arrow/actions/runs/1463784188\n" - "```") - } + assert json.loads(post.request.body) == { + "body": ( + "```\ntest-usage\n" + "The Archery job run can be found at: " + 
"https://github.com/apache/arrow/actions/runs/1463784188\n" + "```" + ) + } -@pytest.mark.parametrize(('command', 'reaction'), [ - ('@ursabot build', '+1'), - ('@ursabot build\nwith a comment', '+1'), -]) -def test_issue_comment_with_commands(load_fixture, responses, command, - reaction): +@pytest.mark.parametrize( + ("command", "reaction"), + [("@ursabot build", "+1"), ("@ursabot build\nwith a comment", "+1")], +) +def test_issue_comment_with_commands(load_fixture, responses, command, reaction): responses.add( responses.GET, - github_url('/repositories/169101701/issues/26'), - json=load_fixture('issue-26.json'), - status=200 + github_url("/repositories/169101701/issues/26"), + json=load_fixture("issue-26.json"), + status=200, ) responses.add( responses.GET, - github_url('/repos/ursa-labs/ursabot/pulls/26'), - json=load_fixture('pull-request-26.json'), - status=200 + github_url("/repos/ursa-labs/ursabot/pulls/26"), + json=load_fixture("pull-request-26.json"), + status=200, ) responses.add( responses.GET, - github_url('/repos/ursa-labs/ursabot/issues/comments/480248726'), - json=load_fixture('issue-comment-480248726.json') + github_url("/repos/ursa-labs/ursabot/issues/comments/480248726"), + json=load_fixture("issue-comment-480248726.json"), ) responses.add( responses.POST, - github_url( - '/repos/ursa-labs/ursabot/issues/comments/480248726/reactions' - ), - json={} + github_url("/repos/ursa-labs/ursabot/issues/comments/480248726/reactions"), + json={}, ) def handler(command, **kwargs): - if command == 'build': + if command == "build": return True else: - raise ValueError('Only `build` command is supported.') + raise ValueError("Only `build` command is supported.") - payload = load_fixture('event-issue-comment-build-command.json') + payload = load_fixture("event-issue-comment-build-command.json") payload["comment"]["body"] = command - bot = CommentBot(name='ursabot', handler=handler) - bot.handle('issue_comment', payload) + bot = CommentBot(name="ursabot", handler=handler) + bot.handle("issue_comment", payload) post = responses.calls[3] - assert json.loads(post.request.body) == {'content': reaction} + assert json.loads(post.request.body) == {"content": reaction} -@pytest.mark.parametrize(('command', 'reaction'), [ - ('@ursabot listen', '-1'), -]) -def test_issue_comment_invalid_commands(load_fixture, responses, command, - reaction): +@pytest.mark.parametrize(("command", "reaction"), [("@ursabot listen", "-1")]) +def test_issue_comment_invalid_commands(load_fixture, responses, command, reaction): responses.add( responses.GET, - github_url('/repositories/169101701/issues/26'), - json=load_fixture('issue-26.json'), - status=200 + github_url("/repositories/169101701/issues/26"), + json=load_fixture("issue-26.json"), + status=200, ) responses.add( responses.GET, - github_url('/repos/ursa-labs/ursabot/pulls/26'), - json=load_fixture('pull-request-26.json'), - status=200 + github_url("/repos/ursa-labs/ursabot/pulls/26"), + json=load_fixture("pull-request-26.json"), + status=200, ) responses.add( responses.GET, - github_url('/repos/ursa-labs/ursabot/issues/comments/480248726'), - json=load_fixture('issue-comment-480248726.json') + github_url("/repos/ursa-labs/ursabot/issues/comments/480248726"), + json=load_fixture("issue-comment-480248726.json"), ) responses.add( responses.POST, - github_url( - '/repos/ursa-labs/ursabot/issues/comments/480248726/reactions' - ), - json={} + github_url("/repos/ursa-labs/ursabot/issues/comments/480248726/reactions"), + json={}, ) responses.add( responses.POST, - 
github_url('/repos/ursa-labs/ursabot/issues/26/comments'), - json={} + github_url("/repos/ursa-labs/ursabot/issues/26/comments"), + json={}, ) def handler(command, **kwargs): - if command == 'build': + if command == "build": return True else: - raise ValueError('Only `build` command is supported.') + raise ValueError("Only `build` command is supported.") - payload = load_fixture('event-issue-comment-build-command.json') + payload = load_fixture("event-issue-comment-build-command.json") payload["comment"]["body"] = command - bot = CommentBot(name='ursabot', handler=handler) - bot.handle('issue_comment', payload) + bot = CommentBot(name="ursabot", handler=handler) + bot.handle("issue_comment", payload) # Setting reaction is always the last call post = responses.calls[-1] - assert json.loads(post.request.body) == {'content': reaction} + assert json.loads(post.request.body) == {"content": reaction} def test_issue_comment_with_commands_bot_not_first(load_fixture, responses): # when the @-mention is not first, this is a no-op handler = Mock() - payload = load_fixture('event-issue-comment-build-command.json') - payload["comment"]["body"] = 'with a comment\n@ursabot build' + payload = load_fixture("event-issue-comment-build-command.json") + payload["comment"]["body"] = "with a comment\n@ursabot build" - bot = CommentBot(name='ursabot', handler=handler) - bot.handle('issue_comment', payload) + bot = CommentBot(name="ursabot", handler=handler) + bot.handle("issue_comment", payload) handler.assert_not_called() -@pytest.mark.parametrize(('fixture_name', 'expected_label'), [ - ('event-pull-request-target-opened-committer.json', - PullRequestState.committer_review.value), - ('event-pull-request-target-opened-non-committer.json', - PullRequestState.review.value), -]) +@pytest.mark.parametrize( + ("fixture_name", "expected_label"), + [ + ( + "event-pull-request-target-opened-committer.json", + PullRequestState.committer_review.value, + ), + ( + "event-pull-request-target-opened-non-committer.json", + PullRequestState.review.value, + ), + ], +) def test_open_pull_request(load_fixture, responses, fixture_name, expected_label): responses.add( responses.GET, - github_url('/repositories/169101701/pulls/26'), - json=load_fixture('pull-request-26.json'), - status=200 + github_url("/repositories/169101701/pulls/26"), + json=load_fixture("pull-request-26.json"), + status=200, ) responses.add( responses.GET, - github_url('/repos/ursa-labs/ursabot/issues/26/labels'), + github_url("/repos/ursa-labs/ursabot/issues/26/labels"), json=[], - status=200 + status=200, ) responses.add( responses.POST, - github_url( - '/repos/ursa-labs/ursabot/issues/26/labels' - ), - status=201 + github_url("/repos/ursa-labs/ursabot/issues/26/labels"), + status=201, ) payload = load_fixture(fixture_name) - bot = PullRequestWorkflowBot('pull_request_target', payload) + bot = PullRequestWorkflowBot("pull_request_target", payload) bot.handle() # Setting awaiting committer review or awaiting review label @@ -383,36 +387,39 @@ def test_open_pull_request(load_fixture, responses, fixture_name, expected_label assert json.loads(post.request.body) == [expected_label] -@pytest.mark.parametrize(('fixture_name', 'expected_label'), [ - ('event-pull-request-target-opened-non-committer.json', - PullRequestState.committer_review.value), -]) -def test_open_pull_request_with_committer_list(load_fixture, responses, fixture_name, - expected_label): +@pytest.mark.parametrize( + ("fixture_name", "expected_label"), + [ + ( + 
"event-pull-request-target-opened-non-committer.json", + PullRequestState.committer_review.value, + ) + ], +) +def test_open_pull_request_with_committer_list( + load_fixture, responses, fixture_name, expected_label +): responses.add( responses.GET, - github_url('/repositories/169101701/pulls/26'), - json=load_fixture('pull-request-26.json'), - status=200 + github_url("/repositories/169101701/pulls/26"), + json=load_fixture("pull-request-26.json"), + status=200, ) responses.add( responses.GET, - github_url('/repos/ursa-labs/ursabot/issues/26/labels'), + github_url("/repos/ursa-labs/ursabot/issues/26/labels"), json=[], - status=200 + status=200, ) responses.add( responses.POST, - github_url( - '/repos/ursa-labs/ursabot/issues/26/labels' - ), - status=201 + github_url("/repos/ursa-labs/ursabot/issues/26/labels"), + status=201, ) payload = load_fixture(fixture_name) # Even though the author_association is not committer the list overrides. - bot = PullRequestWorkflowBot( - 'pull_request_target', payload, committers=['kszucs']) + bot = PullRequestWorkflowBot("pull_request_target", payload, committers=["kszucs"]) bot.handle() # Setting awaiting committer review or awaiting review label @@ -420,250 +427,265 @@ def test_open_pull_request_with_committer_list(load_fixture, responses, fixture_ assert json.loads(post.request.body) == [expected_label] -@pytest.mark.parametrize(('fixture_name', 'expected_label'), [ - ('event-pull-request-target-opened-committer.json', - PullRequestState.committer_review.value), -]) +@pytest.mark.parametrize( + ("fixture_name", "expected_label"), + [ + ( + "event-pull-request-target-opened-committer.json", + PullRequestState.committer_review.value, + ) + ], +) def test_open_pull_request_with_existing_label( - load_fixture, responses, fixture_name, expected_label): + load_fixture, responses, fixture_name, expected_label +): responses.add( responses.GET, - github_url('/repositories/169101701/pulls/26'), - json=load_fixture('pull-request-26-awaiting-review.json'), - status=200 + github_url("/repositories/169101701/pulls/26"), + json=load_fixture("pull-request-26-awaiting-review.json"), + status=200, ) responses.add( responses.GET, - github_url('/repos/ursa-labs/ursabot/issues/26/labels'), - json=load_fixture('label-awaiting-review.json'), - status=200 + github_url("/repos/ursa-labs/ursabot/issues/26/labels"), + json=load_fixture("label-awaiting-review.json"), + status=200, ) responses.add( responses.DELETE, - github_url('/repos/ursa-labs/ursabot/issues/26/labels/awaiting%20review'), - status=200 + github_url("/repos/ursa-labs/ursabot/issues/26/labels/awaiting%20review"), + status=200, ) responses.add( responses.POST, - github_url( - '/repos/ursa-labs/ursabot/issues/26/labels' - ), - status=201 + github_url("/repos/ursa-labs/ursabot/issues/26/labels"), + status=201, ) payload = load_fixture(fixture_name) - payload['pull_request']['labels'] = ['awaiting review'] + payload["pull_request"]["labels"] = ["awaiting review"] - bot = PullRequestWorkflowBot('pull_request_target', payload) + bot = PullRequestWorkflowBot("pull_request_target", payload) bot.handle() post = responses.calls[-1] assert json.loads(post.request.body) == [expected_label] -@pytest.mark.parametrize(('fixture_name', 'review_state', 'expected_label'), [ - ('event-pr-review-committer.json', 'commented', PullRequestState.changes.value), - ('event-pr-review-committer.json', 'changes_requested', - PullRequestState.changes.value), - ('event-pr-review-committer.json', 'approved', PullRequestState.merge.value), - 
('event-pr-review-non-committer.json', 'commented', - PullRequestState.committer_review.value), - ('event-pr-review-non-committer.json', 'changes_requested', - PullRequestState.committer_review.value), - ('event-pr-review-non-committer.json', 'approved', - PullRequestState.committer_review.value), -]) +@pytest.mark.parametrize( + ("fixture_name", "review_state", "expected_label"), + [ + ("event-pr-review-committer.json", "commented", PullRequestState.changes.value), + ( + "event-pr-review-committer.json", + "changes_requested", + PullRequestState.changes.value, + ), + ("event-pr-review-committer.json", "approved", PullRequestState.merge.value), + ( + "event-pr-review-non-committer.json", + "commented", + PullRequestState.committer_review.value, + ), + ( + "event-pr-review-non-committer.json", + "changes_requested", + PullRequestState.committer_review.value, + ), + ( + "event-pr-review-non-committer.json", + "approved", + PullRequestState.committer_review.value, + ), + ], +) def test_pull_request_review_awaiting_review( - load_fixture, responses, fixture_name, review_state, expected_label): + load_fixture, responses, fixture_name, review_state, expected_label +): responses.add( responses.GET, - github_url('/repositories/169101701/pulls/26'), - json=load_fixture('pull-request-26-awaiting-review.json'), - status=200 + github_url("/repositories/169101701/pulls/26"), + json=load_fixture("pull-request-26-awaiting-review.json"), + status=200, ) responses.add( responses.GET, - github_url('/repos/ursa-labs/ursabot/issues/26/labels'), - json=load_fixture('label-awaiting-review.json'), - status=200 + github_url("/repos/ursa-labs/ursabot/issues/26/labels"), + json=load_fixture("label-awaiting-review.json"), + status=200, ) responses.add( responses.DELETE, - github_url('/repos/ursa-labs/ursabot/issues/26/labels/awaiting%20review'), - status=200 + github_url("/repos/ursa-labs/ursabot/issues/26/labels/awaiting%20review"), + status=200, ) responses.add( responses.POST, - github_url( - '/repos/ursa-labs/ursabot/issues/26/labels' - ), - status=201 + github_url("/repos/ursa-labs/ursabot/issues/26/labels"), + status=201, ) payload = load_fixture(fixture_name) - payload['pull_request']['labels'] = ['awaiting review'] - payload['review']['state'] = review_state + payload["pull_request"]["labels"] = ["awaiting review"] + payload["review"]["state"] = review_state - bot = PullRequestWorkflowBot('pull_request_review', payload) + bot = PullRequestWorkflowBot("pull_request_review", payload) bot.handle() post = responses.calls[-1] assert json.loads(post.request.body) == [expected_label] -@pytest.mark.parametrize(('review_state', 'expected_label'), [ - ('commented', PullRequestState.changes.value), - ('changes_requested', PullRequestState.changes.value), - ('approved', PullRequestState.merge.value), -]) +@pytest.mark.parametrize( + ("review_state", "expected_label"), + [ + ("commented", PullRequestState.changes.value), + ("changes_requested", PullRequestState.changes.value), + ("approved", PullRequestState.merge.value), + ], +) def test_pull_request_committer_review_awaiting_change_review( - load_fixture, responses, review_state, expected_label): + load_fixture, responses, review_state, expected_label +): responses.add( responses.GET, - github_url('/repositories/169101701/pulls/26'), - json=load_fixture('pull-request-26-awaiting-review.json'), - status=200 + github_url("/repositories/169101701/pulls/26"), + json=load_fixture("pull-request-26-awaiting-review.json"), + status=200, ) responses.add( responses.GET, - 
github_url('/repos/ursa-labs/ursabot/issues/26/labels'), - json=load_fixture('label-awaiting-change-review.json'), - status=200 + github_url("/repos/ursa-labs/ursabot/issues/26/labels"), + json=load_fixture("label-awaiting-change-review.json"), + status=200, ) responses.add( responses.DELETE, - github_url('/repos/ursa-labs/ursabot/issues/26/' + - 'labels/awaiting%20change%20review'), - status=200 + github_url( + "/repos/ursa-labs/ursabot/issues/26/" + "labels/awaiting%20change%20review" + ), + status=200, ) responses.add( responses.POST, - github_url( - '/repos/ursa-labs/ursabot/issues/26/labels' - ), - status=201 + github_url("/repos/ursa-labs/ursabot/issues/26/labels"), + status=201, ) - payload = load_fixture('event-pr-review-committer.json') - payload['pull_request']['labels'] = ['awaiting change review'] - payload['review']['state'] = review_state + payload = load_fixture("event-pr-review-committer.json") + payload["pull_request"]["labels"] = ["awaiting change review"] + payload["review"]["state"] = review_state - bot = PullRequestWorkflowBot('pull_request_review', payload) + bot = PullRequestWorkflowBot("pull_request_review", payload) bot.handle() post = responses.calls[-1] assert json.loads(post.request.body) == [expected_label] -@pytest.mark.parametrize('review_state', [ - 'commented', 'changes_requested', 'approved']) +@pytest.mark.parametrize("review_state", ["commented", "changes_requested", "approved"]) def test_pull_request_non_committer_review_awaiting_change_review( - load_fixture, responses, review_state): + load_fixture, responses, review_state +): responses.add( responses.GET, - github_url('/repositories/169101701/pulls/26'), - json=load_fixture('pull-request-26-awaiting-review.json'), - status=200 + github_url("/repositories/169101701/pulls/26"), + json=load_fixture("pull-request-26-awaiting-review.json"), + status=200, ) responses.add( responses.GET, - github_url('/repos/ursa-labs/ursabot/issues/26/labels'), - json=load_fixture('label-awaiting-change-review.json'), - status=200 + github_url("/repos/ursa-labs/ursabot/issues/26/labels"), + json=load_fixture("label-awaiting-change-review.json"), + status=200, ) - payload = load_fixture('event-pr-review-non-committer.json') - payload['pull_request']['labels'] = ['awaiting change review'] - payload['review']['state'] = review_state + payload = load_fixture("event-pr-review-non-committer.json") + payload["pull_request"]["labels"] = ["awaiting change review"] + payload["review"]["state"] = review_state - bot = PullRequestWorkflowBot('pull_request_review', payload) + bot = PullRequestWorkflowBot("pull_request_review", payload) bot.handle() # No requests to delete post new labels on non-committer reviews assert len(responses.calls) == 2 -def test_pull_request_synchronize_event_on_awaiting_changes( - load_fixture, responses): - payload = load_fixture('event-pull-request-target-synchronize.json') +def test_pull_request_synchronize_event_on_awaiting_changes(load_fixture, responses): + payload = load_fixture("event-pull-request-target-synchronize.json") responses.add( responses.GET, - github_url('/repositories/169101701/pulls/26'), - json=load_fixture('pull-request-26-awaiting-review.json'), - status=200 + github_url("/repositories/169101701/pulls/26"), + json=load_fixture("pull-request-26-awaiting-review.json"), + status=200, ) responses.add( responses.GET, - github_url('/repos/ursa-labs/ursabot/issues/26/labels'), - json=load_fixture('label-awaiting-changes.json'), - status=200 + 
github_url("/repos/ursa-labs/ursabot/issues/26/labels"), + json=load_fixture("label-awaiting-changes.json"), + status=200, ) responses.add( responses.DELETE, - github_url('/repos/ursa-labs/ursabot/issues/26/' + - 'labels/awaiting%20changes'), - status=200 + github_url("/repos/ursa-labs/ursabot/issues/26/" + "labels/awaiting%20changes"), + status=200, ) responses.add( responses.POST, - github_url( - '/repos/ursa-labs/ursabot/issues/26/labels' - ), - status=201 + github_url("/repos/ursa-labs/ursabot/issues/26/labels"), + status=201, ) - bot = PullRequestWorkflowBot('pull_request_target', payload) + bot = PullRequestWorkflowBot("pull_request_target", payload) bot.handle() # after push event label changes. post = responses.calls[-1] assert json.loads(post.request.body) == ["awaiting change review"] -def test_pull_request_synchronize_event_on_awaiting_review( - load_fixture, responses): - payload = load_fixture('event-pull-request-target-synchronize.json') +def test_pull_request_synchronize_event_on_awaiting_review(load_fixture, responses): + payload = load_fixture("event-pull-request-target-synchronize.json") responses.add( responses.GET, - github_url('/repositories/169101701/pulls/26'), - json=load_fixture('pull-request-26-awaiting-review.json'), - status=200 + github_url("/repositories/169101701/pulls/26"), + json=load_fixture("pull-request-26-awaiting-review.json"), + status=200, ) responses.add( responses.GET, - github_url('/repos/ursa-labs/ursabot/issues/26/labels'), - json=load_fixture('label-awaiting-review.json'), - status=200 + github_url("/repos/ursa-labs/ursabot/issues/26/labels"), + json=load_fixture("label-awaiting-review.json"), + status=200, ) - bot = PullRequestWorkflowBot('pull_request_target', payload) + bot = PullRequestWorkflowBot("pull_request_target", payload) bot.handle() # No requests to delete or post new labels on push awaiting review assert len(responses.calls) == 2 def test_pull_request_synchronize_event_on_existing_pr_without_state( - load_fixture, responses): - payload = load_fixture('event-pull-request-target-synchronize.json') + load_fixture, responses +): + payload = load_fixture("event-pull-request-target-synchronize.json") responses.add( responses.GET, - github_url('/repositories/169101701/pulls/26'), - json=load_fixture('pull-request-26.json'), - status=200 + github_url("/repositories/169101701/pulls/26"), + json=load_fixture("pull-request-26.json"), + status=200, ) responses.add( responses.GET, - github_url('/repos/ursa-labs/ursabot/issues/26/labels'), + github_url("/repos/ursa-labs/ursabot/issues/26/labels"), json=[], - status=200 + status=200, ) responses.add( responses.POST, - github_url( - '/repos/ursa-labs/ursabot/issues/26/labels' - ), - status=201 + github_url("/repos/ursa-labs/ursabot/issues/26/labels"), + status=201, ) - bot = PullRequestWorkflowBot('pull_request_target', payload) + bot = PullRequestWorkflowBot("pull_request_target", payload) bot.handle() # after push event label get set to default post = responses.calls[-1] diff --git a/dev/archery/archery/tests/test_cli.py b/dev/archery/archery/tests/test_cli.py index 3891a2c288d68..77ab7f2f0e0b6 100644 --- a/dev/archery/archery/tests/test_cli.py +++ b/dev/archery/archery/tests/test_cli.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+from __future__ import annotations from pathlib import Path from unittest.mock import patch @@ -28,12 +29,14 @@ def test_linking_check_dependencies(fn): args = [ "linking", "check-dependencies", - "-a", "libarrow", - "-d", "libcurl", - "somelib.so" + "-a", + "libarrow", + "-d", + "libcurl", + "somelib.so", ] result = CliRunner().invoke(archery, args) assert result.exit_code == 0 fn.assert_called_once_with( - Path('somelib.so'), allowed={'libarrow'}, disallowed={'libcurl'} + Path("somelib.so"), allowed={"libarrow"}, disallowed={"libcurl"} ) diff --git a/dev/archery/archery/tests/test_testing.py b/dev/archery/archery/tests/test_testing.py index 117b9288d74b6..5db59b0ed5b4b 100644 --- a/dev/archery/archery/tests/test_testing.py +++ b/dev/archery/archery/tests/test_testing.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import subprocess @@ -23,40 +24,27 @@ def test_partial_env(): - assert PartialEnv(a=1, b=2) == {'a': 1, 'b': 2, 'c': 3} - assert PartialEnv(a=1) == {'a': 1, 'b': 2, 'c': 3} - assert PartialEnv(a=1, b=2) == {'a': 1, 'b': 2} - assert PartialEnv(a=1, b=2) != {'b': 2, 'c': 3} - assert PartialEnv(a=1, b=2) != {'a': 1, 'c': 3} + assert PartialEnv(a=1, b=2) == {"a": 1, "b": 2, "c": 3} + assert PartialEnv(a=1) == {"a": 1, "b": 2, "c": 3} + assert PartialEnv(a=1, b=2) == {"a": 1, "b": 2} + assert PartialEnv(a=1, b=2) != {"b": 2, "c": 3} + assert PartialEnv(a=1, b=2) != {"a": 1, "c": 3} def test_assert_subprocess_calls(): - expected_calls = [ - "echo Hello", - ["echo", "World"] - ] + expected_calls = ["echo Hello", ["echo", "World"]] with assert_subprocess_calls(expected_calls): - subprocess.run(['echo', 'Hello']) - subprocess.run(['echo', 'World']) + subprocess.run(["echo", "Hello"]) + subprocess.run(["echo", "World"]) - expected_env = PartialEnv( - CUSTOM_ENV_A='a', - CUSTOM_ENV_C='c' - ) + expected_env = PartialEnv(CUSTOM_ENV_A="a", CUSTOM_ENV_C="c") with assert_subprocess_calls(expected_calls, env=expected_env): - env = { - 'CUSTOM_ENV_A': 'a', - 'CUSTOM_ENV_B': 'b', - 'CUSTOM_ENV_C': 'c' - } - subprocess.run(['echo', 'Hello'], env=env) - subprocess.run(['echo', 'World'], env=env) + env = {"CUSTOM_ENV_A": "a", "CUSTOM_ENV_B": "b", "CUSTOM_ENV_C": "c"} + subprocess.run(["echo", "Hello"], env=env) + subprocess.run(["echo", "World"], env=env) with pytest.raises(AssertionError): with assert_subprocess_calls(expected_calls, env=expected_env): - env = { - 'CUSTOM_ENV_B': 'b', - 'CUSTOM_ENV_C': 'c' - } - subprocess.run(['echo', 'Hello'], env=env) - subprocess.run(['echo', 'World'], env=env) + env = {"CUSTOM_ENV_B": "b", "CUSTOM_ENV_C": "c"} + subprocess.run(["echo", "Hello"], env=env) + subprocess.run(["echo", "World"], env=env) diff --git a/dev/archery/archery/utils/cache.py b/dev/archery/archery/utils/cache.py index d92c5f32e270b..b991e9bfa384e 100644 --- a/dev/archery/archery/utils/cache.py +++ b/dev/archery/archery/utils/cache.py @@ -14,9 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations -from pathlib import Path import os +from pathlib import Path from urllib.request import urlopen from .logger import logger @@ -25,7 +26,7 @@ class Cache: - """ Cache stores downloaded objects, notably apache-rat.jar. 
""" + """Cache stores downloaded objects, notably apache-rat.jar.""" def __init__(self, path=ARCHERY_CACHE_DIR): self.root = path @@ -34,23 +35,22 @@ def __init__(self, path=ARCHERY_CACHE_DIR): os.makedirs(path) def key_path(self, key): - """ Return the full path of a key. """ - return self.root/key + """Return the full path of a key.""" + return self.root / key def get(self, key): - """ Return the full path of a key if cached, None otherwise. """ + """Return the full path of a key if cached, None otherwise.""" path = self.key_path(key) return path if path.exists() else None def delete(self, key): - """ Remove a key (and the file) from the cache. """ + """Remove a key (and the file) from the cache.""" path = self.get(key) if path: path.unlink() def get_or_insert(self, key, create): - """ - Get or Insert a key from the cache. If the key is not found, the + """Get or Insert a key from the cache. If the key is not found, the `create` closure will be evaluated. The `create` closure takes a single parameter, the path where the @@ -64,13 +64,13 @@ def get_or_insert(self, key, create): return path def get_or_insert_from_url(self, key, url): - """ - Get or Insert a key from the cache. If the key is not found, the file + """Get or Insert a key from the cache. If the key is not found, the file is downloaded from `url`. """ + def download(path): - """ Tiny wrapper that download a file and save as key. """ - logger.debug("Downloading {} as {}".format(url, path)) + """Tiny wrapper that download a file and save as key.""" + logger.debug(f"Downloading {url} as {path}") conn = urlopen(url) # Ensure the download is completed before writing to disks. content = conn.read() diff --git a/dev/archery/archery/utils/cli.py b/dev/archery/archery/utils/cli.py index 701abe925fe56..78bdfc19bf40a 100644 --- a/dev/archery/archery/utils/cli.py +++ b/dev/archery/archery/utils/cli.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import importlib @@ -23,11 +24,11 @@ class ArrowBool(click.types.BoolParamType): - """ - ArrowBool supports the 'ON' and 'OFF' values on top of the values + """ArrowBool supports the 'ON' and 'OFF' values on top of the values supported by BoolParamType. This is convenient to port script which exports CMake options variables. """ + name = "boolean" def convert(self, value, param, ctx): @@ -42,9 +43,7 @@ def convert(self, value, param, ctx): def validate_arrow_sources(ctx, param, src): - """ - Ensure a directory contains Arrow cpp sources. - """ + """Ensure a directory contains Arrow cpp sources.""" try: return ArrowSources.find(src) except InvalidArrowSource as e: @@ -60,10 +59,7 @@ def add_optional_command(name, module, function, parent): @parent.command( name, - context_settings={ - "allow_extra_args": True, - "ignore_unknown_options": True, - } + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, ) def command(): raise click.ClickException( diff --git a/dev/archery/archery/utils/cmake.py b/dev/archery/archery/utils/cmake.py index f93895b1a09ce..370fda7150800 100644 --- a/dev/archery/archery/utils/cmake.py +++ b/dev/archery/archery/utils/cmake.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+from __future__ import annotations import os import re @@ -28,7 +29,7 @@ def __init__(self, cmake_bin=None): @staticmethod def default_generator(): - """ Infer default generator. + """Infer default generator. Gives precedence to ninja if there exists an executable named `ninja` in the search path. @@ -41,7 +42,7 @@ def default_generator(): class CMakeDefinition: - """ CMakeDefinition captures the cmake invocation arguments. + """CMakeDefinition captures the cmake invocation arguments. It allows creating build directories with the same definition, e.g. ``` @@ -52,11 +53,13 @@ class CMakeDefinition: build1.all() build2.all() + ``` """ - def __init__(self, source, build_type="release", generator=None, - definitions=None, env=None): - """ Initialize a CMakeDefinition + def __init__( + self, source, build_type="release", generator=None, definitions=None, env=None + ): + """Initialize a CMakeDefinition Parameters ---------- @@ -68,6 +71,7 @@ def __init__(self, source, build_type="release", generator=None, env : dict(str,str), optional Environment to use when invoking cmake. This can be required to work around cmake deficiencies, e.g. CC and CXX. + """ self.source = os.path.abspath(source) self.build_type = build_type @@ -77,16 +81,12 @@ def __init__(self, source, build_type="release", generator=None, @property def arguments(self): - """" Return the arguments to cmake invocation. """ - arguments = [ - "-G{}".format(self.generator), - ] + self.definitions + [ - self.source - ] + """ " Return the arguments to cmake invocation.""" + arguments = [f"-G{self.generator}"] + self.definitions + [self.source] return arguments def build(self, build_dir, force=False, cmd_kwargs=None, **kwargs): - """ Invoke cmake into a build directory. + """Invoke cmake into a build directory. Parameters ---------- @@ -95,42 +95,38 @@ def build(self, build_dir, force=False, cmd_kwargs=None, **kwargs): force : bool If the build folder exists, delete it before. Otherwise if it's present, an error will be returned. + """ if os.path.exists(build_dir): # Extra safety to ensure we're deleting a build folder. if not CMakeBuild.is_build_dir(build_dir): - raise FileExistsError( - "{} is not a cmake build".format(build_dir) - ) + raise FileExistsError(f"{build_dir} is not a cmake build") if not force: - raise FileExistsError( - "{} exists use force=True".format(build_dir) - ) + raise FileExistsError(f"{build_dir} exists use force=True") rmtree(build_dir) os.mkdir(build_dir) cmd_kwargs = cmd_kwargs if cmd_kwargs else {} cmake(*self.arguments, cwd=build_dir, env=self.env, **cmd_kwargs) - return CMakeBuild(build_dir, self.build_type, definition=self, - **kwargs) + return CMakeBuild(build_dir, self.build_type, definition=self, **kwargs) def __repr__(self): - return "CMakeDefinition[source={}]".format(self.source) + return f"CMakeDefinition[source={self.source}]" CMAKE_BUILD_TYPE_RE = re.compile("CMAKE_BUILD_TYPE:STRING=([a-zA-Z]+)") class CMakeBuild(CMake): - """ CMakeBuild represents a build directory initialized by cmake. + """CMakeBuild represents a build directory initialized by cmake. The build instance can be used to build/test/install. It alleviates the user to know which generator is used. """ def __init__(self, build_dir, build_type, definition=None): - """ Initialize a CMakeBuild. + """Initialize a CMakeBuild. The caller must ensure that cmake was invoked in the build directory. @@ -140,6 +136,7 @@ def __init__(self, build_dir, build_type, definition=None): The definition to build from. 
build_dir : str The build directory to setup into. + """ assert CMakeBuild.is_build_dir(build_dir) super().__init__() @@ -157,8 +154,7 @@ def run(self, *argv, verbose=False, **kwargs): if verbose: extra.append("-v" if self.bin.endswith("ninja") else "VERBOSE=1") # Commands must be ran under the build directory - return super().run(*cmake_args, *extra, - *argv, **kwargs, cwd=self.build_dir) + return super().run(*cmake_args, *extra, *argv, **kwargs, cwd=self.build_dir) def all(self): return self.run("all") @@ -174,7 +170,7 @@ def test(self): @staticmethod def is_build_dir(path): - """ Indicate if a path is CMake build directory. + """Indicate if a path is CMake build directory. This method only checks for the existence of paths and does not do any validation whatsoever. @@ -185,7 +181,7 @@ def is_build_dir(path): @staticmethod def from_path(path): - """ Instantiate a CMakeBuild from a path. + """Instantiate a CMakeBuild from a path. This is used to recover from an existing physical directory (created with or without CMakeBuild). @@ -194,22 +190,22 @@ def from_path(path): be lost. Only build_type is recovered. """ if not CMakeBuild.is_build_dir(path): - raise ValueError("Not a valid CMakeBuild path: {}".format(path)) + raise ValueError(f"Not a valid CMakeBuild path: {path}") build_type = None # Infer build_type by looking at CMakeCache.txt and looking for a magic # definition cmake_cache_path = os.path.join(path, "CMakeCache.txt") - with open(cmake_cache_path, "r") as cmake_cache: + with open(cmake_cache_path) as cmake_cache: candidates = CMAKE_BUILD_TYPE_RE.findall(cmake_cache.read()) build_type = candidates[0].lower() if candidates else "release" return CMakeBuild(path, build_type) def __repr__(self): - return ("CMakeBuild[" - "build = {}," - "build_type = {}," - "definition = {}]".format(self.build_dir, - self.build_type, - self.definition)) + return ( + "CMakeBuild[" + f"build = {self.build_dir}," + f"build_type = {self.build_type}," + f"definition = {self.definition}]" + ) diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py index c3161164d312e..b94222c1e3b29 100644 --- a/dev/archery/archery/utils/command.py +++ b/dev/archery/archery/utils/command.py @@ -14,18 +14,19 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import os import shlex import shutil import subprocess -from .logger import logger, ctx +from .logger import ctx, logger def default_bin(name, default): assert default - env_name = "ARCHERY_{0}_BIN".format(default.upper()) + env_name = f"ARCHERY_{default.upper()}_BIN" return name if name else os.environ.get(env_name, default) @@ -40,18 +41,18 @@ def strip_it(x): return x.strip() if self.strip else x def list_it(x): - return x.decode('utf-8').splitlines() if self.listify else x + return x.decode("utf-8").splitlines() if self.listify else x def wrapper(*argv, **kwargs): # Ensure stdout is captured kwargs["stdout"] = subprocess.PIPE return list_it(strip_it(f(*argv, **kwargs).stdout)) + return wrapper class Command: - """ - A runnable command. + """A runnable command. Class inheriting from the Command class must provide the bin property/attribute. 
@@ -74,14 +75,12 @@ def run(self, *argv, **kwargs): if "check" not in kwargs: kwargs["check"] = True - logger.debug("Executing `{}`".format(invocation)) + logger.debug(f"Executing `{invocation}`") return subprocess.run(invocation, **kwargs) @property def available(self): - """ - Indicate if the command binary is found in PATH. - """ + """Indicate if the command binary is found in PATH.""" binary = shlex.split(self.bin)[0] return shutil.which(binary) is not None @@ -92,7 +91,7 @@ def __call__(self, *argv, **kwargs): class CommandStackMixin: def run(self, *argv, **kwargs): stacked_args = self.argv + argv - return super(CommandStackMixin, self).run(*stacked_args, **kwargs) + return super().run(*stacked_args, **kwargs) class Bash(Command): diff --git a/dev/archery/archery/utils/git.py b/dev/archery/archery/utils/git.py index 798bc5d7096fb..31466709c866c 100644 --- a/dev/archery/archery/utils/git.py +++ b/dev/archery/archery/utils/git.py @@ -14,9 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations -from .command import Command, capture_stdout, default_bin from ..compat import _stringify_path +from .command import Command, capture_stdout, default_bin # Decorator prepending argv with the git sub-command found with the method @@ -27,6 +28,7 @@ def git_cmd(fn): def wrapper(self, *argv, **kwargs): return fn(self, sub_cmd, *argv, **kwargs) + return wrapper @@ -35,7 +37,7 @@ def __init__(self, git_bin=None): self.bin = default_bin(git_bin, "git") def run_cmd(self, cmd, *argv, git_dir=None, **kwargs): - """ Inject flags before sub-command in argv. """ + """Inject flags before sub-command in argv.""" opts = [] if git_dir is not None: opts.extend(["-C", _stringify_path(git_dir)]) @@ -84,7 +86,7 @@ def status(self, *argv, **kwargs): @capture_stdout(strip=True) def head(self, **kwargs): - """ Return commit pointed by HEAD. """ + """Return commit pointed by HEAD.""" return self.rev_parse("HEAD", **kwargs) @capture_stdout(strip=True) @@ -92,9 +94,9 @@ def current_branch(self, **kwargs): return self.rev_parse("--abbrev-ref", "HEAD", **kwargs) def repository_root(self, git_dir=None, **kwargs): - """ Locates the repository's root path from a subdirectory. """ + """Locates the repository's root path from a subdirectory.""" stdout = self.rev_parse("--show-toplevel", git_dir=git_dir, **kwargs) - return stdout.decode('utf-8') + return stdout.decode("utf-8") git = Git() diff --git a/dev/archery/archery/utils/lint.py b/dev/archery/archery/utils/lint.py index c9d05fffd9168..8ca7b8eeaa1c4 100644 --- a/dev/archery/archery/utils/lint.py +++ b/dev/archery/archery/utils/lint.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+from __future__ import annotations import fnmatch import gzip @@ -22,17 +23,16 @@ import click -from .command import Bash, Command, default_bin from ..compat import _get_module +from ..lang.cpp import CppCMakeDefinition, CppConfiguration +from ..lang.python import Autopep8, CythonLint, Flake8, NumpyDoc, PythonCommand from .cmake import CMake +from .command import Bash, Command, default_bin from .git import git from .logger import logger -from ..lang.cpp import CppCMakeDefinition, CppConfiguration -from ..lang.python import Autopep8, Flake8, CythonLint, NumpyDoc, PythonCommand from .rat import Rat, exclusion_from_globs from .tmpdir import tmpdir - _archery_install_msg = ( "Please install archery using: `pip install -e dev/archery[lint]`. " ) @@ -55,10 +55,17 @@ def from_cmd(command_result): return LintResult(command_result.returncode == 0) -def cpp_linter(src, build_dir, clang_format=True, cpplint=True, - clang_tidy=False, iwyu=False, iwyu_all=False, - fix=False): - """ Run clang-format, cpplint and clang-tidy on cpp/ codebase. """ +def cpp_linter( + src, + build_dir, + clang_format=True, + cpplint=True, + clang_tidy=False, + iwyu=False, + iwyu_all=False, + fix=False, +): + """Run clang-format, cpplint and clang-tidy on cpp/ codebase.""" logger.info("Running C++ linters") cmake = CMake() @@ -97,7 +104,6 @@ def cpp_linter(src, build_dir, clang_format=True, cpplint=True, class CMakeFormat(Command): - def __init__(self, paths, cmake_format_bin=None): self.check_version() self.bin = default_bin(cmake_format_bin, "cmake-format") @@ -121,14 +127,11 @@ def check_version(): # cmake_format is part of the cmakelang package import cmakelang except ImportError: - raise ImportError( - - ) + raise ImportError() # pin a specific version of cmake_format, must be updated in setup.py if cmakelang.__version__ != "0.6.13": raise LintValidationException( - f"Wrong version of cmake_format is detected. " - f"{_archery_install_msg}" + f"Wrong version of cmake_format is detected. {_archery_install_msg}" ) def check(self): @@ -139,32 +142,30 @@ def fix(self): def cmake_linter(src, fix=False): - """ - Run cmake-format on all CMakeFiles.txt - """ + """Run cmake-format on all CMakeFiles.txt""" logger.info("Running cmake-format linters") cmake_format = CMakeFormat.from_patterns( src.path, include_patterns=[ - 'ci/**/*.cmake', - 'cpp/CMakeLists.txt', - 'cpp/src/**/*.cmake', - 'cpp/src/**/*.cmake.in', - 'cpp/src/**/CMakeLists.txt', - 'cpp/examples/**/CMakeLists.txt', - 'cpp/cmake_modules/*.cmake', - 'go/**/CMakeLists.txt', - 'java/**/CMakeLists.txt', - 'matlab/**/CMakeLists.txt', - 'python/**/CMakeLists.txt', + "ci/**/*.cmake", + "cpp/CMakeLists.txt", + "cpp/src/**/*.cmake", + "cpp/src/**/*.cmake.in", + "cpp/src/**/CMakeLists.txt", + "cpp/examples/**/CMakeLists.txt", + "cpp/cmake_modules/*.cmake", + "go/**/CMakeLists.txt", + "java/**/CMakeLists.txt", + "matlab/**/CMakeLists.txt", + "python/**/CMakeLists.txt", ], exclude_patterns=[ - 'cpp/cmake_modules/FindNumPy.cmake', - 'cpp/cmake_modules/FindPythonLibsNew.cmake', - 'cpp/cmake_modules/UseCython.cmake', - 'cpp/src/arrow/util/*.h.cmake', - ] + "cpp/cmake_modules/FindNumPy.cmake", + "cpp/cmake_modules/FindPythonLibsNew.cmake", + "cpp/cmake_modules/UseCython.cmake", + "cpp/src/arrow/util/*.h.cmake", + ], ) method = cmake_format.fix if fix else cmake_format.check @@ -173,7 +174,8 @@ def cmake_linter(src, fix=False): def python_linter(src, fix=False): """Run Python linters on python/pyarrow, python/examples, setup.py - and dev/. """ + and dev/. 
+ """ setup_py = os.path.join(src.python, "setup.py") setup_cfg = os.path.join(src.python, "setup.cfg") @@ -183,37 +185,40 @@ def python_linter(src, fix=False): if not autopep8.available: logger.error( "Python formatter requested but autopep8 binary not found. " - f"{_archery_install_msg}") + f"{_archery_install_msg}" + ) return # Gather files for autopep8 - patterns = ["python/benchmarks/**/*.py", - "python/examples/**/*.py", - "python/pyarrow/**/*.py", - "python/pyarrow/**/*.pyx", - "python/pyarrow/**/*.pxd", - "python/pyarrow/**/*.pxi", - "dev/*.py", - "dev/archery/**/*.py", - "dev/release/**/*.py"] + patterns = [ + "python/benchmarks/**/*.py", + "python/examples/**/*.py", + "python/pyarrow/**/*.py", + "python/pyarrow/**/*.pyx", + "python/pyarrow/**/*.pxd", + "python/pyarrow/**/*.pxi", + "dev/*.py", + "dev/archery/**/*.py", + "dev/release/**/*.py", + ] files = [setup_py] for pattern in patterns: files += list(map(str, Path(src.path).glob(pattern))) - args = ['--global-config', setup_cfg, '--ignore-local-config'] + args = ["--global-config", setup_cfg, "--ignore-local-config"] if fix: - args += ['-j0', '--in-place'] + args += ["-j0", "--in-place"] args += sorted(files) yield LintResult.from_cmd(autopep8(*args)) else: # XXX `-j0` doesn't work well with `--exit-code`, so instead # we capture the diff and check whether it's empty # (https://github.com/hhatto/autopep8/issues/543) - args += ['-j0', '--diff'] + args += ["-j0", "--diff"] args += sorted(files) diff = autopep8.run_captured(*args) if diff: - print(diff.decode('utf8')) + print(diff.decode("utf8")) yield LintResult(success=False) else: yield LintResult(success=True) @@ -225,16 +230,24 @@ def python_linter(src, fix=False): if not flake8.available: logger.error( "Python linter requested but flake8 binary not found. " - f"{_archery_install_msg}") + f"{_archery_install_msg}" + ) return - flake8_exclude = ['.venv*', 'vendored'] + flake8_exclude = [".venv*", "vendored"] yield LintResult.from_cmd( - flake8("--extend-exclude=" + ','.join(flake8_exclude), - "--config=" + os.path.join(src.python, "setup.cfg"), - setup_py, src.pyarrow, os.path.join(src.python, "benchmarks"), - os.path.join(src.python, "examples"), src.dev, check=False)) + flake8( + "--extend-exclude=" + ",".join(flake8_exclude), + "--config=" + os.path.join(src.python, "setup.cfg"), + setup_py, + src.pyarrow, + os.path.join(src.python, "benchmarks"), + os.path.join(src.python, "examples"), + src.dev, + check=False, + ) + ) logger.info("Running Cython linter (cython-lint)") @@ -242,21 +255,23 @@ def python_linter(src, fix=False): if not cython_lint.available: logger.error( "Cython linter requested but cython-lint binary not found. 
" - f"{_archery_install_msg}") + f"{_archery_install_msg}" + ) return # Gather files for cython-lint - patterns = ["python/pyarrow/**/*.pyx", - "python/pyarrow/**/*.pxd", - "python/pyarrow/**/*.pxi", - "python/examples/**/*.pyx", - "python/examples/**/*.pxd", - "python/examples/**/*.pxi", - ] + patterns = [ + "python/pyarrow/**/*.pyx", + "python/pyarrow/**/*.pxd", + "python/pyarrow/**/*.pxi", + "python/examples/**/*.pyx", + "python/examples/**/*.pxd", + "python/examples/**/*.pxi", + ] files = [] for pattern in patterns: files += list(map(str, Path(src.path).glob(pattern))) - args = ['--no-pycodestyle'] + args = ["--no-pycodestyle"] args += sorted(files) yield LintResult.from_cmd(cython_lint(*args)) @@ -272,14 +287,19 @@ def python_cpp_linter(src, clang_format=True, fix=False): if "CLANG_TOOLS_PATH" in os.environ: clang_format_binary = os.path.join( - os.environ["CLANG_TOOLS_PATH"], "clang-format") + os.environ["CLANG_TOOLS_PATH"], "clang-format" + ) else: clang_format_binary = "clang-format-14" - run_clang_format = os.path.join(src.cpp, "build-support", - "run_clang_format.py") - args = [run_clang_format, "--source_dir", cpp_src, - "--clang_format_binary", clang_format_binary] + run_clang_format = os.path.join(src.cpp, "build-support", "run_clang_format.py") + args = [ + run_clang_format, + "--source_dir", + cpp_src, + "--clang_format_binary", + clang_format_binary, + ] if fix: args += ["--fix"] @@ -294,19 +314,19 @@ def python_numpydoc(symbols=None, allow_rules=None, disallow_rules=None): logger.info("Running Python docstring linters") # by default try to run on all pyarrow package symbols = symbols or { - 'pyarrow', - 'pyarrow.compute', - 'pyarrow.csv', - 'pyarrow.dataset', - 'pyarrow.feather', + "pyarrow", + "pyarrow.compute", + "pyarrow.csv", + "pyarrow.dataset", + "pyarrow.feather", # 'pyarrow.flight', - 'pyarrow.fs', - 'pyarrow.gandiva', - 'pyarrow.ipc', - 'pyarrow.json', - 'pyarrow.orc', - 'pyarrow.parquet', - 'pyarrow.types', + "pyarrow.fs", + "pyarrow.gandiva", + "pyarrow.ipc", + "pyarrow.json", + "pyarrow.orc", + "pyarrow.parquet", + "pyarrow.types", } try: numpydoc = NumpyDoc(symbols) @@ -317,9 +337,9 @@ def python_numpydoc(symbols=None, allow_rules=None, disallow_rules=None): results = numpydoc.validate( # limit the validation scope to the pyarrow package - from_package='pyarrow', + from_package="pyarrow", allow_rules=allow_rules, - disallow_rules=disallow_rules + disallow_rules=disallow_rules, ) if len(results) == 0: @@ -328,47 +348,40 @@ def python_numpydoc(symbols=None, allow_rules=None, disallow_rules=None): number_of_violations = 0 for obj, result in results: - errors = result['errors'] + errors = result["errors"] # inspect doesn't play nice with cython generated source code, # to use a hacky way to represent a proper __qualname__ - doc = getattr(obj, '__doc__', '') - name = getattr(obj, '__name__', '') - qualname = getattr(obj, '__qualname__', '') - module = _get_module(obj, default='') - instance = getattr(obj, '__self__', '') + doc = getattr(obj, "__doc__", "") + name = getattr(obj, "__name__", "") + qualname = getattr(obj, "__qualname__", "") + module = _get_module(obj, default="") + instance = getattr(obj, "__self__", "") if instance: klass = instance.__class__.__name__ else: - klass = '' + klass = "" try: cython_signature = doc.splitlines()[0] except Exception: - cython_signature = '' + cython_signature = "" - desc = '.'.join(filter(None, [module, klass, qualname or name])) + desc = ".".join(filter(None, [module, klass, qualname or name])) click.echo() - 
click.echo(click.style(desc, bold=True, fg='yellow')) + click.echo(click.style(desc, bold=True, fg="yellow")) if cython_signature: - qualname_with_signature = '.'.join([module, cython_signature]) - click.echo( - click.style( - '-> {}'.format(qualname_with_signature), - fg='yellow' - ) - ) + qualname_with_signature = ".".join([module, cython_signature]) + click.echo(click.style(f"-> {qualname_with_signature}", fg="yellow")) for error in errors: number_of_violations += 1 - click.echo('{}: {}'.format(*error)) + click.echo("{}: {}".format(*error)) - msg = 'Total number of docstring violations: {}'.format( - number_of_violations - ) + msg = f"Total number of docstring violations: {number_of_violations}" click.echo() - click.echo(click.style(msg, fg='red')) + click.echo(click.style(msg, fg="red")) yield LintResult(success=False) @@ -378,11 +391,14 @@ def rat_linter(src, root): logger.info("Running apache-rat linter") if src.git_dirty: - logger.warn("Due to the usage of git-archive, uncommitted files will" - " not be checked for rat violations. ") + logger.warn( + "Due to the usage of git-archive, uncommitted files will" + " not be checked for rat violations. " + ) exclusion = exclusion_from_globs( - os.path.join(src.dev, "release", "rat_exclude_files.txt")) + os.path.join(src.dev, "release", "rat_exclude_files.txt") + ) # Creates a git-archive of ArrowSources, apache-rat expects a gzip # compressed tar archive. @@ -392,7 +408,7 @@ def rat_linter(src, root): violations = list(report.validate(exclusion=exclusion)) for violation in violations: - print("apache-rat license violation: {}".format(violation)) + print(f"apache-rat license violation: {violation}") yield LintResult(len(violations) == 0) @@ -413,8 +429,7 @@ def is_docker_image(path): dirname = os.path.dirname(path) filename = os.path.basename(path) - excluded = dirname.startswith( - "dev") or dirname.startswith("python/manylinux") + excluded = dirname.startswith(("dev", "python/manylinux")) return filename.startswith("Dockerfile") and not excluded @@ -426,14 +441,12 @@ def docker_linter(src): hadolint = Hadolint() if not hadolint.available: - logger.error( - "hadolint linter requested but hadolint binary not found.") + logger.error("hadolint linter requested but hadolint binary not found.") return for path in git.ls_files(git_dir=src.path): if is_docker_image(path): - yield LintResult.from_cmd(hadolint.run(path, check=False, - cwd=src.path)) + yield LintResult.from_cmd(hadolint.run(path, check=False, cwd=src.path)) class SphinxLint(Command): @@ -471,7 +484,7 @@ def docs_linter(src, path=None): src, path=path, disable="all", - enable="trailing-whitespace,missing-final-newline" + enable="trailing-whitespace,missing-final-newline", ) if not sphinx_lint.available: @@ -481,10 +494,24 @@ def docs_linter(src, path=None): yield LintResult.from_cmd(sphinx_lint.lint()) -def linter(src, fix=False, path=None, *, clang_format=False, cpplint=False, - clang_tidy=False, iwyu=False, iwyu_all=False, - python=False, numpydoc=False, cmake_format=False, rat=False, - r=False, docker=False, docs=False): +def linter( + src, + fix=False, + path=None, + *, + clang_format=False, + cpplint=False, + clang_tidy=False, + iwyu=False, + iwyu_all=False, + python=False, + numpydoc=False, + cmake_format=False, + rat=False, + r=False, + docker=False, + docs=False, +): """Run all linters.""" with tmpdir(prefix="arrow-lint-") as root: build_dir = os.path.join(root, "cpp-build") @@ -495,21 +522,24 @@ def linter(src, fix=False, path=None, *, clang_format=False, cpplint=False, 
results = [] if clang_format or cpplint or clang_tidy or iwyu: - results.extend(cpp_linter(src, build_dir, - clang_format=clang_format, - cpplint=cpplint, - clang_tidy=clang_tidy, - iwyu=iwyu, - iwyu_all=iwyu_all, - fix=fix)) + results.extend( + cpp_linter( + src, + build_dir, + clang_format=clang_format, + cpplint=cpplint, + clang_tidy=clang_tidy, + iwyu=iwyu, + iwyu_all=iwyu_all, + fix=fix, + ) + ) if python: results.extend(python_linter(src, fix=fix)) if python and clang_format: - results.extend(python_cpp_linter(src, - clang_format=clang_format, - fix=fix)) + results.extend(python_cpp_linter(src, clang_format=clang_format, fix=fix)) if numpydoc: results.extend(python_numpydoc()) diff --git a/dev/archery/archery/utils/logger.py b/dev/archery/archery/utils/logger.py index 4ab119ea7d951..3881527f0ccbb 100644 --- a/dev/archery/archery/utils/logger.py +++ b/dev/archery/archery/utils/logger.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import contextlib import logging @@ -51,14 +52,15 @@ def running_in_ci(): @contextlib.contextmanager def group(name, output=None): - """ - Group outputs in the given with block. + """Group outputs in the given with block. This does nothing in non GitHub Actions environment for now. """ if output is None: + def output(message): print(message, flush=True) + if in_github_actions(): output(f"::group::{name}") try: diff --git a/dev/archery/archery/utils/maven.py b/dev/archery/archery/utils/maven.py index 96a3bf5bd9970..bf1652d822d11 100644 --- a/dev/archery/archery/utils/maven.py +++ b/dev/archery/archery/utils/maven.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import os @@ -29,7 +30,7 @@ def __init__(self, maven_bin=None): class MavenDefinition: - """ MavenDefinition captures the maven invocation arguments. + """MavenDefinition captures the maven invocation arguments. It allows creating build directories with the same definition, e.g. ``` @@ -40,11 +41,13 @@ class MavenDefinition: build1.install() build2.install() + ``` """ - def __init__(self, source, build_definitions=None, - benchmark_definitions=None, env=None): - """ Initialize a MavenDefinition + def __init__( + self, source, build_definitions=None, benchmark_definitions=None, env=None + ): + """Initialize a MavenDefinition Parameters ---------- @@ -53,26 +56,32 @@ def __init__(self, source, build_definitions=None, located. This is usually the root of the project. build_definitions: list(str), optional benchmark_definitions: list(str), optional + """ self.source = os.path.abspath(source) self.build_definitions = build_definitions if build_definitions else [] - self.benchmark_definitions =\ + self.benchmark_definitions = ( benchmark_definitions if benchmark_definitions else [] + ) self.env = env @property def build_arguments(self): - """" Return the arguments to maven invocation for build. """ + """ " Return the arguments to maven invocation for build.""" arguments = self.build_definitions + [ - "-B", "-DskipTests", "-Drat.skip=true", + "-B", + "-DskipTests", + "-Drat.skip=true", "-Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer." 
"Slf4jMavenTransferListener=warn", - "-T", "2C", "install" + "-T", + "2C", + "install", ] return arguments def build(self, build_dir, force=False, cmd_kwargs=None, **kwargs): - """ Invoke maven into a build directory. + """Invoke maven into a build directory. Parameters ---------- @@ -80,13 +89,11 @@ def build(self, build_dir, force=False, cmd_kwargs=None, **kwargs): Directory in which the Maven build will be instantiated. force : bool not used now + """ - if os.path.exists(build_dir): + if os.path.exists(build_dir) and not MavenBuild.is_build_dir(build_dir): # Extra safety to ensure we're deleting a build folder. - if not MavenBuild.is_build_dir(build_dir): - raise FileExistsError( - "{} is not a maven build".format(build_dir) - ) + raise FileExistsError(f"{build_dir} is not a maven build") cmd_kwargs = cmd_kwargs if cmd_kwargs else {} assert MavenBuild.is_build_dir(build_dir) @@ -95,36 +102,35 @@ def build(self, build_dir, force=False, cmd_kwargs=None, **kwargs): @property def list_arguments(self): - """" Return the arguments to maven invocation for list """ - arguments = [ - "-Dskip.perf.benchmarks=false", "-Dbenchmark.list=-lp", "install" - ] + """ " Return the arguments to maven invocation for list""" + arguments = ["-Dskip.perf.benchmarks=false", "-Dbenchmark.list=-lp", "install"] return arguments @property def benchmark_arguments(self): - """" Return the arguments to maven invocation for benchmark """ + """ " Return the arguments to maven invocation for benchmark""" arguments = self.benchmark_definitions + [ - "-Dskip.perf.benchmarks=false", "-Dbenchmark.fork=1", - "-Dbenchmark.jvmargs=\"-Darrow.enable_null_check_for_get=false " - "-Darrow.enable_unsafe_memory_access=true\"", - "install" + "-Dskip.perf.benchmarks=false", + "-Dbenchmark.fork=1", + '-Dbenchmark.jvmargs="-Darrow.enable_null_check_for_get=false ' + '-Darrow.enable_unsafe_memory_access=true"', + "install", ] return arguments def __repr__(self): - return "MavenDefinition[source={}]".format(self.source) + return f"MavenDefinition[source={self.source}]" class MavenBuild(Maven): - """ MavenBuild represents a build directory initialized by maven. + """MavenBuild represents a build directory initialized by maven. The build instance can be used to build/test/install. It alleviates the user to know which generator is used. """ def __init__(self, build_dir, definition=None): - """ Initialize a MavenBuild. + """Initialize a MavenBuild. The caller must ensure that maven was invoked in the build directory. @@ -134,6 +140,7 @@ def __init__(self, build_dir, definition=None): The definition to build from. build_dir : str The build directory to setup into. 
+ """ assert MavenBuild.is_build_dir(build_dir) super().__init__() @@ -156,24 +163,42 @@ def run(self, *argv, verbose=False, cwd=None, **kwargs): def build(self, *argv, verbose=False, **kwargs): definition_args = self.definition.build_arguments cwd = self.binaries_dir - return self.run(*argv, *definition_args, verbose=verbose, cwd=cwd, - env=self.definition.env, **kwargs) + return self.run( + *argv, + *definition_args, + verbose=verbose, + cwd=cwd, + env=self.definition.env, + **kwargs, + ) def list(self, *argv, verbose=False, **kwargs): definition_args = self.definition.list_arguments cwd = self.binaries_dir + "/performance" - return self.run(*argv, *definition_args, verbose=verbose, cwd=cwd, - env=self.definition.env, **kwargs) + return self.run( + *argv, + *definition_args, + verbose=verbose, + cwd=cwd, + env=self.definition.env, + **kwargs, + ) def benchmark(self, *argv, verbose=False, **kwargs): definition_args = self.definition.benchmark_arguments cwd = self.binaries_dir + "/performance" - return self.run(*argv, *definition_args, verbose=verbose, cwd=cwd, - env=self.definition.env, **kwargs) + return self.run( + *argv, + *definition_args, + verbose=verbose, + cwd=cwd, + env=self.definition.env, + **kwargs, + ) @staticmethod def is_build_dir(path): - """ Indicate if a path is Maven top directory. + """Indicate if a path is Maven top directory. This method only checks for the existence of paths and does not do any validation whatsoever. @@ -184,7 +209,7 @@ def is_build_dir(path): @staticmethod def from_path(path): - """ Instantiate a Maven from a path. + """Instantiate a Maven from a path. This is used to recover from an existing physical directory (created with or without Maven). @@ -193,12 +218,9 @@ def from_path(path): be lost. """ if not MavenBuild.is_build_dir(path): - raise ValueError("Not a valid MavenBuild path: {}".format(path)) + raise ValueError(f"Not a valid MavenBuild path: {path}") return MavenBuild(path, definition=None) def __repr__(self): - return ("MavenBuild[" - "build = {}," - "definition = {}]".format(self.build_dir, - self.definition)) + return f"MavenBuild[build = {self.build_dir},definition = {self.definition}]" diff --git a/dev/archery/archery/utils/rat.py b/dev/archery/archery/utils/rat.py index e7fe19a7ea8c4..7178a44f15373 100644 --- a/dev/archery/archery/utils/rat.py +++ b/dev/archery/archery/utils/rat.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+from __future__ import annotations import fnmatch import re @@ -24,7 +25,7 @@ from .command import capture_stdout RAT_VERSION = 0.13 -RAT_JAR_FILENAME = "apache-rat-{}.jar".format(RAT_VERSION) +RAT_JAR_FILENAME = f"apache-rat-{RAT_VERSION}.jar" RAT_URL_ = "https://repo1.maven.org/maven2/org/apache/rat/apache-rat" RAT_URL = "/".join([RAT_URL_, str(RAT_VERSION), RAT_JAR_FILENAME]) @@ -43,9 +44,9 @@ def report(self, archive_path, **kwargs): def exclusion_from_globs(exclusions_path): - with open(exclusions_path, 'r') as exclusions_fd: + with open(exclusions_path) as exclusions_fd: exclusions = [e.strip() for e in exclusions_fd] - return lambda path: any([fnmatch.fnmatch(path, e) for e in exclusions]) + return lambda path: any(fnmatch.fnmatch(path, e) for e in exclusions) class RatReport: @@ -54,15 +55,15 @@ def __init__(self, xml): self.tree = ElementTree.fromstring(xml) def __repr__(self): - return "RatReport({})".format(self.xml) + return f"RatReport({self.xml})" def validate(self, exclusion=None): - for r in self.tree.findall('resource'): - approvals = r.findall('license-approval') - if not approvals or approvals[0].attrib['name'] == 'true': + for r in self.tree.findall("resource"): + approvals = r.findall("license-approval") + if not approvals or approvals[0].attrib["name"] == "true": continue - clean_name = re.sub('^[^/]+/', '', r.attrib['name']) + clean_name = re.sub("^[^/]+/", "", r.attrib["name"]) if exclusion and exclusion(clean_name): continue diff --git a/dev/archery/archery/utils/report.py b/dev/archery/archery/utils/report.py index 6c7587ddd8729..b53372a9efef7 100644 --- a/dev/archery/archery/utils/report.py +++ b/dev/archery/archery/utils/report.py @@ -14,25 +14,25 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations -from abc import ABCMeta, abstractmethod import datetime +from abc import ABCMeta, abstractmethod import jinja2 def markdown_escape(s): - for char in ('*', '#', '_', '~', '`', '>'): - s = s.replace(char, '\\' + char) + for char in ("*", "#", "_", "~", "`", ">"): + s = s.replace(char, "\\" + char) return s class Report(metaclass=ABCMeta): - def __init__(self, **kwargs): for field in self.fields: if field not in kwargs: - raise ValueError('Missing keyword argument {}'.format(field)) + raise ValueError(f"Missing keyword argument {field}") self._data = kwargs def __getattr__(self, key): @@ -49,13 +49,12 @@ def templates(self): class JinjaReport(Report): - def __init__(self, **kwargs): self.env = jinja2.Environment( - loader=jinja2.PackageLoader('archery', 'templates') + loader=jinja2.PackageLoader("archery", "templates") ) - self.env.filters['md'] = markdown_escape - self.env.globals['today'] = datetime.date.today + self.env.filters["md"] = markdown_escape + self.env.globals["today"] = datetime.date.today super().__init__(**kwargs) def render(self, template_name): diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py index 1915b8f2ef305..fa9df0d83c4ea 100644 --- a/dev/archery/archery/utils/source.py +++ b/dev/archery/archery/utils/source.py @@ -14,26 +14,21 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+from __future__ import annotations import os -from pathlib import Path import subprocess import tempfile +from pathlib import Path from .command import Command from .git import git - -ARROW_ROOT_DEFAULT = os.environ.get( - 'ARROW_ROOT', - Path(__file__).resolve().parents[4] -) +ARROW_ROOT_DEFAULT = os.environ.get("ARROW_ROOT", Path(__file__).resolve().parents[4]) def arrow_path(path): - """ - Return full path to a file given its path inside the Arrow repo. - """ + """Return full path to a file given its path inside the Arrow repo.""" return os.path.join(ARROW_ROOT_DEFAULT, path) @@ -42,16 +37,17 @@ class InvalidArrowSource(Exception): class ArrowSources: - """ ArrowSources is a companion class representing a directory containing + """ArrowSources is a companion class representing a directory containing Apache Arrow's sources. """ + # Note that WORKSPACE is a reserved git revision name by this module to # reference the current git workspace. In other words, this indicates to # ArrowSources.at_revision that no cloning/checkout is required. WORKSPACE = "WORKSPACE" def __init__(self, path): - """ Initialize an ArrowSources + """Initialize an ArrowSources The caller must ensure that path is valid arrow source directory (can be checked with ArrowSources.valid) @@ -59,68 +55,66 @@ def __init__(self, path): Parameters ---------- path : src + """ path = Path(path) # validate by checking a specific path in the arrow source tree - if not (path / 'cpp' / 'CMakeLists.txt').exists(): - raise InvalidArrowSource( - "No Arrow C++ sources found in {}.".format(path) - ) + if not (path / "cpp" / "CMakeLists.txt").exists(): + raise InvalidArrowSource(f"No Arrow C++ sources found in {path}.") self.path = path @property def archery(self): - """ Returns the archery directory of an Arrow sources. """ + """Returns the archery directory of an Arrow sources.""" return self.dev / "archery" @property def cpp(self): - """ Returns the cpp directory of an Arrow sources. """ + """Returns the cpp directory of an Arrow sources.""" return self.path / "cpp" @property def dev(self): - """ Returns the dev directory of an Arrow sources. """ + """Returns the dev directory of an Arrow sources.""" return self.path / "dev" @property def java(self): - """ Returns the java directory of an Arrow sources. """ + """Returns the java directory of an Arrow sources.""" return self.path / "java" @property def python(self): - """ Returns the python directory of an Arrow sources. """ + """Returns the python directory of an Arrow sources.""" return self.path / "python" @property def pyarrow(self): - """ Returns the python/pyarrow directory of an Arrow sources. """ + """Returns the python/pyarrow directory of an Arrow sources.""" return self.python / "pyarrow" @property def r(self): - """ Returns the r directory of an Arrow sources. """ + """Returns the r directory of an Arrow sources.""" return self.path / "r" @property def git_backed(self): - """ Indicate if the sources are backed by git. """ + """Indicate if the sources are backed by git.""" return (self.path / ".git").exists() @property def git_dirty(self): - """ Indicate if the sources is a dirty git directory. """ + """Indicate if the sources is a dirty git directory.""" return self.git_backed and git.dirty(git_dir=self.path) def archive(self, path, dereference=False, compressor=None, revision=None): - """ Saves a git archive at path. 
""" + """Saves a git archive at path.""" if not self.git_backed: - raise ValueError("{} is not backed by git".format(self)) + raise ValueError(f"{self} is not backed by git") rev = revision if revision else "HEAD" - archive = git.archive("--prefix=apache-arrow.tmp/", rev, - git_dir=self.path) + archive = git.archive("--prefix=apache-arrow.tmp/", rev, git_dir=self.path) with tempfile.TemporaryDirectory() as tmp: tmp = Path(tmp) tar_path = tmp / "apache-arrow.tar" @@ -128,8 +122,9 @@ def archive(self, path, dereference=False, compressor=None, revision=None): tar.write(archive) Command("tar").run("xf", tar_path, "-C", tmp) # Must use the same logic in dev/release/02-source.sh - Command("cp").run("-R", "-L", tmp / - "apache-arrow.tmp", tmp / "apache-arrow") + Command("cp").run( + "-R", "-L", tmp / "apache-arrow.tmp", tmp / "apache-arrow" + ) Command("tar").run("cf", tar_path, "-C", tmp, "apache-arrow") with open(tar_path, "rb") as tar: archive = tar.read() @@ -141,7 +136,7 @@ def archive(self, path, dereference=False, compressor=None, revision=None): archive_fd.write(archive) def at_revision(self, revision, clone_dir): - """ Return a copy of the current sources for a specified git revision. + """Return a copy of the current sources for a specified git revision. This method may return the current object if no checkout is required. The caller is responsible to remove the cloned repository directory. @@ -158,9 +153,10 @@ def at_revision(self, revision, clone_dir): Revision to checkout sources at. clone_dir : str Path to checkout the local clone. + """ if not self.git_backed: - raise ValueError("{} is not backed by git".format(self)) + raise ValueError(f"{self} is not backed by git") if revision == ArrowSources.WORKSPACE: return self, False @@ -183,7 +179,7 @@ def at_revision(self, revision, clone_dir): @staticmethod def find(path=None): - """ Infer Arrow sources directory from various method. + """Infer Arrow sources directory from various method. The following guesses are done in order until a valid match is found: @@ -199,7 +195,6 @@ def find(path=None): repository. If so, returns the relative path to the source directory. """ - # Explicit via environment env = os.environ.get("ARROW_SRC") @@ -226,10 +221,10 @@ def find(path=None): except InvalidArrowSource: pass - searched_paths = "\n".join([" - {}".format(p) for p in paths]) + searched_paths = "\n".join([f" - {p}" for p in paths]) raise InvalidArrowSource( "Unable to locate Arrow's source directory. " - "Searched paths are:\n{}".format(searched_paths) + f"Searched paths are:\n{searched_paths}" ) def __repr__(self): diff --git a/dev/archery/archery/utils/tmpdir.py b/dev/archery/archery/utils/tmpdir.py index 07d7355c87fb8..a7b3dffd67032 100644 --- a/dev/archery/archery/utils/tmpdir.py +++ b/dev/archery/archery/utils/tmpdir.py @@ -14,9 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations from contextlib import contextmanager -from tempfile import mkdtemp, TemporaryDirectory +from tempfile import TemporaryDirectory, mkdtemp @contextmanager diff --git a/dev/archery/conftest.py b/dev/archery/conftest.py index 06a643bea5645..4add9b6b4ca75 100644 --- a/dev/archery/conftest.py +++ b/dev/archery/conftest.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+from __future__ import annotations import pathlib @@ -25,7 +26,7 @@ def pytest_addoption(parser): "--enable-integration", action="store_true", default=False, - help="run slow tests" + help="run slow tests", ) @@ -35,7 +36,7 @@ def pytest_configure(config): ( "integration: mark test as integration tests involving more " "extensive setup (only used for crossbow at the moment)" - ) + ), ) @@ -53,18 +54,20 @@ def load_fixture(request): current_test_directory = pathlib.Path(request.node.fspath).parent def decoder(path): - with path.open('r') as fp: - if path.suffix == '.json': + with path.open("r") as fp: + if path.suffix == ".json": import json + return json.load(fp) - elif path.suffix == '.yaml': + elif path.suffix == ".yaml": import yaml - return yaml.load(fp) + + return yaml.safe_load(fp) else: return fp.read() def loader(name, decoder=decoder): - path = current_test_directory / 'fixtures' / name + path = current_test_directory / "fixtures" / name return decoder(path) return loader diff --git a/dev/archery/setup.py b/dev/archery/setup.py index 6587e61546b5a..88bcb19db4645 100755 --- a/dev/archery/setup.py +++ b/dev/archery/setup.py @@ -15,50 +15,63 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import functools import operator import sys -from setuptools import setup, find_packages -if sys.version_info < (3, 9): - sys.exit('Python < 3.9 is not supported') +from setuptools import find_packages, setup + +if sys.version_info < (3, 9): # noqa: UP036 + sys.exit("Python < 3.9 is not supported") # For pathlib.Path compatibility -jinja_req = 'jinja2>=2.11' +jinja_req = "jinja2>=2.11" extras = { - 'benchmark': ['pandas'], - 'crossbow': ['github3.py', jinja_req, 'pygit2>=1.14.0', 'requests', - 'ruamel.yaml', 'setuptools_scm>=8.0.0'], - 'crossbow-upload': ['github3.py', jinja_req, 'ruamel.yaml', - 'setuptools_scm'], - 'docker': ['ruamel.yaml', 'python-dotenv'], - 'integration': ['cffi', 'numpy'], - 'integration-java': ['jpype1'], - 'lint': ['numpydoc==1.1.0', 'autopep8', 'flake8==6.1.0', 'cython-lint', - 'cmake_format==0.6.13', 'sphinx-lint==0.9.1'], - 'numpydoc': ['numpydoc==1.1.0'], - 'release': ['pygithub', jinja_req, 'semver', 'gitpython'], + "benchmark": ["pandas"], + "crossbow": [ + "github3.py", + jinja_req, + "pygit2>=1.14.0", + "requests", + "ruamel.yaml", + "setuptools_scm>=8.0.0", + ], + "crossbow-upload": ["github3.py", jinja_req, "ruamel.yaml", "setuptools_scm"], + "docker": ["ruamel.yaml", "python-dotenv"], + "integration": ["cffi", "numpy"], + "integration-java": ["jpype1"], + "lint": [ + "numpydoc==1.1.0", + "autopep8", + "flake8==6.1.0", + "cython-lint", + "cmake_format==0.6.13", + "sphinx-lint==0.9.1", + ], + "numpydoc": ["numpydoc==1.1.0"], + "release": ["pygithub", jinja_req, "semver", "gitpython"], } -extras['bot'] = extras['crossbow'] + ['pygithub'] -extras['all'] = list(set(functools.reduce(operator.add, extras.values()))) +extras["bot"] = extras["crossbow"] + ["pygithub"] +extras["all"] = list(set(functools.reduce(operator.add, extras.values()))) setup( - name='archery', + name="archery", version="0.1.0", - description='Apache Arrow Developers Tools', - url='http://github.com/apache/arrow', - maintainer='Arrow Developers', - maintainer_email='dev@arrow.apache.org', + description="Apache Arrow Developers Tools", + url="http://github.com/apache/arrow", + maintainer="Arrow Developers", + maintainer_email="dev@arrow.apache.org", 
packages=find_packages(), include_package_data=True, - python_requires='>=3.9', - install_requires=['click>=7'], - tests_require=['pytest', 'responses'], + python_requires=">=3.9", + install_requires=["click>=7"], + tests_require=["pytest", "responses"], extras_require=extras, - entry_points=''' + entry_points=""" [console_scripts] archery=archery.cli:archery - ''' + """, ) diff --git a/dev/merge_arrow_pr.py b/dev/merge_arrow_pr.py index fe1dc1e79290e..c796d51a850fb 100755 --- a/dev/merge_arrow_pr.py +++ b/dev/merge_arrow_pr.py @@ -33,23 +33,25 @@ # - ARROW_GITHUB_API_TOKEN: a GitHub API token to use for API requests # - ARROW_GITHUB_ORG: the GitHub organisation ('apache' by default) # - DEBUG: use for testing to avoid pushing to apache (0 by default) +from __future__ import annotations import configparser +import getpass import os import pprint import re import subprocess import sys + import requests -import getpass # Remote name which points to the GitHub site ORG_NAME = ( - os.environ.get("ARROW_GITHUB_ORG") or - os.environ.get("PR_REMOTE_NAME") or # backward compatibility - "apache" + os.environ.get("ARROW_GITHUB_ORG") + or os.environ.get("PR_REMOTE_NAME") # backward compatibility + or "apache" ) -PROJECT_NAME = os.environ.get('ARROW_PROJECT_NAME') or "arrow" +PROJECT_NAME = os.environ.get("ARROW_PROJECT_NAME") or "arrow" # For testing to avoid accidentally pushing to apache DEBUG = bool(int(os.environ.get("DEBUG", 0))) @@ -67,7 +69,7 @@ def get_json(url, headers=None): # https://docs.github.com/en/rest/guides/using-pagination-in-the-rest-api#using-link-headers next_responses = None if "link" in response.headers: - links = response.headers['link'].split(', ') + links = response.headers["link"].split(", ") for link in links: if 'rel="next"' in link: # Format: '<url>; rel="next"' @@ -78,37 +80,37 @@ def get_json(url, headers=None): if isinstance(responses, list): responses.extend(next_responses) else: - raise ValueError('GitHub response was paginated and is not a list') + raise ValueError("GitHub response was paginated and is not a list") return responses def run_cmd(cmd): if isinstance(cmd, str): - cmd = cmd.split(' ') + cmd = cmd.split(" ") try: output = subprocess.check_output(cmd) except subprocess.CalledProcessError as e: # this avoids hiding the stdout / stderr of failed processes - print('Command failed: %s' % cmd) - print('With output:') - print('--------------') + print(f"Command failed: {cmd}") + print("With output:") + print("--------------") print(e.output) - print('--------------') + print("--------------") raise e if isinstance(output, bytes): - output = output.decode('utf-8') + output = output.decode("utf-8") return output -_REGEX_CI_DIRECTIVE = re.compile(r'\[[^\]]*\]') +_REGEX_CI_DIRECTIVE = re.compile(r"\[[^\]]*\]") def strip_ci_directives(commit_message): # Remove things like '[force ci]', '[skip appveyor]' from the assembled # commit message - return _REGEX_CI_DIRECTIVE.sub('', commit_message) + return _REGEX_CI_DIRECTIVE.sub("", commit_message) def fix_version_from_branch(versions): @@ -122,8 +124,7 @@ def fix_version_from_branch(versions): ) -class GitHubIssue(object): - +class GitHubIssue: def __init__(self, github_api, github_id, cmd): self.github_api = github_api self.github_id = github_id @@ -132,13 +133,14 @@ def __init__(self, github_api, github_id, cmd): try: self.issue = self.github_api.get_issue_data(github_id) except Exception as e: - self.cmd.fail("GitHub could not find %s\n%s" % (github_id, e)) + self.cmd.fail(f"GitHub could not find {github_id}\n{e}") 
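# A minimal illustration of the _REGEX_CI_DIRECTIVE substitution defined above;
# the PR title used here is made up purely for demonstration.
import re

_REGEX_CI_DIRECTIVE = re.compile(r"\[[^\]]*\]")

title = "GH-12345: [CI] Speed up lint job [skip appveyor]"
print(_REGEX_CI_DIRECTIVE.sub("", title))
# -> "GH-12345:  Speed up lint job "  (every bracketed directive is dropped)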
def get_label(self, prefix): prefix = f"{prefix}:" return [ - lbl["name"][len(prefix):].strip() - for lbl in self.issue["labels"] if lbl["name"].startswith(prefix) + lbl["name"][len(prefix) :].strip() + for lbl in self.issue["labels"] + if lbl["name"].startswith(prefix) ] @property @@ -169,190 +171,186 @@ def resolve(self, fix_version, comment, pr_body): cur_status = self.issue["state"] if cur_status == "closed": - self.cmd.fail("GitHub issue %s already has status '%s'" - % (self.github_id, cur_status)) + self.cmd.fail( + f"GitHub issue {self.github_id} already has status '{cur_status}'" + ) if DEBUG: - print("GitHub issue %s untouched -> %s" % - (self.github_id, fix_version)) + print(f"GitHub issue {self.github_id} untouched -> {fix_version}") else: self.github_api.assign_milestone(self.github_id, fix_version) if f"Closes: #{self.github_id}" not in pr_body: self.github_api.close_issue(self.github_id, comment) - print("Successfully resolved %s!" % (self.github_id)) + print(f"Successfully resolved {self.github_id}!") self.issue = self.github_api.get_issue_data(self.github_id) self.show() def show(self): issue = self.issue - print(format_issue_output("github", self.github_id, issue["state"], - issue["title"], ', '.join(self.assignees), - self.components)) - + print( + format_issue_output( + "github", + self.github_id, + issue["state"], + issue["title"], + ", ".join(self.assignees), + self.components, + ) + ) -def get_candidate_fix_version(mainline_versions, - maintenance_branches=()): +def get_candidate_fix_version(mainline_versions, maintenance_branches=()): all_versions = [getattr(v, "name", v) for v in mainline_versions] def version_tuple(x): # Parquet versions are something like cpp-1.2.0 numeric_version = getattr(x, "name", x).split("-", 1)[-1] return tuple(int(_) for _ in numeric_version.split(".")) + all_versions = sorted(all_versions, key=version_tuple, reverse=True) # Only suggest versions starting with a number, like 0.x but not JS-0.x mainline_versions = all_versions - major_versions = [v for v in mainline_versions if v.endswith('.0.0')] + major_versions = [v for v in mainline_versions if v.endswith(".0.0")] if len(mainline_versions) > len(major_versions): # If there is a future major release, suggest that mainline_versions = major_versions - mainline_versions = [v for v in mainline_versions - if f"maint-{v}" not in maintenance_branches] + mainline_versions = [ + v for v in mainline_versions if f"maint-{v}" not in maintenance_branches + ] default_fix_versions = fix_version_from_branch(mainline_versions) return default_fix_versions -def format_issue_output(issue_type, issue_id, status, - summary, assignee, components): +def format_issue_output(issue_type, issue_id, status, summary, assignee, components): if not assignee: assignee = "NOT ASSIGNED!!!" else: assignee = getattr(assignee, "displayName", assignee) if len(components) == 0: - components = 'NO COMPONENTS!!!' + components = "NO COMPONENTS!!!" 
else: - components = ', '.join((getattr(x, "name", x) for x in components)) + components = ", ".join(getattr(x, "name", x) for x in components) url_id = issue_id if "GH" in issue_id: url_id = issue_id.replace("GH-", "") - url = f'https://github.com/{ORG_NAME}/{PROJECT_NAME}/issues/{url_id}' - - return """=== {} {} === -Summary\t\t{} -Assignee\t{} -Components\t{} -Status\t\t{} -URL\t\t{}""".format(issue_type.upper(), issue_id, summary, assignee, - components, status, url) + url = f"https://github.com/{ORG_NAME}/{PROJECT_NAME}/issues/{url_id}" + return f"""=== {issue_type.upper()} {issue_id} === +Summary\t\t{summary} +Assignee\t{assignee} +Components\t{components} +Status\t\t{status} +URL\t\t{url}""" -class GitHubAPI(object): +class GitHubAPI: def __init__(self, project_name, cmd): - self.github_api = ( - f"https://api.github.com/repos/{ORG_NAME}/{project_name}" - ) + self.github_api = f"https://api.github.com/repos/{ORG_NAME}/{project_name}" token = None config = load_configuration() if "github" in config.sections(): token = config["github"]["api_token"] if not token: - token = os.environ.get('ARROW_GITHUB_API_TOKEN') + token = os.environ.get("ARROW_GITHUB_API_TOKEN") if not token: - token = cmd.prompt('Env ARROW_GITHUB_API_TOKEN not set, ' - 'please enter your GitHub API token ' - '(GitHub personal access token):') + token = cmd.prompt( + "Env ARROW_GITHUB_API_TOKEN not set, " + "please enter your GitHub API token " + "(GitHub personal access token):" + ) headers = { - 'Accept': 'application/vnd.github.v3+json', - 'Authorization': 'token {0}'.format(token), + "Accept": "application/vnd.github.v3+json", + "Authorization": f"token {token}", } self.headers = headers def get_milestones(self): - return get_json("%s/milestones" % (self.github_api, ), - headers=self.headers) + return get_json(f"{self.github_api}/milestones", headers=self.headers) def get_milestone_number(self, version): - return next(( - m["number"] for m in self.get_milestones() if m["title"] == version - ), None) + return next( + (m["number"] for m in self.get_milestones() if m["title"] == version), None + ) def get_issue_data(self, number): - return get_json("%s/issues/%s" % (self.github_api, number), - headers=self.headers) + return get_json(f"{self.github_api}/issues/{number}", headers=self.headers) def get_pr_data(self, number): - return get_json("%s/pulls/%s" % (self.github_api, number), - headers=self.headers) + return get_json(f"{self.github_api}/pulls/{number}", headers=self.headers) def get_pr_commits(self, number): - return get_json("%s/pulls/%s/commits" % (self.github_api, number), - headers=self.headers) + return get_json( + f"{self.github_api}/pulls/{number}/commits", headers=self.headers + ) def get_branches(self): - return get_json("%s/branches" % (self.github_api), - headers=self.headers) + return get_json(f"{self.github_api}/branches", headers=self.headers) def close_issue(self, number, comment): - issue_url = f'{self.github_api}/issues/{number}' - comment_url = f'{self.github_api}/issues/{number}/comments' + issue_url = f"{self.github_api}/issues/{number}" + comment_url = f"{self.github_api}/issues/{number}/comments" - r = requests.post(comment_url, json={ - "body": comment}, headers=self.headers) + r = requests.post(comment_url, json={"body": comment}, headers=self.headers) if not r.ok: raise ValueError( - f"Failed request: {comment_url}:{r.status_code} -> {r.json()}") + f"Failed request: {comment_url}:{r.status_code} -> {r.json()}" + ) - r = requests.patch( - issue_url, json={"state": "closed"}, 
headers=self.headers) + r = requests.patch(issue_url, json={"state": "closed"}, headers=self.headers) if not r.ok: raise ValueError( - f"Failed request: {issue_url}:{r.status_code} -> {r.json()}") + f"Failed request: {issue_url}:{r.status_code} -> {r.json()}" + ) def assign_milestone(self, number, version): - url = f'{self.github_api}/issues/{number}' + url = f"{self.github_api}/issues/{number}" milestone_number = self.get_milestone_number(version) if not milestone_number: raise ValueError(f"Invalid version {version}, milestone not found") - payload = { - 'milestone': milestone_number - } + payload = {"milestone": milestone_number} r = requests.patch(url, headers=self.headers, json=payload) if not r.ok: - raise ValueError( - f"Failed request: {url}:{r.status_code} -> {r.json()}") + raise ValueError(f"Failed request: {url}:{r.status_code} -> {r.json()}") return r.json() def merge_pr(self, number, commit_title, commit_message): - url = f'{self.github_api}/pulls/{number}/merge' + url = f"{self.github_api}/pulls/{number}/merge" payload = { - 'commit_title': commit_title, - 'commit_message': commit_message, - 'merge_method': 'squash', + "commit_title": commit_title, + "commit_message": commit_message, + "merge_method": "squash", } response = requests.put(url, headers=self.headers, json=payload) result = response.json() - if response.status_code == 200 and 'merged' in result: + if response.status_code == 200 and "merged" in result: self.clear_pr_state_labels(number) else: - result['merged'] = False - result['message'] += f': {url}' + result["merged"] = False + result["message"] += f": {url}" return result def clear_pr_state_labels(self, number): - url = f'{self.github_api}/issues/{number}/labels' + url = f"{self.github_api}/issues/{number}/labels" response = requests.get(url, headers=self.headers) labels = response.json() for label in labels: # All PR workflow state labels starts with "awaiting" - if label['name'].startswith('awaiting'): + if label["name"].startswith("awaiting"): label_url = f"{url}/{label['name']}" requests.delete(label_url, headers=self.headers) -class CommandInput(object): - """ - Interface to input(...) to enable unit test mocks to be created - """ +class CommandInput: + """Interface to input(...) 
to enable unit test mocks to be created""" def fail(self, msg): raise Exception(msg) @@ -365,7 +363,7 @@ def getpass(self, prompt): def continue_maybe(self, prompt): while True: - result = input("\n%s (y/n): " % prompt) + result = input(f"\n{prompt} (y/n): ") if result.lower() == "y": return elif result.lower() == "n": @@ -374,8 +372,8 @@ def continue_maybe(self, prompt): prompt = "Please input 'y' or 'n'" -class PullRequest(object): - GITHUB_PR_TITLE_PATTERN = re.compile(r'^GH-([0-9]+)\b.*$') +class PullRequest: + GITHUB_PR_TITLE_PATTERN = re.compile(r"^GH-([0-9]+)\b.*$") def __init__(self, cmd, github_api, git_remote, number): self.cmd = cmd @@ -393,14 +391,18 @@ def __init__(self, cmd, github_api, git_remote, number): except KeyError: pprint.pprint(self._pr_data) raise - self.description = "%s/%s" % (self.user_login, self.base_ref) + self.description = f"{self.user_login}/{self.base_ref}" self.issue = self._get_issue() def show(self): - print("\n=== Pull Request #%s ===" % self.number) - print("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" - % (self.title, self.description, self.target_ref, self.url)) + print(f"\n=== Pull Request #{self.number} ===") + print( + f"title\t{self.title}\n" + f"source\t{self.description}\n" + f"target\t{self.target_ref}\n" + f"url\t{self.url}" + ) if self.issue is not None: self.issue.show() else: @@ -416,8 +418,11 @@ def is_mergeable(self): @property def maintenance_branches(self): - return [x["name"] for x in self._github_api.get_branches() - if x["name"].startswith("maint-")] + return [ + x["name"] + for x in self._github_api.get_branches() + if x["name"].startswith("maint-") + ] def _get_issue(self): if self.title.startswith("MINOR:"): @@ -428,48 +433,50 @@ def _get_issue(self): github_id = m.group(1) return GitHubIssue(self._github_api, github_id, self.cmd) - self.cmd.fail("PR title should be prefixed by a GitHub ID, like: " - "GH-XXX, but found {0}".format(self.title)) + self.cmd.fail( + "PR title should be prefixed by a GitHub ID, like: " + f"GH-XXX, but found {self.title}" + ) def merge(self): - """ - merge the requested PR and return the merge hash - """ + """Merge the requested PR and return the merge hash""" commits = self._github_api.get_pr_commits(self.number) def format_commit_author(commit): - author = commit['commit']['author'] - name = author['name'] - email = author['email'] - return f'{name} <{email}>' + author = commit["commit"]["author"] + name = author["name"] + email = author["email"] + return f"{name} <{email}>" + commit_authors = [format_commit_author(commit) for commit in commits] - co_authored_by_re = re.compile( - r'^Co-authored-by:\s*(.*)', re.MULTILINE) + co_authored_by_re = re.compile(r"^Co-authored-by:\s*(.*)", re.MULTILINE) def extract_co_authors(commit): - message = commit['commit']['message'] + message = commit["commit"]["message"] return co_authored_by_re.findall(message) + commit_co_authors = [] for commit in commits: commit_co_authors.extend(extract_co_authors(commit)) all_commit_authors = commit_authors + commit_co_authors - distinct_authors = sorted(set(all_commit_authors), - key=lambda x: commit_authors.count(x), - reverse=True) + distinct_authors = sorted( + set(all_commit_authors), key=lambda x: commit_authors.count(x), reverse=True + ) for i, author in enumerate(distinct_authors): - print("Author {}: {}".format(i + 1, author)) + print(f"Author {i + 1}: {author}") if len(distinct_authors) > 1: primary_author, distinct_other_authors = get_primary_author( - self.cmd, distinct_authors) + self.cmd, distinct_authors + ) 
else: # If there is only one author, do not prompt for a lead author primary_author = distinct_authors.pop() distinct_other_authors = [] - commit_title = f'{self.title} (#{self.number})' + commit_title = f"{self.title} (#{self.number})" commit_message_chunks = [] if self.body is not None: # Remove comments (i.e. <-- comment -->) from the PR description. @@ -481,14 +488,15 @@ def extract_co_authors(commit): committer_name = run_cmd("git config --get user.name").strip() committer_email = run_cmd("git config --get user.email").strip() - authors = ("Authored-by:" if len(distinct_other_authors) == 0 - else "Lead-authored-by:") - authors += " %s" % primary_author + authors = ( + "Authored-by:" if len(distinct_other_authors) == 0 else "Lead-authored-by:" + ) + authors += f" {primary_author}" if len(distinct_authors) > 0: - authors += "\n" + "\n".join(["Co-authored-by: %s" % a - for a in distinct_other_authors]) - authors += "\n" + "Signed-off-by: %s <%s>" % (committer_name, - committer_email) + authors += "\n" + "\n".join( + [f"Co-authored-by: {a}" for a in distinct_other_authors] + ) + authors += "\n" + f"Signed-off-by: {committer_name} <{committer_email}>" commit_message_chunks.append(authors) commit_message = "\n\n".join(commit_message_chunks) @@ -508,61 +516,59 @@ def extract_co_authors(commit): if DEBUG: merge_hash = None else: - result = self._github_api.merge_pr(self.number, - commit_title, - commit_message) - if not result['merged']: - message = result['message'] - self.cmd.fail(f'Failed to merge pull request: {message}') - merge_hash = result['sha'] + result = self._github_api.merge_pr( + self.number, commit_title, commit_message + ) + if not result["merged"]: + message = result["message"] + self.cmd.fail(f"Failed to merge pull request: {message}") + merge_hash = result["sha"] - print("Pull request #%s merged!" % self.number) - print("Merge hash: %s" % merge_hash) + print(f"Pull request #{self.number} merged!") + print(f"Merge hash: {merge_hash}") def get_primary_author(cmd, distinct_authors): - author_pat = re.compile(r'(.*) <(.*)>') + author_pat = re.compile(r"(.*) <(.*)>") while True: primary_author = cmd.prompt( "Enter primary author in the format of " - "\"name <email>\" [%s]: " % distinct_authors[0]) + f'"name <email>" [{distinct_authors[0]}]: ' + ) if primary_author == "": return distinct_authors[0], distinct_authors[1:] if author_pat.match(primary_author): break - print('Bad author "{}", please try again'.format(primary_author)) + print(f'Bad author "{primary_author}", please try again') # When primary author is specified manually, de-dup it from # author list and put it at the head of author list. 
- distinct_other_authors = [x for x in distinct_authors - if x != primary_author] + distinct_other_authors = [x for x in distinct_authors if x != primary_author] return primary_author, distinct_other_authors def prompt_for_fix_version(cmd, issue, maintenance_branches=()): default_fix_version = get_candidate_fix_version( mainline_versions=issue.current_versions, - maintenance_branches=maintenance_branches + maintenance_branches=maintenance_branches, ) current_fix_versions = issue.current_fix_versions - if (current_fix_versions and - current_fix_versions != default_fix_version): + if current_fix_versions and current_fix_versions != default_fix_version: print("\n=== The assigned milestone is not the default ===") print(f"Assigned milestone: {current_fix_versions}") print(f"Current milestone: {default_fix_version}") - if issue.issue["milestone"].get("state") == 'closed': + if issue.issue["milestone"].get("state") == "closed": print("The assigned milestone state is closed. Contact the ") print("Release Manager if it has to be added to a closed Release") print("Please ensure to assign the correct milestone.") # Default to existing assigned milestone default_fix_version = current_fix_versions - issue_fix_version = cmd.prompt("Enter fix version [%s]: " - % default_fix_version) + issue_fix_version = cmd.prompt(f"Enter fix version [{default_fix_version}]: ") if issue_fix_version == "": issue_fix_version = default_fix_version issue_fix_version = issue_fix_version.strip() @@ -602,16 +608,16 @@ def cli(): pr = PullRequest(cmd, github_api, ORG_NAME, pr_num) if pr.is_merged: - print("Pull request %s has already been merged" % pr_num) + print(f"Pull request {pr_num} has already been merged") sys.exit(0) if not pr.is_mergeable: - print("Pull request %s is not mergeable in its current form" % pr_num) + print(f"Pull request {pr_num} is not mergeable in its current form") sys.exit(1) pr.show() - cmd.continue_maybe("Proceed with merging pull request #%s?" % pr_num) + cmd.continue_maybe(f"Proceed with merging pull request #{pr_num}?") pr.merge() @@ -620,17 +626,14 @@ def cli(): return cmd.continue_maybe("Would you like to update the associated issue?") - issue_comment = ( - "Issue resolved by pull request %s\n%s" - % (pr_num, - f"https://github.com/{ORG_NAME}/{PROJECT_NAME}/pull/{pr_num}") + issue_comment = "Issue resolved by pull request {}\n{}".format( + pr_num, f"https://github.com/{ORG_NAME}/{PROJECT_NAME}/pull/{pr_num}" ) - fix_version = prompt_for_fix_version(cmd, pr.issue, - pr.maintenance_branches) + fix_version = prompt_for_fix_version(cmd, pr.issue, pr.maintenance_branches) pr.issue.resolve(fix_version, issue_comment, pr.body) -if __name__ == '__main__': +if __name__ == "__main__": try: cli() except Exception: diff --git a/dev/release/check-rat-report.py b/dev/release/check-rat-report.py old mode 100644 new mode 100755 index a5718103a5346..512916cf4dfb8 --- a/dev/release/check-rat-report.py +++ b/dev/release/check-rat-report.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -############################################################################## +# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -16,44 +16,49 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-############################################################################## + +from __future__ import annotations + import fnmatch import re import sys import xml.etree.ElementTree as ET if len(sys.argv) != 3: - sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" % - sys.argv[0]) + sys.stderr.write(f"Usage: {sys.argv[0]} exclude_globs.lst rat_report.xml\n") sys.exit(1) exclude_globs_filename = sys.argv[1] xml_filename = sys.argv[2] -globs = [line.strip() for line in open(exclude_globs_filename, "r")] +with open(exclude_globs_filename) as f: + globs = [line.strip() for line in f] tree = ET.parse(xml_filename) root = tree.getroot() -resources = root.findall('resource') +resources = root.findall("resource") all_ok = True for r in resources: - approvals = r.findall('license-approval') - if not approvals or approvals[0].attrib['name'] == 'true': + approvals = r.findall("license-approval") + if not approvals or approvals[0].attrib["name"] == "true": continue - clean_name = re.sub('^[^/]+/', '', r.attrib['name']) + clean_name = re.sub("^[^/]+/", "", r.attrib["name"]) excluded = False for g in globs: if fnmatch.fnmatch(clean_name, g): excluded = True break if not excluded: - sys.stdout.write("NOT APPROVED: %s (%s): %s\n" % ( - clean_name, r.attrib['name'], approvals[0].attrib['name'])) + sys.stdout.write( + "NOT APPROVED: {} ({}): {}\n".format( + clean_name, r.attrib["name"], approvals[0].attrib["name"] + ) + ) all_ok = False if not all_ok: sys.exit(1) -print('OK') +print("OK") sys.exit(0) diff --git a/dev/release/download_rc_binaries.py b/dev/release/download_rc_binaries.py index 23e33edaad20d..15e0f928e5023 100755 --- a/dev/release/download_rc_binaries.py +++ b/dev/release/download_rc_binaries.py @@ -17,8 +17,11 @@ """Download release binaries.""" +from __future__ import annotations + import argparse import concurrent.futures as cf +import contextlib import functools import json import os @@ -28,43 +31,38 @@ import time import urllib.request - DEFAULT_PARALLEL_DOWNLOADS = 8 class Downloader: - def get_file_list(self, prefix, filter=None): def traverse(directory, files, directories): - url = f'{self.URL_ROOT}/{directory}' + url = f"{self.URL_ROOT}/{directory}" response = urllib.request.urlopen(url).read().decode() paths = re.findall('<a href="(.+?)"', response) for path in paths: - path = re.sub(f'^{re.escape(url)}', - '', - path) - if path == '../': + path = re.sub(f"^{re.escape(url)}", "", path) + if path == "../": continue - resolved_path = f'{directory}{path}' + resolved_path = f"{directory}{path}" if filter and not filter(path): continue - if path.endswith('/'): + if path.endswith("/"): directories.append(resolved_path) else: files.append(resolved_path) + files = [] - if prefix != '' and not prefix.endswith('/'): - prefix += '/' + if prefix != "" and not prefix.endswith("/"): + prefix += "/" directories = [prefix] while len(directories) > 0: directory = directories.pop() traverse(directory, files, directories) return files - def download_files(self, files, dest=None, num_parallel=None, - re_match=None): - """ - Download files from Bintray in parallel. If file already exists, will + def download_files(self, files, dest=None, num_parallel=None, re_match=None): + """Download files from Bintray in parallel. If file already exists, will overwrite if the checksum does not match what Bintray says it should be Parameters @@ -76,6 +74,7 @@ def download_files(self, files, dest=None, num_parallel=None, num_parallel : int, default 8 Number of files to download in parallel. 
If set to None, uses default + """ if dest is None: dest = os.getcwd() @@ -91,9 +90,7 @@ def download_files(self, files, dest=None, num_parallel=None, self._download_file(dest, path) else: parallel_map_terminate_early( - functools.partial(self._download_file, dest), - files, - num_parallel + functools.partial(self._download_file, dest), files, num_parallel ) def _download_file(self, dest, path): @@ -104,9 +101,9 @@ def _download_file(self, dest, path): dest_path = os.path.join(dest_dir, filename) - print("Downloading {} to {}".format(path, dest_path)) + print(f"Downloading {path} to {dest_path}") - url = f'{self.URL_ROOT}/{path}' + url = f"{self.URL_ROOT}/{path}" self._download_url(url, dest_path) def _download_url(self, url, dest_path, *, extra_args=None): @@ -128,22 +125,18 @@ def _download_url(self, url, dest_path, *, extra_args=None): delay = attempt * 3 print(f"Waiting {delay} seconds before retrying {url}") time.sleep(delay) - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = proc.communicate() if proc.returncode != 0: - try: + with contextlib.suppress(OSError): # Don't leave possibly partial file around os.remove(dest_path) - except IOError: - pass if "OpenSSL" not in stderr: # We assume curl has already retried on other errors. break else: return - raise Exception(f"Downloading {url} failed\n" - f"stdout: {stdout}\nstderr: {stderr}") + raise Exception(f"Downloading {url} failed\nstdout: {stdout}\nstderr: {stderr}") def _curl_version(self): cmd = ["curl", "--version"] @@ -157,8 +150,9 @@ class Artifactory(Downloader): class Maven(Downloader): - URL_ROOT = "https://repository.apache.org" + \ - "/content/repositories/staging/org/apache/arrow" + URL_ROOT = ( + "https://repository.apache.org/content/repositories/staging/org/apache/arrow" + ) class GitHub(Downloader): @@ -174,19 +168,14 @@ def __init__(self, repository, tag): self._token = os.environ.get("GH_TOKEN") def get_file_list(self, prefix, filter=None): - url = (f"https://api.github.com/repos/{self._repository}/" - f"releases/tags/{self._tag}") + url = ( + f"https://api.github.com/repos/{self._repository}/releases/tags/{self._tag}" + ) print("Fetching release from", url) - headers = { - "Accept": "application/vnd.github+json", - } + headers = {"Accept": "application/vnd.github+json"} if self._token: headers["Authorization"] = f"Bearer {self._token}" - request = urllib.request.Request( - url, - method="GET", - headers=headers, - ) + request = urllib.request.Request(url, method="GET", headers=headers) raw_response = urllib.request.urlopen(request).read().decode() response = json.loads(raw_response) @@ -219,18 +208,11 @@ def _download_file(self, dest, asset): print(f"Waiting {delay} seconds to avoid rate limit") time.sleep(delay) - extra_args = [ - "--header", - "Accept: application/octet-stream", - ] + extra_args = ["--header", "Accept: application/octet-stream"] if self._curl_version() >= (7, 71, 0): # Also retry 403s extra_args.append("--retry-all-errors") - self._download_url( - url, - dest_path, - extra_args=extra_args - ) + self._download_url(url, dest_path, extra_args=extra_args) def parallel_map_terminate_early(f, iterable, num_parallel): @@ -248,77 +230,105 @@ def parallel_map_terminate_early(f, iterable, num_parallel): ARROW_REPOSITORY_PACKAGE_TYPES = [ - 'almalinux', - 'amazon-linux', - 'centos', - 'debian', - 'ubuntu', + "almalinux", + "amazon-linux", + "centos", + "debian", + "ubuntu", ] 
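# The try/except-pass rewrite in _download_url above is behavior-preserving:
# IOError has been an alias of OSError since Python 3.3, and
# contextlib.suppress() swallows the same exception. A minimal sketch, using
# an assumed placeholder path:
import contextlib
import os

dest_path = "/tmp/partial-download"  # hypothetical partially written file

# Old form:
try:
    os.remove(dest_path)
except OSError:
    pass

# Equivalent new form, the pattern preferred by ruff's flake8-simplify rules:
with contextlib.suppress(OSError):
    os.remove(dest_path)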
-ARROW_STANDALONE_PACKAGE_TYPES = ['nuget', 'python'] -ARROW_PACKAGE_TYPES = \ - ARROW_REPOSITORY_PACKAGE_TYPES + \ - ARROW_STANDALONE_PACKAGE_TYPES - - -def download_rc_binaries(version, rc_number, re_match=None, dest=None, - num_parallel=None, target_package_type=None, - repository=None, tag=None): - version_string = '{}-rc{}'.format(version, rc_number) - version_pattern = re.compile(r'\d+\.\d+\.\d+') +ARROW_STANDALONE_PACKAGE_TYPES = ["nuget", "python"] +ARROW_PACKAGE_TYPES = ARROW_REPOSITORY_PACKAGE_TYPES + ARROW_STANDALONE_PACKAGE_TYPES + + +def download_rc_binaries( + version, + rc_number, + re_match=None, + dest=None, + num_parallel=None, + target_package_type=None, + repository=None, + tag=None, +): + version_string = f"{version}-rc{rc_number}" + version_pattern = re.compile(r"\d+\.\d+\.\d+") if target_package_type: package_types = [target_package_type] else: package_types = ARROW_PACKAGE_TYPES for package_type in package_types: + def is_target(path): match = version_pattern.search(path) if not match: return True return match[0] == version + filter = is_target - if package_type == 'jars': + if package_type == "jars": downloader = Maven() - prefix = '' - elif package_type == 'github' or package_type == 'nuget': + prefix = "" + elif package_type in ("github", "nuget"): downloader = GitHub(repository, tag) - prefix = '' + prefix = "" filter = None elif package_type in ARROW_REPOSITORY_PACKAGE_TYPES: downloader = Artifactory() - prefix = f'{package_type}-rc' + prefix = f"{package_type}-rc" else: downloader = Artifactory() - prefix = f'{package_type}-rc/{version_string}' + prefix = f"{package_type}-rc/{version_string}" filter = None files = downloader.get_file_list(prefix, filter=filter) - downloader.download_files(files, re_match=re_match, dest=dest, - num_parallel=num_parallel) + downloader.download_files( + files, re_match=re_match, dest=dest, num_parallel=num_parallel + ) -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Download release candidate binaries' +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Download release candidate binaries") + parser.add_argument("version", type=str, help="The version number") + parser.add_argument( + "rc_number", type=int, help="The release candidate number, e.g. 0, 1, etc" + ) + parser.add_argument( + "-e", + "--regexp", + type=str, + default=None, + help=( + "Regular expression to match on file names to only download certain files" + ), + ) + parser.add_argument( + "--dest", + type=str, + default=os.getcwd(), + help="The output folder for the downloaded files", + ) + parser.add_argument( + "--num_parallel", + type=int, + default=DEFAULT_PARALLEL_DOWNLOADS, + help="The number of concurrent downloads to do", + ) + parser.add_argument( + "--package_type", + type=str, + default=None, + help="The package type to be downloaded", + ) + parser.add_argument( + "--repository", + type=str, + help=("The repository to pull from (required if --package_type=github)"), + ) + parser.add_argument( + "--tag", + type=str, + help=("The release tag to download (required if --package_type=github)"), ) - parser.add_argument('version', type=str, help='The version number') - parser.add_argument('rc_number', type=int, - help='The release candidate number, e.g. 
0, 1, etc') - parser.add_argument('-e', '--regexp', type=str, default=None, - help=('Regular expression to match on file names ' - 'to only download certain files')) - parser.add_argument('--dest', type=str, default=os.getcwd(), - help='The output folder for the downloaded files') - parser.add_argument('--num_parallel', type=int, - default=DEFAULT_PARALLEL_DOWNLOADS, - help='The number of concurrent downloads to do') - parser.add_argument('--package_type', type=str, default=None, - help='The package type to be downloaded') - parser.add_argument('--repository', type=str, - help=('The repository to pull from ' - '(required if --package_type=github)')) - parser.add_argument('--tag', type=str, - help=('The release tag to download ' - '(required if --package_type=github)')) args = parser.parse_args() download_rc_binaries( diff --git a/dev/release/utils-update-docs-versions.py b/dev/release/utils-update-docs-versions.py index ba0ddcaeb39e1..6380494ec4f9c 100644 --- a/dev/release/utils-update-docs-versions.py +++ b/dev/release/utils-update-docs-versions.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations import json import sys @@ -58,19 +59,25 @@ if stable_compatible_version != previous_compatible_version: # Create new versions new_versions = [ - {"name": f"{dev_compatible_version} (dev)", - "version": "dev/", - "url": "https://arrow.apache.org/docs/dev/"}, - {"name": f"{stable_compatible_version} (stable)", - "version": "", - "url": "https://arrow.apache.org/docs/", - "preferred": True}, - {"name": previous_compatible_version, - "version": f"{previous_compatible_version}/", - "url": f"https://arrow.apache.org/docs/{previous_compatible_version}/"}, + { + "name": f"{dev_compatible_version} (dev)", + "version": "dev/", + "url": "https://arrow.apache.org/docs/dev/", + }, + { + "name": f"{stable_compatible_version} (stable)", + "version": "", + "url": "https://arrow.apache.org/docs/", + "preferred": True, + }, + { + "name": previous_compatible_version, + "version": f"{previous_compatible_version}/", + "url": f"https://arrow.apache.org/docs/{previous_compatible_version}/", + }, *old_versions[2:], ] - with open(main_versions_path, 'w') as json_file: + with open(main_versions_path, "w") as json_file: json.dump(new_versions, json_file, indent=4) json_file.write("\n") @@ -103,7 +110,7 @@ {"name": f"{release_r_version} (release)", "version": ""}, *old_r_versions[2:], ] -with open(r_versions_path, 'w') as json_file: +with open(r_versions_path, "w") as json_file: json.dump(new_r_versions, json_file, indent=4) json_file.write("\n") @@ -112,8 +119,8 @@ data = json.load(json_file) # Write HTML to file -with open(r_html_path, 'w') as html_file: - html_file.write('<!DOCTYPE html>\n<html>\n<body>') +with open(r_html_path, "w") as html_file: + html_file.write("<!DOCTYPE html>\n<html>\n<body>") for i in data: html_file.write(f'<p><a href="../{i["version"]}r/">{i["name"]}</a></p>\n') - html_file.write('</body>\n</html>\n') + html_file.write("</body>\n</html>\n") diff --git a/dev/ruff.toml b/dev/ruff.toml new file mode 100644 index 0000000000000..ad9cfebcc0c63 --- /dev/null +++ b/dev/ruff.toml @@ -0,0 +1,158 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +line-length = 88 +indent-width = 4 + +# Assume Python 3.9 +target-version = "py39" + +[format] +# Like Black, use double quotes for strings. +quote-style = "double" + +# Like Black, indent with spaces, rather than tabs. +indent-style = "space" + +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false + +# Like Black, automatically detect the appropriate line ending. +line-ending = "auto" + +# Enable auto-formatting of code examples in docstrings. Markdown, +# reStructuredText code/literal blocks and doctests are all supported. +# +# This is currently disabled by default, but it is planned for this +# to be opt-out in the future. +docstring-code-format = true + +# Set the line length limit used when formatting code snippets in +# docstrings. +# +# This only has an effect when the `docstring-code-format` setting is +# enabled. +docstring-code-line-length = 88 + +[lint] +select = [ + "B", # flake8-bugbear + "BLE", # flake8-blind-except + "C4", # comprehensions + "D", # pydocstyle + "E", # pycodestyle + "EXE", # flake8-executable + "F", # pyflakes + "FA", # flake8-future-annotations + "FLY", # flynt (format string conversion) + "G", # flake8-logging-format + "I", # isort + "ICN", # flake8-import-conventions + "INP", # flake8-no-pep420 (implicit namespace packages) + "ISC", # flake8-implicit-str-concat + "PGH", # pygrep-hooks + "PIE", # flake8-pie + "PL", # pylint + "RET", # flake8-return + "RUF", # ruff-specific rules + "S", # flake8-bandit + "SIM", # flake8-simplify + "T10", # flake8-debugger + "T20", # flake8-print + "TCH", # flake8-type-checking + "TID", # flake8-tidy-imports + "UP", # pyupgrade + "W", # pycodestyle + "YTT", # flake8-2020 +] +ignore = [ + "B003", # Assigning to `os.environ` doesn't clear the environment + "B017", # `pytest.raises(Exception)` should be considered evil + "B019", # Use of `functools.lru_cache` or `functools.cache` on methods can lead to memory leaks + "B027", # `CDataExporter.run_gc` is an empty method in an abstract base class, but has no abstract decorator + "B904", # raise from e or raise from None in exception handlers + "BLE001", # Do not catch blind exception: `Exception` + "C408", # Unnecessary `dict()` call (rewrite as a literal) + "D100", # public module + "D101", # public class + "D102", # public method + "D103", # public function + "D104", # public package + "D105", # magic methods + "D106", # nested class + "D107", # init + "D200", # One-line docstring should fit on one line + "D203", # blank line before class docstring + "D205", # 1 blank line required between summary line and description + "D210", # No whitespaces allowed surrounding docstring text + "D213", # Multi-line docstring summary should start at the second line + "D301", # Use `r"""` if any backslashes in a docstring + "D400", # First line should end with a period + "D401", # Imperative mood + "D402", # First line should not be the function's signature + "D413", # Blank line required after 
last section + "D415", # First line should end with a period, question mark, or exclamation point + "D417", # Missing argument descriptions in the docstring + "FLY002", # Consider `f"{}.{}"` instead of string join" + "G004", # Logging statement uses f-string + "ICN001", # `xml.etree.ElementTree` should be imported as `ET` + "INP001", # implicit-namespace-package + "ISC001", # single line implicit string concat, handled by ruff format + "PLR0911", # too many return statements + "PLR0912", # too many branches + "PLR0913", # too many arguments + "PLR0915", # too many statements + "PLR2004", # forces everything to be a constant + "PLW0603", # Using the global statement to update `variable` is discouraged + "PLW1510", # `subprocess.run` without explicit `check` argument + "PLW2901", # overwriting loop variable + "RET503", # Missing explicit `return` at the end of function able to return non-`None` value + "RET504", # unnecessary-assign, these are useful for debugging + "RET505", # superfluous-else-return, stylistic choice + "RET506", # superfluous-else-raise, stylistic choice + "RET507", # superfluous-else-continue, stylistic choice + "RET508", # superfluous-else-break, stylistic choice + "RUF005", # splat instead of concat + "RUF012", # Mutable class attributes should be annotated with `typing.ClassVar` + "S101", # ignore "Use of `assert` detected" + "S113", # Probable use of `requests` call without timeout + "S310", # Audit URL open for permitted schemes. Allowing use of `file:` or custom schemes is often unexpected. + "S311", # Standard pseudo-random generators are not suitable for cryptographic purposes + "S314", # Using `xml` to parse untrusted data is known to be vulnerable to XML attacks; use `defusedxml` equivalents + "S603", # `subprocess` call: check for execution of untrusted input + "S607", # Starting a process with a partial executable path + "S701", # By default, jinja2 sets `autoescape` to `False`. Consider using `autoescape=True` or the `select_autoescape` function to mitigate XSS vulnerabilities. 
+ "SIM108", # convert everything to ternary operator + "SIM112", # Use capitalized environment variable `DOTNET_GCHEAPHARDLIMIT` instead of `DOTNET_GCHeapHardLimit` + "SIM117", # Use a single `with` statement with multiple contexts instead of nested `with` statements + "T201", # `print` found + "T203", # `pprint` found + "TID252", # Prefer absolute imports over relative imports from parent modules + "UP006", # Use `list` instead of `List` for type annotation + "UP007", # Optional[str] -> str | None + "UP035", # `typing.List` is deprecated, use `list` instead +] +# none of these codes will be automatically fixed by ruff +unfixable = [ + "T201", # print statements + "F401", # unused imports + "RUF100", # unused noqa comments + "F841", # unused variables +] + +[lint.isort] +required-imports = ["from __future__ import annotations"] diff --git a/dev/tasks/conda-recipes/arrow-cpp/test_read_parquet.py b/dev/tasks/conda-recipes/arrow-cpp/test_read_parquet.py index 5f76a4e22c9ec..64ca555f1abbe 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/test_read_parquet.py +++ b/dev/tasks/conda-recipes/arrow-cpp/test_read_parquet.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pyarrow as pa import pyarrow.parquet as pq diff --git a/dev/tasks/conda-recipes/clean.py b/dev/tasks/conda-recipes/clean.py index 3f30a8929ee24..307b17fe1a6c3 100644 --- a/dev/tasks/conda-recipes/clean.py +++ b/dev/tasks/conda-recipes/clean.py @@ -1,13 +1,13 @@ -import subprocess -from typing import Set +from __future__ import annotations import json -import pandas as pd +import subprocess import sys +from typing import Set +import pandas as pd from packaging.version import Version - VERSIONS_TO_KEEP = 5 DELETE_BEFORE = pd.Timestamp.now() - pd.Timedelta(days=30) @@ -22,18 +22,19 @@ class CommandFailedException(Exception): - def __init__(self, cmdline, output): self.cmdline = cmdline self.output = output def run_command(cmdline, **kwargs): - kwargs.setdefault('capture_output', True) + kwargs.setdefault("capture_output", True) p = subprocess.run(cmdline, **kwargs) if p.returncode != 0: - print(f"Command {cmdline} returned non-zero exit status " - f"{p.returncode}", file=sys.stderr) + print( + f"Command {cmdline} returned non-zero exit status {p.returncode}", + file=sys.stderr, + ) output = "" if p.stdout: print("Stdout was:\n" + "-" * 70, file=sys.stderr) @@ -60,8 +61,8 @@ def builds_to_delete(platform: str, to_delete: Set[str]) -> int: "arrow-nightlies", "--override-channels", "--subdir", - platform - ], + platform, + ] ) except CommandFailedException as ex: # If the command failed due to no packages found, return @@ -79,8 +80,8 @@ def builds_to_delete(platform: str, to_delete: Set[str]) -> int: builds = pd.DataFrame(builds) builds["version"] = builds["version"].map(Version) # May be NaN if package doesn't depend on Python - builds["py_version"] = builds["build"].str.extract(r'(py\d+)') - builds["timestamp"] = pd.to_datetime(builds['timestamp'], unit='ms') + builds["py_version"] = builds["build"].str.extract(r"(py\d+)") + builds["timestamp"] = pd.to_datetime(builds["timestamp"], unit="ms") builds["stale"] = builds["timestamp"] < DELETE_BEFORE # Some packages can be present in several "features" (e.g. CUDA), # others miss that column in which case we set a default value. @@ -93,9 +94,9 @@ def builds_to_delete(platform: str, to_delete: Set[str]) -> int: # Detect old builds for each configuration: # a product of (architecture, Python version, features). 
- for (subdir, python, features, stale), group in builds.groupby( - ["subdir", "py_version", "track_features", "stale"], - dropna=False): + for (_subdir, _python, _features, stale), group in builds.groupby( + ["subdir", "py_version", "track_features", "stale"], dropna=False + ): del_candidates = [] if stale: del_candidates = group @@ -109,8 +110,7 @@ def builds_to_delete(platform: str, to_delete: Set[str]) -> int: f"arrow-nightlies/{package_name}/" + del_candidates["version"].astype(str) + del_candidates["url"].str.replace( - "https://conda.anaconda.org/arrow-nightlies", "", - regex=False + "https://conda.anaconda.org/arrow-nightlies", "", regex=False ) ) diff --git a/dev/test_merge_arrow_pr.py b/dev/test_merge_arrow_pr.py index 1db07ca91a401..2286e4ad85523 100755 --- a/dev/test_merge_arrow_pr.py +++ b/dev/test_merge_arrow_pr.py @@ -16,6 +16,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from __future__ import annotations from collections import namedtuple @@ -23,38 +24,40 @@ import merge_arrow_pr - -FakeIssue = namedtuple('issue', ['fields']) +FakeIssue = namedtuple("issue", ["fields"]) FakeFields = namedtuple( - 'fields', ['assignees', 'labels', 'milestone', 'state', 'title']) -FakeAssignee = namedtuple('assignees', ['login']) -FakeLabel = namedtuple('label', ['name']) -FakeMilestone = namedtuple('milestone', ['title', 'state']) + "fields", ["assignees", "labels", "milestone", "state", "title"] +) +FakeAssignee = namedtuple("assignees", ["login"]) +FakeLabel = namedtuple("label", ["name"]) +FakeMilestone = namedtuple("milestone", ["title", "state"]) RAW_VERSION_JSON = [ - {'title': 'JS-0.4.0', 'state': 'open'}, - {'title': '1.0.0', 'state': 'open'}, - {'title': '2.0.0', 'state': 'open'}, - {'title': '0.9.0', 'state': 'open'}, - {'title': '0.10.0', 'state': 'open'}, - {'title': '0.8.0', 'state': 'closed'}, - {'title': '0.7.0', 'state': 'closed'} + {"title": "JS-0.4.0", "state": "open"}, + {"title": "1.0.0", "state": "open"}, + {"title": "2.0.0", "state": "open"}, + {"title": "0.9.0", "state": "open"}, + {"title": "0.10.0", "state": "open"}, + {"title": "0.8.0", "state": "closed"}, + {"title": "0.7.0", "state": "closed"}, ] -SOURCE_VERSIONS = [FakeMilestone(raw['title'], raw['state']) - for raw in RAW_VERSION_JSON] -fake_issue_id = 'GH-1234' -fields = FakeFields([FakeAssignee('groundhog')._asdict()], - [FakeLabel('Component: C++')._asdict(), - FakeLabel('Component: Format')._asdict()], - FakeMilestone('', 'open')._asdict(), - 'open', '[C++][Format] The issue Title') +SOURCE_VERSIONS = [ + FakeMilestone(raw["title"], raw["state"]) for raw in RAW_VERSION_JSON +] +fake_issue_id = "GH-1234" +fields = FakeFields( + [FakeAssignee("groundhog")._asdict()], + [FakeLabel("Component: C++")._asdict(), FakeLabel("Component: Format")._asdict()], + FakeMilestone("", "open")._asdict(), + "open", + "[C++][Format] The issue Title", +) FAKE_ISSUE_1 = FakeIssue(fields) class FakeGitHub: - - def __init__(self, issues=None, project_versions=None, state='open'): + def __init__(self, issues=None, project_versions=None, state="open"): self._issues = issues self._project_versions = project_versions self._state = state @@ -66,13 +69,11 @@ def issue(self): @property def current_versions(self): - return [ - v.title for v in self._project_versions if not v.state == 'closed' - ] + return [v.title for v in self._project_versions if v.state != "closed"] @property def current_fix_versions(self): - return 'JS-0.4.0' + return "JS-0.4.0" @property def 
state(self): @@ -86,12 +87,13 @@ def get_milestones(self): def assign_milestone(self, issue_id, milestone): self._transitions.append( - {'action': 'assign_milestone', 'issue_id': issue_id, - 'milestone': milestone}) + {"action": "assign_milestone", "issue_id": issue_id, "milestone": milestone} + ) def close_issue(self, issue_id, comment): self._transitions.append( - {'action': 'close_issue', 'issue_id': issue_id, 'comment': comment}) + {"action": "close_issue", "issue_id": issue_id, "comment": comment} + ) @property def captured_transitions(self): @@ -99,7 +101,6 @@ def captured_transitions(self): class FakeCLI: - def __init__(self, responses=()): self.responses = responses self.position = 0 @@ -114,49 +115,45 @@ def fail(self, msg): def test_gh_fix_versions(): - gh = FakeGitHub(issues={fake_issue_id: FAKE_ISSUE_1}, - project_versions=SOURCE_VERSIONS) + gh = FakeGitHub( + issues={fake_issue_id: FAKE_ISSUE_1}, project_versions=SOURCE_VERSIONS + ) issue = merge_arrow_pr.GitHubIssue(gh, fake_issue_id, FakeCLI()) - fix_version = merge_arrow_pr.get_candidate_fix_version( - issue.current_versions - ) - assert fix_version == '1.0.0' + fix_version = merge_arrow_pr.get_candidate_fix_version(issue.current_versions) + assert fix_version == "1.0.0" def test_gh_fix_versions_filters_maintenance(): maintenance_branches = ["maint-1.0.0"] - gh = FakeGitHub(issues={fake_issue_id: FAKE_ISSUE_1}, - project_versions=SOURCE_VERSIONS) + gh = FakeGitHub( + issues={fake_issue_id: FAKE_ISSUE_1}, project_versions=SOURCE_VERSIONS + ) issue = merge_arrow_pr.GitHubIssue(gh, fake_issue_id, FakeCLI()) fix_version = merge_arrow_pr.get_candidate_fix_version( - issue.current_versions, - maintenance_branches=maintenance_branches + issue.current_versions, maintenance_branches=maintenance_branches ) - assert fix_version == '2.0.0' + assert fix_version == "2.0.0" def test_gh_only_suggest_major_release(): versions_json = [ - {'name': '0.9.1', 'state': "open"}, - {'name': '0.10.0', 'state': "open"}, - {'name': '1.0.0', 'state': "open"}, + {"name": "0.9.1", "state": "open"}, + {"name": "0.10.0", "state": "open"}, + {"name": "1.0.0", "state": "open"}, ] - versions = [FakeMilestone(raw['name'], raw['state']) for raw in versions_json] + versions = [FakeMilestone(raw["name"], raw["state"]) for raw in versions_json] gh = FakeGitHub(issues={fake_issue_id: FAKE_ISSUE_1}, project_versions=versions) issue = merge_arrow_pr.GitHubIssue(gh, fake_issue_id, FakeCLI()) - fix_version = merge_arrow_pr.get_candidate_fix_version( - issue.current_versions - ) - assert fix_version == '1.0.0' + fix_version = merge_arrow_pr.get_candidate_fix_version(issue.current_versions) + assert fix_version == "1.0.0" def test_gh_invalid_issue(): class Mock: - def issue(self, gh_id): raise Exception("not found") @@ -165,124 +162,142 @@ def issue(self, gh_id): def test_gh_resolve(): - gh = FakeGitHub(issues={fake_issue_id: FAKE_ISSUE_1}, - project_versions=SOURCE_VERSIONS) + gh = FakeGitHub( + issues={fake_issue_id: FAKE_ISSUE_1}, project_versions=SOURCE_VERSIONS + ) - my_comment = 'my comment' + my_comment = "my comment" fix_version = "0.10.0" issue = merge_arrow_pr.GitHubIssue(gh, fake_issue_id, FakeCLI()) issue.resolve(fix_version, my_comment, pr_body="") assert len(gh.captured_transitions) == 2 - assert gh.captured_transitions[0]['action'] == 'assign_milestone' - assert gh.captured_transitions[1]['action'] == 'close_issue' - assert gh.captured_transitions[1]['comment'] == my_comment - assert gh.captured_transitions[0]['milestone'] == fix_version + assert 
gh.captured_transitions[0]["action"] == "assign_milestone" + assert gh.captured_transitions[1]["action"] == "close_issue" + assert gh.captured_transitions[1]["comment"] == my_comment + assert gh.captured_transitions[0]["milestone"] == fix_version def test_gh_resolve_non_mainline(): - gh = FakeGitHub(issues={fake_issue_id: FAKE_ISSUE_1}, - project_versions=SOURCE_VERSIONS) + gh = FakeGitHub( + issues={fake_issue_id: FAKE_ISSUE_1}, project_versions=SOURCE_VERSIONS + ) - my_comment = 'my comment' + my_comment = "my comment" fix_version = "JS-0.4.0" issue = merge_arrow_pr.GitHubIssue(gh, fake_issue_id, FakeCLI()) issue.resolve(fix_version, my_comment, "") assert len(gh.captured_transitions) == 2 - assert gh.captured_transitions[1]['comment'] == my_comment - assert gh.captured_transitions[0]['milestone'] == fix_version + assert gh.captured_transitions[1]["comment"] == my_comment + assert gh.captured_transitions[0]["milestone"] == fix_version def test_gh_resolve_released_fix_version(): # ARROW-5083 - gh = FakeGitHub(issues={fake_issue_id: FAKE_ISSUE_1}, - project_versions=SOURCE_VERSIONS) + gh = FakeGitHub( + issues={fake_issue_id: FAKE_ISSUE_1}, project_versions=SOURCE_VERSIONS + ) - cmd = FakeCLI(responses=['1.0.0']) + cmd = FakeCLI(responses=["1.0.0"]) fix_versions_json = merge_arrow_pr.prompt_for_fix_version(cmd, gh) assert fix_versions_json == "1.0.0" def test_multiple_authors_bad_input(): - a0 = 'Jimbob Crawfish <jimbob.crawfish@gmail.com>' - a1 = 'Jarvis McCratchett <jarvis.mccratchett@hotmail.com>' - a2 = 'Hank Miller <hank.miller@protonmail.com>' + a0 = "Jimbob Crawfish <jimbob.crawfish@gmail.com>" + a1 = "Jarvis McCratchett <jarvis.mccratchett@hotmail.com>" + a2 = "Hank Miller <hank.miller@protonmail.com>" distinct_authors = [a0, a1] - cmd = FakeCLI(responses=['']) - primary_author, distinct_other_authors = \ - merge_arrow_pr.get_primary_author(cmd, distinct_authors) + cmd = FakeCLI(responses=[""]) + primary_author, distinct_other_authors = merge_arrow_pr.get_primary_author( + cmd, distinct_authors + ) assert primary_author == a0 assert distinct_other_authors == [a1] - cmd = FakeCLI(responses=['oops', a1]) - primary_author, distinct_other_authors = \ - merge_arrow_pr.get_primary_author(cmd, distinct_authors) + cmd = FakeCLI(responses=["oops", a1]) + primary_author, distinct_other_authors = merge_arrow_pr.get_primary_author( + cmd, distinct_authors + ) assert primary_author == a1 assert distinct_other_authors == [a0] cmd = FakeCLI(responses=[a2]) - primary_author, distinct_other_authors = \ - merge_arrow_pr.get_primary_author(cmd, distinct_authors) + primary_author, distinct_other_authors = merge_arrow_pr.get_primary_author( + cmd, distinct_authors + ) assert primary_author == a2 assert distinct_other_authors == [a0, a1] def test_gh_already_resolved(): - fields = FakeFields([FakeAssignee('groundhog')._asdict()], - [FakeLabel('Component: Java')._asdict()], - FakeMilestone('', 'open')._asdict(), - 'closed', '[Java] The issue Title') + fields = FakeFields( + [FakeAssignee("groundhog")._asdict()], + [FakeLabel("Component: Java")._asdict()], + FakeMilestone("", "open")._asdict(), + "closed", + "[Java] The issue Title", + ) issue = FakeIssue(fields) - gh = FakeGitHub(issues={fake_issue_id: issue}, - project_versions=SOURCE_VERSIONS) + gh = FakeGitHub(issues={fake_issue_id: issue}, project_versions=SOURCE_VERSIONS) fix_versions = [SOURCE_VERSIONS[0]._asdict()] issue = merge_arrow_pr.GitHubIssue(gh, fake_issue_id, FakeCLI()) - with pytest.raises(Exception, - match="GitHub issue GH-1234 already 
has status 'closed'"): + with pytest.raises( + Exception, match="GitHub issue GH-1234 already has status 'closed'" + ): issue.resolve(fix_versions, "", "") def test_gh_output_no_components(): # ARROW-5472 - status = 'Interesting work' + status = "Interesting work" output = merge_arrow_pr.format_issue_output( - 'github', 'GH-1234', 'Resolved', status, - 'username', [] + "github", "GH-1234", "Resolved", status, "username", [] ) - assert output == """=== GITHUB GH-1234 === + assert ( + output + == """=== GITHUB GH-1234 === Summary\t\tInteresting work Assignee\tusername Components\tNO COMPONENTS!!! Status\t\tResolved URL\t\thttps://github.com/apache/arrow/issues/1234""" + ) output = merge_arrow_pr.format_issue_output( - 'github', 'GH-1234', 'Resolved', status, 'username', - [FakeLabel('C++'), FakeLabel('Python')] + "github", + "GH-1234", + "Resolved", + status, + "username", + [FakeLabel("C++"), FakeLabel("Python")], ) - assert output == """=== GITHUB GH-1234 === + assert ( + output + == """=== GITHUB GH-1234 === Summary\t\tInteresting work Assignee\tusername Components\tC++, Python Status\t\tResolved URL\t\thttps://github.com/apache/arrow/issues/1234""" + ) def test_sorting_versions(): versions_json = [ - {'title': '11.0.0', 'state': 'open'}, - {'title': '9.0.0', 'state': 'open'}, - {'title': '10.0.0', 'state': 'open'}, + {"title": "11.0.0", "state": "open"}, + {"title": "9.0.0", "state": "open"}, + {"title": "10.0.0", "state": "open"}, ] - versions = [FakeMilestone(raw['title'], raw['state']) for raw in versions_json] + versions = [FakeMilestone(raw["title"], raw["state"]) for raw in versions_json] fix_version = merge_arrow_pr.get_candidate_fix_version([x.title for x in versions]) assert fix_version == "9.0.0" diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index b738dc04b0c81..a4c50b5e08d69 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1958,7 +1958,8 @@ cdef class Array(_PandasConvertible): inner_array = pyarrow_unwrap_array(casted_array) except ArrowInvalid as e: raise ValueError( - f"Could not cast {self.type} to requested type {target_type}: {e}" + f"Could not cast {self.type} to requested " + f"type {target_type}: {e}" ) else: inner_array = self.sp_array @@ -2103,7 +2104,8 @@ cdef class Array(_PandasConvertible): inner_array = pyarrow_unwrap_array(casted_array) except ArrowInvalid as e: raise ValueError( - f"Could not cast {self.type} to requested type {target_type}: {e}" + f"Could not cast {self.type} to requested " + f"type {target_type}: {e}" ) else: inner_array = self.sp_array diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 2c92ecbfa7344..1411c57ac351f 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -177,7 +177,8 @@ def _ensure_cuda_loaded(): if __cuda_loaded is not True: raise ImportError( "Trying to import data on a CUDA device, but PyArrow is not built with " - f"CUDA support.\n(importing 'pyarrow.cuda' resulted in \"{__cuda_loaded}\")." + "CUDA support.\n" + f"(importing 'pyarrow.cuda' resulted in \"{__cuda_loaded}\")." 
) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 5a6cd390489bf..f5627f90ee1e4 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1409,7 +1409,8 @@ cdef class ChunkedArray(_PandasConvertible): chunked = self.cast(target_type, safe=True) except ArrowInvalid as e: raise ValueError( - f"Could not cast {self.type} to requested type {target_type}: {e}" + f"Could not cast {self.type} to requested " + f"type {target_type}: {e}" ) else: chunked = self @@ -3818,7 +3819,8 @@ cdef class RecordBatch(_Tabular): inner_batch = pyarrow_unwrap_batch(casted_batch) except ArrowInvalid as e: raise ValueError( - f"Could not cast {self.schema} to requested schema {target_schema}: {e}" + f"Could not cast {self.schema} to requested " + f"schema {target_schema}: {e}" ) else: inner_batch = self.sp_batch @@ -3997,7 +3999,8 @@ cdef class RecordBatch(_Tabular): inner_batch = pyarrow_unwrap_batch(casted_batch) except ArrowInvalid as e: raise ValueError( - f"Could not cast {self.schema} to requested schema {target_schema}: {e}" + f"Could not cast {self.schema} to requested " + f"schema {target_schema}: {e}" ) else: inner_batch = self.sp_batch diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index b6aaa2840d83c..7c18976025834 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -25,6 +25,7 @@ import tempfile import textwrap import threading + import time from shutil import copytree from urllib.parse import quote @@ -1546,10 +1547,15 @@ def test_parquet_fragment_statistics(tempdir): fragment = list(dataset.get_fragments())[0] - import datetime - def dt_s(x): return datetime.datetime(1970, 1, 1, 0, 0, x) - def dt_ms(x): return datetime.datetime(1970, 1, 1, 0, 0, 0, x*1000) - def dt_us(x): return datetime.datetime(1970, 1, 1, 0, 0, 0, x) + def dt_s(x): + return datetime.datetime(1970, 1, 1, 0, 0, x) + + def dt_ms(x): + return datetime.datetime(1970, 1, 1, 0, 0, 0, x*1000) + + def dt_us(x): + return datetime.datetime(1970, 1, 1, 0, 0, 0, x) + date = datetime.date time = datetime.time diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 4ad04c9ad1ecb..db761da546f26 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -4054,9 +4054,11 @@ def test_nested_with_timestamp_tz(): if unit in ['s', 'ms']: # This is used for verifying timezone conversion to micros are not # important - def truncate(x): return x.replace(microsecond=0) + def truncate(x): + return x.replace(microsecond=0) else: - def truncate(x): return x + def truncate(x): + return x arr = pa.array([ts], type=pa.timestamp(unit)) arr2 = pa.array([ts], type=pa.timestamp(unit, tz='America/New_York')) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 27d63a67fedbd..83891a12c25fa 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -148,7 +148,8 @@ cdef void* _as_c_pointer(v, allow_null=False) except *: else: capsule_name_str = capsule_name.decode() raise ValueError( - f"Can't convert PyCapsule with name '{capsule_name_str}' to pointer address" + f"Can't convert PyCapsule with name '{capsule_name_str}' " + "to pointer address" ) else: raise TypeError(f"Expected a pointer value, got {type(v)!r}") diff --git a/python/setup.cfg b/python/setup.cfg index 3df4ff27ef87a..9880336bbff52 100644 --- a/python/setup.cfg +++ b/python/setup.cfg @@ -34,8 +34,11 @@ filterwarnings = faulthandler_timeout = 300 [pep8] -ignore = 
E211,E225,E226,E227,E402,W503,W504 +ignore = E203,E211,E225,E226,E227,E402,W503,W504 max_line_length = 88 [flake8] +ignore = E203,E211,E225,E226,E227,E402,W503,W504 max-line-length = 88 + +
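
A note on `docstring-code-format = true` in the new dev/ruff.toml: together with `docstring-code-line-length = 88` it makes `ruff format` rewrite code embedded in docstrings (doctests, Markdown and reStructuredText code blocks) the same way it formats ordinary code. A minimal sketch of what that covers, using a hypothetical helper that is not part of this patch:

    def encode_name(library, name):
        """Join a library prefix and a symbol name.

        With docstring-code-format enabled, ruff format treats the doctest
        below as code: it normalizes spacing and quotes and wraps it at
        docstring-code-line-length.

        >>> encode_name("garrow", "is_valid")
        'garrow_is_valid'
        """
        return f"{library}_{name}"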
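
The `[lint.isort]` setting `required-imports = ["from __future__ import annotations"]` is what inserts the `from __future__ import annotations` line at the top of the Python files above (ruff's missing-required-import fix, I002). With `target-version = "py39"`, the practical effect is that annotations are stored as strings rather than evaluated at import time, so newer annotation syntax still runs on Python 3.9. A minimal, hypothetical example (names are illustrative, not from the patch):

    from __future__ import annotations


    def first_open_version(titles: list[str], skip: str | None = None) -> str | None:
        # Annotations are postponed (kept as strings), so the "str | None"
        # unions above are never evaluated at import time and the module
        # still runs on Python 3.9, the target-version declared in ruff.toml.
        for title in titles:
            if skip is None or title != skip:
                return title
        return None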
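
On the python/setup.cfg change: E203 ("whitespace before ':'") is a pycodestyle check commonly disabled alongside a Black-compatible formatter, because that style (the one ruff format follows, per the `[format]` comments in dev/ruff.toml) puts a space on both sides of the slice colon when the bounds are expressions. A small illustrative snippet, not taken from the patch:

    def middle(values, margin=1):
        # Black-compatible formatting spaces the slice colon symmetrically when
        # either bound is an expression; the space before ":" is exactly what
        # pycodestyle reports as E203, hence the new ignore entries.
        return values[margin + 1 : len(values) - 1]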