gpu-mode · msaroufim · Feb 12, 2025 · Feb 12, 2025 · Feb 12, 2025 · Feb 12, 2025
diff --git a/examples/conv2d_py/eval.py b/examples/conv2d_py/eval.py
@@ -0,0 +1 @@
+../eval.py
diff --git a/examples/conv2d_py/reference.py b/examples/conv2d_py/reference.py
@@ -0,0 +1,62 @@
+from utils import verbose_allclose
+import torch
+import torch.nn.functional as F
+from task import input_t, output_t, KernelSpec
+
+def ref_kernel(data: input_t, spec: KernelSpec) -> output_t:
+    """
+    Compute the expected 2D convolution result with torch.nn.functional.conv2d.
+
+    Only stride and padding are passed; every other option keeps its PyTorch default.
+
+    Args:
+        data: Pair ``(input, kernel)`` forwarded to ``F.conv2d`` in that order
+        spec: Supplies the ``stride`` and ``padding`` of the convolution
+    Returns:
+        The tensor returned by ``F.conv2d``
+    """
+    image, weight = data
+    conv_options = {"stride": spec.stride, "padding": spec.padding}
+    # Keyword expansion keeps the call identical to passing stride/padding directly.
+    return F.conv2d(image, weight, **conv_options)
+
+def generate_input(size: int, kernel_size: int, channels: int, batch: int, seed: int) -> input_t:
+    """
+    Build a reproducible random (input, kernel) pair on the CUDA device.
+
+    Both tensors are float32 and share one generator seeded with ``seed``.
+
+    Args:
+        size: Height and width of the square input
+        kernel_size: Height and width of the square kernel
+        channels: Channel count, used for both input and output channels
+        batch: Number of samples in the batch
+        seed: Seed for the CUDA random generator
+    Returns:
+        Tuple of (input tensor, kernel tensor)
+    """
+    rng = torch.Generator(device='cuda').manual_seed(seed)
+    sample_opts = dict(device='cuda', dtype=torch.float32, generator=rng)
+
+    # Layout: [batch, in_channels, height, width]
+    input_shape = (batch, channels, size, size)
+    # Layout: [out_channels, in_channels, kernel_height, kernel_width]
+    kernel_shape = (channels, channels, kernel_size, kernel_size)
+
+    # Draw order is part of the contract: the input consumes the generator first.
+    input_tensor = torch.randn(input_shape, **sample_opts).contiguous()
+    kernel = torch.randn(kernel_shape, **sample_opts).contiguous()
+    return (input_tensor, kernel)
+
+def check_implementation(data: input_t, spec: KernelSpec, output: output_t) -> str:
+    """
+    Compare a submission's output with the reference convolution.
+
+    Returns an empty string on success, otherwise a message carrying the
+    first mismatch reported by ``verbose_allclose``.
+    """
+    mismatches = verbose_allclose(output, ref_kernel(data, spec), rtol=1e-3, atol=1e-3)
+    if not mismatches:
+        return ''
+    # Only the first reported reason is surfaced to the caller.
+    return "mismatch found! custom implementation doesn't match reference: " + mismatches[0]
diff --git a/examples/conv2d_py/task.py b/examples/conv2d_py/task.py
@@ -0,0 +1,18 @@
+from typing import TypedDict, TypeVar, Tuple
+import torch
+from dataclasses import dataclass
+
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor])  # (input tensor, kernel tensor) pair
+output_t = TypeVar("output_t", bound=torch.Tensor)  # tensor produced by the convolution
+
+@dataclass
+class KernelSpec:
+    stride: int  # passed as `stride` to F.conv2d by the reference kernel
+    padding: int  # passed as `padding` to F.conv2d by the reference kernel
+
+class TestSpec(TypedDict):  # keys mirror the parameters of reference.generate_input
+    size: int  # height and width of the square input
+    kernel_size: int  # height and width of the square kernel
+    channels: int  # used for both input and output channel counts
+    batch: int  # batch dimension of the input tensor
+    seed: int  # seed for the random generator
diff --git a/examples/conv2d_py/task.yml b/examples/conv2d_py/task.yml
@@ -0,0 +1,31 @@
+# name: conv2d-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a 2D convolution kernel using a CUDA inline function that matches the reference implementation.
+  The kernel should perform 2D convolution with the given specifications (stride and padding).
+
+config:
+  main: "eval.py" 
+
+tests:
+  - {"size": 32, "kernel_size": 3, "channels": 16, "batch": 1, "seed": 4242}
+  - {"size": 32, "kernel_size": 5, "channels": 16, "batch": 2, "seed": 5236}
+  - {"size": 64, "kernel_size": 3, "channels": 32, "batch": 1, "seed": 1001}
+  - {"size": 64, "kernel_size": 5, "channels": 32, "batch": 2, "seed": 5531}
+  - {"size": 128, "kernel_size": 3, "channels": 64, "batch": 1, "seed": 9173}
+
+benchmarks:
+  - {"size": 128, "kernel_size": 3, "channels": 64, "batch": 4, "seed": 54352}
+  - {"size": 128, "kernel_size": 5, "channels": 64, "batch": 4, "seed": 93246}
+  - {"size": 256, "kernel_size": 3, "channels": 128, "batch": 2, "seed": 6256}
+  - {"size": 256, "kernel_size": 5, "channels": 128, "batch": 2, "seed": 8841}
+  - {"size": 512, "kernel_size": 3, "channels": 256, "batch": 1, "seed": 6252} 
diff --git a/examples/conv2d_py/utils.py b/examples/conv2d_py/utils.py
@@ -0,0 +1 @@
+../utils.py
diff --git a/examples/grayscale_py/eval.py b/examples/grayscale_py/eval.py
@@ -0,0 +1 @@
+../eval.py
diff --git a/examples/grayscale_py/reference.py b/examples/grayscale_py/reference.py
@@ -0,0 +1,44 @@
+from utils import verbose_allclose
+import torch
+from task import input_t, output_t
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Convert an RGB image to grayscale as a weighted sum of its channels.
+
+    Applies Y = 0.2989 R + 0.5870 G + 0.1140 B along the last dimension.
+
+    Args:
+        data: RGB tensor of shape (H, W, 3) with values in [0, 1]
+    Returns:
+        Grayscale tensor of shape (H, W) with values in [0, 1]
+    """
+    # Coefficients are created on the input's device and with its dtype.
+    luma = torch.tensor([0.2989, 0.5870, 0.1140], device=data.device, dtype=data.dtype)
+    weighted = data * luma  # broadcasts the three weights over the last dimension
+    return weighted.sum(dim=-1)
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Create a reproducible random RGB image on the CUDA device.
+
+    Args:
+        size: Height and width of the square image
+        seed: Seed for the CUDA random generator
+    Returns:
+        float32 tensor of shape (size, size, 3) with uniform values in [0, 1)
+    """
+    rng = torch.Generator(device='cuda').manual_seed(seed)
+    return torch.rand((size, size, 3), device='cuda', dtype=torch.float32, generator=rng).contiguous()
+
+def check_implementation(data: input_t, output: output_t) -> str:
+    """
+    Compare a submission's output with the reference grayscale conversion.
+
+    Returns an empty string on success, otherwise a message carrying the
+    first mismatch reported by ``verbose_allclose``.
+    """
+    mismatches = verbose_allclose(output, ref_kernel(data), rtol=1e-4, atol=1e-4)
+    if not mismatches:
+        return ''
+    return "mismatch found! custom implementation doesn't match reference: " + mismatches[0]
diff --git a/examples/grayscale_py/task.py b/examples/grayscale_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)  # Input will be (H, W, 3) RGB tensor
+output_t = TypeVar("output_t", bound=torch.Tensor)  # Output will be (H, W) grayscale tensor
+
+class TestSpec(TypedDict):  # keys mirror the parameters of reference.generate_input
+    size: int  # Size of the square image (H=W)
+    seed: int  # Seed for the random generator that produces the image
diff --git a/examples/grayscale_py/task.yml b/examples/grayscale_py/task.yml
@@ -0,0 +1,33 @@
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "eval.py"}
+
+lang: "py"
+
+description: |
+  Implement an RGB to grayscale conversion kernel that matches the reference implementation.
+  The kernel should convert RGB images to grayscale using the standard coefficients:
+  Y = 0.2989 R + 0.5870 G + 0.1140 B
+
+  Input: RGB tensor of shape (H, W, 3) with values in [0, 1]
+  Output: Grayscale tensor of shape (H, W) with values in [0, 1]
+
+config:
+  main: "eval.py" 
+
+tests:
+  - {"size": 127, "seed": 4242}
+  - {"size": 128, "seed": 5236}
+  - {"size": 129, "seed": 1001}
+  - {"size": 256, "seed": 5531}
+  - {"size": 512, "seed": 9173}
+
+benchmarks:
+  - {"size": 1024, "seed": 54352}
+  - {"size": 2048, "seed": 93246}
+  - {"size": 4096, "seed": 6256}
+  - {"size": 8192, "seed": 8841}
+  - {"size": 16384, "seed": 6252} 
diff --git a/examples/grayscale_py/utils.py b/examples/grayscale_py/utils.py
@@ -0,0 +1 @@
+../utils.py
diff --git a/examples/histogram_py/eval.py b/examples/histogram_py/eval.py
diff --git a/examples/histogram_py/reference.py b/examples/histogram_py/reference.py
@@ -0,0 +1,47 @@
+from utils import verbose_allclose
+import torch
+from task import input_t, output_t, HistogramSpec
+
+def ref_kernel(data: input_t, spec: HistogramSpec) -> output_t:
+    """
+    Count how many input values fall into each of ``spec.num_bins`` equal-width bins.
+
+    Values outside [spec.min_val, spec.max_val] are clamped into the edge bins.
+
+    Args:
+        data: Input tensor to compute histogram on
+        spec: Histogram specifications (num_bins, min_val, max_val)
+    Returns:
+        float32 tensor containing bin counts
+    """
+    lo, hi, n_bins = spec.min_val, spec.max_val, spec.num_bins
+    step = (hi - lo) / n_bins
+    # Clamp first so every value maps to a non-negative offset from ``lo``.
+    offsets = data.clamp(lo, hi) - lo
+    # Truncate to an integer bin; a value equal to ``hi`` would land on ``n_bins``, so cap it.
+    bins = (offsets / step).long().clamp(0, n_bins - 1)
+    return torch.bincount(bins, minlength=n_bins).to(torch.float32)
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates a reproducible 1-D float32 CUDA tensor of standard-normal samples.
+    The values are NOT confined to [0, 1]: torch.randn draws from N(0, 1), and
+    ref_kernel clamps anything outside the histogram range into the edge bins.
+    Returns:
+        Tensor of shape (size,) to compute histogram on
+    """
+    gen = torch.Generator(device='cuda').manual_seed(seed)
+    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+
+def check_implementation(data: input_t, spec: HistogramSpec, output: output_t) -> str:
+    """
+    Compare a submission's output with the reference histogram.
+
+    Returns an empty string on success, otherwise a message carrying the
+    first mismatch reported by ``verbose_allclose`` (called with its default tolerances).
+    """
+    mismatches = verbose_allclose(output, ref_kernel(data, spec))
+    if not mismatches:
+        return ''
+    # Only the first reported reason is surfaced to the caller.
+    return "mismatch found! custom implementation doesn't match reference: " + mismatches[0]
diff --git a/examples/histogram_py/task.py b/examples/histogram_py/task.py
@@ -0,0 +1,16 @@
+from typing import TypedDict, TypeVar
+import torch
+from dataclasses import dataclass
+
+input_t = TypeVar("input_t", bound=torch.Tensor)  # values to be binned
+output_t = TypeVar("output_t", bound=torch.Tensor)  # per-bin counts
+
+@dataclass
+class HistogramSpec:
+    num_bins: int  # number of equal-width bins the range is divided into
+    min_val: float  # lower edge of the histogram range; smaller inputs are clamped to it
+    max_val: float  # upper edge of the histogram range; larger inputs are clamped to it
+
+class TestSpec(TypedDict):  # keys mirror the parameters of reference.generate_input
+    size: int  # number of input elements to generate
+    seed: int  # seed for the random generator
diff --git a/examples/histogram_py/task.yml b/examples/histogram_py/task.yml
@@ -0,0 +1,31 @@
+# name: histogram-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a histogram kernel using a CUDA inline function that matches the reference implementation.
+  The kernel should count the number of elements falling into each bin across the specified range.
+
+config:
+  main: "eval.py" 
+
+tests:
+  - {"size": 1023, "seed": 4242}
+  - {"size": 1024, "seed": 5236}
+  - {"size": 1025, "seed": 1001}
+  - {"size": 2048, "seed": 5531}
+  - {"size": 4096, "seed": 9173}
+
+benchmarks:
+  - {"size": 8192, "seed": 54352}
+  - {"size": 16384, "seed": 93246}
+  - {"size": 32768, "seed": 6256}
+  - {"size": 65536, "seed": 8841}
+  - {"size": 131072, "seed": 6252} 
diff --git a/examples/histogram_py/utils.py b/examples/histogram_py/utils.py
@@ -0,0 +1 @@
+../utils.py
diff --git a/examples/prefixsum_py/eval.py b/examples/prefixsum_py/eval.py
@@ -0,0 +1 @@
+../eval.py
diff --git a/examples/prefixsum_py/reference.py b/examples/prefixsum_py/reference.py
@@ -0,0 +1,35 @@
+from utils import verbose_allclose
+import torch
+from task import input_t, output_t
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Compute the inclusive prefix sum of ``data`` along dimension 0 via cumsum.
+    Args:
+        data: Input tensor to compute prefix sum on
+    Returns:
+        Tensor of running totals, the same shape as ``data``
+    """
+    return data.cumsum(dim=0)
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Create a reproducible float32 vector of standard-normal samples on the CUDA device.
+    Args:
+        size: Number of elements to generate
+        seed: Seed for the CUDA random generator
+    """
+    rng = torch.Generator(device='cuda').manual_seed(seed)
+    return torch.randn(size, device='cuda', dtype=torch.float32, generator=rng).contiguous()
+
+def check_implementation(data: input_t, output: output_t) -> str:
+    """
+    Compare a submission's output with the reference prefix sum.
+
+    Returns an empty string on success, otherwise a message carrying the
+    first mismatch reported by ``verbose_allclose``.
+    """
+    mismatches = verbose_allclose(output, ref_kernel(data), rtol=1e-5, atol=1e-5)
+    if not mismatches:
+        return ''
+    return "mismatch found! custom implementation doesn't match reference: " + mismatches[0]
diff --git a/examples/prefixsum_py/task.py b/examples/prefixsum_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)  # values to scan
+output_t = TypeVar("output_t", bound=torch.Tensor)  # inclusive prefix sum of the input
+
+class TestSpec(TypedDict):  # keys mirror the parameters of reference.generate_input
+    size: int  # number of input elements to generate
+    seed: int  # seed for the random generator