Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Practice Problems #156

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/conv2d_py/eval.py
62 changes: 62 additions & 0 deletions examples/conv2d_py/reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from utils import verbose_allclose
import torch
import torch.nn.functional as F
from task import input_t, output_t, KernelSpec

def ref_kernel(data: input_t, spec: KernelSpec) -> output_t:
    """
    Compute the reference 2D convolution with ``torch.nn.functional.conv2d``.

    Args:
        data: Pair ``(input tensor, kernel tensor)``; the kernel is used as
            the convolution weight.
        spec: Convolution settings; ``spec.stride`` and ``spec.padding`` are
            forwarded unchanged.
    Returns:
        The convolution result. No bias argument is passed.
    """
    image, weight = data
    conv_options = {"stride": spec.stride, "padding": spec.padding}
    return F.conv2d(image, weight, **conv_options)

def generate_input(
    size: int,
    kernel_size: int,
    channels: int,
    batch: int,
    seed: int,
    *,
    device: str = "cuda",
) -> input_t:
    """
    Generates seeded random input and kernel tensors.

    Args:
        size: Height and width of the square input.
        kernel_size: Height and width of the square kernel.
        channels: Number of input channels; the kernel uses the same number
            of output channels for simplicity.
        batch: Number of samples in the batch.
        seed: Seed applied to the random generator before sampling.
        device: Device on which the generator and both tensors are created.
            Defaults to ``"cuda"`` (the previous hard-coded value); pass
            ``"cpu"`` to generate inputs on a host without a GPU.
    Returns:
        Tuple of (input tensor, kernel tensor), both float32.
    """
    gen = torch.Generator(device=device)
    gen.manual_seed(seed)

    # Input tensor: [batch, in_channels, height, width]
    input_tensor = torch.randn(
        batch, channels, size, size,
        device=device,
        dtype=torch.float32,
        generator=gen,
    ).contiguous()

    # Kernel tensor: [out_channels, in_channels, kernel_height, kernel_width]
    # The same generator is reused, so the kernel is drawn after the input.
    kernel = torch.randn(
        channels, channels, kernel_size, kernel_size,
        device=device,
        dtype=torch.float32,
        generator=gen,
    ).contiguous()

    return (input_tensor, kernel)

def check_implementation(
    data: input_t,
    spec: KernelSpec,
    output: output_t,
) -> str:
    """
    Compare a submission's output against the reference convolution.

    Args:
        data: The (input tensor, kernel tensor) pair given to the submission.
        spec: Convolution settings used for the reference computation.
        output: Tensor produced by the implementation under test.
    Returns:
        An empty string when ``verbose_allclose`` reports no differences
        (rtol=1e-3, atol=1e-3); otherwise an error message that includes the
        first reported reason.
    """
    reference = ref_kernel(data, spec)
    mismatches = verbose_allclose(output, reference, rtol=1e-3, atol=1e-3)

    if len(mismatches) == 0:
        return ''

    # Only the first reason is surfaced to keep the message short.
    return "mismatch found! custom implementation doesn't match reference: " + mismatches[0]
18 changes: 18 additions & 0 deletions examples/conv2d_py/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from typing import TypedDict, TypeVar, Tuple
import torch
from dataclasses import dataclass

# Kernel input: a pair of tensors, unpacked by the reference code as
# (input tensor, kernel tensor).
input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor])
# Kernel output: a single tensor holding the convolution result.
output_t = TypeVar("output_t", bound=torch.Tensor)

@dataclass
class KernelSpec:
    """Convolution settings consumed by the reference kernel."""
    # Stride the reference kernel forwards to the convolution call.
    stride: int
    # Padding the reference kernel forwards to the convolution call.
    padding: int

class TestSpec(TypedDict):
    """Parameters of one test/benchmark case; keys match the entries in task.yml."""
    # Height and width of the square input.
    size: int
    # Height and width of the square kernel.
    kernel_size: int
    # Number of channels (used for both input and output channels).
    channels: int
    # Number of samples in the batch.
    batch: int
    # Seed for the random input generator.
    seed: int
31 changes: 31 additions & 0 deletions examples/conv2d_py/task.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# name: conv2d-cuda-inline

files:
- {"name": "submission.py", "source": "@SUBMISSION@"}
- {"name": "task.py", "source": "task.py"}
- {"name": "utils.py", "source": "utils.py"}
- {"name": "reference.py", "source": "reference.py"}
- {"name": "eval.py", "source": "eval.py"}

lang: "py"

description: |
Implement a 2D convolution kernel as an inline CUDA function that matches the reference implementation.
The kernel should perform 2D convolution with the given specifications (stride and padding).

config:
main: "eval.py"

tests:
- {"size": 32, "kernel_size": 3, "channels": 16, "batch": 1, "seed": 4242}
- {"size": 32, "kernel_size": 5, "channels": 16, "batch": 2, "seed": 5236}
- {"size": 64, "kernel_size": 3, "channels": 32, "batch": 1, "seed": 1001}
- {"size": 64, "kernel_size": 5, "channels": 32, "batch": 2, "seed": 5531}
- {"size": 128, "kernel_size": 3, "channels": 64, "batch": 1, "seed": 9173}

benchmarks:
- {"size": 128, "kernel_size": 3, "channels": 64, "batch": 4, "seed": 54352}
- {"size": 128, "kernel_size": 5, "channels": 64, "batch": 4, "seed": 93246}
- {"size": 256, "kernel_size": 3, "channels": 128, "batch": 2, "seed": 6256}
- {"size": 256, "kernel_size": 5, "channels": 128, "batch": 2, "seed": 8841}
- {"size": 512, "kernel_size": 3, "channels": 256, "batch": 1, "seed": 6252}
1 change: 1 addition & 0 deletions examples/conv2d_py/utils.py
1 change: 1 addition & 0 deletions examples/grayscale_py/eval.py
44 changes: 44 additions & 0 deletions examples/grayscale_py/reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from utils import verbose_allclose
import torch
from task import input_t, output_t

def ref_kernel(data: input_t) -> output_t:
    """
    Reference RGB-to-grayscale conversion implemented with PyTorch.

    Each pixel is reduced with the weighted sum
    Y = 0.2989 R + 0.5870 G + 0.1140 B over the last dimension.

    Args:
        data: RGB tensor of shape (H, W, 3) with values in [0, 1]
    Returns:
        Grayscale tensor of shape (H, W) with values in [0, 1]
    """
    # Build the coefficients with the input's dtype and device so the
    # multiplication needs no conversion or transfer.
    luma = torch.tensor(
        [0.2989, 0.5870, 0.1140],
        dtype=data.dtype,
        device=data.device,
    )
    weighted = data * luma
    return weighted.sum(dim=-1)

def generate_input(size: int, seed: int, *, device: str = "cuda") -> input_t:
    """
    Generates a seeded random RGB image tensor of the specified size.

    Args:
        size: Height and width of the square image.
        seed: Seed applied to the random generator before sampling.
        device: Device on which the generator and tensor are created.
            Defaults to ``"cuda"`` (the previous hard-coded value); pass
            ``"cpu"`` to generate inputs on a host without a GPU.
    Returns:
        float32 tensor of shape (size, size, 3) sampled uniformly from [0, 1)
    """
    gen = torch.Generator(device=device)
    gen.manual_seed(seed)
    return torch.rand(
        size, size, 3,
        device=device,
        dtype=torch.float32,
        generator=gen,
    ).contiguous()

def check_implementation(
    data: input_t,
    output: output_t,
) -> str:
    """
    Compare a submission's output against the reference grayscale conversion.

    Args:
        data: RGB tensor that was given to the submission.
        output: Tensor produced by the implementation under test.
    Returns:
        An empty string when ``verbose_allclose`` reports no differences
        (rtol=1e-4, atol=1e-4); otherwise an error message that includes the
        first reported reason.
    """
    reference = ref_kernel(data)
    mismatches = verbose_allclose(output, reference, rtol=1e-4, atol=1e-4)

    if len(mismatches) == 0:
        return ''

    # Only the first reason is surfaced to keep the message short.
    return "mismatch found! custom implementation doesn't match reference: " + mismatches[0]
9 changes: 9 additions & 0 deletions examples/grayscale_py/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from typing import TypedDict, TypeVar
import torch

# Kernel input: RGB image tensor of shape (H, W, 3).
input_t = TypeVar("input_t", bound=torch.Tensor)
# Kernel output: grayscale image tensor of shape (H, W).
output_t = TypeVar("output_t", bound=torch.Tensor)

class TestSpec(TypedDict):
    """Parameters of one test/benchmark case; keys match the entries in task.yml."""
    # Side length of the square image (H = W).
    size: int
    # Seed for the random input generator.
    seed: int
33 changes: 33 additions & 0 deletions examples/grayscale_py/task.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
files:
- {"name": "submission.py", "source": "@SUBMISSION@"}
- {"name": "task.py", "source": "task.py"}
- {"name": "utils.py", "source": "utils.py"}
- {"name": "reference.py", "source": "reference.py"}
- {"name": "eval.py", "source": "eval.py"}

lang: "py"

description: |
Implement an RGB to grayscale conversion kernel that matches the reference implementation.
The kernel should convert RGB images to grayscale using the standard coefficients:
Y = 0.2989 R + 0.5870 G + 0.1140 B

Input: RGB tensor of shape (H, W, 3) with values in [0, 1]
Output: Grayscale tensor of shape (H, W) with values in [0, 1]

config:
main: "eval.py"

tests:
- {"size": 127, "seed": 4242}
- {"size": 128, "seed": 5236}
- {"size": 129, "seed": 1001}
- {"size": 256, "seed": 5531}
- {"size": 512, "seed": 9173}

benchmarks:
- {"size": 1024, "seed": 54352}
- {"size": 2048, "seed": 93246}
- {"size": 4096, "seed": 6256}
- {"size": 8192, "seed": 8841}
- {"size": 16384, "seed": 6252}
1 change: 1 addition & 0 deletions examples/grayscale_py/utils.py
Empty file added examples/histogram_py/eval.py
Empty file.
47 changes: 47 additions & 0 deletions examples/histogram_py/reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from utils import verbose_allclose
import torch
from task import input_t, output_t, HistogramSpec

def ref_kernel(data: input_t, spec: HistogramSpec) -> output_t:
    """
    Reference implementation of histogram using PyTorch.

    Values are clamped into [min_val, max_val], so out-of-range elements are
    counted in the first or last bin rather than dropped. Input of any shape
    is accepted; all elements are counted.

    Args:
        data: Input tensor to compute histogram on
        spec: Histogram specifications (num_bins, min_val, max_val)
    Returns:
        float32 tensor of length ``spec.num_bins`` containing bin counts
    Raises:
        ValueError: If ``spec.num_bins`` is not positive or ``spec.max_val``
            is not greater than ``spec.min_val`` (the bin width would be
            zero, negative, or undefined).
    """
    if spec.num_bins <= 0:
        raise ValueError("num_bins must be positive")
    if not spec.max_val > spec.min_val:
        raise ValueError("max_val must be greater than min_val")

    # Clip values to range
    clipped = torch.clamp(data, spec.min_val, spec.max_val)

    # Scale to bin indices
    bin_width = (spec.max_val - spec.min_val) / spec.num_bins
    indices = ((clipped - spec.min_val) / bin_width).long()
    # A value equal to max_val maps to index num_bins; fold it into the last bin.
    indices = torch.clamp(indices, 0, spec.num_bins - 1)

    # torch.bincount only accepts 1-D input, so flatten before counting.
    return torch.bincount(indices.reshape(-1), minlength=spec.num_bins).to(torch.float32)

def generate_input(size: int, seed: int, *, device: str = "cuda") -> input_t:
    """
    Generates a seeded random 1-D input tensor drawn from a standard normal
    distribution (``torch.randn``), so values are NOT confined to [0, 1].

    Args:
        size: Number of elements to generate.
        seed: Seed applied to the random generator before sampling.
        device: Device on which the generator and tensor are created.
            Defaults to ``"cuda"`` (the previous hard-coded value); pass
            ``"cpu"`` to generate inputs on a host without a GPU.
    Returns:
        float32 tensor of shape (size,) to compute histogram on
    """
    gen = torch.Generator(device=device)
    gen.manual_seed(seed)
    # Generate values with normal distribution for interesting histograms
    return torch.randn(size, device=device, dtype=torch.float32, generator=gen).contiguous()

def check_implementation(
    data: input_t,
    spec: HistogramSpec,
    output: output_t,
) -> str:
    """
    Compare a submission's output against the reference histogram.

    Args:
        data: Tensor that was given to the submission.
        spec: Histogram settings used for the reference computation.
        output: Tensor produced by the implementation under test.
    Returns:
        An empty string when ``verbose_allclose`` (called with its default
        tolerances) reports no differences; otherwise an error message that
        includes the first reported reason.
    """
    reference = ref_kernel(data, spec)
    mismatches = verbose_allclose(output, reference)

    if len(mismatches) == 0:
        return ''

    # Only the first reason is surfaced to keep the message short.
    return "mismatch found! custom implementation doesn't match reference: " + mismatches[0]
16 changes: 16 additions & 0 deletions examples/histogram_py/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from typing import TypedDict, TypeVar
import torch
from dataclasses import dataclass

# Kernel input: tensor of values to be binned.
input_t = TypeVar("input_t", bound=torch.Tensor)
# Kernel output: tensor of per-bin counts.
output_t = TypeVar("output_t", bound=torch.Tensor)

@dataclass
class HistogramSpec:
    """Histogram settings consumed by the reference kernel."""
    # Number of equal-width bins spanning [min_val, max_val].
    num_bins: int
    # Lower edge of the range; the reference kernel clamps smaller values up to it.
    min_val: float
    # Upper edge of the range; the reference kernel clamps larger values down to it.
    max_val: float

class TestSpec(TypedDict):
    """Parameters of one test/benchmark case; keys match the entries in task.yml."""
    # Number of elements in the generated input tensor.
    size: int
    # Seed for the random input generator.
    seed: int
31 changes: 31 additions & 0 deletions examples/histogram_py/task.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# name: histogram-cuda-inline

files:
- {"name": "submission.py", "source": "@SUBMISSION@"}
- {"name": "task.py", "source": "task.py"}
- {"name": "utils.py", "source": "utils.py"}
- {"name": "reference.py", "source": "reference.py"}
- {"name": "eval.py", "source": "eval.py"}

lang: "py"

description: |
Implement a histogram kernel as an inline CUDA function that matches the reference implementation.
The kernel should count the number of elements falling into each bin across the specified range.

config:
main: "eval.py"

tests:
- {"size": 1023, "seed": 4242}
- {"size": 1024, "seed": 5236}
- {"size": 1025, "seed": 1001}
- {"size": 2048, "seed": 5531}
- {"size": 4096, "seed": 9173}

benchmarks:
- {"size": 8192, "seed": 54352}
- {"size": 16384, "seed": 93246}
- {"size": 32768, "seed": 6256}
- {"size": 65536, "seed": 8841}
- {"size": 131072, "seed": 6252}
1 change: 1 addition & 0 deletions examples/histogram_py/utils.py
1 change: 1 addition & 0 deletions examples/prefixsum_py/eval.py
35 changes: 35 additions & 0 deletions examples/prefixsum_py/reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from utils import verbose_allclose
import torch
from task import input_t, output_t

def ref_kernel(data: input_t) -> output_t:
    """
    Reference inclusive prefix sum implemented with PyTorch.

    Args:
        data: Input tensor to scan
    Returns:
        Tensor of the same shape where each element is the running sum of
        the input along dimension 0, including the current element
    """
    running_total = data.cumsum(dim=0)
    return running_total

def generate_input(size: int, seed: int, *, device: str = "cuda") -> input_t:
    """
    Generates a seeded random 1-D input tensor from a standard normal
    distribution.

    Args:
        size: Number of elements to generate.
        seed: Seed applied to the random generator before sampling.
        device: Device on which the generator and tensor are created.
            Defaults to ``"cuda"`` (the previous hard-coded value); pass
            ``"cpu"`` to generate inputs on a host without a GPU.
    Returns:
        float32 tensor of shape (size,) to compute prefix sum on
    """
    gen = torch.Generator(device=device)
    gen.manual_seed(seed)
    return torch.randn(size, device=device, dtype=torch.float32, generator=gen).contiguous()

def check_implementation(
    data: input_t,
    output: output_t,
) -> str:
    """
    Compare a submission's output against the reference prefix sum.

    Args:
        data: Tensor that was given to the submission.
        output: Tensor produced by the implementation under test.
    Returns:
        An empty string when ``verbose_allclose`` reports no differences
        (rtol=1e-5, atol=1e-5); otherwise an error message that includes the
        first reported reason.
    """
    reference = ref_kernel(data)
    # NOTE(review): float32 rounding error accumulates along a scan, so these
    # tolerances may be tight for long inputs - confirm against the task sizes.
    mismatches = verbose_allclose(output, reference, rtol=1e-5, atol=1e-5)

    if len(mismatches) == 0:
        return ''

    # Only the first reason is surfaced to keep the message short.
    return "mismatch found! custom implementation doesn't match reference: " + mismatches[0]
9 changes: 9 additions & 0 deletions examples/prefixsum_py/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from typing import TypedDict, TypeVar
import torch

# Kernel input: tensor to compute the prefix sum on.
input_t = TypeVar("input_t", bound=torch.Tensor)
# Kernel output: tensor holding the inclusive prefix sum.
output_t = TypeVar("output_t", bound=torch.Tensor)

class TestSpec(TypedDict):
    """Parameters of one test/benchmark case, passed to the input generator."""
    # Number of elements in the generated input tensor.
    size: int
    # Seed for the random input generator.
    seed: int
Loading
Loading