This challenge is inspired by fellow members of opensource4you. Over these 100 days (maybe not exactly 100, lol), we are going to explore cutting-edge modern GPU programming techniques, from easy to advanced.
For the first topic, I will begin with the Mojo GPU puzzles.
Puzzle 1: add 10
```mojo
fn add_10(
    output: UnsafePointer[Scalar[dtype]], a: UnsafePointer[Scalar[dtype]]
):
    i = thread_idx.x
    # FILL ME IN (roughly 1 line)
    output[i] = a[i] + 10
```
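For context, every puzzle kernel is launched from host code through a `DeviceContext`. Below is a minimal sketch of what that looks like, loosely following the harness the puzzles ship with; the constants (`SIZE`, `dtype`) are assumptions for illustration:

```mojo
from memory import UnsafePointer
from gpu import thread_idx
from gpu.host import DeviceContext

alias SIZE = 4
alias dtype = DType.float32


fn add_10(
    output: UnsafePointer[Scalar[dtype]], a: UnsafePointer[Scalar[dtype]]
):
    output[thread_idx.x] = a[thread_idx.x] + 10


def main():
    with DeviceContext() as ctx:
        # Allocate device buffers and zero them.
        out = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
        a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
        # Fill the input on the host: a = [0, 1, 2, 3].
        with a.map_to_host() as a_host:
            for i in range(SIZE):
                a_host[i] = i
        # One block of SIZE threads: thread i handles element i.
        ctx.enqueue_function[add_10](
            out.unsafe_ptr(),
            a.unsafe_ptr(),
            grid_dim=1,
            block_dim=SIZE,
        )
        ctx.synchronize()
        with out.map_to_host() as out_host:
            print(out_host)  # expect [10, 11, 12, 13]
```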
Puzzle 2: add
```mojo
fn add(
    output: UnsafePointer[Scalar[dtype]],
    a: UnsafePointer[Scalar[dtype]],
    b: UnsafePointer[Scalar[dtype]],
):
    i = thread_idx.x
    # FILL ME IN (roughly 1 line)
    output[i] = a[i] + b[i]
```
Puzzle 3: add 10 guard
```mojo
fn add_10_guard(
    output: UnsafePointer[Scalar[dtype]],
    a: UnsafePointer[Scalar[dtype]],
    size: Int,
):
    i = thread_idx.x
    # FILL ME IN (roughly 2 lines)
    if i < size:
        output[i] = a[i] + 10
```
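The guard matters because the launch can use more threads than there are elements. A sketch of such a configuration (the exact values are made up for illustration):

```mojo
alias SIZE = 4
alias THREADS_PER_BLOCK = (8, 1)  # more threads than elements
alias BLOCKS_PER_GRID = (1, 1)
# Threads 4..7 have no element to work on; without the `if i < size`
# guard they would read and write out of bounds.
```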
Puzzle 4: add 10 2D
```mojo
fn add_10_2d(
    output: UnsafePointer[Scalar[dtype]],
    a: UnsafePointer[Scalar[dtype]],
    size: Int,
):
    row = thread_idx.y
    col = thread_idx.x
    # FILL ME IN (roughly 2 lines)
    if row < size and col < size:
        output[row * size + col] = a[row * size + col] + 10
```
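The `row * size + col` arithmetic is ordinary row-major flattening of a 2D array into the 1D pointer; a tiny worked example (size chosen arbitrarily):

```mojo
# For size = 3, element (row=1, col=2) lives at flat index 1 * 3 + 2 = 5:
#
#   flat index: 0 1 2 | 3 4 5 | 6 7 8
#   row:          0   |   1   |   2
```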
With layout tensor
LayoutTensor is a data structure that lets us index a multi-dimensional array on the fly, like this: `layout_tensor[row, col]`, or in 3D: `layout_tensor[x, y, z]`.
```mojo
fn add_10_2d(
    output: LayoutTensor[mut=True, dtype, layout],
    a: LayoutTensor[mut=True, dtype, layout],
    size: Int,
):
    row = thread_idx.y
    col = thread_idx.x
    # FILL ME IN (roughly 2 lines)
    if row < size and col < size:
        output[row, col] = a[row, col] + 10
```
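The `layout` parameter used above is defined on the host side; a minimal sketch (the size is an assumption):

```mojo
from layout import Layout, LayoutTensor

alias SIZE = 3
alias dtype = DType.float32
# Row-major 2D layout: LayoutTensor maps (row, col) to row * SIZE + col,
# so the kernel no longer does the flattening by hand.
alias layout = Layout.row_major(SIZE, SIZE)
```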
Puzzle 5: broadcast add
```mojo
fn broadcast_add(
    output: UnsafePointer[Scalar[dtype]],
    a: UnsafePointer[Scalar[dtype]],
    b: UnsafePointer[Scalar[dtype]],
    size: Int,
):
    row = thread_idx.y
    col = thread_idx.x
    # FILL ME IN (roughly 2 lines)
    if row < size and col < size:
        output[row * size + col] = a[col] + b[row]
```
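In other words, this computes the outer sum of the two vectors; a tiny worked example (values made up):

```mojo
# a = [0, 1]    broadcast along rows
# b = [0, 10]   broadcast along columns
#
# output[row][col] = a[col] + b[row]:
#   [[ 0,  1],
#    [10, 11]]
```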
With layout tensor
Important: the LayoutTensor is indexed in 2D in this puzzle; in contrast, the `UnsafePointer` version is always indexed in 1D.
```mojo
fn broadcast_add[
    out_layout: Layout,
    a_layout: Layout,
    b_layout: Layout,
](
    output: LayoutTensor[mut=True, dtype, out_layout],
    a: LayoutTensor[mut=False, dtype, a_layout],
    b: LayoutTensor[mut=False, dtype, b_layout],
    size: Int,
):
    row = thread_idx.y
    col = thread_idx.x
    # FILL ME IN (roughly 2 lines)
    if row < size and col < size:
        output[row, col] = a[0, col] + b[row, 0]
```
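This is also why the kernel indexes `a[0, col]` and `b[row, 0]`: `a` is declared as a 1 x SIZE row vector and `b` as a SIZE x 1 column vector. A sketch of what those host-side layouts could look like (assumed, not shown in the puzzle snippet):

```mojo
from layout import Layout

alias SIZE = 2
alias out_layout = Layout.row_major(SIZE, SIZE)
alias a_layout = Layout.row_major(1, SIZE)  # row vector, indexed as a[0, col]
alias b_layout = Layout.row_major(SIZE, 1)  # column vector, indexed as b[row, 0]
```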
Puzzle 6: add 10 blocks
In this puzzle, we learn how to process data that is larger than a single block of threads; see the launch sketch after the kernel.
```mojo
fn add_10_blocks(
    output: UnsafePointer[Scalar[dtype]],
    a: UnsafePointer[Scalar[dtype]],
    size: Int,
):
    i = block_dim.x * block_idx.x + thread_idx.x
    # FILL ME IN (roughly 2 lines)
    if i < size:
        output[i] = a[i] + 10
```
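Since one block can no longer cover the whole array, the host launches enough blocks to cover every element, using ceiling division; a rough sketch (constants assumed for illustration):

```mojo
alias SIZE = 9
alias TPB = 4  # threads per block
# Ceiling division: (9 + 4 - 1) // 4 = 3 blocks, covering indices 0..11;
# the `if i < size` guard masks off the 3 extra threads.
alias BLOCKS_PER_GRID = (SIZE + TPB - 1) // TPB
```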
Puzzle 7: 2d blocks
`block_idx`: think of each block as one unit. `block_idx.x` specifies the block's position along the X-axis (column direction) within the grid of blocks, i.e. how many blocks you have passed horizontally from the starting point (x = 0). `block_idx.y` is the same along the Y-axis. A worked example follows the kernel below.
```mojo
fn add_10_blocks_2d(
    output: UnsafePointer[Scalar[dtype]],
    a: UnsafePointer[Scalar[dtype]],
    size: Int,
):
    row = block_dim.y * block_idx.y + thread_idx.y
    col = block_dim.x * block_idx.x + thread_idx.x
    # FILL ME IN (roughly 2 lines)
    if row < size and col < size:
        output[row * size + col] = a[row * size + col] + 10
```
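To make the indexing concrete, here is a worked example (all numbers made up for illustration):

```mojo
# Launch: block_dim = (2, 2), grid_dim = (2, 2), covering a 4x4 array.
# The thread with block_idx = (1, 0) and thread_idx = (1, 1) computes:
#   col = block_dim.x * block_idx.x + thread_idx.x = 2 * 1 + 1 = 3
#   row = block_dim.y * block_idx.y + thread_idx.y = 2 * 0 + 1 = 1
# so it handles element (row=1, col=3), i.e. flat index 1 * 4 + 3 = 7.
```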
With layout tensor
```mojo
fn add_10_blocks_2d[
    out_layout: Layout,
    a_layout: Layout,
](
    output: LayoutTensor[mut=True, dtype, out_layout],
    a: LayoutTensor[mut=False, dtype, a_layout],
    size: Int,
):
    row = block_dim.y * block_idx.y + thread_idx.y
    col = block_dim.x * block_idx.x + thread_idx.x
    # FILL ME IN (roughly 2 lines)
    if row < size and col < size:
        output[row, col] = a[row, col] + 10
```
Puzzle 8: shared memory
```mojo
fn add_10_shared(
    output: UnsafePointer[Scalar[dtype]],
    a: UnsafePointer[Scalar[dtype]],
    size: Int,
):
    shared = stack_allocation[
        TPB,
        Scalar[dtype],
        address_space = AddressSpace.SHARED,
    ]()
    global_i = block_dim.x * block_idx.x + thread_idx.x
    local_i = thread_idx.x
    # load data into shared memory
    if global_i < size:
        shared[local_i] = a[global_i]
    # wait for all threads to complete
    # works within a thread block
    barrier()
    # FILL ME IN (roughly 2 lines)
    if global_i < size:
        output[global_i] = shared[local_i] + 10
```
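Two things worth noting about this kernel. First, each block gets its own `TPB`-sized shared buffer, so a multi-block launch is what makes the pattern interesting; the setup might look like this (values assumed, mirroring the usual puzzle harness):

```mojo
alias TPB = 4                     # threads per block = shared buffer size
alias SIZE = 8
alias BLOCKS_PER_GRID = (2, 1)    # two blocks of TPB threads cover 8 elements
alias THREADS_PER_BLOCK = (TPB, 1)
```

Second, `barrier()` only synchronizes threads within one block, and in this particular kernel each thread reads back the very slot it wrote, so the load / `barrier()` / use sequence is mainly a warm-up for later puzzles where threads read slots written by their neighbors.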
With layout tensor
```mojo
fn add_10_shared_layout_tensor[
    layout: Layout
](
    output: LayoutTensor[mut=True, dtype, layout],
    a: LayoutTensor[mut=True, dtype, layout],
    size: Int,
):
    # Allocate shared memory using tensor builder
    shared = tb[dtype]().row_major[TPB]().shared().alloc()
    global_i = block_dim.x * block_idx.x + thread_idx.x
    local_i = thread_idx.x
    if global_i < size:
        shared[local_i] = a[global_i]
    barrier()
    # FILL ME IN (roughly 2 lines)
    if global_i < size:
        output[global_i] = shared[local_i] + 10
```
Different ways to initialize shared memory
- Raw memory approach: shared memory is allocated with `stack_allocation[TPB, Scalar[dtype], address_space = AddressSpace.SHARED]()`. This is the more direct, lower-level allocation method.
- LayoutTensor version: shared memory is allocated with `tb[dtype]().row_major[TPB]().shared().alloc()`. This uses the "tensor builder" (`tb`) provided alongside LayoutTensor, which aligns with LayoutTensor's philosophy of simplifying memory management.