Elahi Compute

Computation, AI and Hardware Blog by Tahsin Elahi

  • 🔹 What is threadIdx.(x, y, z)?

    Each thread in CUDA is part of a block. The thread has a position inside its block, described by threadIdx.x, threadIdx.y, and threadIdx.z.

    🔸 Imagine a 3D block:

    Let’s say we launch a block with dimensions (4, 3, 2):

    • 4 threads in x
    • 3 threads in y
    • 2 threads in z

    Total threads per block: 4 × 3 × 2 = 24

    Now, each thread can be uniquely identified by its (x, y, z) index inside that block:

    threadIdx.z = 0 or 1
    threadIdx.y = 0 to 2
    threadIdx.x = 0 to 3
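
    As a small sketch (not part of the original example), the 3D position of a thread inside its block can be flattened into a single local index from 0 to 23 for a (4, 3, 2) block; like the full example below, it relies on device-side printf from <stdio.h>:

    __global__ void localIndexDemo() {
        // x varies fastest, then y, then z
        int local = threadIdx.z * (blockDim.x * blockDim.y)
                  + threadIdx.y * blockDim.x
                  + threadIdx.x;
        printf("Thread (%d, %d, %d) -> local index %d\n",
               threadIdx.x, threadIdx.y, threadIdx.z, local);
    }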

    🔹 What is blockIdx.(x, y, z)?

    A grid consists of many blocks. Each block has a position within the grid, described by blockIdx.x, blockIdx.y, and blockIdx.z.

    🔸 Imagine a 2D grid of blocks:

    Let’s say:

    dim3 gridDim(3, 2);   // 3 blocks in x, 2 in y
    dim3 blockDim(4, 4);  // 4×4 threads per block
    • So we have 3×2 = 6 blocks total.
    • Each block is a 4×4 square of threads.

    Combined Indexing

    Each thread’s global position in the full grid is computed using both:

    int global_x = blockIdx.x * blockDim.x + threadIdx.x;
    int global_y = blockIdx.y * blockDim.y + threadIdx.y;
    int global_z = blockIdx.z * blockDim.z + threadIdx.z;

    This maps the thread to a unique global position in the problem space (like pixels, matrix cells, etc).

    Example:

    #include <stdio.h>
    
    __global__ void showThreadBlockIndex() {
        printf("Block (%d, %d, %d), Thread (%d, %d, %d)\n",
               blockIdx.x, blockIdx.y, blockIdx.z,
               threadIdx.x, threadIdx.y, threadIdx.z);
    }
    
    int main() {
        dim3 blockDim(2, 2, 1); // 2x2 threads per block
        dim3 gridDim(3, 2, 1);  // 3x2 blocks
    
        showThreadBlockIndex<<<gridDim, blockDim>>>();
        cudaDeviceSynchronize();
        return 0;
    }
    
    Output (the order of lines may vary between runs, since blocks and threads execute in no guaranteed order):
    Block (0,0,0), Thread (0,0,0)
    Block (0,0,0), Thread (1,0,0)
    Block (0,0,0), Thread (0,1,0)
    Block (0,0,0), Thread (1,1,0)
    Block (1,0,0), Thread (0,0,0)
    ...
    

    🔷 Mapping Threads to Multidimensional Data

    🟢 Easy

    1. What is the formula for computing a thread’s global row and column index in a 2D grid?

    Answer:

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    🟡 Medium

    1. Convert a 2D index (row, col) in a matrix of size m × n into a linear 1D index. Explain why this is needed in CUDA.

    Answer:

    int index = row * n + col;

    This is needed because CUDA global memory is a flat, one-dimensional address space: a 2D matrix is stored in row-major order, so each thread must convert its (row, col) position into a single linear offset before reading or writing.

    🔴 Hard

    1. Write code that handles boundary conditions for a thread operating on a 62×76 image using a 16×16 block size. Only valid threads should modify image data.
    #include <stdio.h>
    #include <cuda_runtime.h>
    
    // CUDA kernel to process a grayscale image
    __global__ void processImage(unsigned char* image, int width, int height) {
        int x = blockIdx.x * blockDim.x + threadIdx.x;  // column
        int y = blockIdx.y * blockDim.y + threadIdx.y;  // row
    
        if (x < width && y < height) {
            int index = y * width + x;
            image[index] = 255; // Example Operation: Set pixel to white
        }
    }
    
    int main() {
        const int width = 76;
        const int height = 62;
        const int imageSize = width * height;
    
        // Host and device image pointers
        unsigned char* image_h = (unsigned char*)malloc(imageSize);
        unsigned char* image_d;
    
        // Allocate memory on device
        cudaMalloc(&image_d, imageSize * sizeof(unsigned char));
        cudaMemset(image_d, 0, imageSize);  // optional: initialize to black
    
        // Launch CUDA kernel
        dim3 blockDim(16, 16);
        dim3 gridDim((width + blockDim.x - 1) / blockDim.x,
                     (height + blockDim.y - 1) / blockDim.y);
    
        processImage<<<gridDim, blockDim>>>(image_d, width, height);
        cudaDeviceSynchronize();
    
        // Copy result back to host
        cudaMemcpy(image_h, image_d, imageSize, cudaMemcpyDeviceToHost);
    
        // Optional: print a few pixels
        printf("First 10 pixels after kernel:\n");
        for (int i = 0; i < 10; ++i) {
            printf("Pixel %d = %d\n", i, image_h[i]);
        }
    
        // Free memory
        cudaFree(image_d);
        free(image_h);
    
        return 0;
    }
    

    🔷 Image Blur – A More Complex Kernel

    The following program reads a grayscale image in PGM format and blurs it with a 3×3 averaging kernel. (PGM stores a single gray channel, so no RGB handling is needed.)

    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda_runtime.h>
    
    // ------------------------
    // Utility to read PGM file
    // ------------------------
    unsigned char* readPGM(const char* filename, int* width, int* height) {
        FILE* f = fopen(filename, "rb");
        if (!f) { perror("File open failed"); exit(1); }
    
        char magic[3];
        fscanf(f, "%2s", magic);
    if (magic[0] != 'P' || magic[1] != '5') {
        fprintf(stderr, "Not a binary (P5) PGM file\n");
        exit(1);
    }
    
    // Skip whitespace and comment lines (comments start with '#')
    int c = fgetc(f);
    while (c == '#' || c == ' ' || c == '\t' || c == '\r' || c == '\n') {
        if (c == '#') { while (c != '\n' && c != EOF) c = fgetc(f); }
        c = fgetc(f);
    }
    ungetc(c, f);

    fscanf(f, "%d %d", width, height);
    int maxval;
    fscanf(f, "%d", &maxval);
    fgetc(f);  // consume the single whitespace byte before the pixel data
    
        unsigned char* data = (unsigned char*)malloc((*width) * (*height));
        fread(data, 1, (*width) * (*height), f);
        fclose(f);
        return data;
    }
    
    // ------------------------
    // Utility to write PGM file
    // ------------------------
    void writePGM(const char* filename, unsigned char* data, int width, int height) {
        FILE* f = fopen(filename, "wb");
        fprintf(f, "P5\n%d %d\n255\n", width, height);
        fwrite(data, 1, width * height, f);
        fclose(f);
    }
    
    // ------------------------
    // CUDA blur kernel (3x3 average)
    // ------------------------
    __global__ void blurKernel(unsigned char* input, unsigned char* output, int width, int height) {
        int x = blockIdx.x * blockDim.x + threadIdx.x; // col
        int y = blockIdx.y * blockDim.y + threadIdx.y; // row
    
        if (x > 0 && x < width - 1 && y > 0 && y < height - 1) {
            int sum = 0;
            for (int dy = -1; dy <= 1; dy++) {
                for (int dx = -1; dx <= 1; dx++) {
                    int nx = x + dx;
                    int ny = y + dy;
                    sum += input[ny * width + nx];
                }
            }
            output[y * width + x] = sum / 9;
        }
    }
    
    int main(int argc, char** argv) {
        if (argc != 2) {
            printf("Usage: ./blur_pgm image.pgm\n");
            return 1;
        }
    
        int width, height;
        unsigned char* image_h = readPGM(argv[1], &width, &height);
        unsigned char* result_h = (unsigned char*)malloc(width * height);
    
        unsigned char *image_d, *result_d;
        cudaMalloc(&image_d, width * height);
        cudaMalloc(&result_d, width * height);
        cudaMemcpy(image_d, image_h, width * height, cudaMemcpyHostToDevice);
    
        dim3 blockDim(16, 16);
        dim3 gridDim((width + 15) / 16, (height + 15) / 16);
        blurKernel<<<gridDim, blockDim>>>(image_d, result_d, width, height);
        cudaDeviceSynchronize();
    
        cudaMemcpy(result_h, result_d, width * height, cudaMemcpyDeviceToHost);
        writePGM("blurred.pgm", result_h, width, height);
    
        printf("Blurred image saved as 'blurred.pgm'\n");
    
        // Cleanup
        cudaFree(image_d);
        cudaFree(result_d);
        free(image_h);
        free(result_h);
    
        return 0;
    }
    

    In CUDA, blurring edges of an image is challenging because threads assigned to border pixels may attempt to access memory outside the valid image range. This can result in illegal memory access, crashes, or undefined behavior.

    To avoid this, several strategies can be used:

    1. Boundary Checks
      Each time a thread tries to access a neighbor pixel, it first checks whether the coordinates are within image bounds. For example, if (nx >= 0 && nx < width && ny >= 0 && ny < height), then it accesses the pixel; otherwise, it skips it. This method is simple and safe but may cause edge pixels to be averaged over fewer neighbors, leading to bias near the edges.
    2. Clamp (Edge Replication)
      This strategy keeps all neighbor accesses within valid bounds by clamping coordinates. For example, nx = min(max(x + dx, 0), width - 1). This causes edge pixels to use the closest valid border pixel, resulting in smoothed but slightly “smeared” edges. It’s commonly used in libraries like OpenCV (a code sketch of this approach appears after this list).
    3. Zero Padding
      If a neighbor pixel is outside the image, it contributes a value of zero to the blur. This is safe but causes the image to darken around the edges since the average includes zeroes.
    4. Mirror Padding
      This reflects out-of-bound coordinates back into the image as if the image were mirrored at the border. For example, a coordinate -1 becomes 1, and width becomes width - 2. This method maintains edge structure but is more complex.
    5. Ignore Borders
      This is the simplest method. You only apply the blur to interior pixels (e.g., x > 0 && x < width - 1 && y > 0 && y < height - 1). Border pixels are left unchanged; this is the approach used by the blur kernel above. It avoids all out-of-bounds issues but leaves the border unprocessed.

    Each method has trade-offs between safety, visual quality, and complexity. Clamp and mirror padding are the most visually pleasing, while boundary checks and ignoring borders are easiest to implement.
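
    As an illustration, here is a minimal sketch of the clamp (edge replication) strategy, reusing the grayscale layout of the blur kernel above; the clampi helper is introduced here for clarity and is not part of the original program:

    __device__ int clampi(int v, int lo, int hi) {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    __global__ void blurKernelClamped(unsigned char* input, unsigned char* output, int width, int height) {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < width && y < height) {
            int sum = 0;
            for (int dy = -1; dy <= 1; dy++) {
                for (int dx = -1; dx <= 1; dx++) {
                    int nx = clampi(x + dx, 0, width - 1);   // clamp to the nearest valid column
                    int ny = clampi(y + dy, 0, height - 1);  // clamp to the nearest valid row
                    sum += input[ny * width + nx];
                }
            }
            output[y * width + x] = sum / 9;  // border pixels still average 9 (clamped) samples
        }
    }

    Unlike the “ignore borders” kernel above, every pixel is written, at the cost of border pixels reusing their nearest neighbors.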


    🔷 Matrix Multiplication

    Matrix multiplication is a fundamental operation in scientific computing and deep learning. In CUDA, it involves mapping threads in a 2D grid to individual elements in the output matrix.

    If matrix M is of size i × j and N is of size j × k, then the result P is of size i × k, where each element P[row][col] is the dot product of the corresponding row of M and column of N:

    Pseudocode:
    
    for each row in M:
        for each col in N:
            P[row][col] = 0
            for k from 0 to num_columns_in_M - 1:
                P[row][col] += M[row][k] * N[k][col]
    
    CUDA Code Sample:
    
    // One output element per thread. j is the shared (inner) dimension of M and N,
    // and k_dim (equal to k) is the number of columns of N and P.
    float Pvalue = 0.0f;
    for (int e = 0; e < j; ++e) {
        Pvalue += M[row * j + e] * N[e * k_dim + col];
    }
    P[row * k_dim + col] = Pvalue;

    CUDA matrix multiplication kernels at three levels of difficulty:

    1. Easy – Naive Kernel (One Thread Per Output Element)

      //Each thread computes one P[row][col]. No optimizations.
      
      
      __global__ void matMulNaive(float* A, float* B, float* C, int M, int N, int K) {
          int row = blockIdx.y * blockDim.y + threadIdx.y;
          int col = blockIdx.x * blockDim.x + threadIdx.x;
      
          if (row < M && col < K) {
              float sum = 0;
              for (int i = 0; i < N; ++i) {
                  sum += A[row * N + i] * B[i * K + col];
              }
              C[row * K + col] = sum;
          }
      }

      2. Medium – Tiling with Shared Memory (Faster, Coalesced)

      // Each block loads tiles of A and B into shared memory to reduce global memory accesses.
      
      __global__ void matMulTiled(float* A, float* B, float* C, int M, int N, int K) {
          __shared__ float As[16][16];
          __shared__ float Bs[16][16];
      
          int tx = threadIdx.x;
          int ty = threadIdx.y;
          int row = blockIdx.y * 16 + ty;
          int col = blockIdx.x * 16 + tx;
      
          float sum = 0;
      
          for (int t = 0; t < (N + 15) / 16; ++t) {
              if (row < M && t * 16 + tx < N)
                  As[ty][tx] = A[row * N + t * 16 + tx];
              else
                  As[ty][tx] = 0;
      
              if (t * 16 + ty < N && col < K)
                  Bs[ty][tx] = B[(t * 16 + ty) * K + col];
              else
                  Bs[ty][tx] = 0;
      
              __syncthreads();
      
              for (int k = 0; k < 16; ++k)
                  sum += As[ty][k] * Bs[k][tx];
      
              __syncthreads();
          }
      
          if (row < M && col < K)
              C[row * K + col] = sum;
      }
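
      A possible launch configuration for the tiled kernel above (a sketch; A_d, B_d, and C_d stand for device pointers allocated and filled elsewhere):

      dim3 blockDim(16, 16);                       // must match the 16×16 tile size
      dim3 gridDim((K + 15) / 16, (M + 15) / 16);  // cover all K columns and M rows of C
      matMulTiled<<<gridDim, blockDim>>>(A_d, B_d, C_d, M, N, K);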
      

      🟢 Easy

      1. In matrix multiplication, what computation does each thread perform?

      🟡 Medium

      1. For two matrices M (64×32) and N (32×128), how would you organize threads using a 2D block/grid configuration? Provide block and grid dimensions.
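
      One possible answer, assuming 16×16 thread blocks (the output P is 64 × 128, so the grid must cover 128 columns and 64 rows):

      dim3 blockDim(16, 16);              // 256 threads per block
      dim3 gridDim((128 + 15) / 16,       // 8 blocks across the 128 columns of P
                   (64 + 15) / 16);       // 4 blocks down the 64 rows of P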

      🔴 Hard

      1. Write CUDA pseudocode for matrix multiplication using 2D blocks. Include formulas for row, col, and loop index for dot product.

      🔷 Summary Concepts

      🟢 Easy

      1. Why does CUDA limit thread and block dimensions to 3D?

      🟡 Medium

      1. Explain how you could simulate a 4D dataset (e.g., (t, z, y, x)) using a 3D CUDA grid and manual flattening.
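
      A minimal sketch of one way to do this, with illustrative extents T, Z, Y, X (none of these names come from the original text): fold the t and z axes into the grid's z dimension and unflatten them inside the kernel.

      __global__ void process4D(float* data, int T, int Z, int Y, int X) {
          int x  = blockIdx.x * blockDim.x + threadIdx.x;
          int y  = blockIdx.y * blockDim.y + threadIdx.y;
          int tz = blockIdx.z * blockDim.z + threadIdx.z;        // flattened (t, z) pair
          int t  = tz / Z;
          int z  = tz % Z;
          if (x < X && y < Y && z < Z && t < T) {
              size_t idx = (((size_t)t * Z + z) * Y + y) * X + x;  // row-major 4D flattening
              data[idx] *= 2.0f;                                   // example operation
          }
      }

      The grid's z extent is chosen to cover all T × Z combined slices, e.g. gridDim.z = (T * Z + blockDim.z - 1) / blockDim.z.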

      🔴 Hard

      1. Given a 5D dataset with shape (batch, t, z, y, x), write formulas to compute the 1D flattened index from 5D coordinates and vice versa.
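
      A minimal sketch of both directions, assuming row-major order with x varying fastest; the helper names flatten5D and unflatten5D are illustrative:

      // 5D coordinates -> flat index for shape (batch, T, Z, Y, X)
      size_t flatten5D(int b, int t, int z, int y, int x,
                       int T, int Z, int Y, int X) {
          return ((((size_t)b * T + t) * Z + z) * Y + y) * X + x;
      }

      // flat index -> 5D coordinates (inverse of the above)
      void unflatten5D(size_t idx, int T, int Z, int Y, int X,
                       int* b, int* t, int* z, int* y, int* x) {
          *x = idx % X;  idx /= X;
          *y = idx % Y;  idx /= Y;
          *z = idx % Z;  idx /= Z;
          *t = idx % T;  idx /= T;
          *b = (int)idx;
      }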
    Concepts:

      • Thread:
        • One tiny worker that does a small part of the job.
        • Example: One thread could add just one element of two arrays.
      • Block:
        • A group of threads.
        • Threads inside a block can cooperate (share data, synchronize).
        • Example: A block might have 256 threads, each working on a small piece.
      • Grid:
        • A group of blocks.
        • The entire GPU program launches a grid of blocks, so the work is distributed widely.
        • Example: 100 blocks, each with 256 threads.
      • threadIdx:
        • Every thread has a thread ID inside its block.
        • threadIdx.x tells you which thread you are (within the block).
      • blockIdx:
        • Every block has a block ID inside the grid.
        • blockIdx.x tells you which block you are.
      • blockDim:
        • Tells you how many threads are in a block (the block’s size).
        • blockDim.x is the number of threads per block.

      Very Simple Simulation Example

      Suppose you have two arrays: A = [1, 2, 3, 4, 5, 6, 7, 8] and B = [10, 20, 30, 40, 50, 60, 70, 80].

      You want to add them element by element and get C[i] = A[i] + B[i].


      Imagine:

      • 2 blocks
      • 4 threads per block

      Thus:

      • Total threads = 2 × 4 = 8 threads (good, we have 8 elements).

      Each thread will do one addition.


      Now look how each thread figures out its global index:

      int global_index = blockIdx.x * blockDim.x + threadIdx.x;

      Where:

      • blockIdx.x = block number (0 or 1)
      • blockDim.x = 4 (threads per block)
      • threadIdx.x = thread number within the block (0, 1, 2, or 3)

      Mapping:

      Thread (block 0, thread 0) has global index 0 × 4 + 0 = 0, so it adds A[0] + B[0]; thread (block 1, thread 2) has global index 1 × 4 + 2 = 6, so it adds A[6] + B[6]; and so on for the remaining threads.


      CUDA Code Sketch:

      __global__ void add_vectors(int *A, int *B, int *C) {
          int idx = blockIdx.x * blockDim.x + threadIdx.x;
          C[idx] = A[idx] + B[idx];
      }

      When you call the kernel:

      add_vectors<<<2, 4>>>(A, B, C);

      • 2 blocks
      • 4 threads each
      • All elements get processed in parallel.

      Final simple picture:

      • Threads are workers.
      • Blocks are teams of workers.
      • Grid is the whole construction site.
      • threadIdx, blockIdx, blockDim are the address system that tells each worker what piece of the job they must do.

      Figure: A Grid consisting of 2 Blocks with 4 threads in each block

      Exercises:

      1. If we want to use each thread in a grid to calculate one output element of a vector addition, what would be the expression for mapping the thread/block indices to the data index (i)?

      Answer: i = blockIdx.x * blockDim.x + threadIdx.x;

      2. Assume that we want to use each thread to calculate two adjacent elements of a vector addition. What would be the expression for mapping the thread/block indices to the data index (i) of the first element to be processed by a thread?

      Answer: i = (blockIdx.x * blockDim.x + threadIdx.x) * 2;

      Explanation: The question asks each thread i to process two consecutive elements of the vector. For example, if a vector has 6 elements and we use 3 threads, thread 0 processes elements 0 and 1, thread 1 processes elements 2 and 3, and thread 2 processes elements 4 and 5.

      __global__ void vecAddKernel(float* A, float* B, float* C, int n){
          int i = (threadIdx.x + blockDim.x * blockIdx.x) * 2;
          if (i < n){
              C[i] = A[i] + B[i];
          }
          if (i + 1 < n){  // guard the second element separately in case n is odd
              C[i + 1] = A[i + 1] + B[i + 1];
          }
      }
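
      A matching launch for this kernel, assuming n elements and a hypothetical block size of 256 threads (each thread covers two elements, so only half as many threads are needed):

      int threadsPerBlock = 256;
      int blocks = (n + 2 * threadsPerBlock - 1) / (2 * threadsPerBlock);
      vecAddKernel<<<blocks, threadsPerBlock>>>(A_d, B_d, C_d, n);  // A_d, B_d, C_d are device pointers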

      3. We want to use each thread to calculate two elements of a vector addition.
      Each thread block processes 2*blockDim.x consecutive elements that form
      two sections. All threads in each block will process a section first, each
      processing one element. They will then all move to the next section, each
      processing one element. Assume that variable i should be the index for the
      first element to be processed by a thread. What would be the expression for
      mapping the thread/block indices to data index of the first element?

      Answer: i = blockIdx.x * blockDim.x * 2 + threadIdx.x;

      Explanation: Let blockDim.x = 2 (i.e., 2 threads per block).

      Each block then processes 2 * 2 = 4 consecutive elements.

      Let the vector have 8 elements. Block 0 handles elements 0 to 3 (its first section is elements 0 and 1, its second section is elements 2 and 3), and block 1 handles elements 4 to 7 (sections 4 and 5, then 6 and 7).

      // Imagine blockIdx.x = 0, blockDim.x=256 and threadIdx.x = 0; then i = 0;
      // if blockIdx.x = 1 then i becomes 1 * 256 * 2 + 0 = 512;
      // Meaning all the elements before 512 were processed by blockIdx.x = 0;
      __global__ void vecAddKernel(float* A, float* B, float* C, int n){
          int i = blockDim.x * blockIdx.x * 2 + threadIdx.x;
          if (i < n){
              C[i] = A[i] + B[i]; // Imagine i = 0, then indexing here is 0
          }
      
          int j = i + blockDim.x;
          if (j < n){
              C[j] = A[j] + B[j];
          }
      }

      4. For a vector addition, assume that the vector length is 8000, each thread
      calculates one output element, and the thread block size is 1024 threads. The
      programmer configures the kernel call to have a minimum number of thread
      blocks to cover all output elements. How many threads will be in the grid?

      Answer: 8192.

      Explanation: Each block has 1024 threads. The minimum number of blocks needed to cover 8000 elements is 8000 / 1024 = 7.8125, which rounds up to 8 blocks; 8 blocks × 1024 threads = 8192 threads in the grid.
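
      The same arithmetic expressed as the usual ceiling-division launch configuration (a sketch):

      int n = 8000, threadsPerBlock = 1024;
      int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;  // = 8
      int totalThreads = blocks * threadsPerBlock;               // = 8192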

      5. If we want to allocate an array of v integer elements in the CUDA device
      global memory, what would be an appropriate expression for the second
      argument of the cudaMalloc call?

      Answer: v * sizeof(int)

      Explanations:

      cudaMalloc ( void** devPtr, size_t size );

      devPtr: A pointer to the device memory pointer.
      size: Number of bytes to allocate.
      

      6. If we want to allocate an array of n floating-point elements and have a
      floating-point pointer variable A_d to point to the allocated memory, what
      would be an appropriate expression for the first argument of the cudaMalloc call?

      Answer: (void **) &A_d

      7. If we want to copy 3000 bytes of data from host array A_h (A_h is a pointer
      to element 0 of the source array) to device array A_d (A_d is a pointer to
      element 0 of the destination array), what would be an appropriate API call
      for this data copy in CUDA?

      Answer: cudaMemcpy(A_d, A_h, 3000, cudaMemcpyHostToDevice);

      8. How would one declare a variable err that can appropriately receive the
      returned value of a CUDA API call?

      Answer: cudaError_t err;

      9. Consider the following CUDA kernel and the corresponding host function
      that calls it:

      __global__ void foo_kernel(float* a, float* b, unsigned int N) {
          unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
          if (i < N) {
              b[i] = 2.7f * a[i] - 4.3f;
          }
      }
      void foo(float* a_d, float* b_d) {
          unsigned int N = 200000;
          unsigned int threadsPerBlock = 128;
          unsigned int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
      
          foo_kernel<<<blocks, threadsPerBlock>>>(a_d, b_d, N);
      }

      a. What is the number of threads per block?

      Answer: 128

      b. What is the number of threads in the grid?

      Answer: 200064

      c. What is the number of blocks in the grid?

      Answer: 1563

      d. What is the number of threads that execute the index computation (unsigned int i = ...) in the kernel?

      Answer: 200064

      e. What is the number of threads that execute the assignment b[i] = 2.7f * a[i] - 4.3f?

      Answer: 200000, because the check i < N filters out the extra 64 threads.

      f. A new summer intern was frustrated with CUDA. He has been complaining
      that CUDA is very tedious. He had to declare many functions that he plans
      to execute on both the host and the device twice, once as a host function and
      once as a device function. What is your response?

      Answer: Declaring a function with both the __host__ and __device__ qualifiers makes the compiler generate a CPU version and a GPU version from a single definition, so the intern does not need to write each function twice.
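
      A minimal sketch of such a dual-qualified function (the name scale is illustrative, not from the original exercise):

      __host__ __device__ float scale(float x) {
          return 2.7f * x - 4.3f;  // callable from both host code and device kernels
      }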

