In this example, we will continue with vector addition in GPU using the CUDA programming model. This is an excellent example to begin with because we usually need to do some arithmetic operations using matrices or vectors. For that, we need to know how to access the indexes of the matrix or vector to do the computation efficiently. In this example, we will practice SIMT computation by adding two vectors.

• Memory allocation on both CPU and GPU. Because as discussed before, GPU is an accelerator and can not act as a host machine. So therefore, the computation has to be initiated via CPU. That means, we need to first initialise the data on the host, that is CPU. At the same time, we also need to initialise the memory allocation on the GPU. Because, we need to transfer the data from a CPU to GPU.
• Allocating the CPU memory for a, b, and out vector

// Initialize the memory on the host
float *a, *b, *out;

// Allocate host memory
a   = (float*)malloc(sizeof(float) * N);
b   = (float*)malloc(sizeof(float) * N);
out   = (float*)malloc(sizeof(float) * N);


• Allocating the GPU memory for d_a, d_b, and d_out matrix

// Initialize the memory on the device
float *d_a, *d_b, *d_out;

// Allocate device memory
cudaMalloc((void**)&d_a, sizeof(float) * N);
cudaMalloc((void**)&d_b, sizeof(float) * N);
cudaMalloc((void**)&d_out, sizeof(float) * N);


• Now we need to fill the values for the arrays a and b.

// Initialize host arrays
for(int i = 0; i < N; i++)
{
a[i] = 1.0f;
b[i] = 2.0f;
}


• Transfer initialized value from CPU to GPU

// Transfer data from host to device memory
cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);


• Creating a 2D thread block

// Thread organization
dim3 dimGrid(1, 1, 1);
dim3 dimBlock(16, 16, 1);


//1D grid of 1D blocks
__device__ int getGlobalIdx_1D_1D()
{
return blockIdx.x * blockDim.x + threadIdx.x;
}

//1D grid of 2D blocks
__device__ int getGlobalIdx_1D_2D()
{
return blockIdx.x * blockDim.x * blockDim.y
}

//1D grid of 3D blocks
__device__ int getGlobalIdx_1D_3D()
{
return blockIdx.x * blockDim.x * blockDim.y * blockDim.z
+ threadIdx.z * blockDim.y * blockDim.x
}

//2D grid of 1D blocks
__device__ int getGlobalIdx_2D_1D()
{
int blockId   = blockIdx.y * gridDim.x + blockIdx.x;
}

//2D grid of 2D blocks
__device__ int getGlobalIdx_2D_2D()
{
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y) +
}

//2D grid of 3D blocks
__device__ int getGlobalIdx_2D_3D()
{
int blockId = blockIdx.x
+ blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
}

//3D grid of 1D blocks
__device__ int getGlobalIdx_3D_1D()
{
int blockId = blockIdx.x
+ blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
}

//3D grid of 2D blocks
__device__ int getGlobalIdx_3D_2D()
{
int blockId = blockIdx.x
+ blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * (blockDim.x * blockDim.y)
}

//3D grid of 3D blocks
__device__ int getGlobalIdx_3D_3D()
{
int blockId = blockIdx.x
+ blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))

• Calling the kernel function

// execute the CUDA kernel function


• Vector addition kernel function call definition

// CPU function that adds two vector
float * Vector_Add(float *a, float *b, float *out, int n)
{
for(int i = 0; i < n; i ++)
{
out[i] = a[i] + b[i];
}
return out;
}

// GPU function that adds two vectors
__global__ void vector_add(float *a, float *b,
float *out, int n)
{
int i = blockIdx.x * blockDim.x * blockDim.y +
// Allow the   threads only within the size of N
if(i < n)
{
out[i] = a[i] + b[i];
}

}

• Copy back computed value from GPU to CPU

// Transfer data back to host memory
cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);


• Deallocate the host and device memory

// Deallocate device memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_out);

// Deallocate host memory
free(a);
free(b);
free(out);


### Questions and Solutions¶

//-*-C++-*-

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <time.h>

#define N 5120
#define MAX_ERR 1e-6

// CPU function that adds two vector
float * Vector_Add(float *a, float *b, float *out, int n)
{
for(int i = 0; i < n; i ++)
{
out[i] = a[i] + b[i];
}
return out;
}

int main()
{
// Initialize the memory on the host
float *a, *b, *out;

// Allocate host memory
a   = (float*)malloc(sizeof(float) * N);
b   = (float*)malloc(sizeof(float) * N);
out = (float*)malloc(sizeof(float) * N);

// Initialize host arrays
for(int i = 0; i < N; i++)
{
a[i] = 1.0f;
b[i] = 2.0f;
}

// Start measuring time
clock_t start = clock();

// Executing CPU function

// Stop measuring time and calculate the elapsed time
clock_t end = clock();
double elapsed = (double)(end - start)/CLOCKS_PER_SEC;

printf("Time measured: %.3f seconds.\n", elapsed);

// Verification
for(int i = 0; i < N; i++)
{
assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);
}

printf("out = %f\n", out);
printf("PASSED\n");

// Deallocate host memory
free(a);
free(b);
free(out);

return 0;
}

//-*-C++-*-

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <time.h>
#include <cuda.h>

#define N 5120
#define MAX_ERR 1e-6

// GPU function that adds two vectors
__global__ void vector_add(float *a, float *b,
float *out, int n)
{
int i = ........

// Allow the   threads only within the size of N
if------
{
out[i] = a[i] + b[i];
}

}

int main()
{
// Initialize the memory on the host
float *a, *b, *out;

// Allocate host memory
a   = (float*)......

// Initialize the memory on the device
float *d_a, *d_b, *d_out;

// Allocate device memory
cudaMalloc((void**)&d_a,......

// Initialize host arrays
for(int i = 0; i < N; i++)
{
a[i] = ....
b[i] = ....
}

// Transfer data from host to device memory
cudaMemcpy.....

dim3 dimGrid....
dim3 dimBlock....

// execute the CUDA kernel function

// Transfer data back to host memory
cudaMemcpy....

// Verification
for(int i = 0; i < N; i++)
{
assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);
}

printf("out = %f\n", out);
printf("PASSED\n");

// Deallocate device memory
cudaFree...

// Deallocate host memory
free..

return 0;
}

//-*-C++-*-

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <time.h>
#include <cuda.h>

#define N 5120
#define MAX_ERR 1e-6

// GPU function that adds two vectors
__global__ void vector_add(float *a, float *b,
float *out, int n)
{

int i = blockIdx.x * blockDim.x * blockDim.y +
// Allow the   threads only within the size of N
if(i < n)
{
out[i] = a[i] + b[i];
}

}

int main()
{
// Initialize the memory on the host
float *a, *b, *out;

// Allocate host memory
a   = (float*)malloc(sizeof(float) * N);
b   = (float*)malloc(sizeof(float) * N);
out = (float*)malloc(sizeof(float) * N);

// Initialize the memory on the device
float *d_a, *d_b, *d_out;

// Allocate device memory
cudaMalloc((void**)&d_a, sizeof(float) * N);
cudaMalloc((void**)&d_b, sizeof(float) * N);
cudaMalloc((void**)&d_out, sizeof(float) * N);

// Initialize host arrays
for(int i = 0; i < N; i++)
{
a[i] = 1.0f;
b[i] = 2.0f;
}

// Transfer data from host to device memory
cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);

dim3 dimGrid(ceil(N/32), ceil(N/32), 1);
dim3 dimBlock(32, 32, 1);

// execute the CUDA kernel function

// Transfer data back to host memory
cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);

// Verification
for(int i = 0; i < N; i++)
{
assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);
}

printf("out = %f\n", out);
printf("PASSED\n");

// Deallocate device memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_out);

// Deallocate host memory
free(a);
free(b);
free(out);

return 0;
}

Compilation and Output
// compilation
$gcc Vector-addition.c -o Vector-Addition-CPU // execution$ ./Vector-Addition-CPU

// output
$./Vector-addition-CPU out = 3.000000 PASSED  // compilation$ nvcc -arch=compute_70 Vector-addition.cu -o Vector-Addition-GPU

// execution
$./Vector-Addition-GPU // output$ ./Vector-addition-GPU
out = 3.000000
PASSED

Questions
• What happens if you remove the syncthreads(); from the __global void vector_add(float *a, float *b, float *out, int n) function.
• Can you remove the if condition if(i < n) from the global void vector_add(float *a, float *b, float *out, int n) function? If so, how can you do that?
• Here we do not use the cudaDeviceSynchronize() in the main application. Can you figure out why we do not need to use it?
• Can you create a different thread block for a larger number of arrays?

Last update: June 20, 2023 10:06:03
Created: March 11, 2023 20:16:27