Matrix Multiplication
We will now look at basic matrix multiplication. Matrix multiplication is computed with nested loops: each element of the result is a dot product, C(row, col) = sum over i of A(row, i) * B(i, col). Many real computations follow this same nested-loop pattern, so studying this example is good practice for parallelizing nested loops in the future.
- Allocate the CPU memory for the A, B, and C matrices. Note that each matrix is stored in a 1D array, with element (row, col) at index row*width + col, so that the same indexing scheme works for both the CPU and GPU versions.
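A minimal sketch of this step, using the variable names from the full listings below (N is the matrix dimension entered by the user):
// allocate host memory for three N x N matrices stored as 1D arrays
float *a = (float*)malloc(sizeof(float) * (N*N));
float *b = (float*)malloc(sizeof(float) * (N*N));
float *c = (float*)malloc(sizeof(float) * (N*N));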
- Allocate the GPU memory for the A, B, and C matrices.
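The matching device-side allocations, as in the solution listing below (d_a, d_b, and d_c are the device counterparts of a, b, and c):
// allocate device memory for the same three N x N matrices
float *d_a, *d_b, *d_c;
cudaMalloc((void**)&d_a, sizeof(float) * (N*N));
cudaMalloc((void**)&d_b, sizeof(float) * (N*N));
cudaMalloc((void**)&d_c, sizeof(float) * (N*N));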
- Fill in the values of the A and B matrices.
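As in the full listings, every entry of A and B is set to a constant so that the result is easy to check by hand:
// initialize the host matrices
for(int i = 0; i < (N*N); i++)
{
    a[i] = 2.0f;
    b[i] = 2.0f;
}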
- Transfer the initialized A and B matrices from the CPU to the GPU.
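For example:
// copy A and B from host to device
cudaMemcpy(d_a, a, sizeof(float) * (N*N), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, sizeof(float) * (N*N), cudaMemcpyHostToDevice);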
- Set up a 2D thread block and grid so that threads are indexed in both x and y.
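One possible organization, as used in the solution below: a 32 x 32 thread block, with the grid rounded up so that all N x N output elements are covered even when N is not a multiple of the block size:
// 2D thread organization: one thread per output element
int blockSize = 32;
dim3 dimBlock(blockSize, blockSize, 1);
dim3 dimGrid(ceil(N/float(blockSize)), ceil(N/float(blockSize)), 1);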
- Define and launch the kernel that performs the matrix multiplication. Each thread computes one element of the result matrix:
__global__ void matrix_mul(float* d_a, float* d_b, float* d_c, int width)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    if ((row < width) && (col < width))
    {
        float temp = 0;
        // each thread computes one
        // element of the block sub-matrix
        for (int i = 0; i < width; ++i)
        {
            temp += d_a[row*width+i]*d_b[i*width+col];
        }
        d_c[row*width+col] = temp;
    }
}
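The corresponding host-side call, using the thread organization from the previous step:
// launch one thread per element of the N x N result
matrix_mul<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);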
- Copy the computed values back from the GPU to the CPU (from device to host). The C matrix now contains the product of the two matrices.
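For example:
// copy the result matrix C from device back to host
cudaMemcpy(c, d_c, sizeof(float) * (N*N), cudaMemcpyDeviceToHost);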
- Deallocate the host and device memory.
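For example:
// free device memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
// free host memory
free(a);
free(b);
free(c);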
Questions and Solutions
Examples: Matrix Multiplication
//-*-C++-*-
// Matrix-multiplication.c
#include <iostream>
#include <cstdlib>
using namespace std;

float * matrix_mul(float *h_a, float *h_b, float *h_c, int width)
{
    for(int row = 0; row < width; ++row)
    {
        for(int col = 0; col < width; ++col)
        {
            float temp = 0;
            for(int i = 0; i < width; ++i)
            {
                temp += h_a[row*width+i] * h_b[i*width+col];
            }
            h_c[row*width+col] = temp;
        }
    }
    return h_c;
}

int main()
{
    cout << "Programme assumes that matrix (square matrix) size is N*N " << endl;
    cout << "Please enter the N size number " << endl;
    int N = 0;
    cin >> N;

    // Allocate host memory
    float *a, *b, *c;
    a = (float*)malloc(sizeof(float) * (N*N));
    b = (float*)malloc(sizeof(float) * (N*N));
    c = (float*)malloc(sizeof(float) * (N*N));

    // Initialize host matrices
    for(int i = 0; i < (N*N); i++)
    {
        a[i] = 2.0f;
        b[i] = 2.0f;
    }

    // CPU matrix multiplication
    matrix_mul(a, b, c, N);

    // Print the result
    for(int i = 0; i < N; i++)
    {
        for(int j = 0; j < N; j++)
        {
            cout << c[i*N+j] << " ";
        }
        cout << endl;
    }

    // Deallocate host memory
    free(a);
    free(b);
    free(c);
    return 0;
}
//-*-C++-*-
// Matrix-multiplication-template.cu
#include <iostream>
#include <cstdlib>
#include <cuda.h>
using namespace std;

__global__ void matrix_mul(float* d_a, float* d_b,
                           float* d_c, int width)
{
    // create a 2D thread index (row and column)
    int row = ..................
    int col = ....................
    // only allow the threads that are needed for the computation
    if (................................)
    {
        float temp = 0;
        // each thread computes one
        // element of the block sub-matrix
        for (int i = 0; i < width; ++i)
        {
            temp += d_a[row*width+i]*d_b[i*width+col];
        }
        d_c[row*width+col] = temp;
    }
}

// Host matrix multiplication (used to verify the GPU result)
float * cpu_matrix_mul(float *h_a, float *h_b, float *h_c, int width)
{
    for(int row = 0; row < width; ++row)
    {
        for(int col = 0; col < width; ++col)
        {
            float single_entry = 0;
            for(int i = 0; i < width; ++i)
            {
                single_entry += h_a[row*width+i] * h_b[i*width+col];
            }
            h_c[row*width+col] = single_entry;
        }
    }
    return h_c;
}

int main()
{
    cout << "Programme assumes that matrix (square matrix) size is N*N " << endl;
    cout << "Please enter the N size number " << endl;
    int N = 0;
    cin >> N;

    // Pointers for the host memory
    float *a, *b, *c, *host_check;
    // Pointers for the device memory
    float *d_a, *d_b, *d_c;

    // Allocate host memory
    a = (float*)malloc(sizeof(float) * (N*N));
    ...
    ...

    // Initialize host matrices
    for(int i = 0; i < (N*N); i++)
    {
        a[i] = 2.0f;
        b[i] = 2.0f;
    }

    // Allocate device memory
    cudaMalloc((void**)&d_a, sizeof(float) * (N*N));
    ...
    ...

    // Transfer data from host to device memory
    cudaMemcpy(.........................);
    cudaMemcpy(.........................);

    // Thread organization
    int blockSize = ..............;
    dim3 dimBlock(......................);
    dim3 dimGrid(.......................);

    // Kernel launch
    matrix_mul<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);

    // Transfer the result back to host memory
    cudaMemcpy(c, d_c, sizeof(float) * (N*N), cudaMemcpyDeviceToHost);

    // CPU computation for verification
    cpu_matrix_mul(a, b, host_check, N);

    // Verification
    bool flag = true;
    for(int i = 0; i < N; i++)
    {
        for(int j = 0; j < N; j++)
        {
            if(c[j*N+i] != host_check[j*N+i])
            {
                flag = false;
                break;
            }
        }
    }
    if (!flag)
    {
        cout << "Two matrices are not equal" << endl;
    }
    else
    {
        cout << "Two matrices are equal" << endl;
    }

    // Deallocate device memory
    cudaFree...
    // Deallocate host memory
    free...
    return 0;
}
//-*-C++-*-
// Matrix-multiplication.cu
#include <iostream>
#include <cstdlib>
#include <cmath>
#include <cuda.h>
using namespace std;

__global__ void matrix_mul(float* d_a, float* d_b,
                           float* d_c, int width)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    if ((row < width) && (col < width))
    {
        float temp = 0;
        // each thread computes one
        // element of the block sub-matrix
        for (int i = 0; i < width; ++i)
        {
            temp += d_a[row*width+i]*d_b[i*width+col];
        }
        d_c[row*width+col] = temp;
    }
}

// Host matrix multiplication (used to verify the GPU result)
float * cpu_matrix_mul(float *h_a, float *h_b, float *h_c, int width)
{
    for(int row = 0; row < width; ++row)
    {
        for(int col = 0; col < width; ++col)
        {
            float single_entry = 0;
            for(int i = 0; i < width; ++i)
            {
                single_entry += h_a[row*width+i] * h_b[i*width+col];
            }
            h_c[row*width+col] = single_entry;
        }
    }
    return h_c;
}

int main()
{
    cout << "Programme assumes that matrix (square matrix) size is N*N " << endl;
    cout << "Please enter the N size number " << endl;
    int N = 0;
    cin >> N;

    // Pointers for the host memory
    float *a, *b, *c, *host_check;
    // Pointers for the device memory
    float *d_a, *d_b, *d_c;

    // Allocate host memory
    a = (float*)malloc(sizeof(float) * (N*N));
    b = (float*)malloc(sizeof(float) * (N*N));
    c = (float*)malloc(sizeof(float) * (N*N));
    host_check = (float*)malloc(sizeof(float) * (N*N));

    // Initialize host matrices
    for(int i = 0; i < (N*N); i++)
    {
        a[i] = 2.0f;
        b[i] = 2.0f;
    }

    // Allocate device memory
    cudaMalloc((void**)&d_a, sizeof(float) * (N*N));
    cudaMalloc((void**)&d_b, sizeof(float) * (N*N));
    cudaMalloc((void**)&d_c, sizeof(float) * (N*N));

    // Transfer data from host to device memory
    cudaMemcpy(d_a, a, sizeof(float) * (N*N), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * (N*N), cudaMemcpyHostToDevice);

    // Thread organization: 2D blocks, one thread per output element
    int blockSize = 32;
    dim3 dimBlock(blockSize, blockSize, 1);
    dim3 dimGrid(ceil(N/float(blockSize)), ceil(N/float(blockSize)), 1);

    // Kernel launch
    matrix_mul<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);

    // Transfer the result back to host memory
    cudaMemcpy(c, d_c, sizeof(float) * (N*N), cudaMemcpyDeviceToHost);

    // CPU computation for verification
    cpu_matrix_mul(a, b, host_check, N);

    // Verification
    bool flag = true;
    for(int i = 0; i < N; i++)
    {
        for(int j = 0; j < N; j++)
        {
            if(c[j*N+i] != host_check[j*N+i])
            {
                flag = false;
                break;
            }
        }
    }
    if (!flag)
    {
        cout << "Two matrices are not equal" << endl;
    }
    else
    {
        cout << "Two matrices are equal" << endl;
    }

    // Deallocate device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Deallocate host memory
    free(a);
    free(b);
    free(c);
    free(host_check);
    return 0;
}
Compilation and Output
// compilation (CPU version; the source uses C++ headers, so compile it with g++)
$ g++ Matrix-multiplication.c -o Matrix-Multiplication-CPU
// execution
$ ./Matrix-Multiplication-CPU
// compilation (GPU version)
$ nvcc Matrix-multiplication.cu -o Matrix-Multiplication-GPU
// execution
$ ./Matrix-Multiplication-GPU
// output (CPU version; the GPU version prints the verification message instead)
Programme assumes that matrix (square matrix) size is N*N
Please enter the N size number
4
16 16 16 16
16 16 16 16
16 16 16 16
16 16 16 16
Questions
- Right now we use a 1D array to represent each matrix. Can you rewrite the example to use 2D arrays for the matrices, together with a 2D thread block?
- Can you still get the correct result if you remove the if ((row < width) && (col < width)) condition from the __global__ void matrix_mul(float* d_a, float* d_b, float* d_c, int width) function?
- Please try different thread block sizes and different matrix sizes.