Skip to content

Matrix Multiplication

We will now look into basic matrix multiplication. In this tutorial, we will explore the process of matrix multiplication, a fundamental operation in linear algebra. Matrix multiplication is executed through a nested loop structure, which is a common approach for numerous computational tasks involving matrices. Mastering this example will provide valuable insight and practical experience in handling nested loops, a crucial skill for solving complex problems in future applications.

b
  • Allocating the CPU memory for A, B, and C matrices. Here, we notice that the matrix is stored in a 1D array because we want to consider the same function concept for CPU and GPU.

    // Initialize the memory on the host
    float *a, *b, *c;
    
    // Allocate host memory
    a   = (float*)malloc(sizeof(float) * (N*N));
    b   = (float*)malloc(sizeof(float) * (N*N));
    c   = (float*)malloc(sizeof(float) * (N*N));
    

  • Allocating the GPU memory for A, B, and C matrix

    // Initialize the memory on the device
    float *d_a, *d_b, *d_c;
    
    // Allocate device memory
    cudaMalloc((void**)&d_a, sizeof(float) * (N*N));
    cudaMalloc((void**)&d_b, sizeof(float) * (N*N));
    cudaMalloc((void**)&d_c, sizeof(float) * (N*N));
    

  • Now, we need to fill in the values for the matrix A and B.

    // Initialize host matrix
    for(int i = 0; i < (N*N); i++)
       {
        a[i] = 2.0f;
        b[i] = 2.0f;
       }
    

  • Transfer initialized A and B matrix from CPU to GPU

    cudaMemcpy(d_a, a, sizeof(float) * (N*N), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * (N*N), cudaMemcpyHostToDevice);
    

  • 2D thread block for indexing x and y

    // Thread organization
    int blockSize = 32;
    dim3 dimBlock(blockSize,blockSize,1);
    dim3 dimGrid(ceil(N/float(blockSize)),ceil(N/float(blockSize)),1);
    

  • Calling the kernel function

    // Device function call
    matrix_mul<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    

    matrix multiplication function call
    float * matrix_mul(float *h_a, float *h_b, float *h_c, int width)
    {
      for(int row = 0; row < width ; ++row)
        {
          for(int col = 0; col < width ; ++col)
            {
              float temp = 0;
              for(int i = 0; i < width ; ++i)
                {
                  temp += h_a[row*width+i] * h_b[i*width+col];
                }
              h_c[row*width+col] = temp;
            }
        }
      return h_c;
    }
    
    __global__ void matrix_mul(float* d_a, float* d_b, 
    float* d_c, int width)
    {
      int row = blockIdx.x * blockDim.x + threadIdx.x;
      int col = blockIdx.y * blockDim.y + threadIdx.y;
    
      if ((row < width) && (col < width)) 
        {
          float temp = 0;
          // each thread computes one 
          // element of the block sub-matrix
          for (int i = 0; i < width; ++i) 
            {
              temp += d_a[row*width+i]*d_b[i*width+col];
            }
          d_c[row*width+col] = temp;
        }
    }
    
  • Copy back computed value from GPU to CPU; transfer the data back to GPU (from device to host). Here is the C matrix that contains the product of the two matrices.

    // Transfer data back to host memory
    cudaMemcpy(c, d_c, sizeof(float) * (N*N), cudaMemcpyDeviceToHost);
    

  • Deallocate the host and device memory

    // Deallocate device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    
    // Deallocate host memory
    free(a); 
    free(b); 
    free(c);
    

Questions and Solutions

Examples: Matrix Multiplication
//-*-C++-*-
// Matrix-multiplication.cc

#include<iostream>
#include<cuda.h>

using namespace std;

float * matrix_mul(float *h_a, float *h_b, float *h_c, int width)   
{                                                                 
  for(int row = 0; row < width ; ++row)                           
    {                                                             
      for(int col = 0; col < width ; ++col)                       
        {                                                         
          float temp = 0;                                       
          for(int i = 0; i < width ; ++i)                         
            {                                                     
              temp += h_a[row*width+i] * h_b[i*width+col];      
            }                                                     
          h_c[row*width+col] = temp;                            
        }                                                         
    }   
  return h_c;           
}

int main()
{

  cout << "Programme assumes that matrix (square matrix )size is N*N "<<endl;
  cout << "Please enter the N size number "<< endl;
  int N = 0;
  cin >> N;

  // Initialize the memory on the host
  float *a, *b, *c;       

  // Allocate host memory
  a   = (float*)malloc(sizeof(float) * (N*N));
  b   = (float*)malloc(sizeof(float) * (N*N));
  c   = (float*)malloc(sizeof(float) * (N*N));

  // Initialize host matrix
  for(int i = 0; i < (N*N); i++)
    {
      a[i] = 1.0f;
      b[i] = 2.0f;
    }

  // Device function call 
  matrix_mul(a, b, c, N);

  // Verification
  for(int i = 0; i < N; i++)
    {
      for(int j = 0; j < N; j++)
         {
          cout << c[j] <<" ";
         }
      cout << " " <<endl;
   }

  // Deallocate host memory
 free(a); 
 free(b); 
 free(c);

 return 0;
}
//-*-C++-*-
// Matrix-multiplication-template.cu

#include<iostream>
#include<cuda.h>

using namespace std;

__global__ void matrix_mul(float* d_a, float* d_b, 
float* d_c, int width)
{

  // create a 2d threads block
  int row = ..................
  int col = ....................

  // only allow the threads that are needed for the computation 
  if (................................)
    {
      float temp = 0;
      // each thread computes one 
      // element of the block sub-matrix
      for (int i = 0; i < width; ++i) 
        {
          temp += d_a[row*width+i]*d_b[i*width+col];
        }
      d_c[row*width+col] = temp;
    }
}

// Host call (matrix multiplication)
float * cpu_matrix_mul(float *h_a, float *h_b, float *h_c, int width)   
{                                                                 
  for(int row = 0; row < width ; ++row)                           
    {                                                             
      for(int col = 0; col < width ; ++col)                       
        {                                                         
          float single_entry = 0;                                       
          for(int i = 0; i < width ; ++i)                         
            {                                                     
              single_entry += h_a[row*width+i] * h_b[i*width+col];      
            }                                                     
          h_c[row*width+col] = single_entry;                            
        }                                                         
    }   
  return h_c;           
}

int main()
{

  cout << "Programme assumes that matrix (square matrix) size is N*N "<<endl;
  cout << "Please enter the N size number "<< endl;
  int N = 0;
  cin >> N;

  // Initialize the memory on the host
  float *a, *b, *c, *host_check;       

  // Initialize the memory on the device
  float *d_a, *d_b, *d_c; 

  // Allocate host memory
  a   = (float*)malloc(sizeof(float) * (N*N));
  ...
  ...

  // Initialize host matrix
  for(int i = 0; i < (N*N); i++)
    {
      a[i] = 2.0f;
      b[i] = 2.0f;
    }

  // Allocate device memory
  cudaMalloc((void**)&d_a, sizeof(float) * (N*N));
  ...
  ...

  // Transfer data from a host to device memory
  cudaMemcpy(.........................);
  cudaMemcpy(.........................);

  // Thread organization
  int blockSize = ..............;
  dim3 dimBlock(......................);
  dim3 dimGrid(.......................);

  // Device function call 
  matrix_mul<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);

  // Transfer data back to host memory
  cudaMemcpy(c, d_c, sizeof(float) * (N*N), cudaMemcpyDeviceToHost);

  // CPU computation for verification 
  cpu_matrix_mul(a,b,host_check,N);

  // Verification
  bool flag=1;
  for(int i = 0; i < N; i++)
    {
     for(int j = 0; j < N; j++)
       {
         if(c[j*N+i]!= host_check[j*N+i])
           {
             flag=0;
             break;
           }
       }
    }
  if (flag==0)
    {
      cout <<"Two matrices are not equal" << endl;
    }
  else
    cout << "Two matrices are equal" << endl;

  // Deallocate device memory
  cudaFree...

  // Deallocate host memory
  free...

  return 0;
}
//-*-C++-*-
// Matrix-multiplication.cu

#include<iostream>
#include<cuda.h>

using namespace std;

__global__ void matrix_mul(float* d_a, float* d_b, 
float* d_c, int width)
{

  int row = blockIdx.x * blockDim.x + threadIdx.x;
  int col = blockIdx.y * blockDim.y + threadIdx.y;

  if ((row < width) && (col < width)) 
    {
      float temp = 0;
      // Each thread computes one 
      // Element of the block sub-matrix
      for (int i = 0; i < width; ++i) 
        {
          temp += d_a[row*width+i]*d_b[i*width+col];
        }
      d_c[row*width+col] = temp;
    }
}

// Host call (matrix multiplication)
float * cpu_matrix_mul(float *h_a, float *h_b, float *h_c, int width)   
{                                                                 
  for(int row = 0; row < width ; ++row)                           
    {                                                             
      for(int col = 0; col < width ; ++col)                       
        {                                                         
          float single_entry = 0;                                       
          for(int i = 0; i < width ; ++i)                         
            {                                                     
              single_entry += h_a[row*width+i] * h_b[i*width+col];      
            }                                                     
          h_c[row*width+col] = single_entry;                            
        }                                                         
    }   
  return h_c;           
}


int main()
{

  cout << "Programme assumes that matrix (square matrix) size is N*N "<<endl;
  cout << "Please enter the N size number "<< endl;
  int N = 0;
  cin >> N;

  // Initialize the memory on the host
  float *a, *b, *c, *host_check;       

  // Initialize the memory on the device
  float *d_a, *d_b, *d_c; 

  // Allocate host memory
  a   = (float*)malloc(sizeof(float) * (N*N));
  b   = (float*)malloc(sizeof(float) * (N*N));
  c   = (float*)malloc(sizeof(float) * (N*N));
  host_check = (float*)malloc(sizeof(float) * (N*N));

  // Initialize host matrix
  for(int i = 0; i < (N*N); i++)
    {
      a[i] = 2.0f;
      b[i] = 2.0f;
    }

  // Allocate device memory
  cudaMalloc((void**)&d_a, sizeof(float) * (N*N));
  cudaMalloc((void**)&d_b, sizeof(float) * (N*N));
  cudaMalloc((void**)&d_c, sizeof(float) * (N*N));

  // Transfer data from host to device memory
  cudaMemcpy(d_a, a, sizeof(float) * (N*N), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, sizeof(float) * (N*N), cudaMemcpyHostToDevice);

  // Thread organization
  int blockSize = 32;
  dim3 dimBlock(blockSize,blockSize,1);
  dim3 dimGrid(ceil(N/float(blockSize)),ceil(N/float(blockSize)),1);

  // Device function call 
  matrix_mul<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);

  // Transfer data back to host memory
  cudaMemcpy(c, d_c, sizeof(float) * (N*N), cudaMemcpyDeviceToHost);

  // CPU computation for verification 
  cpu_matrix_mul(a,b,host_check,N);

  // Verification
  bool flag=1;
  for(int i = 0; i < N; i++)
    {
     for(int j = 0; j < N; j++)
       {
         if(c[j*N+i]!= host_check[j*N+i])
           {
             flag=0;
             break;
           }
       }
    }
  if (flag==0)
    {
      cout <<"Two matrices are not equal" << endl;
    }
  else
    cout << "Two matrices are equal" << endl;

  // Deallocate device memory
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);

  // Deallocate host memory
  free(a); 
  free(b); 
  free(c);
  free(host_check);

  return 0;
}
Compilation and Output
// compilation
$ g++ Matrix-multiplication.cc -o Matrix-Multiplication-CPU

// execution 
$ ./Matrix-Multiplication-CPU

// output
$ g++ Matrix-multiplication.cc -o Matrix-multiplication
$ ./Matrix-multiplication
The programme assumes that the matrix (square matrix) size is N*N 
Please enter the N size number 
4
16 16 16 16 
16 16 16 16  
16 16 16 16  
16 16 16 16 
// compilation
$ nvcc -arch=compute_70 Matrix-multiplication.cu -o Matrix-Multiplication-GPU

// execution
$ ./Matrix-Multiplication-GPU
The programme assumes that the matrix (square matrix) size is N*N 
Please enter the N size number
$ 256

// output
$ Two matrices are equal
Questions
  • Right now, we are using the 1D array to represent the matrix. However, you can also do it with the 2D matrix. Can you try 2D array matrix multiplication with a 2D thread block?
  • Can you get the correct solution if you remove the if ((row < width) && (col < width)) condition from the __global__ void matrix_mul(float* d_a, float* d_b, float* d_c, int width) function?
  • Please try using different thread blocks and different matrix sizes.
    // Thread organization
    int blockSize = 32;
    dim3 dimBlock(blockSize,blockSize,1);
    dim3 dimGrid(ceil(N/float(blockSize)),ceil(N/float(blockSize)),1);