Skip to content

Unified Memory

Unified memory relieves the programmer of moving data explicitly between the host and the device: the data transfers between CPU and GPU are managed automatically. In this example, we look at vector addition on the GPU using the unified memory concept.

  • Simply adding the compiler flag -gpu=managed enables unified memory in OpenACC.

The table below summarises the steps required with and without the unified memory concept.

Unified Memory

nvc -fast -acc=gpu -gpu=cc80 -gpu=managed -Minfo=accel test.c
nvfortran -fast -acc=gpu -gpu=cc80 -gpu=managed -Minfo=accel test.f90
| Without unified memory | With unified memory |
|---|---|
| Allocate the host memory | Allocate the host memory |
| Initialize the host value | Initialize the host value |
| Use data clauses, e.g., copy, copyin | Use data clauses, e.g., copy, copyin (not required with unified memory) |
| Do the computation using the GPU kernel | Do the computation using the GPU kernel |
| Free host memory | Free host memory |

Questions and Solutions

Examples: Vector Addition
// Vector-addition-template.c

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <time.h>
#include <openacc.h>    


#define N 5120
#define MAX_ERR 1e-6


// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, n).
// Template version: the OpenACC directive below is intentionally incomplete
// and has to be filled in by the reader.
void Vector_Addition(float *restrict a, float *restrict b, float *restrict c, int n) 
{

// TODO: complete the directive with either "parallel" or "kernels".
// NOTE(review): the original comment said "do need to add data map clauses";
// when compiling with -gpu=managed, explicit data clauses (copy, copyin, ...)
// should NOT be needed here -- confirm.
#pragma acc 
for(int i = 0; i < n; i ++)
   {
     c[i] = a[i] + b[i];
   }
}

// Driver: allocates three N-element float vectors, fills the two inputs,
// times one call to Vector_Addition and verifies the output element by element.
// Returns 0 on success and EXIT_FAILURE if a host allocation fails.
int main()
{
  // Pointers to the two input vectors (a, b) and the output vector (c)
  float *restrict a, *restrict b, *restrict c;

  // Allocate host memory
  a = (float*)malloc(sizeof(float) * N);
  b = (float*)malloc(sizeof(float) * N);
  c = (float*)malloc(sizeof(float) * N);

  // malloc can fail: stop here instead of dereferencing a NULL pointer below
  if (a == NULL || b == NULL || c == NULL)
    {
      fprintf(stderr, "Host memory allocation failed\n");
      free(a);   // free(NULL) is a no-op, so this is safe for any subset
      free(b);
      free(c);
      return EXIT_FAILURE;
    }

  // Initialize host arrays
  for(int i = 0; i < N; i++)
    {
      a[i] = 1.0f;
      b[i] = 2.0f;
    }

  // Start measuring time (clock() reports processor time, not wall-clock time)
  clock_t start = clock();

  // Run the vector addition
  Vector_Addition(a, b, c, N);

  // Stop measuring time and calculate the elapsed time
  clock_t end = clock();
  double elapsed = (double)(end - start)/CLOCKS_PER_SEC;

  printf("Time measured: %.3f seconds.\n", elapsed);

  // Verification: every output element must equal a[i] + b[i] within MAX_ERR
  for(int i = 0; i < N; i++)
    {
      assert(fabs(c[i] - a[i] - b[i]) < MAX_ERR);
    }

  printf("PASSED\n");

  // Deallocate host memory
  free(a);
  free(b);
  free(c);

  return 0;
}
// Vector-addition-openacc.c

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <time.h>
#include <openacc.h>

#define N 5120
#define MAX_ERR 1e-6


/*
 * Vector_Addition: element-wise sum of two float vectors.
 *   a, b : input vectors, read at indices 0 .. n-1
 *   c    : output vector, written at indices 0 .. n-1
 *   n    : number of elements to process
 * The loop carries an OpenACC "kernels loop" directive.
 */
void Vector_Addition(float *restrict a, float *restrict b, float *restrict c, int n)
{
#pragma acc kernels loop
  for (int idx = 0; idx < n; ++idx) {
    c[idx] = a[idx] + b[idx];
  }
}

// Driver: allocates three N-element float vectors, fills the two inputs,
// times one call to Vector_Addition and verifies the output element by element.
// Returns 0 on success and EXIT_FAILURE if a host allocation fails.
int main()
{
  // Pointers to the two input vectors (a, b) and the output vector (c)
  float *restrict a, *restrict b, *restrict c;

  // Allocate host memory
  a = (float*)malloc(sizeof(float) * N);
  b = (float*)malloc(sizeof(float) * N);
  c = (float*)malloc(sizeof(float) * N);

  // malloc can fail: stop here instead of dereferencing a NULL pointer below
  if (a == NULL || b == NULL || c == NULL)
    {
      fprintf(stderr, "Host memory allocation failed\n");
      free(a);   // free(NULL) is a no-op, so this is safe for any subset
      free(b);
      free(c);
      return EXIT_FAILURE;
    }

  // Initialize host arrays
  for(int i = 0; i < N; i++)
    {
      a[i] = 1.0f;
      b[i] = 2.0f;
    }

  // Start measuring time (clock() reports processor time, not wall-clock time)
  clock_t start = clock();

  // Run the vector addition (the OpenACC-annotated function)
  Vector_Addition(a, b, c, N);

  // Stop measuring time and calculate the elapsed time
  clock_t end = clock();
  double elapsed = (double)(end - start)/CLOCKS_PER_SEC;

  printf("Time measured: %.3f seconds.\n", elapsed);

  // Verification: every output element must equal a[i] + b[i] within MAX_ERR
  for(int i = 0; i < N; i++)
    {
      assert(fabs(c[i] - a[i] - b[i]) < MAX_ERR);
    }

  printf("PASSED\n");

  // Deallocate host memory
  free(a);
  free(b);
  free(c);

  return 0;
}
!! Vector-addition-template.f90

module Vector_Addition_Mod
  implicit none
contains
  ! Element-wise vector addition: c(i) = a(i) + b(i) for i = 1 .. n.
  ! Template version: the OpenACC directives are left for the reader to add.
  !   a, b : input vectors
  !   c    : output vector
  !   n    : number of elements to add
  subroutine Vector_Addition(a, b, c, n)
    ! Input vectors
    real(8), intent(in), dimension(:) :: a
    real(8), intent(in), dimension(:) :: b
    ! Output vector
    real(8), intent(out), dimension(:) :: c
    ! Number of elements to process (read only)
    integer, intent(in) :: n
    integer :: i

    ! TODO: add your OpenACC directive here (it must directly precede the loop)
    do i = 1, n
       c(i) = a(i) + b(i)
    end do
    ! TODO: add the matching OpenACC "end" directive here
  end subroutine Vector_Addition
end module Vector_Addition_Mod

program main
  use openacc
  use Vector_Addition_Mod
  implicit none

  ! Driver: reads the vector size, fills a(i) = sin(i)^2 and b(i) = cos(i)^2,
  ! calls Vector_Addition and compares c against a + b computed here.

  ! Absolute tolerance used by the verification step
  real(8), parameter :: tol = 1.0d-12

  ! Input vectors
  real(8), dimension(:), allocatable :: a
  real(8), dimension(:), allocatable :: b
  ! Output vector
  real(8), dimension(:), allocatable :: c

  integer :: n, i

  print *, "This program does the addition of two vectors "
  print *, "Please specify the vector size = "
  read *, n

  ! Allocate memory for vector
  allocate(a(n))
  allocate(b(n))
  allocate(c(n))

  ! Initialize content of input vectors,
  ! vector a(i) = sin(i)^2 vector b(i) = cos(i)^2
  do i = 1, n
     a(i) = sin(i*1D0) * sin(i*1D0)
     b(i) = cos(i*1D0) * cos(i*1D0)
  enddo

  ! Call the vector add subroutine
  call Vector_Addition(a, b, c, n)

  ! Verification: compare with a tolerance instead of exact equality, and
  ! report exactly one verdict (the original printed "PASS" unconditionally,
  ! and applied abs() to the result of the == comparison).
  if (any(abs(c - (a + b)) > tol)) then
     print *, "FAIL"
  else
     print *, "PASS"
  endif

  ! Delete the memory
  deallocate(a)
  deallocate(b)
  deallocate(c)

end program main
!! Vector-addition-openacc.f90

module Vector_Addition_Mod
  implicit none
contains
  ! Element-wise vector addition: c(i) = a(i) + b(i) for i = 1 .. n.
  ! The loop is annotated with the OpenACC combined "parallel loop" construct.
  !   a, b : input vectors
  !   c    : output vector
  !   n    : number of elements to add
  subroutine Vector_Addition(a, b, c, n)
    ! Input vectors
    real(8), intent(in), dimension(:) :: a
    real(8), intent(in), dimension(:) :: b
    ! Output vector
    real(8), intent(out), dimension(:) :: c
    ! Number of elements to process (read only)
    integer, intent(in) :: n
    integer :: i

    !$acc parallel loop
    do i = 1, n
       c(i) = a(i) + b(i)
    end do
    ! A combined "parallel loop" construct is closed by "end parallel loop",
    ! not by "end parallel".
    !$acc end parallel loop
  end subroutine Vector_Addition
end module Vector_Addition_Mod

program main
  use openacc
  use Vector_Addition_Mod
  implicit none

  ! Driver: reads the vector size, fills a(i) = sin(i)^2 and b(i) = cos(i)^2,
  ! calls Vector_Addition and compares c against a + b computed here.

  ! Absolute tolerance used by the verification step
  real(8), parameter :: tol = 1.0d-12

  ! Input vectors
  real(8), dimension(:), allocatable :: a
  real(8), dimension(:), allocatable :: b
  ! Output vector
  real(8), dimension(:), allocatable :: c

  integer :: n, i

  print *, "This program does the addition of two vectors "
  print *, "Please specify the vector size = "
  read *, n

  ! Allocate memory for vector
  allocate(a(n))
  allocate(b(n))
  allocate(c(n))

  ! Initialize content of input vectors,
  ! vector a(i) = sin(i)^2 vector b(i) = cos(i)^2
  do i = 1, n
     a(i) = sin(i*1D0) * sin(i*1D0)
     b(i) = cos(i*1D0) * cos(i*1D0)
  enddo

  ! Call the vector add subroutine
  call Vector_Addition(a, b, c, n)

  ! Verification: compare with a tolerance instead of exact equality, and
  ! report exactly one verdict (the original printed "PASS" unconditionally,
  ! and applied abs() to the result of the == comparison).
  if (any(abs(c - (a + b)) > tol)) then
     print *, "FAIL"
  else
     print *, "PASS"
  endif

  ! Delete the memory
  deallocate(a)
  deallocate(b)
  deallocate(c)

end program main
Compilation and Output
// compilation
$ nvc -fast -acc=gpu -gpu=cc80 -Minfo=accel -gpu=managed Vector-addition-openacc.c -o Vector-Addition-GPU
Vector_Addition:
12, Generating copyin(a[:n]) [if not already present]
    Generating copyout(c[:n]) [if not already present]
    Generating copyin(b[:n]) [if not already present]
14, Loop is parallelizable
    Generating NVIDIA GPU code
    14, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */

// execution
$ ./Vector-Addition-GPU

// output
$ ./Vector-Addition-GPU
PASSED
// compilation
$ nvfortran -fast -acc=gpu -gpu=cc80 -gpu=managed -Minfo=accel Vector-addition-openacc.f90 -o Vector-Addition-GPU
vector_addition:
     12, Generating NVIDIA GPU code
         13, !$acc loop gang, vector(128) ! blockidx%x threadidx%x
     12, Generating implicit copyin(a(:n)) [if not already present]
         Generating implicit copyout(c(:n)) [if not already present]
         Generating implicit copyin(b(:n)) [if not already present]

// execution         
$ ./Vector-Addition-GPU

// output
This program does the addition of two vectors 
Please specify the vector size = 
1000000
PASS
Questions
  • Do you already see any performance difference when using unified memory?