SIMD and Others
In this exercise, we will add the OpenMP simd construct
to one of our existing problems, namely vector addition.
Examples and Question: SIMD - Vector Addition
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <time.h>
#define N 5120
#define MAX_ERR 1e-6
/*
 * Serial (CPU) element-wise vector addition.
 *
 * Writes a[k] + b[k] into c[k] for the first n elements and returns c,
 * so the call can also be used inside an expression.
 */
float * Vector_Add(float *a, float *b, float *c, int n)
{
    const float *lhs = a;
    const float *rhs = b;
    float *out = c;

    /* Walk the three arrays in lock-step; nothing is written when n <= 0. */
    for (int remaining = n; remaining > 0; --remaining)
    {
        *out++ = *lhs++ + *rhs++;
    }
    return c;
}
/*
 * Driver for the serial vector addition.
 *
 * Allocates three float arrays of N elements, fills a with 1.0 and b with
 * 2.0, times one call to Vector_Add with clock(), verifies every element
 * of c against a[i] + b[i] within MAX_ERR, and releases the memory.
 * Returns EXIT_FAILURE if any allocation fails, 0 otherwise.
 */
int main()
{
    // Initialize the variables
    float *a, *b, *c;

    // Allocate the memory
    a = (float*)malloc(sizeof(float) * N);
    b = (float*)malloc(sizeof(float) * N);
    c = (float*)malloc(sizeof(float) * N);

    // malloc returns NULL on failure; stop before any array is dereferenced.
    // free(NULL) is a no-op, so releasing all three pointers is safe here.
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "Memory allocation failed\n");
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    // Initialize the arrays
    for(int i = 0; i < N; i++)
    {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    // Start measuring time
    clock_t start = clock();

    // Executing vector addition function
    Vector_Add(a, b, c, N);

    // Stop measuring time and calculate the elapsed time
    clock_t end = clock();
    double elapsed = (double)(end - start)/CLOCKS_PER_SEC;
    printf("Time measured: %.3f seconds.\n", elapsed);

    // Verification: every element of c must match a[i] + b[i] within MAX_ERR
    for(int i = 0; i < N; i++)
    {
        assert(fabs(c[i] - a[i] - b[i]) < MAX_ERR);
    }
    printf("c[0] = %f\n", c[0]);
    printf("PASSED\n");

    // Deallocate the memory
    free(a);
    free(b);
    free(c);
    return 0;
}
module Vector_Addition_Mod
  implicit none
contains
  ! Serial element-wise vector addition: c(i) = a(i) + b(i) for i = 1..n.
  ! Only the first n elements of c are assigned by this routine.
  subroutine Vector_Addition(a, b, c, n)
    ! Input vectors
    real(8), intent(in), dimension(:) :: a
    real(8), intent(in), dimension(:) :: b
    ! Output vector
    real(8), intent(out), dimension(:) :: c
    ! Number of leading elements to add (read-only in this routine)
    integer, intent(in) :: n
    ! Loop index
    integer :: i
    do i = 1, n
      c(i) = a(i) + b(i)
    end do
  end subroutine Vector_Addition
end module Vector_Addition_Mod
! Driver for the serial vector addition: reads the vector size, fills
! a(i) = sin(i)^2 and b(i) = cos(i)^2, calls Vector_Addition and verifies c.
program main
  use Vector_Addition_Mod, only: Vector_Addition
  implicit none
  ! Largest accepted absolute difference between c(i) and a(i) + b(i)
  real(8), parameter :: tol = 1.0d-12
  ! Input vectors
  real(8), dimension(:), allocatable :: a
  real(8), dimension(:), allocatable :: b
  ! Output vector
  real(8), dimension(:), allocatable :: c
  integer :: n, i, ios
  logical :: passed

  print *, "This program does the addition of two vectors "
  print *, "Please specify the vector size = "
  read (*, *, iostat=ios) n
  ! Reject unreadable input and non-positive sizes before allocating
  if (ios /= 0 .or. n < 1) then
    print *, "Invalid vector size"
    error stop 1
  end if

  ! Allocate memory for vector
  allocate(a(n))
  allocate(b(n))
  allocate(c(n))

  ! Initialize content of input vectors,
  ! vector a[i] = sin(i)^2 vector b[i] = cos(i)^2
  do i = 1, n
    a(i) = sin(i*1D0) * sin(i*1D0)
    b(i) = cos(i*1D0) * cos(i*1D0)
  enddo

  ! Call the vector addition subroutine
  call Vector_Addition(a, b, c, n)

  ! Verification: abs() wraps the difference only, and the comparison
  ! uses a tolerance instead of exact equality on reals
  passed = .true.
  do i = 1, n
    if (abs(c(i) - (a(i) + b(i))) > tol) then
      print *, "FAIL"
      passed = .false.
      exit
    endif
  enddo
  ! Report success only when no element failed
  if (passed) print *, "PASS"

  ! Delete the memory
  deallocate(a)
  deallocate(b)
  deallocate(c)
end program main
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <time.h>
#include <omp.h>
#define N 5120
#define MAX_ERR 1e-6
/*
 * Exercise template: element-wise vector addition.
 *
 * Stores a[idx] + b[idx] in c[idx] for idx in [0, n) and returns c.
 */
float * Vector_Add(float *a, float *b, float *c, int n)
{
    // ADD YOUR PARALLEL REGION FOR THE LOOP SIMD
    // (the loop stays a plain counted loop so a loop directive can sit directly above it)
    for (int idx = 0; idx < n; ++idx)
        c[idx] = a[idx] + b[idx];

    return c;
}
/*
 * Exercise template driver.
 *
 * Allocates three float arrays of N elements, fills a with 1.0 and b with
 * 2.0, times one call to Vector_Add, verifies every element of c against
 * a[i] + b[i] within MAX_ERR, and releases the memory.
 * Returns EXIT_FAILURE if any allocation fails, 0 otherwise.
 */
int main()
{
    // Initialize the variables
    float *a, *b, *c;

    // Allocate the memory
    a = (float*)malloc(sizeof(float) * N);
    b = (float*)malloc(sizeof(float) * N);
    c = (float*)malloc(sizeof(float) * N);

    // malloc returns NULL on failure; stop before any array is dereferenced.
    // free(NULL) is a no-op, so releasing all three pointers is safe here.
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "Memory allocation failed\n");
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    // Initialize the arrays
    for(int i = 0; i < N; i++)
    {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    // Start measuring time
    // NOTE: clock() reports processor time, not wall-clock time. Once a
    // parallel region is added, omp_get_wtime() is the usual way to time it.
    clock_t start = clock();

    // ADD YOUR PARALLEL REGION HERE
    // Executing vector addition function
    Vector_Add(a, b, c, N);

    // Stop measuring time and calculate the elapsed time
    clock_t end = clock();
    double elapsed = (double)(end - start)/CLOCKS_PER_SEC;
    printf("Time measured: %.3f seconds.\n", elapsed);

    // Verification: every element of c must match a[i] + b[i] within MAX_ERR
    for(int i = 0; i < N; i++)
    {
        assert(fabs(c[i] - a[i] - b[i]) < MAX_ERR);
    }
    printf("c[0] = %f\n", c[0]);
    printf("PASSED\n");

    // Deallocate the memory
    free(a);
    free(b);
    free(c);
    return 0;
}
module Vector_Addition_Mod
  implicit none
contains
  ! Exercise template: element-wise vector addition,
  ! c(i) = a(i) + b(i) for i = 1..n.
  subroutine Vector_Addition(a, b, c, n)
    use omp_lib
    ! Input vectors
    real(8), intent(in), dimension(:) :: a
    real(8), intent(in), dimension(:) :: b
    ! Output vector
    real(8), intent(out), dimension(:) :: c
    ! Number of leading elements to add (read-only in this routine)
    integer, intent(in) :: n
    ! Loop index
    integer :: i
    !! ADD YOUR PARALLEL DO LOOP WITH SIMD
    do i = 1, n
      c(i) = a(i) + b(i)
    end do
  end subroutine Vector_Addition
end module Vector_Addition_Mod
! Exercise template driver: reads the vector size, fills a(i) = sin(i)^2
! and b(i) = cos(i)^2, calls Vector_Addition and verifies c.
program main
  use Vector_Addition_Mod, only: Vector_Addition
  implicit none
  ! Largest accepted absolute difference between c(i) and a(i) + b(i)
  real(8), parameter :: tol = 1.0d-12
  ! Input vectors
  real(8), dimension(:), allocatable :: a
  real(8), dimension(:), allocatable :: b
  ! Output vector
  real(8), dimension(:), allocatable :: c
  integer :: n, i, ios
  logical :: passed

  print *, "This program does the addition of two vectors "
  print *, "Please specify the vector size = "
  read (*, *, iostat=ios) n
  ! Reject unreadable input and non-positive sizes before allocating
  if (ios /= 0 .or. n < 1) then
    print *, "Invalid vector size"
    error stop 1
  end if

  ! Allocate memory for vector
  allocate(a(n))
  allocate(b(n))
  allocate(c(n))

  ! Initialize content of input vectors,
  ! vector a[i] = sin(i)^2 vector b[i] = cos(i)^2
  do i = 1, n
    a(i) = sin(i*1D0) * sin(i*1D0)
    b(i) = cos(i*1D0) * cos(i*1D0)
  enddo

  !! ADD YOUR PARALLEL REGION
  ! Call the vector add subroutine
  call Vector_Addition(a, b, c, n)

  ! Verification: abs() wraps the difference only, and the comparison
  ! uses a tolerance instead of exact equality on reals
  passed = .true.
  do i = 1, n
    if (abs(c(i) - (a(i) + b(i))) > tol) then
      print *, "FAIL"
      passed = .false.
      exit
    endif
  enddo
  ! Report success only when no element failed
  if (passed) print *, "PASS"

  ! Delete the memory
  deallocate(a)
  deallocate(b)
  deallocate(c)
end program main
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <time.h>
#include <omp.h>
#define N 5120
#define MAX_ERR 1e-6
/*
 * Element-wise vector addition with an OpenMP `for simd` loop directive.
 *
 * Stores a[idx] + b[idx] in c[idx] for idx in [0, n) and returns c.
 * The directive is orphaned: this function opens no parallel region of
 * its own, so the caller is expected to provide one.
 */
float * Vector_Add(float *a, float *b, float *c, int n)
{
    #pragma omp for simd
    for (int idx = 0; idx < n; ++idx)
        c[idx] = a[idx] + b[idx];

    return c;
}
/*
 * Driver for the OpenMP version.
 *
 * Allocates three float arrays of N elements, fills a with 1.0 and b with
 * 2.0, calls Vector_Add from inside a parallel region while timing it with
 * omp_get_wtime(), verifies every element of c against a[i] + b[i] within
 * MAX_ERR, and releases the memory.
 * Returns EXIT_FAILURE if any allocation fails, 0 otherwise.
 */
int main()
{
    // Initialize the variables
    float *a, *b, *c;

    // Allocate the memory
    a = (float*)malloc(sizeof(float) * N);
    b = (float*)malloc(sizeof(float) * N);
    c = (float*)malloc(sizeof(float) * N);

    // malloc returns NULL on failure; stop before any array is dereferenced.
    // free(NULL) is a no-op, so releasing all three pointers is safe here.
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "Memory allocation failed\n");
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    // Initialize the arrays
    for(int i = 0; i < N; i++)
    {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    double start = omp_get_wtime();

    // Every thread of the team calls Vector_Add; the loop directive inside
    // it is what divides the iterations among them.
    #pragma omp parallel
    {
        // Executing vector addition function
        Vector_Add(a, b, c, N);
    }

    double end = omp_get_wtime();
    printf("Work took %f seconds\n", end - start);

    // Verification: every element of c must match a[i] + b[i] within MAX_ERR
    for(int i = 0; i < N; i++)
    {
        assert(fabs(c[i] - a[i] - b[i]) < MAX_ERR);
    }
    printf("c[0] = %f\n", c[0]);
    printf("PASSED\n");

    // Deallocate the memory
    free(a);
    free(b);
    free(c);
    return 0;
}
module Vector_Addition_Mod
  implicit none
contains
  ! Element-wise vector addition with an OpenMP `do simd` loop directive:
  ! c(i) = a(i) + b(i) for i = 1..n.
  ! The directive is orphaned: this routine opens no parallel region of its
  ! own, so the caller is expected to provide one. No `use omp_lib` is
  ! needed here because only directives, not runtime routines, are used.
  subroutine Vector_Addition(a, b, c, n)
    ! Input vectors
    real(8), intent(in), dimension(:) :: a
    real(8), intent(in), dimension(:) :: b
    ! Output vector
    real(8), intent(out), dimension(:) :: c
    ! Number of leading elements to add (read-only in this routine)
    integer, intent(in) :: n
    ! Loop index
    integer :: i
    !$omp do simd
    do i = 1, n
      c(i) = a(i) + b(i)
    end do
    !$omp end do simd
  end subroutine Vector_Addition
end module Vector_Addition_Mod
! Driver for the OpenMP version: reads the vector size, fills
! a(i) = sin(i)^2 and b(i) = cos(i)^2, calls Vector_Addition from inside a
! parallel region while timing it with omp_get_wtime, and verifies c.
program main
  use Vector_Addition_Mod, only: Vector_Addition
  ! omp_get_wtime must be imported here: under implicit none it is
  ! otherwise an undeclared function in this program unit
  use omp_lib, only: omp_get_wtime
  implicit none
  ! Largest accepted absolute difference between c(i) and a(i) + b(i)
  real(8), parameter :: tol = 1.0d-12
  ! Input vectors
  real(8), dimension(:), allocatable :: a
  real(8), dimension(:), allocatable :: b
  ! Output vector
  real(8), dimension(:), allocatable :: c
  ! Wall-clock time stamps taken around the parallel region
  double precision :: t_start, t_end
  integer :: n, i, ios
  logical :: passed

  print *, "This program does the addition of two vectors "
  print *, "Please specify the vector size = "
  read (*, *, iostat=ios) n
  ! Reject unreadable input and non-positive sizes before allocating
  if (ios /= 0 .or. n < 1) then
    print *, "Invalid vector size"
    error stop 1
  end if

  ! Allocate memory for vector
  allocate(a(n))
  allocate(b(n))
  allocate(c(n))

  ! Initialize content of input vectors,
  ! vector a[i] = sin(i)^2 vector b[i] = cos(i)^2
  do i = 1, n
    a(i) = sin(i*1D0) * sin(i*1D0)
    b(i) = cos(i*1D0) * cos(i*1D0)
  enddo

  t_start = omp_get_wtime()
  ! Every thread of the team calls Vector_Addition; the loop directive
  ! inside it is what divides the iterations among them
  !$omp parallel
  ! Call the vector addition subroutine
  call Vector_Addition(a, b, c, n)
  !$omp end parallel
  t_end = omp_get_wtime()
  print *, "Work took", t_end - t_start, "seconds"

  ! Verification: abs() wraps the difference only, and the comparison
  ! uses a tolerance instead of exact equality on reals
  passed = .true.
  do i = 1, n
    if (abs(c(i) - (a(i) + b(i))) > tol) then
      print *, "FAIL"
      passed = .false.
      exit
    endif
  enddo
  ! Report success only when no element failed
  if (passed) print *, "PASS"

  ! Delete the memory
  deallocate(a)
  deallocate(b)
  deallocate(c)
end program main
- Please try the examples without simd in the directive (i.e. with a plain "omp for" in C or "!$omp do" in Fortran). Do you notice any performance difference?
Critical, Single, and Master
We will explore how the single, master, and critical constructs work in the OpenMP programming model. For this, we consider the following simple example.
Examples and Question: Critical, Single and Master
#include<iostream>
#include<omp.h>
#include <omp.h>
using namespace std;
// Prints a greeting before the parallel region, then one greeting per
// thread inside it (thread id and team size), then a closing line after it.
int main()
{
    // Serial part: the greeting followed by an empty line
    std::cout << "Hello world from the master thread " << std::endl << std::endl;

    // creating the parallel region (with N number of threads)
    #pragma omp parallel
    {
        const int thread_id = omp_get_thread_num();
        const int team_size = omp_get_num_threads();

        std::cout << "Hello world from thread id " << thread_id
                  << " from the team size of " << team_size
                  << std::endl;
    } // parallel region is closed

    // Serial part again: an empty line and the closing message
    std::cout << std::endl
              << "end of the programme from the master thread" << std::endl;

    return 0;
}
- Try the single construct (#pragma omp single)
- Try the master construct (#pragma omp master)
- Try the critical construct (#pragma omp critical)