Using a CUDA GPU to do Jacobi Iterations

25 Jan 2019

Download Source Files Here (tar.gz)

In this post we will look at using CUDA programming to get a GPU to compute Jacobi iterations for approximations to harmonic functions satisfying Dirichlet boundary conditions (i.e. specifying the values of the function along the boundary). Our domain will simply be the unit square in the xy-plane. Using a GPU will allow us to update every point of the discretization grid in parallel within each iteration.
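Recall that for the discrete Laplace equation on a uniform grid, a Jacobi iteration just replaces each interior value with the average of its four neighbors,

$$ u^{(k+1)}_{i,j} = \tfrac{1}{4}\left( u^{(k)}_{i-1,j} + u^{(k)}_{i+1,j} + u^{(k)}_{i,j-1} + u^{(k)}_{i,j+1} \right), $$

while the boundary values stay fixed at the Dirichlet data. Since every interior point is updated independently of the others, the iteration maps naturally onto one GPU thread per grid point.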

As an example, here are the results for a PDE discretization grid of size 20x20.

[Figures: Values surface plot and Values heatmap]

Let’s take a look at the errors and the base-10 logarithms of the relative errors.

[Figures: Errors heatmap and Log10 relative errors heatmap]

For this post we will be making use of some software in addition to a C++ compiler: the NVIDIA CUDA compiler nvcc, GNUPlot to make the graphs, and Bash to run the graphing script.

Building and Running the Code

We can simply use our makefile to build the CUDA C++ code.

example.exe : example.cu jacobi.obj
	nvcc example.cu jacobi.obj -o example.exe

jacobi.obj : jacobi.cu jacobi.cuh
	nvcc -dc jacobi.cu

.PHONY: cleangraphs
cleangraphs:
	rm graphs/*.svg
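Here the -dc flag tells nvcc to compile jacobi.cu into an object file containing relocatable device code, so that the device functions it defines can be linked together with example.cu in the final nvcc step.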

Our directory structure will be a main directory holding the source files (example.cu, jacobi.cu, jacobi.cuh), the makefile, and the scripts (fromFile.p and makeGraphs.sh), together with a data/ subdirectory for the binary output files and a graphs/ subdirectory for the SVG graphs.

To use it, we simply run make from a terminal in the main directory.

To use CUDA to find the approximate function on a 20x20 grid, we simply need to run the following from the main directory.

example.exe 20

We get the following output:

Making initial values and true values
Before Average Error = 0.00788951
Copying to Device
Doing Jacobi Iterations
Copying result to values
Copying to file 'values.dat'
Now getting errors
After Average Error = 0.000176011
Now getting relative errors

Then, to make the graphs, we need to run the Bash script:

sh makeGraphs.sh 20

Reviewing example.cu

Now let’s review example.cu. First let’s include headers we will need.

/*! example.cu
 *
 * Example to compute Jacobi iterations for specific size of grid discretization.
 *
 * \author Matthew McGonagle
 */

#include <iostream>
#include "jacobi.cuh"
#include <cuda_runtime.h>
#include <cmath>
#include <string>

Next, let’s define the harmonic function that we will use to assign our boundary values and check the validity of our results.

//! Harmonic function of x and y is used to compute true values and the boundary values. 
__host__
float getHarmonic(float x, float y) 
{
    //! Real Part ((z - 0.5 - 0.5i)^5) = (x - 0.5)^5 - 10 (x - 0.5)^3 (y - 0.5)^2 + 5 (x - 0.5) (y - 0.5)^4.

    x -= 0.5;
    y -= 0.5;
    return pow(x, 5) - 10 * pow(x, 3) * pow(y, 2) + 5 * pow(x, 1) * pow(y, 4);
}
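As a quick sanity check, one could verify numerically that this function is harmonic by confirming that its five-point discrete Laplacian is close to zero; here is a minimal sketch, using a hypothetical host-only helper looksHarmonic that is not part of the source files.

// Hypothetical host-side sanity check (not in the original source): the five-point
// discrete Laplacian of a harmonic function should be close to zero at interior points.
__host__
bool looksHarmonic(float x, float y, float h = 1e-2f)
{
    float laplacian = getHarmonic(x - h, y) + getHarmonic(x + h, y)
                    + getHarmonic(x, y - h) + getHarmonic(x, y + h)
                    - 4.0f * getHarmonic(x, y);

    // Divide by h^2 and compare to a rough tolerance; the truncation error is O(h^2).
    return std::abs(laplacian / (h * h)) < 1e-2f;
}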

Next, let’s deal with the parameters sent to main().

/*! Compute order of N^2 Jacobi iterations for harmonic solution on xy unit square for boundary values where
 *      we divide the square into an NxN grid; save the results to file. 
 * 
 *  The number N is sent to the executable as a string and as the first and only parameter. The default value
 *      is 20 if no parameter is given. Also we require N > 1. 
 */
int main(int argc, char * argv[]) 
{
    // First get the dimensions from command line arguments.

    int N;
    if(argc < 2) // Default value when no parameters are given.
        N = 20;
    else { 
        N = std::stoi(argv[1]);
        if( N < 2) // Use the default for invalid values of N.
            N = 20;
    }
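One thing to be aware of (not handled in the original code) is that std::stoi throws an exception when the argument isn't a number at all. A minimal sketch of a more defensive version of the parsing, falling back to the default in that case, could look like this:

    // Sketch of a more defensive version of the argument parsing (not in the source files).
    int N = 20;                        // Default grid size.
    if (argc >= 2)
    {
        try { N = std::stoi(argv[1]); }
        catch (...) { N = 20; }        // Non-numeric argument; keep the default.
        if (N < 2)                     // Also require N > 1.
            N = 20;
    }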

Next, let’s define the variables we will need.

    int nIterations = 3 * N * N, // For good convergence, the number of iterations should be on the order of the number of grid points (about N^2).
        dimensions[2] = {N, N}, // The dimensions of the grid to approximate PDE (not the CUDA execution grid).
        nThreads = N / 10 + 1, // Number of CUDA threads per CUDA block dimension. 
        memSize = dimensions[0] * dimensions[1] * sizeof(float);
    const float lowerLeft[2] = {0, 0},  // Lower left coordinate of rectangular domain.
                upperRight[2] = {1, 1}; // Upper right coordinate of rectangular domain.
    //! We use flat arrays, because CUDA uses flat arrays.
    float * values, * trueValues, * in, * out, * errors, * relErrors;
    const dim3 blockSize( nThreads , nThreads), // The size of CUDA block of threads.
               gridSize( (dimensions[0] + nThreads - 1) / nThreads, (dimensions[1] + nThreads - 1) / nThreads);
               /*  The number of blocks in the CUDA execution grid; make sure there are at least
                *  enough threads to have one for each point in our differential equation
                *  discretization grid. There may be extra threads that do nothing.
                */
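For example, with N = 20 we get nThreads = 20 / 10 + 1 = 3, so each CUDA block is 3x3 threads and the execution grid is (20 + 2) / 3 = 7 blocks in each direction. That is 21x21 threads in total, which is enough to cover the 20x20 discretization grid; the single extra row and column of threads simply return without doing any work.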

Next, let’s set up the initial values (including the boundary values), find the true values, and compute the initial average error.

    std::cout << "Making initial values and true values" << std::endl;
    // Initial values includes boundary values.
    values = makeInitialValues( dimensions, lowerLeft, upperRight, & getHarmonic ); 

    // Find the true values of harmonic function using the boundary values function.
    trueValues = makeTrueValues( dimensions, lowerLeft, upperRight, & getHarmonic );

    std::cout << "Before Average Error = " 
              << getAverageError(values, trueValues, dimensions) //dimensions[0], dimensions[1]) 
              << std::endl;

Next, we need to copy the initial values to the CUDA device.

    // Need to copy values from host to CUDA device.
    std::cout << "Copying to Device" << std::endl;
    try 
    {
        copyToDevice(values, dimensions, &in, &out);
    }
    catch( ... )
    {
        std::cout << "Exception happened while copying to device" << std::endl;
        return 1;
    }

Next, the host launches the CUDA kernel once for each iteration. We synchronize after each launch so that we can check for errors before swapping the input and output pointers.

    // At end of loop, output is inside pointer *in.

    std::cout << "Doing Jacobi Iterations" << std::endl;
    for( int i = 0; i < nIterations; i++)
    {
        // Call the CUDA device kernel to do a Jacobi iteration. 
        doJacobiIteration<<< gridSize, blockSize >>>(dimensions[0], dimensions[1], in, out);
        cudaDeviceSynchronize();
        if(cudaGetLastError() != cudaSuccess)
        {
            std::cout << "Error Launching Kernel" << std::endl;
            return 1;
        }
        std::swap(in, out);
    }
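If you are curious how long the iterations take on the device, one option (not part of the original code) is to bracket the loop with CUDA events; a minimal sketch:

    // Hypothetical timing of the iteration loop with CUDA events (not in the source files).
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    // ... the Jacobi iteration loop from above ...
    cudaEventRecord(stop);

    cudaEventSynchronize(stop); // Wait until the stop event has actually been recorded.
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "Iterations took " << milliseconds << " ms" << std::endl;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);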

Next, let’s get the results and save the relevant data to file.

    // Get the result from the CUDA device.
    std::cout << "Copying result to values" << std::endl;
    if(cudaMemcpy( values, in, memSize, cudaMemcpyDeviceToHost ) != cudaSuccess) 
    {
        std::cout << "There was a problem retrieving the result from the device" << std::endl;
        return 1;    
    }

    // Now compute errors and save important data to file.

    std::cout << "Copying to file 'values.dat'" << std::endl;
    saveToFile( values, dimensions, lowerLeft, upperRight, "data/values.dat");

    std::cout << "Now getting errors" << std::endl;
    errors = getErrors(values, trueValues, dimensions);
    saveToFile( errors, dimensions, lowerLeft, upperRight, "data/errors.dat");
    std::cout << "After Average Error = " 
              << getAverageError(values, trueValues, dimensions)
              << std::endl;

    std::cout << "Now getting relative errors" << std::endl;
    relErrors = getRelativeErrors(errors, trueValues, dimensions);
    saveToFile( relErrors, dimensions, lowerLeft, upperRight, "data/log10RelErrors.dat");

Finally let’s clean up and exit.

    // Clean up memory.

    cudaFree(in); // First clean up on CUDA device.
    cudaFree(out);
    delete[] values; // Now clean up on host.
    delete[] errors;
    delete[] relErrors;
    delete[] trueValues;

    return 0;
}

Graphing the Results

We will use the GNUPlot script fromFile.p and the Bash script makeGraphs.sh. First, let’s take a look at the GNUPlot script.

# fromFile.p
# Make 2d surface graph and 2d heatmap graph
# parameters
# ----------
# filename The input file to use for the graph. It should be an array of triples
#          in the form of (x-coordinate, y-coordinate, z-value).
#
# outname  The output file name.
#  
# dimX The dimension of the x-coordinate.
#
# dimY The dimension of the y-coordinate.

set terminal svg size 400,200 dynamic

set title outname." Heat Plot"
set output 'graphs/'.outname.'_heat_'.dimX.'_'.dimY.'.svg'
plot filename binary record=(dimX,dimY) scan=yx with image notitle 

set title outname." Surface Plot"
set grid
set hidden3d
set output 'graphs/'.outname.'_surface_'.dimX.'_'.dimY.'.svg'
splot filename binary record=(dimX,dimY) scan=yx with lines notitle 

Next, we use a Bash script to run the GNUPlot script on each data file.

# makeGraphs.sh
# Use GNUPlot to make graphs for existing data files from Jacobi iteration.
# Parameter $1 The side length of the square PDE discretization grid.

if [ "$1" == "" ]
then
    dimX=20
    dimY=20
else
    dimX=$1
    dimY=$1
fi

echo "Graphing data/values.dat" 
gnuplot -e "filename = 'data/values.dat'" -e "outname = 'Values'" \
        -e "dimX = '$dimX'" -e "dimY = '$dimY'" fromFile.p
echo "Graphing data/errors.dat"
gnuplot -e "filename = 'data/errors.dat'" -e "outname = 'Errors'" \
        -e "dimX = '$dimX'" -e "dimY = '$dimY'" fromFile.p
echo "Graphing data/log10RelErrors.dat"
gnuplot -e "filename = 'data/log10RelErrors.dat'" -e "outname = 'Log10RelErrors'" \
        -e "dimX = '$dimX'" -e "dimY = '$dimY'" fromFile.p

This makes graphs including those at the beginning of this post.

Reviewing jacobi.cuh

Next, let’s take a look at the functions we used in jacobi.cuh.

/*! jacobi.cuh
 * 
 * Functions for handling Jacobi iterations on CUDA device.
 * 
 * \author Matthew McGonagle 
 */

#pragma once

//! Type for pointer to function for computing boundary values.
typedef float (*harmonic)(float, float);

/*! Compute a single Jacobi iteration for a single point on the PDE discretization grid.
 *
 * Compute a single Jacobi iteration for harmonic function (and Dirichlet boundary conditions).
 *
 * \param dimX The x-dimension of the PDE discretization grid.
 * \param dimY The y-dimension of the PDE discretization grid.
 * \param in Pointer to input values of iteration.
 * \param out Pointer to output values of iteration. 
 */
__global__ 
void doJacobiIteration(int dimX, int dimY, float * in, float * out);

/*! Copy values array from host to CUDA device.
 * 
 * \param values Pointer to values array.
 * \param dimensions Holds the x and y dimensions of the PDE discretization grid.
 * \param in Will hold value pointing to location of flat input array allocated on CUDA device.
 * \param out Will hold value pointing to location of flat output array allocated on CUDA device. 
 */
__host__
void copyToDevice(float * values, const int dimensions[2], float ** in, float ** out);

/*! Fill in the boundary values for flat array of grid values.
 * 
 * \param values Pointer to flat array to hold values.
 * \param dimensions Holds x and y dimensions of the PDE discretization grid.
 * \param lowerLeft The lower left corner coordinates of the xy-rectangular domain.
 * \param upperRight The upper right corner coordinates of the xy-rectangular domain. 
 * \param f The function to give the boundary values.
 */
__host__
void setBoundaryValues(float * values, const int dimensions[2], const float lowerLeft[2], const float upperRight[2], harmonic f);

/*! Set the initial values for the iteration process (including boundary values).
 *
 * \param dimensions Holds x and y dimensions of the PDE discretization grid.
 * \param lowerLeft The lower left corner coordinates of the rectangular xy-domain.
 * \param upperRight The upper right corner coordinates of the rectangular xy-domain.
 * \param f The function for the boundary values.
 */
__host__
float * makeInitialValues(const int dimensions[2], const float lowerLeft[2], const float upperRight[2], harmonic f);

/*! Set the true values of the solution.
 *
 * \param dimensions Holds x and y dimensions of the PDE discretization grid.
 * \param lowerLeft The lower left corner coordinates of the rectangular xy-domain.
 * \param upperRight The upper right corner coordinates of the rectangular xy-domain.
 * \param f The true values function.
 */
__host__
float * makeTrueValues(const int dimensions[2], const float lowerLeft[2], const float upperRight[2], harmonic f);

/*! Get the errors between the approximate solution and the true values.
 * 
 * \param values The values of the approximation.
 * \param trueValues The values of the true harmonic function.
 * \param dimensions The xy-dimensions of the PDE discretization grid.
 */
__host__
float * getErrors(const float * values, const float * trueValues, const int dimensions[2]);

/*! Get the logarithm of the relative errors between the approximation and true harmonic function.
 * 
 * \param errors The non-relative errors.
 * \param trueValues The true values of the harmonic function.
 * \param dimensions The xy-dimensions of the PDE discretization grid.
 * \param cutOff Absolute true values smaller than the cutOff are rounded up to the cutOff. The
                 relative error is also rounded up to the cutOff to avoid taking the logarithm of 0.
 */
__host__
float * getRelativeErrors(const float * errors, const float * trueValues, const int dimensions[2], float cutOff = 0.00001);

/*! Get the average absolute error.
 * 
 * \param values The values of the approximation.
 * \param trueValues The values of the true harmonic function.
 * \param dimensions The xy-dimensions of the PDE discretization grid.
 */
__host__
float getAverageError(const float * values, const float * trueValues, const int dimensions[2]); //const int dimX, const int dimY );

/*! Print values to standard output. Best to use for small dimensions.
 *
 * \param dimensions The xy-dimensions of the PDE discretization grid.
 * \param values The values to print.  
 */
__host__ 
void printValues(const int dimensions[2], const float * values);

/*! Save values of flat array to file as a binary file.
 *
 * The values are saved as an array of triples of the form (x-coordinate, y-coordinate, value). This
 * format is meant to be compatible with GNUPlot.
 *  
 * \param values The values to save to file.
 * \param dimensions The xy-dimensions of the PDE discretization grid.
 * \param  lowerLeft The lower left corner coordinates of the rectangular xy-domain.
 * \param upperRight The upper right corner coordinates of the rectangular xy-domain.
 * \param filename File name to save to. 
 */
__host__
void saveToFile(const float * values, const int dimensions[2], const float lowerLeft[2], const float upperRight[2],
                const char * filename);

Reviewing jacobi.cu

Next, let’s see how the functions are implemented.

/*! jacobi.cu
 */

#include "jacobi.cuh"
#include <iostream>
#include <fstream>

__global__
void doJacobiIteration(int dimX, int dimY, float * in, float * out)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x,
              j = blockIdx.y * blockDim.y + threadIdx.y;
    const int offset = i * dimY + j;

    // Remember to do nothing for boundary values.

    if( i < 1 || i > dimX - 2 ) 
        return;

    if( j < 1 || j > dimY - 2 )
        return;

    out += offset; 
    in += offset;

    // A Jacobi iteration for a harmonic function means the output is the average of the neighboring points in the grid.

    *out = *(in - 1) * 0.25 +
           *(in + 1) * 0.25 + 
           *(in - dimY) * 0.25 + 
           *(in + dimY) * 0.25;
}
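
// Note on indexing: since offset = i * dimY + j, the pointer arithmetic above is exactly
// the neighbor average from the introduction. With explicit indices it reads
//
//     out[i * dimY + j] = 0.25f * ( in[i * dimY + (j - 1)] + in[i * dimY + (j + 1)]
//                                 + in[(i - 1) * dimY + j] + in[(i + 1) * dimY + j] );
//
// so "in - 1" and "in + 1" are the neighbors in the y-direction, while "in - dimY" and
// "in + dimY" are the neighbors in the x-direction.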

__host__
void copyToDevice(float * values, const int dimensions[2], float ** in, float ** out)
{
    const int memSize = dimensions[0] * dimensions[1] * sizeof(float); 

    if (cudaMalloc( in, memSize ) != cudaSuccess)
        throw "Can't allocate in on device.";

    if (cudaMalloc( out, memSize ) != cudaSuccess)
        throw "Can't allocate out on device.";

    if(cudaMemcpy( *in, values, memSize, cudaMemcpyHostToDevice ) != cudaSuccess)
        throw "Can't copy values to in on device.";

    if(cudaMemcpy( *out, values, memSize, cudaMemcpyHostToDevice ) != cudaSuccess)
        throw "Can't copy values to out on device.";
 
}
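
// If more detail is wanted when an allocation or copy fails, cudaGetErrorString() can turn
// the returned cudaError_t into a readable message, e.g. (a sketch, not used above):
//
//     cudaError_t status = cudaMalloc(in, memSize);
//     if (status != cudaSuccess)
//         std::cerr << "cudaMalloc failed: " << cudaGetErrorString(status) << std::endl;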

__host__
void setBoundaryValues(float * values, const int dimensions[2], const float lowerLeft[2], const float upperRight[2], harmonic f)
{
    float stride[2], pos;
    int i, last[2] = {dimensions[0] - 1, dimensions[1] - 1};
    float * memPos1, * memPos2; 

    for (i = 0; i < 2; i++)
        stride[i] = (upperRight[i] - lowerLeft[i]) / last[i];

    // Fill in top and bottom.

    memPos1 = values;
    memPos2 = values + (dimensions[1]-1);
    for (i = 0, pos = lowerLeft[0]; i < dimensions[0]; i++, pos += stride[0], memPos1+=dimensions[1], memPos2+=dimensions[1])
    {
        *memPos1 = f(pos, lowerLeft[1]); 
        *memPos2 = f(pos, upperRight[1]);
    }

    // Fill in sides.

    memPos1 = values + 1;
    memPos2 = values + (dimensions[0] - 1) * dimensions[1] + 1;

    for (i = 0, pos = lowerLeft[1]+stride[1]; i < dimensions[1] - 2; i++, pos += stride[1], memPos1++ , memPos2++ )
    {
        *memPos1 = f(lowerLeft[0], pos);
        *memPos2 = f(upperRight[0], pos);
    }
}
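
// To see which entries of the flat array these two loops touch, consider a hypothetical
// 4x4 grid, where offset = i * dimY + j:
//
//           j=0  j=1  j=2  j=3
//     i=0     0    1    2    3
//     i=1     4    5    6    7
//     i=2     8    9   10   11
//     i=3    12   13   14   15
//
// The "top and bottom" loop writes offsets 0, 4, 8, 12 and 3, 7, 11, 15 (the j = 0 and
// j = dimY - 1 edges), stepping by dimensions[1] each iteration. The "sides" loop writes
// offsets 1, 2 and 13, 14 (the interior of the i = 0 and i = dimX - 1 edges).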

__host__
float * makeInitialValues( const int dimensions[2], const float lowerLeft[2], const float upperRight[2], harmonic f )
{
    float * values = new float[dimensions[0] * dimensions[1]],
          * rowPos = values,
          * colPos; 

    // We don't do anything for boundary values yet.

    rowPos = values + dimensions[1];
    for (int i = 0; i < dimensions[0] - 2; i++, rowPos += dimensions[1])
    {
        colPos = rowPos + 1;
        for (int j = 0; j < dimensions[1] - 2; j++, colPos++)
            *colPos = 0;    
    }
    setBoundaryValues( values, dimensions, lowerLeft, upperRight, f );

    return values;
}

__host__
float * makeTrueValues(const int dimensions[2], const float lowerLeft[2], const float upperRight[2], harmonic f)
{
    float *values = new float[dimensions[0] * dimensions[1]],
          *rowPosition = values,
          *colPosition; 

    float stride[2] {(upperRight[0] - lowerLeft[0]) / static_cast<float>(dimensions[0] - 1),
                     (upperRight[1] - lowerLeft[1]) / static_cast<float>(dimensions[1] - 1) }; 

    int i, j;
    float x, y;

    for (i = 0, x = lowerLeft[0]; i < dimensions[0]; i++, x += stride[0], rowPosition += dimensions[1]) 
    {
        colPosition = rowPosition;
        for (j = 0, y = lowerLeft[1]; j < dimensions[1] ; j++, y += stride[1], colPosition++)
            *colPosition = f(x, y);    
    }

    return values;

}

__host__
float * getErrors(const float * values, const float * trueValues, const int dimensions[2])
{

    float * errors = new float[dimensions[0] * dimensions[1]];
    unsigned int position = 0;
    for ( int i = 0; i < dimensions[0]; i++)
    {
        for (int j = 0; j < dimensions[1]; j++, position++)
            errors[position] = values[position] - trueValues[position];
    }

    return errors;
}

__host__
float * getRelativeErrors(const float * errors, const float * trueValues, const int dimensions[2], float cutOff)
{
    float * relErrors = new float[dimensions[0] * dimensions[1]], * newError;

    float absError, absTrue;
    const float log10 = std::log(10);

    newError = relErrors;
    for(int i = 0; i < dimensions[0]; i++)
    {
        for(int j = 0; j < dimensions[1]; j++, newError++, errors++, trueValues++)
        {
            absError = std::abs(*errors);
            absTrue = std::abs(*trueValues);

            // Use a cutoff as a work around to dividing by 0.
            if (absTrue < cutOff)
                absTrue = cutOff;

            // Now use cutoff to work around logarithm of 0.
            if (absError / absTrue < cutOff)
                *newError = std::log(cutOff) / log10;
            else
                *newError = std::log(absError / absTrue) / log10; 
        }
    }  

    return relErrors;
}
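
// Worked example of the cutoff: with the default cutOff of 0.00001, a point where the true
// value is 0.000002 and the error is 0.0001 has absTrue rounded up to 0.00001, so the
// relative error is 0.0001 / 0.00001 = 10 and the stored value is log10(10) = 1.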

__host__
float getAverageError(const float * values, const float * trueValues, const int dimensions[2]) //dimX, const int dimY )
{
    // Now get the average error.
    double error = 0;
    int offset;
    for (int i = 0; i < dimensions[0]; i++)
    {
        offset = i * dimensions[1];
        for (int j = 0; j < dimensions[1]; j++, offset++)
            error += std::abs(values[offset] - trueValues[offset]);
    }

    error /= dimensions[0] * dimensions[1];
    return static_cast<float>(error);
}

__host__
void printValues(const int dimensions[2], const float * values) 
{
    const float * pos = values;
    for (int i = 0; i < dimensions[0]; i++) 
    {
        for (int j = 0; j < dimensions[1]; j++, pos++)
            std::cout << *pos << ",\t";
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

__host__
void saveToFile(const float * values, const int dimensions[2], const float lowerLeft[2], const float upperRight[2],
const char * filename) 
{

   std::ofstream myFile(filename, std::ios::binary); 
   if(!myFile.is_open()) {
        throw "Unable to open file.";
   } 

   unsigned int sizeValues = dimensions[0] * dimensions[1] * sizeof(float);
   float * tuples = new float[dimensions[0] * dimensions[1] * 3], * coord;
   float position[2], skip[2];

   for(int i = 0; i < 2; i++)
   {
        position[i] = lowerLeft[i];
        skip[i] = (upperRight[i] - lowerLeft[i]) / (dimensions[i] - 1);
   }

   coord = tuples;
   for( int i = 0; i < dimensions[0]; i++, position[0] += skip[0])
   {
        position[1] = lowerLeft[1];
        for( int j = 0; j < dimensions[1]; j++, position[1] += skip[1], values++) 
        {
            *coord = position[0];
            coord++;
            *coord = position[1];
            coord++;
            *coord = *values; 
            coord++;
        }
   }

   myFile.write((const char *) tuples, 3 * sizeValues); 
   myFile.close();
   delete[] tuples;
}

Download Source Files Here (tar.gz)