OpenCL Minimum Setup

If you look at the sample code available for using OpenCL 1.2/2.0 from the primary vendors, you will notice that it is all very complicated, some of it is out of date (i.e. using deprecated functions) and some is difficult to get running.

The demos they provide do some clever things and definitely show you some good tricks, but they are far from a good entry point. Ideally a newcomer to GPGPU wants to be able to open a single-file, compilable demo. It should be simple and cover the basics only. Nothing more.

The user should be able to walk from the top of the file to the bottom without having to jump around and see the working pipeline in-order and clearly. Once they are comfortable with seeing the minimum and can have a play around, then you can show them more than just the top layer.

If those of us in the high-performance computing end of software development haven't learned anything (and sometimes I think we haven't learned anything) from JavaScript, GUI development tools and the rapid pace of the app development world, we should hopefully have at least learned that getting someone onto your platform and working quickly is the best way to keep them. A single hour to a basic understanding of how to use something is better than a whole day of stress to gain only slightly more.

But enough ranting. I have written a minimal code demo for OpenCL in this style. It lacks all the options, robustness, safety and control of the Intel samples - but it's basically ~100 lines of code instead of many thousands, and is enough to show the basic concepts and usage patterns.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
//Include the OpenCL Headers
// On Intel default directory is: C:\Intel\OpenCL\sdk\include\CL
// During installation of the Intel SDK it usually placed $(INTELOCLSDKROOT) into your
// system environment. So the project should have the default include directory of: $(INTELOCLSDKROOT)\include
// On AMD and NVIDIA this is different. When you grab the SDK from their site it will usually tell you,
// but it is mostly variations on this. Easiest way to find it if the site is confusing is to open a sample
// in visual studio and check the include directories listed in the project properties.
#include <CL/cl.h>

// Standard library to make some things easier.
#include <cstdio>
#include <vector>
#include <string>
#include <fstream>
#include <streambuf>

#define BUFFER_ENTRY_COUNT 256

int main()
{
    //The important objects for initialising and using OpenCL.
    cl_platform_id		platform	= 0;
    cl_device_id		device		= 0;
    cl_context			context		= 0;
    cl_command_queue	queue		= 0;

    //First get our platform-------------------
    {
        //Find which platforms are available
        cl_uint numPlatforms = 0;
        cl_int err = clGetPlatformIDs(0, 0, &numPlatforms);
        if (err != CL_SUCCESS || numPlatforms == 0) return 0;

        //Fetch the IDs and take our first one.
        std::vector<cl_platform_id> availablePlatforms(numPlatforms);
        err = clGetPlatformIDs(numPlatforms, &availablePlatforms[0], 0);
        if (err != CL_SUCCESS) return 0;
        platform = availablePlatforms[0];
    }

    //Now we need our device------------------
    {
        //You can specify if you want CPU/GPU or Accelerator here, but for simple getting going
        //we will just take any.
        cl_device_type  deviceType = CL_DEVICE_TYPE_ALL;

        // Same as above, get the number of devices before we fetch the information.
        cl_uint num_of_devices	= 0;
        cl_int err				= clGetDeviceIDs(platform, deviceType, 0, 0, &num_of_devices	);
        if (err != CL_SUCCESS || num_of_devices==0) return 0;

        //Fetch the ids and select the first one.
        std::vector<cl_device_id> deviceVector(num_of_devices);
        err	= clGetDeviceIDs(platform, deviceType, num_of_devices,	&deviceVector[0], 0);
        if (err != CL_SUCCESS) return 0;
        device = deviceVector[0];
    }

    //Create the context minimal code.
    {
        cl_context_properties contextProps[3];
        contextProps[0] = CL_CONTEXT_PLATFORM;
        contextProps[1] = cl_context_properties(platform);
        contextProps[2] = 0;

        cl_int err = 0;
        context = clCreateContext(&contextProps[0], 1, &device, 0, 0, &err);
        if (err != CL_SUCCESS) return 0;
    }

    //Create a Queue
    {
        cl_int err = 0;
        cl_command_queue_properties props= 0;
        queue = clCreateCommandQueueWithProperties(context, device, &props, &err);
        if (err != CL_SUCCESS) return 0;
    }

    std::string CLProgramFilename	= "./simpleprogram.cl";
    std::string CLProgramKernelName = "EmptyKernel";
    std::string CLProgramSource = "";
    cl_program  CLProgram = 0;
    cl_kernel   CLProgramKernel = 0;
    //Read program source code from file
    {
        std::ifstream file(CLProgramFilename);
        std::string temp;
        while (std::getline(file, temp)) 
        {
            CLProgramSource.append(temp);
        }
    }

    //Create Program from source
    {
        //Take the source and get the program
        cl_int err;
        const char* text = CLProgramSource.c_str();
        cl_program program = clCreateProgramWithSource(context, 1, &text, 0, &err);
        if (err != CL_SUCCESS) return 0;

        //Build it for your specified device.
        err = clBuildProgram(program, (cl_uint)1, &device, "", 0, 0);
        if (err != CL_SUCCESS) return 0;

        //Pull out the kernel(function) we want to use from the program.
        //Programs can have many kernels
        CLProgramKernel = clCreateKernel(program, CLProgramKernelName.c_str(), &err);
        if (err != CL_SUCCESS) return 0;
    }

    cl_mem outputBuffer = 0;
    cl_uint buffSize = BUFFER_ENTRY_COUNT * sizeof(cl_int);
    //Create an output Buffer
    {
        //We are creating a buffer here. The important flags are the CL_MEM_... ones.
        // In this example we say we want one that the kernel can only write and the CPU
        // can only request to read.
        // There are many options here and combining them in different ways has interesting performance effects.
        cl_int	err		= 0;
        outputBuffer	= clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, buffSize, NULL, &err);
        if (err != CL_SUCCESS) return 0;
    }

    //Run Kernel
    {
        //Set the buffer we write to. This maps to the index of the variable in the function in the kernel.
        cl_int err = clSetKernelArg(CLProgramKernel, 0, sizeof(cl_mem), (void *)&outputBuffer);

        //Global size is the total number of things we want to do.
        //Local size is the chunks we are breaking it into. If global not divisible by local
        //it will throw an error.
        cl_uint globalSize	= BUFFER_ENTRY_COUNT;
        cl_uint localSize	= 16;
        err = clEnqueueNDRangeKernel(queue, CLProgramKernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
        if (err != CL_SUCCESS) return 0;

        //Ensuring all the work is done before we copy out our buffer to check the kernel ran correctly.
        err = clFinish(queue);
        if (err != CL_SUCCESS) return 0;

        //Validate the output from our buffer
        cl_int ourOutput[BUFFER_ENTRY_COUNT];
        err = clEnqueueReadBuffer(queue, outputBuffer, CL_TRUE, 0, buffSize, ourOutput, 0, NULL, NULL);
        if (err != CL_SUCCESS) return 0;

        //Check the array has the magic number in it
        if (ourOutput[6] != 42)	return 0;
    }

    //Everything went well.
    return 0;
}

I hope this is useful to anyone trying to get started with OpenCL. I will do the same for Vulkan Compute and DirectX11/12 if I see this gets any traction.