/* * Created by Brett Terpstra 6920201 on 20/11/22. * Copyright (c) 2022 Brett Terpstra. All Rights Reserved. */ #ifndef STEP_3_CL_H #define STEP_3_CL_H // OpenCL includes #include #include #include #include #include namespace Raytracing { class CLProgram { private: cl_int m_CL_ERR{}; std::string m_source; cl_device_id m_deviceID{}; cl_context m_context{}; cl_command_queue m_commandQueue{}; cl_program m_program{}; std::unordered_map buffers; std::unordered_map kernels; /** * Checks for some basic errors after calling OpenCL commands. Stuff like GPU out of memory... etc. */ void checkBasicErrors() const; public: /** * Loads the shader from a file on class creation * @param file file to load OpenCL "shader" (code) file */ explicit CLProgram(const std::string& file); /** * Used by the OpenCL class to create a basic OpenCL program * @param context provided by the OpenCL class. * @param deviceID provided by the OpenCL class. */ void loadCLShader(cl_context context, cl_device_id deviceID); /** * Kernels are the entry points in OpenCL. You can have multiple of them in a single program. * @param kernelName both the name of the kernel function in the source and the reference to the kernel object used in other functions in this class. */ void createKernel(const std::string& kernelName); /** * Buffers are the quintessential datastructures in OpenCL. They are basically regions of memory allocated to a program. * @param bufferName the name of the buffer used to store internally * @param flags read write flags for the buffer. One of CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_READ_WRITE * @param bytes the number of bytes to be allocated. */ void createBuffer(const std::string& bufferName, cl_mem_flags flags, size_t bytes); /** * Creates a buffer on the GPU using the data pointed to by the supplied pointer. This copy happens as soon as this is called. * @param bufferName the name of the buffer used to store internally * @param flags One of CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_READ_WRITE * @param bytes the number of bytes to be allocated. Must be less than equal to the number of bytes at ptr * @param ptr the pointer to copy to the GPU. */ void createBuffer(const std::string& bufferName, cl_mem_flags flags, size_t bytes, void* ptr); /** * Creates a buffer on the GPU using the data pointed to by the supplied pointer. This copy happens as soon as this is called. * @param bufferName the name of the buffer used to store internally * @param flags One of CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_READ_WRITE * @param bytes the number of bytes to be allocated. Must be less than equal to the number of bytes at ptr * @param ptr the pointer to copy to the GPU. */ void createImage(const std::string& imageName, int width, int height); /** * Allows you to bind certain buffers to a specific index in the kernel's argument list. * @param kernel kernel to bind to * @param buffer buffer to bind to argIndex * @param argIndex the index of the argument for this buffer. */ void setKernelArgument(const std::string& kernel, const std::string& buffer, int argIndex); /** * Runs the kernel code on the GPU. Is blocking. * @param kernel kernel function name to call * @param globalWorkSize the total number of times to execute the kernel function code. Corresponds to the result of get_global_id * @param localWorkSize how many work items make up a work group to be executed by a kernel. 64 is recommended, must not exceed the printed value "device max workgroup size" * @param globalWorkOffset not used. can be used to set an offset to the result of get_global_id */ void runKernel(const std::string& kernel, size_t globalWorkSize, size_t localWorkSize, const size_t* globalWorkOffset = NULL); /** * Runs the kernel code on the GPU. Is blocking. * This version allows you to specify the number of work dimensions. * globalWorkSize and localWorkSize must be an array of workDim size which specify the work size for each kernel * For example a work dim of 2 allows for two separate work sizes to be set per dimension. * An image is two dimensional and so global work size would be {width of image, height of image} * and local work size would be size_t localWork = {8, 8} for a total of 64 (again recommended). * The resulting execution causes get_global_id(0) to run [0, width) times and get_global_id(1) to run [0, height) times * @param kernel kernel function name to call * @param globalWorkSize the total number of times to execute the kernel function code. Corresponds to the result of get_global_id(dim) * @param localWorkSize how many work items make up a work group to be executed by a kernel. total 64 is recommended, total must not exceed the printed value "device max workgroup size" * @param workDim number of dimensions to the work group being executed. * @param globalWorkOffset not used. can be used to set an offset to the result of get_global_id */ void runKernel( const std::string& kernel, size_t* globalWorkSize, size_t* localWorkSize, cl_uint workDim = 1, const size_t* globalWorkOffset = NULL ); /** * Enqueues a write command to the buffer specified by the buffer name, * @param buffer the buffer to write to * @param bytes the number of bytes to be copied * @param ptr the pointer to copy from. Must have at least bytes available * @param blocking should this function wait for the bytes to be uploaded to the GPU? * @param offset offset in the buffer object to write to */ void writeBuffer(const std::string& buffer, size_t bytes, void* ptr, cl_bool blocking = CL_TRUE, size_t offset = 0); /** * Enqueues a read command from the buffered specified by the buffer name. * Defaults to blocking but can be set to be non-blocking. * @param buffer buffer to read from * @param bytes the number of bytes to read. Make sure ptr has at least those bytes available. * @param ptr the ptr to write the read bytes to. * @param blocking should we wait for the read or do it async? * @param offset offset in the buffer to read from. */ void readBuffer(const std::string& buffer, size_t bytes, void* ptr, cl_bool blocking = CL_TRUE, size_t offset = 0); /** * Reads an image from the GPU into the memory region specified. Allocated memory region must be large enough to hold the image. * @param imageName name of the buffer to read from * @param width width of the image. Must be less than or equal to the width of the image on the GPU * @param height height of the image. Also must be less than or equal to the height of the image on the GPU * @param ptr pointer to the memory region to read into * @param blocking should we wait for the read operation to complete? Defaults to yes. * @param x x coordinate to start the read from. Defaults to zero since it's unlikely to be needed here. Included for possible future use. * @param y y coordinate to start the read from. */ void readImage( const std::string& imageName, size_t width, size_t height, void* ptr, cl_bool blocking = CL_TRUE, size_t x = 0, size_t y = 0 ); /** * Reads an image buffer into a RayCasting Image class. * Image supplied must have a with and height that matches the width and height of the image buffer specified by the name. * @param imageName name of the buffer you wish to read from * @param image reference to an image that you want the GPU data read into. */ void readImage(const std::string& imageName, Image& image); /** * Issues all previously queued OpenCL commands in a command-queue to the device associated with the command-queue. */ void flushCommands(); /** * Blocks until all previously queued OpenCL commands in a command-queue are issued to the associated device and have completed. */ void finishCommands(); ~CLProgram(); }; class OpenCL { private: cl_int m_CL_ERR; cl_uint m_numPlatforms; int m_activePlatform; cl_platform_id* m_platformIDs; cl_uint m_numOfPlatformIDs{}; cl_device_id m_deviceID{}; cl_uint m_numOfDevices{}; cl_uint m_computeUnits{}; cl_uint m_deviceClockFreq{}; cl_context m_context; /** * prints out the important info about the specified device. * @param device device to data dump */ void printDeviceInfo(cl_device_id device); public: /** * creates an opencl instance on the specified platform and device. Defaults to the first GPU device */ explicit OpenCL(int platformID = 0, int deviceID = 0); /** * Creates the global OpenCL instance for the engine */ static void init(); /** * Creates an OpenCL program object using the global OpenCL connection * @param program */ static void createCLProgram(CLProgram& program); /** * @return the number of compute units the device has */ static cl_uint activeDeviceComputeUnits(); /** * the frequency in megahertz of the device * @return */ static cl_uint activeDeviceFrequency(); ~OpenCL(); }; } #endif //STEP_3_CL_H