blob: 5b0a9c078f819fc29a4226ce26265475697838f7 [file] [log] [blame]
#ifndef __CPU_KERNEL_H__
#define __CPU_KERNEL_H__
#include "device_interface.h"
#include "CPUID.hpp"
#include "Resource.hpp"
//#include <llvm/ExecutionEngine/GenericValue.h>
#include <vector>
#include <string>
#include <mutex>
#include <stdint.h>
#ifndef MAX_WORK_DIMS
#define MAX_WORK_DIMS 3
#endif
// Forward declaration only: this header refers to llvm::Function solely
// through pointers, so the full LLVM definition is not needed here.
namespace llvm { class Function; }
namespace Devices
{
// Forward declarations of types that the declarations below use only through
// pointers, so their full definitions are not required by this header.

// Image types taken by the image read/write built-ins.
class Image2D;
class Image3D;

// Device-agnostic kernel and kernel-event types.
class Kernel;
class KernelEvent;

// CPU device type.
class CPUDevice;
/**
 * \brief CPU kernel
 *
 * Holds passive information about a kernel (the \c Coal::Kernel object and
 * the device on which it is run) and provides the \c callFunction() function,
 * which is described at the end of \ref llvm .
 *
 * \see Coal::CPUKernelWorkGroup
 */
class CPUKernel : public DeviceKernel
{
    public:
        /**
         * \brief Constructor
         * \param device device on which the kernel will be run
         * \param kernel \c Coal::Kernel object holding information about
         *               this kernel
         * \param function \c llvm::Function to run
         */
        CPUKernel(CPUDevice *device,
                  Kernel *kernel,
                  llvm::Function *function);
        ~CPUKernel();

        // Work-group and memory-size queries.
        // NOTE(review): these presumably implement the DeviceKernel
        // interface, whose declaration is not visible from this header --
        // confirm against device_interface.h.
        size_t workGroupSize() const;
        cl_ulong localMemSize() const;
        cl_ulong privateMemSize() const;
        size_t preferredWorkGroupSizeMultiple() const;
        size_t guessWorkGroupSize(cl_uint num_dims,
                                  cl_uint dim,
                                  size_t global_work_size) const;

        /** \brief \c Coal::Kernel object this kernel will run */
        Kernel *kernel() const;

        /** \brief Device on which the kernel will be run */
        CPUDevice *device() const;

        /**
         * \brief \c llvm::Function representing the kernel but
         *        <strong>not to be run</strong>
         */
        llvm::Function *function() const;

        /**
         * \brief Stub function used to run the kernel, see \ref llvm
         */
        llvm::Function *callFunction();

        /**
         * \brief Calculate where to place a value in an array
         *
         * Given the size of a value, this function computes the properly
         * aligned position at which it must be stored in an array.
         *
         * It is meant to be called repeatedly, once for each value that
         * must be placed in the array, to obtain each aligned position.
         *
         * \code
         * size_t array_len = 0, array_offset = 0;
         * void *array;
         *
         * // First, get the array size given alignment constraints
         * typeOffset(array_len, sizeof(int));
         * typeOffset(array_len, sizeof(float));
         * typeOffset(array_len, sizeof(void *));
         *
         * // Then, allocate memory
         * array = malloc(array_len);
         *
         * // Finally, place the arguments
         * *(int *)((char *)array + typeOffset(array_offset, sizeof(int))) = 1337;
         * *(float *)((char *)array + typeOffset(array_offset, sizeof(float))) = 3.1415f;
         * *(void **)((char *)array + typeOffset(array_offset, sizeof(void *))) = array;
         * \endcode
         *
         * \param offset offset at which the value will be placed. This
         *               variable gets incremented by
         *               <tt>type_len + padding</tt>.
         * \param type_len size in bytes of the value that will be stored
         * \return offset at which the value will be stored (equal to
         *         \p offset before incrementation).
         */
        static size_t typeOffset(size_t &offset, size_t type_len);

    private:
        // Device and kernel this object describes (see device()/kernel()).
        CPUDevice *p_device;
        Kernel *p_kernel;

        // Kernel function proper (see function()).
        llvm::Function *p_function;

        // Stub used to run the kernel (see callFunction()).
        llvm::Function *p_call_function;

        // NOTE(review): judging by its name, this guards p_call_function;
        // ownership and locking discipline live in the implementation file
        // -- confirm there.
        sw::Resource *p_call_function_mutex;
};
class CPUKernelEvent;

/**
 * \brief CPU kernel work-group
 *
 * This class represents a bulk of work-items that will be run. It is the one
 * that actually runs the kernel for its elements.
 *
 * \see \ref llvm
 * \nosubgrouping
 */
class CPUKernelWorkGroup
{
    public:
        /**
         * \brief Constructor
         * \param kernel kernel to run
         * \param event event containing information about the kernel run
         * \param cpu_event CPU-specific information and cache about
         *                  \p event
         * \param work_group_index index of this work-group in the kernel
         */
        CPUKernelWorkGroup(CPUKernel *kernel,
                           KernelEvent *event,
                           CPUKernelEvent *cpu_event,
                           const size_t *work_group_index);
        ~CPUKernelWorkGroup();

        /**
         * \brief Build a structure of arguments
         *
         * C does not support calling a function whose argument list is
         * unknown at compilation time, so this function builds the list of
         * arguments in memory. That array is then handed to an LLVM stub
         * function which reads it and passes its values to the actual
         * kernel.
         *
         * \see \ref llvm
         * \param locals_to_free if this kernel takes \c __local arguments,
         *                       they must be \c malloc()'ed for every
         *                       work-group. They are placed in this vector
         *                       to be \c free()'ed at the end of \c run().
         * \return address of a memory location containing the arguments
         */
        void *callArgs(std::vector<void *> &locals_to_free);

        /**
         * \brief Run the work-group
         *
         * This function is the core of CPU-acceleration. It runs the
         * work-items of this work-group given the correct arguments.
         *
         * \see \ref llvm
         * \see \ref barrier
         * \see callArgs()
         * \return true if success, false in case of an error
         */
        bool run();

        /**
         * \name Native implementation of built-in OpenCL C functions
         * @{
         */
        size_t getGlobalId(cl_uint dimindx) const;
        cl_uint getWorkDim() const;
        size_t getGlobalSize(cl_uint dimindx) const;
        size_t getLocalSize(cl_uint dimindx) const;
        size_t getLocalID(cl_uint dimindx) const;
        size_t getNumGroups(cl_uint dimindx) const;
        size_t getGroupID(cl_uint dimindx) const;
        size_t getGlobalOffset(cl_uint dimindx) const;

        void barrier(unsigned int flags);

        void *getImageData(Image2D *image, int x, int y, int z) const;

        // Image writes: one overload per color element type.
        void writeImage(Image2D *image, int x, int y, int z,
                        float *color) const;
        void writeImage(Image2D *image, int x, int y, int z,
                        int32_t *color) const;
        void writeImage(Image2D *image, int x, int y, int z,
                        uint32_t *color) const;

        // Image reads with integer coordinates.
        void readImage(float *result, Image2D *image,
                       int x, int y, int z, uint32_t sampler) const;
        void readImage(int32_t *result, Image2D *image,
                       int x, int y, int z, uint32_t sampler) const;
        void readImage(uint32_t *result, Image2D *image,
                       int x, int y, int z, uint32_t sampler) const;

        // Image reads with floating-point coordinates.
        void readImage(float *result, Image2D *image,
                       float x, float y, float z, uint32_t sampler) const;
        void readImage(int32_t *result, Image2D *image,
                       float x, float y, float z, uint32_t sampler) const;
        void readImage(uint32_t *result, Image2D *image,
                       float x, float y, float z, uint32_t sampler) const;
        /**
         * @}
         */

        /**
         * \brief Function called when a built-in name cannot be found
         * \param name name of the built-in that was looked up
         */
        void builtinNotFound(const std::string &name) const;

    private:
        // Type-generic helpers.
        // NOTE(review): presumably the shared implementations behind the
        // writeImage()/readImage() overloads above -- confirm in the
        // implementation file.
        template<typename T>
        void writeImageImpl(Image2D *image, int x, int y, int z,
                            T *color) const;

        template<typename T>
        void readImageImplI(T *result, Image2D *image,
                            int x, int y, int z, uint32_t sampler) const;

        template<typename T>
        void readImageImplF(T *result, Image2D *image,
                            float x, float y, float z,
                            uint32_t sampler) const;

        // NOTE(review): linear3D/linear2D look like linear-filtering helpers
        // for the float-coordinate reads; the meaning of a/b/c and of the
        // i/j/k index pairs is defined in the implementation -- confirm.
        template<typename T>
        void linear3D(T *result, float a, float b, float c,
                      int i0, int j0, int k0,
                      int i1, int j1, int k1,
                      Image3D *image) const;

        template<typename T>
        void linear2D(T *result, float a, float b, float c,
                      int i0, int j0, int i1, int j1,
                      Image2D *image) const;

        // Kernel, CPU-specific event data and event this work-group is
        // associated with (same types as the constructor arguments).
        CPUKernel *p_kernel;
        CPUKernelEvent *p_cpu_event;
        KernelEvent *p_event;

        // Per-dimension bookkeeping, sized for up to MAX_WORK_DIMS
        // dimensions.
        cl_uint p_work_dim;
        size_t p_index[MAX_WORK_DIMS];
        size_t p_max_local_id[MAX_WORK_DIMS];
        size_t p_global_id_start_offset[MAX_WORK_DIMS];

        // Function pointer taking a single untyped argument, and an untyped
        // argument pointer.
        // NOTE(review): presumably the compiled stub and the block built by
        // callArgs() -- confirm.
        void (*p_kernel_func_addr)(void *);
        void *p_args;

        // Machinery to have barrier() working
        struct Context
        {
            size_t local_id[MAX_WORK_DIMS];
            //ucontext_t context;
            unsigned int initialized;
        };

        Context *getContextAddr(unsigned int index);

        Context *p_current_context;
        Context p_dummy_context;
        void *p_contexts;
        size_t p_stack_size;
        unsigned int p_num_work_items;
        unsigned int p_current_work_item;
        bool p_had_barrier;
};
/**
 * \brief CPU-specific information about a kernel event
 *
 * An object of this class is put in the device-data field of a
 * \c Coal::KernelEvent (see \c Coal::Event::setDeviceData()). It is
 * responsible for dispatching the \c Coal::CPUKernelWorkGroup objects between
 * the CPU worker threads.
 */
class CPUKernelEvent
{
    public:
        /**
         * \brief Constructor
         * \param device device running the kernel
         * \param event \c Coal::KernelEvent holding device-agnostic data
         *              about the event
         */
        CPUKernelEvent(CPUDevice *device, KernelEvent *event);
        ~CPUKernelEvent();

        /**
         * \brief The next Work Group that will execute will be the last.
         *        Locks the event
         */
        bool reserve();

        /** \brief All the work groups have finished */
        bool finished();

        /**
         * \brief Must be called exactly one time after reserve(). Unlocks
         *        the event
         */
        CPUKernelWorkGroup *takeInstance();

        /** \brief Return the cached kernel arguments */
        void *kernelArgs() const;

        /** \brief Cache pre-built kernel arguments */
        void cacheKernelArgs(void *args);

        /** \brief A work-group has just finished */
        void workGroupFinished();

    private:
        // Device and event, same types as the constructor arguments.
        CPUDevice *p_device;
        KernelEvent *p_event;

        // Per-dimension work-group bookkeeping, sized for up to
        // MAX_WORK_DIMS dimensions.
        size_t p_current_work_group[MAX_WORK_DIMS];
        size_t p_max_work_groups[MAX_WORK_DIMS];

        // Work-group counters.
        // NOTE(review): by their names, "handed out so far", "finished so
        // far" and "total" -- confirm in the implementation file.
        size_t p_current_wg;
        size_t p_finished_wg;
        size_t p_num_wg;

        // NOTE(review): presumably the lock that reserve() takes and
        // takeInstance() releases, per their descriptions -- confirm.
        std::mutex p_mutex;

        // Untyped pointer matching kernelArgs()/cacheKernelArgs().
        void *p_kernel_args;
};
}
#endif