| //TODO: copyrights |
| |
| #include "kernel.h" |
| #include "device.h" |
| #include "buffer.h" |
| #include "program.h" |
| #include "builtins.h" |
| |
| #include "kernel.h" |
| #include "memobject.h" |
| #include "events.h" |
| #include "program.h" |
| |
| //#include <llvm/Function.h> |
| //#include <llvm/Constants.h> |
| //#include <llvm/Instructions.h> |
| //#include <llvm/LLVMContext.h> |
| //#include <llvm/Module.h> |
| //#include <llvm/ExecutionEngine/ExecutionEngine.h> |
| |
| #include <cstdlib> |
| #include <cstring> |
| #include <iostream> |
| |
| using namespace Devices; |
| |
| CPUKernel::CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function) |
| : DeviceKernel(), p_device(device), p_kernel(kernel), p_function(function), |
| p_call_function(0) |
| { |
| p_call_function_mutex = new sw::Resource(0); |
| } |
| |
| CPUKernel::~CPUKernel() |
| { |
| //TODO |
| //if(p_call_function) |
| //p_call_function->eraseFromParent(); |
| |
| p_call_function_mutex->lock(sw::DESTRUCT); |
| p_call_function_mutex->unlock(); |
| p_call_function_mutex->destruct(); |
| } |
| |
| size_t CPUKernel::workGroupSize() const |
| { |
| return 0; // TODO |
| } |
| |
| cl_ulong CPUKernel::localMemSize() const |
| { |
| return 0; // TODO |
| } |
| |
| cl_ulong CPUKernel::privateMemSize() const |
| { |
| return 0; // TODO |
| } |
| |
| size_t CPUKernel::preferredWorkGroupSizeMultiple() const |
| { |
| return 0; // TODO |
| } |
| |
| // Compute base^e by repeated multiplication (e.g. k_exp(3, 4) == 81) |
| template<typename T> |
| T k_exp(T base, unsigned int e) |
| { |
| if(e == 0) |
| return 1; |
| |
| T rs = base; |
| |
| for(unsigned int i = 1; i<e; ++i) |
| rs *= base; |
| |
| return rs; |
| } |
| |
| // Try to find the work-group size that executes fastest on the CPU. |
| size_t CPUKernel::guessWorkGroupSize(cl_uint num_dims, cl_uint dim, |
| size_t global_work_size) const |
| { |
| unsigned int cpus = sw::CPUID::coreCount(); |
| |
| // Don't split the range into overly small parts |
| if(k_exp(global_work_size, num_dims) > 64) |
| return global_work_size; |
| |
| // Find the smallest divisor of global_work_size that is >= cpus |
| unsigned int divisor = cpus ? cpus : 1; // At least 1, avoids a modulo by zero |
| |
| while(true) |
| { |
| if((global_work_size % divisor) == 0) |
| break; |
| |
| // Don't let the loop go up to global_work_size, the overhead would be |
| // too huge |
| if(divisor > global_work_size || divisor > cpus * 32) |
| { |
| divisor = 1; // Not parallel but has no CommandQueue overhead |
| break; |
| } |
| |
| divisor += 1; // Try the next candidate divisor |
| } |
| |
| // Return the size |
| return global_work_size / divisor; |
| } |
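| |
| // Worked example (illustrative, assuming sw::CPUID::coreCount() reports 4 |
| // cores): for num_dims == 1 and global_work_size == 20, k_exp(20, 1) == 20 |
| // is <= 64, so the divisor search starts at 4; 20 % 4 == 0 immediately and |
| // the suggested work-group size is 20 / 4 == 5, one group per core. A prime |
| // global_work_size larger than cpus * 32 never finds a divisor in range, so |
| // the fallback divisor of 1 keeps the whole range in a single work group. |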
| |
| llvm::Function *CPUKernel::function() const |
| { |
| return p_function; |
| } |
| |
| Kernel *CPUKernel::kernel() const |
| { |
| return p_kernel; |
| } |
| |
| CPUDevice *CPUKernel::device() const |
| { |
| return p_device; |
| } |
| |
| // From Wikipedia : http://www.wikipedia.org/wiki/Power_of_two#Algorithm_to_round_up_to_power_of_two |
| template <class T> |
| T next_power_of_two(T k) |
| { |
| if(k == 0) |
| return 1; |
| k--; |
| for(unsigned int i = 1; i<sizeof(T) * 8; i <<= 1) |
| k = k | k >> i; |
| return k + 1; |
| } |
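| |
| // Bit-smearing, step by step for k == 5: |
| //   k - 1             == 0b100 |
| //   after k |= k >> 1 == 0b110 |
| //   after k |= k >> 2 == 0b111 (further shifts change nothing) |
| //   + 1               == 0b1000 == 8 |
| // Values that are already powers of two map to themselves (e.g. 8 -> 8). |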
| |
| size_t CPUKernel::typeOffset(size_t &offset, size_t type_len) |
| { |
| size_t rs = offset; |
| |
| // Align offset to type_len (rounded up to a power of two) |
| type_len = next_power_of_two(type_len); |
| size_t mask = ~(type_len - 1); |
| |
| while((rs & mask) != rs) |
| rs++; |
| |
| // Where to try to place the next value |
| offset = rs + type_len; |
| |
| return rs; |
| } |
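| |
| // Illustrative packing trace: starting with offset == 0, placing an int |
| // (4 bytes), then a float* (8 bytes on x86-64), then a char (1 byte): |
| //   typeOffset(offset, 4) -> 0   (offset becomes 4) |
| //   typeOffset(offset, 8) -> 8   (4 rounds up to the 8-byte boundary, |
| //                                 offset becomes 16) |
| //   typeOffset(offset, 1) -> 16  (offset becomes 17) |
| // Each value lands aligned on its own power-of-two-rounded size, which the |
| // aligned loads emitted in callFunction() below rely on. |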
| |
| llvm::Function *CPUKernel::callFunction() |
| { |
| p_call_function_mutex->lock(sw::PUBLIC); |
| |
| // If we can reuse the same function between work groups, do it |
| if(p_call_function) |
| { |
| llvm::Function *rs = p_call_function; |
| p_call_function_mutex->unlock(); |
| |
| return rs; |
| } |
| |
| //TODO |
| /* Create a stub function in the form of |
| * |
| * void stub(void *args) { |
| * kernel(*(int *)((char *)args + 0), |
| * *(float **)((char *)args + sizeof(int)), |
| * *(sampler_t *)((char *)args + sizeof(int) + sizeof(float *))); |
| * } |
| * |
| * In LLVM, it is expressed as: |
| * |
| * @stub(i8* args) { |
| * kernel( |
| * load(i32* bitcast(i8* getelementptr(i8* args, i64 0), i32*)), |
| * load(float** bitcast(i8* getelementptr(i8* args, i64 4), float**)), |
| * ... |
| * ); |
| * } |
| */ |
| //llvm::FunctionType *kernel_function_type = p_function->getFunctionType(); |
| //llvm::FunctionType *stub_function_type = llvm::FunctionType::get( |
| // p_function->getReturnType(), |
| // llvm::Type::getInt8PtrTy( |
| // p_function->getContext()), |
| // false); |
| //llvm::Function *stub_function = llvm::Function::Create( |
| // stub_function_type, |
| // llvm::Function::InternalLinkage, |
| // "", |
| // p_function->getParent()); |
| |
| //// Insert a basic block |
| //llvm::BasicBlock *basic_block = llvm::BasicBlock::Create( |
| // p_function->getContext(), |
| // "", |
| // stub_function); |
| |
| //// Create the function arguments |
| //llvm::Argument &stub_arg = stub_function->getArgumentList().front(); |
| //llvm::SmallVector<llvm::Value *, 8> args; |
| //size_t args_offset = 0; |
| |
| //for(unsigned int i = 0; i<kernel_function_type->getNumParams(); ++i) |
| //{ |
| // llvm::Type *param_type = kernel_function_type->getParamType(i); |
| // llvm::Type *param_type_ptr = param_type->getPointerTo(); // We'll use pointers to the value |
| // const Kernel::Arg &arg = p_kernel->arg(i); |
| |
| // // Calculate the size of the arg |
| // size_t arg_size = arg.valueSize() * arg.vecDim(); |
| |
| // // Get where to place this argument |
| // size_t arg_offset = typeOffset(args_offset, arg_size); |
| |
| // // %1 = getelementptr(args, $arg_offset); |
| // llvm::Value *getelementptr = llvm::GetElementPtrInst::CreateInBounds( |
| // &stub_arg, |
| // llvm::ConstantInt::get(stub_function->getContext(), |
| // llvm::APInt(64, arg_offset)), |
| // "", |
| // basic_block); |
| |
| // // %2 = bitcast(%1, $param_type_ptr) |
| // llvm::Value *bitcast = new llvm::BitCastInst( |
| // getelementptr, |
| // param_type_ptr, |
| // "", |
| // basic_block); |
| |
| // // %3 = load(%2) |
| // llvm::Value *load = new llvm::LoadInst( |
| // bitcast, |
| // "", |
| // false, |
| // arg_size, // We ensure that an argument is always aligned on its size, it enables things like fast movaps |
| // basic_block); |
| |
| // // We have the value, send it to the function |
| // args.push_back(load); |
| //} |
| |
| //// Create the call instruction |
| //llvm::CallInst *call_inst = llvm::CallInst::Create( |
| // p_function, |
| // args, |
| // "", |
| // basic_block); |
| //call_inst->setCallingConv(p_function->getCallingConv()); |
| //call_inst->setTailCall(); |
| |
| //// Create a return instruction to end the stub |
| //llvm::ReturnInst::Create( |
| // p_function->getContext(), |
| // basic_block); |
| |
| //// Retain the function if it can be reused |
| //p_call_function = stub_function; |
| |
| llvm::Function *rs = p_call_function; |
| p_call_function_mutex->unlock(); |
| |
| return rs; |
| //return stub_function; |
| } |
| |
| /* |
| * CPUKernelEvent |
| */ |
| CPUKernelEvent::CPUKernelEvent(CPUDevice *device, KernelEvent *event) |
| : p_device(device), p_event(event), p_current_wg(0), p_finished_wg(0), |
| p_kernel_args(0) |
| { |
| // Set current work group to (0, 0, ..., 0) |
| std::memset(p_current_work_group, 0, event->work_dim() * sizeof(size_t)); |
| |
| // Populate p_max_work_groups |
| p_num_wg = 1; |
| |
| for(cl_uint i = 0; i<event->work_dim(); ++i) |
| { |
| p_max_work_groups[i] = |
| (event->global_work_size(i) / event->local_work_size(i)) - 1; // 0..n-1, not 1..n |
| |
| p_num_wg *= p_max_work_groups[i] + 1; |
| } |
| } |
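| |
| // Illustrative grid: for a 2D kernel with global_work_size == (16, 8) and |
| // local_work_size == (4, 4), p_max_work_groups becomes (3, 1) (indices run |
| // 0..3 and 0..1) and p_num_wg == 4 * 2 == 8 work groups in total. |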
| |
| CPUKernelEvent::~CPUKernelEvent() |
| { |
| if(p_kernel_args) |
| std::free(p_kernel_args); |
| } |
| |
| bool CPUKernelEvent::reserve() |
| { |
| // Lock, this will be unlocked in takeInstance() |
| p_mutex.lock(); |
| |
| // Last work group if current == max - 1 |
| return (p_current_wg == p_num_wg - 1); |
| } |
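| |
| // Expected usage from a worker thread (illustrative; based on the lock |
| // hand-off documented above, not on code in this file): |
| // |
| //   bool is_last = ev->reserve();                 // takes p_mutex |
| //   CPUKernelWorkGroup *wg = ev->takeInstance();  // releases p_mutex |
| // |
| // Keeping p_mutex held between the two calls keeps the p_current_wg |
| // snapshot consistent with the work group actually handed out. |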
| |
| bool CPUKernelEvent::finished() |
| { |
| bool rs; |
| |
| p_mutex.lock(); |
| |
| rs = (p_finished_wg == p_num_wg); |
| |
| p_mutex.unlock(); |
| |
| return rs; |
| } |
| |
| void CPUKernelEvent::workGroupFinished() |
| { |
| p_mutex.lock(); |
| |
| p_finished_wg++; |
| |
| p_mutex.unlock(); |
| } |
| |
| CPUKernelWorkGroup *CPUKernelEvent::takeInstance() |
| { |
| CPUKernelWorkGroup *wg = new CPUKernelWorkGroup((CPUKernel *)p_event->deviceKernel(), |
| p_event, |
| this, |
| p_current_work_group); |
| |
| // Increment current work group |
| incVec(p_event->work_dim(), p_current_work_group, p_max_work_groups); |
| p_current_wg += 1; |
| |
| // Release event |
| p_mutex.unlock(); |
| |
| return wg; |
| } |
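| |
| // incVec() (defined elsewhere in this backend) is assumed to step the index |
| // vector like an odometer: with p_max_work_groups == (3, 1) the successive |
| // p_current_work_group values are (0,0), (1,0), (2,0), (3,0), (0,1), ..., |
| // (3,1), so each work group is handed out exactly once before p_current_wg |
| // reaches p_num_wg. |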
| |
| void *CPUKernelEvent::kernelArgs() const |
| { |
| return p_kernel_args; |
| } |
| |
| void CPUKernelEvent::cacheKernelArgs(void *args) |
| { |
| p_kernel_args = args; |
| } |
| |
| /* |
| * CPUKernelWorkGroup |
| */ |
| CPUKernelWorkGroup::CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event, |
| CPUKernelEvent *cpu_event, |
| const size_t *work_group_index) |
| : p_kernel(kernel), p_cpu_event(cpu_event), p_event(event), |
| p_work_dim(event->work_dim()), p_contexts(0), p_stack_size(8192 /* TODO */), |
| p_had_barrier(false) |
| { |
| // Set index |
| std::memcpy(p_index, work_group_index, p_work_dim * sizeof(size_t)); |
| |
| // Set maxs and global id |
| p_num_work_items = 1; |
| |
| for(unsigned int i = 0; i<p_work_dim; ++i) |
| { |
| p_max_local_id[i] = event->local_work_size(i) - 1; // 0..n-1, not 1..n |
| p_num_work_items *= event->local_work_size(i); |
| |
| // Set global id |
| p_global_id_start_offset[i] = (p_index[i] * event->local_work_size(i)) |
| + event->global_work_offset(i); |
| } |
| } |
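| |
| // Illustrative: for work group index (2, 1) with local_work_size == (8, 8) |
| // and global_work_offset == (0, 0), p_max_local_id == (7, 7) and the first |
| // work item of this group starts at global id (16, 8). |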
| |
| CPUKernelWorkGroup::~CPUKernelWorkGroup() |
| { |
| p_cpu_event->workGroupFinished(); |
| } |
| |
| void *CPUKernelWorkGroup::callArgs(std::vector<void *> &locals_to_free) |
| { |
| //TODO |
| //if(p_cpu_event->kernelArgs() && !p_kernel->kernel()->hasLocals()) |
| //{ |
| // // We have cached the args and can reuse them |
| // return p_cpu_event->kernelArgs(); |
| //} |
| |
| // We need to create them from scratch |
| void *rs; |
| |
| size_t args_size = 0; |
| |
| //TODO |
| /*for(unsigned int i = 0; i<p_kernel->kernel()->numArgs(); ++i) |
| { |
| const Kernel::Arg &arg = p_kernel->kernel()->arg(i); |
| CPUKernel::typeOffset(args_size, arg.valueSize() * arg.vecDim()); |
| }*/ |
| |
| rs = std::malloc(args_size); |
| |
| if(!rs) |
| return NULL; // The function returns a pointer, so signal failure with NULL |
| |
| size_t arg_offset = 0; |
| |
| //TODO |
| //for(unsigned int i = 0; i<p_kernel->kernel()->numArgs(); ++i) |
| //{ |
| // const Kernel::Arg &arg = p_kernel->kernel()->arg(i); |
| // size_t size = arg.valueSize() * arg.vecDim(); |
| // size_t offset = CPUKernel::typeOffset(arg_offset, size); |
| |
| // // Where to place the argument |
| // unsigned char *target = (unsigned char *)rs; |
| // target += offset; |
| |
| // // We may have to perform some changes in the values (buffers, etc) |
| // switch(arg.kind()) |
| // { |
| // case Kernel::Arg::Buffer: |
| // { |
| // MemObject *buffer = *(MemObject **)arg.data(); |
| |
| // if(arg.file() == Kernel::Arg::Local) |
| // { |
| // // Alloc a buffer and pass it to the kernel |
| // void *local_buffer = std::malloc(arg.allocAtKernelRuntime()); |
| // locals_to_free.push_back(local_buffer); |
| // *(void **)target = local_buffer; |
| // } |
| // else |
| // { |
| // if(!buffer) |
| // { |
| // // We can do that, just send NULL |
| // *(void **)target = NULL; |
| // } |
| // else |
| // { |
| // // Get the CPU buffer, allocate it and get its pointer |
| // CPUBuffer *cpubuf = |
| // (CPUBuffer *)buffer->deviceBuffer(p_kernel->device()); |
| // void *buf_ptr = 0; |
| |
| // buffer->allocate(p_kernel->device()); |
| // buf_ptr = cpubuf->data(); |
| |
| // *(void **)target = buf_ptr; |
| // } |
| // } |
| |
| // break; |
| // } |
| // case Kernel::Arg::Image2D: |
| // case Kernel::Arg::Image3D: |
| // { |
| // // We need to ensure the image is allocated |
| // Image2D *image = *(Image2D **)arg.data(); |
| // image->allocate(p_kernel->device()); |
| |
| // // Fall through to the memcpy |
| // } |
| // default: |
| // // Simply copy the arg's data into the buffer |
| // std::memcpy(target, arg.data(), size); |
| // break; |
| // } |
| //} |
| |
| // Cache the arguments if we can do so |
| /*if(!p_kernel->kernel()->hasLocals()) |
| p_cpu_event->cacheKernelArgs(rs);*/ |
| |
| return rs; |
| } |
| |
| bool CPUKernelWorkGroup::run() |
| { |
| // Get the kernel function to call |
| std::vector<void *> locals_to_free; |
| llvm::Function *kernel_func = p_kernel->callFunction(); |
| |
| if(!kernel_func) |
| return false; |
| |
| //TODO |
| //Program *p = (Program *)p_kernel->kernel()->parent(); |
| //CPUProgram *prog = (CPUProgram *)(p->deviceDependentProgram(p_kernel->device())); |
| |
| //p_kernel_func_addr = |
| // (void(*)(void *))prog->jit()->getPointerToFunction(kernel_func); |
| |
| //// Get the arguments |
| //p_args = callArgs(locals_to_free); |
| |
| //// Tell the builtins this thread will run a kernel work group |
| //setThreadLocalWorkGroup(this); |
| |
| //// Initialize the dummy context used by the builtins before a call to barrier() |
| //p_current_work_item = 0; |
| //p_current_context = &p_dummy_context; |
| |
| //std::memset(p_dummy_context.local_id, 0, p_work_dim * sizeof(size_t)); |
| |
| //do |
| //{ |
| // // Simply call the "call function", it and the builtins will do the rest |
| // p_kernel_func_addr(p_args); |
| //} while(!p_had_barrier && |
| // !incVec(p_work_dim, p_dummy_context.local_id, p_max_local_id)); |
| |
| //// If no barrier() call was made, all is fine. If not, only the first |
| //// work-item has currently finished. We must let the others run. |
| //if(p_had_barrier) |
| //{ |
| // Context *main_context = p_current_context; // After the first swapcontext, |
| // // we will not be able to trust |
| // // p_current_context anymore. |
| |
| // // We'll call swapcontext for each remaining work-item. They will |
| // // finish, and when they'll do so, this main context will be resumed, so |
| // // it's easy (i starts from 1 because the main context already finished) |
| // for(unsigned int i = 1; i<p_num_work_items; ++i) |
| // { |
| // Context *ctx = getContextAddr(i); |
| // swapcontext(&main_context->context, &ctx->context); |
| // } |
| //} |
| |
| //// Free the allocated locals |
| //if(p_kernel->kernel()->hasLocals()) |
| //{ |
| // for(size_t i = 0; i<locals_to_free.size(); ++i) |
| // { |
| // std::free(locals_to_free[i]); |
| // } |
| |
| // std::free(p_args); |
| //} |
| |
| return true; |
| } |
| |
| CPUKernelWorkGroup::Context *CPUKernelWorkGroup::getContextAddr(unsigned int index) |
| { |
| size_t size; |
| char *data = (char *)p_contexts; |
| |
| // Each Context in data is an element of size p_stack_size + sizeof(Context) |
| size = p_stack_size + sizeof(Context); |
| size *= index; // To get an offset |
| |
| return (Context *)(data + size); // Pointer to the context |
| } |
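| |
| // Illustrative layout of the p_contexts block (allocated elsewhere, |
| // presumably one element per work item): each element is a Context header |
| // followed by that work item's private stack, so with p_stack_size == 8192 |
| // getContextAddr(2) points 2 * (8192 + sizeof(Context)) bytes past the |
| // start of the block: |
| // |
| //   [Context 0][stack 0][Context 1][stack 1][Context 2][stack 2] ... |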