Revert "Updated SDL, Bullet and OpenAL soft libs"

This reverts commit 370161cfb1.
This commit is contained in:
AzaezelX 2019-07-08 09:49:44 -05:00
parent 63be684474
commit bc77ff0833
1102 changed files with 62741 additions and 204988 deletions

View file

@ -1,42 +0,0 @@
#ifndef B3_GPU_BROADPHASE_INTERFACE_H
#define B3_GPU_BROADPHASE_INTERFACE_H
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "Bullet3Common/b3Vector3.h"
#include "b3SapAabb.h"
#include "Bullet3Common/shared/b3Int2.h"
#include "Bullet3Common/shared/b3Int4.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
//Abstract interface for OpenCL GPU broadphase implementations.
//Expected usage: add proxies with createProxy()/createLargeProxy() on the host,
//upload once via writeAabbsToGpu(), then call calculateOverlappingPairs() each step
//and read the results through getOverlappingPairBuffer()/getNumOverlap().
class b3GpuBroadphaseInterface
{
public:
//Factory signature: concrete implementations expose a static function of this
//type so callers can pick a broadphase at runtime (see b3GpuGridBroadphase::CreateFunc).
typedef class b3GpuBroadphaseInterface*(CreateFunc)(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~b3GpuBroadphaseInterface()
{
}
//Register a small AABB. userPtr is an int user value stored with the AABB.
//NOTE(review): collisionFilterGroup/Mask appear in the signature but concrete
//implementations visible here do not store them — confirm filtering happens elsewhere.
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0;
//Register a large AABB; large AABBs are typically tested brute-force against all small ones.
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0;
//GPU pair search; writes at most maxPairs pairs into the overlapping-pair buffer.
virtual void calculateOverlappingPairs(int maxPairs) = 0;
//CPU reference pair search with the same result buffer contract.
virtual void calculateOverlappingPairsHost(int maxPairs) = 0;
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu() = 0;
//World-space AABB buffer (device memory).
virtual cl_mem getAabbBufferWS() = 0;
//Number of pairs produced by the last pair search.
virtual int getNumOverlap() = 0;
//Device buffer of b3Int4 pairs (x,y are the unsorted AABB user indices).
virtual cl_mem getOverlappingPairBuffer() = 0;
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() = 0;
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() = 0;
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() = 0;
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() = 0;
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() = 0;
};
#endif //B3_GPU_BROADPHASE_INTERFACE_H

View file

@ -1,338 +0,0 @@
#include "b3GpuGridBroadphase.h"
#include "Bullet3Geometry/b3AabbUtil.h"
#include "kernels/gridBroadphaseKernels.h"
#include "kernels/sapKernels.h"
//#include "kernels/gridBroadphase.cl"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
#define B3_GRID_BROADPHASE_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl"
//File-scope OpenCL kernel handles, compiled in the b3GpuGridBroadphase constructor
//and released in its destructor.
//NOTE(review): these are globals rather than members — creating two broadphase
//instances would overwrite them and double-release on destruction; confirm
//single-instance usage.
cl_kernel kCalcHashAABB;
cl_kernel kClearCellStart;
cl_kernel kFindCellStart;
cl_kernel kFindOverlappingPairs;
cl_kernel m_copyAabbsKernel;
cl_kernel m_sap2Kernel;
//int maxPairsPerBody = 64;
int maxBodiesPerCell = 256;  //cap on bodies hashed into one grid cell (stored in m_gridSize[3]); tuning value
//Constructor: initializes all device arrays on the given context/queue,
//fills in the grid parameters, compiles the SAP and uniform-grid kernels,
//and creates the radix sorter used to sort cell hashes.
b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q)
: m_context(ctx),
m_device(device),
m_queue(q),
m_allAabbsGPU1(ctx, q),
m_smallAabbsMappingGPU(ctx, q),
m_largeAabbsMappingGPU(ctx, q),
m_gpuPairs(ctx, q),
m_hashGpu(ctx, q),
m_cellStartGpu(ctx, q),
m_paramsGPU(ctx, q)
{
//Grid cell extent is 3 world units per axis (invCellSize = 1/3), while the
//hash grid resolution is hard-coded to 128^3 cells below.
b3Vector3 gridSize = b3MakeVector3(3, 3, 3);
b3Vector3 invGridSize = b3MakeVector3(1.f / gridSize[0], 1.f / gridSize[1], 1.f / gridSize[2]);
m_paramsCPU.m_gridSize[0] = 128;
m_paramsCPU.m_gridSize[1] = 128;
m_paramsCPU.m_gridSize[2] = 128;
//m_gridSize[3] doubles as the max-bodies-per-cell slot (see b3ParamsGridBroadphaseCL);
//the next line overwrites this same slot via the setter.
m_paramsCPU.m_gridSize[3] = maxBodiesPerCell;
m_paramsCPU.setMaxBodiesPerCell(maxBodiesPerCell);
m_paramsCPU.m_invCellSize[0] = invGridSize[0];
m_paramsCPU.m_invCellSize[1] = invGridSize[1];
m_paramsCPU.m_invCellSize[2] = invGridSize[2];
m_paramsCPU.m_invCellSize[3] = 0.f;
//Single-element GPU array mirroring m_paramsCPU, passed to the kernels.
m_paramsGPU.push_back(m_paramsCPU);
cl_int errNum = 0;
{
//Compile the SAP kernels (used for the large-vs-small AABB pass).
const char* sapSrc = sapCL;
cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH);
b3Assert(errNum == CL_SUCCESS);
//Note: errNum from the copyAabbsKernel compile is only checked after the next call.
m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg);
m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg);
b3Assert(errNum == CL_SUCCESS);
}
{
//Compile the uniform-grid kernels (hashing, cell bookkeeping, overlap test).
cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, gridBroadphaseCL, &errNum, "", B3_GRID_BROADPHASE_PATH);
b3Assert(errNum == CL_SUCCESS);
kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kCalcHashAABB", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kClearCellStart", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindCellStart", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindOverlappingPairs", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
}
//Radix sorter for sorting (cellHash, aabbIndex) pairs; freed in the destructor.
m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue);
}
//Destructor: releases the (file-scope) OpenCL kernel handles created in the
//constructor and deletes the radix sorter. The cl_programs themselves are not
//released here.
b3GpuGridBroadphase::~b3GpuGridBroadphase()
{
clReleaseKernel(kCalcHashAABB);
clReleaseKernel(kClearCellStart);
clReleaseKernel(kFindCellStart);
clReleaseKernel(kFindOverlappingPairs);
clReleaseKernel(m_sap2Kernel);
clReleaseKernel(m_copyAabbsKernel);
delete m_sorter;
}
void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr;
m_smallAabbsMappingCPU.push_back(m_allAabbsCPU1.size());
m_allAabbsCPU1.push_back(aabb);
}
void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr;
m_largeAabbsMappingCPU.push_back(m_allAabbsCPU1.size());
m_allAabbsCPU1.push_back(aabb);
}
//GPU pair search. Two passes:
// 1) large-vs-small AABBs via a brute-force 2D SAP kernel;
// 2) small-vs-small AABBs via a uniform grid (hash -> radix sort -> cell start -> overlap test).
//Both passes append into m_gpuPairs through a shared atomic pair counter, capped at maxPairs.
void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
{
B3_PROFILE("b3GpuGridBroadphase::calculateOverlappingPairs");
//Disabled debug path: fall back to the CPU reference implementation.
if (0)
{
calculateOverlappingPairsHost(maxPairs);
/*
b3AlignedObjectArray<b3Int4> cpuPairs;
m_gpuPairs.copyToHost(cpuPairs);
printf("host m_gpuPairs.size()=%d\n",m_gpuPairs.size());
for (int i=0;i<m_gpuPairs.size();i++)
{
printf("host pair %d = %d,%d\n",i,cpuPairs[i].x,cpuPairs[i].y);
}
*/
return;
}
int numSmallAabbs = m_smallAabbsMappingGPU.size();
//Single-element device counter, initialized to zero; both kernels increment it.
b3OpenCLArray<int> pairCount(m_context, m_queue);
pairCount.push_back(0);
m_gpuPairs.resize(maxPairs);  //numSmallAabbs*maxPairsPerBody);
{
//Pass 1: every large AABB against every small AABB.
int numLargeAabbs = m_largeAabbsMappingGPU.size();
if (numLargeAabbs && numSmallAabbs)
{
B3_PROFILE("sap2Kernel");
b3BufferInfoCL bInfo[] = {
b3BufferInfoCL(m_allAabbsGPU1.getBufferCL()),
b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()),
b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()),
b3BufferInfoCL(m_gpuPairs.getBufferCL()),
b3BufferInfoCL(pairCount.getBufferCL())};
b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLargeAabbs);
launcher.setConst(numSmallAabbs);
launcher.setConst(0);  //axis is not used
launcher.setConst(maxPairs);
//@todo: use actual maximum work item sizes of the device instead of hardcoded values
launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64);
//Read back the counter to detect overflow (blocking read).
int numPairs = pairCount.at(0);
if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs = maxPairs;
}
}
}
//Pass 2: small-vs-small via the uniform grid.
//NOTE(review): if numSmallAabbs == 0 this branch is skipped and m_gpuPairs keeps
//its maxPairs size from the resize above — getNumOverlap() would then over-report;
//confirm callers never hit that combination.
if (numSmallAabbs)
{
B3_PROFILE("gridKernel");
m_hashGpu.resize(numSmallAabbs);
{
//Compute a (cellHash, aabbIndex) sort key per small AABB.
B3_PROFILE("kCalcHashAABB");
b3LauncherCL launch(m_queue, kCalcHashAABB, "kCalcHashAABB");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_allAabbsGPU1.getBufferCL());
launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
launch.setBuffer(m_hashGpu.getBufferCL());
launch.setBuffer(this->m_paramsGPU.getBufferCL());
launch.launch1D(numSmallAabbs);
}
//Sort by cell hash so AABBs in the same cell become contiguous.
m_sorter->execute(m_hashGpu);
int numCells = this->m_paramsCPU.m_gridSize[0] * this->m_paramsCPU.m_gridSize[1] * this->m_paramsCPU.m_gridSize[2];
m_cellStartGpu.resize(numCells);
//b3AlignedObjectArray<int > cellStartCpu;
{
//Reset the per-cell start indices before rebuilding them.
B3_PROFILE("kClearCellStart");
b3LauncherCL launch(m_queue, kClearCellStart, "kClearCellStart");
launch.setConst(numCells);
launch.setBuffer(m_cellStartGpu.getBufferCL());
launch.launch1D(numCells);
//m_cellStartGpu.copyToHost(cellStartCpu);
//printf("??\n");
}
{
//Record, for each cell, where its run of sorted hashes begins.
B3_PROFILE("kFindCellStart");
b3LauncherCL launch(m_queue, kFindCellStart, "kFindCellStart");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_hashGpu.getBufferCL());
launch.setBuffer(m_cellStartGpu.getBufferCL());
launch.launch1D(numSmallAabbs);
//m_cellStartGpu.copyToHost(cellStartCpu);
//printf("??\n");
}
{
//Overlap test within/between neighboring cells; appends to the shared pair buffer.
B3_PROFILE("kFindOverlappingPairs");
b3LauncherCL launch(m_queue, kFindOverlappingPairs, "kFindOverlappingPairs");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_allAabbsGPU1.getBufferCL());
launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
launch.setBuffer(m_hashGpu.getBufferCL());
launch.setBuffer(m_cellStartGpu.getBufferCL());
launch.setBuffer(m_paramsGPU.getBufferCL());
//launch.setBuffer(0);
launch.setBuffer(pairCount.getBufferCL());
launch.setBuffer(m_gpuPairs.getBufferCL());
launch.setConst(maxPairs);
launch.launch1D(numSmallAabbs);
//Clamp to maxPairs on overflow, then shrink the pair buffer to the final count.
int numPairs = pairCount.at(0);
if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs = maxPairs;
}
m_gpuPairs.resize(numPairs);
//Disabled debug dump of the resulting pairs.
if (0)
{
b3AlignedObjectArray<b3Int4> pairsCpu;
m_gpuPairs.copyToHost(pairsCpu);
int sz = m_gpuPairs.size();
printf("m_gpuPairs.size()=%d\n", sz);
for (int i = 0; i < m_gpuPairs.size(); i++)
{
printf("pair %d = %d,%d\n", i, pairsCpu[i].x, pairsCpu[i].y);
}
printf("?!?\n");
}
}
}
//calculateOverlappingPairsHost(maxPairs);
}
//CPU reference pair search: brute-force O(n^2) test of every AABB against every
//other (small and large alike), writing at most maxPairs pairs, then uploading
//the result to m_gpuPairs.
//Each pair stores the two original (unsorted) user indices with pair.x <= pair.y.
//Fix: the original kept scanning all remaining AABB pairs after the buffer was
//full, even though nothing more could be recorded; we now break out early.
//The recorded pairs are identical either way (same scan order, same cap).
void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
{
	m_hostPairs.resize(0);
	//Pull the current world-space AABBs back from the device.
	m_allAabbsGPU1.copyToHost(m_allAabbsCPU1);
	const int numAabbs = m_allAabbsCPU1.size();
	for (int i = 0; i < numAabbs; i++)
	{
		if (m_hostPairs.size() >= maxPairs)
			break;  //pair buffer full; no further test can record anything
		for (int j = i + 1; j < numAabbs; j++)
		{
			if (m_hostPairs.size() >= maxPairs)
				break;
			if (b3TestAabbAgainstAabb2(m_allAabbsCPU1[i].m_minVec, m_allAabbsCPU1[i].m_maxVec,
									   m_allAabbsCPU1[j].m_minVec, m_allAabbsCPU1[j].m_maxVec))
			{
				b3Int4 pair;
				int a = m_allAabbsCPU1[j].m_minIndices[3];
				int b = m_allAabbsCPU1[i].m_minIndices[3];
				if (a <= b)
				{
					pair.x = a;
					pair.y = b;  //store the original index in the unsorted aabb array
				}
				else
				{
					pair.x = b;
					pair.y = a;  //store the original index in the unsorted aabb array
				}
				m_hostPairs.push_back(pair);
			}
		}
	}
	m_gpuPairs.copyFromHost(m_hostPairs);
}
//call writeAabbsToGpu after done making all changes (createProxy etc)
//Uploads the host-side AABB array and both small/large index mappings to the device.
void b3GpuGridBroadphase::writeAabbsToGpu()
{
m_allAabbsGPU1.copyFromHost(m_allAabbsCPU1);
m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU);
m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU);
}
//Device buffer holding all world-space AABBs (small and large).
cl_mem b3GpuGridBroadphase::getAabbBufferWS()
{
return this->m_allAabbsGPU1.getBufferCL();
}
//Number of pairs found by the last pair search (size of the GPU pair array).
int b3GpuGridBroadphase::getNumOverlap()
{
return m_gpuPairs.size();
}
//Device buffer of b3Int4 overlapping pairs from the last pair search.
cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer()
{
return m_gpuPairs.getBufferCL();
}
//Device-side AABB array (valid after writeAabbsToGpu()).
b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU()
{
return m_allAabbsGPU1;
}
//Host-side AABB array (the source of truth until writeAabbsToGpu()).
b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU()
{
return m_allAabbsCPU1;
}
//Typed accessor for the device pair array (same storage as getOverlappingPairBuffer()).
b3OpenCLArray<b3Int4>& b3GpuGridBroadphase::getOverlappingPairsGPU()
{
return m_gpuPairs;
}
//Device array mapping small-proxy slots to indices into the full AABB array.
b3OpenCLArray<int>& b3GpuGridBroadphase::getSmallAabbIndicesGPU()
{
return m_smallAabbsMappingGPU;
}
//Device array mapping large-proxy slots to indices into the full AABB array.
b3OpenCLArray<int>& b3GpuGridBroadphase::getLargeAabbIndicesGPU()
{
return m_largeAabbsMappingGPU;
}

View file

@ -1,80 +0,0 @@
#ifndef B3_GPU_GRID_BROADPHASE_H
#define B3_GPU_GRID_BROADPHASE_H
#include "b3GpuBroadphaseInterface.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
//Uniform-grid parameters uploaded to the device as-is.
//NOTE(review): this struct is presumably mirrored by a struct in gridBroadphase.cl —
//keep the field order/layout in sync with the kernel source if either changes.
struct b3ParamsGridBroadphaseCL
{
//Reciprocal of the world-space cell extent per axis; [3] is unused padding.
float m_invCellSize[4];
//Number of grid cells per axis in [0..2]; [3] is overloaded to hold maxBodiesPerCell.
int m_gridSize[4];
int getMaxBodiesPerCell() const
{
return m_gridSize[3];
}
void setMaxBodiesPerCell(int maxOverlap)
{
m_gridSize[3] = maxOverlap;
}
};
//Uniform-grid GPU broadphase: small AABBs are hashed into a fixed-resolution grid
//and tested cell-by-cell; large AABBs are tested brute-force against all small ones.
class b3GpuGridBroadphase : public b3GpuBroadphaseInterface
{
protected:
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
//All AABBs (small and large) on device/host; kept in sync by writeAabbsToGpu().
b3OpenCLArray<b3SapAabb> m_allAabbsGPU1;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1;
//Index mappings: proxy slot -> index into the full AABB arrays above.
b3OpenCLArray<int> m_smallAabbsMappingGPU;
b3AlignedObjectArray<int> m_smallAabbsMappingCPU;
b3OpenCLArray<int> m_largeAabbsMappingGPU;
b3AlignedObjectArray<int> m_largeAabbsMappingCPU;
//Pair results: host scratch (reference path) and device result buffer.
b3AlignedObjectArray<b3Int4> m_hostPairs;
b3OpenCLArray<b3Int4> m_gpuPairs;
//Per-AABB (cellHash, index) sort keys and per-cell start offsets.
b3OpenCLArray<b3SortData> m_hashGpu;
b3OpenCLArray<int> m_cellStartGpu;
//Grid parameters, mirrored on the device as a single-element array.
b3ParamsGridBroadphaseCL m_paramsCPU;
b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU;
class b3RadixSort32CL* m_sorter;
public:
b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~b3GpuGridBroadphase();
//Factory matching b3GpuBroadphaseInterface::CreateFunc.
static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuGridBroadphase(ctx, device, q);
}
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu();
virtual cl_mem getAabbBufferWS();
virtual int getNumOverlap();
virtual cl_mem getOverlappingPairBuffer();
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU();
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU();
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
};
#endif //B3_GPU_GRID_BROADPHASE_H

View file

@ -1,557 +0,0 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "b3GpuParallelLinearBvh.h"
//Constructor: allocates the fixed single-element device scratch arrays and
//compiles every kernel used by the parallel linear BVH (build, pair query, ray query).
//Each compile is followed by a b3Assert on the returned handle.
b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : m_queue(queue),
m_radixSorter(context, device, queue),
m_rootNodeIndex(context, queue),
m_maxDistanceFromRoot(context, queue),
m_temp(context, queue),
m_internalNodeAabbs(context, queue),
m_internalNodeLeafIndexRanges(context, queue),
m_internalNodeChildNodes(context, queue),
m_internalNodeParentNodes(context, queue),
m_commonPrefixes(context, queue),
m_commonPrefixLengths(context, queue),
m_distanceFromRoot(context, queue),
m_leafNodeParentNodes(context, queue),
m_mortonCodesAndAabbIndicies(context, queue),
m_mergedAabb(context, queue),
m_leafNodeAabbs(context, queue),
m_largeAabbs(context, queue)
{
//Single-element device scalars: root index, tree depth, and a generic int temp.
m_rootNodeIndex.resize(1);
m_maxDistanceFromRoot.resize(1);
m_temp.resize(1);
//
const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl";
const char* kernelSource = parallelLinearBvhCL;  //parallelLinearBvhCL.h
cl_int error;
char* additionalMacros = 0;
m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH);
b3Assert(m_parallelLinearBvhProgram);
m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_separateAabbsKernel);
m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findAllNodesMergedAabbKernel);
m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_assignMortonCodesAndAabbIndiciesKernel);
m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_computeAdjacentPairCommonPrefixKernel);
m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeLeafNodesKernel);
m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeInternalNodesKernel);
m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findDistanceFromRootKernel);
m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel);
m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findLeafIndexRangesKernel);
m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhCalculateOverlappingPairsKernel);
m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhRayTraverseKernel);
m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhLargeAabbAabbTestKernel);
m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhLargeAabbRayTestKernel);
}
//Destructor: releases every kernel compiled in the constructor and then the program.
b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
{
clReleaseKernel(m_separateAabbsKernel);
clReleaseKernel(m_findAllNodesMergedAabbKernel);
clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel);
clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel);
clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel);
clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel);
clReleaseKernel(m_findDistanceFromRootKernel);
clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel);
clReleaseKernel(m_findLeafIndexRangesKernel);
clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel);
clReleaseKernel(m_plbvhRayTraverseKernel);
clReleaseKernel(m_plbvhLargeAabbAabbTestKernel);
clReleaseKernel(m_plbvhLargeAabbRayTestKernel);
clReleaseProgram(m_parallelLinearBvhProgram);
}
//Build the BVH over the small AABBs on the GPU:
// 1) separate small/large AABBs into m_leafNodeAabbs / m_largeAabbs;
// 2) merge all small AABBs to get the scene bounds;
// 3) assign a Morton code per leaf and radix-sort leaves by it;
// 4) construct the binary radix tree (constructBinaryRadixTree());
// 5) compute each internal node's contiguous leaf-index range.
//Large AABBs are stored separately and never enter the tree.
void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
const b3OpenCLArray<int>& largeAabbIndices)
{
B3_PROFILE("b3ParallelLinearBvh::build()");
int numLargeAabbs = largeAabbIndices.size();
int numSmallAabbs = smallAabbIndices.size();
//Since all AABBs(both large and small) are input as a contiguous array,
//with 2 additional arrays used to indicate the indices of large and small AABBs,
//it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH.
{
B3_PROFILE("Separate large and small AABBs");
m_largeAabbs.resize(numLargeAabbs);
m_leafNodeAabbs.resize(numSmallAabbs);
//Write large AABBs into m_largeAabbs
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
b3BufferInfoCL(largeAabbIndices.getBufferCL()),
b3BufferInfoCL(m_largeAabbs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLargeAabbs);
launcher.launch1D(numLargeAabbs);
}
//Write small AABBs into m_leafNodeAabbs
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
b3BufferInfoCL(smallAabbIndices.getBufferCL()),
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numSmallAabbs);
launcher.launch1D(numSmallAabbs);
}
clFinish(m_queue);
}
//
int numLeaves = numSmallAabbs;  //Number of leaves in the BVH == Number of rigid bodies with small AABBs
int numInternalNodes = numLeaves - 1;
//Degenerate trees (0 or 1 leaf) need no internal nodes; set up the minimum
//state required by the query kernels and return early.
if (numLeaves < 2)
{
//Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(),
//so it does not matter if numLeaves == 0 and rootNodeIndex == -1
int rootNodeIndex = numLeaves - 1;
m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1);
//Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm,
//m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index
//instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work.
//( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs )
if (numLeaves == 1)
{
b3SortData leaf;
leaf.m_value = 0;  //1 leaf so index is always 0; leaf.m_key does not need to be set
m_mortonCodesAndAabbIndicies.resize(1);
m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1);
}
return;
}
//Size all per-node arrays for this tree (N leaves -> N-1 internal nodes).
{
m_internalNodeAabbs.resize(numInternalNodes);
m_internalNodeLeafIndexRanges.resize(numInternalNodes);
m_internalNodeChildNodes.resize(numInternalNodes);
m_internalNodeParentNodes.resize(numInternalNodes);
m_commonPrefixes.resize(numInternalNodes);
m_commonPrefixLengths.resize(numInternalNodes);
m_distanceFromRoot.resize(numInternalNodes);
m_leafNodeParentNodes.resize(numLeaves);
m_mortonCodesAndAabbIndicies.resize(numLeaves);
m_mergedAabb.resize(numLeaves);
}
//Find the merged AABB of all small AABBs; this is used to define the size of
//each cell in the virtual grid for the next kernel(2^10 cells in each dimension).
{
B3_PROFILE("Find AABB of merged nodes");
m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs);  //Need to make a copy since the kernel modifies the array
//Parallel pairwise reduction: each pass halves (rounding up) the number of
//AABBs still to merge until only the scene AABB remains in m_mergedAabb[0].
for (int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_mergedAabb.getBufferCL())  //Resulting AABB is stored in m_mergedAabb[0]
};
b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numAabbsNeedingMerge);
launcher.launch1D(numAabbsNeedingMerge);
}
clFinish(m_queue);
}
//Insert the center of the AABBs into a virtual grid,
//then convert the discrete grid coordinates into a morton code
//For each element in m_mortonCodesAndAabbIndicies, set
//	m_key == morton code (value to sort by)
//	m_value == small AABB index
{
B3_PROFILE("Assign morton codes");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_mergedAabb.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL())};
b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLeaves);
launcher.launch1D(numLeaves);
clFinish(m_queue);
}
//
{
B3_PROFILE("Sort leaves by morton codes");
m_radixSorter.execute(m_mortonCodesAndAabbIndicies);
clFinish(m_queue);
}
//
constructBinaryRadixTree();
//Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices.
//The root node contains leaf node indices in the range [0, numLeafNodes - 1].
//The child nodes of each node split their parent's index range into 2 contiguous halves.
//
//For example, if the root has indices [0, 31], its children might partition that range into [0, 11] and [12, 31].
//The next level in the tree could then split those ranges into [0, 2], [3, 11], [12, 22], and [23, 31].
//
//This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice
{
B3_PROFILE("m_findLeafIndexRangesKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL())};
b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
}
//Find all overlapping AABB pairs using the built BVH. out_overlappingPairs must be
//pre-sized to the desired pair capacity (its size() is read as maxPairs); on return
//it is resized to the number of pairs actually found (clamped to the capacity).
//Two passes share one device pair counter: BVH traversal for small-small pairs,
//then brute-force large-vs-small tests.
void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs)
{
int maxPairs = out_overlappingPairs.size();
//Reuse the single-element scratch array as the atomic pair counter, reset to 0.
b3OpenCLArray<int>& numPairsGpu = m_temp;
int reset = 0;
numPairsGpu.copyFromHostPointer(&reset, 1);
//
//Pass 1: small-small pairs via BVH traversal (needs at least 2 leaves).
if (m_leafNodeAabbs.size() > 1)
{
B3_PROFILE("PLBVH small-small AABB test");
int numQueryAabbs = m_leafNodeAabbs.size();
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
b3BufferInfoCL(numPairsGpu.getBufferCL()),
b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxPairs);
launcher.setConst(numQueryAabbs);
launcher.launch1D(numQueryAabbs);
clFinish(m_queue);
}
//Pass 2: every large AABB against every small AABB (large AABBs are not in the tree).
int numLargeAabbRigids = m_largeAabbs.size();
if (numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0)
{
B3_PROFILE("PLBVH large-small AABB test");
int numQueryAabbs = m_leafNodeAabbs.size();
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_largeAabbs.getBufferCL()),
b3BufferInfoCL(numPairsGpu.getBufferCL()),
b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxPairs);
launcher.setConst(numLargeAabbRigids);
launcher.setConst(numQueryAabbs);
launcher.launch1D(numQueryAabbs);
clFinish(m_queue);
}
//
//Read the final pair count; on overflow, clamp both the host value and the
//device counter, and report the error.
int numPairs = -1;
numPairsGpu.copyToHostPointer(&numPairs, 1);
if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs = maxPairs;
numPairsGpu.copyFromHostPointer(&maxPairs, 1);
}
out_overlappingPairs.resize(numPairs);
}
//Test a batch of rays against the BVH AABBs, producing (ray, rigid) index pairs.
//out_rayRigidPairs must be pre-sized to the desired capacity (its size() is the cap);
//out_numRayRigidPairs receives the device-side pair count.
//Like calculateOverlappingPairs(): BVH traversal for small AABBs, brute force for large.
//NOTE(review): unlike the pair query, the count is only clamped on the host via
//b3Error — out_rayRigidPairs is not resized here; confirm callers read the count.
void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)
{
B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()");
int numRays = rays.size();
int maxRayRigidPairs = out_rayRigidPairs.size();
//Zero the device pair counter before either kernel runs.
int reset = 0;
out_numRayRigidPairs.copyFromHostPointer(&reset, 1);
//
if (m_leafNodeAabbs.size() > 0)
{
B3_PROFILE("PLBVH ray test small AABB");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
b3BufferInfoCL(rays.getBufferCL()),
b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxRayRigidPairs);
launcher.setConst(numRays);
launcher.launch1D(numRays);
clFinish(m_queue);
}
//Large AABBs are outside the tree: test every ray against every large AABB.
int numLargeAabbRigids = m_largeAabbs.size();
if (numLargeAabbRigids > 0)
{
B3_PROFILE("PLBVH ray test large AABB");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_largeAabbs.getBufferCL()),
b3BufferInfoCL(rays.getBufferCL()),
b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLargeAabbRigids);
launcher.setConst(maxRayRigidPairs);
launcher.setConst(numRays);
launcher.launch1D(numRays);
clFinish(m_queue);
}
//
//Overflow detection only — the caller is expected to clamp when reading results.
int numRayRigidPairs = -1;
out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1);
if (numRayRigidPairs > maxRayRigidPairs)
b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs);
}
///Builds the binary radix tree from the sorted morton codes in m_mortonCodesAndAabbIndicies.
///Assumes the leaf node AABBs and sorted morton codes are already on the GPU; writes the
///tree topology (parent/child links), the root node index, and the internal node AABBs.
void b3GpuParallelLinearBvh::constructBinaryRadixTree()
{
	B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()");

	int numLeaves = m_leafNodeAabbs.size();
	int numInternalNodes = numLeaves - 1;  //A binary radix tree with N leaves always has exactly N - 1 internal nodes

	//Each internal node is placed in between 2 leaf nodes.
	//By using this arrangement and computing the common prefix between
	//these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree.
	{
		B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel");

		b3BufferInfoCL bufferInfo[] =
			{
				b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
				b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
				b3BufferInfoCL(m_commonPrefixLengths.getBufferCL())};

		b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel");
		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
		launcher.setConst(numInternalNodes);

		launcher.launch1D(numInternalNodes);
		clFinish(m_queue);
	}

	//For each leaf node, select its parent node by
	//comparing the 2 nearest internal nodes and assign child node indices
	{
		B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel");

		b3BufferInfoCL bufferInfo[] =
			{
				b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
				b3BufferInfoCL(m_leafNodeParentNodes.getBufferCL()),
				b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL())};

		b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel");
		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
		launcher.setConst(numLeaves);

		launcher.launch1D(numLeaves);
		clFinish(m_queue);
	}

	//For each internal node, perform 2 binary searches among the other internal nodes
	//to its left and right to find its potential parent nodes and assign child node indices
	{
		B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel");

		b3BufferInfoCL bufferInfo[] =
			{
				b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
				b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
				b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
				b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
				b3BufferInfoCL(m_rootNodeIndex.getBufferCL())};

		b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel");
		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
		launcher.setConst(numInternalNodes);

		launcher.launch1D(numInternalNodes);
		clFinish(m_queue);
	}

	//Find the number of nodes separating each internal node and the root node
	//so that the AABBs can be set using the next kernel.
	//Also determine the maximum number of nodes separating an internal node and the root node.
	{
		B3_PROFILE("m_findDistanceFromRootKernel");

		b3BufferInfoCL bufferInfo[] =
			{
				b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
				b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
				b3BufferInfoCL(m_maxDistanceFromRoot.getBufferCL()),
				b3BufferInfoCL(m_distanceFromRoot.getBufferCL())};

		b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel");
		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
		launcher.setConst(numInternalNodes);

		launcher.launch1D(numInternalNodes);
		clFinish(m_queue);
	}

	//Starting from the internal nodes nearest to the leaf nodes, recursively move up
	//the tree towards the root to set the AABBs of each internal node; each internal node
	//checks its children and merges their AABBs
	{
		B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel");

		int maxDistanceFromRoot = -1;
		{
			B3_PROFILE("copy maxDistanceFromRoot to CPU");
			//Blocking read back to the host; this value controls the number of launches below
			m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1);
			clFinish(m_queue);
		}

		//One launch per tree level, deepest level first, so that each internal node's
		//children are always processed before the node itself
		for (int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)
		{
			b3BufferInfoCL bufferInfo[] =
				{
					b3BufferInfoCL(m_distanceFromRoot.getBufferCL()),
					b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
					b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
					b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
					b3BufferInfoCL(m_internalNodeAabbs.getBufferCL())};

			b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel");
			launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
			launcher.setConst(maxDistanceFromRoot);
			launcher.setConst(distanceFromRoot);
			launcher.setConst(numInternalNodes);

			//It may seem inefficient to launch a thread for each internal node when a
			//much smaller number of nodes is actually processed, but this is actually
			//faster than determining the exact nodes that are ready to merge their child AABBs.
			launcher.launch1D(numInternalNodes);
		}

		clFinish(m_queue);
	}
}

View file

@ -1,125 +0,0 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
#ifndef B3_GPU_PARALLEL_LINEAR_BVH_H
#define B3_GPU_PARALLEL_LINEAR_BVH_H
//#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
#include "Bullet3Common/shared/b3Int2.h"
#include "Bullet3Common/shared/b3Int4.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
#include "Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h"
#define b3Int64 cl_long
///@brief GPU Parallel Linearized Bounding Volume Hierarchy (LBVH) that is reconstructed every frame
///@remarks
///See presentation in docs/b3GpuParallelLinearBvh.pdf for algorithm details.
///@par
///Related papers: \n
///"Fast BVH Construction on GPUs" [Lauterbach et al. 2009] \n
///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n
///@par
///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages:
/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid)
/// - [fully parallel] Sort morton codes
/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH)
/// - [somewhat parallel] Set internal node AABBs
///@par
///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages.
///The BVH implementation here shares many concepts with [Karras 2012], but a different method is used for constructing the tree.
///Instead of searching for the child nodes of each internal node, we search for the parent node of each node.
///Additionally, a non-atomic traversal that starts from the leaf nodes and moves towards the root node is used to set the AABBs.
class b3GpuParallelLinearBvh
{
	cl_command_queue m_queue;
	cl_program m_parallelLinearBvhProgram;

	//Kernels used by build()
	cl_kernel m_separateAabbsKernel;
	cl_kernel m_findAllNodesMergedAabbKernel;
	cl_kernel m_assignMortonCodesAndAabbIndiciesKernel;

	//Binary radix tree construction kernels
	cl_kernel m_computeAdjacentPairCommonPrefixKernel;
	cl_kernel m_buildBinaryRadixTreeLeafNodesKernel;
	cl_kernel m_buildBinaryRadixTreeInternalNodesKernel;
	cl_kernel m_findDistanceFromRootKernel;
	cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel;

	cl_kernel m_findLeafIndexRangesKernel;

	//Traversal kernels
	cl_kernel m_plbvhCalculateOverlappingPairsKernel;
	cl_kernel m_plbvhRayTraverseKernel;
	cl_kernel m_plbvhLargeAabbAabbTestKernel;
	cl_kernel m_plbvhLargeAabbRayTestKernel;

	b3RadixSort32CL m_radixSorter;  //Used to sort the morton codes during build()

	//1 element
	b3OpenCLArray<int> m_rootNodeIndex;        //Most significant bit(0x80000000) is set to indicate internal node
	b3OpenCLArray<int> m_maxDistanceFromRoot;  //Max number of internal nodes between an internal node and the root node
	b3OpenCLArray<int> m_temp;                 //Used to hold the number of pairs in calculateOverlappingPairs()

	//1 element per internal node (number_of_internal_nodes == number_of_leaves - 1)
	b3OpenCLArray<b3SapAabb> m_internalNodeAabbs;
	b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges;  //x == min leaf index, y == max leaf index
	b3OpenCLArray<b3Int2> m_internalNodeChildNodes;       //x == left child, y == right child; msb(0x80000000) is set to indicate internal node
	b3OpenCLArray<int> m_internalNodeParentNodes;         //For parent node index, msb(0x80000000) is not set since it is always internal

	//1 element per internal node; for binary radix tree construction
	b3OpenCLArray<b3Int64> m_commonPrefixes;
	b3OpenCLArray<int> m_commonPrefixLengths;
	b3OpenCLArray<int> m_distanceFromRoot;  //Number of internal nodes between this node and the root

	//1 element per leaf node (leaf nodes only include small AABBs)
	b3OpenCLArray<int> m_leafNodeParentNodes;                //For parent node index, msb(0x80000000) is not set since it is always internal
	b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies;  //m_key == morton code, m_value == aabb index in m_leafNodeAabbs
	b3OpenCLArray<b3SapAabb> m_mergedAabb;                   //m_mergedAabb[0] contains the merged AABB of all leaf nodes
	b3OpenCLArray<b3SapAabb> m_leafNodeAabbs;                //Contains only small AABBs

	//1 element per large AABB, which is not stored in the BVH
	b3OpenCLArray<b3SapAabb> m_largeAabbs;

public:
	b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue);
	virtual ~b3GpuParallelLinearBvh();

	///Must be called before any other function
	///@param worldSpaceAabbs All AABBs in world space; indexed by the following 2 parameters
	///@param smallAabbIndices Indices of AABBs that are stored inside the BVH
	///@param largeAabbIndices Indices of AABBs that are kept outside the BVH and tested separately
	void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
			   const b3OpenCLArray<int>& largeAabbIndices);

	///calculateOverlappingPairs() uses the worldSpaceAabbs parameter of b3GpuParallelLinearBvh::build() as the query AABBs.
	///@param out_overlappingPairs The size() of this array is used to determine the max number of pairs.
	///If the number of overlapping pairs is < out_overlappingPairs.size(), out_overlappingPairs is resized.
	void calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs);

	///@param rays Rays to test against all rigid body AABBs
	///@param out_numRayRigidPairs Array of length 1; contains the number of detected ray-rigid AABB intersections;
	///this value may be greater than out_rayRigidPairs.size() if out_rayRigidPairs is not large enough.
	///@param out_rayRigidPairs Contains an array of rays intersecting rigid AABBs; x == ray index, y == rigid body index.
	///If the size of this array is insufficient to hold all ray-rigid AABB intersections, additional intersections are discarded.
	void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
								 b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs);

private:
	///Assigns parent/child links and internal node AABBs from the sorted morton codes
	void constructBinaryRadixTree();
};
#endif

View file

@ -1,76 +0,0 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
#include "b3GpuParallelLinearBvhBroadphase.h"
///Creates the LBVH and all GPU-side arrays on the given OpenCL context/device/queue.
b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : m_plbvh(context, device, queue),
																																	  m_overlappingPairsGpu(context, queue),
																																	  m_aabbsGpu(context, queue),
																																	  m_smallAabbsMappingGpu(context, queue),
																																	  m_largeAabbsMappingGpu(context, queue)
{
}
void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
int newAabbIndex = m_aabbsCpu.size();
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = newAabbIndex;
m_smallAabbsMappingCpu.push_back(newAabbIndex);
m_aabbsCpu.push_back(aabb);
}
void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
int newAabbIndex = m_aabbsCpu.size();
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = newAabbIndex;
m_largeAabbsMappingCpu.push_back(newAabbIndex);
m_aabbsCpu.push_back(aabb);
}
///Rebuilds the BVH from the GPU AABB arrays and computes overlapping pairs on the GPU.
///writeAabbsToGpu() must have been called after the last createProxy()/createLargeProxy()
///so that the GPU arrays are up to date.
///@param maxPairs The pair array is resized to this capacity before traversal.
void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairs(int maxPairs)
{
	//Reconstruct BVH
	m_plbvh.build(m_aabbsGpu, m_smallAabbsMappingGpu, m_largeAabbsMappingGpu);

	//
	m_overlappingPairsGpu.resize(maxPairs);
	m_plbvh.calculateOverlappingPairs(m_overlappingPairsGpu);
}
///Host (CPU) pair computation is not implemented for this broadphase; asserts if called.
void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairsHost(int maxPairs)
{
	b3Assert(0);  //CPU version not implemented
}
///Uploads the CPU-side AABBs and small/large index mappings to the GPU.
///Call once after all createProxy()/createLargeProxy() calls are done.
void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu()
{
	m_aabbsGpu.copyFromHost(m_aabbsCpu);
	m_smallAabbsMappingGpu.copyFromHost(m_smallAabbsMappingCpu);
	m_largeAabbsMappingGpu.copyFromHost(m_largeAabbsMappingCpu);
}

View file

@ -1,66 +0,0 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
#ifndef B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H
#define B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h"
#include "b3GpuParallelLinearBvh.h"
///Broadphase implementation backed by a GPU parallel linear BVH (see b3GpuParallelLinearBvh)
class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface
{
	b3GpuParallelLinearBvh m_plbvh;

	b3OpenCLArray<b3Int4> m_overlappingPairsGpu;

	//GPU mirrors of the CPU-side arrays below; refreshed by writeAabbsToGpu()
	b3OpenCLArray<b3SapAabb> m_aabbsGpu;
	b3OpenCLArray<int> m_smallAabbsMappingGpu;
	b3OpenCLArray<int> m_largeAabbsMappingGpu;

	b3AlignedObjectArray<b3SapAabb> m_aabbsCpu;
	b3AlignedObjectArray<int> m_smallAabbsMappingCpu;  //Indices into m_aabbsCpu of AABBs stored inside the BVH
	b3AlignedObjectArray<int> m_largeAabbsMappingCpu;  //Indices into m_aabbsCpu of AABBs tested outside the BVH

public:
	b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue);
	virtual ~b3GpuParallelLinearBvhBroadphase() {}

	virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
	virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);

	virtual void calculateOverlappingPairs(int maxPairs);
	virtual void calculateOverlappingPairsHost(int maxPairs);  //Not implemented; asserts if called

	//call writeAabbsToGpu after done making all changes (createProxy etc)
	virtual void writeAabbsToGpu();

	virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); }
	virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); }

	virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); }
	virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; }

	virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() { return m_overlappingPairsGpu; }
	virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() { return m_smallAabbsMappingGpu; }
	virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() { return m_largeAabbsMappingGpu; }

	virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_aabbsCpu; }

	///Factory function matching b3GpuBroadphaseInterface::CreateFunc
	static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue)
	{
		return new b3GpuParallelLinearBvhBroadphase(context, device, queue);
	}
};
#endif

View file

@ -1,143 +0,0 @@
#ifndef B3_GPU_SAP_BROADPHASE_H
#define B3_GPU_SAP_BROADPHASE_H
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
class b3Vector3;
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
#include "b3SapAabb.h"
#include "Bullet3Common/shared/b3Int2.h"
#include "b3GpuBroadphaseInterface.h"
///GPU sweep-and-prune (SAP) broadphase; the kernel used for the pair search is
///selected by b3GpuSapKernelType at construction time.
class b3GpuSapBroadphase : public b3GpuBroadphaseInterface
{
	cl_context m_context;
	cl_device_id m_device;
	cl_command_queue m_queue;

	//Kernels used by the various calculateOverlappingPairs implementations
	cl_kernel m_flipFloatKernel;
	cl_kernel m_scatterKernel;
	cl_kernel m_copyAabbsKernel;
	cl_kernel m_sapKernel;
	cl_kernel m_sap2Kernel;
	cl_kernel m_prepareSumVarianceKernel;

	class b3RadixSort32CL* m_sorter;

	///test for 3d SAP
	//Per-axis sorted endpoint data; the second index selects the current/previous buffer
	b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2];
	b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2];

	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0;
	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1;
	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2;
	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev;
	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev;
	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev;

	b3OpenCLArray<b3SortData> m_sortedAxisGPU0;
	b3OpenCLArray<b3SortData> m_sortedAxisGPU1;
	b3OpenCLArray<b3SortData> m_sortedAxisGPU2;
	b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev;
	b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev;
	b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev;

	//Incremental 3-axis SAP bookkeeping: pairs added/removed since the previous frame
	b3OpenCLArray<b3Int4> m_addedHostPairsGPU;
	b3OpenCLArray<b3Int4> m_removedHostPairsGPU;
	b3OpenCLArray<int> m_addedCountGPU;
	b3OpenCLArray<int> m_removedCountGPU;

	int m_currentBuffer;  //Selects which of the double-buffered axis arrays is current

public:
	b3OpenCLArray<int> m_pairCount;

	b3OpenCLArray<b3SapAabb> m_allAabbsGPU;
	b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;

	virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()
	{
		return m_allAabbsGPU;
	}

	virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()
	{
		return m_allAabbsCPU;
	}

	//Workspace for computing the axis with the largest AABB-center variance
	b3OpenCLArray<b3Vector3> m_sum;
	b3OpenCLArray<b3Vector3> m_sum2;
	b3OpenCLArray<b3Vector3> m_dst;

	b3OpenCLArray<int> m_smallAabbsMappingGPU;
	b3AlignedObjectArray<int> m_smallAabbsMappingCPU;

	b3OpenCLArray<int> m_largeAabbsMappingGPU;
	b3AlignedObjectArray<int> m_largeAabbsMappingCPU;

	b3OpenCLArray<b3Int4> m_overlappingPairs;

	//temporary gpu work memory
	b3OpenCLArray<b3SortData> m_gpuSmallSortData;
	b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs;

	class b3PrefixScanFloat4CL* m_prefixScanFloat4;

	///Selects which pair-search implementation calculateOverlappingPairs() uses
	enum b3GpuSapKernelType
	{
		B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU = 1,
		B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU,
		B3_GPU_SAP_KERNEL_ORIGINAL,
		B3_GPU_SAP_KERNEL_BARRIER,
		B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY
	};

	b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType = B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
	virtual ~b3GpuSapBroadphase();

	//Factory functions matching b3GpuBroadphaseInterface::CreateFunc, one per kernel type
	static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx, cl_device_id device, cl_command_queue q)
	{
		return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU);
	}

	static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx, cl_device_id device, cl_command_queue q)
	{
		return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU);
	}

	static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx, cl_device_id device, cl_command_queue q)
	{
		return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_ORIGINAL);
	}

	static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx, cl_device_id device, cl_command_queue q)
	{
		return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BARRIER);
	}

	static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx, cl_device_id device, cl_command_queue q)
	{
		return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
	}

	virtual void calculateOverlappingPairs(int maxPairs);
	virtual void calculateOverlappingPairsHost(int maxPairs);

	void reset();

	void init3dSap();
	virtual void calculateOverlappingPairsHostIncremental3Sap();

	virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
	virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);

	//call writeAabbsToGpu after done making all changes (createProxy etc)
	virtual void writeAabbsToGpu();

	virtual cl_mem getAabbBufferWS();
	virtual int getNumOverlap();
	virtual cl_mem getOverlappingPairBuffer();

	virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
	virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
	virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
};
#endif //B3_GPU_SAP_BROADPHASE_H

View file

@ -1,13 +0,0 @@
#ifndef B3_SAP_AABB_H
#define B3_SAP_AABB_H
#include "Bullet3Common/b3Scalar.h"
#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
///just make sure that the b3Aabb is 16-byte aligned
B3_ATTRIBUTE_ALIGNED16(struct)
b3SapAabb : public b3Aabb{
	//No additional members; this type exists only to guarantee 16-byte alignment of b3Aabb
};
#endif //B3_SAP_AABB_H

View file

@ -1,216 +0,0 @@
//Maps a (wrapped) grid cell coordinate to a flat 1D cell index.
//pParams[1] holds the grid dimensions as an int4; the bitwise AND only behaves
//as a modulo if each dimension is a power of two.
int getPosHash(int4 gridPos, __global float4* pParams)
{
	int4 gridDim = *((__global int4*)(pParams + 1));
	gridPos.x &= gridDim.x - 1;
	gridPos.y &= gridDim.y - 1;
	gridPos.z &= gridDim.z - 1;
	int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;
	return hash;
}
//Converts a world-space position to integer grid cell coordinates.
//pParams[0] holds the inverse cell size per axis; coordinates wrap via the
//same power-of-two mask used in getPosHash().
int4 getGridPos(float4 worldPos, __global float4* pParams)
{
	int4 gridPos;
	int4 gridDim = *((__global int4*)(pParams + 1));
	gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);
	gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);
	gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);
	return gridPos;
}
// calculate grid hash value for each body using its AABB
// calculate grid hash value for each body using its AABB
//One work-item per small AABB: computes the AABB center, finds its grid cell,
//and writes (cell hash, object index) into pHash for later sorting by hash.
__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )
{
	int index = get_global_id(0);
	if(index >= numObjects)
	{
		return;
	}

	//AABBs are stored as min/max float4 pairs; smallAabbMapping redirects to the shared AABB array
	float4 bbMin = allpAABB[smallAabbMapping[index]*2];
	float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];

	//AABB center
	float4 pos;
	pos.x = (bbMin.x + bbMax.x) * 0.5f;
	pos.y = (bbMin.y + bbMax.y) * 0.5f;
	pos.z = (bbMin.z + bbMax.z) * 0.5f;
	pos.w = 0.f;

	// get address in grid
	int4 gridPos = getGridPos(pos, pParams);
	int gridHash = getPosHash(gridPos, pParams);

	// store grid hash and body index
	int2 hashVal;
	hashVal.x = gridHash;
	hashVal.y = index;
	pHash[index] = hashVal;
}
//Resets every cell's start index to -1 (marker for "cell empty"),
//consumed by kFindCellStart and findPairsInCell.
__kernel void kClearCellStart( int numCells,
							   __global int* pCellStart )
{
	int index = get_global_id(0);
	if(index >= numCells)
	{
		return;
	}
	pCellStart[index] = -1;
}
//Given pHash sorted by cell hash, records for each cell the index of its first
//entry in cellStart. A cell boundary exists wherever an entry's hash differs
//from its predecessor's.
__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )
{
	//Holds this work-group's hashes plus the last hash of the previous group;
	//sized for a work-group of up to 512 work-items (TODO confirm launch config)
	__local int sharedHash[513];
	int index = get_global_id(0);
	int2 sortedData;

	if(index < numObjects)
	{
		sortedData = pHash[index];

		// Load hash data into shared memory so that we can look
		// at neighboring body's hash value without loading
		// two hash values per thread
		sharedHash[get_local_id(0) + 1] = sortedData.x;
		if((index > 0) && (get_local_id(0) == 0))
		{
			// first thread in block must load neighbor body hash
			sharedHash[0] = pHash[index-1].x;
		}
	}

	barrier(CLK_LOCAL_MEM_FENCE);

	if(index < numObjects)
	{
		//First entry overall, or hash changed relative to the previous entry:
		//this index starts a new cell
		if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))
		{
			cellStart[sortedData.x] = index;
		}
	}
}
//Returns nonzero if the two AABBs overlap on all 3 axes (touching counts as overlap).
//Only x/y/z are compared; the w components carry packed indices, not coordinates.
int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)
{
	return (min0.x <= max1.x)&& (min1.x <= max0.x) &&
		   (min0.y <= max1.y)&& (min1.y <= max0.y) &&
		   (min0.z <= max1.z)&& (min1.z <= max0.z);
}
//search for AABB 'index' against other AABBs' in this cell
//search for AABB 'index' against other AABBs' in this cell
//Appends overlapping pairs to pPairBuff2 via an atomic counter; pairs are
//emitted only when handleIndex < handleIndex2 so each pair is reported once.
//Pairs beyond maxPairs are counted but not stored.
void findPairsInCell( int numObjects,
					  int4 gridPos,
					  int index,
					  __global int2* pHash,
					  __global int* pCellStart,
					  __global float4* allpAABB,
					  __global const int* smallAabbMapping,
					  __global float4* pParams,
					  volatile __global int* pairCount,
					  __global int4* pPairBuff2,
					  int maxPairs
					  )
{
	//pParams[1].w limits how many bodies per cell are examined
	int4 pGridDim = *((__global int4*)(pParams + 1));
	int maxBodiesPerCell = pGridDim.w;

	int gridHash = getPosHash(gridPos, pParams);

	// get start of bucket for this cell
	int bucketStart = pCellStart[gridHash];
	if (bucketStart == -1)
	{
		return; // cell empty
	}

	// iterate over bodies in this cell
	int2 sortedData = pHash[index];
	int unsorted_indx = sortedData.y;
	float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];
	float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];
	int handleIndex = as_int(min0.w);  //Object handle packed into the min vector's w component

	int bucketEnd = bucketStart + maxBodiesPerCell;
	bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;

	for(int index2 = bucketStart; index2 < bucketEnd; index2++)
	{
		int2 cellData = pHash[index2];
		if (cellData.x != gridHash)
		{
			break; // no longer in same bucket
		}

		int unsorted_indx2 = cellData.y;
		//if (unsorted_indx2 < unsorted_indx) // check not colliding with self
		if (unsorted_indx2 != unsorted_indx) // check not colliding with self
		{
			float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];
			float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];

			if(testAABBOverlap(min0, max0, min1, max1))
			{
				if (pairCount)
				{
					int handleIndex2 = as_int(min1.w);
					//Only the lower handle emits the pair, to avoid duplicates
					if (handleIndex<handleIndex2)
					{
						int curPair = atomic_add(pairCount,1);
						if (curPair<maxPairs)
						{
							int4 newpair;
							newpair.x = handleIndex;
							newpair.y = handleIndex2;
							newpair.z = -1;
							newpair.w = -1;
							pPairBuff2[curPair] = newpair;
						}
					}
				}
			}
		}
	}
}
//One work-item per small AABB: locates the AABB's grid cell from its center and
//searches the surrounding 3x3x3 block of cells for overlapping pairs.
__kernel void kFindOverlappingPairs(	int numObjects,
										__global float4* allpAABB,
										__global const int* smallAabbMapping,
										__global int2* pHash,
										__global int* pCellStart,
										__global float4* pParams ,
										volatile __global int* pairCount,
										__global int4* pPairBuff2,
										int maxPairs
										)
{
	int index = get_global_id(0);
	if(index >= numObjects)
	{
		return;
	}

	int2 sortedData = pHash[index];
	int unsorted_indx = sortedData.y;
	float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];
	float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];

	//AABB center determines the home cell
	float4 pos;
	pos.x = (bbMin.x + bbMax.x) * 0.5f;
	pos.y = (bbMin.y + bbMax.y) * 0.5f;
	pos.z = (bbMin.z + bbMax.z) * 0.5f;

	// get address in grid
	int4 gridPosA = getGridPos(pos, pParams);
	int4 gridPosB;

	// examine only neighbouring cells
	for(int z=-1; z<=1; z++)
	{
		gridPosB.z = gridPosA.z + z;
		for(int y=-1; y<=1; y++)
		{
			gridPosB.y = gridPosA.y + y;
			for(int x=-1; x<=1; x++)
			{
				gridPosB.x = gridPosA.x + x;
				findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);
			}
		}
	}
}

View file

@ -1,198 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* gridBroadphaseCL =
"int getPosHash(int4 gridPos, __global float4* pParams)\n"
"{\n"
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
" gridPos.x &= gridDim.x - 1;\n"
" gridPos.y &= gridDim.y - 1;\n"
" gridPos.z &= gridDim.z - 1;\n"
" int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n"
" return hash;\n"
"} \n"
"int4 getGridPos(float4 worldPos, __global float4* pParams)\n"
"{\n"
" int4 gridPos;\n"
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
" gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n"
" gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n"
" gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n"
" return gridPos;\n"
"}\n"
"// calculate grid hash value for each body using its AABB\n"
"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numObjects)\n"
" {\n"
" return;\n"
" }\n"
" float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n"
" float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n"
" float4 pos;\n"
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
" pos.w = 0.f;\n"
" // get address in grid\n"
" int4 gridPos = getGridPos(pos, pParams);\n"
" int gridHash = getPosHash(gridPos, pParams);\n"
" // store grid hash and body index\n"
" int2 hashVal;\n"
" hashVal.x = gridHash;\n"
" hashVal.y = index;\n"
" pHash[index] = hashVal;\n"
"}\n"
"__kernel void kClearCellStart( int numCells, \n"
" __global int* pCellStart )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numCells)\n"
" {\n"
" return;\n"
" }\n"
" pCellStart[index] = -1;\n"
"}\n"
"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n"
"{\n"
" __local int sharedHash[513];\n"
" int index = get_global_id(0);\n"
" int2 sortedData;\n"
" if(index < numObjects)\n"
" {\n"
" sortedData = pHash[index];\n"
" // Load hash data into shared memory so that we can look \n"
" // at neighboring body's hash value without loading\n"
" // two hash values per thread\n"
" sharedHash[get_local_id(0) + 1] = sortedData.x;\n"
" if((index > 0) && (get_local_id(0) == 0))\n"
" {\n"
" // first thread in block must load neighbor body hash\n"
" sharedHash[0] = pHash[index-1].x;\n"
" }\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(index < numObjects)\n"
" {\n"
" if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n"
" {\n"
" cellStart[sortedData.x] = index;\n"
" }\n"
" }\n"
"}\n"
"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n"
"{\n"
" return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n"
" (min0.y <= max1.y)&& (min1.y <= max0.y) && \n"
" (min0.z <= max1.z)&& (min1.z <= max0.z); \n"
"}\n"
"//search for AABB 'index' against other AABBs' in this cell\n"
"void findPairsInCell( int numObjects,\n"
" int4 gridPos,\n"
" int index,\n"
" __global int2* pHash,\n"
" __global int* pCellStart,\n"
" __global float4* allpAABB, \n"
" __global const int* smallAabbMapping,\n"
" __global float4* pParams,\n"
" volatile __global int* pairCount,\n"
" __global int4* pPairBuff2,\n"
" int maxPairs\n"
" )\n"
"{\n"
" int4 pGridDim = *((__global int4*)(pParams + 1));\n"
" int maxBodiesPerCell = pGridDim.w;\n"
" int gridHash = getPosHash(gridPos, pParams);\n"
" // get start of bucket for this cell\n"
" int bucketStart = pCellStart[gridHash];\n"
" if (bucketStart == -1)\n"
" {\n"
" return; // cell empty\n"
" }\n"
" // iterate over bodies in this cell\n"
" int2 sortedData = pHash[index];\n"
" int unsorted_indx = sortedData.y;\n"
" float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n"
" float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
" int handleIndex = as_int(min0.w);\n"
" \n"
" int bucketEnd = bucketStart + maxBodiesPerCell;\n"
" bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n"
" for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n"
" {\n"
" int2 cellData = pHash[index2];\n"
" if (cellData.x != gridHash)\n"
" {\n"
" break; // no longer in same bucket\n"
" }\n"
" int unsorted_indx2 = cellData.y;\n"
" //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n"
" if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n"
" { \n"
" float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n"
" float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n"
" if(testAABBOverlap(min0, max0, min1, max1))\n"
" {\n"
" if (pairCount)\n"
" {\n"
" int handleIndex2 = as_int(min1.w);\n"
" if (handleIndex<handleIndex2)\n"
" {\n"
" int curPair = atomic_add(pairCount,1);\n"
" if (curPair<maxPairs)\n"
" {\n"
" int4 newpair;\n"
" newpair.x = handleIndex;\n"
" newpair.y = handleIndex2;\n"
" newpair.z = -1;\n"
" newpair.w = -1;\n"
" pPairBuff2[curPair] = newpair;\n"
" }\n"
" }\n"
" \n"
" }\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void kFindOverlappingPairs( int numObjects,\n"
" __global float4* allpAABB, \n"
" __global const int* smallAabbMapping,\n"
" __global int2* pHash, \n"
" __global int* pCellStart, \n"
" __global float4* pParams ,\n"
" volatile __global int* pairCount,\n"
" __global int4* pPairBuff2,\n"
" int maxPairs\n"
" )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numObjects)\n"
" {\n"
" return;\n"
" }\n"
" int2 sortedData = pHash[index];\n"
" int unsorted_indx = sortedData.y;\n"
" float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n"
" float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
" float4 pos;\n"
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
" // get address in grid\n"
" int4 gridPosA = getGridPos(pos, pParams);\n"
" int4 gridPosB; \n"
" // examine only neighbouring cells\n"
" for(int z=-1; z<=1; z++) \n"
" {\n"
" gridPosB.z = gridPosA.z + z;\n"
" for(int y=-1; y<=1; y++) \n"
" {\n"
" gridPosB.y = gridPosA.y + y;\n"
" for(int x=-1; x<=1; x++) \n"
" {\n"
" gridPosB.x = gridPosA.x + x;\n"
" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n"
" }\n"
" }\n"
" }\n"
"}\n";

View file

@ -1,767 +0,0 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
typedef float b3Scalar;
typedef float4 b3Vector3;
#define b3Max max
#define b3Min min
#define b3Sqrt sqrt
//Key-value pair for the parallel radix sort.
//In this file m_key holds a leaf AABB's morton code and m_value the index of
//the leaf/AABB it was computed from (see assignMortonCodesAndAabbIndicies).
typedef struct
{
unsigned int m_key;
unsigned int m_value;
} SortDataCL;
//Axis-aligned bounding box with min/max corners stored as float4.
//The unions expose each corner as floats or ints; the otherwise-unused w slot
//(m_minIndices[3]) is read elsewhere in this file as the rigid body index
//owning the AABB (see e.g. plbvhCalculateOverlappingPairs pair output).
typedef struct
{
union
{
float4 m_min;
float m_minElems[4];
int m_minIndices[4];
};
union
{
float4 m_max;
float m_maxElems[4];
int m_maxIndices[4];
};
} b3AabbCL;
unsigned int interleaveBits(unsigned int x)
{
	//Spread the low 10 bits of x three positions apart so that source bit k
	//ends up at bit 3k, leaving room for two other interleaved axes:
	//........ ........ ......12 3456789A  (input, after masking)
	//....1..2 ..3..4.. 5..6..7. .8..9..A  (result)
	//Each step ORs in a shifted copy of the bits and masks the result; the
	//kept bit groups never overlap the shifted ones, so OR is equivalent to
	//the usual XOR formulation of morton-code bit spreading.
	x &= 0x000003FF; //Clear all bits above bit 10
	x = (x | (x << 16)) & 0xFF0000FF;
	x = (x | (x << 8)) & 0x0300F00F;
	x = (x | (x << 4)) & 0x030C30C3;
	x = (x | (x << 2)) & 0x09249249;
	return x;
}
unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)
{
	//30-bit morton code (z-curve): x occupies bits 0,3,6,..., y occupies
	//bits 1,4,7,..., and z occupies bits 2,5,8,...
	unsigned int code = interleaveBits(z) << 2;
	code |= interleaveBits(y) << 1;
	code |= interleaveBits(x);
	return code;
}
__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)
{
	//Gather kernel: out_aabbs[i] = unseparatedAabbs[ aabbIndices[i] ],
	//one work-item per separated AABB.
	int separatedIndex = get_global_id(0);
	if(separatedIndex < numAabbsToSeparate)
	{
		int sourceIndex = aabbIndices[separatedIndex];
		out_aabbs[separatedIndex] = unseparatedAabbs[sourceIndex];
	}
}
//Should replace with an optimized parallel reduction
__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)
{
	//One pairwise AABB merge pass; the host enqueues this kernel repeatedly,
	//halving the count each time, until out_mergedAabb[0] encloses all AABBs.
	//Example with 159 AABBs: 159 / 2 + 159 % 2 == 80 remain, and the
	//159 - 80 == 79 merged slots are [0, 78] combined with [80, 158].
	int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;
	int aabbIndex = get_global_id(0);
	if(aabbIndex >= numAabbsNeedingMerge - numRemainingAabbs) return;
	b3AabbCL first = out_mergedAabb[aabbIndex];
	b3AabbCL second = out_mergedAabb[aabbIndex + numRemainingAabbs];
	b3AabbCL merged;
	merged.m_min = b3Min(first.m_min, second.m_min);
	merged.m_max = b3Max(first.m_max, second.m_max);
	out_mergedAabb[aabbIndex] = merged;
}
//Assigns each leaf AABB a 30-bit morton code by quantizing its center into a
//1024^3 grid spanning the merged AABB of all nodes (mergedAabbOfAllNodes[0]),
//and pairs the code with the AABB index for the subsequent radix sort.
__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes,
__global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)
{
int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index
if(leafNodeIndex >= numAabbs) return;
b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];
b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;
b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;
b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];
b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;
b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;
//Quantize into integer coordinates
//floor() is needed to prevent the center cell, at (0,0,0) from being twice the size
//(a plain float-to-int cast would truncate toward zero on both sides of 0)
b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;
int4 discretePosition;
discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );
discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );
discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );
//Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]
//(10 bits per axis, matching the 10 bits consumed by interleaveBits())
discretePosition = b3Max( -512, b3Min(discretePosition, 511) );
discretePosition += 512;
//Interleave bits(assign a morton code, also known as a z-curve)
unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);
//
SortDataCL mortonCodeIndexPair;
mortonCodeIndexPair.m_key = mortonCode;
mortonCodeIndexPair.m_value = leafNodeIndex;
out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;
}
#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128
//Node indices pack a type tag into the top bit: the most significant bit
//(0x80000000) is set for internal nodes and clear for leaf nodes. Strip the
//marker (getIndexWithInternalNodeMarkerRemoved) before using the value as an
//array index.
int isLeafNode(int index) { return (index & 0x80000000) == 0; }
int getIndexWithInternalNodeMarkerRemoved(int index) { return index & 0x7FFFFFFF; }
int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return isLeaf ? index : (index | 0x80000000); }
//From sap.cl
#define NEW_PAIR_MARKER -1
bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)
{
	//Per-axis interval test: the boxes are disjoint if they are separated
	//on any axis; otherwise they overlap.
	if (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) return false;
	if (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) return false;
	if (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) return false;
	return true;
}
//From sap.cl
//Broad-phase query: one thread per leaf (in z-curve order) walks the BVH with
//an explicit stack and appends an int4 pair of rigid body indices (tagged
//NEW_PAIR_MARKER in z/w) for every other leaf AABB it overlaps.
//out_numPairs counts all detected pairs, but only the first maxPairs are
//written, so the caller can detect overflow.
__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs,
__global int* rootNodeIndex,
__global int2* internalNodeChildIndices,
__global b3AabbCL* internalNodeAabbs,
__global int2* internalNodeLeafIndexRanges,
__global SortDataCL* mortonCodesAndAabbIndices,
__global int* out_numPairs, __global int4* out_overlappingPairs,
int maxPairs, int numQueryAabbs)
{
//Using get_group_id()/get_local_id() is Faster than get_global_id(0) since
//mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)
int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);
if(queryBvhNodeIndex >= numQueryAabbs) return;
int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;
b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];
int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];
int stackSize = 1;
stack[0] = *rootNodeIndex;
while(stackSize)
{
//Pop the next node; indices carry the internal-node marker bit
int internalOrLeafNodeIndex = stack[ stackSize - 1 ];
--stackSize;
int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false
int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);
//Optimization - if the BVH is structured as a binary radix tree, then
//each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).
//This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.
{
int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;
if(highestLeafIndex <= queryBvhNodeIndex) continue;
}
//bvhRigidIndex is not used if internal node
int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;
b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];
if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )
{
if(isLeaf)
{
//Rigid body indices are stored in the AABB's m_minIndices[3] slot
int4 pair;
pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];
pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];
pair.z = NEW_PAIR_MARKER;
pair.w = NEW_PAIR_MARKER;
int pairIndex = atomic_inc(out_numPairs);
if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;
}
if(!isLeaf) //Internal node
{
//NOTE(review): on stack overflow the children (and so the whole
//subtree) are silently dropped; possible missed pairs
if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)
{
//Error
}
else
{
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;
}
}
}
}
}
//From rayCastKernels.cl
//Ray query input: a segment with endpoints m_from and m_to.
//Stored as float4; the traversal code derives direction and length from the
//endpoints and only uses the xyz components.
typedef struct
{
float4 m_from;
float4 m_to;
} b3RayInfo;
//From rayCastKernels.cl
b3Vector3 b3Vector3_normalize(b3Vector3 v)
{
	//Zero the w component first so the float4 normalize() builtin
	//effectively normalizes a pure 3D vector.
	v.w = 0.f;
	return normalize(v); //OpenCL normalize == vector4 normalize
}
//3-component dot product; the w components are ignored.
b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }
//Squared length of the xyz part, i.e. dot(v, v).
b3Scalar b3Vector3_length2(b3Vector3 v) { return b3Vector3_dot(v, v); }
//Segment-vs-AABB slab test. Returns nonzero if the segment starting at
//rayOrigin, with direction rayNormalizedDirection and length rayLength,
//intersects aabb; the intersection interval is clipped to [0, rayLength].
int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)
{
//AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).
//t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.
//
//if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane
//and min.x will be the far plane; otherwise, it is reversed.
//
//In order for there to be a collision, the t_min and t_max of each pair must overlap.
//This can be tested for by selecting the highest t_min and lowest t_max and comparing them.
int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) ); //isless(x,y) returns (x < y)
//When using vector types, the select() function checks the most signficant bit,
//but isless() sets the least significant bit.
isNegative <<= 31;
//select(b, a, condition) == condition ? a : b
//When using select() with vector types, (condition[i]) is true if its most significant bit is 1
b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;
b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;
b3Scalar t_min_final = 0.0f;
b3Scalar t_max_final = rayLength;
//Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned.
//Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])
//Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.
t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );
t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );
return (t_min_final <= t_max_final);
}
//BVH ray query: one thread per ray walks the tree with an explicit stack and
//appends a (rayIndex, rigidBodyIndex) pair for every leaf AABB the segment
//intersects. out_numRayRigidPairs counts all hits, but only the first
//maxRayRigidPairs are written, so the caller can detect overflow.
//NOTE(review): the internalNodeLeafIndexRanges parameter is unused here.
__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,
__global int* rootNodeIndex,
__global int2* internalNodeChildIndices,
__global b3AabbCL* internalNodeAabbs,
__global int2* internalNodeLeafIndexRanges,
__global SortDataCL* mortonCodesAndAabbIndices,
__global b3RayInfo* rays,
__global int* out_numRayRigidPairs,
__global int2* out_rayRigidPairs,
int maxRayRigidPairs, int numRays)
{
int rayIndex = get_global_id(0);
if(rayIndex >= numRays) return;
//
b3Vector3 rayFrom = rays[rayIndex].m_from;
b3Vector3 rayTo = rays[rayIndex].m_to;
b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);
b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );
//
int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];
int stackSize = 1;
stack[0] = *rootNodeIndex;
while(stackSize)
{
int internalOrLeafNodeIndex = stack[ stackSize - 1 ];
--stackSize;
int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false
int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);
//bvhRigidIndex is not used if internal node
int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;
b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];
if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) )
{
if(isLeaf)
{
//Rigid body index is stored in the AABB's m_minIndices[3] slot
int2 rayRigidPair;
rayRigidPair.x = rayIndex;
rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];
int pairIndex = atomic_inc(out_numRayRigidPairs);
if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;
}
if(!isLeaf) //Internal node
{
//NOTE(review): on stack overflow the subtree is silently skipped
if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)
{
//Error
}
else
{
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;
}
}
}
}
}
__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs,
__global int* out_numPairs, __global int4* out_overlappingPairs,
int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)
{
	//Brute-force pass for the few oversized AABBs that are excluded from the
	//BVH: one thread per small AABB, tested against every large AABB.
	//out_numPairs counts all hits; only the first maxPairs are written.
	int smallAabbIndex = get_global_id(0);
	if(smallAabbIndex >= numSmallAabbRigids) return;
	b3AabbCL smallAabb = smallAabbs[smallAabbIndex];
	for(int largeAabbIndex = 0; largeAabbIndex < numLargeAabbRigids; ++largeAabbIndex)
	{
		b3AabbCL largeAabb = largeAabbs[largeAabbIndex];
		if( !TestAabbAgainstAabb2(&smallAabb, &largeAabb) ) continue;
		//Overlap: emit a (large, small) pair of the indices stored in m_minIndices[3]
		int4 pair;
		pair.x = largeAabb.m_minIndices[3];
		pair.y = smallAabb.m_minIndices[3];
		pair.z = NEW_PAIR_MARKER;
		pair.w = NEW_PAIR_MARKER;
		int pairIndex = atomic_inc(out_numPairs);
		if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;
	}
}
__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,
__global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs,
int numLargeAabbRigids, int maxRayRigidPairs, int numRays)
{
	//Brute-force ray pass for oversized AABBs excluded from the BVH:
	//one thread per ray, tested against every large AABB.
	//out_numRayRigidPairs counts all hits; only the first maxRayRigidPairs are written.
	int rayIndex = get_global_id(0);
	if(rayIndex >= numRays) return;
	b3Vector3 rayFrom = rays[rayIndex].m_from;
	b3Vector3 rayTo = rays[rayIndex].m_to;
	b3Vector3 rayDelta = rayTo - rayFrom;
	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayDelta);
	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayDelta) );
	for(int largeAabbIndex = 0; largeAabbIndex < numLargeAabbRigids; ++largeAabbIndex)
	{
		b3AabbCL rigidAabb = largeRigidAabbs[largeAabbIndex];
		if( !rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) ) continue;
		//Hit: emit (rayIndex, rigid body index stored in m_minIndices[3])
		int2 rayRigidPair;
		rayRigidPair.x = rayIndex;
		rayRigidPair.y = rigidAabb.m_minIndices[3];
		int pairIndex = atomic_inc(out_numRayRigidPairs);
		if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;
	}
}
//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.
//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.
//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.
//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).
#define B3_PLBVH_INVALID_COMMON_PREFIX 128
#define B3_PLBVH_ROOT_NODE_MARKER -1
#define b3Int64 long
//Number of identical leading bits in the 64-bit keys i and j.
int computeCommonPrefixLength(b3Int64 i, b3Int64 j)
{
	b3Int64 differingBits = i ^ j;
	return (int)clz(differingBits);
}
b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j)
{
	//Returns the bits shared by i and j with everything below the common
	//prefix forced to 0. Only (i & j) is strictly required by the algorithm;
	//masking the low bits just makes values easier to inspect when debugging.
	int prefixLength = computeCommonPrefixLength(i, j);
	b3Int64 highBitsMask = ((b3Int64)(~0)) << (64 - prefixLength); //Set all bits after the common prefix to 0
	return (i & j) & highBitsMask;
}
//Same as computeCommonPrefixLength(), but clamps the result to the shorter
//of the two operands' own prefix lengths.
int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)
{
	int sharedBits = computeCommonPrefixLength(prefixA, prefixB);
	int shorterPrefix = b3Min(prefixLengthA, prefixLengthB);
	return b3Min(sharedBits, shorterPrefix);
}
__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,
__global b3Int64* out_commonPrefixes,
__global int* out_commonPrefixLengths,
int numInternalNodes)
{
	//One thread per internal node i computes the common prefix of the morton
	//codes of adjacent leaves i and i+1.
	int internalNodeIndex = get_global_id(0);
	if (internalNodeIndex >= numInternalNodes) return;
	//(internalNodeIndex + 1) is always in bounds: these are leaf indices and
	//numLeafNodes == numInternalNodes + 1.
	int leftLeafIndex = internalNodeIndex;
	int rightLeafIndex = internalNodeIndex + 1;
	int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;
	int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;
	//The binary radix tree build breaks on duplicate morton codes, so append
	//each leaf's index as the low 32 bits to make every key unique. Sorted
	//ascending order is preserved since leftLeafIndex < rightLeafIndex.
	//
	//upsample(a, b) == ( ((b3Int64)a) << 32) | b
	b3Int64 uniqueLeftKey = upsample(leftLeafMortonCode, leftLeafIndex);
	b3Int64 uniqueRightKey = upsample(rightLeafMortonCode, rightLeafIndex);
	out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(uniqueLeftKey, uniqueRightKey);
	out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(uniqueLeftKey, uniqueRightKey);
}
//Links each leaf node into the binary radix tree: of the two adjacent splits
//(left at leafNodeIndex - 1, right at leafNodeIndex), the one with the longer
//common prefix becomes the leaf's parent, and the leaf is written into that
//internal node's corresponding child slot with the leaf marker set.
__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,
__global int2* out_childNodes, int numLeafNodes)
{
int leafNodeIndex = get_global_id(0);
if (leafNodeIndex >= numLeafNodes) return;
int numInternalNodes = numLeafNodes - 1;
int leftSplitIndex = leafNodeIndex - 1;
int rightSplitIndex = leafNodeIndex;
//Out-of-range splits (first/last leaf) get the invalid sentinel
int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
//Parent node is the highest adjacent common prefix that is lower than the node's common prefix
//Leaf nodes are considered as having the highest common prefix
int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);
//Handle cases for the edge nodes; the first and last node
//For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX
if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;
if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;
int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;
out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;
int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa
//out_childNodesAsInt[0] == int2.x == left child
//out_childNodesAsInt[1] == int2.y == right child
int isLeaf = 1;
__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);
out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);
}
//Links each internal node into the binary radix tree. For node i it finds the
//nearest node to the left and to the right whose common prefix is shorter
//than node i's (binary search by default, linear search behind an #ifdef);
//of those two candidates, the one with the longer prefix becomes i's parent.
//A node with no such neighbor on either side is the root.
__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,
__global int2* out_childNodes,
__global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,
int numInternalNodes)
{
int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);
if(internalNodeIndex >= numInternalNodes) return;
b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];
int nodePrefixLength = commonPrefixLengths[internalNodeIndex];
//#define USE_LINEAR_SEARCH
#ifdef USE_LINEAR_SEARCH
int leftIndex = -1;
int rightIndex = -1;
//Find nearest element to left with a lower common prefix
for(int i = internalNodeIndex - 1; i >= 0; --i)
{
int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);
if(nodeLeftSharedPrefixLength < nodePrefixLength)
{
leftIndex = i;
break;
}
}
//Find nearest element to right with a lower common prefix
for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)
{
int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);
if(nodeRightSharedPrefixLength < nodePrefixLength)
{
rightIndex = i;
break;
}
}
#else //Use binary search
//Find nearest element to left with a lower common prefix
int leftIndex = -1;
{
int lower = 0;
int upper = internalNodeIndex - 1;
while(lower <= upper)
{
int mid = (lower + upper) / 2;
b3Int64 midPrefix = commonPrefixes[mid];
int midPrefixLength = commonPrefixLengths[mid];
int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);
if(nodeMidSharedPrefixLength < nodePrefixLength)
{
//mid qualifies; check whether a closer (righter) element also qualifies
int right = mid + 1;
if(right < internalNodeIndex)
{
b3Int64 rightPrefix = commonPrefixes[right];
int rightPrefixLength = commonPrefixLengths[right];
int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);
if(nodeRightSharedPrefixLength < nodePrefixLength)
{
lower = right;
leftIndex = right;
}
else
{
leftIndex = mid;
break;
}
}
else
{
leftIndex = mid;
break;
}
}
else upper = mid - 1;
}
}
//Find nearest element to right with a lower common prefix
int rightIndex = -1;
{
int lower = internalNodeIndex + 1;
int upper = numInternalNodes - 1;
while(lower <= upper)
{
int mid = (lower + upper) / 2;
b3Int64 midPrefix = commonPrefixes[mid];
int midPrefixLength = commonPrefixLengths[mid];
int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);
if(nodeMidSharedPrefixLength < nodePrefixLength)
{
//mid qualifies; check whether a closer (lefter) element also qualifies
int left = mid - 1;
if(left > internalNodeIndex)
{
b3Int64 leftPrefix = commonPrefixes[left];
int leftPrefixLength = commonPrefixLengths[left];
int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);
if(nodeLeftSharedPrefixLength < nodePrefixLength)
{
upper = left;
rightIndex = left;
}
else
{
rightIndex = mid;
break;
}
}
else
{
rightIndex = mid;
break;
}
}
else lower = mid + 1;
}
}
#endif
//Select parent
{
int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);
if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;
else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;
int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;
int isRootNode = (leftIndex == -1 && rightIndex == -1);
out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;
int isLeaf = 0;
if(!isRootNode)
{
int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa
//out_childNodesAsInt[0] == int2.x == left child
//out_childNodesAsInt[1] == int2.y == right child
__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);
out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);
}
else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);
}
}
//Computes, for every internal node, its depth (number of parent hops) from
//the root by chasing parent links, and reduces the maximum depth into
//out_maxDistanceFromRoot (per-work-group reduction in local memory, then a
//global atomic_max).
//NOTE(review): the reset of out_maxDistanceFromRoot by global thread 0 is not
//synchronized with the atomic_max() calls of other work-groups; presumably
//the buffer is zeroed or ordering is guaranteed by the host — confirm.
__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,
__global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)
{
if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);
int internalNodeIndex = get_global_id(0);
if(internalNodeIndex >= numInternalNodes) return;
//
//Walk up the parent chain; the root (parent == B3_PLBVH_ROOT_NODE_MARKER)
//ends at distance 0
int distanceFromRoot = 0;
{
int parentIndex = internalNodeParentNodes[internalNodeIndex];
while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)
{
parentIndex = internalNodeParentNodes[parentIndex];
++distanceFromRoot;
}
}
out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;
//
//Reduce to the work-group max in local memory, then fold into the global max
__local int localMaxDistanceFromRoot;
if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;
barrier(CLK_LOCAL_MEM_FENCE);
atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);
barrier(CLK_LOCAL_MEM_FENCE);
if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);
}
//One bottom-up AABB propagation pass: only internal nodes whose distance from
//the root equals processedDistance merge their children's AABBs this launch.
//Presumably enqueued once per level, deepest level first down to 0, so that
//children are finalized before their parents — confirm against host code.
//NOTE(review): the maxDistanceFromRoot parameter is unused in this kernel.
__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,
__global int2* childNodes,
__global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,
int maxDistanceFromRoot, int processedDistance, int numInternalNodes)
{
int internalNodeIndex = get_global_id(0);
if(internalNodeIndex >= numInternalNodes) return;
int distance = distanceFromRoot[internalNodeIndex];
if(distance == processedDistance)
{
int leftChildIndex = childNodes[internalNodeIndex].x;
int rightChildIndex = childNodes[internalNodeIndex].y;
int isLeftChildLeaf = isLeafNode(leftChildIndex);
int isRightChildLeaf = isLeafNode(rightChildIndex);
leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);
rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);
//leftRigidIndex/rightRigidIndex is not used if internal node
int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;
int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;
b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];
b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];
b3AabbCL mergedAabb;
mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);
mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);
internalNodeAabbs[internalNodeIndex] = mergedAabb;
}
}
__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)
{
	//For each internal node, compute the [min, max] leaf indices it covers.
	//In a binary radix tree the leaves under a node are contiguous, so it
	//suffices to follow left children to the leftmost leaf and right
	//children to the rightmost leaf.
	int internalNodeIndex = get_global_id(0);
	if(internalNodeIndex >= numInternalNodes) return;
	int2 childNodes = internalNodeChildNodes[internalNodeIndex];
	//Descend the left spine to the lowest covered leaf index
	int lowestIndex = childNodes.x; //childNodes.x == Left child
	while( !isLeafNode(lowestIndex) )
	{
		lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;
	}
	//Descend the right spine to the highest covered leaf index
	int highestIndex = childNodes.y; //childNodes.y == Right child
	while( !isLeafNode(highestIndex) )
	{
		highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;
	}
	int2 leafIndexRange; //x == min leaf index, y == max leaf index
	leafIndexRange.x = lowestIndex;
	leafIndexRange.y = highestIndex;
	out_leafIndexRanges[internalNodeIndex] = leafIndexRange;
}

View file

@ -1,728 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
//NOTE(review): this string is a verbatim embedded copy of parallelLinearBvh.cl,
//compiled at runtime as an OpenCL program. Do not edit it by hand — regenerate
//it from the .cl source so the two stay byte-identical.
static const char* parallelLinearBvhCL =
	"/*\n"
	"This software is provided 'as-is', without any express or implied warranty.\n"
	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
	"Permission is granted to anyone to use this software for any purpose,\n"
	"including commercial applications, and to alter it and redistribute it freely,\n"
	"subject to the following restrictions:\n"
	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
	"3. This notice may not be removed or altered from any source distribution.\n"
	"*/\n"
	"//Initial Author Jackson Lee, 2014\n"
	"typedef float b3Scalar;\n"
	"typedef float4 b3Vector3;\n"
	"#define b3Max max\n"
	"#define b3Min min\n"
	"#define b3Sqrt sqrt\n"
	"typedef struct\n"
	"{\n"
	"	unsigned int m_key;\n"
	"	unsigned int m_value;\n"
	"} SortDataCL;\n"
	"typedef struct \n"
	"{\n"
	"	union\n"
	"	{\n"
	"		float4 m_min;\n"
	"		float m_minElems[4];\n"
	"		int m_minIndices[4];\n"
	"	};\n"
	"	union\n"
	"	{\n"
	"		float4 m_max;\n"
	"		float m_maxElems[4];\n"
	"		int m_maxIndices[4];\n"
	"	};\n"
	"} b3AabbCL;\n"
	"unsigned int interleaveBits(unsigned int x)\n"
	"{\n"
	"	//........ ........ ......12 3456789A	//x\n"
	"	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x after interleaving bits\n"
	"	\n"
	"	//......12 3456789A ......12 3456789A	//x ^ (x << 16)\n"
	"	//11111111 ........ ........ 11111111	//0x FF 00 00 FF\n"
	"	//......12 ........ ........ 3456789A	//x = (x ^ (x << 16)) & 0xFF0000FF;\n"
	"	\n"
	"	//......12 ........ 3456789A 3456789A	//x ^ (x << 8)\n"
	"	//......11 ........ 1111.... ....1111	//0x 03 00 F0 0F\n"
	"	//......12 ........ 3456.... ....789A	//x = (x ^ (x << 8)) & 0x0300F00F;\n"
	"	\n"
	"	//..12..12 ....3456 3456.... 789A789A	//x ^ (x << 4)\n"
	"	//......11 ....11.. ..11.... 11....11	//0x 03 0C 30 C3\n"
	"	//......12 ....34.. ..56.... 78....9A	//x = (x ^ (x << 4)) & 0x030C30C3;\n"
	"	\n"
	"	//....1212 ..3434.. 5656..78 78..9A9A	//x ^ (x << 2)\n"
	"	//....1..1 ..1..1.. 1..1..1. .1..1..1	//0x 09 24 92 49\n"
	"	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x = (x ^ (x << 2)) & 0x09249249;\n"
	"	\n"
	"	//........ ........ ......11 11111111	//0x000003FF\n"
	"	x &= 0x000003FF;		//Clear all bits above bit 10\n"
	"	\n"
	"	x = (x ^ (x << 16)) & 0xFF0000FF;\n"
	"	x = (x ^ (x << 8)) & 0x0300F00F;\n"
	"	x = (x ^ (x << 4)) & 0x030C30C3;\n"
	"	x = (x ^ (x << 2)) & 0x09249249;\n"
	"	\n"
	"	return x;\n"
	"}\n"
	"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n"
	"{\n"
	"	return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n"
	"}\n"
	"__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n"
	"{\n"
	"	int separatedAabbIndex = get_global_id(0);\n"
	"	if(separatedAabbIndex >= numAabbsToSeparate) return;\n"
	"	int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n"
	"	out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n"
	"}\n"
	"//Should replace with an optimized parallel reduction\n"
	"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n"
	"{\n"
	"	//Each time this kernel is added to the command queue, \n"
	"	//the number of AABBs needing to be merged is halved\n"
	"	//\n"
	"	//Example with 159 AABBs:\n"
	"	//	numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n"
	"	//	numMergedAabbs == 159 - 80 == 79\n"
	"	//So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n"
	"	\n"
	"	int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n"
	"	int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n"
	"	\n"
	"	int aabbIndex = get_global_id(0);\n"
	"	if(aabbIndex >= numMergedAabbs) return;\n"
	"	\n"
	"	int otherAabbIndex = aabbIndex + numRemainingAabbs;\n"
	"	\n"
	"	b3AabbCL aabb = out_mergedAabb[aabbIndex];\n"
	"	b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n"
	"	\n"
	"	b3AabbCL mergedAabb;\n"
	"	mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n"
	"	mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n"
	"	out_mergedAabb[aabbIndex] = mergedAabb;\n"
	"}\n"
	"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n"
	"								__global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n"
	"{\n"
	"	int leafNodeIndex = get_global_id(0);	//Leaf node index == AABB index\n"
	"	if(leafNodeIndex >= numAabbs) return;\n"
	"	\n"
	"	b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n"
	"	b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n"
	"	b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n"
	"	\n"
	"	b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n"
	"	b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n"
	"	b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n"
	"	\n"
	"	//Quantize into integer coordinates\n"
	"	//floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n"
	"	b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n"
	"	\n"
	"	int4 discretePosition;\n"
	"	discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n"
	"	discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n"
	"	discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n"
	"	\n"
	"	//Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n"
	"	discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n"
	"	discretePosition += 512;\n"
	"	\n"
	"	//Interleave bits(assign a morton code, also known as a z-curve)\n"
	"	unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n"
	"	\n"
	"	//\n"
	"	SortDataCL mortonCodeIndexPair;\n"
	"	mortonCodeIndexPair.m_key = mortonCode;\n"
	"	mortonCodeIndexPair.m_value = leafNodeIndex;\n"
	"	\n"
	"	out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n"
	"}\n"
	"#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n"
	"//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n"
	"//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n"
	"//In both cases, the bit should be cleared to access the actual node index.\n"
	"int isLeafNode(int index) { return (index >> 31 == 0); }\n"
	"int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n"
	"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n"
	"//From sap.cl\n"
	"#define NEW_PAIR_MARKER -1\n"
	"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n"
	"{\n"
	"	bool overlap = true;\n"
	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
	"	return overlap;\n"
	"}\n"
	"//From sap.cl\n"
	"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n"
	"											__global int* rootNodeIndex, \n"
	"											__global int2* internalNodeChildIndices, \n"
	"											__global b3AabbCL* internalNodeAabbs,\n"
	"											__global int2* internalNodeLeafIndexRanges,\n"
	"											\n"
	"											__global SortDataCL* mortonCodesAndAabbIndices,\n"
	"											__global int* out_numPairs, __global int4* out_overlappingPairs, \n"
	"											int maxPairs, int numQueryAabbs)\n"
	"{\n"
	"	//Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n"
	"	//mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n"
	"	int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
	"	if(queryBvhNodeIndex >= numQueryAabbs) return;\n"
	"	\n"
	"	int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n"
	"	b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n"
	"	\n"
	"	int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
	"	\n"
	"	int stackSize = 1;\n"
	"	stack[0] = *rootNodeIndex;\n"
	"	\n"
	"	while(stackSize)\n"
	"	{\n"
	"		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
	"		--stackSize;\n"
	"		\n"
	"		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false\n"
	"		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
	"		\n"
	"		//Optimization - if the BVH is structured as a binary radix tree, then\n"
	"		//each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n"
	"		//This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n"
	"		{\n"
	"			int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n"
	"			if(highestLeafIndex <= queryBvhNodeIndex) continue;\n"
	"		}\n"
	"		\n"
	"		//bvhRigidIndex is not used if internal node\n"
	"		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
	"	\n"
	"		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
	"		if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n"
	"		{\n"
	"			if(isLeaf)\n"
	"			{\n"
	"				int4 pair;\n"
	"				pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n"
	"				pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
	"				pair.z = NEW_PAIR_MARKER;\n"
	"				pair.w = NEW_PAIR_MARKER;\n"
	"				\n"
	"				int pairIndex = atomic_inc(out_numPairs);\n"
	"				if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
	"			}\n"
	"			\n"
	"			if(!isLeaf)	//Internal node\n"
	"			{\n"
	"				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
	"				{\n"
	"					//Error\n"
	"				}\n"
	"				else\n"
	"				{\n"
	"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
	"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"		\n"
	"	}\n"
	"}\n"
	"//From rayCastKernels.cl\n"
	"typedef struct\n"
	"{\n"
	"	float4 m_from;\n"
	"	float4 m_to;\n"
	"} b3RayInfo;\n"
	"//From rayCastKernels.cl\n"
	"b3Vector3 b3Vector3_normalize(b3Vector3 v)\n"
	"{\n"
	"	b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n"
	"	return normalize(normal);	//OpenCL normalize == vector4 normalize\n"
	"}\n"
	"b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n"
	"b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n"
	"int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n"
	"{\n"
	"	//AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n"
	"	//t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.\n"
	"	//\n"
	"	//if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n"
	"	//and min.x will be the far plane; otherwise, it is reversed.\n"
	"	//\n"
	"	//In order for there to be a collision, the t_min and t_max of each pair must overlap.\n"
	"	//This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n"
	"	\n"
	"	int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) );	//isless(x,y) returns (x < y)\n"
	"	\n"
	"	//When using vector types, the select() function checks the most signficant bit, \n"
	"	//but isless() sets the least significant bit.\n"
	"	isNegative <<= 31;\n"
	"	//select(b, a, condition) == condition ? a : b\n"
	"	//When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n"
	"	b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
	"	b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
	"	\n"
	"	b3Scalar t_min_final = 0.0f;\n"
	"	b3Scalar t_max_final = rayLength;\n"
	"	\n"
	"	//Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n"
	"	//Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n"
	"	//Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n"
	"	t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n"
	"	t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n"
	"	\n"
	"	return (t_min_final <= t_max_final);\n"
	"}\n"
	"__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n"
	"								__global int* rootNodeIndex, \n"
	"								__global int2* internalNodeChildIndices, \n"
	"								__global b3AabbCL* internalNodeAabbs,\n"
	"								__global int2* internalNodeLeafIndexRanges,\n"
	"								__global SortDataCL* mortonCodesAndAabbIndices,\n"
	"								\n"
	"								__global b3RayInfo* rays,\n"
	"								\n"
	"								__global int* out_numRayRigidPairs, \n"
	"								__global int2* out_rayRigidPairs,\n"
	"								int maxRayRigidPairs, int numRays)\n"
	"{\n"
	"	int rayIndex = get_global_id(0);\n"
	"	if(rayIndex >= numRays) return;\n"
	"	\n"
	"	//\n"
	"	b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
	"	b3Vector3 rayTo = rays[rayIndex].m_to;\n"
	"	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
	"	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
	"	\n"
	"	//\n"
	"	int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
	"	\n"
	"	int stackSize = 1;\n"
	"	stack[0] = *rootNodeIndex;\n"
	"	\n"
	"	while(stackSize)\n"
	"	{\n"
	"		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
	"		--stackSize;\n"
	"		\n"
	"		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false\n"
	"		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
	"		\n"
	"		//bvhRigidIndex is not used if internal node\n"
	"		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
	"	\n"
	"		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
	"		if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb)  )\n"
	"		{\n"
	"			if(isLeaf)\n"
	"			{\n"
	"				int2 rayRigidPair;\n"
	"				rayRigidPair.x = rayIndex;\n"
	"				rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
	"				\n"
	"				int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
	"				if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
	"			}\n"
	"			\n"
	"			if(!isLeaf)	//Internal node\n"
	"			{\n"
	"				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
	"				{\n"
	"					//Error\n"
	"				}\n"
	"				else\n"
	"				{\n"
	"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
	"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"	}\n"
	"}\n"
	"__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n"
	"									__global int* out_numPairs, __global int4* out_overlappingPairs, \n"
	"									int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n"
	"{\n"
	"	int smallAabbIndex = get_global_id(0);\n"
	"	if(smallAabbIndex >= numSmallAabbRigids) return;\n"
	"	\n"
	"	b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n"
	"	for(int i = 0; i < numLargeAabbRigids; ++i)\n"
	"	{\n"
	"		b3AabbCL largeAabb = largeAabbs[i];\n"
	"		if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n"
	"		{\n"
	"			int4 pair;\n"
	"			pair.x = largeAabb.m_minIndices[3];\n"
	"			pair.y = smallAabb.m_minIndices[3];\n"
	"			pair.z = NEW_PAIR_MARKER;\n"
	"			pair.w = NEW_PAIR_MARKER;\n"
	"			\n"
	"			int pairIndex = atomic_inc(out_numPairs);\n"
	"			if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
	"		}\n"
	"	}\n"
	"}\n"
	"__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n"
	"									__global int* out_numRayRigidPairs,  __global int2* out_rayRigidPairs,\n"
	"									int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n"
	"{\n"
	"	int rayIndex = get_global_id(0);\n"
	"	if(rayIndex >= numRays) return;\n"
	"	\n"
	"	b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
	"	b3Vector3 rayTo = rays[rayIndex].m_to;\n"
	"	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
	"	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
	"	\n"
	"	for(int i = 0; i < numLargeAabbRigids; ++i)\n"
	"	{\n"
	"		b3AabbCL rigidAabb = largeRigidAabbs[i];\n"
	"		if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n"
	"		{\n"
	"			int2 rayRigidPair;\n"
	"			rayRigidPair.x = rayIndex;\n"
	"			rayRigidPair.y = rigidAabb.m_minIndices[3];\n"
	"			\n"
	"			int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
	"			if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
	"		}\n"
	"	}\n"
	"}\n"
	"//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n"
	"//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n"
	"//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n"
	"//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n"
	"#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n"
	"#define B3_PLBVH_ROOT_NODE_MARKER -1\n"
	"#define b3Int64 long\n"
	"int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n"
	"b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n"
	"{\n"
	"	//This function only needs to return (i & j) in order for the algorithm to work,\n"
	"	//but it may help with debugging to mask out the lower bits.\n"
	"	b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n"
	"	b3Int64 sharedBits = i & j;\n"
	"	b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength);	//Set all bits after the common prefix to 0\n"
	"	\n"
	"	return sharedBits & bitmask;\n"
	"}\n"
	"//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths\n"
	"int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n"
	"{\n"
	"	return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n"
	"}\n"
	"__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n"
	"											__global b3Int64* out_commonPrefixes,\n"
	"											__global int* out_commonPrefixLengths,\n"
	"											int numInternalNodes)\n"
	"{\n"
	"	int internalNodeIndex = get_global_id(0);\n"
	"	if (internalNodeIndex >= numInternalNodes) return;\n"
	"	\n"
	"	//Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n"
	"	//and the number of internal nodes is always numLeafNodes - 1\n"
	"	int leftLeafIndex = internalNodeIndex;\n"
	"	int rightLeafIndex = internalNodeIndex + 1;\n"
	"	\n"
	"	int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n"
	"	int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n"
	"	\n"
	"	//Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n"
	"	//Append the index of each leaf node to each morton code so that there are no duplicates.\n"
	"	//The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n"
	"	//is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n"
	"	//\n"
	"	//upsample(a, b) == ( ((b3Int64)a) << 32) | b\n"
	"	b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n"
	"	b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n"
	"	\n"
	"	out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
	"	out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
	"}\n"
	"__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n"
	"											__global int2* out_childNodes, int numLeafNodes)\n"
	"{\n"
	"	int leafNodeIndex = get_global_id(0);\n"
	"	if (leafNodeIndex >= numLeafNodes) return;\n"
	"	\n"
	"	int numInternalNodes = numLeafNodes - 1;\n"
	"	\n"
	"	int leftSplitIndex = leafNodeIndex - 1;\n"
	"	int rightSplitIndex = leafNodeIndex;\n"
	"	\n"
	"	int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
	"	int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
	"	\n"
	"	//Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n"
	"	//Leaf nodes are considered as having the highest common prefix\n"
	"	int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n"
	"	\n"
	"	//Handle cases for the edge nodes; the first and last node\n"
	"	//For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n"
	"	if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n"
	"	if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n"
	"	\n"
	"	int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;\n"
	"	out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n"
	"	\n"
	"	int isRightChild = (isLeftHigherCommonPrefix);	//If the left node is the parent, then this node is its right child and vice versa\n"
	"	\n"
	"	//out_childNodesAsInt[0] == int2.x == left child\n"
	"	//out_childNodesAsInt[1] == int2.y == right child\n"
	"	int isLeaf = 1;\n"
	"	__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
	"	out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n"
	"}\n"
	"__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n"
	"												__global int2* out_childNodes,\n"
	"												__global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n"
	"												int numInternalNodes)\n"
	"{\n"
	"	int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
	"	if(internalNodeIndex >= numInternalNodes) return;\n"
	"	\n"
	"	b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n"
	"	int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n"
	"	\n"
	"//#define USE_LINEAR_SEARCH\n"
	"#ifdef USE_LINEAR_SEARCH\n"
	"	int leftIndex = -1;\n"
	"	int rightIndex = -1;\n"
	"	\n"
	"	//Find nearest element to left with a lower common prefix\n"
	"	for(int i = internalNodeIndex - 1; i >= 0; --i)\n"
	"	{\n"
	"		int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
	"		if(nodeLeftSharedPrefixLength < nodePrefixLength)\n"
	"		{\n"
	"			leftIndex = i;\n"
	"			break;\n"
	"		}\n"
	"	}\n"
	"	\n"
	"	//Find nearest element to right with a lower common prefix\n"
	"	for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n"
	"	{\n"
	"		int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
	"		if(nodeRightSharedPrefixLength < nodePrefixLength)\n"
	"		{\n"
	"			rightIndex = i;\n"
	"			break;\n"
	"		}\n"
	"	}\n"
	"	\n"
	"#else //Use binary search\n"
	"	//Find nearest element to left with a lower common prefix\n"
	"	int leftIndex = -1;\n"
	"	{\n"
	"		int lower = 0;\n"
	"		int upper = internalNodeIndex - 1;\n"
	"		\n"
	"		while(lower <= upper)\n"
	"		{\n"
	"			int mid = (lower + upper) / 2;\n"
	"			b3Int64 midPrefix = commonPrefixes[mid];\n"
	"			int midPrefixLength = commonPrefixLengths[mid];\n"
	"			\n"
	"			int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
	"			if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
	"			{\n"
	"				int right = mid + 1;\n"
	"				if(right < internalNodeIndex)\n"
	"				{\n"
	"					b3Int64 rightPrefix = commonPrefixes[right];\n"
	"					int rightPrefixLength = commonPrefixLengths[right];\n"
	"					\n"
	"					int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n"
	"					if(nodeRightSharedPrefixLength < nodePrefixLength) \n"
	"					{\n"
	"						lower = right;\n"
	"						leftIndex = right;\n"
	"					}\n"
	"					else \n"
	"					{\n"
	"						leftIndex = mid;\n"
	"						break;\n"
	"					}\n"
	"				}\n"
	"				else \n"
	"				{\n"
	"					leftIndex = mid;\n"
	"					break;\n"
	"				}\n"
	"			}\n"
	"			else upper = mid - 1;\n"
	"		}\n"
	"	}\n"
	"	\n"
	"	//Find nearest element to right with a lower common prefix\n"
	"	int rightIndex = -1;\n"
	"	{\n"
	"		int lower = internalNodeIndex + 1;\n"
	"		int upper = numInternalNodes - 1;\n"
	"		\n"
	"		while(lower <= upper)\n"
	"		{\n"
	"			int mid = (lower + upper) / 2;\n"
	"			b3Int64 midPrefix = commonPrefixes[mid];\n"
	"			int midPrefixLength = commonPrefixLengths[mid];\n"
	"			\n"
	"			int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
	"			if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
	"			{\n"
	"				int left = mid - 1;\n"
	"				if(left > internalNodeIndex)\n"
	"				{\n"
	"					b3Int64 leftPrefix = commonPrefixes[left];\n"
	"					int leftPrefixLength = commonPrefixLengths[left];\n"
	"					\n"
	"					int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n"
	"					if(nodeLeftSharedPrefixLength < nodePrefixLength) \n"
	"					{\n"
	"						upper = left;\n"
	"						rightIndex = left;\n"
	"					}\n"
	"					else \n"
	"					{\n"
	"						rightIndex = mid;\n"
	"						break;\n"
	"					}\n"
	"				}\n"
	"				else \n"
	"				{\n"
	"					rightIndex = mid;\n"
	"					break;\n"
	"				}\n"
	"			}\n"
	"			else lower = mid + 1;\n"
	"		}\n"
	"	}\n"
	"#endif\n"
	"	\n"
	"	//Select parent\n"
	"	{\n"
	"		int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
	"		int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
	"		\n"
	"		int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n"
	"		\n"
	"		if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n"
	"		else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n"
	"		\n"
	"		int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n"
	"		\n"
	"		int isRootNode = (leftIndex == -1 && rightIndex == -1);\n"
	"		out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n"
	"		\n"
	"		int isLeaf = 0;\n"
	"		if(!isRootNode)\n"
	"		{\n"
	"			int isRightChild = (isLeftHigherPrefixLength);	//If the left node is the parent, then this node is its right child and vice versa\n"
	"			\n"
	"			//out_childNodesAsInt[0] == int2.x == left child\n"
	"			//out_childNodesAsInt[1] == int2.y == right child\n"
	"			__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
	"			out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
	"		}\n"
	"		else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
	"	}\n"
	"}\n"
	"__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n"
	"									__global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n"
	"{\n"
	"	if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n"
	"	int internalNodeIndex = get_global_id(0);\n"
	"	if(internalNodeIndex >= numInternalNodes) return;\n"
	"	\n"
	"	//\n"
	"	int distanceFromRoot = 0;\n"
	"	{\n"
	"		int parentIndex = internalNodeParentNodes[internalNodeIndex];\n"
	"		while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n"
	"		{\n"
	"			parentIndex = internalNodeParentNodes[parentIndex];\n"
	"			++distanceFromRoot;\n"
	"		}\n"
	"	}\n"
	"	out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n"
	"	\n"
	"	//\n"
	"	__local int localMaxDistanceFromRoot;\n"
	"	if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	\n"
	"	atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	\n"
	"	if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n"
	"}\n"
	"__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n"
	"												__global int2* childNodes,\n"
	"												__global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n"
	"												int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n"
	"{\n"
	"	int internalNodeIndex = get_global_id(0);\n"
	"	if(internalNodeIndex >= numInternalNodes) return;\n"
	"	\n"
	"	int distance = distanceFromRoot[internalNodeIndex];\n"
	"	\n"
	"	if(distance == processedDistance)\n"
	"	{\n"
	"		int leftChildIndex = childNodes[internalNodeIndex].x;\n"
	"		int rightChildIndex = childNodes[internalNodeIndex].y;\n"
	"		\n"
	"		int isLeftChildLeaf = isLeafNode(leftChildIndex);\n"
	"		int isRightChildLeaf = isLeafNode(rightChildIndex);\n"
	"		\n"
	"		leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n"
	"		rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n"
	"		\n"
	"		//leftRigidIndex/rightRigidIndex is not used if internal node\n"
	"		int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n"
	"		int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n"
	"		\n"
	"		b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n"
	"		b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n"
	"		\n"
	"		b3AabbCL mergedAabb;\n"
	"		mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n"
	"		mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n"
	"		internalNodeAabbs[internalNodeIndex] = mergedAabb;\n"
	"	}\n"
	"}\n"
	"__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n"
	"{\n"
	"	int internalNodeIndex = get_global_id(0);\n"
	"	if(internalNodeIndex >= numInternalNodes) return;\n"
	"	\n"
	"	int numLeafNodes = numInternalNodes + 1;\n"
	"	\n"
	"	int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n"
	"	\n"
	"	int2 leafIndexRange;	//x == min leaf index, y == max leaf index\n"
	"	\n"
	"	//Find lowest leaf index covered by this internal node\n"
	"	{\n"
	"		int lowestIndex = childNodes.x;		//childNodes.x == Left child\n"
	"		while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n"
	"		leafIndexRange.x = lowestIndex;\n"
	"	}\n"
	"	\n"
	"	//Find highest leaf index covered by this internal node\n"
	"	{\n"
	"		int highestIndex = childNodes.y;	//childNodes.y == Right child\n"
	"		while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n"
	"		leafIndexRange.y = highestIndex;\n"
	"	}\n"
	"	\n"
	"	//\n"
	"	out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n"
	"}\n";

View file

@ -1,389 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
//Marker stored in the .z/.w components of a newly generated overlapping pair,
//indicating the pair has no associated narrowphase/cached data yet.
#define NEW_PAIR_MARKER -1
//Axis-aligned bounding box in world space.
//The unions let the unused 4th component of m_min/m_max double as integer
//storage; the BVH/SAP kernels in this file read m_minIndices[3] as the
//rigid body index (see e.g. plbvhCalculateOverlappingPairs above).
typedef struct
{
union
{
float4 m_min;
float m_minElems[4];
int m_minIndices[4];
};
union
{
float4 m_max;
float m_maxElems[4];
int m_maxIndices[4];
};
} btAabbCL;
/// conservative test for overlap between two aabbs
/// (second box resides in __local memory; w components are ignored)
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
{
	//The boxes overlap iff they are not separated along any of the x, z, y axes.
	bool separatedX = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x);
	bool separatedZ = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z);
	bool separatedY = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y);
	return !(separatedX || separatedZ || separatedY);
}
bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);
bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)
{
bool overlap = true;
overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
return overlap;
}
bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);
bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)
{
bool overlap = true;
overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
return overlap;
}
//2D brute-force pass over two index sets that share one unsorted AABB array:
//work item (i,j) tests unsortedAabbMapping[i] against unsortedAabbMapping2[j].
//Overlapping pairs are appended to pairsOut through the atomic pairCount; pairs
//past maxPairs are still counted but dropped, so the host can detect overflow.
__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)
{
int i = get_global_id(0);
if (i>=numUnsortedAabbs)
return;
int j = get_global_id(1);
if (j>=numUnSortedAabbs2)
return;
__global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];
__global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];
if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))
{
int4 myPair;
int xIndex = unsortedAabbPtr[0].m_minIndices[3];
int yIndex = unsortedAabbPtr2[0].m_minIndices[3];
//canonicalize pair order so (a,b) and (b,a) produce the same entry
if (xIndex>yIndex)
{
int tmp = xIndex;
xIndex=yIndex;
yIndex=tmp;
}
myPair.x = xIndex;
myPair.y = yIndex;
myPair.z = NEW_PAIR_MARKER;
myPair.w = NEW_PAIR_MARKER;
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
//O(n^2) reference kernel: work item i tests its AABB against every j>i.
//'axis' is unused here; kept so all computePairs* kernels share one signature.
__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
if (i>=numObjects)
return;
for (int j=i+1;j<numObjects;j++)
{
if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
{
int4 myPair;
myPair.x = aabbs[i].m_minIndices[3];
myPair.y = aabbs[j].m_minIndices[3];
myPair.z = NEW_PAIR_MARKER;
myPair.w = NEW_PAIR_MARKER;
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
}
//1-axis sweep-and-prune: expects 'aabbs' sorted by m_minElems[axis] (produced by
//flipFloatKernel + radix sort + scatterKernel), so the inner loop can stop as soon
//as the next box starts beyond this box's max on that axis.
__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
if (i>=numObjects)
return;
for (int j=i+1;j<numObjects;j++)
{
//sorted on 'axis': no later box can overlap once this holds
if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis]))
{
break;
}
if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
{
int4 myPair;
myPair.x = aabbs[i].m_minIndices[3];
myPair.y = aabbs[j].m_minIndices[3];
myPair.z = NEW_PAIR_MARKER;
myPair.w = NEW_PAIR_MARKER;
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
}
//Sweep-and-prune variant where the whole work-group iterates j in lockstep.
//Each item that has passed the end of its sweep "votes" once via breakRequest;
//the group keeps looping until every active item (numActiveWgItems) has voted.
//All items must reach every barrier, hence the vote flags instead of 'break'.
__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
int localId = get_local_id(0);
__local int numActiveWgItems[1];
__local int breakRequest[1];
if (localId==0)
{
numActiveWgItems[0] = 0;
breakRequest[0] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
//count how many items in this group participate in the vote
atomic_inc(numActiveWgItems);
barrier(CLK_LOCAL_MEM_FENCE);
int localBreak = 0;
int j=i+1;
do
{
barrier(CLK_LOCAL_MEM_FENCE);
if (j<numObjects)
{
//sorted sweep termination for this item: vote once, then idle
if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis]))
{
if (!localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//ran off the end of the array: also counts as done
if (j>=numObjects && !localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (!localBreak)
{
if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
{
int4 myPair;
myPair.x = aabbs[i].m_minIndices[3];
myPair.y = aabbs[j].m_minIndices[3];
myPair.z = NEW_PAIR_MARKER;
myPair.w = NEW_PAIR_MARKER;
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
j++;
} while (breakRequest[0]<numActiveWgItems[0]);
}
//Same lockstep sweep, but candidate AABBs are streamed through a 128-entry
//__local cache in blocks of 64 to reduce redundant global-memory reads.
//Assumes a work-group size of 64 (localId in [0,64), two cached entries each).
__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
int localId = get_local_id(0);
__local int numActiveWgItems[1];
__local int breakRequest[1];
__local btAabbCL localAabbs[128];// = aabbs[i];
btAabbCL myAabb;
//out-of-range items clamp to element 0 so every lane can still execute the loads
myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
float testValue = myAabb.m_maxElems[axis];
if (localId==0)
{
numActiveWgItems[0] = 0;
breakRequest[0] = 0;
}
int localCount=0;
int block=0;
//prime the two 64-entry halves of the local cache
localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
barrier(CLK_LOCAL_MEM_FENCE);
atomic_inc(numActiveWgItems);
barrier(CLK_LOCAL_MEM_FENCE);
int localBreak = 0;
int j=i+1;
do
{
barrier(CLK_LOCAL_MEM_FENCE);
if (j<numObjects)
{
//sweep termination against the cached candidate
if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis]))
{
if (!localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (j>=numObjects && !localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (!localBreak)
{
if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
{
int4 myPair;
myPair.x = myAabb.m_minIndices[3];
myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
myPair.z = NEW_PAIR_MARKER;
myPair.w = NEW_PAIR_MARKER;
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
localCount++;
//refill the cache with the next block of 64 candidates
if (localCount==64)
{
localCount = 0;
block+=64;
localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
}
j++;
} while (breakRequest[0]<numActiveWgItems[0]);
}
//http://stereopsis.com/radix.html
//Maps IEEE-754 float bits to an unsigned key with the same total ordering, so
//floats can be radix-sorted as plain integers. IFloatFlip is the inverse.
unsigned int FloatFlip(float fl);
unsigned int FloatFlip(float fl)
{
unsigned int f = *(unsigned int*)&fl;
unsigned int mask = -(int)(f >> 31) | 0x80000000;
return f ^ mask;
}
float IFloatFlip(unsigned int f);
float IFloatFlip(unsigned int f)
{
unsigned int mask = ((f >> 31) - 1) | 0x80000000;
unsigned int fl = f ^ mask;
return *(float*)&fl;
}
//Gathers allAabbs[src] into destAabbs[i], where src is the original index the
//host stored in destAabbs[i].m_maxIndices[3]; that index is preserved in the copy.
__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)
{
int i = get_global_id(0);
if (i>=numObjects)
return;
int src = destAabbs[i].m_maxIndices[3];
destAabbs[i] = allAabbs[src];
destAabbs[i].m_maxIndices[3] = src;
}
//Builds (sortable key, original index) pairs from each small AABB's min
//coordinate on 'axis', ready for the radix sort.
__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)
{
int i = get_global_id(0);
if (i>=numObjects)
return;
sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);
sortData[i].y = i;
}
//Writes AABBs out in sorted order using the permutation produced by the sort.
__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)
{
int i = get_global_id(0);
if (i>=numObjects)
return;
sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];
}
//Emits each small AABB's center c (sum) and c*c (sum2) so a reduction can
//compute the per-axis mean and variance of the centers (used to pick a sweep axis).
__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)
{
int i = get_global_id(0);
if (i>=numAabbs)
return;
btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];
float4 s;
s = (smallAabb.m_max+smallAabb.m_min)*0.5f;
sum[i]=s;
sum2[i]=s*s;
}

View file

@ -1,341 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
//NOTE: do not edit this string by hand - change the corresponding sap .cl kernel source and re-run stringify.
static const char* sapCL =
	"/*\n"
	"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
	"This software is provided 'as-is', without any express or implied warranty.\n"
	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
	"Permission is granted to anyone to use this software for any purpose, \n"
	"including commercial applications, and to alter it and redistribute it freely, \n"
	"subject to the following restrictions:\n"
	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
	"3. This notice may not be removed or altered from any source distribution.\n"
	"*/\n"
	"//Originally written by Erwin Coumans\n"
	"#define NEW_PAIR_MARKER -1\n"
	"typedef struct \n"
	"{\n"
	"	union\n"
	"	{\n"
	"		float4	m_min;\n"
	"		float   m_minElems[4];\n"
	"		int			m_minIndices[4];\n"
	"	};\n"
	"	union\n"
	"	{\n"
	"		float4	m_max;\n"
	"		float   m_maxElems[4];\n"
	"		int			m_maxIndices[4];\n"
	"	};\n"
	"} btAabbCL;\n"
	"/// conservative test for overlap between two aabbs\n"
	"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
	"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
	"{\n"
	"	bool overlap = true;\n"
	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
	"	return overlap;\n"
	"}\n"
	"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
	"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
	"{\n"
	"	bool overlap = true;\n"
	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
	"	return overlap;\n"
	"}\n"
	"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
	"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
	"{\n"
	"	bool overlap = true;\n"
	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
	"	return overlap;\n"
	"}\n"
	"__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping,  __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numUnsortedAabbs)\n"
	"		return;\n"
	"	int j = get_global_id(1);\n"
	"	if (j>=numUnSortedAabbs2)\n"
	"		return;\n"
	"	__global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n"
	"	__global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n"
	"	if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n"
	"	{\n"
	"		int4 myPair;\n"
	"		\n"
	"		int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n"
	"		int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n"
	"		if (xIndex>yIndex)\n"
	"		{\n"
	"			int tmp = xIndex;\n"
	"			xIndex=yIndex;\n"
	"			yIndex=tmp;\n"
	"		}\n"
	"		\n"
	"		myPair.x = xIndex;\n"
	"		myPair.y = yIndex;\n"
	"		myPair.z = NEW_PAIR_MARKER;\n"
	"		myPair.w = NEW_PAIR_MARKER;\n"
	"		int curPair = atomic_inc (pairCount);\n"
	"		if (curPair<maxPairs)\n"
	"		{\n"
	"			pairsOut[curPair] = myPair; //flush to main memory\n"
	"		}\n"
	"	}\n"
	"}\n"
	"__kernel void   computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	for (int j=i+1;j<numObjects;j++)\n"
	"	{\n"
	"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
	"		{\n"
	"			int4 myPair;\n"
	"			myPair.x = aabbs[i].m_minIndices[3];\n"
	"			myPair.y = aabbs[j].m_minIndices[3];\n"
	"			myPair.z = NEW_PAIR_MARKER;\n"
	"			myPair.w = NEW_PAIR_MARKER;\n"
	"			int curPair = atomic_inc (pairCount);\n"
	"			if (curPair<maxPairs)\n"
	"			{\n"
	"				pairsOut[curPair] = myPair; //flush to main memory\n"
	"			}\n"
	"		}\n"
	"	}\n"
	"}\n"
	"__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	for (int j=i+1;j<numObjects;j++)\n"
	"	{\n"
	"		if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
	"		{\n"
	"			break;\n"
	"		}\n"
	"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
	"		{\n"
	"			int4 myPair;\n"
	"			myPair.x = aabbs[i].m_minIndices[3];\n"
	"			myPair.y = aabbs[j].m_minIndices[3];\n"
	"			myPair.z = NEW_PAIR_MARKER;\n"
	"			myPair.w = NEW_PAIR_MARKER;\n"
	"			int curPair = atomic_inc (pairCount);\n"
	"			if (curPair<maxPairs)\n"
	"			{\n"
	"				pairsOut[curPair] = myPair; //flush to main memory\n"
	"			}\n"
	"		}\n"
	"	}\n"
	"}\n"
	"__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	int localId = get_local_id(0);\n"
	"	__local int numActiveWgItems[1];\n"
	"	__local int breakRequest[1];\n"
	"	if (localId==0)\n"
	"	{\n"
	"		numActiveWgItems[0] = 0;\n"
	"		breakRequest[0] = 0;\n"
	"	}\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	atomic_inc(numActiveWgItems);\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	int localBreak = 0;\n"
	"	int j=i+1;\n"
	"	do\n"
	"	{\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	\n"
	"		if (j<numObjects)\n"
	"		{\n"
	"	  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
	"			{\n"
	"				if (!localBreak)\n"
	"				{\n"
	"					atomic_inc(breakRequest);\n"
	"					localBreak = 1;\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"	\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		\n"
	"		if (j>=numObjects && !localBreak)\n"
	"		{\n"
	"			atomic_inc(breakRequest);\n"
	"			localBreak = 1;\n"
	"		}\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		\n"
	"		if (!localBreak)\n"
	"		{\n"
	"			if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
	"			{\n"
	"				int4 myPair;\n"
	"				myPair.x = aabbs[i].m_minIndices[3];\n"
	"				myPair.y = aabbs[j].m_minIndices[3];\n"
	"				myPair.z = NEW_PAIR_MARKER;\n"
	"				myPair.w = NEW_PAIR_MARKER;\n"
	"				int curPair = atomic_inc (pairCount);\n"
	"				if (curPair<maxPairs)\n"
	"				{\n"
	"					pairsOut[curPair] = myPair; //flush to main memory\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"		j++;\n"
	"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
	"}\n"
	"__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	int localId = get_local_id(0);\n"
	"	__local int numActiveWgItems[1];\n"
	"	__local int breakRequest[1];\n"
	"	__local btAabbCL localAabbs[128];// = aabbs[i];\n"
	"	\n"
	"	btAabbCL myAabb;\n"
	"	\n"
	"	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
	"	float testValue = 	myAabb.m_maxElems[axis];\n"
	"	\n"
	"	if (localId==0)\n"
	"	{\n"
	"		numActiveWgItems[0] = 0;\n"
	"		breakRequest[0] = 0;\n"
	"	}\n"
	"	int localCount=0;\n"
	"	int block=0;\n"
	"	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
	"	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
	"	\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	atomic_inc(numActiveWgItems);\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	int localBreak = 0;\n"
	"	\n"
	"	int j=i+1;\n"
	"	do\n"
	"	{\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	\n"
	"		if (j<numObjects)\n"
	"		{\n"
	"	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
	"			{\n"
	"				if (!localBreak)\n"
	"				{\n"
	"					atomic_inc(breakRequest);\n"
	"					localBreak = 1;\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"	\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		\n"
	"		if (j>=numObjects && !localBreak)\n"
	"		{\n"
	"			atomic_inc(breakRequest);\n"
	"			localBreak = 1;\n"
	"		}\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		\n"
	"		if (!localBreak)\n"
	"		{\n"
	"			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
	"			{\n"
	"				int4 myPair;\n"
	"				myPair.x = myAabb.m_minIndices[3];\n"
	"				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
	"				myPair.z = NEW_PAIR_MARKER;\n"
	"				myPair.w = NEW_PAIR_MARKER;\n"
	"				int curPair = atomic_inc (pairCount);\n"
	"				if (curPair<maxPairs)\n"
	"				{\n"
	"					pairsOut[curPair] = myPair; //flush to main memory\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"		\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		localCount++;\n"
	"		if (localCount==64)\n"
	"		{\n"
	"			localCount = 0;\n"
	"			block+=64;			\n"
	"			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
	"			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
	"		}\n"
	"		j++;\n"
	"		\n"
	"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
	"	\n"
	"}\n"
	"//http://stereopsis.com/radix.html\n"
	"unsigned int FloatFlip(float fl);\n"
	"unsigned int FloatFlip(float fl)\n"
	"{\n"
	"	unsigned int f = *(unsigned int*)&fl;\n"
	"	unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
	"	return f ^ mask;\n"
	"}\n"
	"float IFloatFlip(unsigned int f);\n"
	"float IFloatFlip(unsigned int f)\n"
	"{\n"
	"	unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
	"	unsigned int fl = f ^ mask;\n"
	"	return *(float*)&fl;\n"
	"}\n"
	"__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	int src = destAabbs[i].m_maxIndices[3];\n"
	"	destAabbs[i] = allAabbs[src];\n"
	"	destAabbs[i].m_maxIndices[3] = src;\n"
	"}\n"
	"__kernel void   flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	\n"
	"	\n"
	"	sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n"
	"	sortData[i].y = i;\n"
	"		\n"
	"}\n"
	"__kernel void   scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	\n"
	"	sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n"
	"}\n"
	"__kernel void   prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numAabbs)\n"
	"		return;\n"
	"	\n"
	"	btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n"
	"	\n"
	"	float4 s;\n"
	"	s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n"
	"	sum[i]=s;\n"
	"	sum2[i]=s*s;	\n"
	"}\n";

View file

@ -1,77 +0,0 @@
# Build script for the Bullet3OpenCL_clew library: the GPU (OpenCL) broadphase,
# narrowphase and rigid-body pipeline, loading OpenCL at runtime through clew.
INCLUDE_DIRECTORIES( ${BULLET_PHYSICS_SOURCE_DIR}/src )
# Use the clew loader instead of linking OpenCL directly (see b3OpenCLInclude.h).
ADD_DEFINITIONS(-DB3_USE_CLEW)
SET(Bullet3OpenCL_clew_SRCS
../clew/clew.c
BroadphaseCollision/b3GpuGridBroadphase.cpp
BroadphaseCollision/b3GpuSapBroadphase.cpp
BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp
BroadphaseCollision/b3GpuParallelLinearBvh.cpp
Initialize/b3OpenCLUtils.cpp
NarrowphaseCollision/b3ContactCache.cpp
NarrowphaseCollision/b3ConvexHullContact.cpp
NarrowphaseCollision/b3GjkEpa.cpp
NarrowphaseCollision/b3OptimizedBvh.cpp
NarrowphaseCollision/b3QuantizedBvh.cpp
NarrowphaseCollision/b3StridingMeshInterface.cpp
NarrowphaseCollision/b3TriangleCallback.cpp
NarrowphaseCollision/b3TriangleIndexVertexArray.cpp
NarrowphaseCollision/b3VoronoiSimplexSolver.cpp
ParallelPrimitives/b3BoundSearchCL.cpp
ParallelPrimitives/b3FillCL.cpp
ParallelPrimitives/b3LauncherCL.cpp
ParallelPrimitives/b3PrefixScanCL.cpp
ParallelPrimitives/b3PrefixScanFloat4CL.cpp
ParallelPrimitives/b3RadixSort32CL.cpp
Raycast/b3GpuRaycast.cpp
RigidBody/b3GpuGenericConstraint.cpp
RigidBody/b3GpuJacobiContactSolver.cpp
RigidBody/b3GpuNarrowPhase.cpp
RigidBody/b3GpuPgsConstraintSolver.cpp
RigidBody/b3GpuPgsContactSolver.cpp
RigidBody/b3GpuRigidBodyPipeline.cpp
RigidBody/b3Solver.cpp
)
SET(Bullet3OpenCL_clew_HDRS
# ${Root_HDRS}
)
ADD_LIBRARY(Bullet3OpenCL_clew ${Bullet3OpenCL_clew_SRCS} ${Bullet3OpenCL_clew_HDRS})
SET_TARGET_PROPERTIES(Bullet3OpenCL_clew PROPERTIES VERSION ${BULLET_VERSION})
SET_TARGET_PROPERTIES(Bullet3OpenCL_clew PROPERTIES SOVERSION ${BULLET_VERSION})
IF (BUILD_SHARED_LIBS)
# CMAKE_DL_LIBS is needed because clew loads the OpenCL library at runtime.
TARGET_LINK_LIBRARIES(Bullet3OpenCL_clew LinearMath Bullet3Dynamics ${CMAKE_DL_LIBS})
ENDIF (BUILD_SHARED_LIBS)
IF (INSTALL_LIBS)
IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
#INSTALL of other files requires CMake 2.6
IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
INSTALL(TARGETS Bullet3OpenCL_clew DESTINATION .)
ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
INSTALL(TARGETS Bullet3OpenCL_clew RUNTIME DESTINATION bin
LIBRARY DESTINATION lib${LIB_SUFFIX}
ARCHIVE DESTINATION lib${LIB_SUFFIX})
INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DESTINATION ${INCLUDE_INSTALL_DIR} FILES_MATCHING PATTERN "*.h" PATTERN ".svn" EXCLUDE PATTERN "CMakeFiles" EXCLUDE)
# INSTALL(FILES ../btBullet3OpenCL_clewCommon.h
#DESTINATION ${INCLUDE_INSTALL_DIR}/Bullet3OpenCL_clew)
ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
SET_TARGET_PROPERTIES(Bullet3OpenCL_clew PROPERTIES FRAMEWORK true)
SET_TARGET_PROPERTIES(Bullet3OpenCL_clew PROPERTIES PUBLIC_HEADER "${Root_HDRS}")
# Have to list out sub-directories manually:
SET_PROPERTY(SOURCE ${BroadphaseCollision_HDRS} PROPERTY MACOSX_PACKAGE_LOCATION Headers/BroadphaseCollision)
ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
ENDIF (INSTALL_LIBS)

View file

@ -1,51 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_OPENCL_INCLUDE_H
#define B3_OPENCL_INCLUDE_H
//Central OpenCL header selection: clew (runtime loader) takes priority, then
//MiniCL, then the platform SDK headers (Apple framework vs. standard CL/cl.h).
#ifdef B3_USE_CLEW
#include "clew/clew.h"
#else
#ifdef __APPLE__
#ifdef USE_MINICL
#include <MiniCL/cl.h>
#else
#include <OpenCL/cl.h>
#include <OpenCL/cl_ext.h> //clLogMessagesToStderrAPPLE
#endif
#else
#ifdef USE_MINICL
#include <MiniCL/cl.h>
#else
#include <CL/cl.h>
#ifdef _WIN32
#include "CL/cl_gl.h"
#endif //_WIN32
#endif
#endif //__APPLE__
#endif //B3_USE_CLEW
#include <assert.h>
#include <stdio.h>
//Debug helper: prints and asserts when an OpenCL status code 'a' differs from
//the expected value 'b' (typically CL_SUCCESS). Evaluates 'a' multiple times.
#define oclCHECKERROR(a, b) \
if ((a) != (b)) \
{ \
printf("OCL Error : %d\n", (a)); \
assert((a) == (b)); \
}
#endif //B3_OPENCL_INCLUDE_H

View file

@ -1,963 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Original author: Roman Ponomarev
//Mostly Reimplemented by Erwin Coumans
//Debug switches (referenced by the kernel-compilation path): force compiling
//kernels from source, and skip loading previously cached program binaries.
bool gDebugForceLoadingFromSource = false;
bool gDebugSkipLoadingBinary = false;
#include "Bullet3Common/b3Logging.h"
#include <string.h>
#ifdef _WIN32
#pragma warning(disable : 4996)
#endif
#include "b3OpenCLUtils.h"
//#include "b3OpenCLInclude.h"
#include <stdio.h>
#include <stdlib.h>
#define B3_MAX_CL_DEVICES 16 //who needs 16 devices?
#ifdef _WIN32
#include <windows.h>
#endif
#include <assert.h>
#define b3Assert assert
#ifndef _WIN32
#include <sys/stat.h>
#endif
//Directory used for cached compiled kernel binaries; overridable at runtime
//via b3OpenCLUtils_setCachePath().
static const char* sCachedBinaryPath = "cache";
//Set the preferred platform vendor using the OpenCL SDK
//(compile-time choice; used to move the matching platform to the front of the
//candidate list in b3OpenCLUtils_createContextFromType).
static const char* spPlatformVendor =
#if defined(CL_PLATFORM_MINI_CL)
"MiniCL, SCEA";
#elif defined(CL_PLATFORM_AMD)
"Advanced Micro Devices, Inc.";
#elif defined(CL_PLATFORM_NVIDIA)
"NVIDIA Corporation";
#elif defined(CL_PLATFORM_INTEL)
"Intel(R) Corporation";
#elif defined(B3_USE_CLEW)
"clew (OpenCL Extension Wrangler library)";
#else
"Unknown Vendor";
#endif
#ifndef CL_PLATFORM_MINI_CL
#ifdef _WIN32
#ifndef B3_USE_CLEW
#include "CL/cl_gl.h"
#endif //B3_USE_CLEW
#endif //_WIN32
#endif
void MyFatalBreakAPPLE(const char* errstr,
const void* private_info,
size_t cb,
void* user_data)
{
const char* patloc = strstr(errstr, "Warning");
//find out if it is a warning or error, exit if error
if (patloc)
{
b3Warning("Warning: %s\n", errstr);
}
else
{
b3Error("Error: %s\n", errstr);
b3Assert(0);
}
}
#ifdef B3_USE_CLEW
//Loads the OpenCL dynamic library through clew. Returns CLEW_SUCCESS (0) on
//success, or the clew error code. On Linux it first probes "libOpenCL.so.1"
//and falls back to "libOpenCL.so" when that probe fails.
int b3OpenCLUtils_clewInit()
{
int result = -1;
#ifdef _WIN32
const char* cl = "OpenCL.dll";
#elif defined __APPLE__
const char* cl = "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL";
#else //presumable Linux? \
//linux (tested on Ubuntu 12.10 with Catalyst 13.4 beta drivers, not that there is no symbolic link from libOpenCL.so
const char* cl = "libOpenCL.so.1";
//probe the versioned name first; on success unload so the final clewInit below reloads it
result = clewInit(cl);
if (result != CLEW_SUCCESS)
{
cl = "libOpenCL.so";
}
else
{
clewExit();
}
#endif
result = clewInit(cl);
if (result != CLEW_SUCCESS)
{
b3Error("clewInit failed with error code %d\n", result);
}
else
{
b3Printf("clewInit succesfull using %s\n", cl);
}
return result;
}
#endif
//Returns the number of available OpenCL platforms (0 when none or on failure).
//On failure *pErrNum (if non-NULL) receives the OpenCL error code.
int b3OpenCLUtils_getNumPlatforms(cl_int* pErrNum)
{
#ifdef B3_USE_CLEW
b3OpenCLUtils_clewInit();
#endif
cl_platform_id pPlatforms[10] = {0};
cl_uint numPlatforms = 0;
//numPlatforms receives the total available count even if more than 10 exist;
//the pPlatforms array contents are not used.
cl_int ciErrNum = clGetPlatformIDs(10, pPlatforms, &numPlatforms);
//cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
if (ciErrNum != CL_SUCCESS)
{
if (pErrNum != NULL)
*pErrNum = ciErrNum;
}
return numPlatforms;
}
//Returns the compile-time preferred platform vendor string (spPlatformVendor).
const char* b3OpenCLUtils_getSdkVendorName()
{
return spPlatformVendor;
}
//Overrides the kernel-binary cache directory (default "cache").
//NOTE: the pointer is stored as-is; the caller must keep the string alive.
void b3OpenCLUtils_setCachePath(const char* path)
{
sCachedBinaryPath = path;
}
//Returns the cl_platform_id at index platformIndex0, or 0 when the index is
//out of range or the platform query fails. On failure *pErrNum (if non-NULL)
//receives the OpenCL error code.
cl_platform_id b3OpenCLUtils_getPlatform(int platformIndex0, cl_int* pErrNum)
{
#ifdef B3_USE_CLEW
	b3OpenCLUtils_clewInit();
#endif

	cl_platform_id platform = 0;
	unsigned int platformIndex = (unsigned int)platformIndex0;
	cl_uint numPlatforms;
	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);

	if (platformIndex < numPlatforms)
	{
		cl_platform_id* platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * numPlatforms);
		ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
		if (ciErrNum != CL_SUCCESS)
		{
			if (pErrNum != NULL)
				*pErrNum = ciErrNum;
			//bugfix: this early-out previously leaked 'platforms'
			free(platforms);
			return platform;
		}

		platform = platforms[platformIndex];

		free(platforms);
	}
	return platform;
}
//Fills platformInfo with the vendor, name and version strings of 'platform'.
//Asserts (via oclCHECKERROR) if any query fails.
void b3OpenCLUtils::getPlatformInfo(cl_platform_id platform, b3OpenCLPlatformInfo* platformInfo)
{
b3Assert(platform);
cl_int ciErrNum;
ciErrNum = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, B3_MAX_STRING_LENGTH, platformInfo->m_platformVendor, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
ciErrNum = clGetPlatformInfo(platform, CL_PLATFORM_NAME, B3_MAX_STRING_LENGTH, platformInfo->m_platformName, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
ciErrNum = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, B3_MAX_STRING_LENGTH, platformInfo->m_platformVersion, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
//Queries and prints the vendor/name/version of 'platform' through b3Printf.
void b3OpenCLUtils_printPlatformInfo(cl_platform_id platform)
{
b3OpenCLPlatformInfo platformInfo;
b3OpenCLUtils::getPlatformInfo(platform, &platformInfo);
b3Printf("Platform info:\n");
b3Printf("  CL_PLATFORM_VENDOR: \t\t\t%s\n", platformInfo.m_platformVendor);
b3Printf("  CL_PLATFORM_NAME: \t\t\t%s\n", platformInfo.m_platformName);
b3Printf("  CL_PLATFORM_VERSION: \t\t\t%s\n", platformInfo.m_platformVersion);
}
//Creates an OpenCL context on 'platform' for devices of 'deviceType'.
//If pGLContext/pGLDC are provided (Windows, non-clew builds), GL-sharing
//properties are added and the first device that accepts them is used.
//Otherwise either the preferred device index or all devices are used.
//Returns 0 on failure; *pErrNum (if non-NULL) receives the last OpenCL status.
cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex)
{
cl_context retContext = 0;
cl_int ciErrNum = 0;
cl_uint num_entries;
cl_device_id devices[B3_MAX_CL_DEVICES];
cl_uint num_devices;
cl_context_properties* cprops;
/*
* If we could find our platform, use it. Otherwise pass a NULL and get whatever the
* implementation thinks we should be using.
*/
cl_context_properties cps[7] = {0, 0, 0, 0, 0, 0, 0};
cps[0] = CL_CONTEXT_PLATFORM;
cps[1] = (cl_context_properties)platform;
#ifdef _WIN32
#ifndef B3_USE_CLEW
//request OpenGL interop when a GL context and device context are supplied
if (pGLContext && pGLDC)
{
cps[2] = CL_GL_CONTEXT_KHR;
cps[3] = (cl_context_properties)pGLContext;
cps[4] = CL_WGL_HDC_KHR;
cps[5] = (cl_context_properties)pGLDC;
}
#endif //B3_USE_CLEW
#endif //_WIN32
num_entries = B3_MAX_CL_DEVICES;
//NOTE(review): -1 assigned to an unsigned cl_uint; acts as a sentinel overwritten below
num_devices = -1;
ciErrNum = clGetDeviceIDs(
platform,
deviceType,
num_entries,
devices,
&num_devices);
if (ciErrNum < 0)
{
b3Printf("clGetDeviceIDs returned %d\n", ciErrNum);
return 0;
}
cprops = (NULL == platform) ? NULL : cps;
if (!num_devices)
return 0;
if (pGLContext)
{
//search for the GPU that relates to the OpenCL context
unsigned int i;
for (i = 0; i < num_devices; i++)
{
retContext = clCreateContext(cprops, 1, &devices[i], NULL, NULL, &ciErrNum);
if (ciErrNum == CL_SUCCESS)
break;
}
}
else
{
if (preferredDeviceIndex >= 0 && (unsigned int)preferredDeviceIndex < num_devices)
{
//create a context of the preferred device index
retContext = clCreateContext(cprops, 1, &devices[preferredDeviceIndex], NULL, NULL, &ciErrNum);
}
else
{
//create a context of all devices
#if defined(__APPLE__)
//install the fatal-break callback so context errors are logged/asserted
retContext = clCreateContext(cprops, num_devices, devices, MyFatalBreakAPPLE, NULL, &ciErrNum);
#else
b3Printf("numDevices=%d\n", num_devices);
retContext = clCreateContext(cprops, num_devices, devices, NULL, NULL, &ciErrNum);
#endif
}
}
if (pErrNum != NULL)
{
*pErrNum = ciErrNum;
};
return retContext;
}
//Creates an OpenCL context for the first platform that yields one, after
//moving the preferred platform to the front of the candidate list. An explicit
//preferredPlatformIndex wins over a vendor-string match against spPlatformVendor.
//Optionally returns the chosen platform via retPlatformId. Returns NULL on
//failure, with *pErrNum (if non-NULL) set to the last OpenCL status.
cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex, cl_platform_id* retPlatformId)
{
#ifdef B3_USE_CLEW
	b3OpenCLUtils_clewInit();
#endif

	cl_uint numPlatforms;
	cl_context retContext = 0;
	unsigned int i;

	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
	if (ciErrNum != CL_SUCCESS)
	{
		if (pErrNum != NULL) *pErrNum = ciErrNum;
		return NULL;
	}
	if (numPlatforms > 0)
	{
		cl_platform_id* platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * numPlatforms);
		ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
		if (ciErrNum != CL_SUCCESS)
		{
			if (pErrNum != NULL)
				*pErrNum = ciErrNum;
			free(platforms);
			return NULL;
		}

		//move the preferred platform (by index, else by vendor match) to slot 0
		for (i = 0; i < numPlatforms; ++i)
		{
			char pbuf[128];
			ciErrNum = clGetPlatformInfo(platforms[i],
										 CL_PLATFORM_VENDOR,
										 sizeof(pbuf),
										 pbuf,
										 NULL);
			if (ciErrNum != CL_SUCCESS)
			{
				if (pErrNum != NULL) *pErrNum = ciErrNum;
				//bugfix: this early-out previously leaked 'platforms'
				free(platforms);
				return NULL;
			}

			//cast avoids a signed/unsigned comparison; preferredPlatformIndex >= 0 here
			if (preferredPlatformIndex >= 0 && (int)i == preferredPlatformIndex)
			{
				cl_platform_id tmpPlatform = platforms[0];
				platforms[0] = platforms[i];
				platforms[i] = tmpPlatform;
				break;
			}
			else
			{
				if (!strcmp(pbuf, spPlatformVendor))
				{
					cl_platform_id tmpPlatform = platforms[0];
					platforms[0] = platforms[i];
					platforms[i] = tmpPlatform;
				}
			}
		}

		//try each platform in order until one produces a usable context
		for (i = 0; i < numPlatforms; ++i)
		{
			cl_platform_id platform = platforms[i];
			assert(platform);

			retContext = b3OpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLContext, pGLDC, preferredDeviceIndex, preferredPlatformIndex);

			if (retContext)
			{
				//			printf("OpenCL platform details:\n");
				b3OpenCLPlatformInfo platformInfo;

				b3OpenCLUtils::getPlatformInfo(platform, &platformInfo);

				if (retPlatformId)
					*retPlatformId = platform;

				break;
			}
		}

		free(platforms);
	}
	return retContext;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when out of range
//! @param cxMainContext OpenCL context
//! @param device_idx index of the device of interest
//////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id, or (cl_device_id)-1 when deviceIndex is out of range
//! @param cxMainContext OpenCL context
//! @param deviceIndex   index of the device of interest
//////////////////////////////////////////////////////////////////////////////
cl_device_id b3OpenCLUtils_getDevice(cl_context cxMainContext, int deviceIndex)
{
	assert(cxMainContext);

	size_t szParmDataBytes;
	cl_device_id* cdDevices;
	cl_device_id device;

	// get the size of the device list associated with the context
	clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);

	// fix: the original check used '<', which accepted deviceIndex == deviceCount
	// and then read one element past the end of cdDevices below
	if (deviceIndex < 0 || szParmDataBytes / sizeof(cl_device_id) <= (size_t)deviceIndex)
	{
		return (cl_device_id)-1;
	}

	cdDevices = (cl_device_id*)malloc(szParmDataBytes);
	clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);

	device = cdDevices[deviceIndex];
	free(cdDevices);

	return device;
}
//! Returns the number of devices associated with the given OpenCL context.
int b3OpenCLUtils_getNumDevices(cl_context cxMainContext)
{
	// query only the byte size of the device list, then convert to a count
	size_t deviceListBytes = 0;
	clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &deviceListBytes);
	return (int)(deviceListBytes / sizeof(cl_device_id));
}
///Fills every field of 'info' with the capabilities of 'device', one
///clGetDeviceInfo query per field (name, vendor, driver version, memory
///sizes, work-group limits, image limits, preferred vector widths, extensions).
///Note: the return codes of the individual queries are not checked; on a
///failed query the corresponding field is simply left unmodified.
void b3OpenCLUtils::getDeviceInfo(cl_device_id device, b3OpenCLDeviceInfo* info)
{
	// CL_DEVICE_NAME
	clGetDeviceInfo(device, CL_DEVICE_NAME, B3_MAX_STRING_LENGTH, &info->m_deviceName, NULL);
	// CL_DEVICE_VENDOR
	clGetDeviceInfo(device, CL_DEVICE_VENDOR, B3_MAX_STRING_LENGTH, &info->m_deviceVendor, NULL);
	// CL_DRIVER_VERSION
	clGetDeviceInfo(device, CL_DRIVER_VERSION, B3_MAX_STRING_LENGTH, &info->m_driverVersion, NULL);
	// CL_DEVICE_INFO
	clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &info->m_deviceType, NULL);
	// CL_DEVICE_MAX_COMPUTE_UNITS
	clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(info->m_computeUnits), &info->m_computeUnits, NULL);
	// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(info->m_workitemDims), &info->m_workitemDims, NULL);
	// CL_DEVICE_MAX_WORK_ITEM_SIZES
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(info->m_workItemSize), &info->m_workItemSize, NULL);
	// CL_DEVICE_MAX_WORK_GROUP_SIZE
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(info->m_workgroupSize), &info->m_workgroupSize, NULL);
	// CL_DEVICE_MAX_CLOCK_FREQUENCY
	clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(info->m_clockFrequency), &info->m_clockFrequency, NULL);
	// CL_DEVICE_ADDRESS_BITS
	clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(info->m_addressBits), &info->m_addressBits, NULL);
	// CL_DEVICE_MAX_MEM_ALLOC_SIZE
	clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(info->m_maxMemAllocSize), &info->m_maxMemAllocSize, NULL);
	// CL_DEVICE_GLOBAL_MEM_SIZE
	clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(info->m_globalMemSize), &info->m_globalMemSize, NULL);
	// CL_DEVICE_ERROR_CORRECTION_SUPPORT
	clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(info->m_errorCorrectionSupport), &info->m_errorCorrectionSupport, NULL);
	// CL_DEVICE_LOCAL_MEM_TYPE
	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(info->m_localMemType), &info->m_localMemType, NULL);
	// CL_DEVICE_LOCAL_MEM_SIZE
	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(info->m_localMemSize), &info->m_localMemSize, NULL);
	// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
	clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(info->m_constantBufferSize), &info->m_constantBufferSize, NULL);
	// CL_DEVICE_QUEUE_PROPERTIES
	clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(info->m_queueProperties), &info->m_queueProperties, NULL);
	// CL_DEVICE_IMAGE_SUPPORT
	clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(info->m_imageSupport), &info->m_imageSupport, NULL);
	// CL_DEVICE_MAX_READ_IMAGE_ARGS
	clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(info->m_maxReadImageArgs), &info->m_maxReadImageArgs, NULL);
	// CL_DEVICE_MAX_WRITE_IMAGE_ARGS
	clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(info->m_maxWriteImageArgs), &info->m_maxWriteImageArgs, NULL);
	// CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH
	clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &info->m_image2dMaxWidth, NULL);
	clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &info->m_image2dMaxHeight, NULL);
	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &info->m_image3dMaxWidth, NULL);
	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &info->m_image3dMaxHeight, NULL);
	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &info->m_image3dMaxDepth, NULL);
	// CL_DEVICE_EXTENSIONS: get device extensions, and if any then parse & log the string onto separate lines
	clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, B3_MAX_STRING_LENGTH, &info->m_deviceExtensions, NULL);
	// CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &info->m_vecWidthChar, NULL);
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &info->m_vecWidthShort, NULL);
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &info->m_vecWidthInt, NULL);
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &info->m_vecWidthLong, NULL);
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &info->m_vecWidthFloat, NULL);
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &info->m_vecWidthDouble, NULL);
}
///Queries 'device' via b3OpenCLUtils::getDeviceInfo and dumps every field to
///the b3Printf log in the usual "CL_DEVICE_*" report format.
void b3OpenCLUtils_printDeviceInfo(cl_device_id device)
{
	b3OpenCLDeviceInfo info;
	b3OpenCLUtils::getDeviceInfo(device, &info);
	b3Printf("Device Info:\n");
	b3Printf("  CL_DEVICE_NAME: \t\t\t%s\n", info.m_deviceName);
	b3Printf("  CL_DEVICE_VENDOR: \t\t\t%s\n", info.m_deviceVendor);
	b3Printf("  CL_DRIVER_VERSION: \t\t\t%s\n", info.m_driverVersion);

	//the device type is a bitfield; several of these may print
	if (info.m_deviceType & CL_DEVICE_TYPE_CPU)
		b3Printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
	if (info.m_deviceType & CL_DEVICE_TYPE_GPU)
		b3Printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
	if (info.m_deviceType & CL_DEVICE_TYPE_ACCELERATOR)
		b3Printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
	if (info.m_deviceType & CL_DEVICE_TYPE_DEFAULT)
		b3Printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");

	b3Printf("  CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", info.m_computeUnits);
	b3Printf("  CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", info.m_workitemDims);
	b3Printf("  CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", info.m_workItemSize[0], info.m_workItemSize[1], info.m_workItemSize[2]);
	b3Printf("  CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", info.m_workgroupSize);
	b3Printf("  CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", info.m_clockFrequency);
	b3Printf("  CL_DEVICE_ADDRESS_BITS:\t\t%u\n", info.m_addressBits);
	b3Printf("  CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_maxMemAllocSize / (1024 * 1024)));
	b3Printf("  CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_globalMemSize / (1024 * 1024)));
	b3Printf("  CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", info.m_errorCorrectionSupport == CL_TRUE ? "yes" : "no");
	b3Printf("  CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", info.m_localMemType == 1 ? "local" : "global");
	b3Printf("  CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(info.m_localMemSize / 1024));
	b3Printf("  CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(info.m_constantBufferSize / 1024));

	//queue properties are also a bitfield
	if (info.m_queueProperties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
		b3Printf("  CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");
	if (info.m_queueProperties & CL_QUEUE_PROFILING_ENABLE)
		b3Printf("  CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");

	b3Printf("  CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", info.m_imageSupport);
	b3Printf("  CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", info.m_maxReadImageArgs);
	b3Printf("  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", info.m_maxWriteImageArgs);
	b3Printf("\n  CL_DEVICE_IMAGE <dim>");
	b3Printf("\t\t\t2D_MAX_WIDTH\t %u\n", info.m_image2dMaxWidth);
	b3Printf("\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", info.m_image2dMaxHeight);
	b3Printf("\t\t\t\t\t3D_MAX_WIDTH\t %u\n", info.m_image3dMaxWidth);
	b3Printf("\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", info.m_image3dMaxHeight);
	b3Printf("\t\t\t\t\t3D_MAX_DEPTH\t %u\n", info.m_image3dMaxDepth);
	if (*info.m_deviceExtensions != 0)
	{
		b3Printf("\n  CL_DEVICE_EXTENSIONS:%s\n", info.m_deviceExtensions);
	}
	else
	{
		b3Printf("  CL_DEVICE_EXTENSIONS: None\n");
	}
	b3Printf("  CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t");
	b3Printf("CHAR %u, SHORT %u, INT %u,LONG %u, FLOAT %u, DOUBLE %u\n\n\n",
			 info.m_vecWidthChar, info.m_vecWidthShort, info.m_vecWidthInt, info.m_vecWidthLong, info.m_vecWidthFloat, info.m_vecWidthDouble);
}
//! Returns a pointer into 'name' just past the last occurrence of 'pattern'
//! (or 'name' itself if the pattern never occurs). Used above to strip
//! directory components ("\\" and "/") from a source file path.
//! Fix: removed the dead 'patcnt' occurrence counter, which was computed
//! but never used.
static const char* strip2(const char* name, const char* pattern)
{
	size_t const patlen = strlen(pattern);
	const char* oriptr = name;
	const char* patloc;

	// advance past every occurrence of the pattern
	while ((patloc = strstr(oriptr, pattern)) != NULL)
	{
		oriptr = patloc + patlen;
	}
	return oriptr;
}
///Compiles an OpenCL program for 'device', with an on-disk binary cache.
///When clFileNameForCaching is given (and caching is enabled) the function
///first tries to load a cached binary named
///  <sCachedBinaryPath>/<srcName>.<deviceName>.<driverVersion>.bin
///that is newer than the kernel source (timestamp check on Windows only);
///otherwise it compiles from kernelSourceOrg (or reads the .cl file from disk
///when kernelSourceOrg is NULL or gDebugForceLoadingFromSource is set) and
///writes the resulting binary back to the cache.
///@return the built cl_program, or 0 on failure (pErrNum receives the error)
cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSourceOrg, cl_int* pErrNum, const char* additionalMacrosArg, const char* clFileNameForCaching, bool disableBinaryCaching)
{
	const char* additionalMacros = additionalMacrosArg ? additionalMacrosArg : "";

	if (disableBinaryCaching)
	{
		//kernelSourceOrg = 0;
	}

	cl_program m_cpProgram = 0;
	cl_int status;

	char binaryFileName[B3_MAX_STRING_LENGTH];

	char deviceName[256];
	char driverVersion[256];
	const char* strippedName;
	int fileUpToDate = 0;
#ifdef _WIN32
	int binaryFileValid = 0;
#endif

	//build the cache file name from the stripped source name, device and driver version
	if (!disableBinaryCaching && clFileNameForCaching)
	{
		clGetDeviceInfo(device, CL_DEVICE_NAME, 256, &deviceName, NULL);
		clGetDeviceInfo(device, CL_DRIVER_VERSION, 256, &driverVersion, NULL);

		strippedName = strip2(clFileNameForCaching, "\\");
		strippedName = strip2(strippedName, "/");

#ifdef _MSC_VER
		sprintf_s(binaryFileName, B3_MAX_STRING_LENGTH, "%s/%s.%s.%s.bin", sCachedBinaryPath, strippedName, deviceName, driverVersion);
#else
		sprintf(binaryFileName, "%s/%s.%s.%s.bin", sCachedBinaryPath, strippedName, deviceName, driverVersion);
#endif
	}

	//decide whether an existing cached binary may be used
	if (clFileNameForCaching && !(disableBinaryCaching || gDebugSkipLoadingBinary || gDebugForceLoadingFromSource))
	{
#ifdef _WIN32
		//printf("searching for %s\n", binaryFileName);

		FILETIME modtimeBinary;
		CreateDirectoryA(sCachedBinaryPath, 0);
		{
			HANDLE binaryFileHandle = CreateFileA(binaryFileName, GENERIC_READ, 0, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
			if (binaryFileHandle == INVALID_HANDLE_VALUE)
			{
				DWORD errorCode;
				errorCode = GetLastError();
				switch (errorCode)
				{
					case ERROR_FILE_NOT_FOUND:
					{
						b3Warning("\nCached file not found %s\n", binaryFileName);
						break;
					}
					case ERROR_PATH_NOT_FOUND:
					{
						b3Warning("\nCached file path not found %s\n", binaryFileName);
						break;
					}
					default:
					{
						b3Warning("\nFailed reading cached file with errorCode = %d\n", errorCode);
					}
				}
			}
			else
			{
				if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary) == 0)
				{
					DWORD errorCode;
					errorCode = GetLastError();
					b3Warning("\nGetFileTime errorCode = %d\n", errorCode);
				}
				else
				{
					binaryFileValid = 1;
				}
				CloseHandle(binaryFileHandle);
			}

			if (binaryFileValid)
			{
				HANDLE srcFileHandle = CreateFileA(clFileNameForCaching, GENERIC_READ, 0, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
				if (srcFileHandle == INVALID_HANDLE_VALUE)
				{
					//the source often lives a few directory levels up; probe relative paths
					const char* prefix[] = {"./", "../", "../../", "../../../", "../../../../"};
					for (int i = 0; (srcFileHandle == INVALID_HANDLE_VALUE) && i < 5; i++)
					{
						char relativeFileName[1024];
						sprintf(relativeFileName, "%s%s", prefix[i], clFileNameForCaching);
						srcFileHandle = CreateFileA(relativeFileName, GENERIC_READ, 0, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
					}
				}

				if (srcFileHandle != INVALID_HANDLE_VALUE)
				{
					FILETIME modtimeSrc;
					if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc) == 0)
					{
						DWORD errorCode;
						errorCode = GetLastError();
						b3Warning("\nGetFileTime errorCode = %d\n", errorCode);
					}
					//the binary is usable only if it is at least as new as the source
					if ((modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime) || ((modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime) && (modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime)))
					{
						fileUpToDate = 1;
					}
					else
					{
						b3Warning("\nCached binary file out-of-date (%s)\n", binaryFileName);
					}
					CloseHandle(srcFileHandle);
				}
				else
				{
#ifdef _DEBUG
					DWORD errorCode;
					errorCode = GetLastError();
					switch (errorCode)
					{
						case ERROR_FILE_NOT_FOUND:
						{
							b3Warning("\nSrc file not found %s\n", clFileNameForCaching);
							break;
						}
						case ERROR_PATH_NOT_FOUND:
						{
							b3Warning("\nSrc path not found %s\n", clFileNameForCaching);
							break;
						}
						default:
						{
							b3Warning("\nnSrc file reading errorCode = %d\n", errorCode);
						}
					}

					//we should make sure the src file exists so we can verify the timestamp with binary
					//	assert(0);
					b3Warning("Warning: cannot find OpenCL kernel %s to verify timestamp of binary cached kernel %s\n", clFileNameForCaching, binaryFileName);
					fileUpToDate = true;
#else
					//if we cannot find the source, assume it is OK in release builds
					fileUpToDate = true;
#endif
				}
			}
		}

#else
		//non-Windows: no timestamp check, always trust an existing cached binary
		fileUpToDate = true;
		if (mkdir(sCachedBinaryPath, 0777) == -1)
		{
		}
		else
		{
			b3Printf("Succesfully created cache directory: %s\n", sCachedBinaryPath);
		}
#endif  //_WIN32
	}

	//try to load and build the cached binary
	if (fileUpToDate)
	{
#ifdef _MSC_VER
		FILE* file;
		if (fopen_s(&file, binaryFileName, "rb") != 0)
			file = 0;
#else
		FILE* file = fopen(binaryFileName, "rb");
#endif

		if (file)
		{
			size_t binarySize = 0;
			char* binary = 0;

			fseek(file, 0L, SEEK_END);
			binarySize = ftell(file);
			rewind(file);
			binary = (char*)malloc(sizeof(char) * binarySize);
			int bytesRead;
			bytesRead = fread(binary, sizeof(char), binarySize, file);
			(void)bytesRead;
			fclose(file);

			m_cpProgram = clCreateProgramWithBinary(clContext, 1, &device, &binarySize, (const unsigned char**)&binary, 0, &status);
			b3Assert(status == CL_SUCCESS);
			status = clBuildProgram(m_cpProgram, 1, &device, additionalMacros, 0, 0);
			b3Assert(status == CL_SUCCESS);

			if (status != CL_SUCCESS)
			{
				char* build_log;
				size_t ret_val_size;
				clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
				build_log = (char*)malloc(sizeof(char) * (ret_val_size + 1));
				clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
				build_log[ret_val_size] = '\0';
				b3Error("%s\n", build_log);
				free(build_log);
				b3Assert(0);
				m_cpProgram = 0;

				b3Warning("clBuildProgram reported failure on cached binary: %s\n", binaryFileName);
			}
			else
			{
				b3Printf("clBuildProgram successfully compiled cached binary: %s\n", binaryFileName);
			}
			free(binary);
		}
		else
		{
			b3Warning("Cannot open cached binary: %s\n", binaryFileName);
		}
	}

	//fall back to compiling from source
	if (!m_cpProgram)
	{
		cl_int localErrNum;
		char* compileFlags;
		int flagsize;

		const char* kernelSource = kernelSourceOrg;

		if (!kernelSourceOrg || gDebugForceLoadingFromSource)
		{
			if (clFileNameForCaching)
			{
				FILE* file = fopen(clFileNameForCaching, "rb");
				//in many cases the relative path is a few levels up the directory hierarchy, so try it
				if (!file)
				{
					const char* prefix[] = {"../", "../../", "../../../", "../../../../"};
					//fix: iterate over all 4 prefixes; the original loop used 'i < 3'
					//and never tried "../../../../" (the Windows probe above
					//correctly covers its whole prefix array)
					for (int i = 0; !file && i < 4; i++)
					{
						char relativeFileName[1024];
						sprintf(relativeFileName, "%s%s", prefix[i], clFileNameForCaching);
						file = fopen(relativeFileName, "rb");
					}
				}
				if (file)
				{
					char* kernelSrc = 0;
					fseek(file, 0L, SEEK_END);
					int kernelSize = ftell(file);
					rewind(file);
					kernelSrc = (char*)malloc(kernelSize + 1);
					int readBytes;
					readBytes = fread((void*)kernelSrc, 1, kernelSize, file);
					(void)readBytes;
					kernelSrc[kernelSize] = 0;
					fclose(file);
					kernelSource = kernelSrc;
				}
			}
		}

		size_t program_length = kernelSource ? strlen(kernelSource) : 0;
#ifdef MAC  //or __APPLE__?
		//fix: a string literal must bind to a const pointer
		const char* flags = "-cl-mad-enable -DMAC ";
#else
		const char* flags = "";
#endif

		m_cpProgram = clCreateProgramWithSource(clContext, 1, (const char**)&kernelSource, &program_length, &localErrNum);
		if (localErrNum != CL_SUCCESS)
		{
			if (pErrNum)
				*pErrNum = localErrNum;
			return 0;
		}

		// Build the program with 'mad' Optimization option
		flagsize = sizeof(char) * (strlen(additionalMacros) + strlen(flags) + 5);
		compileFlags = (char*)malloc(flagsize);
#ifdef _MSC_VER
		sprintf_s(compileFlags, flagsize, "%s %s", flags, additionalMacros);
#else
		sprintf(compileFlags, "%s %s", flags, additionalMacros);
#endif
		localErrNum = clBuildProgram(m_cpProgram, 1, &device, compileFlags, NULL, NULL);
		if (localErrNum != CL_SUCCESS)
		{
			char* build_log;
			size_t ret_val_size;
			clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
			build_log = (char*)malloc(sizeof(char) * (ret_val_size + 1));
			clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);

			// to be carefully, terminate with \0
			// there's no information in the reference whether the string is 0 terminated or not
			build_log[ret_val_size] = '\0';

			b3Error("Error in clBuildProgram, Line %u in file %s, Log: \n%s\n !!!\n\n", __LINE__, __FILE__, build_log);
			free(build_log);
			if (pErrNum)
				*pErrNum = localErrNum;
			return 0;
		}

		//write the freshly compiled binary back into the cache
		if (!disableBinaryCaching && clFileNameForCaching)
		{  // write to binary
			cl_uint numAssociatedDevices;
			status = clGetProgramInfo(m_cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0);
			b3Assert(status == CL_SUCCESS);
			if (numAssociatedDevices == 1)
			{
				size_t binarySize;
				char* binary;

				status = clGetProgramInfo(m_cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0);
				b3Assert(status == CL_SUCCESS);
				binary = (char*)malloc(sizeof(char) * binarySize);
				status = clGetProgramInfo(m_cpProgram, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0);
				b3Assert(status == CL_SUCCESS);

				{
					FILE* file = 0;
#ifdef _MSC_VER
					if (fopen_s(&file, binaryFileName, "wb") != 0)
						file = 0;
#else
					file = fopen(binaryFileName, "wb");
#endif
					if (file)
					{
						fwrite(binary, sizeof(char), binarySize, file);
						fclose(file);
					}
					else
					{
						b3Warning("cannot write file %s\n", binaryFileName);
					}
				}
				free(binary);
			}
		}
		free(compileFlags);
	}
	return m_cpProgram;
}
///Creates the kernel 'kernelName' from 'prog', or — when prog is 0 — from an
///ad-hoc program compiled out of 'kernelSource'. On success *pErrNum is set
///to CL_SUCCESS; on failure 0 is returned and *pErrNum holds the error.
cl_kernel b3OpenCLUtils_compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros)
{
	b3Printf("compiling kernel %s ", kernelName);

	// compile a temporary program when the caller did not supply one
	cl_program program = prog;
	if (!program)
	{
		program = b3OpenCLUtils_compileCLProgramFromString(clContext, device, kernelSource, pErrNum, additionalMacros, 0, false);
	}

	// Create the kernel
	cl_int localErrNum;
	cl_kernel kernel = clCreateKernel(program, kernelName, &localErrNum);
	if (localErrNum != CL_SUCCESS)
	{
		b3Error("Error in clCreateKernel, Line %u in file %s, cannot find kernel function %s !!!\n\n", __LINE__, __FILE__, kernelName);
		assert(0);
		if (pErrNum)
			*pErrNum = localErrNum;
		return 0;
	}

	// drop the temporary program; the kernel keeps its own reference
	if (!prog && program)
	{
		clReleaseProgram(program);
	}
	b3Printf("ready. \n");

	if (pErrNum)
		*pErrNum = CL_SUCCESS;
	return kernel;
}

View file

@ -1,190 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//original author: Roman Ponomarev
//cleanup by Erwin Coumans
#ifndef B3_OPENCL_UTILS_H
#define B3_OPENCL_UTILS_H
#include "b3OpenCLInclude.h"
#ifdef __cplusplus
extern "C"
{
#endif
///C API for OpenCL utilities: convenience functions, see below for C++ API
/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex, cl_platform_id* platformId);
int b3OpenCLUtils_getNumDevices(cl_context cxMainContext);
cl_device_id b3OpenCLUtils_getDevice(cl_context cxMainContext, int nr);
void b3OpenCLUtils_printDeviceInfo(cl_device_id device);
cl_kernel b3OpenCLUtils_compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros);
//optional
cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSource, cl_int* pErrNum, const char* additionalMacros, const char* srcFileNameForCaching, bool disableBinaryCaching);
//the following optional APIs provide access using specific platform information
int b3OpenCLUtils_getNumPlatforms(cl_int* pErrNum);
///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
cl_platform_id b3OpenCLUtils_getPlatform(int nr, cl_int* pErrNum);
void b3OpenCLUtils_printPlatformInfo(cl_platform_id platform);
const char* b3OpenCLUtils_getSdkVendorName();
///set the path (directory/folder) where the compiled OpenCL kernel are stored
void b3OpenCLUtils_setCachePath(const char* path);
cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex);
#ifdef __cplusplus
}
#define B3_MAX_STRING_LENGTH 1024
///Capability snapshot of a single OpenCL device, filled in by
///b3OpenCLUtils::getDeviceInfo. Each field mirrors the CL_DEVICE_* /
///CL_DRIVER_* query it is populated from.
typedef struct
{
	char m_deviceName[B3_MAX_STRING_LENGTH];        //CL_DEVICE_NAME
	char m_deviceVendor[B3_MAX_STRING_LENGTH];      //CL_DEVICE_VENDOR
	char m_driverVersion[B3_MAX_STRING_LENGTH];     //CL_DRIVER_VERSION
	char m_deviceExtensions[B3_MAX_STRING_LENGTH];  //CL_DEVICE_EXTENSIONS (space-separated list)

	cl_device_type m_deviceType;                      //CL_DEVICE_TYPE (CPU/GPU/accelerator bitfield)
	cl_uint m_computeUnits;                           //CL_DEVICE_MAX_COMPUTE_UNITS
	size_t m_workitemDims;                            //CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
	size_t m_workItemSize[3];                         //CL_DEVICE_MAX_WORK_ITEM_SIZES
	size_t m_image2dMaxWidth;                         //CL_DEVICE_IMAGE2D_MAX_WIDTH
	size_t m_image2dMaxHeight;                        //CL_DEVICE_IMAGE2D_MAX_HEIGHT
	size_t m_image3dMaxWidth;                         //CL_DEVICE_IMAGE3D_MAX_WIDTH
	size_t m_image3dMaxHeight;                        //CL_DEVICE_IMAGE3D_MAX_HEIGHT
	size_t m_image3dMaxDepth;                         //CL_DEVICE_IMAGE3D_MAX_DEPTH
	size_t m_workgroupSize;                           //CL_DEVICE_MAX_WORK_GROUP_SIZE
	cl_uint m_clockFrequency;                         //CL_DEVICE_MAX_CLOCK_FREQUENCY (MHz)
	cl_ulong m_constantBufferSize;                    //CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE (bytes)
	cl_ulong m_localMemSize;                          //CL_DEVICE_LOCAL_MEM_SIZE (bytes)
	cl_ulong m_globalMemSize;                         //CL_DEVICE_GLOBAL_MEM_SIZE (bytes)
	cl_bool m_errorCorrectionSupport;                 //CL_DEVICE_ERROR_CORRECTION_SUPPORT
	cl_device_local_mem_type m_localMemType;          //CL_DEVICE_LOCAL_MEM_TYPE
	cl_uint m_maxReadImageArgs;                       //CL_DEVICE_MAX_READ_IMAGE_ARGS
	cl_uint m_maxWriteImageArgs;                      //CL_DEVICE_MAX_WRITE_IMAGE_ARGS
	cl_uint m_addressBits;                            //CL_DEVICE_ADDRESS_BITS
	cl_ulong m_maxMemAllocSize;                       //CL_DEVICE_MAX_MEM_ALLOC_SIZE (bytes)
	cl_command_queue_properties m_queueProperties;    //CL_DEVICE_QUEUE_PROPERTIES (bitfield)
	cl_bool m_imageSupport;                           //CL_DEVICE_IMAGE_SUPPORT

	//CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
	cl_uint m_vecWidthChar;
	cl_uint m_vecWidthShort;
	cl_uint m_vecWidthInt;
	cl_uint m_vecWidthLong;
	cl_uint m_vecWidthFloat;
	cl_uint m_vecWidthDouble;
} b3OpenCLDeviceInfo;
///Vendor/name/version strings describing an OpenCL platform, filled in by
///b3OpenCLUtils::getPlatformInfo.
struct b3OpenCLPlatformInfo
{
	char m_platformVendor[B3_MAX_STRING_LENGTH];
	char m_platformName[B3_MAX_STRING_LENGTH];
	char m_platformVersion[B3_MAX_STRING_LENGTH];

	///all strings start out empty
	b3OpenCLPlatformInfo()
	{
		m_platformVendor[0] = 0;
		m_platformName[0] = 0;
		m_platformVersion[0] = 0;
	}
};
///C++ API for OpenCL utilities: convenience functions
///Thin static inline wrappers around the C-level b3OpenCLUtils_* functions
///declared above, adding C++ default arguments. Only getDeviceInfo and
///getPlatformInfo have out-of-line definitions.
struct b3OpenCLUtils
{
	/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
	/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
	static inline cl_context createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex = -1, cl_platform_id* platformId = 0)
	{
		return b3OpenCLUtils_createContextFromType(deviceType, pErrNum, pGLCtx, pGLDC, preferredDeviceIndex, preferredPlatformIndex, platformId);
	}

	static inline int getNumDevices(cl_context cxMainContext)
	{
		return b3OpenCLUtils_getNumDevices(cxMainContext);
	}

	static inline cl_device_id getDevice(cl_context cxMainContext, int nr)
	{
		return b3OpenCLUtils_getDevice(cxMainContext, nr);
	}

	///fills 'info' with the capabilities of 'device' (defined out-of-line)
	static void getDeviceInfo(cl_device_id device, b3OpenCLDeviceInfo* info);

	static inline void printDeviceInfo(cl_device_id device)
	{
		b3OpenCLUtils_printDeviceInfo(device);
	}

	static inline cl_kernel compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum = 0, cl_program prog = 0, const char* additionalMacros = "")
	{
		return b3OpenCLUtils_compileCLKernelFromString(clContext, device, kernelSource, kernelName, pErrNum, prog, additionalMacros);
	}

	//optional
	static inline cl_program compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSource, cl_int* pErrNum = 0, const char* additionalMacros = "", const char* srcFileNameForCaching = 0, bool disableBinaryCaching = false)
	{
		return b3OpenCLUtils_compileCLProgramFromString(clContext, device, kernelSource, pErrNum, additionalMacros, srcFileNameForCaching, disableBinaryCaching);
	}

	//the following optional APIs provide access using specific platform information
	static inline int getNumPlatforms(cl_int* pErrNum = 0)
	{
		return b3OpenCLUtils_getNumPlatforms(pErrNum);
	}

	///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
	static inline cl_platform_id getPlatform(int nr, cl_int* pErrNum = 0)
	{
		return b3OpenCLUtils_getPlatform(nr, pErrNum);
	}

	///fills 'platformInfo' with vendor/name/version strings (defined out-of-line)
	static void getPlatformInfo(cl_platform_id platform, b3OpenCLPlatformInfo* platformInfo);

	static inline void printPlatformInfo(cl_platform_id platform)
	{
		b3OpenCLUtils_printPlatformInfo(platform);
	}

	static inline const char* getSdkVendorName()
	{
		return b3OpenCLUtils_getSdkVendorName();
	}

	static inline cl_context createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex = -1)
	{
		return b3OpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLCtx, pGLDC, preferredDeviceIndex, preferredPlatformIndex);
	}

	///set the path (directory/folder) where the compiled OpenCL kernels are stored
	static void setCachePath(const char* path)
	{
		b3OpenCLUtils_setCachePath(path);
	}
};
#endif //__cplusplus
#endif // B3_OPENCL_UTILS_H

View file

@ -1,17 +0,0 @@
#ifndef B3_BVH_INFO_H
#define B3_BVH_INFO_H
#include "Bullet3Common/b3Vector3.h"
///Compact description of one bounding volume hierarchy (BVH):
///its overall AABB, a quantization vector, and node/subtree counts plus
///offsets. NOTE(review): the offsets presumably index into externally
///stored, shared node/subtree arrays -- confirm against the code that
///fills and consumes this struct.
struct b3BvhInfo
{
	b3Vector3 m_aabbMin;        //lower bound of the whole hierarchy
	b3Vector3 m_aabbMax;        //upper bound of the whole hierarchy
	b3Vector3 m_quantization;   //scale used when quantizing node coordinates -- TODO confirm
	int m_numNodes;             //number of nodes belonging to this BVH
	int m_numSubTrees;          //number of subtree headers belonging to this BVH
	int m_nodeOffset;           //start index of this BVH's nodes -- TODO confirm target array
	int m_subTreeOffset;        //start index of this BVH's subtree headers -- TODO confirm
};
#endif //B3_BVH_INFO_H

View file

@ -1,253 +0,0 @@
#if 0
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3ContactCache.h"
#include "Bullet3Common/b3Transform.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
///separation distance above which a contact point is no longer considered
///valid (compared against the point's w component in validContactDistance)
b3Scalar gContactBreakingThreshold = b3Scalar(0.02);

///gContactCalcArea3Points will approximate the convex hull area using 3 points
///when setting it to false, it will use 4 points to compute the area: it is more accurate but slower
bool gContactCalcArea3Points = true;
// Returns the largest squared parallelogram area among the three possible
// pairings of the four given points into two edge vectors; used as a cheap
// proxy for the area of the quad the points span.
static inline b3Scalar calcArea4Points(const b3Vector3 &p0, const b3Vector3 &p1, const b3Vector3 &p2, const b3Vector3 &p3)
{
	// three candidate edge-vector pairings of the four points
	const b3Vector3 edgesA[3] = {p0 - p1, p0 - p2, p0 - p3};
	const b3Vector3 edgesB[3] = {p2 - p3, p1 - p3, p1 - p2};

	//todo: Following 3 cross production can be easily optimized by SIMD.
	const b3Scalar area0 = edgesA[0].cross(edgesB[0]).length2();
	const b3Scalar area1 = edgesA[1].cross(edgesB[1]).length2();
	const b3Scalar area2 = edgesA[2].cross(edgesB[2]).length2();

	// keep the biggest of the three candidate areas
	return b3Max(b3Max(area0, area1), area2);
}
#if 0
//using localPointA for all points
//! Chooses which of the four cached points should be replaced by 'pt':
//! the point whose removal keeps the largest contact area, while never
//! selecting the deepest (most penetrating) point for removal.
//! NOTE(review): this function sits inside an '#if 0' block; 'pt' is typed
//! b3Vector3 yet is accessed via getDistance()/m_localPointA, members a
//! plain b3Vector3 does not appear to provide -- this looks like an
//! unported copy of the btPersistentManifold version. Confirm before
//! re-enabling.
int b3ContactCache::sortCachedPoints(const b3Vector3& pt)
{
	//calculate 4 possible cases areas, and take biggest area
	//also need to keep 'deepest'

	int maxPenetrationIndex = -1;
#define KEEP_DEEPEST_POINT 1
#ifdef KEEP_DEEPEST_POINT
	//find the deepest cached point so it is excluded from replacement below
	b3Scalar maxPenetration = pt.getDistance();
	for (int i = 0; i < 4; i++)
	{
		if (m_pointCache[i].getDistance() < maxPenetration)
		{
			maxPenetrationIndex = i;
			maxPenetration = m_pointCache[i].getDistance();
		}
	}
#endif  //KEEP_DEEPEST_POINT

	//res<k> = contact area obtained when cached point k is replaced by 'pt'
	b3Scalar res0(b3Scalar(0.)), res1(b3Scalar(0.)), res2(b3Scalar(0.)), res3(b3Scalar(0.));
	if (gContactCalcArea3Points)
	{
		//fast path: approximate each candidate area with a single cross product
		if (maxPenetrationIndex != 0)
		{
			b3Vector3 a0 = pt.m_localPointA - m_pointCache[1].m_localPointA;
			b3Vector3 b0 = m_pointCache[3].m_localPointA - m_pointCache[2].m_localPointA;
			b3Vector3 cross = a0.cross(b0);
			res0 = cross.length2();
		}
		if (maxPenetrationIndex != 1)
		{
			b3Vector3 a1 = pt.m_localPointA - m_pointCache[0].m_localPointA;
			b3Vector3 b1 = m_pointCache[3].m_localPointA - m_pointCache[2].m_localPointA;
			b3Vector3 cross = a1.cross(b1);
			res1 = cross.length2();
		}
		if (maxPenetrationIndex != 2)
		{
			b3Vector3 a2 = pt.m_localPointA - m_pointCache[0].m_localPointA;
			b3Vector3 b2 = m_pointCache[3].m_localPointA - m_pointCache[1].m_localPointA;
			b3Vector3 cross = a2.cross(b2);
			res2 = cross.length2();
		}
		if (maxPenetrationIndex != 3)
		{
			b3Vector3 a3 = pt.m_localPointA - m_pointCache[0].m_localPointA;
			b3Vector3 b3 = m_pointCache[2].m_localPointA - m_pointCache[1].m_localPointA;
			b3Vector3 cross = a3.cross(b3);
			res3 = cross.length2();
		}
	}
	else
	{
		//accurate path: evaluate the full 4-point area for each candidate
		if (maxPenetrationIndex != 0)
		{
			res0 = calcArea4Points(pt.m_localPointA, m_pointCache[1].m_localPointA, m_pointCache[2].m_localPointA, m_pointCache[3].m_localPointA);
		}
		if (maxPenetrationIndex != 1)
		{
			res1 = calcArea4Points(pt.m_localPointA, m_pointCache[0].m_localPointA, m_pointCache[2].m_localPointA, m_pointCache[3].m_localPointA);
		}
		if (maxPenetrationIndex != 2)
		{
			res2 = calcArea4Points(pt.m_localPointA, m_pointCache[0].m_localPointA, m_pointCache[1].m_localPointA, m_pointCache[3].m_localPointA);
		}
		if (maxPenetrationIndex != 3)
		{
			res3 = calcArea4Points(pt.m_localPointA, m_pointCache[0].m_localPointA, m_pointCache[1].m_localPointA, m_pointCache[2].m_localPointA);
		}
	}
	//pick the replacement slot that yields the biggest area
	b3Vector4 maxvec(res0, res1, res2, res3);
	int biggestarea = maxvec.closestAxis4();
	return biggestarea;
}
/// Finds the cached point closest (in squared local-space distance on body A)
/// to newPoint, considering only points within the contact breaking threshold.
/// Returns the index of that point, or -1 if none is close enough.
/// NOTE(review): this reads mp.m_localPointA / newPoint.m_localPointA on values
/// typed b3Vector3, which has no such member in the visible headers; this
/// function appears to live in a preprocessor-disabled region (closed by the
/// #endif below) — confirm before enabling it.
int b3ContactCache::getCacheEntry(const b3Vector3& newPoint) const
{
// Compare squared distances against the squared threshold to avoid a sqrt.
b3Scalar shortestDist = getContactBreakingThreshold() * getContactBreakingThreshold();
int size = getNumContacts();
int nearestPoint = -1;
for( int i = 0; i < size; i++ )
{
const b3Vector3 &mp = m_pointCache[i];
b3Vector3 diffA = mp.m_localPointA- newPoint.m_localPointA;
const b3Scalar distToManiPoint = diffA.dot(diffA);
if( distToManiPoint < shortestDist )
{
// New best candidate: remember it and tighten the search radius.
shortestDist = distToManiPoint;
nearestPoint = i;
}
}
return nearestPoint;
}
/// Inserts newPoint into the point cache and returns the slot index used.
/// When the cache is full (MANIFOLD_CACHE_SIZE points), an existing slot is
/// reclaimed — chosen by sortCachedPoints so the deepest point and the largest
/// covered area are preserved.
/// NOTE(review): uses clearUserCache/m_pointCache/m_cachedPoints, which are not
/// declared in the visible b3ContactCache class; like getCacheEntry above this
/// appears to be in a preprocessor-disabled region — confirm before enabling.
int b3ContactCache::addManifoldPoint(const b3Vector3& newPoint)
{
b3Assert(validContactDistance(newPoint));
int insertIndex = getNumContacts();
if (insertIndex == MANIFOLD_CACHE_SIZE)
{
#if MANIFOLD_CACHE_SIZE >= 4
//sort cache so best points come first, based on area
insertIndex = sortCachedPoints(newPoint);
#else
insertIndex = 0;
#endif
// Reclaiming a slot: drop any user data attached to the evicted point.
clearUserCache(m_pointCache[insertIndex]);
} else
{
m_cachedPoints++;
}
// sortCachedPoints may return -1; clamp to a valid slot.
if (insertIndex<0)
insertIndex=0;
//b3Assert(m_pointCache[insertIndex].m_userPersistentData==0);
m_pointCache[insertIndex] = newPoint;
return insertIndex;
}
#endif
bool b3ContactCache::validContactDistance(const b3Vector3& pt)
{
return pt.w <= gContactBreakingThreshold;
}
/// Removes point i from the contact set by swapping it with the last cached
/// point and shrinking the count by one. The relative order of the remaining
/// points is not preserved.
void b3ContactCache::removeContactPoint(struct b3Contact4Data& newContactCache,int i)
{
	const int lastIndex = b3Contact4Data_getNumPoints(&newContactCache) - 1;
	if (i != lastIndex)
	{
		// Move the last point's data into slot i so the tail can be dropped.
		b3Swap(newContactCache.m_localPosA[i], newContactCache.m_localPosA[lastIndex]);
		b3Swap(newContactCache.m_localPosB[i], newContactCache.m_localPosB[lastIndex]);
		b3Swap(newContactCache.m_worldPosB[i], newContactCache.m_worldPosB[lastIndex]);
	}
	b3Contact4Data_setNumPoints(&newContactCache, lastIndex);
}
void b3ContactCache::refreshContactPoints(const b3Transform& trA,const b3Transform& trB, struct b3Contact4Data& contacts)
{
int numContacts = b3Contact4Data_getNumPoints(&contacts);
int i;
/// first refresh worldspace positions and distance
for (i=numContacts-1;i>=0;i--)
{
b3Vector3 worldPosA = trA( contacts.m_localPosA[i]);
b3Vector3 worldPosB = trB( contacts.m_localPosB[i]);
contacts.m_worldPosB[i] = worldPosB;
float distance = (worldPosA - worldPosB).dot(contacts.m_worldNormalOnB);
contacts.m_worldPosB[i].w = distance;
}
/// then
b3Scalar distance2d;
b3Vector3 projectedDifference,projectedPoint;
for (i=numContacts-1;i>=0;i--)
{
b3Vector3 worldPosA = trA( contacts.m_localPosA[i]);
b3Vector3 worldPosB = trB( contacts.m_localPosB[i]);
b3Vector3&pt = contacts.m_worldPosB[i];
//contact becomes invalid when signed distance exceeds margin (projected on contactnormal direction)
if (!validContactDistance(pt))
{
removeContactPoint(contacts,i);
} else
{
//contact also becomes invalid when relative movement orthogonal to normal exceeds margin
projectedPoint = worldPosA - contacts.m_worldNormalOnB * contacts.m_worldPosB[i].w;
projectedDifference = contacts.m_worldPosB[i] - projectedPoint;
distance2d = projectedDifference.dot(projectedDifference);
if (distance2d > gContactBreakingThreshold*gContactBreakingThreshold )
{
removeContactPoint(contacts,i);
} else
{
////contact point processed callback
//if (gContactProcessedCallback)
// (*gContactProcessedCallback)(manifoldPoint,(void*)m_body0,(void*)m_body1);
}
}
}
}
#endif

View file

@ -1,62 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2013 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_CONTACT_CACHE_H
#define B3_CONTACT_CACHE_H
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Common/b3Transform.h"
#include "Bullet3Common/b3AlignedAllocator.h"
///maximum contact breaking and merging threshold
extern b3Scalar gContactBreakingThreshold;
#define MANIFOLD_CACHE_SIZE 4
///b3ContactCache is a contact point cache, it stays persistent as long as objects are overlapping in the broadphase.
///Those contact points are created by the collision narrow phase.
///The cache can be empty, or hold 1,2,3 or 4 points. Some collision algorithms (GJK) might only add one point at a time.
///updates/refreshes old contact points, and throw them away if necessary (distance becomes too large)
///reduces the cache to 4 points, when more than 4 points are added, using the following rules:
///the contact point with deepest penetration is always kept, and it tries to maximize the area covered by the points
///note that some pairs of objects might have more than one contact manifold.
B3_ATTRIBUTE_ALIGNED16(class)
b3ContactCache
{
/// sort cached points so most isolated points come first
int sortCachedPoints(const b3Vector3& pt);
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
/// Inserts a point into the cache, evicting the worst existing point if the
/// cache is full. Returns the slot index used.
int addManifoldPoint(const b3Vector3& newPoint);
/*void replaceContactPoint(const b3Vector3& newPoint,int insertIndex)
{
b3Assert(validContactDistance(newPoint));
m_pointCache[insertIndex] = newPoint;
}
*/
/// True while the point's signed distance (w component) is within the global
/// contact breaking threshold.
static bool validContactDistance(const b3Vector3& pt);
/// calculates new worldspace coordinates and depth, and rejects points that exceed the collision margin
static void refreshContactPoints(const b3Transform& trA, const b3Transform& trB, struct b3Contact4Data& newContactCache);
/// Removes point i by swap-with-last; point order is not preserved.
static void removeContactPoint(struct b3Contact4Data & newContactCache, int i);
};
#endif //B3_CONTACT_CACHE_H

View file

@ -1,106 +0,0 @@
#ifndef _CONVEX_HULL_CONTACT_H
#define _CONVEX_HULL_CONTACT_H
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
#include "Bullet3Common/shared/b3Int2.h"
#include "Bullet3Common/shared/b3Int4.h"
#include "b3OptimizedBvh.h"
#include "b3BvhInfo.h"
#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
//#include "../../dynamics/basic_demo/Stubs/ChNarrowPhase.h"
/// GPU separating-axis-test (SAT) narrowphase: owns the OpenCL kernels and
/// device buffers used to generate contact points between convex, compound,
/// and concave (triangle-mesh) shape pairs.
struct GpuSatCollision
{
// OpenCL execution context/device/queue shared by all kernels below.
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
// Separating-axis search kernels (convex-convex and concave variants).
cl_kernel m_findSeparatingAxisKernel;
cl_kernel m_mprPenetrationKernel;
cl_kernel m_findSeparatingAxisUnitSphereKernel;
cl_kernel m_findSeparatingAxisVertexFaceKernel;
cl_kernel m_findSeparatingAxisEdgeEdgeKernel;
cl_kernel m_findConcaveSeparatingAxisKernel;
cl_kernel m_findConcaveSeparatingAxisVertexFaceKernel;
cl_kernel m_findConcaveSeparatingAxisEdgeEdgeKernel;
// Compound (multi-child) shape pair handling.
cl_kernel m_findCompoundPairsKernel;
cl_kernel m_processCompoundPairsKernel;
// Clipping / contact-generation kernels.
cl_kernel m_clipHullHullKernel;
cl_kernel m_clipCompoundsHullHullKernel;
cl_kernel m_clipFacesAndFindContacts;
cl_kernel m_findClippingFacesKernel;
cl_kernel m_clipHullHullConcaveConvexKernel;
// cl_kernel m_extractManifoldAndAddContactKernel;
cl_kernel m_newContactReductionKernel;
// BVH traversal and primitive (sphere/plane) contact kernels.
cl_kernel m_bvhTraversalKernel;
cl_kernel m_primitiveContactsKernel;
cl_kernel m_findConcaveSphereContactsKernel;
cl_kernel m_processCompoundPairsPrimitivesKernel;
// Device-side scratch buffers reused across invocations.
b3OpenCLArray<b3Vector3> m_unitSphereDirections;
b3OpenCLArray<int> m_totalContactsOut;
b3OpenCLArray<b3Vector3> m_sepNormals;
b3OpenCLArray<float> m_dmins;
b3OpenCLArray<int> m_hasSeparatingNormals;
b3OpenCLArray<b3Vector3> m_concaveSepNormals;
b3OpenCLArray<int> m_concaveHasSeparatingNormals;
b3OpenCLArray<int> m_numConcavePairsOut;
b3OpenCLArray<b3CompoundOverlappingPair> m_gpuCompoundPairs;
b3OpenCLArray<b3Vector3> m_gpuCompoundSepNormals;
b3OpenCLArray<int> m_gpuHasCompoundSepNormals;
b3OpenCLArray<int> m_numCompoundPairsOut;
GpuSatCollision(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~GpuSatCollision();
/// Entry point: computes contacts for the given broadphase pairs entirely on
/// the GPU, appending results to contactOut (nContacts updated in place).
void computeConvexConvexContactsGPUSAT(b3OpenCLArray<b3Int4>* pairs, int nPairs,
const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
b3OpenCLArray<b3Contact4>* contactOut, int& nContacts,
const b3OpenCLArray<b3Contact4>* oldContacts,
int maxContactCapacity,
int compoundPairCapacity,
const b3OpenCLArray<b3ConvexPolyhedronData>& hostConvexData,
const b3OpenCLArray<b3Vector3>& vertices,
const b3OpenCLArray<b3Vector3>& uniqueEdges,
const b3OpenCLArray<b3GpuFace>& faces,
const b3OpenCLArray<int>& indices,
const b3OpenCLArray<b3Collidable>& gpuCollidables,
const b3OpenCLArray<b3GpuChildShape>& gpuChildShapes,
const b3OpenCLArray<b3Aabb>& clAabbsWorldSpace,
const b3OpenCLArray<b3Aabb>& clAabbsLocalSpace,
b3OpenCLArray<b3Vector3>& worldVertsB1GPU,
b3OpenCLArray<b3Int4>& clippingFacesOutGPU,
b3OpenCLArray<b3Vector3>& worldNormalsAGPU,
b3OpenCLArray<b3Vector3>& worldVertsA1GPU,
b3OpenCLArray<b3Vector3>& worldVertsB2GPU,
b3AlignedObjectArray<class b3OptimizedBvh*>& bvhData,
b3OpenCLArray<b3QuantizedBvhNode>* treeNodesGPU,
b3OpenCLArray<b3BvhSubtreeInfo>* subTreesGPU,
b3OpenCLArray<b3BvhInfo>* bvhInfo,
int numObjects,
int maxTriConvexPairCapacity,
b3OpenCLArray<b3Int4>& triangleConvexPairs,
int& numTriConvexPairsOut);
};

View file

@ -1,7 +0,0 @@
#ifndef CONVEX_POLYHEDRON_CL
#define CONVEX_POLYHEDRON_CL
#include "Bullet3Common/b3Transform.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
#endif //CONVEX_POLYHEDRON_CL

View file

@ -1,79 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2008 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the
use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software in a
product, an acknowledgment in the product documentation would be appreciated
but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
/*
GJK-EPA collision solver by Nathanael Presson, 2008
*/
#ifndef B3_GJK_EPA2_H
#define B3_GJK_EPA2_H
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Common/b3Transform.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
///btGjkEpaSolver contributed under zlib by Nathanael Presson
///btGjkEpaSolver contributed under zlib by Nathanael Presson
/// GJK/EPA convex-convex query solver: computes separation distance (GJK) or
/// penetration depth and witness points (EPA) for a pair of convex hulls.
struct b3GjkEpaSolver2
{
struct sResults
{
enum eStatus
{
Separated, /* Shapes don't penetrate */
Penetrating, /* Shapes are penetrating */
GJK_Failed, /* GJK phase fail, no big issue, shapes are probably just 'touching' */
EPA_Failed /* EPA phase fail, bigger problem, need to save parameters, and debug */
} status;
// Closest/deepest points on shape A and B, the contact normal, and the
// signed distance (negative when penetrating).
b3Vector3 witnesses[2];
b3Vector3 normal;
b3Scalar distance;
};
static int StackSizeRequirement();
/// GJK distance query; fills results and returns true on success.
static bool Distance(const b3Transform& transA, const b3Transform& transB,
const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB,
const b3AlignedObjectArray<b3Vector3>& verticesA,
const b3AlignedObjectArray<b3Vector3>& verticesB,
const b3Vector3& guess,
sResults& results);
/// EPA penetration query; usemargins inflates the hulls by their margins.
static bool Penetration(const b3Transform& transA, const b3Transform& transB,
const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB,
const b3AlignedObjectArray<b3Vector3>& verticesA,
const b3AlignedObjectArray<b3Vector3>& verticesB,
const b3Vector3& guess,
sResults& results,
bool usemargins = true);
#if 0
static b3Scalar SignedDistance( const b3Vector3& position,
b3Scalar margin,
const btConvexShape* shape,
const btTransform& wtrs,
sResults& results);
static bool SignedDistance( const btConvexShape* shape0,const btTransform& wtrs0,
const btConvexShape* shape1,const btTransform& wtrs1,
const b3Vector3& guess,
sResults& results);
#endif
};
#endif //B3_GJK_EPA2_H

View file

@ -1,358 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3OptimizedBvh.h"
#include "b3StridingMeshInterface.h"
#include "Bullet3Geometry/b3AabbUtil.h"
// Nothing beyond the b3QuantizedBvh base-class defaults to set up here.
b3OptimizedBvh::b3OptimizedBvh()
{
}
// Node arrays are aligned-object-array members and release themselves.
b3OptimizedBvh::~b3OptimizedBvh()
{
}
/// Builds the AABB tree over all triangles exposed by the mesh interface.
/// When useQuantizedAabbCompression is true, leaf AABBs are quantized into the
/// [bvhAabbMin, bvhAabbMax] space (16-byte nodes); otherwise full-precision
/// nodes are used. Temporary leaf arrays are cleared at the end.
void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax)
{
m_useQuantization = useQuantizedAabbCompression;
// NodeArray triangleNodes;
// Triangle callback producing one full-precision leaf node per triangle.
struct NodeTriangleCallback : public b3InternalTriangleIndexCallback
{
NodeArray& m_triangleNodes;
NodeTriangleCallback& operator=(NodeTriangleCallback& other)
{
m_triangleNodes.copyFromArray(other.m_triangleNodes);
return *this;
}
NodeTriangleCallback(NodeArray& triangleNodes)
: m_triangleNodes(triangleNodes)
{
}
virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex)
{
b3OptimizedBvhNode node;
b3Vector3 aabbMin, aabbMax;
// Start from an inverted AABB, then grow it over the three vertices.
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
aabbMin.setMin(triangle[0]);
aabbMax.setMax(triangle[0]);
aabbMin.setMin(triangle[1]);
aabbMax.setMax(triangle[1]);
aabbMin.setMin(triangle[2]);
aabbMax.setMax(triangle[2]);
//with quantization?
node.m_aabbMinOrg = aabbMin;
node.m_aabbMaxOrg = aabbMax;
node.m_escapeIndex = -1;
//for child nodes
node.m_subPart = partId;
node.m_triangleIndex = triangleIndex;
m_triangleNodes.push_back(node);
}
};
// Triangle callback producing one quantized leaf node per triangle; it needs
// the tree pointer for the quantization parameters.
struct QuantizedNodeTriangleCallback : public b3InternalTriangleIndexCallback
{
QuantizedNodeArray& m_triangleNodes;
const b3QuantizedBvh* m_optimizedTree; // for quantization
QuantizedNodeTriangleCallback& operator=(QuantizedNodeTriangleCallback& other)
{
m_triangleNodes.copyFromArray(other.m_triangleNodes);
m_optimizedTree = other.m_optimizedTree;
return *this;
}
QuantizedNodeTriangleCallback(QuantizedNodeArray& triangleNodes, const b3QuantizedBvh* tree)
: m_triangleNodes(triangleNodes), m_optimizedTree(tree)
{
}
virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex)
{
// The partId and triangle index must fit in the same (positive) integer
b3Assert(partId < (1 << MAX_NUM_PARTS_IN_BITS));
b3Assert(triangleIndex < (1 << (31 - MAX_NUM_PARTS_IN_BITS)));
//negative indices are reserved for escapeIndex
b3Assert(triangleIndex >= 0);
b3QuantizedBvhNode node;
b3Vector3 aabbMin, aabbMax;
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
aabbMin.setMin(triangle[0]);
aabbMax.setMax(triangle[0]);
aabbMin.setMin(triangle[1]);
aabbMax.setMax(triangle[1]);
aabbMin.setMin(triangle[2]);
aabbMax.setMax(triangle[2]);
//PCK: add these checks for zero dimensions of aabb
// Degenerate (flat) triangle AABBs would quantize to zero extent, so pad
// each axis up to a small minimum dimension.
const b3Scalar MIN_AABB_DIMENSION = b3Scalar(0.002);
const b3Scalar MIN_AABB_HALF_DIMENSION = b3Scalar(0.001);
if (aabbMax.getX() - aabbMin.getX() < MIN_AABB_DIMENSION)
{
aabbMax.setX(aabbMax.getX() + MIN_AABB_HALF_DIMENSION);
aabbMin.setX(aabbMin.getX() - MIN_AABB_HALF_DIMENSION);
}
if (aabbMax.getY() - aabbMin.getY() < MIN_AABB_DIMENSION)
{
aabbMax.setY(aabbMax.getY() + MIN_AABB_HALF_DIMENSION);
aabbMin.setY(aabbMin.getY() - MIN_AABB_HALF_DIMENSION);
}
if (aabbMax.getZ() - aabbMin.getZ() < MIN_AABB_DIMENSION)
{
aabbMax.setZ(aabbMax.getZ() + MIN_AABB_HALF_DIMENSION);
aabbMin.setZ(aabbMin.getZ() - MIN_AABB_HALF_DIMENSION);
}
// Quantize min rounding down (0) and max rounding up (1), then pack
// part id and triangle index into one non-negative integer.
m_optimizedTree->quantize(&node.m_quantizedAabbMin[0], aabbMin, 0);
m_optimizedTree->quantize(&node.m_quantizedAabbMax[0], aabbMax, 1);
node.m_escapeIndexOrTriangleIndex = (partId << (31 - MAX_NUM_PARTS_IN_BITS)) | triangleIndex;
m_triangleNodes.push_back(node);
}
};
int numLeafNodes = 0;
if (m_useQuantization)
{
//initialize quantization values
setQuantizationValues(bvhAabbMin, bvhAabbMax);
QuantizedNodeTriangleCallback callback(m_quantizedLeafNodes, this);
triangles->InternalProcessAllTriangles(&callback, m_bvhAabbMin, m_bvhAabbMax);
//now we have an array of leafnodes in m_leafNodes
numLeafNodes = m_quantizedLeafNodes.size();
// A binary tree over n leaves needs at most 2n contiguous nodes.
m_quantizedContiguousNodes.resize(2 * numLeafNodes);
}
else
{
NodeTriangleCallback callback(m_leafNodes);
b3Vector3 aabbMin = b3MakeVector3(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
b3Vector3 aabbMax = b3MakeVector3(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
triangles->InternalProcessAllTriangles(&callback, aabbMin, aabbMax);
//now we have an array of leafnodes in m_leafNodes
numLeafNodes = m_leafNodes.size();
m_contiguousNodes.resize(2 * numLeafNodes);
}
m_curNodeIndex = 0;
buildTree(0, numLeafNodes);
///if the entire tree is smaller than the subtree size, we need to create a header info for the tree
if (m_useQuantization && !m_SubtreeHeaders.size())
{
b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand();
subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[0]);
subtree.m_rootNodeIndex = 0;
subtree.m_subtreeSize = m_quantizedContiguousNodes[0].isLeafNode() ? 1 : m_quantizedContiguousNodes[0].getEscapeIndex();
}
//PCK: update the copy of the size
m_subtreeHeaderCount = m_SubtreeHeaders.size();
//PCK: clear m_quantizedLeafNodes and m_leafNodes, they are temporary
m_quantizedLeafNodes.clear();
m_leafNodes.clear();
}
/// Re-fits the whole tree to updated mesh vertex positions, re-quantizing into
/// the new [aabbMin, aabbMax] space. Only implemented for the quantized
/// representation; the non-quantized path is intentionally a no-op.
void b3OptimizedBvh::refit(b3StridingMeshInterface* meshInterface, const b3Vector3& aabbMin, const b3Vector3& aabbMax)
{
	if (!m_useQuantization)
	{
		return;
	}

	setQuantizationValues(aabbMin, aabbMax);
	updateBvhNodes(meshInterface, 0, m_curNodeIndex, 0);

	// Re-derive every subtree header's AABB from its freshly updated root node.
	for (int headerIndex = 0; headerIndex < m_SubtreeHeaders.size(); headerIndex++)
	{
		b3BvhSubtreeInfo& subtree = m_SubtreeHeaders[headerIndex];
		subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[subtree.m_rootNodeIndex]);
	}
}
/// Re-fits only the subtrees whose quantized AABB overlaps [aabbMin, aabbMax].
/// Requires quantization, and the refit region must lie strictly inside the
/// tree's overall AABB so the existing quantization values stay valid.
void b3OptimizedBvh::refitPartial(b3StridingMeshInterface* meshInterface, const b3Vector3& aabbMin, const b3Vector3& aabbMax)
{
	b3Assert(m_useQuantization);
	b3Assert(aabbMin.getX() > m_bvhAabbMin.getX());
	b3Assert(aabbMin.getY() > m_bvhAabbMin.getY());
	b3Assert(aabbMin.getZ() > m_bvhAabbMin.getZ());
	b3Assert(aabbMax.getX() < m_bvhAabbMax.getX());
	b3Assert(aabbMax.getY() < m_bvhAabbMax.getY());
	b3Assert(aabbMax.getZ() < m_bvhAabbMax.getZ());

	///we should update all quantization values, using updateBvhNodes(meshInterface);
	///but we only update chunks that overlap the given aabb
	// Quantize the query box once, then test each subtree header against it.
	unsigned short quantizedQueryAabbMin[3];
	unsigned short quantizedQueryAabbMax[3];
	quantize(&quantizedQueryAabbMin[0], aabbMin, 0);
	quantize(&quantizedQueryAabbMax[0], aabbMax, 1);

	for (int headerIndex = 0; headerIndex < this->m_SubtreeHeaders.size(); headerIndex++)
	{
		b3BvhSubtreeInfo& subtree = m_SubtreeHeaders[headerIndex];
		//PCK: unsigned instead of bool
		const unsigned overlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin, quantizedQueryAabbMax, subtree.m_quantizedAabbMin, subtree.m_quantizedAabbMax);
		if (overlap != 0)
		{
			updateBvhNodes(meshInterface, subtree.m_rootNodeIndex, subtree.m_rootNodeIndex + subtree.m_subtreeSize, headerIndex);
			subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[subtree.m_rootNodeIndex]);
		}
	}
}
void b3OptimizedBvh::updateBvhNodes(b3StridingMeshInterface* meshInterface, int firstNode, int endNode, int index)
{
(void)index;
b3Assert(m_useQuantization);
int curNodeSubPart = -1;
//get access info to trianglemesh data
const unsigned char* vertexbase = 0;
int numverts = 0;
PHY_ScalarType type = PHY_INTEGER;
int stride = 0;
const unsigned char* indexbase = 0;
int indexstride = 0;
int numfaces = 0;
PHY_ScalarType indicestype = PHY_INTEGER;
b3Vector3 triangleVerts[3];
b3Vector3 aabbMin, aabbMax;
const b3Vector3& meshScaling = meshInterface->getScaling();
int i;
for (i = endNode - 1; i >= firstNode; i--)
{
b3QuantizedBvhNode& curNode = m_quantizedContiguousNodes[i];
if (curNode.isLeafNode())
{
//recalc aabb from triangle data
int nodeSubPart = curNode.getPartId();
int nodeTriangleIndex = curNode.getTriangleIndex();
if (nodeSubPart != curNodeSubPart)
{
if (curNodeSubPart >= 0)
meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
meshInterface->getLockedReadOnlyVertexIndexBase(&vertexbase, numverts, type, stride, &indexbase, indexstride, numfaces, indicestype, nodeSubPart);
curNodeSubPart = nodeSubPart;
b3Assert(indicestype == PHY_INTEGER || indicestype == PHY_SHORT);
}
//triangles->getLockedReadOnlyVertexIndexBase(vertexBase,numVerts,
unsigned int* gfxbase = (unsigned int*)(indexbase + nodeTriangleIndex * indexstride);
for (int j = 2; j >= 0; j--)
{
int graphicsindex = indicestype == PHY_SHORT ? ((unsigned short*)gfxbase)[j] : gfxbase[j];
if (type == PHY_FLOAT)
{
float* graphicsbase = (float*)(vertexbase + graphicsindex * stride);
triangleVerts[j] = b3MakeVector3(
graphicsbase[0] * meshScaling.getX(),
graphicsbase[1] * meshScaling.getY(),
graphicsbase[2] * meshScaling.getZ());
}
else
{
double* graphicsbase = (double*)(vertexbase + graphicsindex * stride);
triangleVerts[j] = b3MakeVector3(b3Scalar(graphicsbase[0] * meshScaling.getX()), b3Scalar(graphicsbase[1] * meshScaling.getY()), b3Scalar(graphicsbase[2] * meshScaling.getZ()));
}
}
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
aabbMin.setMin(triangleVerts[0]);
aabbMax.setMax(triangleVerts[0]);
aabbMin.setMin(triangleVerts[1]);
aabbMax.setMax(triangleVerts[1]);
aabbMin.setMin(triangleVerts[2]);
aabbMax.setMax(triangleVerts[2]);
quantize(&curNode.m_quantizedAabbMin[0], aabbMin, 0);
quantize(&curNode.m_quantizedAabbMax[0], aabbMax, 1);
}
else
{
//combine aabb from both children
b3QuantizedBvhNode* leftChildNode = &m_quantizedContiguousNodes[i + 1];
b3QuantizedBvhNode* rightChildNode = leftChildNode->isLeafNode() ? &m_quantizedContiguousNodes[i + 2] : &m_quantizedContiguousNodes[i + 1 + leftChildNode->getEscapeIndex()];
{
for (int i = 0; i < 3; i++)
{
curNode.m_quantizedAabbMin[i] = leftChildNode->m_quantizedAabbMin[i];
if (curNode.m_quantizedAabbMin[i] > rightChildNode->m_quantizedAabbMin[i])
curNode.m_quantizedAabbMin[i] = rightChildNode->m_quantizedAabbMin[i];
curNode.m_quantizedAabbMax[i] = leftChildNode->m_quantizedAabbMax[i];
if (curNode.m_quantizedAabbMax[i] < rightChildNode->m_quantizedAabbMax[i])
curNode.m_quantizedAabbMax[i] = rightChildNode->m_quantizedAabbMax[i];
}
}
}
}
if (curNodeSubPart >= 0)
meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
}
///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
/// Loads and initializes a BVH from an in-memory buffer without copying it.
b3OptimizedBvh* b3OptimizedBvh::deSerializeInPlace(void* i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian)
{
	// The base class does all the work; b3OptimizedBvh adds no data members,
	// so a static downcast of the result is safe.
	b3QuantizedBvh* const deserialized = b3QuantizedBvh::deSerializeInPlace(i_alignedDataBuffer, i_dataBufferSize, i_swapEndian);
	return static_cast<b3OptimizedBvh*>(deserialized);
}

View file

@ -1,56 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
///Contains contributions from Disney Studio's
#ifndef B3_OPTIMIZED_BVH_H
#define B3_OPTIMIZED_BVH_H
#include "b3QuantizedBvh.h"
class b3StridingMeshInterface;
///The b3OptimizedBvh extends the b3QuantizedBvh to create AABB tree for triangle meshes, through the b3StridingMeshInterface.
B3_ATTRIBUTE_ALIGNED16(class)
b3OptimizedBvh : public b3QuantizedBvh
{
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
protected:
public:
b3OptimizedBvh();
virtual ~b3OptimizedBvh();
/// Builds the (optionally quantized) tree over all triangles of the mesh.
void build(b3StridingMeshInterface * triangles, bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax);
/// Full refit with new quantization bounds (quantized trees only).
void refit(b3StridingMeshInterface * triangles, const b3Vector3& aabbMin, const b3Vector3& aabbMax);
/// Refits only subtrees overlapping the given AABB; bounds must lie strictly
/// inside the tree's overall AABB.
void refitPartial(b3StridingMeshInterface * triangles, const b3Vector3& aabbMin, const b3Vector3& aabbMax);
/// Re-quantizes nodes [firstNode, endNode) from current mesh vertex data.
void updateBvhNodes(b3StridingMeshInterface * meshInterface, int firstNode, int endNode, int index);
/// Data buffer MUST be 16 byte aligned
virtual bool serializeInPlace(void* o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const
{
return b3QuantizedBvh::serialize(o_alignedDataBuffer, i_dataBufferSize, i_swapEndian);
}
///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
static b3OptimizedBvh* deSerializeInPlace(void* i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
};
#endif //B3_OPTIMIZED_BVH_H

View file

@ -1,511 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_QUANTIZED_BVH_H
#define B3_QUANTIZED_BVH_H
class b3Serializer;
//#define DEBUG_CHECK_DEQUANTIZATION 1
#ifdef DEBUG_CHECK_DEQUANTIZATION
#ifdef __SPU__
#define printf spu_printf
#endif //__SPU__
#include <stdio.h>
#include <stdlib.h>
#endif //DEBUG_CHECK_DEQUANTIZATION
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Common/b3AlignedAllocator.h"
#ifdef B3_USE_DOUBLE_PRECISION
#define b3QuantizedBvhData b3QuantizedBvhDoubleData
#define b3OptimizedBvhNodeData b3OptimizedBvhNodeDoubleData
#define b3QuantizedBvhDataName "b3QuantizedBvhDoubleData"
#else
#define b3QuantizedBvhData b3QuantizedBvhFloatData
#define b3OptimizedBvhNodeData b3OptimizedBvhNodeFloatData
#define b3QuantizedBvhDataName "b3QuantizedBvhFloatData"
#endif
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h"
//http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclang/html/vclrf__m128.asp
//Note: currently we have 16 bytes per quantized node
#define MAX_SUBTREE_SIZE_IN_BYTES 2048
// 10 bits give the potential for 1024 parts, with at most 2^21 - 1 (2097151)
// triangles each (the sign bit of the 32-bit index is reserved for escape indices)
#define MAX_NUM_PARTS_IN_BITS 10
///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.
///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).
///b3QuantizedBvhNode is a compressed 16-byte AABB node usable as either a leaf
///or an internal node. A leaf packs (partId | triangleIndex) into one
///non-negative integer; an internal node stores the negated escape index.
B3_ATTRIBUTE_ALIGNED16(struct)
b3QuantizedBvhNode : public b3QuantizedBvhNodeData
{
	B3_DECLARE_ALIGNED_ALLOCATOR();

	bool isLeafNode() const
	{
		// Non-negative means leaf (triangle index); negative means internal
		// node (negated escape index).
		return (m_escapeIndexOrTriangleIndex >= 0);
	}
	int getEscapeIndex() const
	{
		b3Assert(!isLeafNode());
		return -m_escapeIndexOrTriangleIndex;
	}
	int getTriangleIndex() const
	{
		b3Assert(isLeafNode());
		// Mask off the high MAX_NUM_PARTS_IN_BITS bits that hold the part id.
		const unsigned int partBits = (~0u) << (31 - MAX_NUM_PARTS_IN_BITS);
		return (m_escapeIndexOrTriangleIndex & ~(partBits));
	}
	int getPartId() const
	{
		b3Assert(isLeafNode());
		// The part id occupies the highest bits below the sign bit.
		return (m_escapeIndexOrTriangleIndex >> (31 - MAX_NUM_PARTS_IN_BITS));
	}
};
/// b3OptimizedBvhNode contains both internal and leaf node information.
/// Total node size is 44 bytes / node. You can use the compressed version of 16 bytes.
B3_ATTRIBUTE_ALIGNED16(struct)
b3OptimizedBvhNode
{
B3_DECLARE_ALIGNED_ALLOCATOR();
//32 bytes: full-precision AABB of this node.
b3Vector3 m_aabbMinOrg;
b3Vector3 m_aabbMaxOrg;
//4 bytes: skip offset used when traversal rejects this internal node.
int m_escapeIndex;
//8 bytes
//for child nodes: which mesh sub-part and triangle this leaf refers to.
int m_subPart;
int m_triangleIndex;
//pad the size to 64 bytes
char m_padding[20];
};
///b3BvhSubtreeInfo provides info to gather a subtree of limited size
///b3BvhSubtreeInfo describes one size-limited subtree: its quantized AABB plus
///(in the base data) the root node index and subtree size.
B3_ATTRIBUTE_ALIGNED16(class)
b3BvhSubtreeInfo : public b3BvhSubtreeInfoData
{
public:
	B3_DECLARE_ALIGNED_ALLOCATOR();

	b3BvhSubtreeInfo()
	{
		//memset(&m_padding[0], 0, sizeof(m_padding));
	}
	/// Copies the quantized AABB of the given node into this header.
	void setAabbFromQuantizeNode(const b3QuantizedBvhNode& quantizedNode)
	{
		for (int axis = 0; axis < 3; axis++)
		{
			m_quantizedAabbMin[axis] = quantizedNode.m_quantizedAabbMin[axis];
			m_quantizedAabbMax[axis] = quantizedNode.m_quantizedAabbMax[axis];
		}
	}
};
/// Callback interface invoked for every leaf (sub-part, triangle) pair that a
/// tree query reports as overlapping.
class b3NodeOverlapCallback
{
public:
virtual ~b3NodeOverlapCallback(){};
virtual void processNode(int subPart, int triangleIndex) = 0;
};
#include "Bullet3Common/b3AlignedAllocator.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
///for code readability:
typedef b3AlignedObjectArray<b3OptimizedBvhNode> NodeArray;
typedef b3AlignedObjectArray<b3QuantizedBvhNode> QuantizedNodeArray;
typedef b3AlignedObjectArray<b3BvhSubtreeInfo> BvhSubtreeInfoArray;
///The b3QuantizedBvh class stores an AABB tree that can be quickly traversed on CPU and Cell SPU.
///It is used by the b3BvhTriangleMeshShape as midphase
///It is recommended to use quantization for better performance and lower memory requirements.
B3_ATTRIBUTE_ALIGNED16(class)
b3QuantizedBvh
{
public:
	///Traversal strategies; see setTraversalMode (only implemented for quantized trees).
	enum b3TraversalMode
	{
		TRAVERSAL_STACKLESS = 0,
		TRAVERSAL_STACKLESS_CACHE_FRIENDLY,
		TRAVERSAL_RECURSIVE
	};

	b3Vector3 m_bvhAabbMin;       //world-space bounds covered by the quantization grid
	b3Vector3 m_bvhAabbMax;
	b3Vector3 m_bvhQuantization;  //per-axis scale mapping world space into 16-bit quantized space

protected:
	int m_bulletVersion;  //for serialization versioning. It could also be used to detect endianess.

	int m_curNodeIndex;
	//quantization data
	bool m_useQuantization;  //selects between the quantized and the full-precision node arrays below

	NodeArray m_leafNodes;
	NodeArray m_contiguousNodes;
	QuantizedNodeArray m_quantizedLeafNodes;
	QuantizedNodeArray m_quantizedContiguousNodes;

	b3TraversalMode m_traversalMode;
	BvhSubtreeInfoArray m_SubtreeHeaders;

	//This is only used for serialization so we don't have to add serialization directly to b3AlignedObjectArray
	mutable int m_subtreeHeaderCount;

	///two versions, one for quantized and normal nodes. This allows code-reuse while maintaining readability (no template/macro!)
	///this might be refactored into a virtual, it is usually not calculated at run-time
	void setInternalNodeAabbMin(int nodeIndex, const b3Vector3& aabbMin)
	{
		if (m_useQuantization)
		{
			quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0], aabbMin, 0);
		}
		else
		{
			m_contiguousNodes[nodeIndex].m_aabbMinOrg = aabbMin;
		}
	}
	void setInternalNodeAabbMax(int nodeIndex, const b3Vector3& aabbMax)
	{
		if (m_useQuantization)
		{
			quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0], aabbMax, 1);
		}
		else
		{
			m_contiguousNodes[nodeIndex].m_aabbMaxOrg = aabbMax;
		}
	}

	//NOTE: the getters below read from the LEAF node arrays, while the setters above write the contiguous (internal) arrays.
	b3Vector3 getAabbMin(int nodeIndex) const
	{
		if (m_useQuantization)
		{
			return unQuantize(&m_quantizedLeafNodes[nodeIndex].m_quantizedAabbMin[0]);
		}
		//non-quantized
		return m_leafNodes[nodeIndex].m_aabbMinOrg;
	}
	b3Vector3 getAabbMax(int nodeIndex) const
	{
		if (m_useQuantization)
		{
			return unQuantize(&m_quantizedLeafNodes[nodeIndex].m_quantizedAabbMax[0]);
		}
		//non-quantized
		return m_leafNodes[nodeIndex].m_aabbMaxOrg;
	}

	void setInternalNodeEscapeIndex(int nodeIndex, int escapeIndex)
	{
		if (m_useQuantization)
		{
			//quantized nodes store the escape index negated, to distinguish it from a leaf triangle index
			m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = -escapeIndex;
		}
		else
		{
			m_contiguousNodes[nodeIndex].m_escapeIndex = escapeIndex;
		}
	}

	///Grow the node's AABB so it also encloses [newAabbMin, newAabbMax].
	void mergeInternalNodeAabb(int nodeIndex, const b3Vector3& newAabbMin, const b3Vector3& newAabbMax)
	{
		if (m_useQuantization)
		{
			unsigned short int quantizedAabbMin[3];
			unsigned short int quantizedAabbMax[3];
			quantize(quantizedAabbMin, newAabbMin, 0);
			quantize(quantizedAabbMax, newAabbMax, 1);
			for (int i = 0; i < 3; i++)
			{
				if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] > quantizedAabbMin[i])
					m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] = quantizedAabbMin[i];

				if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] < quantizedAabbMax[i])
					m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] = quantizedAabbMax[i];
			}
		}
		else
		{
			//non-quantized
			m_contiguousNodes[nodeIndex].m_aabbMinOrg.setMin(newAabbMin);
			m_contiguousNodes[nodeIndex].m_aabbMaxOrg.setMax(newAabbMax);
		}
	}

	void swapLeafNodes(int firstIndex, int secondIndex);

	void assignInternalNodeFromLeafNode(int internalNode, int leafNodeIndex);

protected:
	void buildTree(int startIndex, int endIndex);

	int calcSplittingAxis(int startIndex, int endIndex);

	int sortAndCalcSplittingIndex(int startIndex, int endIndex, int splitAxis);

	void walkStacklessTree(b3NodeOverlapCallback * nodeCallback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;

	void walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex, int endNodeIndex) const;
	void walkStacklessQuantizedTree(b3NodeOverlapCallback * nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax, int startNodeIndex, int endNodeIndex) const;
	void walkStacklessTreeAgainstRay(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex, int endNodeIndex) const;

	///tree traversal designed for small-memory processors like PS3 SPU
	void walkStacklessQuantizedTreeCacheFriendly(b3NodeOverlapCallback * nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax) const;

	///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal
	void walkRecursiveQuantizedTreeAgainstQueryAabb(const b3QuantizedBvhNode* currentNode, b3NodeOverlapCallback* nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax) const;

	///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal
	void walkRecursiveQuantizedTreeAgainstQuantizedTree(const b3QuantizedBvhNode* treeNodeA, const b3QuantizedBvhNode* treeNodeB, b3NodeOverlapCallback* nodeCallback) const;

	void updateSubtreeHeaders(int leftChildNodexIndex, int rightChildNodexIndex);

public:
	B3_DECLARE_ALIGNED_ALLOCATOR();

	b3QuantizedBvh();

	virtual ~b3QuantizedBvh();

	///***************************************** expert/internal use only *************************
	void setQuantizationValues(const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax, b3Scalar quantizationMargin = b3Scalar(1.0));
	QuantizedNodeArray& getLeafNodeArray() { return m_quantizedLeafNodes; }
	///buildInternal is expert use only: assumes that setQuantizationValues and LeafNodeArray are initialized
	void buildInternal();
	///***************************************** expert/internal use only *************************

	void reportAabbOverlappingNodex(b3NodeOverlapCallback * nodeCallback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;
	void reportRayOverlappingNodex(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget) const;
	void reportBoxCastOverlappingNodex(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;

	///Quantize 'point' into 16-bit grid coordinates. The point MUST lie inside [m_bvhAabbMin, m_bvhAabbMax]
	///(asserted below); use quantizeWithClamp for unclamped input.
	B3_FORCE_INLINE void quantize(unsigned short* out, const b3Vector3& point, int isMax) const
	{
		b3Assert(m_useQuantization);

		b3Assert(point.getX() <= m_bvhAabbMax.getX());
		b3Assert(point.getY() <= m_bvhAabbMax.getY());
		b3Assert(point.getZ() <= m_bvhAabbMax.getZ());

		b3Assert(point.getX() >= m_bvhAabbMin.getX());
		b3Assert(point.getY() >= m_bvhAabbMin.getY());
		b3Assert(point.getZ() >= m_bvhAabbMin.getZ());

		b3Vector3 v = (point - m_bvhAabbMin) * m_bvhQuantization;
		///Make sure rounding is done in a way that unQuantize(quantizeWithClamp(...)) is conservative
		///end-points always set the first bit, so that they are sorted properly (so that neighbouring AABBs overlap properly)
		///@todo: double-check this
		if (isMax)
		{
			//max bounds: round up and force the low bit to 1
			out[0] = (unsigned short)(((unsigned short)(v.getX() + b3Scalar(1.)) | 1));
			out[1] = (unsigned short)(((unsigned short)(v.getY() + b3Scalar(1.)) | 1));
			out[2] = (unsigned short)(((unsigned short)(v.getZ() + b3Scalar(1.)) | 1));
		}
		else
		{
			//min bounds: truncate and force the low bit to 0
			out[0] = (unsigned short)(((unsigned short)(v.getX()) & 0xfffe));
			out[1] = (unsigned short)(((unsigned short)(v.getY()) & 0xfffe));
			out[2] = (unsigned short)(((unsigned short)(v.getZ()) & 0xfffe));
		}

#ifdef DEBUG_CHECK_DEQUANTIZATION
		b3Vector3 newPoint = unQuantize(out);
		if (isMax)
		{
			if (newPoint.getX() < point.getX())
			{
				printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n", newPoint.getX() - point.getX(), newPoint.getX(), point.getX());
			}
			if (newPoint.getY() < point.getY())
			{
				printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n", newPoint.getY() - point.getY(), newPoint.getY(), point.getY());
			}
			if (newPoint.getZ() < point.getZ())
			{
				printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n", newPoint.getZ() - point.getZ(), newPoint.getZ(), point.getZ());
			}
		}
		else
		{
			if (newPoint.getX() > point.getX())
			{
				printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n", newPoint.getX() - point.getX(), newPoint.getX(), point.getX());
			}
			if (newPoint.getY() > point.getY())
			{
				printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n", newPoint.getY() - point.getY(), newPoint.getY(), point.getY());
			}
			if (newPoint.getZ() > point.getZ())
			{
				printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n", newPoint.getZ() - point.getZ(), newPoint.getZ(), point.getZ());
			}
		}
#endif  //DEBUG_CHECK_DEQUANTIZATION
	}

	///Like quantize(), but first clamps the point into the BVH bounds, so any input is valid.
	B3_FORCE_INLINE void quantizeWithClamp(unsigned short* out, const b3Vector3& point2, int isMax) const
	{
		b3Assert(m_useQuantization);

		b3Vector3 clampedPoint(point2);
		clampedPoint.setMax(m_bvhAabbMin);
		clampedPoint.setMin(m_bvhAabbMax);

		quantize(out, clampedPoint, isMax);
	}

	///Inverse of quantize(): maps 16-bit grid coordinates back to a world-space position.
	B3_FORCE_INLINE b3Vector3 unQuantize(const unsigned short* vecIn) const
	{
		b3Vector3 vecOut;
		vecOut.setValue(
			(b3Scalar)(vecIn[0]) / (m_bvhQuantization.getX()),
			(b3Scalar)(vecIn[1]) / (m_bvhQuantization.getY()),
			(b3Scalar)(vecIn[2]) / (m_bvhQuantization.getZ()));
		vecOut += m_bvhAabbMin;
		return vecOut;
	}

	///setTraversalMode let's you choose between stackless, recursive or stackless cache friendly tree traversal. Note this is only implemented for quantized trees.
	void setTraversalMode(b3TraversalMode traversalMode)
	{
		m_traversalMode = traversalMode;
	}

	B3_FORCE_INLINE QuantizedNodeArray& getQuantizedNodeArray()
	{
		return m_quantizedContiguousNodes;
	}

	B3_FORCE_INLINE BvhSubtreeInfoArray& getSubtreeInfoArray()
	{
		return m_SubtreeHeaders;
	}

	////////////////////////////////////////////////////////////////////

	/////Calculate space needed to store BVH for serialization
	unsigned calculateSerializeBufferSize() const;

	/// Data buffer MUST be 16 byte aligned
	virtual bool serialize(void* o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const;

	///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
	static b3QuantizedBvh* deSerializeInPlace(void* i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);

	static unsigned int getAlignmentSerializationPadding();
	//////////////////////////////////////////////////////////////////////

	virtual int calculateSerializeBufferSizeNew() const;

	///fills the dataBuffer and returns the struct name (and 0 on failure)
	virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const;

	virtual void deSerializeFloat(struct b3QuantizedBvhFloatData & quantizedBvhFloatData);

	virtual void deSerializeDouble(struct b3QuantizedBvhDoubleData & quantizedBvhDoubleData);

	////////////////////////////////////////////////////////////////////

	B3_FORCE_INLINE bool isQuantized()
	{
		return m_useQuantization;
	}

private:
	// Special "copy" constructor that allows for in-place deserialization
	// Prevents b3Vector3's default constructor from being called, but doesn't inialize much else
	// ownsMemory should most likely be false if deserializing, and if you are not, don't call this (it also changes the function signature, which we need)
	b3QuantizedBvh(b3QuantizedBvh & other, bool ownsMemory);
};
///Float-precision serialization layout for b3OptimizedBvhNode.
///NOTE(review): presumably must stay in sync with the serialized DNA — do not reorder fields.
struct b3OptimizedBvhNodeFloatData
{
	b3Vector3FloatData m_aabbMinOrg;
	b3Vector3FloatData m_aabbMaxOrg;
	int m_escapeIndex;
	int m_subPart;
	int m_triangleIndex;
	char m_pad[4];
};
///Double-precision serialization layout for b3OptimizedBvhNode (mirrors the float variant above).
struct b3OptimizedBvhNodeDoubleData
{
	b3Vector3DoubleData m_aabbMinOrg;
	b3Vector3DoubleData m_aabbMaxOrg;
	int m_escapeIndex;
	int m_subPart;
	int m_triangleIndex;
	char m_pad[4];
};
///Float-precision serialization snapshot of a b3QuantizedBvh (see deSerializeFloat).
struct b3QuantizedBvhFloatData
{
	b3Vector3FloatData m_bvhAabbMin;
	b3Vector3FloatData m_bvhAabbMax;
	b3Vector3FloatData m_bvhQuantization;
	int m_curNodeIndex;
	int m_useQuantization;
	int m_numContiguousLeafNodes;
	int m_numQuantizedContiguousNodes;
	b3OptimizedBvhNodeFloatData* m_contiguousNodesPtr;
	b3QuantizedBvhNodeData* m_quantizedContiguousNodesPtr;
	b3BvhSubtreeInfoData* m_subTreeInfoPtr;
	int m_traversalMode;
	int m_numSubtreeHeaders;
};
///Double-precision serialization snapshot of a b3QuantizedBvh (see deSerializeDouble).
///NOTE(review): field order intentionally differs slightly from the float variant
///(m_subTreeInfoPtr is last here) — do not "fix" without updating the serialized layout.
struct b3QuantizedBvhDoubleData
{
	b3Vector3DoubleData m_bvhAabbMin;
	b3Vector3DoubleData m_bvhAabbMax;
	b3Vector3DoubleData m_bvhQuantization;
	int m_curNodeIndex;
	int m_useQuantization;
	int m_numContiguousLeafNodes;
	int m_numQuantizedContiguousNodes;
	b3OptimizedBvhNodeDoubleData* m_contiguousNodesPtr;
	b3QuantizedBvhNodeData* m_quantizedContiguousNodesPtr;
	int m_traversalMode;
	int m_numSubtreeHeaders;
	b3BvhSubtreeInfoData* m_subTreeInfoPtr;
};
///Size in bytes of the serialization struct (b3QuantizedBvhData resolves to the
///float or double variant depending on the build's precision).
B3_FORCE_INLINE int b3QuantizedBvh::calculateSerializeBufferSizeNew() const
{
	return sizeof(b3QuantizedBvhData);
}
#endif //B3_QUANTIZED_BVH_H

View file

@ -1,207 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3StridingMeshInterface.h"
///Out-of-line virtual destructor (anchors the vtable in this translation unit).
b3StridingMeshInterface::~b3StridingMeshInterface()
{
}
void b3StridingMeshInterface::InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const
{
(void)aabbMin;
(void)aabbMax;
int numtotalphysicsverts = 0;
int part, graphicssubparts = getNumSubParts();
const unsigned char* vertexbase;
const unsigned char* indexbase;
int indexstride;
PHY_ScalarType type;
PHY_ScalarType gfxindextype;
int stride, numverts, numtriangles;
int gfxindex;
b3Vector3 triangle[3];
b3Vector3 meshScaling = getScaling();
///if the number of parts is big, the performance might drop due to the innerloop switch on indextype
for (part = 0; part < graphicssubparts; part++)
{
getLockedReadOnlyVertexIndexBase(&vertexbase, numverts, type, stride, &indexbase, indexstride, numtriangles, gfxindextype, part);
numtotalphysicsverts += numtriangles * 3; //upper bound
///unlike that developers want to pass in double-precision meshes in single-precision Bullet build
///so disable this feature by default
///see patch http://code.google.com/p/bullet/issues/detail?id=213
switch (type)
{
case PHY_FLOAT:
{
float* graphicsbase;
switch (gfxindextype)
{
case PHY_INTEGER:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned int* tri_indices = (unsigned int*)(indexbase + gfxindex * indexstride);
graphicsbase = (float*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
case PHY_SHORT:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned short int* tri_indices = (unsigned short int*)(indexbase + gfxindex * indexstride);
graphicsbase = (float*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
case PHY_UCHAR:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned char* tri_indices = (unsigned char*)(indexbase + gfxindex * indexstride);
graphicsbase = (float*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
default:
b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));
}
break;
}
case PHY_DOUBLE:
{
double* graphicsbase;
switch (gfxindextype)
{
case PHY_INTEGER:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned int* tri_indices = (unsigned int*)(indexbase + gfxindex * indexstride);
graphicsbase = (double*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
case PHY_SHORT:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned short int* tri_indices = (unsigned short int*)(indexbase + gfxindex * indexstride);
graphicsbase = (double*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
case PHY_UCHAR:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned char* tri_indices = (unsigned char*)(indexbase + gfxindex * indexstride);
graphicsbase = (double*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
default:
b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));
}
break;
}
default:
b3Assert((type == PHY_FLOAT) || (type == PHY_DOUBLE));
}
unLockReadOnlyVertexBase(part);
}
}
///Computes a tight AABB for the whole mesh by visiting every triangle once.
///On return, aabbMin/aabbMax hold the accumulated bounds.
void b3StridingMeshInterface::calculateAabbBruteForce(b3Vector3& aabbMin, b3Vector3& aabbMax)
{
	//local visitor that grows its bounds with every triangle it sees
	struct BoundsAccumulator : public b3InternalTriangleIndexCallback
	{
		b3Vector3 m_aabbMin;
		b3Vector3 m_aabbMax;

		BoundsAccumulator()
		{
			//start inverted so the first vertex initializes the bounds
			m_aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
			m_aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
		}

		virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex)
		{
			(void)partId;
			(void)triangleIndex;
			for (int v = 0; v < 3; v++)
			{
				m_aabbMin.setMin(triangle[v]);
				m_aabbMax.setMax(triangle[v]);
			}
		}
	};

	//first calculate the total aabb for all triangles
	BoundsAccumulator bounds;

	//pass an unbounded query AABB so that every triangle is visited
	aabbMin.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
	aabbMax.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));

	InternalProcessAllTriangles(&bounds, aabbMin, aabbMax);

	aabbMin = bounds.m_aabbMin;
	aabbMax = bounds.m_aabbMax;
}

View file

@ -1,158 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_STRIDING_MESHINTERFACE_H
#define B3_STRIDING_MESHINTERFACE_H
#include "Bullet3Common/b3Vector3.h"
#include "b3TriangleCallback.h"
//#include "b3ConcaveShape.h"
///Element types for vertex and index streams passed through b3StridingMeshInterface.
enum PHY_ScalarType
{
	PHY_FLOAT,         //32-bit float vertices
	PHY_DOUBLE,        //64-bit double vertices
	PHY_INTEGER,       //32-bit indices
	PHY_SHORT,         //16-bit indices
	PHY_FIXEDPOINT88,  //8.8 fixed point
	PHY_UCHAR          //8-bit indices
};
/// The b3StridingMeshInterface is the interface class for high performance generic access to triangle meshes, used in combination with b3BvhTriangleMeshShape and some other collision shapes.
/// Using index striding of 3*sizeof(integer) it can use triangle arrays, using index striding of 1*sizeof(integer) it can handle triangle strips.
/// It allows for sharing graphics and collision meshes. Also it provides locking/unlocking of graphics meshes that are in gpu memory.
B3_ATTRIBUTE_ALIGNED16(class)
b3StridingMeshInterface
{
protected:
	b3Vector3 m_scaling;  //per-axis scale applied to vertices when triangles are processed

public:
	B3_DECLARE_ALIGNED_ALLOCATOR();

	b3StridingMeshInterface() : m_scaling(b3MakeVector3(b3Scalar(1.), b3Scalar(1.), b3Scalar(1.)))
	{
	}

	virtual ~b3StridingMeshInterface();

	///Visits all triangles of all subparts via 'callback' (the AABB arguments are
	///passed through but this base implementation does not cull with them).
	virtual void InternalProcessAllTriangles(b3InternalTriangleIndexCallback * callback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;

	///brute force method to calculate aabb
	void calculateAabbBruteForce(b3Vector3 & aabbMin, b3Vector3 & aabbMax);

	/// get read and write access to a subpart of a triangle mesh
	/// this subpart has a continuous array of vertices and indices
	/// in this way the mesh can be handled as chunks of memory with striding
	/// very similar to OpenGL vertexarray support
	/// make a call to unLockVertexBase when the read and write access is finished
	virtual void getLockedVertexIndexBase(unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& stride, unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0) = 0;

	virtual void getLockedReadOnlyVertexIndexBase(const unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& stride, const unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0) const = 0;

	/// unLockVertexBase finishes the access to a subpart of the triangle mesh
	/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
	virtual void unLockVertexBase(int subpart) = 0;

	virtual void unLockReadOnlyVertexBase(int subpart) const = 0;

	/// getNumSubParts returns the number of seperate subparts
	/// each subpart has a continuous array of vertices and indices
	virtual int getNumSubParts() const = 0;

	virtual void preallocateVertices(int numverts) = 0;
	virtual void preallocateIndices(int numindices) = 0;

	///optional cached AABB support; base implementation stores nothing
	virtual bool hasPremadeAabb() const { return false; }
	virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax) const
	{
		(void)aabbMin;
		(void)aabbMax;
	}
	virtual void getPremadeAabb(b3Vector3 * aabbMin, b3Vector3 * aabbMax) const
	{
		(void)aabbMin;
		(void)aabbMax;
	}

	const b3Vector3& getScaling() const
	{
		return m_scaling;
	}
	void setScaling(const b3Vector3& scaling)
	{
		m_scaling = scaling;
	}

	virtual int calculateSerializeBufferSize() const;

	///fills the dataBuffer and returns the struct name (and 0 on failure)
	//virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const;
};
///Serialization wrapper for a single 32-bit index.
struct b3IntIndexData
{
	int m_value;
};
///Serialization wrapper for a single 16-bit index (padded to 4 bytes).
struct b3ShortIntIndexData
{
	short m_value;
	char m_pad[2];
};
///Serialization wrapper for one triangle's three 16-bit indices (padded to 8 bytes).
struct b3ShortIntIndexTripletData
{
	short m_values[3];
	char m_pad[2];
};
///Serialization wrapper for one triangle's three 8-bit indices (padded to 4 bytes).
struct b3CharIndexTripletData
{
	unsigned char m_values[3];
	char m_pad;
};
///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
///One mesh subpart: exactly one vertex pointer (float or double) and one index
///pointer (32/16/8-bit) is expected to be non-null.
struct b3MeshPartData
{
	b3Vector3FloatData* m_vertices3f;
	b3Vector3DoubleData* m_vertices3d;

	b3IntIndexData* m_indices32;
	b3ShortIntIndexTripletData* m_3indices16;
	b3CharIndexTripletData* m_3indices8;

	b3ShortIntIndexData* m_indices16;  //backwards compatibility

	int m_numTriangles;  //length of m_indices = m_numTriangles
	int m_numVertices;
};
///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
struct b3StridingMeshInterfaceData
{
	b3MeshPartData* m_meshPartsPtr;  //array of m_numMeshParts subparts
	b3Vector3FloatData m_scaling;
	int m_numMeshParts;
	char m_padding[4];
};
///Size in bytes of the serialization header struct for this interface.
B3_FORCE_INLINE int b3StridingMeshInterface::calculateSerializeBufferSize() const
{
	return sizeof(b3StridingMeshInterfaceData);
}
#endif //B3_STRIDING_MESHINTERFACE_H

View file

@ -1,34 +0,0 @@
#ifndef B3_SUPPORT_MAPPINGS_H
#define B3_SUPPORT_MAPPINGS_H
#include "Bullet3Common/b3Transform.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "b3VectorFloat4.h"
struct b3GjkPairDetector;
///Returns the hull vertex with the largest dot product against 'supportVec'
///(the support point of the convex hull in that direction), or the zero vector
///for an empty hull.
///NOTE(review): the 'margin' parameter is accepted but not applied here — confirm
///callers expect the un-inflated support point.
inline b3Vector3 localGetSupportVertexWithMargin(const float4& supportVec, const struct b3ConvexPolyhedronData* hull,
												 const b3AlignedObjectArray<b3Vector3>& verticesA, b3Scalar margin)
{
	// Here we take advantage of dot(a, b*c) = dot(a*b, c). Note: This is true mathematically, but not numerically.
	if (hull->m_numVertices > 0)
	{
		b3Scalar bestDot = b3Scalar(-B3_LARGE_FLOAT);
		const b3Vector3 dir = supportVec;
		const int bestIndex = (int)dir.maxDot(&verticesA[hull->m_vertexOffset], hull->m_numVertices, bestDot);
		return verticesA[hull->m_vertexOffset + bestIndex];
	}

	return b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
}
///Convenience wrapper: support point with a zero margin.
inline b3Vector3 localGetSupportVertexWithoutMargin(const float4& supportVec, const struct b3ConvexPolyhedronData* hull,
													const b3AlignedObjectArray<b3Vector3>& verticesA)
{
	return localGetSupportVertexWithMargin(supportVec, hull, verticesA, 0.f);
}
#endif //B3_SUPPORT_MAPPINGS_H

View file

@ -1,24 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3TriangleCallback.h"
///Out-of-line virtual destructor (anchors the vtable in this translation unit).
b3TriangleCallback::~b3TriangleCallback()
{
}
///Out-of-line virtual destructor (anchors the vtable in this translation unit).
b3InternalTriangleIndexCallback::~b3InternalTriangleIndexCallback()
{
}

View file

@ -1,37 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_TRIANGLE_CALLBACK_H
#define B3_TRIANGLE_CALLBACK_H
#include "Bullet3Common/b3Vector3.h"
///The b3TriangleCallback provides a callback for each overlapping triangle when calling processAllTriangles.
///This callback is called by processAllTriangles for all b3ConcaveShape derived class, such as b3BvhTriangleMeshShape, b3StaticPlaneShape and b3HeightfieldTerrainShape.
class b3TriangleCallback
{
public:
	virtual ~b3TriangleCallback();
	///'triangle' points at the three world/local-space vertices of one triangle.
	virtual void processTriangle(b3Vector3* triangle, int partId, int triangleIndex) = 0;
};
///Internal per-triangle visitor used by b3StridingMeshInterface::InternalProcessAllTriangles.
class b3InternalTriangleIndexCallback
{
public:
	virtual ~b3InternalTriangleIndexCallback();
	///'triangle' holds the three scaled vertices; (partId, triangleIndex) identify the source triangle.
	virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex) = 0;
};
#endif //B3_TRIANGLE_CALLBACK_H

View file

@ -1,90 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3TriangleIndexVertexArray.h"
///Convenience constructor: wraps one raw triangle-index/vertex pair as a
///single-subpart indexed mesh. No cached AABB yet (m_hasAabb = 0).
b3TriangleIndexVertexArray::b3TriangleIndexVertexArray(int numTriangles, int* triangleIndexBase, int triangleIndexStride, int numVertices, b3Scalar* vertexBase, int vertexStride)
	: m_hasAabb(0)
{
	b3IndexedMesh singlePart;

	//index stream
	singlePart.m_numTriangles = numTriangles;
	singlePart.m_triangleIndexBase = (const unsigned char*)triangleIndexBase;
	singlePart.m_triangleIndexStride = triangleIndexStride;

	//vertex stream
	singlePart.m_numVertices = numVertices;
	singlePart.m_vertexBase = (const unsigned char*)vertexBase;
	singlePart.m_vertexStride = vertexStride;

	addIndexedMesh(singlePart);
}
///The array does not own the index/vertex memory it references, so nothing to free.
b3TriangleIndexVertexArray::~b3TriangleIndexVertexArray()
{
}
void b3TriangleIndexVertexArray::getLockedVertexIndexBase(unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart)
{
b3Assert(subpart < getNumSubParts());
b3IndexedMesh& mesh = m_indexedMeshes[subpart];
numverts = mesh.m_numVertices;
(*vertexbase) = (unsigned char*)mesh.m_vertexBase;
type = mesh.m_vertexType;
vertexStride = mesh.m_vertexStride;
numfaces = mesh.m_numTriangles;
(*indexbase) = (unsigned char*)mesh.m_triangleIndexBase;
indexstride = mesh.m_triangleIndexStride;
indicestype = mesh.m_indexType;
}
void b3TriangleIndexVertexArray::getLockedReadOnlyVertexIndexBase(const unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, const unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart) const
{
const b3IndexedMesh& mesh = m_indexedMeshes[subpart];
numverts = mesh.m_numVertices;
(*vertexbase) = (const unsigned char*)mesh.m_vertexBase;
type = mesh.m_vertexType;
vertexStride = mesh.m_vertexStride;
numfaces = mesh.m_numTriangles;
(*indexbase) = (const unsigned char*)mesh.m_triangleIndexBase;
indexstride = mesh.m_triangleIndexStride;
indicestype = mesh.m_indexType;
}
// True once setPremadeAabb has cached bounds (m_hasAabb is an int flag, see header).
bool b3TriangleIndexVertexArray::hasPremadeAabb() const
{
	return (m_hasAabb == 1);
}
// Caches a precomputed AABB so later queries can skip recomputation.
// Members are mutable, hence the const qualifier.
void b3TriangleIndexVertexArray::setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax) const
{
	m_aabbMin = aabbMin;
	m_aabbMax = aabbMax;
	m_hasAabb = 1;  // this is intentionally an int see notes in header
}
// Returns the cached AABB. Only meaningful after hasPremadeAabb() is true;
// otherwise the out-params receive default-initialized vectors.
void b3TriangleIndexVertexArray::getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax) const
{
	*aabbMin = m_aabbMin;
	*aabbMax = m_aabbMax;
}

View file

@ -1,128 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_TRIANGLE_INDEX_VERTEX_ARRAY_H
#define B3_TRIANGLE_INDEX_VERTEX_ARRAY_H
#include "b3StridingMeshInterface.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Common/b3Scalar.h"
///The b3IndexedMesh indexes a single vertex and index array. Multiple b3IndexedMesh objects can be passed into a b3TriangleIndexVertexArray using addIndexedMesh.
///Instead of the number of indices, we pass the number of triangles.
///All pointers are non-owning views into caller-managed memory.
B3_ATTRIBUTE_ALIGNED16(struct)
b3IndexedMesh
{
	B3_DECLARE_ALIGNED_ALLOCATOR();

	int m_numTriangles;
	const unsigned char* m_triangleIndexBase;
	// Size in byte of the indices for one triangle (3*sizeof(index_type) if the indices are tightly packed)
	int m_triangleIndexStride;
	int m_numVertices;
	const unsigned char* m_vertexBase;
	// Size of a vertex, in bytes
	int m_vertexStride;

	// The index type is set when adding an indexed mesh to the
	// b3TriangleIndexVertexArray, do not set it manually
	PHY_ScalarType m_indexType;

	// The vertex type has a default type similar to Bullet's precision mode (float or double)
	// but can be set manually if you for example run Bullet with double precision but have
	// mesh data in single precision..
	PHY_ScalarType m_vertexType;

	b3IndexedMesh()
		: m_indexType(PHY_INTEGER),
#ifdef B3_USE_DOUBLE_PRECISION
		  m_vertexType(PHY_DOUBLE)
#else  // B3_USE_DOUBLE_PRECISION
		  m_vertexType(PHY_FLOAT)
#endif  // B3_USE_DOUBLE_PRECISION
	{
	}
};
typedef b3AlignedObjectArray<b3IndexedMesh> IndexedMeshArray;
///The b3TriangleIndexVertexArray allows to access multiple triangle meshes, by indexing into existing triangle/index arrays.
///Additional meshes can be added using addIndexedMesh
///No duplcate is made of the vertex/index data, it only indexes into external vertex/index arrays.
///So keep those arrays around during the lifetime of this b3TriangleIndexVertexArray.
B3_ATTRIBUTE_ALIGNED16(class)
b3TriangleIndexVertexArray : public b3StridingMeshInterface
{
protected:
	IndexedMeshArray m_indexedMeshes;
	int m_pad[2];
	mutable int m_hasAabb;  // using int instead of bool to maintain alignment
	mutable b3Vector3 m_aabbMin;
	mutable b3Vector3 m_aabbMax;

public:
	B3_DECLARE_ALIGNED_ALLOCATOR();

	b3TriangleIndexVertexArray() : m_hasAabb(0)
	{
	}

	virtual ~b3TriangleIndexVertexArray();

	//just to be backwards compatible
	b3TriangleIndexVertexArray(int numTriangles, int* triangleIndexBase, int triangleIndexStride, int numVertices, b3Scalar* vertexBase, int vertexStride);

	/// Appends a mesh part; the part's index type is overwritten with indexType.
	void addIndexedMesh(const b3IndexedMesh& mesh, PHY_ScalarType indexType = PHY_INTEGER)
	{
		m_indexedMeshes.push_back(mesh);
		m_indexedMeshes[m_indexedMeshes.size() - 1].m_indexType = indexType;
	}

	virtual void getLockedVertexIndexBase(unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0);

	virtual void getLockedReadOnlyVertexIndexBase(const unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, const unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0) const;

	/// unLockVertexBase finishes the access to a subpart of the triangle mesh
	/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
	/// No-op here: the data is plain memory, nothing to lock.
	virtual void unLockVertexBase(int subpart) { (void)subpart; }

	virtual void unLockReadOnlyVertexBase(int subpart) const { (void)subpart; }

	/// getNumSubParts returns the number of seperate subparts
	/// each subpart has a continuous array of vertices and indices
	virtual int getNumSubParts() const
	{
		return (int)m_indexedMeshes.size();
	}

	IndexedMeshArray& getIndexedMeshArray()
	{
		return m_indexedMeshes;
	}

	const IndexedMeshArray& getIndexedMeshArray() const
	{
		return m_indexedMeshes;
	}

	// Preallocation hints are ignored: this class never copies mesh data.
	virtual void preallocateVertices(int numverts) { (void)numverts; }
	virtual void preallocateIndices(int numindices) { (void)numindices; }

	virtual bool hasPremadeAabb() const;
	virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;
	virtual void getPremadeAabb(b3Vector3 * aabbMin, b3Vector3 * aabbMax) const;
};
#endif //B3_TRIANGLE_INDEX_VERTEX_ARRAY_H

View file

@ -1,10 +0,0 @@
#ifndef B3_VECTOR_FLOAT4_H
#define B3_VECTOR_FLOAT4_H
#include "Bullet3Common/b3Transform.h"
//#define cross3(a,b) (a.cross(b))
#define float4 b3Vector3
//#define make_float4(x,y,z,w) b3Vector4(x,y,z,w)
#endif //B3_VECTOR_FLOAT4_H

View file

@ -1,574 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Elsevier CDROM license agreements grants nonexclusive license to use the software
for any purpose, commercial or non-commercial as long as the following credit is included
identifying the original source of the software:
Parts of the source are "from the book Real-Time Collision Detection by
Christer Ericson, published by Morgan Kaufmann Publishers,
(c) 2005 Elsevier Inc."
*/
#include "b3VoronoiSimplexSolver.h"
#define VERTA 0
#define VERTB 1
#define VERTC 2
#define VERTD 3
#define B3_CATCH_DEGENERATE_TETRAHEDRON 1
/// Removes simplex vertex `index` in O(1) by swap-with-last: the final
/// vertex overwrites the removed slot and the count shrinks by one.
/// Remaining vertex order is not preserved.
void b3VoronoiSimplexSolver::removeVertex(int index)
{
	b3Assert(m_numVertices > 0);
	const int last = m_numVertices - 1;
	m_simplexVectorW[index] = m_simplexVectorW[last];
	m_simplexPointsP[index] = m_simplexPointsP[last];
	m_simplexPointsQ[index] = m_simplexPointsQ[last];
	m_numVertices = last;
}
/// Drops every simplex vertex not flagged as used by the latest closest-point
/// query. Slots are processed from highest (D) to lowest (A) so that the
/// swap-with-last removal never moves a slot that is still to be examined.
void b3VoronoiSimplexSolver::reduceVertices(const b3UsageBitfield& usedVerts)
{
	const bool keep[4] = {usedVerts.usedVertexA != 0, usedVerts.usedVertexB != 0,
						  usedVerts.usedVertexC != 0, usedVerts.usedVertexD != 0};
	for (int slot = 3; slot >= 0; slot--)
	{
		if ((numVertices() >= slot + 1) && !keep[slot])
			removeVertex(slot);
	}
}
//clear the simplex, remove all the vertices
//clear the simplex, remove all the vertices and invalidate cached results
void b3VoronoiSimplexSolver::reset()
{
	m_numVertices = 0;
	m_cachedValidClosest = false;
	m_needsUpdate = true;
	m_cachedBC.reset();
	// Sentinel "last witness" far away so inSimplex() cannot spuriously match.
	m_lastW = b3MakeVector3(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
}
//add a vertex
//add a vertex: w is the support point (p - q), p and q the witness points
//on the two objects. Marks the cached closest-point data stale.
void b3VoronoiSimplexSolver::addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q)
{
	const int slot = m_numVertices++;
	m_needsUpdate = true;
	m_lastW = w;
	m_simplexVectorW[slot] = w;
	m_simplexPointsP[slot] = p;
	m_simplexPointsQ[slot] = q;
}
/// Recomputes the point of the current simplex closest to the origin, the
/// matching witness points m_cachedP1/m_cachedP2 on each object, and
/// m_cachedV = P1 - P2. Results are cached; work is skipped unless
/// m_needsUpdate is set. Also shrinks the simplex to the vertices actually
/// supporting the closest feature (reduceVertices).
/// @return true if the cached barycentric coordinates are valid.
bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints()
{
	if (m_needsUpdate)
	{
		m_cachedBC.reset();

		m_needsUpdate = false;

		switch (numVertices())
		{
			case 0:
				// Empty simplex: nothing to project.
				m_cachedValidClosest = false;
				break;
			case 1:
			{
				// Single vertex: it is trivially the closest feature.
				m_cachedP1 = m_simplexPointsP[0];
				m_cachedP2 = m_simplexPointsQ[0];
				m_cachedV = m_cachedP1 - m_cachedP2;  //== m_simplexVectorW[0]
				m_cachedBC.reset();
				m_cachedBC.setBarycentricCoordinates(b3Scalar(1.), b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
				m_cachedValidClosest = m_cachedBC.isValid();
				break;
			};
			case 2:
			{
				//closest point origin from line segment
				// Project the origin onto segment [from,to]; t is clamped to
				// [0,1], and the usage bitfield records which endpoints support
				// the projection so the simplex can be reduced.
				const b3Vector3& from = m_simplexVectorW[0];
				const b3Vector3& to = m_simplexVectorW[1];
				b3Vector3 nearest;

				b3Vector3 p = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
				b3Vector3 diff = p - from;
				b3Vector3 v = to - from;
				b3Scalar t = v.dot(diff);

				if (t > 0)
				{
					b3Scalar dotVV = v.dot(v);
					if (t < dotVV)
					{
						// Interior of the segment.
						t /= dotVV;
						diff -= t * v;
						m_cachedBC.m_usedVertices.usedVertexA = true;
						m_cachedBC.m_usedVertices.usedVertexB = true;
					}
					else
					{
						t = 1;
						diff -= v;
						//reduce to 1 point
						m_cachedBC.m_usedVertices.usedVertexB = true;
					}
				}
				else
				{
					t = 0;
					//reduce to 1 point
					m_cachedBC.m_usedVertices.usedVertexA = true;
				}
				m_cachedBC.setBarycentricCoordinates(1 - t, t);
				nearest = from + t * v;

				m_cachedP1 = m_simplexPointsP[0] + t * (m_simplexPointsP[1] - m_simplexPointsP[0]);
				m_cachedP2 = m_simplexPointsQ[0] + t * (m_simplexPointsQ[1] - m_simplexPointsQ[0]);
				m_cachedV = m_cachedP1 - m_cachedP2;

				reduceVertices(m_cachedBC.m_usedVertices);

				m_cachedValidClosest = m_cachedBC.isValid();
				break;
			}
			case 3:
			{
				//closest point origin from triangle
				// Delegates region classification to closestPtPointTriangle,
				// then interpolates the witness points with the barycentric
				// coordinates it produced.
				b3Vector3 p = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));

				const b3Vector3& a = m_simplexVectorW[0];
				const b3Vector3& b = m_simplexVectorW[1];
				const b3Vector3& c = m_simplexVectorW[2];

				closestPtPointTriangle(p, a, b, c, m_cachedBC);
				m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] +
							 m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] +
							 m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2];

				m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] +
							 m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] +
							 m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2];

				m_cachedV = m_cachedP1 - m_cachedP2;

				reduceVertices(m_cachedBC.m_usedVertices);
				m_cachedValidClosest = m_cachedBC.isValid();

				break;
			}
			case 4:
			{
				// Tetrahedron: if the origin lies inside, the objects
				// penetrate and the closest vector is zero.
				b3Vector3 p = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));

				const b3Vector3& a = m_simplexVectorW[0];
				const b3Vector3& b = m_simplexVectorW[1];
				const b3Vector3& c = m_simplexVectorW[2];
				const b3Vector3& d = m_simplexVectorW[3];

				bool hasSeperation = closestPtPointTetrahedron(p, a, b, c, d, m_cachedBC);

				if (hasSeperation)
				{
					m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] +
								 m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] +
								 m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2] +
								 m_simplexPointsP[3] * m_cachedBC.m_barycentricCoords[3];

					m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] +
								 m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] +
								 m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2] +
								 m_simplexPointsQ[3] * m_cachedBC.m_barycentricCoords[3];

					m_cachedV = m_cachedP1 - m_cachedP2;
					reduceVertices(m_cachedBC.m_usedVertices);
				}
				else
				{
					//					printf("sub distance got penetration\n");

					if (m_cachedBC.m_degenerate)
					{
						// Degenerate (near-flat) tetrahedron: result unusable.
						m_cachedValidClosest = false;
					}
					else
					{
						m_cachedValidClosest = true;
						//degenerate case == false, penetration = true + zero
						m_cachedV.setValue(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
					}
					break;
				}

				m_cachedValidClosest = m_cachedBC.isValid();

				//closest point origin from tetrahedron
				break;
			}
			default:
			{
				m_cachedValidClosest = false;
			}
		};
	}

	return m_cachedValidClosest;
}
//return/calculate the closest vertex
//return/calculate the closest vector from the simplex to the origin
bool b3VoronoiSimplexSolver::closest(b3Vector3& v)
{
	const bool valid = updateClosestVectorAndPoints();
	v = m_cachedV;
	return valid;
}
/// Returns the largest squared length among the current simplex vertices
/// (0 for an empty simplex).
b3Scalar b3VoronoiSimplexSolver::maxVertex()
{
	b3Scalar longest2 = b3Scalar(0.);
	const int count = numVertices();
	for (int idx = 0; idx < count; idx++)
	{
		const b3Scalar len2 = m_simplexVectorW[idx].length2();
		if (len2 > longest2)
			longest2 = len2;
	}
	return longest2;
}
//return the current simplex
//return the current simplex: copies the support vectors into yBuf and the
//witness points into pBuf/qBuf; returns the vertex count. Buffers must hold
//at least numVertices() entries.
int b3VoronoiSimplexSolver::getSimplex(b3Vector3* pBuf, b3Vector3* qBuf, b3Vector3* yBuf) const
{
	const int count = numVertices();
	for (int idx = 0; idx < count; idx++)
	{
		yBuf[idx] = m_simplexVectorW[idx];
		pBuf[idx] = m_simplexPointsP[idx];
		qBuf[idx] = m_simplexPointsQ[idx];
	}
	return count;
}
bool b3VoronoiSimplexSolver::inSimplex(const b3Vector3& w)
{
bool found = false;
int i, numverts = numVertices();
//b3Scalar maxV = b3Scalar(0.);
//w is in the current (reduced) simplex
for (i = 0; i < numverts; i++)
{
#ifdef BT_USE_EQUAL_VERTEX_THRESHOLD
if (m_simplexVectorW[i].distance2(w) <= m_equalVertexThreshold)
#else
if (m_simplexVectorW[i] == w)
#endif
found = true;
}
//check in case lastW is already removed
if (w == m_lastW)
return true;
return found;
}
// Hands out the cached closest vector without recomputation; used as a
// fallback when the caller wants the last known-good result.
void b3VoronoiSimplexSolver::backup_closest(b3Vector3& v)
{
	v = m_cachedV;
}
// True when no vertices have been added (or all were removed).
bool b3VoronoiSimplexSolver::emptySimplex() const
{
	return (numVertices() == 0);
}
// Refreshes the cached closest-point data if stale, then returns the witness
// points on object 1 (p1) and object 2 (p2).
void b3VoronoiSimplexSolver::compute_points(b3Vector3& p1, b3Vector3& p2)
{
	updateClosestVectorAndPoints();
	p1 = m_cachedP1;
	p2 = m_cachedP2;
}
/// Closest point on triangle (a,b,c) to point p, via Voronoi-region
/// classification (the Ericson "Real-Time Collision Detection" formulation):
/// test the three vertex regions, the three edge regions, then fall through
/// to the face interior. Fills result with the closest point, barycentric
/// coordinates, and which vertices support the closest feature.
/// Always returns true.
bool b3VoronoiSimplexSolver::closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, b3SubSimplexClosestResult& result)
{
	result.m_usedVertices.reset();

	// Check if P in vertex region outside A
	b3Vector3 ab = b - a;
	b3Vector3 ac = c - a;
	b3Vector3 ap = p - a;
	b3Scalar d1 = ab.dot(ap);
	b3Scalar d2 = ac.dot(ap);
	if (d1 <= b3Scalar(0.0) && d2 <= b3Scalar(0.0))
	{
		result.m_closestPointOnSimplex = a;
		result.m_usedVertices.usedVertexA = true;
		result.setBarycentricCoordinates(1, 0, 0);
		return true;  // a; // barycentric coordinates (1,0,0)
	}

	// Check if P in vertex region outside B
	b3Vector3 bp = p - b;
	b3Scalar d3 = ab.dot(bp);
	b3Scalar d4 = ac.dot(bp);
	if (d3 >= b3Scalar(0.0) && d4 <= d3)
	{
		result.m_closestPointOnSimplex = b;
		result.m_usedVertices.usedVertexB = true;
		result.setBarycentricCoordinates(0, 1, 0);

		return true;  // b; // barycentric coordinates (0,1,0)
	}
	// Check if P in edge region of AB, if so return projection of P onto AB
	b3Scalar vc = d1 * d4 - d3 * d2;
	if (vc <= b3Scalar(0.0) && d1 >= b3Scalar(0.0) && d3 <= b3Scalar(0.0))
	{
		b3Scalar v = d1 / (d1 - d3);
		result.m_closestPointOnSimplex = a + v * ab;
		result.m_usedVertices.usedVertexA = true;
		result.m_usedVertices.usedVertexB = true;
		result.setBarycentricCoordinates(1 - v, v, 0);
		return true;
		//return a + v * ab; // barycentric coordinates (1-v,v,0)
	}

	// Check if P in vertex region outside C
	b3Vector3 cp = p - c;
	b3Scalar d5 = ab.dot(cp);
	b3Scalar d6 = ac.dot(cp);
	if (d6 >= b3Scalar(0.0) && d5 <= d6)
	{
		result.m_closestPointOnSimplex = c;
		result.m_usedVertices.usedVertexC = true;
		result.setBarycentricCoordinates(0, 0, 1);
		return true;  //c; // barycentric coordinates (0,0,1)
	}

	// Check if P in edge region of AC, if so return projection of P onto AC
	b3Scalar vb = d5 * d2 - d1 * d6;
	if (vb <= b3Scalar(0.0) && d2 >= b3Scalar(0.0) && d6 <= b3Scalar(0.0))
	{
		b3Scalar w = d2 / (d2 - d6);
		result.m_closestPointOnSimplex = a + w * ac;
		result.m_usedVertices.usedVertexA = true;
		result.m_usedVertices.usedVertexC = true;
		result.setBarycentricCoordinates(1 - w, 0, w);
		return true;
		//return a + w * ac; // barycentric coordinates (1-w,0,w)
	}

	// Check if P in edge region of BC, if so return projection of P onto BC
	b3Scalar va = d3 * d6 - d5 * d4;
	if (va <= b3Scalar(0.0) && (d4 - d3) >= b3Scalar(0.0) && (d5 - d6) >= b3Scalar(0.0))
	{
		b3Scalar w = (d4 - d3) / ((d4 - d3) + (d5 - d6));

		result.m_closestPointOnSimplex = b + w * (c - b);
		result.m_usedVertices.usedVertexB = true;
		result.m_usedVertices.usedVertexC = true;
		result.setBarycentricCoordinates(0, 1 - w, w);
		return true;
		// return b + w * (c - b); // barycentric coordinates (0,1-w,w)
	}

	// P inside face region. Compute Q through its barycentric coordinates (u,v,w)
	b3Scalar denom = b3Scalar(1.0) / (va + vb + vc);
	b3Scalar v = vb * denom;
	b3Scalar w = vc * denom;

	result.m_closestPointOnSimplex = a + ab * v + ac * w;
	result.m_usedVertices.usedVertexA = true;
	result.m_usedVertices.usedVertexB = true;
	result.m_usedVertices.usedVertexC = true;
	result.setBarycentricCoordinates(1 - v - w, v, w);

	return true;
	// return a + ab * v + ac * w; // = u*a + v*b + w*c, u = va * denom = b3Scalar(1.0) - v - w
}
/// Test if point p and d lie on opposite sides of plane through abc.
/// @return 1 if p and d are on opposite sides, 0 if on the same side,
///         -1 if the tetrahedron is (numerically) degenerate — d lies in or
///         nearly in the plane of abc, so the sign test is unreliable.
int b3VoronoiSimplexSolver::pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d)
{
	b3Vector3 normal = (b - a).cross(c - a);

	b3Scalar signp = (p - a).dot(normal);  // [AP AB AC]
	b3Scalar signd = (d - a).dot(normal);  // [AD AB AC]

#ifdef B3_CATCH_DEGENERATE_TETRAHEDRON
// Threshold scales with precision mode; compares squared value to stay sign-free.
#ifdef BT_USE_DOUBLE_PRECISION
	if (signd * signd < (b3Scalar(1e-8) * b3Scalar(1e-8)))
	{
		return -1;
	}
#else
	if (signd * signd < (b3Scalar(1e-4) * b3Scalar(1e-4)))
	{
		//		printf("affine dependent/degenerate\n");//
		return -1;
	}
#endif

#endif
	// Points on opposite sides if expression signs are opposite
	return signp * signd < b3Scalar(0.);
}
/// Closest point on tetrahedron (a,b,c,d) to point p. Tests each face whose
/// plane separates p from the opposite vertex, keeps the nearest candidate,
/// and remaps each face's triangle result (vertex usage + barycentric coords)
/// back into tetrahedron vertex order.
/// @return false if p is inside the tetrahedron (penetration) or the
///         tetrahedron is degenerate (finalResult.m_degenerate is then set);
///         true otherwise.
bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult)
{
	b3SubSimplexClosestResult tempResult;

	// Start out assuming point inside all halfspaces, so closest to itself
	finalResult.m_closestPointOnSimplex = p;
	finalResult.m_usedVertices.reset();
	finalResult.m_usedVertices.usedVertexA = true;
	finalResult.m_usedVertices.usedVertexB = true;
	finalResult.m_usedVertices.usedVertexC = true;
	finalResult.m_usedVertices.usedVertexD = true;

	int pointOutsideABC = pointOutsideOfPlane(p, a, b, c, d);
	int pointOutsideACD = pointOutsideOfPlane(p, a, c, d, b);
	int pointOutsideADB = pointOutsideOfPlane(p, a, d, b, c);
	int pointOutsideBDC = pointOutsideOfPlane(p, b, d, c, a);

	// Any -1 means a degenerate (near-flat) tetrahedron: bail out.
	if (pointOutsideABC < 0 || pointOutsideACD < 0 || pointOutsideADB < 0 || pointOutsideBDC < 0)
	{
		finalResult.m_degenerate = true;
		return false;
	}

	// Inside all four faces: penetration, no separating closest point.
	if (!pointOutsideABC && !pointOutsideACD && !pointOutsideADB && !pointOutsideBDC)
	{
		return false;
	}

	b3Scalar bestSqDist = FLT_MAX;
	// If point outside face abc then compute closest point on abc
	if (pointOutsideABC)
	{
		closestPtPointTriangle(p, a, b, c, tempResult);
		b3Vector3 q = tempResult.m_closestPointOnSimplex;

		b3Scalar sqDist = (q - p).dot(q - p);
		// Update best closest point if (squared) distance is less than current best
		if (sqDist < bestSqDist)
		{
			bestSqDist = sqDist;
			finalResult.m_closestPointOnSimplex = q;
			//convert result bitmask!
			finalResult.m_usedVertices.reset();
			finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA;
			finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexB;
			finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexC;
			finalResult.setBarycentricCoordinates(
				tempResult.m_barycentricCoords[VERTA],
				tempResult.m_barycentricCoords[VERTB],
				tempResult.m_barycentricCoords[VERTC],
				0);
		}
	}

	// Repeat test for face acd
	// (triangle verts A,B,C here correspond to tetrahedron verts a,c,d)
	if (pointOutsideACD)
	{
		closestPtPointTriangle(p, a, c, d, tempResult);
		b3Vector3 q = tempResult.m_closestPointOnSimplex;
		//convert result bitmask!

		b3Scalar sqDist = (q - p).dot(q - p);
		if (sqDist < bestSqDist)
		{
			bestSqDist = sqDist;
			finalResult.m_closestPointOnSimplex = q;
			finalResult.m_usedVertices.reset();
			finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA;

			finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexB;
			finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexC;
			finalResult.setBarycentricCoordinates(
				tempResult.m_barycentricCoords[VERTA],
				0,
				tempResult.m_barycentricCoords[VERTB],
				tempResult.m_barycentricCoords[VERTC]);
		}
	}

	// Repeat test for face adb
	// (triangle verts A,B,C correspond to tetrahedron verts a,d,b)
	if (pointOutsideADB)
	{
		closestPtPointTriangle(p, a, d, b, tempResult);
		b3Vector3 q = tempResult.m_closestPointOnSimplex;
		//convert result bitmask!

		b3Scalar sqDist = (q - p).dot(q - p);
		if (sqDist < bestSqDist)
		{
			bestSqDist = sqDist;
			finalResult.m_closestPointOnSimplex = q;
			finalResult.m_usedVertices.reset();
			finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA;
			finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexC;

			finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB;
			finalResult.setBarycentricCoordinates(
				tempResult.m_barycentricCoords[VERTA],
				tempResult.m_barycentricCoords[VERTC],
				0,
				tempResult.m_barycentricCoords[VERTB]);
		}
	}
	// Repeat test for face bdc
	// (triangle verts A,B,C correspond to tetrahedron verts b,d,c)
	if (pointOutsideBDC)
	{
		closestPtPointTriangle(p, b, d, c, tempResult);
		b3Vector3 q = tempResult.m_closestPointOnSimplex;
		//convert result bitmask!
		b3Scalar sqDist = (q - p).dot(q - p);
		if (sqDist < bestSqDist)
		{
			bestSqDist = sqDist;
			finalResult.m_closestPointOnSimplex = q;
			finalResult.m_usedVertices.reset();
			//
			finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexA;
			finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexC;
			finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB;

			finalResult.setBarycentricCoordinates(
				0,
				tempResult.m_barycentricCoords[VERTA],
				tempResult.m_barycentricCoords[VERTC],
				tempResult.m_barycentricCoords[VERTB]);
		}
	}

	//help! we ended up full !

	if (finalResult.m_usedVertices.usedVertexA &&
		finalResult.m_usedVertices.usedVertexB &&
		finalResult.m_usedVertices.usedVertexC &&
		finalResult.m_usedVertices.usedVertexD)
	{
		return true;
	}

	return true;
}

View file

@ -1,164 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_VORONOI_SIMPLEX_SOLVER_H
#define B3_VORONOI_SIMPLEX_SOLVER_H
#include "Bullet3Common/b3Vector3.h"
#define VORONOI_SIMPLEX_MAX_VERTS 5
///disable next define, or use defaultCollisionConfiguration->getSimplexSolver()->setEqualVertexThreshold(0.f) to disable/configure
//#define BT_USE_EQUAL_VERTEX_THRESHOLD
#define VORONOI_DEFAULT_EQUAL_VERTEX_THRESHOLD 0.0001f
/// Per-vertex usage flags for a simplex of up to 4 vertices (A..D), packed
/// into single bits. Records which vertices support the closest feature so
/// the simplex can be reduced.
struct b3UsageBitfield
{
	b3UsageBitfield()
	{
		reset();
	}

	// Clear all four usage flags (the unused padding bits are left untouched).
	void reset()
	{
		usedVertexA = false;
		usedVertexB = false;
		usedVertexC = false;
		usedVertexD = false;
	}
	unsigned short usedVertexA : 1;
	unsigned short usedVertexB : 1;
	unsigned short usedVertexC : 1;
	unsigned short usedVertexD : 1;
	// Padding bits, unused.
	unsigned short unused1 : 1;
	unsigned short unused2 : 1;
	unsigned short unused3 : 1;
	unsigned short unused4 : 1;
};
/// Result of a closest-point-on-sub-simplex query: the closest point itself,
/// which simplex vertices support it, the barycentric coordinates (slot i
/// corresponds to vertex i, unused slots are 0), and a degeneracy flag.
struct b3SubSimplexClosestResult
{
	b3Vector3 m_closestPointOnSimplex;
	//MASK for m_usedVertices
	//stores the simplex vertex-usage, using the MASK,
	// if m_usedVertices & MASK then the related vertex is used
	b3UsageBitfield m_usedVertices;
	b3Scalar m_barycentricCoords[4];
	bool m_degenerate;

	void reset()
	{
		m_degenerate = false;
		setBarycentricCoordinates();
		m_usedVertices.reset();
	}
	// Valid iff every barycentric coordinate is non-negative.
	bool isValid()
	{
		bool valid = (m_barycentricCoords[0] >= b3Scalar(0.)) &&
					 (m_barycentricCoords[1] >= b3Scalar(0.)) &&
					 (m_barycentricCoords[2] >= b3Scalar(0.)) &&
					 (m_barycentricCoords[3] >= b3Scalar(0.));

		return valid;
	}
	// Defaults of 0 allow callers to set only the leading coordinates.
	void setBarycentricCoordinates(b3Scalar a = b3Scalar(0.), b3Scalar b = b3Scalar(0.), b3Scalar c = b3Scalar(0.), b3Scalar d = b3Scalar(0.))
	{
		m_barycentricCoords[0] = a;
		m_barycentricCoords[1] = b;
		m_barycentricCoords[2] = c;
		m_barycentricCoords[3] = d;
	}
};
/// b3VoronoiSimplexSolver is an implementation of the closest point distance algorithm from a 1-4 points simplex to the origin.
/// Can be used with GJK, as an alternative to Johnson distance algorithm.
/// Caches the last computed closest point / witness points; recomputation is
/// triggered lazily via m_needsUpdate.
B3_ATTRIBUTE_ALIGNED16(class)
b3VoronoiSimplexSolver
{
public:
	B3_DECLARE_ALIGNED_ALLOCATOR();

	int m_numVertices;

	// Parallel arrays: support vectors W = P - Q and the witness points on
	// each object, indexed 0..m_numVertices-1.
	b3Vector3 m_simplexVectorW[VORONOI_SIMPLEX_MAX_VERTS];
	b3Vector3 m_simplexPointsP[VORONOI_SIMPLEX_MAX_VERTS];
	b3Vector3 m_simplexPointsQ[VORONOI_SIMPLEX_MAX_VERTS];

	// Cached results of the last updateClosestVectorAndPoints() run.
	b3Vector3 m_cachedP1;
	b3Vector3 m_cachedP2;
	b3Vector3 m_cachedV;
	b3Vector3 m_lastW;

	b3Scalar m_equalVertexThreshold;
	bool m_cachedValidClosest;

	b3SubSimplexClosestResult m_cachedBC;

	bool m_needsUpdate;

	void removeVertex(int index);
	void reduceVertices(const b3UsageBitfield& usedVerts);
	bool updateClosestVectorAndPoints();

	bool closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult);
	int pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d);
	bool closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, b3SubSimplexClosestResult& result);

public:
	b3VoronoiSimplexSolver()
		: m_equalVertexThreshold(VORONOI_DEFAULT_EQUAL_VERTEX_THRESHOLD)
	{
	}
	void reset();

	void addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q);

	// Tolerance used by inSimplex() when BT_USE_EQUAL_VERTEX_THRESHOLD is on.
	void setEqualVertexThreshold(b3Scalar threshold)
	{
		m_equalVertexThreshold = threshold;
	}

	b3Scalar getEqualVertexThreshold() const
	{
		return m_equalVertexThreshold;
	}

	bool closest(b3Vector3 & v);

	b3Scalar maxVertex();

	bool fullSimplex() const
	{
		return (m_numVertices == 4);
	}

	int getSimplex(b3Vector3 * pBuf, b3Vector3 * qBuf, b3Vector3 * yBuf) const;

	bool inSimplex(const b3Vector3& w);

	void backup_closest(b3Vector3 & v);

	bool emptySimplex() const;

	void compute_points(b3Vector3 & p1, b3Vector3 & p2);

	int numVertices() const
	{
		return m_numVertices;
	}
};
#endif //B3_VORONOI_SIMPLEX_SOLVER_H

View file

@ -1,283 +0,0 @@
//keep this enum in sync with the CPU version (in btCollidable.h)
//written by Erwin Coumans
#define SHAPE_CONVEX_HULL 3
#define SHAPE_CONCAVE_TRIMESH 5
#define TRIANGLE_NUM_CONVEX_FACES 5
#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6
#define SHAPE_SPHERE 7
typedef unsigned int u32;
#define MAX_NUM_PARTS_IN_BITS 10
///btQuantizedBvhNode is a compressed aabb node, 16 bytes.
///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).
typedef struct
{
	//12 bytes: AABB extents quantized to unsigned shorts
	unsigned short int m_quantizedAabbMin[3];
	unsigned short int m_quantizedAabbMax[3];
	//4 bytes: >= 0 means leaf (triangle index), < 0 means internal (negated escape index)
	int m_escapeIndexOrTriangleIndex;
} btQuantizedBvhNode;
// Per-BVH metadata: world bounds, the quantization scale used to compress
// node AABBs, and offsets into the shared node/subtree-header arrays.
typedef struct
{
	float4 m_aabbMin;
	float4 m_aabbMax;
	float4 m_quantization;
	int m_numNodes;
	int m_numSubTrees;
	int m_nodeOffset;     // first node of this BVH in the global node array
	int m_subTreeOffset;  // first subtree header in the global header array
} b3BvhInfo;
// Extracts the triangle index from a leaf node's packed word: the top
// MAX_NUM_PARTS_IN_BITS bits (sign bit excluded) hold the part id, the low
// bits hold the triangle index. Only valid when isLeaf(rootNode) is true.
int getTriangleIndex(const btQuantizedBvhNode* rootNode)
{
	// Fix: the original built the part mask via a dead variable and a no-op
	// expression (unsigned x=0; ~(x&0)); (x&0) is always 0 so this is just
	// an all-ones word shifted left. Build the mask directly.
	unsigned int partMask = (~((unsigned int)0)) << (31 - MAX_NUM_PARTS_IN_BITS);
	// Get only the lower bits where the triangle index is stored
	return (rootNode->m_escapeIndexOrTriangleIndex & ~(partMask));
}
// A node is a leaf iff its packed word is non-negative (it then stores a
// triangle index); internal nodes store a negated escape index.
int isLeaf(const btQuantizedBvhNode* rootNode)
{
	//skipindex is negative (internal node), triangleindex >=0 (leafnode)
	if (rootNode->m_escapeIndexOrTriangleIndex < 0)
		return 0;
	return 1;
}
// For an internal node, the escape index (how many nodes to skip on a miss)
// is stored negated; undo the negation. Only valid when !isLeaf(rootNode).
int getEscapeIndex(const btQuantizedBvhNode* rootNode)
{
	return -rootNode->m_escapeIndexOrTriangleIndex;
}
// Header describing one cache-friendly subtree slice of the quantized BVH.
typedef struct
{
	//12 bytes: quantized AABB of the whole subtree
	unsigned short int m_quantizedAabbMin[3];
	unsigned short int m_quantizedAabbMax[3];
	//4 bytes, points to the root of the subtree
	int m_rootNodeIndex;
	//4 bytes
	int m_subtreeSize;
	int m_padding[3];
} btBvhSubtreeInfo;
///keep this in sync with btCollidable.h
typedef struct
{
	// For concave trimeshes this field is reused as the BVH info index
	// (see bvhTraversalKernel below).
	int m_numChildShapes;
	int blaat2;
	int m_shapeType;   // one of the SHAPE_* defines above
	int m_shapeIndex;  // index into the per-type shape array
} btCollidableGpu;
// Child of a compound shape: local transform (position + quaternion
// orientation) plus the child's shape index; padded to 16-byte multiples.
typedef struct
{
	float4 m_childPosition;
	float4 m_childOrientation;
	int m_shapeIndex;
	int m_unused0;
	int m_unused1;
	int m_unused2;
} btGpuChildShape;
// GPU-side rigid body state: pose, velocities, collidable reference and
// mass/material parameters. m_invMass == 0 marks a static body.
typedef struct
{
	float4 m_pos;
	float4 m_quat;
	float4 m_linVel;
	float4 m_angVel;

	u32 m_collidableIdx;
	float m_invMass;
	float m_restituitionCoeff;
	float m_frictionCoeff;
} BodyData;
// World-space AABB with the min/max corners viewable as vectors, scalar
// arrays, or ints (the .w/4th lane is commonly used to carry an index).
typedef struct
{
	union {
		float4 m_min;
		float m_minElems[4];
		int m_minIndices[4];
	};
	union {
		float4 m_max;
		float m_maxElems[4];
		int m_maxIndices[4];
	};
} btAabbCL;
// Overlap test between two quantized AABBs: a separating-axis check on the
// three component axes. Touching extents (==) count as overlap.
// Returns 1 on overlap, 0 when separated along any axis.
int testQuantizedAabbAgainstQuantizedAabb(
	const unsigned short int* aabbMin1,
	const unsigned short int* aabbMax1,
	const unsigned short int* aabbMin2,
	const unsigned short int* aabbMax2)
{
	int axis;
	for (axis = 0; axis < 3; axis++)
	{
		if (aabbMin1[axis] > aabbMax2[axis] || aabbMax1[axis] < aabbMin2[axis])
			return 0;
	}
	return 1;
}
// Quantizes a world-space point into unsigned-short grid coordinates, first
// clamping it into [bvhAabbMin, bvhAabbMax]. Max corners are rounded up and
// given an odd value (| 1), min corners rounded down to an even value
// (& 0xfffe), so quantized min/max comparisons stay conservative.
void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization)
{
	float4 clampedPoint = max(point2,bvhAabbMin);
	clampedPoint = min (clampedPoint, bvhAabbMax);

	float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization;
	if (isMax)
	{
		out[0] = (unsigned short) (((unsigned short)(v.x+1.f) | 1));
		out[1] = (unsigned short) (((unsigned short)(v.y+1.f) | 1));
		out[2] = (unsigned short) (((unsigned short)(v.z+1.f) | 1));
	} else
	{
		out[0] = (unsigned short) (((unsigned short)(v.x) & 0xfffe));
		out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe));
		out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe));
	}
}
// work-in-progress
//Midphase kernel: for each broadphase pair where body A is a concave trimesh
//and body B is a convex hull, sphere or compound, walks A's quantized BVH
//against B's world-space AABB and appends candidate
//(bodyIndexA, bodyIndexB, triangleIndex[, childShapeIndexB]) entries to
//concavePairsOut. numConcavePairsOut is advanced atomically; entries beyond
//maxNumConcavePairsCapacity are counted but dropped.
__kernel void bvhTraversalKernel( __global const int4* pairs,
__global const BodyData* rigidBodies,
__global const btCollidableGpu* collidables,
__global btAabbCL* aabbs,
__global int4* concavePairsOut,
__global volatile int* numConcavePairsOut,
__global const btBvhSubtreeInfo* subtreeHeadersRoot,
__global const btQuantizedBvhNode* quantizedNodesRoot,
__global const b3BvhInfo* bvhInfos,
int numPairs,
int maxNumConcavePairsCapacity)
{
int id = get_global_id(0);
if (id>=numPairs)
return;
int bodyIndexA = pairs[id].x;
int bodyIndexB = pairs[id].y;
int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
//once the broadphase avoids static-static pairs, we can remove this test
if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
{
return;
}
if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH)
return;
int shapeTypeB = collidables[collidableIndexB].m_shapeType;
if (shapeTypeB!=SHAPE_CONVEX_HULL &&
shapeTypeB!=SHAPE_SPHERE &&
shapeTypeB!=SHAPE_COMPOUND_OF_CONVEX_HULLS
)
return;
//NOTE(review): m_numChildShapes appears to double as the BVH-info index for
//concave trimesh collidables -- confirm against btCollidable.h
b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];
float4 bvhAabbMin = bvhInfo.m_aabbMin;
float4 bvhAabbMax = bvhInfo.m_aabbMax;
float4 bvhQuantization = bvhInfo.m_quantization;
int numSubtreeHeaders = bvhInfo.m_numSubTrees;
__global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];
__global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];
//quantize B's world-space AABB into A's BVH grid so all node tests are integer
unsigned short int quantizedQueryAabbMin[3];
unsigned short int quantizedQueryAabbMax[3];
quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization);
quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);
for (int i=0;i<numSubtreeHeaders;i++)
{
btBvhSubtreeInfo subtree = subtreeHeaders[i];
int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
if (overlap != 0)
{
//stackless traversal of this subtree: leaves advance by 1, rejected
//internal nodes skip their whole subtree via the escape index
int startNodeIndex = subtree.m_rootNodeIndex;
int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;
int curIndex = startNodeIndex;
int escapeIndex;
int isLeafNode;
int aabbOverlap;
while (curIndex < endNodeIndex)
{
btQuantizedBvhNode rootNode = quantizedNodes[curIndex];
aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);
isLeafNode = isLeaf(&rootNode);
if (aabbOverlap)
{
if (isLeafNode)
{
int triangleIndex = getTriangleIndex(&rootNode);
if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)
{
//emit one pair per child shape of the compound body B
int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
int pairIdx = atomic_add(numConcavePairsOut,numChildrenB);
for (int b=0;b<numChildrenB;b++)
{
if ((pairIdx+b)<maxNumConcavePairsCapacity)
{
int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);
concavePairsOut[pairIdx+b] = newPair;
}
}
} else
{
int pairIdx = atomic_inc(numConcavePairsOut);
if (pairIdx<maxNumConcavePairsCapacity)
{
int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0);
concavePairsOut[pairIdx] = newPair;
}
}
}
curIndex++;
} else
{
if (isLeafNode)
{
curIndex++;
} else
{
escapeIndex = getEscapeIndex(&rootNode);
curIndex += escapeIndex;
}
}
}
}
}
}

View file

@ -1,257 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
//do not edit this string by hand: modify the corresponding .cl kernel source and re-run stringify
static const char* bvhTraversalKernelCL =
"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
"//written by Erwin Coumans\n"
"#define SHAPE_CONVEX_HULL 3\n"
"#define SHAPE_CONCAVE_TRIMESH 5\n"
"#define TRIANGLE_NUM_CONVEX_FACES 5\n"
"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
"#define SHAPE_SPHERE 7\n"
"typedef unsigned int u32;\n"
"#define MAX_NUM_PARTS_IN_BITS 10\n"
"///btQuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n"
"typedef struct\n"
"{\n"
" //12 bytes\n"
" unsigned short int m_quantizedAabbMin[3];\n"
" unsigned short int m_quantizedAabbMax[3];\n"
" //4 bytes\n"
" int m_escapeIndexOrTriangleIndex;\n"
"} btQuantizedBvhNode;\n"
"typedef struct\n"
"{\n"
" float4 m_aabbMin;\n"
" float4 m_aabbMax;\n"
" float4 m_quantization;\n"
" int m_numNodes;\n"
" int m_numSubTrees;\n"
" int m_nodeOffset;\n"
" int m_subTreeOffset;\n"
"} b3BvhInfo;\n"
"int getTriangleIndex(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" unsigned int x=0;\n"
" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
" // Get only the lower bits where the triangle index is stored\n"
" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
"}\n"
"int isLeaf(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
"}\n"
" \n"
"int getEscapeIndex(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" return -rootNode->m_escapeIndexOrTriangleIndex;\n"
"}\n"
"typedef struct\n"
"{\n"
" //12 bytes\n"
" unsigned short int m_quantizedAabbMin[3];\n"
" unsigned short int m_quantizedAabbMax[3];\n"
" //4 bytes, points to the root of the subtree\n"
" int m_rootNodeIndex;\n"
" //4 bytes\n"
" int m_subtreeSize;\n"
" int m_padding[3];\n"
"} btBvhSubtreeInfo;\n"
"///keep this in sync with btCollidable.h\n"
"typedef struct\n"
"{\n"
" int m_numChildShapes;\n"
" int blaat2;\n"
" int m_shapeType;\n"
" int m_shapeIndex;\n"
" \n"
"} btCollidableGpu;\n"
"typedef struct\n"
"{\n"
" float4 m_childPosition;\n"
" float4 m_childOrientation;\n"
" int m_shapeIndex;\n"
" int m_unused0;\n"
" int m_unused1;\n"
" int m_unused2;\n"
"} btGpuChildShape;\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" float4 m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
" u32 m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} BodyData;\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} btAabbCL;\n"
"int testQuantizedAabbAgainstQuantizedAabb(\n"
" const unsigned short int* aabbMin1,\n"
" const unsigned short int* aabbMax1,\n"
" const unsigned short int* aabbMin2,\n"
" const unsigned short int* aabbMax2)\n"
"{\n"
" //int overlap = 1;\n"
" if (aabbMin1[0] > aabbMax2[0])\n"
" return 0;\n"
" if (aabbMax1[0] < aabbMin2[0])\n"
" return 0;\n"
" if (aabbMin1[1] > aabbMax2[1])\n"
" return 0;\n"
" if (aabbMax1[1] < aabbMin2[1])\n"
" return 0;\n"
" if (aabbMin1[2] > aabbMax2[2])\n"
" return 0;\n"
" if (aabbMax1[2] < aabbMin2[2])\n"
" return 0;\n"
" return 1;\n"
" //overlap = ((aabbMin1[0] > aabbMax2[0]) || (aabbMax1[0] < aabbMin2[0])) ? 0 : overlap;\n"
" //overlap = ((aabbMin1[2] > aabbMax2[2]) || (aabbMax1[2] < aabbMin2[2])) ? 0 : overlap;\n"
" //overlap = ((aabbMin1[1] > aabbMax2[1]) || (aabbMax1[1] < aabbMin2[1])) ? 0 : overlap;\n"
" //return overlap;\n"
"}\n"
"void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization)\n"
"{\n"
" float4 clampedPoint = max(point2,bvhAabbMin);\n"
" clampedPoint = min (clampedPoint, bvhAabbMax);\n"
" float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization;\n"
" if (isMax)\n"
" {\n"
" out[0] = (unsigned short) (((unsigned short)(v.x+1.f) | 1));\n"
" out[1] = (unsigned short) (((unsigned short)(v.y+1.f) | 1));\n"
" out[2] = (unsigned short) (((unsigned short)(v.z+1.f) | 1));\n"
" } else\n"
" {\n"
" out[0] = (unsigned short) (((unsigned short)(v.x) & 0xfffe));\n"
" out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe));\n"
" out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe));\n"
" }\n"
"}\n"
"// work-in-progress\n"
"__kernel void bvhTraversalKernel( __global const int4* pairs, \n"
" __global const BodyData* rigidBodies, \n"
" __global const btCollidableGpu* collidables,\n"
" __global btAabbCL* aabbs,\n"
" __global int4* concavePairsOut,\n"
" __global volatile int* numConcavePairsOut,\n"
" __global const btBvhSubtreeInfo* subtreeHeadersRoot,\n"
" __global const btQuantizedBvhNode* quantizedNodesRoot,\n"
" __global const b3BvhInfo* bvhInfos,\n"
" int numPairs,\n"
" int maxNumConcavePairsCapacity)\n"
"{\n"
" int id = get_global_id(0);\n"
" if (id>=numPairs)\n"
" return;\n"
" \n"
" int bodyIndexA = pairs[id].x;\n"
" int bodyIndexB = pairs[id].y;\n"
" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
" \n"
" //once the broadphase avoids static-static pairs, we can remove this test\n"
" if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"
" {\n"
" return;\n"
" }\n"
" \n"
" if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH)\n"
" return;\n"
" int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
" \n"
" if (shapeTypeB!=SHAPE_CONVEX_HULL &&\n"
" shapeTypeB!=SHAPE_SPHERE &&\n"
" shapeTypeB!=SHAPE_COMPOUND_OF_CONVEX_HULLS\n"
" )\n"
" return;\n"
" b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];\n"
" float4 bvhAabbMin = bvhInfo.m_aabbMin;\n"
" float4 bvhAabbMax = bvhInfo.m_aabbMax;\n"
" float4 bvhQuantization = bvhInfo.m_quantization;\n"
" int numSubtreeHeaders = bvhInfo.m_numSubTrees;\n"
" __global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n"
" __global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n"
" \n"
" unsigned short int quantizedQueryAabbMin[3];\n"
" unsigned short int quantizedQueryAabbMax[3];\n"
" quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
" quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
" \n"
" for (int i=0;i<numSubtreeHeaders;i++)\n"
" {\n"
" btBvhSubtreeInfo subtree = subtreeHeaders[i];\n"
" \n"
" int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);\n"
" if (overlap != 0)\n"
" {\n"
" int startNodeIndex = subtree.m_rootNodeIndex;\n"
" int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;\n"
" int curIndex = startNodeIndex;\n"
" int escapeIndex;\n"
" int isLeafNode;\n"
" int aabbOverlap;\n"
" while (curIndex < endNodeIndex)\n"
" {\n"
" btQuantizedBvhNode rootNode = quantizedNodes[curIndex];\n"
" aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);\n"
" isLeafNode = isLeaf(&rootNode);\n"
" if (aabbOverlap)\n"
" {\n"
" if (isLeafNode)\n"
" {\n"
" int triangleIndex = getTriangleIndex(&rootNode);\n"
" if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
" {\n"
" int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n"
" int pairIdx = atomic_add(numConcavePairsOut,numChildrenB);\n"
" for (int b=0;b<numChildrenB;b++)\n"
" {\n"
" if ((pairIdx+b)<maxNumConcavePairsCapacity)\n"
" {\n"
" int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n"
" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);\n"
" concavePairsOut[pairIdx+b] = newPair;\n"
" }\n"
" }\n"
" } else\n"
" {\n"
" int pairIdx = atomic_inc(numConcavePairsOut);\n"
" if (pairIdx<maxNumConcavePairsCapacity)\n"
" {\n"
" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0);\n"
" concavePairsOut[pairIdx] = newPair;\n"
" }\n"
" }\n"
" } \n"
" curIndex++;\n"
" } else\n"
" {\n"
" if (isLeafNode)\n"
" {\n"
" curIndex++;\n"
" } else\n"
" {\n"
" escapeIndex = getEscapeIndex(&rootNode);\n"
" curIndex += escapeIndex;\n"
" }\n"
" }\n"
" }\n"
" }\n"
" }\n"
"}\n";

View file

@ -1,311 +0,0 @@
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3MprPenetration.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
//Atomically reserves the next free output slot; 'out' receives the index.
#define AppendInc(x, out) out = atomic_inc(x)
//The contact-point count is packed into the w component of m_worldNormalOnB.
#define GET_NPOINTS(x) (x).m_worldNormalOnB.w
#ifdef cl_ext_atomic_counters_32
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
#else
//fall back to a plain global int pointer when the counters extension is absent
#define counter32_t volatile __global int*
#endif
//Runs Minkowski Portal Refinement penetration computation for each
//convex-hull vs convex-hull pair and, when b3MprPenetration reports contact
//(res==0), appends a single-point contact to globalContactsOut, guarded by
//the nGlobalContactsOut counter and contactCapacity.
__kernel void mprPenetrationKernel( __global int4* pairs,
__global const b3RigidBodyData_t* rigidBodies,
__global const b3Collidable_t* collidables,
__global const b3ConvexPolyhedronData_t* convexShapes,
__global const float4* vertices,
__global float4* separatingNormals,
__global int* hasSeparatingAxis,
__global struct b3Contact4Data* restrict globalContactsOut,
counter32_t nGlobalContactsOut,
int contactCapacity,
int numPairs)
{
int i = get_global_id(0);
int pairIndex = i;
if (i<numPairs)
{
int bodyIndexA = pairs[i].x;
int bodyIndexB = pairs[i].y;
int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
//once the broadphase avoids static-static pairs, we can remove this test
if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
{
return;
}
if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))
{
return;
}
float depthOut;
b3Float4 dirOut;
b3Float4 posOut;
int res = b3MprPenetration(pairIndex, bodyIndexA, bodyIndexB,rigidBodies,convexShapes,collidables,vertices,separatingNormals,hasSeparatingAxis,&depthOut, &dirOut, &posOut);
if (res==0)
{
//add a contact
int dstIdx;
AppendInc( nGlobalContactsOut, dstIdx );
if (dstIdx<contactCapacity)
{
//remember the contact slot on the pair for later stages
pairs[pairIndex].z = dstIdx;
__global struct b3Contact4Data* c = globalContactsOut + dstIdx;
c->m_worldNormalOnB = -dirOut;//normal;
//coefficients compressed to 16-bit fixed point
c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
c->m_batchIdx = pairIndex;
int bodyA = pairs[pairIndex].x;
int bodyB = pairs[pairIndex].y;
//static bodies (invMass==0) are flagged with a negated index
c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA;
c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;
c->m_childIndexA = -1;
c->m_childIndexB = -1;
//for (int i=0;i<nContacts;i++)
//penetration depth rides in the w component of the contact point
posOut.w = -depthOut;
c->m_worldPosB[0] = posOut;//localPoints[contactIdx[i]];
GET_NPOINTS(*c) = 1;//nContacts;
}
}
}
}
//Quaternions are stored as float4: xyz = imaginary part, w = real part.
typedef float4 Quaternion;
//CUDA-style float4 constructor via an OpenCL vector cast
#define make_float4 (float4)
//3-component dot product: w components are zeroed so they cannot contribute
__inline
float dot3F4(float4 a, float4 b)
{
float4 a1 = make_float4(a.xyz,0.f);
float4 b1 = make_float4(b.xyz,0.f);
return dot(a1, b1);
}
//3D cross product (thin wrapper over the OpenCL builtin)
__inline
float4 cross3(float4 a, float4 b)
{
return cross(a,b);
}
//Hamilton product of two quaternions (xyz imaginary, w real)
__inline
Quaternion qtMul(Quaternion a, Quaternion b)
{
Quaternion ans;
ans = cross3( a, b );
ans += a.w*b+b.w*a;
// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
ans.w = a.w*b.w - dot3F4(a, b);
return ans;
}
//Quaternion conjugate; equals the inverse for unit quaternions
__inline
Quaternion qtInvert(Quaternion q)
{
return (Quaternion)(-q.xyz, q.w);
}
//Rotates vector 'vec' by quaternion 'q' via q * v * q^-1 (vec.w is ignored)
__inline
float4 qtRotate(Quaternion q, float4 vec)
{
Quaternion qInv = qtInvert( q );
float4 vcpy = vec;
vcpy.w = 0.f;
float4 out = qtMul(qtMul(q,vcpy),qInv);
return out;
}
//Applies a rigid transform: rotate *p by *orientation, then translate
__inline
float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)
{
return qtRotate( *orientation, *p ) + (*translation);
}
//Rotates 'vec' by the inverse of 'q' (world -> local for unit quaternions)
__inline
float4 qtInvRotate(const Quaternion q, float4 vec)
{
return qtRotate( qtInvert( q ), vec );
}
//Projects every vertex of 'hull' (posed at pos/orn) onto the world-space axis
//*dir, returning the projection interval through the min/max out-pointers.
inline void project(__global const b3ConvexPolyhedronData_t* hull, const float4 pos, const float4 orn,
const float4* dir, __global const float4* vertices, float* min, float* max)
{
min[0] = FLT_MAX;
max[0] = -FLT_MAX;
int numVerts = hull->m_numVertices;
//rotate the axis into hull-local space once instead of rotating every vertex
const float4 localDir = qtInvRotate(orn,*dir);
float offset = dot(pos,*dir);
for(int i=0;i<numVerts;i++)
{
float dp = dot(vertices[hull->m_vertexOffset+i],localDir);
if(dp < min[0])
min[0] = dp;
if(dp > max[0])
max[0] = dp;
}
if(min[0]>max[0])
{
float tmp = min[0];
min[0] = max[0];
max[0] = tmp;
}
//shift the interval from hull-local to world space along dir
min[0] += offset;
max[0] += offset;
}
//SAT fallback: instead of testing all edge-edge cross products, tests a
//precomputed set of unit-sphere directions. Returns false as soon as a
//separating axis is found; otherwise keeps the axis of minimum penetration
//depth in *sep / *dmin and returns true.
bool findSeparatingAxisUnitSphere(	__global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB,
const float4 posA1,
const float4 ornA,
const float4 posB1,
const float4 ornB,
const float4 DeltaC2,
__global const float4* vertices,
__global const float4* unitSphereDirections,
int numUnitSphereDirections,
float4* sep,
float* dmin)
{
float4 posA = posA1;
posA.w = 0.f;
float4 posB = posB1;
posB.w = 0.f;
//NOTE(review): curPlaneTests/curEdgeEdge are never updated below;
//presumably leftover instrumentation
int curPlaneTests=0;
int curEdgeEdge = 0;
// Test unit sphere directions
for (int i=0;i<numUnitSphereDirections;i++)
{
float4 crossje;
crossje = unitSphereDirections[i];
//orient the candidate axis against the center-to-center direction
if (dot3F4(DeltaC2,crossje)>0)
crossje *= -1.f;
{
float dist;
bool result = true;
float Min0,Max0;
float Min1,Max1;
project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);
project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);
//disjoint projection intervals -> separating axis found
if(Max0<Min1 || Max1<Min0)
return false;
float d0 = Max0 - Min1;
float d1 = Max1 - Min0;
dist = d0<d1 ? d0:d1;
result = true;
//track the axis of least overlap (minimum penetration depth)
if(dist<*dmin)
{
*dmin = dist;
*sep = crossje;
}
}
}
//make the returned normal point from B towards A
if((dot3F4(-DeltaC2,*sep))>0.0f)
{
*sep = -(*sep);
}
return true;
}
//Per-pair SAT fallback kernel: when a pair has more edge-edge axis candidates
//than the precomputed unit-sphere direction count, approximate the edge-edge
//phase by sampling the unit-sphere directions, refining the stored separating
//normal and penetration depth in place.
__kernel void findSeparatingAxisUnitSphereKernel( __global const int4* pairs,
__global const b3RigidBodyData_t* rigidBodies,
__global const b3Collidable_t* collidables,
__global const b3ConvexPolyhedronData_t* convexShapes,
__global const float4* vertices,
__global const float4* unitSphereDirections,
__global float4* separatingNormals,
__global int* hasSeparatingAxis,
__global float* dmins,
int numUnitSphereDirections,
int numPairs
)
{
int i = get_global_id(0);
if (i<numPairs)
{
//only refine pairs that still have a candidate separating axis
if (hasSeparatingAxis[i])
{
int bodyIndexA = pairs[i].x;
int bodyIndexB = pairs[i].y;
int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
int numFacesA = convexShapes[shapeIndexA].m_numFaces;
float dmin = dmins[i];
float4 posA = rigidBodies[bodyIndexA].m_pos;
posA.w = 0.f;
float4 posB = rigidBodies[bodyIndexB].m_pos;
posB.w = 0.f;
//world-space centers of both hulls, for orienting candidate axes
float4 c0local = convexShapes[shapeIndexA].m_localCenter;
float4 ornA = rigidBodies[bodyIndexA].m_quat;
float4 c0 = transform(&c0local, &posA, &ornA);
float4 c1local = convexShapes[shapeIndexB].m_localCenter;
float4 ornB =rigidBodies[bodyIndexB].m_quat;
float4 c1 = transform(&c1local,&posB,&ornB);
const float4 DeltaC2 = c0 - c1;
float4 sepNormal = separatingNormals[i];
//only use the sampled fallback when the exact edge-edge sweep would be costlier
int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;
if (numEdgeEdgeDirections>numUnitSphereDirections)
{
bool sepEE = findSeparatingAxisUnitSphere( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
posB,ornB,
DeltaC2,
vertices,unitSphereDirections,numUnitSphereDirections,&sepNormal,&dmin);
if (!sepEE)
{
hasSeparatingAxis[i] = 0;
} else
{
hasSeparatingAxis[i] = 1;
separatingNormals[i] = sepNormal;
}
}
} //if (hasSeparatingAxis[i])
}//(i<numPairs)
}

View file

@ -1,203 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//Host-code rewritten by Erwin Coumans
//Path used as a caching/debug hint when compiling the embedded kernel string
#define BOUNDSEARCH_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl"
//NOTE(review): KERNEL0..KERNEL2 are not referenced in the visible code (the
//kernel names are passed as literals in the constructor); verify before removing
#define KERNEL0 "SearchSortDataLowerKernel"
#define KERNEL1 "SearchSortDataUpperKernel"
#define KERNEL2 "SubtractKernel"
#include "b3BoundSearchCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "b3LauncherCL.h"
#include "kernels/BoundSearchKernelsCL.h"
///Compiles the bound-search kernels from the embedded CL source. When
///maxSize==0 the COUNT option is unavailable: the SubtractKernel and the
///m_lower/m_upper scratch arrays are not created (they stay null).
b3BoundSearchCL::b3BoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize)
: m_context(ctx),
m_device(device),
m_queue(queue)
{
const char* additionalMacros = "";
//const char* srcFileNameForCaching="";
cl_int pErrNum;
const char* kernelSource = boundSearchKernelsCL;
//NOTE(review): boundSearchProg is never released here; harmless for the usual
//one-instance lifetime but leaks if instances are created repeatedly
cl_program boundSearchProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, BOUNDSEARCH_PATH);
b3Assert(boundSearchProg);
m_lowerSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg, additionalMacros);
b3Assert(m_lowerSortDataKernel);
m_upperSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg, additionalMacros);
b3Assert(m_upperSortDataKernel);
m_subtractKernel = 0;
if (maxSize)
{
m_subtractKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg, additionalMacros);
b3Assert(m_subtractKernel);
}
//m_constBuffer = new b3OpenCLArray<b3Int4>( device, 1, BufferBase::BUFFER_CONST );
m_lower = (maxSize == 0) ? 0 : new b3OpenCLArray<unsigned int>(ctx, queue, maxSize);
m_upper = (maxSize == 0) ? 0 : new b3OpenCLArray<unsigned int>(ctx, queue, maxSize);
m_filler = new b3FillCL(ctx, device, queue);
}
///Releases the GPU scratch arrays and the compiled kernels.
b3BoundSearchCL::~b3BoundSearchCL()
{
	//m_lower/m_upper may be null (maxSize==0 construction); delete handles that
	delete m_lower;
	delete m_upper;
	delete m_filler;

	clReleaseKernel(m_lowerSortDataKernel);
	clReleaseKernel(m_upperSortDataKernel);
	//fix: m_subtractKernel is only compiled when maxSize!=0; calling
	//clReleaseKernel(0) is invalid per the OpenCL spec (CL_INVALID_KERNEL)
	if (m_subtractKernel)
	{
		clReleaseKernel(m_subtractKernel);
	}
}
///Runs the bound-search kernels on the GPU. src must be sorted ascending by
///m_key. BOUND_LOWER/BOUND_UPPER write per-key bound indices into dst; COUNT
///writes per-key element counts (upper - lower) and requires construction
///with maxSize > 0 so m_lower/m_upper and the subtract kernel exist.
void b3BoundSearchCL::execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option)
{
//NOTE(review): constBuffer is filled in but never handed to a launcher below;
//presumably leftover from an earlier argument-passing scheme
b3Int4 constBuffer;
constBuffer.x = nSrc;
constBuffer.y = nDst;
if (option == BOUND_LOWER)
{
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};
b3LauncherCL launcher(m_queue, m_lowerSortDataKernel, "m_lowerSortDataKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(nSrc);
launcher.setConst(nDst);
launcher.launch1D(nSrc, 64);
}
else if (option == BOUND_UPPER)
{
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};
b3LauncherCL launcher(m_queue, m_upperSortDataKernel, "m_upperSortDataKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(nSrc);
launcher.setConst(nDst);
launcher.launch1D(nSrc, 64);
}
else if (option == COUNT)
{
b3Assert(m_lower);
b3Assert(m_upper);
//NOTE(review): the comparison direction looks inverted -- one would expect
//capacity() >= nDst here; confirm intent before changing
b3Assert(m_lower->capacity() <= (int)nDst);
b3Assert(m_upper->capacity() <= (int)nDst);
int zero = 0;
m_filler->execute(*m_lower, zero, nDst);
m_filler->execute(*m_upper, zero, nDst);
execute(src, nSrc, *m_lower, nDst, BOUND_LOWER);
execute(src, nSrc, *m_upper, nDst, BOUND_UPPER);
{
//count per key = upper bound - lower bound
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_upper->getBufferCL(), true), b3BufferInfoCL(m_lower->getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};
b3LauncherCL launcher(m_queue, m_subtractKernel, "m_subtractKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(nSrc);
launcher.setConst(nDst);
launcher.launch1D(nDst, 64);
}
}
else
{
b3Assert(0);
}
}
///CPU reference implementation of execute(); same contract.
///src must be sorted ascending by m_key; dst is indexed by key.
void b3BoundSearchCL::executeHost(b3AlignedObjectArray<b3SortData>& src, int nSrc,
b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option)
{
//validate the sortedness precondition
for (int i = 0; i < nSrc - 1; i++)
b3Assert(src[i].m_key <= src[i + 1].m_key);
b3SortData minData, zeroData, maxData;
minData.m_key = -1;
minData.m_value = -1;
//NOTE(review): zeroData is initialized but never used below
zeroData.m_key = 0;
zeroData.m_value = 0;
maxData.m_key = nDst;
maxData.m_value = nDst;
if (option == BOUND_LOWER)
{
for (int i = 0; i < nSrc; i++)
{
//sentinel stands in for the element before index 0
b3SortData& iData = (i == 0) ? minData : src[i - 1];
//NOTE(review): (i == nSrc) can never hold inside this loop, so jData is
//always src[i] -- the maxData branch looks like dead code; confirm upstream
b3SortData& jData = (i == nSrc) ? maxData : src[i];
//a key change means i is the first (lower-bound) index of jData's key
if (iData.m_key != jData.m_key)
{
int k = jData.m_key;
{
dst[k] = i;
}
}
}
}
else if (option == BOUND_UPPER)
{
for (int i = 1; i < nSrc + 1; i++)
{
b3SortData& iData = src[i - 1];
//at i==nSrc the sentinel closes out the run of the last key
b3SortData& jData = (i == nSrc) ? maxData : src[i];
//a key change means i is one past the last index of iData's key
if (iData.m_key != jData.m_key)
{
int k = iData.m_key;
{
dst[k] = i;
}
}
}
}
else if (option == COUNT)
{
//count per key = upper bound - lower bound
b3AlignedObjectArray<unsigned int> lower;
lower.resize(nDst);
b3AlignedObjectArray<unsigned int> upper;
upper.resize(nDst);
for (int i = 0; i < nDst; i++)
{
lower[i] = upper[i] = 0;
}
executeHost(src, nSrc, lower, nDst, BOUND_LOWER);
executeHost(src, nSrc, upper, nDst, BOUND_UPPER);
for (int i = 0; i < nDst; i++)
{
dst[i] = upper[i] - lower[i];
}
}
else
{
b3Assert(0);
}
}

View file

@ -1,64 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef B3_BOUNDSEARCH_H
#define B3_BOUNDSEARCH_H
#pragma once
/*#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
#include <AdlPrimitives/Sort/SortData.h>
#include <AdlPrimitives/Fill/Fill.h>
*/
#include "b3OpenCLArray.h"
#include "b3FillCL.h"
#include "b3RadixSort32CL.h" //for b3SortData (perhaps move it?)
///GPU lower/upper-bound search over key/value pairs (b3SortData) sorted by
///key, with a CPU reference implementation (executeHost).
class b3BoundSearchCL
{
public:
//BOUND_LOWER/BOUND_UPPER write per-key bound indices into dst;
//COUNT writes per-key element counts (requires construction with size > 0)
enum Option
{
BOUND_LOWER,
BOUND_UPPER,
COUNT,
};
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_lowerSortDataKernel;
cl_kernel m_upperSortDataKernel;
//null when constructed with size==0 (COUNT unavailable)
cl_kernel m_subtractKernel;
//NOTE(review): declared but not initialized in the visible constructor;
//verify before dereferencing
b3OpenCLArray<b3Int4>* m_constbtOpenCLArray;
//scratch arrays for COUNT; null when constructed with size==0
b3OpenCLArray<unsigned int>* m_lower;
b3OpenCLArray<unsigned int>* m_upper;
b3FillCL* m_filler;
b3BoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);
virtual ~b3BoundSearchCL();
// src has to be src[i].m_key <= src[i+1].m_key
void execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
void executeHost(b3AlignedObjectArray<b3SortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
};
#endif //B3_BOUNDSEARCH_H

View file

@ -1,18 +0,0 @@
#ifndef B3_BUFFER_INFO_CL_H
#define B3_BUFFER_INFO_CL_H
#include "b3OpenCLArray.h"
///Pairs an OpenCL memory object with a read-only flag, for passing buffer
///argument lists to b3LauncherCL::setBuffers.
struct b3BufferInfoCL
{
//b3BufferInfoCL(){}
// template<typename T>
b3BufferInfoCL(cl_mem buff, bool isReadOnly = false) : m_clBuffer(buff), m_isReadOnly(isReadOnly) {}
cl_mem m_clBuffer;
//NOTE(review): not consulted by the launcher code visible in this file
bool m_isReadOnly;
};
#endif //B3_BUFFER_INFO_CL_H

View file

@ -1,119 +0,0 @@
#include "b3FillCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "b3BufferInfoCL.h"
#include "b3LauncherCL.h"
#define FILL_CL_PROGRAM_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl"
#include "kernels/FillKernelsCL.h"
///Compiles the four typed fill kernels (int, unsigned int, float, b3Int2)
///from the embedded FillKernelsCL source on the given context/device.
b3FillCL::b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue)
	: m_commandQueue(queue)
{
	const char* kernelSource = fillKernelsCL;
	cl_int pErrNum;
	const char* additionalMacros = "";

	cl_program fillProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, FILL_CL_PROGRAM_PATH);
	b3Assert(fillProg);

	m_fillIntKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg, additionalMacros);
	b3Assert(m_fillIntKernel);

	m_fillUnsignedIntKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg, additionalMacros);
	//fix: previously re-asserted m_fillIntKernel (copy-paste), which left a
	//failed FillUnsignedIntKernel compilation undetected
	b3Assert(m_fillUnsignedIntKernel);

	m_fillFloatKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg, additionalMacros);
	b3Assert(m_fillFloatKernel);

	m_fillKernelInt2 = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg, additionalMacros);
	b3Assert(m_fillKernelInt2);
}
///Releases the four compiled fill kernels.
b3FillCL::~b3FillCL()
{
clReleaseKernel(m_fillKernelInt2);
clReleaseKernel(m_fillIntKernel);
clReleaseKernel(m_fillUnsignedIntKernel);
clReleaseKernel(m_fillFloatKernel);
}
///Fills n floats of 'src' with 'value' on the GPU, starting at 'offset'.
void b3FillCL::execute(b3OpenCLArray<float>& src, const float value, int n, int offset)
{
	b3Assert(n > 0);

	b3LauncherCL fillLauncher(m_commandQueue, m_fillFloatKernel, "m_fillFloatKernel");
	fillLauncher.setBuffer(src.getBufferCL());
	fillLauncher.setConst(n);
	fillLauncher.setConst(value);
	fillLauncher.setConst(offset);
	fillLauncher.launch1D(n);
}
///Fills n ints of 'src' with 'value' on the GPU, starting at 'offset'.
void b3FillCL::execute(b3OpenCLArray<int>& src, const int value, int n, int offset)
{
	b3Assert(n > 0);

	b3LauncherCL fillLauncher(m_commandQueue, m_fillIntKernel, "m_fillIntKernel");
	fillLauncher.setBuffer(src.getBufferCL());
	fillLauncher.setConst(n);
	fillLauncher.setConst(value);
	fillLauncher.setConst(offset);
	fillLauncher.launch1D(n);
}
///Fills n unsigned ints of 'src' with 'value' on the GPU, starting at 'offset'.
void b3FillCL::execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset)
{
	b3Assert(n > 0);

	b3LauncherCL fillLauncher(m_commandQueue, m_fillUnsignedIntKernel, "m_fillUnsignedIntKernel");
	fillLauncher.setBuffer(src.getBufferCL());
	fillLauncher.setConst(n);
	fillLauncher.setConst(value);
	fillLauncher.setConst(offset);
	fillLauncher.launch1D(n);
}
///CPU fallback: assigns 'value' to n consecutive b3Int2 elements of 'src',
///starting at 'offset'.
void b3FillCL::executeHost(b3AlignedObjectArray<b3Int2>& src, const b3Int2& value, int n, int offset)
{
	for (int idx = offset; idx < offset + n; idx++)
	{
		src[idx] = value;
	}
}
///CPU fallback: assigns 'value' to n consecutive int elements of 'src',
///starting at 'offset'.
void b3FillCL::executeHost(b3AlignedObjectArray<int>& src, const int value, int n, int offset)
{
	for (int idx = offset; idx < offset + n; idx++)
	{
		src[idx] = value;
	}
}
///Fills n b3Int2 elements of 'src' with 'value' on the GPU, starting at 'offset'.
void b3FillCL::execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset)
{
	b3Assert(n > 0);

	b3LauncherCL fillLauncher(m_commandQueue, m_fillKernelInt2, "m_fillKernelInt2");
	fillLauncher.setBuffer(src.getBufferCL());
	fillLauncher.setConst(n);
	fillLauncher.setConst(value);
	fillLauncher.setConst(offset);
	fillLauncher.launch1D(n);
}

View file

@ -1,52 +0,0 @@
#ifndef B3_FILL_CL_H
#define B3_FILL_CL_H
#include "b3OpenCLArray.h"
#include "Bullet3Common/b3Scalar.h"
#include "Bullet3Common/shared/b3Int2.h"
#include "Bullet3Common/shared/b3Int4.h"
class b3FillCL
{
cl_command_queue m_commandQueue;
cl_kernel m_fillKernelInt2;
cl_kernel m_fillIntKernel;
cl_kernel m_fillUnsignedIntKernel;
cl_kernel m_fillFloatKernel;
public:
struct b3ConstData
{
union {
b3Int4 m_data;
b3UnsignedInt4 m_UnsignedData;
};
int m_offset;
int m_n;
int m_padding[2];
};
protected:
public:
b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
virtual ~b3FillCL();
void execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);
void execute(b3OpenCLArray<int>& src, const int value, int n, int offset = 0);
void execute(b3OpenCLArray<float>& src, const float value, int n, int offset = 0);
void execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset = 0);
void executeHost(b3AlignedObjectArray<b3Int2>& src, const b3Int2& value, int n, int offset);
void executeHost(b3AlignedObjectArray<int>& src, const int value, int n, int offset);
// void execute(b3OpenCLArray<b3Int4>& src, const b3Int4& value, int n, int offset = 0);
};
#endif //B3_FILL_CL_H

View file

@ -1,296 +0,0 @@
#include "b3LauncherCL.h"
//Set to true to printf every kernel launch (prepare/finish) for debugging.
bool gDebugLauncherCL = false;
///Prepares a launch of 'kernel' on 'queue'; arguments are bound afterwards via
///setBuffer/setBuffers/setConst, in kernel-argument order. 'name' is used for
///debug logging and serialization.
b3LauncherCL::b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name)
: m_commandQueue(queue),
m_kernel(kernel),
m_idx(0),
m_enableSerialization(false),
m_name(name)
{
if (gDebugLauncherCL)
{
//NOTE(review): function-local static counter is not synchronized; fine for
//single-threaded debugging only
static int counter = 0;
printf("[%d] Prepare to launch OpenCL kernel %s\n", counter++, name);
}
//the serialized form starts with room for one leading int
m_serializationSizeInBytes = sizeof(int);
}
b3LauncherCL::~b3LauncherCL()
{
	// Release any temporary GPU arrays created by deserializeArgs().
	while (m_arrays.size())
	{
		delete m_arrays[m_arrays.size() - 1];
		m_arrays.pop_back();
	}

	if (gDebugLauncherCL)
	{
		// Shared across all launcher instances; trace output only.
		static int counter = 0;
		printf("[%d] Finished launching OpenCL kernel %s\n", counter++, m_name);
	}
}
// Binds 'clBuffer' as the next kernel argument. When serialization is enabled,
// also records the argument (with the buffer's byte size) for later replay.
void b3LauncherCL::setBuffer(cl_mem clBuffer)
{
	if (m_enableSerialization)
	{
		b3KernelArgData arg;
		arg.m_argIndex = m_idx;
		arg.m_isBuffer = 1;
		arg.m_clBuffer = clBuffer;

		// Query the buffer size so its full contents can be serialized later.
		size_t bufferSize = 0;
		size_t bytesReturned = 0;
		cl_int err = clGetMemObjectInfo(arg.m_clBuffer,
										CL_MEM_SIZE,
										sizeof(size_t),
										&bufferSize,
										&bytesReturned);
		b3Assert(err == CL_SUCCESS);
		arg.m_argSizeInBytes = bufferSize;

		m_kernelArguments.push_back(arg);
		m_serializationSizeInBytes += sizeof(b3KernelArgData);
		m_serializationSizeInBytes += bufferSize;
	}

	cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
	b3Assert(status == CL_SUCCESS);
}
// Binds 'n' buffers (from 'buffInfo') as consecutive kernel arguments,
// recording each one when serialization is enabled.
void b3LauncherCL::setBuffers(b3BufferInfoCL* buffInfo, int n)
{
	for (int bufferIndex = 0; bufferIndex < n; bufferIndex++)
	{
		cl_mem clBuffer = buffInfo[bufferIndex].m_clBuffer;

		if (m_enableSerialization)
		{
			b3KernelArgData arg;
			arg.m_argIndex = m_idx;
			arg.m_isBuffer = 1;
			arg.m_clBuffer = clBuffer;

			// Query the buffer size so its full contents can be serialized later.
			size_t bufferSize = 0;
			size_t bytesReturned = 0;
			cl_int err = clGetMemObjectInfo(arg.m_clBuffer,
											CL_MEM_SIZE,
											sizeof(size_t),
											&bufferSize,
											&bytesReturned);
			b3Assert(err == CL_SUCCESS);
			arg.m_argSizeInBytes = bufferSize;

			m_kernelArguments.push_back(arg);
			m_serializationSizeInBytes += sizeof(b3KernelArgData);
			m_serializationSizeInBytes += bufferSize;
		}

		cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
		b3Assert(status == CL_SUCCESS);
	}
}
// Unaligned twin of b3KernelArgData, used to read argument records out of a
// packed serialization byte stream (which carries no alignment guarantees).
// Field layout must stay identical to b3KernelArgData: deserializeArgs()
// memcpy's between the two types.
struct b3KernelArgDataUnaligned
{
int m_isBuffer;
int m_argIndex;
int m_argSizeInBytes;
int m_unusedPadding;
union {
cl_mem m_clBuffer;
unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
};
};
#include <string.h>
// Rebuilds kernel arguments from a serialized byte stream (as produced by
// serializeArguments): [int numArguments][b3KernelArgData (+ raw buffer bytes
// for buffer args)]... . Buffer arguments get fresh GPU arrays (owned by
// m_arrays, freed in the destructor). Returns the number of bytes consumed.
int b3LauncherCL::deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx)
{
int index = 0;
int numArguments = *(int*)&buf[index];
index += sizeof(int);
for (int i = 0; i < numArguments; i++)
{
// Read through the unaligned view: 'buf + index' may not be aligned.
b3KernelArgDataUnaligned* arg = (b3KernelArgDataUnaligned*)&buf[index];
index += sizeof(b3KernelArgData);
if (arg->m_isBuffer)
{
// Upload the serialized buffer contents into a new GPU array and bind it.
b3OpenCLArray<unsigned char>* clData = new b3OpenCLArray<unsigned char>(ctx, m_commandQueue, arg->m_argSizeInBytes);
clData->resize(arg->m_argSizeInBytes);
clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes);
arg->m_clBuffer = clData->getBufferCL();
m_arrays.push_back(clData);
cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
b3Assert(status == CL_SUCCESS);
index += arg->m_argSizeInBytes;
}
else
{
// Constant argument: pass the stored bytes directly.
cl_int status = clSetKernelArg(m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData);
b3Assert(status == CL_SUCCESS);
}
// Keep an aligned copy of the record for later validation/serialization.
b3KernelArgData b;
memcpy(&b, arg, sizeof(b3KernelArgDataUnaligned));
m_kernelArguments.push_back(b);
}
m_serializationSizeInBytes = index;
return index;
}
// Compares the current kernel arguments (including GPU buffer contents read
// back to the host) against a previously serialized "gold" byte stream.
// Returns the number of bytes consumed on success, or a negative error code
// on the first mismatch (-1 arg count, -2 size, -3 kind, -4 buffer bytes,
// -5 constant bytes).
int b3LauncherCL::validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx)
{
	int index = 0;

	int numArguments = *(int*)&goldBuffer[index];
	index += sizeof(int);

	if (numArguments != m_kernelArguments.size())
	{
		printf("failed validation: expected %d arguments, found %d\n", numArguments, m_kernelArguments.size());
		return -1;
	}

	for (int ii = 0; ii < numArguments; ii++)
	{
		b3KernelArgData* argGold = (b3KernelArgData*)&goldBuffer[index];

		if (m_kernelArguments[ii].m_argSizeInBytes != argGold->m_argSizeInBytes)
		{
			printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n", ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes);
			return -2;
		}

		{
			int expected = argGold->m_isBuffer;
			int found = m_kernelArguments[ii].m_isBuffer;

			if (expected != found)
			{
				printf("failed validation: argument %d isBuffer expected: %d, found %d\n", ii, expected, found);
				return -3;
			}
		}
		index += sizeof(b3KernelArgData);

		if (argGold->m_isBuffer)
		{
			int argSize = m_kernelArguments[ii].m_argSizeInBytes;
			unsigned char* goldBuf = &goldBuffer[index];

			unsigned char* memBuf = (unsigned char*)malloc(argSize);
			// BUGFIX: the original loop was bounded by m_kernelArguments[j]
			// (indexed by its own counter j), reading out of range and
			// initializing the wrong number of bytes.
			for (int j = 0; j < argSize; j++)
			{
				memBuf[j] = 0xaa;
			}

			// Blocking read-back of the GPU buffer for byte-wise comparison.
			cl_int status = clEnqueueReadBuffer(m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, argSize,
												memBuf, 0, 0, 0);
			b3Assert(status == CL_SUCCESS);
			clFinish(m_commandQueue);

			for (int b = 0; b < argSize; b++)
			{
				int expected = goldBuf[b];
				int found = memBuf[b];
				if (expected != found)
				{
					printf("failed validation: argument %d OpenCL data at byte position %d expected: %d, found %d\n",
						   ii, b, expected, found);
					// BUGFIX: release the staging buffer on the error path.
					free(memBuf);
					return -4;
				}
			}
			// BUGFIX: the original leaked memBuf for every buffer argument.
			free(memBuf);

			index += argGold->m_argSizeInBytes;
		}
		else
		{
			//compare content
			for (int b = 0; b < m_kernelArguments[ii].m_argSizeInBytes; b++)
			{
				int expected = argGold->m_argData[b];
				int found = m_kernelArguments[ii].m_argData[b];
				if (expected != found)
				{
					printf("failed validation: argument %d const data at byte position %d expected: %d, found %d\n",
						   ii, b, expected, found);
					return -5;
				}
			}
		}
	}
	return index;
}
// Writes the recorded kernel arguments into destBuffer and returns the number
// of bytes written. Layout: [int numArguments] then, per argument, a
// b3KernelArgData record followed (for buffer arguments) by the raw buffer
// contents read back from the GPU.
int b3LauncherCL::serializeArguments(unsigned char* destBuffer, int destBufferCapacity)
{
//initialize to known values
for (int i = 0; i < destBufferCapacity; i++)
destBuffer[i] = 0xec;
assert(destBufferCapacity >= m_serializationSizeInBytes);
//todo: use the b3Serializer for this to allow for 32/64bit, endianness etc
int numArguments = m_kernelArguments.size();
int curBufferSize = 0;
int* dest = (int*)&destBuffer[curBufferSize];
*dest = numArguments;
curBufferSize += sizeof(int);
for (int i = 0; i < this->m_kernelArguments.size(); i++)
{
// The argument record is written in place, then the buffer payload follows.
b3KernelArgData* arg = (b3KernelArgData*)&destBuffer[curBufferSize];
*arg = m_kernelArguments[i];
curBufferSize += sizeof(b3KernelArgData);
if (arg->m_isBuffer == 1)
{
//copy the OpenCL buffer content
// Non-blocking enqueue followed by clFinish, so the bytes are in place
// before the offset advances.
cl_int status = 0;
status = clEnqueueReadBuffer(m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes,
&destBuffer[curBufferSize], 0, 0, 0);
b3Assert(status == CL_SUCCESS);
clFinish(m_commandQueue);
curBufferSize += arg->m_argSizeInBytes;
}
}
return curBufferSize;
}
void b3LauncherCL::serializeToFile(const char* fileName, int numWorkItems)
{
int num = numWorkItems;
int buffSize = getSerializationBufferSize();
unsigned char* buf = new unsigned char[buffSize + sizeof(int)];
for (int i = 0; i < buffSize + 1; i++)
{
unsigned char* ptr = (unsigned char*)&buf[i];
*ptr = 0xff;
}
// int actualWrite = serializeArguments(buf,buffSize);
// unsigned char* cptr = (unsigned char*)&buf[buffSize];
// printf("buf[buffSize] = %d\n",*cptr);
assert(buf[buffSize] == 0xff); //check for buffer overrun
int* ptr = (int*)&buf[buffSize];
*ptr = num;
FILE* f = fopen(fileName, "wb");
fwrite(buf, buffSize + sizeof(int), 1, f);
fclose(f);
delete[] buf;
}

View file

@ -1,128 +0,0 @@
#ifndef B3_LAUNCHER_CL_H
#define B3_LAUNCHER_CL_H
#include "b3BufferInfoCL.h"
#include "Bullet3Common/b3MinMax.h"
#include "b3OpenCLArray.h"
#include <stdio.h>
#define B3_DEBUG_SERIALIZE_CL
#ifdef _WIN32
#pragma warning(disable : 4996)
#endif
#define B3_CL_MAX_ARG_SIZE 16
// One recorded kernel argument: either a cl_mem buffer handle or up to
// B3_CL_MAX_ARG_SIZE bytes of constant data. 16-byte aligned; layout must
// match b3KernelArgDataUnaligned in b3LauncherCL.cpp, which is memcpy'd into
// this type during deserialization.
B3_ATTRIBUTE_ALIGNED16(struct)
b3KernelArgData
{
int m_isBuffer;
int m_argIndex;
int m_argSizeInBytes;
int m_unusedPadding;
union {
cl_mem m_clBuffer;
unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
};
};
// Convenience wrapper for setting kernel arguments and enqueueing an OpenCL
// kernel, with optional record/replay serialization of all arguments
// (buffers and constants) for offline debugging.
class b3LauncherCL
{
cl_command_queue m_commandQueue;
cl_kernel m_kernel;
// Index of the next kernel argument to set.
int m_idx;
b3AlignedObjectArray<b3KernelArgData> m_kernelArguments;
int m_serializationSizeInBytes;
bool m_enableSerialization;
const char* m_name;
public:
// GPU arrays created during deserializeArgs(); owned and freed by the destructor.
b3AlignedObjectArray<b3OpenCLArray<unsigned char>*> m_arrays;
b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name);
virtual ~b3LauncherCL();
// Bind a single buffer / several buffers as the next kernel argument(s).
void setBuffer(cl_mem clBuffer);
void setBuffers(b3BufferInfoCL* buffInfo, int n);
int getSerializationBufferSize() const
{
return m_serializationSizeInBytes;
}
// Replay/verification of a serialized argument stream (see b3LauncherCL.cpp).
int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx);
inline int validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx);
int serializeArguments(unsigned char* destBuffer, int destBufferCapacity);
int getNumArguments() const
{
return m_kernelArguments.size();
}
b3KernelArgData getArgument(int index)
{
return m_kernelArguments[index];
}
void serializeToFile(const char* fileName, int numWorkItems);
// Bind a constant (by-value) argument; at most B3_CL_MAX_ARG_SIZE bytes.
template <typename T>
inline void setConst(const T& consts)
{
int sz = sizeof(T);
b3Assert(sz <= B3_CL_MAX_ARG_SIZE);
if (m_enableSerialization)
{
b3KernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 0;
T* destArg = (T*)kernelArg.m_argData;
*destArg = consts;
kernelArg.m_argSizeInBytes = sizeof(T);
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes += sizeof(b3KernelArgData);
}
cl_int status = clSetKernelArg(m_kernel, m_idx++, sz, &consts);
b3Assert(status == CL_SUCCESS);
}
inline void launch1D(int numThreads, int localSize = 64)
{
launch2D(numThreads, 1, localSize, 1);
}
// Enqueue the kernel on a 2D range; the global range is rounded up to a
// multiple of the local (work-group) size in each dimension.
inline void launch2D(int numThreadsX, int numThreadsY, int localSizeX, int localSizeY)
{
size_t gRange[3] = {1, 1, 1};
size_t lRange[3] = {1, 1, 1};
lRange[0] = localSizeX;
lRange[1] = localSizeY;
gRange[0] = b3Max((size_t)1, (numThreadsX / lRange[0]) + (!(numThreadsX % lRange[0]) ? 0 : 1));
gRange[0] *= lRange[0];
gRange[1] = b3Max((size_t)1, (numThreadsY / lRange[1]) + (!(numThreadsY % lRange[1]) ? 0 : 1));
gRange[1] *= lRange[1];
cl_int status = clEnqueueNDRangeKernel(m_commandQueue,
m_kernel, 2, NULL, gRange, lRange, 0, 0, 0);
if (status != CL_SUCCESS)
{
printf("Error: OpenCL status = %d\n", status);
}
b3Assert(status == CL_SUCCESS);
}
void enableSerialization(bool serialize)
{
m_enableSerialization = serialize;
}
};
#endif //B3_LAUNCHER_CL_H

View file

@ -1,300 +0,0 @@
#ifndef B3_OPENCL_ARRAY_H
#define B3_OPENCL_ARRAY_H
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
// GPU-resident dynamic array backed by a cl_mem buffer, mirroring the
// b3AlignedObjectArray API (size/capacity/resize/reserve) plus host<->device
// and device<->device copy helpers. Element reads/writes go through the
// command queue, so per-element access (at/forcedAt/push_back) is expensive.
template <typename T>
class b3OpenCLArray
{
size_t m_size;
size_t m_capacity;
cl_mem m_clBuffer;
cl_context m_clContext;
cl_command_queue m_commandQueue;
// False when wrapping a foreign buffer via setFromOpenCLBuffer().
bool m_ownsMemory;
bool m_allowGrowingCapacity;
// Release the cl_mem if we own it; leaves m_size untouched (callers reset it).
void deallocate()
{
if (m_clBuffer && m_ownsMemory)
{
clReleaseMemObject(m_clBuffer);
}
m_clBuffer = 0;
m_capacity = 0;
}
// Deliberately undefined: prevents accidental deep copies.
b3OpenCLArray<T>& operator=(const b3OpenCLArray<T>& src);
// Growth policy: double the requested size (minimum 1 element).
B3_FORCE_INLINE size_t allocSize(size_t size)
{
return (size ? size * 2 : 1);
}
public:
b3OpenCLArray(cl_context ctx, cl_command_queue queue, size_t initialCapacity = 0, bool allowGrowingCapacity = true)
: m_size(0), m_capacity(0), m_clBuffer(0), m_clContext(ctx), m_commandQueue(queue), m_ownsMemory(true), m_allowGrowingCapacity(true)
{
if (initialCapacity)
{
reserve(initialCapacity);
}
m_allowGrowingCapacity = allowGrowingCapacity;
}
///this is an error-prone method with no error checking, be careful!
// Wraps an externally owned buffer: this array will neither grow nor release it.
void setFromOpenCLBuffer(cl_mem buffer, size_t sizeInElements)
{
deallocate();
m_ownsMemory = false;
m_allowGrowingCapacity = false;
m_clBuffer = buffer;
m_size = sizeInElements;
m_capacity = sizeInElements;
}
// we could enable this assignment, but need to make sure to avoid accidental deep copies
//	b3OpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src)
//	{
//		copyFromArray(src);
//		return *this;
//	}
cl_mem getBufferCL() const
{
return m_clBuffer;
}
virtual ~b3OpenCLArray()
{
deallocate();
m_size = 0;
m_capacity = 0;
}
// Appends one element; involves a host->device transfer per call.
B3_FORCE_INLINE bool push_back(const T& _Val, bool waitForCompletion = true)
{
bool result = true;
size_t sz = size();
if (sz == capacity())
{
result = reserve(allocSize(size()));
}
copyFromHostPointer(&_Val, 1, sz, waitForCompletion);
m_size++;
return result;
}
// Reads element n even if it lies beyond size() (but within capacity()).
B3_FORCE_INLINE T forcedAt(size_t n) const
{
b3Assert(n >= 0);
b3Assert(n < capacity());
T elem;
copyToHostPointer(&elem, 1, n, true);
return elem;
}
// Blocking single-element read from the GPU.
B3_FORCE_INLINE T at(size_t n) const
{
b3Assert(n >= 0);
b3Assert(n < size());
T elem;
copyToHostPointer(&elem, 1, n, true);
return elem;
}
// Shrinking never releases GPU memory; growing may reallocate (see reserve).
// New elements are left uninitialized.
B3_FORCE_INLINE bool resize(size_t newsize, bool copyOldContents = true)
{
bool result = true;
size_t curSize = size();
if (newsize < curSize)
{
//leave the OpenCL memory for now
}
else
{
if (newsize > size())
{
result = reserve(newsize, copyOldContents);
}
//leave new data uninitialized (init in debug mode?)
//for (size_t i=curSize;i<newsize;i++) ...
}
if (result)
{
m_size = newsize;
}
else
{
m_size = 0;
}
return result;
}
B3_FORCE_INLINE size_t size() const
{
return m_size;
}
B3_FORCE_INLINE size_t capacity() const
{
return m_capacity;
}
// Ensures capacity for _Count elements, optionally copying the old contents
// into the new buffer. Fails (returns false) on allocation error or when
// growth is disallowed.
B3_FORCE_INLINE bool reserve(size_t _Count, bool copyOldContents = true)
{
bool result = true;
// determine new minimum length of allocated storage
if (capacity() < _Count)
{  // not enough room, reallocate
if (m_allowGrowingCapacity)
{
cl_int ciErrNum;
//create a new OpenCL buffer
size_t memSizeInBytes = sizeof(T) * _Count;
cl_mem buf = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, memSizeInBytes, NULL, &ciErrNum);
if (ciErrNum != CL_SUCCESS)
{
b3Error("OpenCL out-of-memory\n");
_Count = 0;
result = false;
}
//#define B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
#ifdef B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
unsigned char* src = (unsigned char*)malloc(memSizeInBytes);
for (size_t i = 0; i < memSizeInBytes; i++)
src[i] = 0xbb;
ciErrNum = clEnqueueWriteBuffer(m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0, 0, 0);
b3Assert(ciErrNum == CL_SUCCESS);
clFinish(m_commandQueue);
free(src);
#endif  //B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
if (result)
{
if (copyOldContents)
copyToCL(buf, size());
}
//deallocate the old buffer
deallocate();
m_clBuffer = buf;
m_capacity = _Count;
}
else
{
//fail: assert and
b3Assert(0);
deallocate();
result = false;
}
}
return result;
}
// Device-to-device copy of numElements elements into 'destination'.
void copyToCL(cl_mem destination, size_t numElements, size_t firstElem = 0, size_t dstOffsetInElems = 0) const
{
if (numElements <= 0)
return;
b3Assert(m_clBuffer);
b3Assert(destination);
//likely some error, destination is same as source
b3Assert(m_clBuffer != destination);
b3Assert((firstElem + numElements) <= m_size);
cl_int status = 0;
b3Assert(numElements > 0);
b3Assert(numElements <= m_size);
size_t srcOffsetBytes = sizeof(T) * firstElem;
size_t dstOffsetInBytes = sizeof(T) * dstOffsetInElems;
status = clEnqueueCopyBuffer(m_commandQueue, m_clBuffer, destination,
srcOffsetBytes, dstOffsetInBytes, sizeof(T) * numElements, 0, 0, 0);
b3Assert(status == CL_SUCCESS);
}
// Resizes to match srcArray and uploads its contents (old contents dropped).
void copyFromHost(const b3AlignedObjectArray<T>& srcArray, bool waitForCompletion = true)
{
size_t newSize = srcArray.size();
bool copyOldContents = false;
resize(newSize, copyOldContents);
if (newSize)
copyFromHostPointer(&srcArray[0], newSize, 0, waitForCompletion);
}
// Uploads numElems elements from host memory starting at element destFirstElem.
void copyFromHostPointer(const T* src, size_t numElems, size_t destFirstElem = 0, bool waitForCompletion = true)
{
b3Assert(numElems + destFirstElem <= capacity());
if (numElems + destFirstElem)
{
cl_int status = 0;
size_t sizeInBytes = sizeof(T) * numElems;
status = clEnqueueWriteBuffer(m_commandQueue, m_clBuffer, 0, sizeof(T) * destFirstElem, sizeInBytes,
src, 0, 0, 0);
b3Assert(status == CL_SUCCESS);
if (waitForCompletion)
clFinish(m_commandQueue);
}
else
{
b3Error("copyFromHostPointer invalid range\n");
}
}
// Downloads the whole array into destArray (resized to match).
void copyToHost(b3AlignedObjectArray<T>& destArray, bool waitForCompletion = true) const
{
destArray.resize(this->size());
if (size())
copyToHostPointer(&destArray[0], size(), 0, waitForCompletion);
}
// Downloads numElem elements starting at srcFirstElem into host memory.
void copyToHostPointer(T* destPtr, size_t numElem, size_t srcFirstElem = 0, bool waitForCompletion = true) const
{
b3Assert(numElem + srcFirstElem <= capacity());
if (numElem + srcFirstElem <= capacity())
{
cl_int status = 0;
status = clEnqueueReadBuffer(m_commandQueue, m_clBuffer, 0, sizeof(T) * srcFirstElem, sizeof(T) * numElem,
destPtr, 0, 0, 0);
b3Assert(status == CL_SUCCESS);
if (waitForCompletion)
clFinish(m_commandQueue);
}
else
{
b3Error("copyToHostPointer invalid range\n");
}
}
// Device-to-device clone of another array's contents (resizes this array).
void copyFromOpenCLArray(const b3OpenCLArray& src)
{
size_t newSize = src.size();
resize(newSize);
if (size())
{
src.copyToCL(m_clBuffer, size());
}
}
};
#endif //B3_OPENCL_ARRAY_H

View file

@ -1,120 +0,0 @@
#include "b3PrefixScanCL.h"
#include "b3FillCL.h"
#define B3_PREFIXSCAN_PROG_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl"
#include "b3LauncherCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "kernels/PrefixScanKernelsCL.h"
// Compiles the three scan kernels (local scan, top-level block-sum scan,
// offset propagation) and allocates a scratch buffer of 'size' elements.
b3PrefixScanCL::b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
: m_commandQueue(queue)
{
const char* scanKernelSource = prefixScanKernelsCL;
cl_int pErrNum;
char* additionalMacros = 0;
// Holds per-work-group partial sums between the scan passes.
m_workBuffer = new b3OpenCLArray<unsigned int>(ctx, queue, size);
cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, scanKernelSource, &pErrNum, additionalMacros, B3_PREFIXSCAN_PROG_PATH);
b3Assert(scanProg);
m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_localScanKernel);
m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_blockSumKernel);
m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_propagationKernel);
}
b3PrefixScanCL::~b3PrefixScanCL()
{
	// Release the three compiled kernels, then the scratch buffer.
	clReleaseKernel(m_localScanKernel);
	clReleaseKernel(m_blockSumKernel);
	clReleaseKernel(m_propagationKernel);
	delete m_workBuffer;
}
template <class T>
T b3NextPowerOf2(T n)
{
n -= 1;
for (int i = 0; i < sizeof(T) * 8; i++)
n = n | (n >> i);
return n + 1;
}
// GPU exclusive prefix scan: dst[i] = src[0] + ... + src[i-1]. If 'sum' is
// non-null it receives dst[n-1] (the sum of the first n-1 elements).
// Three passes: per-block local scan, scan of the per-block sums, then
// propagation of the block offsets.
void b3PrefixScanCL::execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum)
{
//	b3Assert( data->m_option == EXCLUSIVE );
// Each work-group scans BLOCK_SIZE*2 elements.
const unsigned int numBlocks = (const unsigned int)((n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2));
dst.resize(src.size());
m_workBuffer->resize(src.size());
b3Int4 constBuffer;
constBuffer.x = n;
constBuffer.y = numBlocks;
constBuffer.z = (int)b3NextPowerOf2(numBlocks);
b3OpenCLArray<unsigned int>* srcNative = &src;
b3OpenCLArray<unsigned int>* dstNative = &dst;
{
// Pass 1: scan each block locally; per-block totals go to m_workBuffer.
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(srcNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_localScanKernel, "m_localScanKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D(numBlocks * BLOCK_SIZE, BLOCK_SIZE);
}
{
// Pass 2: scan the per-block totals in a single work-group.
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_blockSumKernel, "m_blockSumKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D(BLOCK_SIZE, BLOCK_SIZE);
}
if (numBlocks > 1)
{
// Pass 3: add each block's scanned offset to its elements.
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_propagationKernel, "m_propagationKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D((numBlocks - 1) * BLOCK_SIZE, BLOCK_SIZE);
}
if (sum)
{
// Blocking read of the last scanned value.
clFinish(m_commandQueue);
dstNative->copyToHostPointer(sum, 1, n - 1, true);
}
}
// CPU exclusive prefix scan: dst[i] = src[0] + ... + src[i-1].
// When 'sum' is non-null it receives dst[n-1] (matching the GPU path).
void b3PrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum)
{
	unsigned int runningTotal = 0;
	for (int i = 0; i < n; i++)
	{
		unsigned int current = src[i];
		dst[i] = runningTotal;
		runningTotal += current;
	}
	if (sum)
	{
		*sum = dst[n - 1];
	}
}

View file

@ -1,35 +0,0 @@
#ifndef B3_PREFIX_SCAN_CL_H
#define B3_PREFIX_SCAN_CL_H
#include "b3OpenCLArray.h"
#include "b3BufferInfoCL.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
// GPU exclusive prefix scan over unsigned int elements, with a CPU fallback.
class b3PrefixScanCL
{
enum
{
// Work-group size; each group scans BLOCK_SIZE*2 elements per pass.
BLOCK_SIZE = 128
};
//	Option	m_option;
cl_command_queue m_commandQueue;
cl_kernel m_localScanKernel;
cl_kernel m_blockSumKernel;
cl_kernel m_propagationKernel;
// Scratch buffer for per-work-group partial sums.
b3OpenCLArray<unsigned int>* m_workBuffer;
public:
b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size = 0);
virtual ~b3PrefixScanCL();
// Exclusive scan on GPU / CPU; 'sum' (optional) receives dst[n-1].
void execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0);
void executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum = 0);
};
#endif //B3_PREFIX_SCAN_CL_H

View file

@ -1,120 +0,0 @@
#include "b3PrefixScanFloat4CL.h"
#include "b3FillCL.h"
#define B3_PREFIXSCAN_FLOAT4_PROG_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl"
#include "b3LauncherCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "kernels/PrefixScanKernelsFloat4CL.h"
// Compiles the three float4 scan kernels (local scan, top-level block-sum
// scan, offset propagation) and allocates a scratch buffer of 'size' elements.
b3PrefixScanFloat4CL::b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
: m_commandQueue(queue)
{
const char* scanKernelSource = prefixScanKernelsFloat4CL;
cl_int pErrNum;
char* additionalMacros = 0;
// Holds per-work-group partial sums between the scan passes.
m_workBuffer = new b3OpenCLArray<b3Vector3>(ctx, queue, size);
cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, scanKernelSource, &pErrNum, additionalMacros, B3_PREFIXSCAN_FLOAT4_PROG_PATH);
b3Assert(scanProg);
m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_localScanKernel);
m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_blockSumKernel);
m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_propagationKernel);
}
b3PrefixScanFloat4CL::~b3PrefixScanFloat4CL()
{
	// Release the three compiled kernels, then the scratch buffer.
	clReleaseKernel(m_localScanKernel);
	clReleaseKernel(m_blockSumKernel);
	clReleaseKernel(m_propagationKernel);
	delete m_workBuffer;
}
template <class T>
T b3NextPowerOf2(T n)
{
n -= 1;
for (int i = 0; i < sizeof(T) * 8; i++)
n = n | (n >> i);
return n + 1;
}
// GPU exclusive prefix scan over b3Vector3 (float4) elements:
// dst[i] = src[0] + ... + src[i-1]. If 'sum' is non-null it receives
// dst[n-1]. Three passes: per-block local scan, scan of the per-block sums,
// then propagation of the block offsets.
void b3PrefixScanFloat4CL::execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<b3Vector3>& dst, int n, b3Vector3* sum)
{
//	b3Assert( data->m_option == EXCLUSIVE );
// Each work-group scans BLOCK_SIZE*2 elements.
const unsigned int numBlocks = (const unsigned int)((n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2));
dst.resize(src.size());
m_workBuffer->resize(src.size());
b3Int4 constBuffer;
constBuffer.x = n;
constBuffer.y = numBlocks;
constBuffer.z = (int)b3NextPowerOf2(numBlocks);
b3OpenCLArray<b3Vector3>* srcNative = &src;
b3OpenCLArray<b3Vector3>* dstNative = &dst;
{
// Pass 1: scan each block locally; per-block totals go to m_workBuffer.
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(srcNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_localScanKernel, "m_localScanKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D(numBlocks * BLOCK_SIZE, BLOCK_SIZE);
}
{
// Pass 2: scan the per-block totals in a single work-group.
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_blockSumKernel, "m_blockSumKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D(BLOCK_SIZE, BLOCK_SIZE);
}
if (numBlocks > 1)
{
// Pass 3: add each block's scanned offset to its elements.
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_propagationKernel, "m_propagationKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D((numBlocks - 1) * BLOCK_SIZE, BLOCK_SIZE);
}
if (sum)
{
// Blocking read of the last scanned value.
clFinish(m_commandQueue);
dstNative->copyToHostPointer(sum, 1, n - 1, true);
}
}
// CPU exclusive prefix scan over b3Vector3: dst[i] = src[0] + ... + src[i-1].
// When 'sum' is non-null it receives dst[n-1] (matching the GPU path).
void b3PrefixScanFloat4CL::executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum)
{
	b3Vector3 runningTotal = b3MakeVector3(0, 0, 0);
	for (int i = 0; i < n; i++)
	{
		b3Vector3 current = src[i];
		dst[i] = runningTotal;
		runningTotal += current;
	}
	if (sum)
	{
		*sum = dst[n - 1];
	}
}

View file

@ -1,36 +0,0 @@
// BUGFIX: the original guard (B3_PREFIX_SCAN_CL_H) collided with the guard in
// b3PrefixScanCL.h, so whichever of the two headers was included second in a
// translation unit was silently skipped.
#ifndef B3_PREFIX_SCAN_FLOAT4_CL_H
#define B3_PREFIX_SCAN_FLOAT4_CL_H

#include "b3OpenCLArray.h"
#include "b3BufferInfoCL.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Common/b3Vector3.h"

// GPU exclusive prefix scan over b3Vector3 (float4) elements, with a CPU fallback.
class b3PrefixScanFloat4CL
{
	enum
	{
		// Work-group size; each group scans BLOCK_SIZE*2 elements per pass.
		BLOCK_SIZE = 128
	};

	cl_command_queue m_commandQueue;
	cl_kernel m_localScanKernel;
	cl_kernel m_blockSumKernel;
	cl_kernel m_propagationKernel;
	// Scratch buffer for per-work-group partial sums.
	b3OpenCLArray<b3Vector3>* m_workBuffer;

public:
	b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size = 0);
	virtual ~b3PrefixScanFloat4CL();
	// Exclusive scan on GPU / CPU; 'sum' (optional) receives dst[n-1].
	void execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<b3Vector3>& dst, int n, b3Vector3* sum = 0);
	void executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum);
};
#endif  //B3_PREFIX_SCAN_FLOAT4_CL_H

View file

@ -1,646 +0,0 @@
#include "b3RadixSort32CL.h"
#include "b3LauncherCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "b3PrefixScanCL.h"
#include "b3FillCL.h"
#define RADIXSORT32_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl"
#include "kernels/RadixSort32KernelsCL.h"
// Compiles the radix-sort kernels (choosing serial variants on CPU devices),
// creates the scratch buffers and the prefix-scan/fill helpers. If
// 'initialCapacity' > 0 the scratch buffers are pre-sized to avoid
// reallocation on the first sort.
b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
: m_commandQueue(queue)
{
b3OpenCLDeviceInfo info;
b3OpenCLUtils::getDeviceInfo(device, &info);
// CPU devices get serial sort/scatter kernel variants (selected below).
m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU) != 0;
m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx, queue);
m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx, queue);
m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx, queue);
m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx, queue);
m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx, queue);
m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx, queue);
if (initialCapacity > 0)
{
m_workBuffer1->resize(initialCapacity);
m_workBuffer3->resize(initialCapacity);
m_workBuffer3a->resize(initialCapacity);
m_workBuffer4->resize(initialCapacity);
m_workBuffer4a->resize(initialCapacity);
}
m_scan = new b3PrefixScanCL(ctx, device, queue);
m_fill = new b3FillCL(ctx, device, queue);
const char* additionalMacros = "";
cl_int pErrNum;
const char* kernelSource = radixSort32KernelsCL;
cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, RADIXSORT32_PATH);
b3Assert(sortProg);
m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_streamCountSortDataKernel);
m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_streamCountKernel);
if (m_deviceCPU)
{
m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg, additionalMacros);
b3Assert(m_sortAndScatterSortDataKernel);
m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg, additionalMacros);
b3Assert(m_sortAndScatterKernel);
}
else
{
m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_sortAndScatterSortDataKernel);
m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_sortAndScatterKernel);
}
m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_prefixScanKernel);
}
b3RadixSort32CL::~b3RadixSort32CL()
{
	// Release the compiled kernels, then the helpers and scratch buffers.
	clReleaseKernel(m_streamCountSortDataKernel);
	clReleaseKernel(m_streamCountKernel);
	clReleaseKernel(m_sortAndScatterSortDataKernel);
	clReleaseKernel(m_sortAndScatterKernel);
	clReleaseKernel(m_prefixScanKernel);

	delete m_scan;
	delete m_fill;
	delete m_workBuffer1;
	delete m_workBuffer2;
	delete m_workBuffer3;
	delete m_workBuffer3a;
	delete m_workBuffer4;
	delete m_workBuffer4a;
}
// CPU LSD radix sort of key/value pairs by m_key, 8 bits per pass, stable
// within each pass. 'sortBits' selects how many low-order key bits to sort on.
void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */)
{
	int n = inout.size();
	const int BITS_PER_PASS = 8;
	const int NUM_TABLES = (1 << BITS_PER_PASS);

	int tables[NUM_TABLES];
	int counter[NUM_TABLES];

	b3SortData* src = &inout[0];
	b3AlignedObjectArray<b3SortData> workbuffer;
	workbuffer.resize(inout.size());
	b3SortData* dst = &workbuffer[0];

	int count = 0;
	for (int startBit = 0; startBit < sortBits; startBit += BITS_PER_PASS)
	{
		// Histogram of the current 8-bit digit.
		for (int i = 0; i < NUM_TABLES; i++)
		{
			tables[i] = 0;
		}

		for (int i = 0; i < n; i++)
		{
			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
			tables[tableIdx]++;
		}
//#define TEST
#ifdef TEST
		printf("histogram size=%d\n", NUM_TABLES);
		for (int i = 0; i < NUM_TABLES; i++)
		{
			if (tables[i] != 0)
			{
				printf("tables[%d]=%d]\n", i, tables[i]);
			}
		}
// NOTE: the original '#endif //TEST \' carried a trailing backslash that
// spliced the following comment line into this one; removed.
#endif  //TEST
		// Exclusive prefix scan turns digit counts into destination offsets.
		int sum = 0;
		for (int i = 0; i < NUM_TABLES; i++)
		{
			int iData = tables[i];
			tables[i] = sum;
			sum += iData;
			counter[i] = 0;
		}

		// Stable distribution into the other buffer.
		for (int i = 0; i < n; i++)
		{
			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
			dst[tables[tableIdx] + counter[tableIdx]] = src[i];
			counter[tableIdx]++;
		}

		b3Swap(src, dst);
		count++;
	}

	// BUGFIX: after an odd number of passes the sorted data lives in the
	// scratch buffer ('src' after the final swap); copy it back into 'inout'
	// ('dst') instead of asserting as the original did.
	if (count & 1)
	{
		for (int i = 0; i < n; i++)
		{
			dst[i] = src[i];
		}
	}
}
// CPU sort of a GPU-resident array: round-trip through host memory and reuse
// the host implementation.
void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
{
	b3AlignedObjectArray<b3SortData> hostData;
	keyValuesInOut.copyToHost(hostData);
	executeHost(hostData, sortBits);
	keyValuesInOut.copyFromHost(hostData);
}
// Intentionally unimplemented stub: the separate key/value-array overload is
// not supported; use the b3SortData (packed key/value pair) overload instead.
void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
{
}
//#define DEBUG_RADIXSORT
//#define DEBUG_RADIXSORT2
//In-place GPU LSD radix sort of key/value pairs, processing 4 bits per pass
//(BITS_PER_PASS). Each pass runs three stages on the device:
// 1) per-workgroup histogram of the current 4-bit digit (stream count),
// 2) exclusive prefix scan of the histograms (fast kernel on GPU, fallback
// b3PrefixScanCL on CPU devices / OSX),
// 3) local sort + scatter into the destination buffer.
//The input is first padded up to a multiple of DATA_ALIGNMENT with
//0xffffffff keys so the padding sorts to the very end and can be trimmed
//off when the result is copied back.
void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
{
int originalSize = keyValuesInOut.size();
int workingSize = originalSize;
int dataAlignment = DATA_ALIGNMENT;
#ifdef DEBUG_RADIXSORT2
b3AlignedObjectArray<b3SortData> test2;
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n", test2.size());
for (int i = 0; i < test2.size(); i++)
{
printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
}
#endif //DEBUG_RADIXSORT2
b3OpenCLArray<b3SortData>* src = 0;
//Pad the working copy to DATA_ALIGNMENT. Padded entries get key 0xffffffff
//so they land after all real data once sorted.
if (workingSize % dataAlignment)
{
workingSize += dataAlignment - (workingSize % dataAlignment);
m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
m_workBuffer4->resize(workingSize);
b3SortData fillValue;
fillValue.m_key = 0xffffffff;
fillValue.m_value = 0xffffffff;
#define USE_BTFILL
#ifdef USE_BTFILL
m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4, (b3Int2&)fillValue, workingSize - originalSize, originalSize);
#else
//fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
for (int i = originalSize; i < workingSize; i++)
{
m_workBuffer4->copyFromHostPointer(&fillValue, 1, i);
}
#endif //USE_BTFILL
src = m_workBuffer4;
}
else
{
//Already aligned: sort directly in the caller's buffer. m_workBuffer4 is
//emptied so the copy-back step at the end knows no padding was used.
src = &keyValuesInOut;
m_workBuffer4->resize(0);
}
b3Assert(workingSize % DATA_ALIGNMENT == 0);
int minCap = NUM_BUCKET * NUM_WGS;
int n = workingSize;
m_workBuffer1->resize(minCap);
m_workBuffer3->resize(workingSize);
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
b3Assert(BITS_PER_PASS == 4);
b3Assert(WG_SIZE == 64);
b3Assert((sortBits & 0x3) == 0);
b3OpenCLArray<b3SortData>* dst = m_workBuffer3;
b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
int nWGs = NUM_WGS;
b3ConstData cdata;
//Work distribution: each of the (up to) NUM_WGS workgroups processes
//m_nBlocksPerWG blocks of ELEMENTS_PER_WORK_ITEM*WG_SIZE elements.
{
int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256
int nBlocks = (n + blockSize - 1) / (blockSize);
cdata.m_n = n;
cdata.m_nWGs = NUM_WGS;
cdata.m_startBit = 0;
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
if (nBlocks < NUM_WGS)
{
cdata.m_nBlocksPerWG = 1;
nWGs = nBlocks;
}
}
int count = 0;
//One iteration per 4-bit digit, from the least significant bit upward.
//src and dst ping-pong each pass via the b3Swap calls at the bottom.
for (int ib = 0; ib < sortBits; ib += 4)
{
#ifdef DEBUG_RADIXSORT2
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n", test2.size());
for (int i = 0; i < test2.size(); i++)
{
if (test2[i].m_key != test2[i].m_value)
{
printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
}
}
#endif //DEBUG_RADIXSORT2
cdata.m_startBit = ib;
//Stage 1: per-workgroup digit histogram into srcHisto.
if (src->size())
{
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel, "m_streamCountSortDataKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
int num = NUM_WGS * WG_SIZE;
launcher.launch1D(num, WG_SIZE);
}
#ifdef DEBUG_RADIXSORT
b3AlignedObjectArray<unsigned int> testHist;
srcHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
for (int i = 0; i < testHist.size(); i++)
{
if (testHist[i] != 0)
printf("testHist[%d]=%d\n", i, testHist[i]);
}
#endif //DEBUG_RADIXSORT
//Stage 2: exclusive prefix scan of the histogram.
//fast prefix scan is not working properly on Mac OSX yet
#ifdef __APPLE__
bool fastScan = false;
#else
bool fastScan = !m_deviceCPU; //only use fast scan on GPU
#endif
if (fastScan)
{ // prefix scan group histogram
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
launcher.launch1D(128, 128);
//The fast kernel scans in place, so the scanned histogram stays in srcHisto.
destHisto = srcHisto;
}
else
{
//unsigned int sum; //for debugging
m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum);
}
#ifdef DEBUG_RADIXSORT
destHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
for (int i = 0; i < testHist.size(); i++)
{
if (testHist[i] != 0)
printf("testHist[%d]=%d\n", i, testHist[i]);
}
for (int i = 0; i < testHist.size(); i += NUM_WGS)
{
printf("testHist[%d]=%d\n", i / NUM_WGS, testHist[i]);
}
#endif //DEBUG_RADIXSORT
//Stage 3: local sort and scatter. USE_GPU is the production path; the
//#else branch is a host-side reference implementation used to validate
//the kernel's scatter addressing.
#define USE_GPU
#ifdef USE_GPU
if (src->size())
{ // local sort and distribute
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_sortAndScatterSortDataKernel, "m_sortAndScatterSortDataKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
}
#else
{
#define NUM_TABLES 16
//#define SEQUENTIAL
#ifdef SEQUENTIAL
int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int tables[NUM_TABLES];
int startBit = ib;
destHisto->copyToHost(testHist);
b3AlignedObjectArray<b3SortData> srcHost;
b3AlignedObjectArray<b3SortData> dstHost;
dstHost.resize(src->size());
src->copyToHost(srcHost);
for (int i = 0; i < NUM_TABLES; i++)
{
tables[i] = testHist[i * NUM_WGS];
}
// distribute
for (int i = 0; i < n; i++)
{
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
counter2[tableIdx]++;
}
#else
int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int tables[NUM_TABLES];
b3AlignedObjectArray<b3SortData> dstHostOK;
dstHostOK.resize(src->size());
destHisto->copyToHost(testHist);
b3AlignedObjectArray<b3SortData> srcHost;
src->copyToHost(srcHost);
int blockSize = 256;
int nBlocksPerWG = cdata.m_nBlocksPerWG;
int startBit = ib;
//Reference scatter (sequential over the whole array) used as ground truth.
{
for (int i = 0; i < NUM_TABLES; i++)
{
tables[i] = testHist[i * NUM_WGS];
}
// distribute
for (int i = 0; i < n; i++)
{
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
counter2[tableIdx]++;
}
}
//Emulate the per-workgroup scatter the GPU kernel performs and compare
//every element against the reference result above.
b3AlignedObjectArray<b3SortData> dstHost;
dstHost.resize(src->size());
int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
for (int wgIdx = 0; wgIdx < NUM_WGS; wgIdx++)
{
int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int nBlocks = (n) / blockSize - nBlocksPerWG * wgIdx;
for (int iblock = 0; iblock < b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
{
for (int lIdx = 0; lIdx < 64; lIdx++)
{
int addr = iblock * blockSize + blockSize * cdata.m_nBlocksPerWG * wgIdx + ELEMENTS_PER_WORK_ITEM * lIdx;
// MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
// Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
// AMD: AtomInc performs better while NV prefers ++
for (int j = 0; j < ELEMENTS_PER_WORK_ITEM; j++)
{
if (addr + j < n)
{
// printf ("addr+j=%d\n", addr+j);
int i = addr + j;
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
int destIndex = testHist[tableIdx * NUM_WGS + wgIdx] + counter[tableIdx];
b3SortData ok = dstHostOK[destIndex];
if (ok.m_key != srcHost[i].m_key)
{
printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key, srcHost[i].m_key);
printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value, srcHost[i].m_value);
}
if (ok.m_value != srcHost[i].m_value)
{
printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value, srcHost[i].m_value);
printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key, srcHost[i].m_key);
}
dstHost[destIndex] = srcHost[i];
counter[tableIdx]++;
}
}
}
}
}
#endif //SEQUENTIAL
dst->copyFromHost(dstHost);
}
#endif //USE_GPU
#ifdef DEBUG_RADIXSORT
destHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
for (int i = 0; i < testHist.size(); i++)
{
if (testHist[i] != 0)
printf("testHist[%d]=%d\n", i, testHist[i]);
}
#endif //DEBUG_RADIXSORT
//Ping-pong the data and histogram buffers for the next pass.
b3Swap(src, dst);
b3Swap(srcHisto, destHisto);
#ifdef DEBUG_RADIXSORT2
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n", test2.size());
for (int i = 0; i < test2.size(); i++)
{
if (test2[i].m_key != test2[i].m_value)
{
printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
}
}
#endif //DEBUG_RADIXSORT2
count++;
}
//NOTE(review): an odd number of passes (sortBits == 4, 12, 20, 28) leaves
//the result in the temp buffer instead of the caller's array; that case is
//not supported and simply asserts here.
if (count & 1)
{
b3Assert(0); //need to copy from workbuffer to keyValuesInOut
}
//If we sorted the padded copy, trim the 0xffffffff padding (now at the end)
//and copy the sorted prefix back into the caller's buffer.
if (m_workBuffer4->size())
{
m_workBuffer4->resize(originalSize);
keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
}
#ifdef DEBUG_RADIXSORT
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n", test2.size());
for (int i = 0; i < test2.size(); i++)
{
printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
}
#endif
}
//Keys-only variant of the in-place GPU radix sort (same three-stage,
//4-bits-per-pass pipeline as the b3SortData overload, but using the
//keys-only histogram/scatter kernels and the unsigned-int work buffers).
void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
{
int originalSize = keysInOut.size();
int workingSize = originalSize;
int dataAlignment = DATA_ALIGNMENT;
b3OpenCLArray<unsigned int>* src = 0;
//Pad to DATA_ALIGNMENT with 0xffffffff so padding sorts last and can be
//trimmed off after the sort.
if (workingSize % dataAlignment)
{
workingSize += dataAlignment - (workingSize % dataAlignment);
m_workBuffer4a->copyFromOpenCLArray(keysInOut);
m_workBuffer4a->resize(workingSize);
unsigned int fillValue = 0xffffffff;
m_fill->execute(*m_workBuffer4a, fillValue, workingSize - originalSize, originalSize);
src = m_workBuffer4a;
}
else
{
//Already aligned: sort in place; empty m_workBuffer4a signals "no padding".
src = &keysInOut;
m_workBuffer4a->resize(0);
}
b3Assert(workingSize % DATA_ALIGNMENT == 0);
int minCap = NUM_BUCKET * NUM_WGS;
int n = workingSize;
m_workBuffer1->resize(minCap);
m_workBuffer3->resize(workingSize);
m_workBuffer3a->resize(workingSize);
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
b3Assert(BITS_PER_PASS == 4);
b3Assert(WG_SIZE == 64);
b3Assert((sortBits & 0x3) == 0);
b3OpenCLArray<unsigned int>* dst = m_workBuffer3a;
b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
int nWGs = NUM_WGS;
b3ConstData cdata;
//Work distribution: each workgroup handles m_nBlocksPerWG blocks of
//ELEMENTS_PER_WORK_ITEM*WG_SIZE elements.
{
int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256
int nBlocks = (n + blockSize - 1) / (blockSize);
cdata.m_n = n;
cdata.m_nWGs = NUM_WGS;
cdata.m_startBit = 0;
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
if (nBlocks < NUM_WGS)
{
cdata.m_nBlocksPerWG = 1;
nWGs = nBlocks;
}
}
int count = 0;
//One pass per 4-bit digit; src/dst ping-pong each iteration.
for (int ib = 0; ib < sortBits; ib += 4)
{
cdata.m_startBit = ib;
//Stage 1: per-workgroup digit histogram.
if (src->size())
{
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_streamCountKernel, "m_streamCountKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
int num = NUM_WGS * WG_SIZE;
launcher.launch1D(num, WG_SIZE);
}
//Stage 2: exclusive prefix scan of the histograms.
//fast prefix scan is not working properly on Mac OSX yet
#ifdef __APPLE__
bool fastScan = false;
#else
bool fastScan = !m_deviceCPU;
#endif
if (fastScan)
{ // prefix scan group histogram
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
launcher.launch1D(128, 128);
//Fast kernel scans in place.
destHisto = srcHisto;
}
else
{
//unsigned int sum; //for debugging
m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum);
}
//Stage 3: local sort and scatter into dst.
if (src->size())
{ // local sort and distribute
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_sortAndScatterKernel, "m_sortAndScatterKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
}
b3Swap(src, dst);
b3Swap(srcHisto, destHisto);
count++;
}
//NOTE(review): an odd pass count would leave the result in the temp buffer;
//not supported -- asserts instead of copying back.
if (count & 1)
{
b3Assert(0); //need to copy from workbuffer to keyValuesInOut
}
//Trim the padding and copy the sorted data back if a padded copy was used.
if (m_workBuffer4a->size())
{
m_workBuffer4a->resize(originalSize);
keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
}
}

View file

@ -1,84 +0,0 @@
#ifndef B3_RADIXSORT32_H
#define B3_RADIXSORT32_H
#include "b3OpenCLArray.h"
//8-byte key/value pair sorted by b3RadixSort32CL. The anonymous unions let
//the same fields be addressed either as (m_key, m_value) or as (x, y),
//matching the uint2-style access used by the OpenCL kernels.
struct b3SortData
{
union {
unsigned int m_key;
unsigned int x;
};
union {
unsigned int m_value;
unsigned int y;
};
};
#include "b3BufferInfoCL.h"
//GPU radix sort for 32-bit keys (optionally paired with 32-bit values),
//processing BITS_PER_PASS bits per pass via OpenCL kernels.
//Based on Takahiro Harada's parallel-primitives code.
class b3RadixSort32CL
{
//Scratch buffers, ping-ponged between passes:
b3OpenCLArray<unsigned int>* m_workBuffer1; //per-workgroup digit histograms
b3OpenCLArray<unsigned int>* m_workBuffer2; //scanned histograms
b3OpenCLArray<b3SortData>* m_workBuffer3; //key/value ping-pong buffer
b3OpenCLArray<b3SortData>* m_workBuffer4; //padded copy of key/value input
b3OpenCLArray<unsigned int>* m_workBuffer3a; //keys-only ping-pong buffer
b3OpenCLArray<unsigned int>* m_workBuffer4a; //padded copy of keys-only input
cl_command_queue m_commandQueue;
cl_kernel m_streamCountSortDataKernel; //histogram, key/value variant
cl_kernel m_streamCountKernel; //histogram, keys-only variant
cl_kernel m_prefixScanKernel; //fast in-place scan (GPU only)
cl_kernel m_sortAndScatterSortDataKernel; //scatter, key/value variant
cl_kernel m_sortAndScatterKernel; //scatter, keys-only variant
bool m_deviceCPU; //true on CPU devices: use the fallback prefix scan
class b3PrefixScanCL* m_scan; //fallback scan implementation
class b3FillCL* m_fill; //GPU buffer fill, used for padding
public:
//Per-pass constants passed to every kernel.
struct b3ConstData
{
int m_n; //number of (padded) elements
int m_nWGs; //number of workgroups
int m_startBit; //first bit of the digit sorted in this pass
int m_nBlocksPerWG; //blocks of BLOCK_SIZE elements per workgroup
};
enum
{
DATA_ALIGNMENT = 256,
WG_SIZE = 64,
BLOCK_SIZE = 256,
ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE / WG_SIZE),
BITS_PER_PASS = 4,
NUM_BUCKET = (1 << BITS_PER_PASS),
// if you change this, change nPerWI in kernel as well
NUM_WGS = 20 * 6, // cypress
// NUM_WGS = 24*6, // cayman
// NUM_WGS = 32*4, // nv
};
private:
public:
b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity = 0);
virtual ~b3RadixSort32CL();
//NOTE(review): this overload is an empty stub in the .cpp -- see there.
void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
///keys only
void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32);
void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
//Host-side reference implementations (useful for validation/debugging).
void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
};
#endif //B3_RADIXSORT32_H

View file

@ -1,106 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
typedef struct
{
u32 m_key;
u32 m_value;
}SortData;
typedef struct
{
u32 m_nSrc;
u32 m_nDst;
u32 m_padding[2];
} ConstBuffer;
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
//For sorted src, writes into dst the lower-bound index for each key value
//that appears: at every boundary where the key changes between consecutive
//elements, dst[newKey] = index of the first element with that key.
//Entries of dst for keys that never appear are left untouched.
void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst,
unsigned int nSrc, unsigned int nDst)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < nSrc )
{
//Sentinels: virtual element before index 0 and after index nSrc-1.
SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);
SortData end; end.m_key = nDst; end.m_value = nDst;
SortData iData = (gIdx==0)? first: src[gIdx-1];
SortData jData = (gIdx==nSrc)? end: src[gIdx];
if( iData.m_key != jData.m_key )
{
// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)
u32 k = jData.m_key;
{
dst[k] = gIdx;
}
}
}
}
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
//Counterpart of SearchSortDataLowerKernel: writes upper-bound indices.
//At every key boundary, dst[oldKey] = one-past-the-last index of that key.
void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst,
unsigned int nSrc, unsigned int nDst)
{
//Shift by one so each work item compares element gIdx-1 with element gIdx.
int gIdx = GET_GLOBAL_IDX+1;
if( gIdx < nSrc+1 )
{
SortData first; first.m_key = 0; first.m_value = 0;
SortData end; end.m_key = nDst; end.m_value = nDst;
SortData iData = src[gIdx-1];
SortData jData = (gIdx==nSrc)? end: src[gIdx];
if( iData.m_key != jData.m_key )
{
u32 k = iData.m_key;
{
dst[k] = gIdx;
}
}
}
}
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
//Element-wise C[i] = A[i] - B[i] for i in [0, nDst). Typically used to turn
//upper-bound minus lower-bound into per-key counts. nSrc is unused here.
void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C,
unsigned int nSrc, unsigned int nDst)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < nDst )
{
C[gIdx] = A[gIdx] - B[gIdx];
}
}

View file

@ -1,86 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
//Embedded OpenCL source for the bound-search kernels. Do not edit this
//string by hand: regenerate it from boundSearchKernels.cl so the two copies
//stay in sync.
static const char* boundSearchKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"typedef struct\n"
"{\n"
"	u32 m_key; \n"
"	u32 m_value;\n"
"}SortData;\n"
"typedef struct\n"
"{\n"
"	u32 m_nSrc;\n"
"	u32 m_nDst;\n"
"	u32 m_padding[2];\n"
"} ConstBuffer;\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
"					unsigned int nSrc, unsigned int nDst)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	if( gIdx < nSrc )\n"
"	{\n"
"		SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
"		SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
"		if( iData.m_key != jData.m_key )\n"
"		{\n"
"//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
"			u32 k = jData.m_key;\n"
"			{\n"
"				dst[k] = gIdx;\n"
"			}\n"
"		}\n"
"	}\n"
"}\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
"					unsigned int nSrc, unsigned int nDst)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX+1;\n"
"	if( gIdx < nSrc+1 )\n"
"	{\n"
"		SortData first; first.m_key = 0; first.m_value = 0;\n"
"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
"		SortData iData = src[gIdx-1];\n"
"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
"		if( iData.m_key != jData.m_key )\n"
"		{\n"
"			u32 k = iData.m_key;\n"
"			{\n"
"				dst[k] = gIdx;\n"
"			}\n"
"		}\n"
"	}\n"
"}\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
"					unsigned int nSrc, unsigned int nDst)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	\n"
"	if( gIdx < nDst )\n"
"	{\n"
"		C[gIdx] = A[gIdx] - B[gIdx];\n"
"	}\n"
"}\n";

View file

@ -1,128 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define make_uint4 (uint4)
#define make_uint2 (uint2)
#define make_int2 (int2)
typedef struct
{
int m_n;
int m_padding[3];
} ConstBuffer;
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Copies cb.m_n float4 elements from src to dst, one element per work item.
void Copy1F4Kernel(__global float4* dst, __global float4* src,
ConstBuffer cb)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < cb.m_n )
{
float4 a0 = src[gIdx];
dst[ gIdx ] = a0;
}
}
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Copies float4 elements two at a time (each work item handles indices
//2*gIdx and 2*gIdx+1).
//NOTE(review): the guard uses `<=`, so when 2*gIdx == cb.m_n this reads and
//writes indices m_n and m_n+1 -- past m_n elements. Presumably callers size
//the buffers with slack for this; confirm before tightening the bound.
void Copy2F4Kernel(__global float4* dst, __global float4* src,
ConstBuffer cb)
{
int gIdx = GET_GLOBAL_IDX;
if( 2*gIdx <= cb.m_n )
{
float4 a0 = src[gIdx*2+0];
float4 a1 = src[gIdx*2+1];
dst[ gIdx*2+0 ] = a0;
dst[ gIdx*2+1 ] = a1;
}
}
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Copies float4 elements four at a time (each work item handles indices
//4*gIdx .. 4*gIdx+3).
//NOTE(review): same `<=` boundary as Copy2F4Kernel -- when 4*gIdx == cb.m_n
//this accesses up to index m_n+3; verify callers allocate the slack.
void Copy4F4Kernel(__global float4* dst, __global float4* src,
ConstBuffer cb)
{
int gIdx = GET_GLOBAL_IDX;
if( 4*gIdx <= cb.m_n )
{
int idx0 = gIdx*4+0;
int idx1 = gIdx*4+1;
int idx2 = gIdx*4+2;
int idx3 = gIdx*4+3;
float4 a0 = src[idx0];
float4 a1 = src[idx1];
float4 a2 = src[idx2];
float4 a3 = src[idx3];
dst[ idx0 ] = a0;
dst[ idx1 ] = a1;
dst[ idx2 ] = a2;
dst[ idx3 ] = a3;
}
}
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Copies cb.m_n scalar floats from srcF1 to dstF1, one per work item.
void CopyF1Kernel(__global float* dstF1, __global float* srcF1,
ConstBuffer cb)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < cb.m_n )
{
float a0 = srcF1[gIdx];
dstF1[ gIdx ] = a0;
}
}
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Copies cb.m_n float2 elements from srcF2 to dstF2, one per work item.
void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2,
ConstBuffer cb)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < cb.m_n )
{
float2 a0 = srcF2[gIdx];
dstF2[ gIdx ] = a0;
}
}

View file

@ -1,131 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
//Embedded OpenCL source for the buffer-copy kernels. Do not edit this string
//by hand: regenerate it from copyKernels.cl so the two copies stay in sync.
static const char* copyKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"typedef struct\n"
"{\n"
"	int m_n;\n"
"	int m_padding[3];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < cb.m_n )\n"
"	{\n"
"		float4 a0 = src[gIdx];\n"
"\n"
"		dst[ gIdx ] = a0;\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( 2*gIdx <= cb.m_n )\n"
"	{\n"
"		float4 a0 = src[gIdx*2+0];\n"
"		float4 a1 = src[gIdx*2+1];\n"
"\n"
"		dst[ gIdx*2+0 ] = a0;\n"
"		dst[ gIdx*2+1 ] = a1;\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( 4*gIdx <= cb.m_n )\n"
"	{\n"
"		int idx0 = gIdx*4+0;\n"
"		int idx1 = gIdx*4+1;\n"
"		int idx2 = gIdx*4+2;\n"
"		int idx3 = gIdx*4+3;\n"
"\n"
"		float4 a0 = src[idx0];\n"
"		float4 a1 = src[idx1];\n"
"		float4 a2 = src[idx2];\n"
"		float4 a3 = src[idx3];\n"
"\n"
"		dst[ idx0 ] = a0;\n"
"		dst[ idx1 ] = a1;\n"
"		dst[ idx2 ] = a2;\n"
"		dst[ idx3 ] = a3;\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < cb.m_n )\n"
"	{\n"
"		float a0 = srcF1[gIdx];\n"
"\n"
"		dstF1[ gIdx ] = a0;\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < cb.m_n )\n"
"	{\n"
"		float2 a0 = srcF2[gIdx];\n"
"\n"
"		dstF2[ gIdx ] = a0;\n"
"	}\n"
"}\n"
"\n"
"\n";

View file

@ -1,107 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define make_uint4 (uint4)
#define make_uint2 (uint2)
#define make_int2 (int2)
typedef struct
{
union
{
int4 m_data;
uint4 m_unsignedData;
float m_floatData;
};
int m_offset;
int m_n;
int m_padding[2];
} ConstBuffer;
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Writes `value` into dstInt[offset .. offset+num_elements-1].
void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < num_elements )
{
dstInt[ offset+gIdx ] = value;
}
}
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Writes `value` into dstFloat[offset .. offset+num_elements-1].
void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < num_elements )
{
dstFloat[ offset+gIdx ] = value;
}
}
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Writes `value` into dstInt[offset .. offset+num-1] (unsigned variant).
void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < num )
{
dstInt[ offset+gIdx ] = value;
}
}
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Writes the int2 `value` into dstInt2[offset .. offset+num-1].
void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < num )
{
dstInt2[ gIdx + offset] = make_int2( value.x, value.y );
}
}
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Writes the int4 `value` into dstInt4[offset .. offset+num-1].
void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < num )
{
dstInt4[ offset+gIdx ] = value;
}
}

View file

@ -1,90 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
//Embedded OpenCL source for the buffer-fill kernels. Do not edit this string
//by hand: regenerate it from fillKernels.cl so the two copies stay in sync.
static const char* fillKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"typedef struct\n"
"{\n"
"	union\n"
"	{\n"
"		int4 m_data;\n"
"		uint4 m_unsignedData;\n"
"		float m_floatData;\n"
"	};\n"
"	int m_offset;\n"
"	int m_n;\n"
"	int m_padding[2];\n"
"} ConstBuffer;\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	if( gIdx < num_elements )\n"
"	{\n"
"		dstInt[ offset+gIdx ] = value;\n"
"	}\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	if( gIdx < num_elements )\n"
"	{\n"
"		dstFloat[ offset+gIdx ] = value;\n"
"	}\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	if( gIdx < num )\n"
"	{\n"
"		dstInt[ offset+gIdx ] = value;\n"
"	}\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	if( gIdx < num )\n"
"	{\n"
"		dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n"
"	}\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	if( gIdx < num )\n"
"	{\n"
"		dstInt4[ offset+gIdx ] = value;\n"
"	}\n"
"}\n";

View file

@ -1,154 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
// Shorthand aliases for OpenCL work-item / work-group queries (style kept from
// Takahiro Harada's original codebase).
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
// takahiro end
// Required work-group size for every kernel in this file.
#define WG_SIZE 128
// The constant buffer is passed as a packed uint4 named 'cb'; these macros name
// its components (layout documented by the commented-out ConstBuffer below).
#define m_numElems x
#define m_numBlocks y
#define m_numScanBlocks z
/*typedef struct
{
uint m_numElems;
uint m_numBlocks;
uint m_numScanBlocks;
uint m_padding[1];
} ConstBuffer;
*/
// Work-efficient (Blelloch-style) exclusive prefix scan of n float4 elements in
// local memory, performed cooperatively by the whole work group: an up-sweep
// (reduce) phase followed by a down-sweep phase that turns the reduction tree
// into exclusive prefix sums in place.
// Returns the total sum of the input; note it is only assigned on the work-item
// with lIdx == 0 — other lanes return an uninitialized value, so callers must
// only use the result on lane 0 (see LocalScanKernel below).
float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)
{
float4 blocksum;
int offset = 1;
// Up-sweep: build partial sums in a binary tree, doubling the stride per pass.
for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)
{
GROUP_LDS_BARRIER;
for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)
{
int ai = offset*(2*iIdx+1)-1;
int bi = offset*(2*iIdx+2)-1;
data[bi] += data[ai];
}
}
GROUP_LDS_BARRIER;
// Lane 0 captures the grand total, then clears the root to seed the down-sweep.
if( lIdx == 0 )
{
blocksum = data[ n-1 ];
data[ n-1 ] = 0;
}
GROUP_LDS_BARRIER;
offset >>= 1;
// Down-sweep: propagate partial sums back down, halving the stride per pass.
for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )
{
GROUP_LDS_BARRIER;
for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )
{
int ai = offset*(2*iIdx+1)-1;
int bi = offset*(2*iIdx+2)-1;
float4 temp = data[ai];
data[ai] = data[bi];
data[bi] += temp;
}
}
GROUP_LDS_BARRIER;
return blocksum;
}
// Stage 1 of the multi-block scan: each work group stages 2*WG_SIZE float4
// elements into local memory (zero-padding reads past cb.m_numElems), scans
// them with ScanExclusiveFloat4, and writes the per-block exclusive scan to
// dst; the block's total sum is written to sumBuffer[group] for the top-level
// scan (stage 2).
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb)
{
__local float4 ldsData[WG_SIZE*2];
int gIdx = GET_GLOBAL_IDX;
int lIdx = GET_LOCAL_IDX;
// Each work item loads two consecutive elements; out-of-range slots read 0.
ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;
ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;
float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);
// ScanExclusiveFloat4 returns the block total on lane 0 only.
if( lIdx == 0 )
sumBuffer[GET_GROUP_IDX] = sum;
// Write back only the in-range scanned elements.
if( (2*gIdx) < cb.m_numElems )
{
dst[2*gIdx] = ldsData[2*lIdx];
}
if( (2*gIdx + 1) < cb.m_numElems )
{
dst[2*gIdx + 1] = ldsData[2*lIdx + 1];
}
}
// Stage 3 of the multi-block scan: adds the scanned per-block offset
// (blockSum[group]) to every element of the corresponding block in dst.
// Block 0 needs no offset, hence myIdx starts at GET_GROUP_IDX+1.
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)
{
const u32 blockSize = WG_SIZE*2;
int myIdx = GET_GROUP_IDX+1;
int lIdx = GET_LOCAL_IDX;
float4 iBlockSum = blockSum[myIdx];
// Clamp the block's end to the element count for the final, partial block.
int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);
for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)
{
dst[i] += iBlockSum;
}
}
// Stage 2 of the multi-block scan: a single work group scans the per-block sums
// produced by LocalScanKernel, in place in dst. Up to 2048 entries are staged
// in local memory, zero-padded from m_numBlocks up to m_numScanBlocks (the
// padded power-of-two scan size), and the grand total is appended at
// dst[m_numBlocks].
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void TopLevelScanKernel(__global float4* dst, uint4 cb)
{
__local float4 ldsData[2048];
int gIdx = GET_GLOBAL_IDX;
int lIdx = GET_LOCAL_IDX;
int lSize = GET_GROUP_SIZE;
for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )
{
ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;
}
GROUP_LDS_BARRIER;
float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);
// Write back the scanned block offsets in place.
for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )
{
dst[i] = ldsData[i];
}
// One work item appends the total sum just past the block-offset array.
if( gIdx == 0 )
{
dst[cb.m_numBlocks] = sum;
}
}

View file

@ -1,154 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
// Shorthand aliases for OpenCL work-item / work-group queries (style kept from
// Takahiro Harada's original codebase).
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
// takahiro end
// Required work-group size for every kernel in this file.
#define WG_SIZE 128
// The constant buffer is passed as a packed uint4 named 'cb'; these macros name
// its components (layout documented by the commented-out ConstBuffer below).
#define m_numElems x
#define m_numBlocks y
#define m_numScanBlocks z
/*typedef struct
{
uint m_numElems;
uint m_numBlocks;
uint m_numScanBlocks;
uint m_padding[1];
} ConstBuffer;
*/
// Work-efficient (Blelloch-style) exclusive prefix scan of n u32 elements in
// local memory, performed cooperatively by the whole work group: an up-sweep
// (reduce) phase followed by a down-sweep phase that turns the reduction tree
// into exclusive prefix sums in place.
// Returns the total sum of the input; note it is only assigned on the work-item
// with lIdx == 0 — other lanes return an uninitialized value, so callers must
// only use the result on lane 0 (see LocalScanKernel below).
u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)
{
u32 blocksum;
int offset = 1;
// Up-sweep: build partial sums in a binary tree, doubling the stride per pass.
for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)
{
GROUP_LDS_BARRIER;
for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)
{
int ai = offset*(2*iIdx+1)-1;
int bi = offset*(2*iIdx+2)-1;
data[bi] += data[ai];
}
}
GROUP_LDS_BARRIER;
// Lane 0 captures the grand total, then clears the root to seed the down-sweep.
if( lIdx == 0 )
{
blocksum = data[ n-1 ];
data[ n-1 ] = 0;
}
GROUP_LDS_BARRIER;
offset >>= 1;
// Down-sweep: propagate partial sums back down, halving the stride per pass.
for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )
{
GROUP_LDS_BARRIER;
for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )
{
int ai = offset*(2*iIdx+1)-1;
int bi = offset*(2*iIdx+2)-1;
u32 temp = data[ai];
data[ai] = data[bi];
data[bi] += temp;
}
}
GROUP_LDS_BARRIER;
return blocksum;
}
// Stage 1 of the multi-block scan: each work group stages 2*WG_SIZE u32
// elements into local memory (zero-padding reads past cb.m_numElems), scans
// them with ScanExclusive, and writes the per-block exclusive scan to dst; the
// block's total sum is written to sumBuffer[group] for the top-level scan
// (stage 2).
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,
uint4 cb)
{
__local u32 ldsData[WG_SIZE*2];
int gIdx = GET_GLOBAL_IDX;
int lIdx = GET_LOCAL_IDX;
// Each work item loads two consecutive elements; out-of-range slots read 0.
ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;
ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;
u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);
// ScanExclusive returns the block total on lane 0 only.
if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;
// Write back only the in-range scanned elements.
if( (2*gIdx) < cb.m_numElems )
{
dst[2*gIdx] = ldsData[2*lIdx];
}
if( (2*gIdx + 1) < cb.m_numElems )
{
dst[2*gIdx + 1] = ldsData[2*lIdx + 1];
}
}
// Stage 3 of the multi-block scan: adds the scanned per-block offset
// (blockSum[group]) to every element of the corresponding block in dst.
// Block 0 needs no offset, hence myIdx starts at GET_GROUP_IDX+1.
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)
{
const u32 blockSize = WG_SIZE*2;
int myIdx = GET_GROUP_IDX+1;
int lIdx = GET_LOCAL_IDX;
u32 iBlockSum = blockSum[myIdx];
// Clamp the block's end to the element count for the final, partial block.
int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);
for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)
{
dst[i] += iBlockSum;
}
}
// Stage 2 of the multi-block scan: a single work group scans the per-block sums
// produced by LocalScanKernel, in place in dst. Up to 2048 entries are staged
// in local memory, zero-padded from m_numBlocks up to m_numScanBlocks (the
// padded power-of-two scan size), and the grand total is appended at
// dst[m_numBlocks].
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void TopLevelScanKernel(__global u32* dst, uint4 cb)
{
__local u32 ldsData[2048];
int gIdx = GET_GLOBAL_IDX;
int lIdx = GET_LOCAL_IDX;
int lSize = GET_GROUP_SIZE;
for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )
{
ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;
}
GROUP_LDS_BARRIER;
u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);
// Write back the scanned block offsets in place.
for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )
{
dst[i] = ldsData[i];
}
// One work item appends the total sum just past the block-offset array.
if( gIdx == 0 )
{
dst[cb.m_numBlocks] = sum;
}
}

View file

@ -1,128 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Embedded OpenCL source for the u32 exclusive prefix-scan kernels
// (ScanExclusive helper plus the LocalScanKernel / AddOffsetKernel /
// TopLevelScanKernel entry points). Do not edit the string by hand — it must
// stay byte-identical to PrefixScanKernels.cl; regenerate with premake --stringify.
static const char* prefixScanKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"// takahiro end\n"
"#define WG_SIZE 128 \n"
"#define m_numElems x\n"
"#define m_numBlocks y\n"
"#define m_numScanBlocks z\n"
"/*typedef struct\n"
"{\n"
" uint m_numElems;\n"
" uint m_numBlocks;\n"
" uint m_numScanBlocks;\n"
" uint m_padding[1];\n"
"} ConstBuffer;\n"
"*/\n"
"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
"{\n"
" u32 blocksum;\n"
" int offset = 1;\n"
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" data[bi] += data[ai];\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 )\n"
" {\n"
" blocksum = data[ n-1 ];\n"
" data[ n-1 ] = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" offset >>= 1;\n"
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" u32 temp = data[ai];\n"
" data[ai] = data[bi];\n"
" data[bi] += temp;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" return blocksum;\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
" uint4 cb)\n"
"{\n"
" __local u32 ldsData[WG_SIZE*2];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
" u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
" if( (2*gIdx) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx] = ldsData[2*lIdx];\n"
" }\n"
" if( (2*gIdx + 1) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n"
"{\n"
" const u32 blockSize = WG_SIZE*2;\n"
" int myIdx = GET_GROUP_IDX+1;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" u32 iBlockSum = blockSum[myIdx];\n"
" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
" {\n"
" dst[i] += iBlockSum;\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n"
"{\n"
" __local u32 ldsData[2048];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int lSize = GET_GROUP_SIZE;\n"
" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
" {\n"
" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
" {\n"
" dst[i] = ldsData[i];\n"
" }\n"
" if( gIdx == 0 )\n"
" {\n"
" dst[cb.m_numBlocks] = sum;\n"
" }\n"
"}\n";

View file

@ -1,128 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Embedded OpenCL source for the float4 exclusive prefix-scan kernels
// (ScanExclusiveFloat4 helper plus the LocalScanKernel / AddOffsetKernel /
// TopLevelScanKernel entry points). Do not edit the string by hand — it must
// stay byte-identical to PrefixScanFloat4.cl; regenerate with premake --stringify.
static const char* prefixScanKernelsFloat4CL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"// takahiro end\n"
"#define WG_SIZE 128 \n"
"#define m_numElems x\n"
"#define m_numBlocks y\n"
"#define m_numScanBlocks z\n"
"/*typedef struct\n"
"{\n"
" uint m_numElems;\n"
" uint m_numBlocks;\n"
" uint m_numScanBlocks;\n"
" uint m_padding[1];\n"
"} ConstBuffer;\n"
"*/\n"
"float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n"
"{\n"
" float4 blocksum;\n"
" int offset = 1;\n"
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" data[bi] += data[ai];\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 )\n"
" {\n"
" blocksum = data[ n-1 ];\n"
" data[ n-1 ] = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" offset >>= 1;\n"
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" float4 temp = data[ai];\n"
" data[ai] = data[bi];\n"
" data[bi] += temp;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" return blocksum;\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb)\n"
"{\n"
" __local float4 ldsData[WG_SIZE*2];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
" float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" if( lIdx == 0 ) \n"
" sumBuffer[GET_GROUP_IDX] = sum;\n"
" if( (2*gIdx) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx] = ldsData[2*lIdx];\n"
" }\n"
" if( (2*gIdx + 1) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n"
"{\n"
" const u32 blockSize = WG_SIZE*2;\n"
" int myIdx = GET_GROUP_IDX+1;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" float4 iBlockSum = blockSum[myIdx];\n"
" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
" {\n"
" dst[i] += iBlockSum;\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global float4* dst, uint4 cb)\n"
"{\n"
" __local float4 ldsData[2048];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int lSize = GET_GROUP_SIZE;\n"
" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
" {\n"
" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
" {\n"
" dst[i] = ldsData[i];\n"
" }\n"
" if( gIdx == 0 )\n"
" {\n"
" dst[cb.m_numBlocks] = sum;\n"
" }\n"
"}\n";

View file

@ -1,909 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* radixSort32KernelsCL =
"/*\n"
"Bullet Continuous Collision Detection and Physics Library\n"
"Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Author Takahiro Harada\n"
"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"#define WG_SIZE 64\n"
"#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE)\n"
"#define BITS_PER_PASS 4\n"
"#define NUM_BUCKET (1<<BITS_PER_PASS)\n"
"typedef uchar u8;\n"
"// this isn't optimization for VLIW. But just reducing writes. \n"
"#define USE_2LEVEL_REDUCE 1\n"
"//#define CHECK_BOUNDARY 1\n"
"//#define NV_GPU 1\n"
"// Cypress\n"
"#define nPerWI 16\n"
"// Cayman\n"
"//#define nPerWI 20\n"
"#define m_n x\n"
"#define m_nWGs y\n"
"#define m_startBit z\n"
"#define m_nBlocksPerWG w\n"
"/*\n"
"typedef struct\n"
"{\n"
" int m_n;\n"
" int m_nWGs;\n"
" int m_startBit;\n"
" int m_nBlocksPerWG;\n"
"} ConstBuffer;\n"
"*/\n"
"typedef struct\n"
"{\n"
" unsigned int m_key;\n"
" unsigned int m_value;\n"
"} SortDataCL;\n"
"uint prefixScanVectorEx( uint4* data )\n"
"{\n"
" u32 sum = 0;\n"
" u32 tmp = data[0].x;\n"
" data[0].x = sum;\n"
" sum += tmp;\n"
" tmp = data[0].y;\n"
" data[0].y = sum;\n"
" sum += tmp;\n"
" tmp = data[0].z;\n"
" data[0].z = sum;\n"
" sum += tmp;\n"
" tmp = data[0].w;\n"
" data[0].w = sum;\n"
" sum += tmp;\n"
" return sum;\n"
"}\n"
"u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ )\n"
"{\n"
" { // Set data\n"
" sorterSharedMemory[lIdx] = 0;\n"
" sorterSharedMemory[lIdx+wgSize] = pData;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" { // Prefix sum\n"
" int idx = 2*lIdx + (wgSize+1);\n"
"#if defined(USE_2LEVEL_REDUCE)\n"
" if( lIdx < 64 )\n"
" {\n"
" u32 u0, u1, u2;\n"
" u0 = sorterSharedMemory[idx-3];\n"
" u1 = sorterSharedMemory[idx-2];\n"
" u2 = sorterSharedMemory[idx-1];\n"
" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n"
" GROUP_MEM_FENCE;\n"
" u0 = sorterSharedMemory[idx-12];\n"
" u1 = sorterSharedMemory[idx-8];\n"
" u2 = sorterSharedMemory[idx-4];\n"
" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n"
" GROUP_MEM_FENCE;\n"
" u0 = sorterSharedMemory[idx-48];\n"
" u1 = sorterSharedMemory[idx-32];\n"
" u2 = sorterSharedMemory[idx-16];\n"
" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n"
" GROUP_MEM_FENCE;\n"
" if( wgSize > 64 )\n"
" {\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
" GROUP_MEM_FENCE;\n"
" }\n"
" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
" GROUP_MEM_FENCE;\n"
" }\n"
"#else\n"
" if( lIdx < 64 )\n"
" {\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-32];\n"
" GROUP_MEM_FENCE;\n"
" if( wgSize > 64 )\n"
" {\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
" GROUP_MEM_FENCE;\n"
" }\n"
" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
" GROUP_MEM_FENCE;\n"
" }\n"
"#endif\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" *totalSum = sorterSharedMemory[wgSize*2-1];\n"
" u32 addValue = sorterSharedMemory[lIdx+wgSize-1];\n"
" return addValue;\n"
"}\n"
"//__attribute__((reqd_work_group_size(128,1,1)))\n"
"uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n"
"{\n"
" u32 s4 = prefixScanVectorEx( &pData );\n"
" u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 );\n"
" return pData + make_uint4( rank, rank, rank, rank );\n"
"}\n"
"//__attribute__((reqd_work_group_size(64,1,1)))\n"
"uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n"
"{\n"
" u32 s4 = prefixScanVectorEx( &pData );\n"
" u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 );\n"
" return pData + make_uint4( rank, rank, rank, rank );\n"
"}\n"
"u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;}\n"
"u32 bit8Scan(u32 v)\n"
"{\n"
" return (v<<8) + (v<<16) + (v<<24);\n"
"}\n"
"//===\n"
"#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )\n"
"{\n"
" __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n"
" u32 gIdx = GET_GLOBAL_IDX;\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 wgSize = GET_GROUP_SIZE;\n"
" const int startBit = cb.m_startBit;\n"
" const int n = cb.m_n;\n"
" const int nWGs = cb.m_nWGs;\n"
" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
" for(int i=0; i<NUM_BUCKET; i++)\n"
" {\n"
" MY_HISTOGRAM(i) = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
" u32 localKey;\n"
" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
" {\n"
" // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n"
" // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops\n"
" // AMD: AtomInc performs better while NV prefers ++\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
" {\n"
"#if defined(CHECK_BOUNDARY)\n"
" if( addr+i < n )\n"
"#endif\n"
" {\n"
" localKey = (gSrc[addr+i]>>startBit) & 0xf;\n"
"#if defined(NV_GPU)\n"
" MY_HISTOGRAM( localKey )++;\n"
"#else\n"
" AtomInc( MY_HISTOGRAM( localKey ) );\n"
"#endif\n"
" }\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" u32 sum = 0;\n"
" for(int i=0; i<GET_GROUP_SIZE; i++)\n"
" {\n"
" sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n"
" }\n"
" histogramOut[lIdx*nWGs+wgIdx] = sum;\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4 cb )\n"
"{\n"
" __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n"
" u32 gIdx = GET_GLOBAL_IDX;\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 wgSize = GET_GROUP_SIZE;\n"
" const int startBit = cb.m_startBit;\n"
" const int n = cb.m_n;\n"
" const int nWGs = cb.m_nWGs;\n"
" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
" for(int i=0; i<NUM_BUCKET; i++)\n"
" {\n"
" MY_HISTOGRAM(i) = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
" u32 localKey;\n"
" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
" {\n"
" // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n"
" // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops\n"
" // AMD: AtomInc performs better while NV prefers ++\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
" {\n"
"#if defined(CHECK_BOUNDARY)\n"
" if( addr+i < n )\n"
"#endif\n"
" {\n"
" localKey = (gSrc[addr+i].m_key>>startBit) & 0xf;\n"
"#if defined(NV_GPU)\n"
" MY_HISTOGRAM( localKey )++;\n"
"#else\n"
" AtomInc( MY_HISTOGRAM( localKey ) );\n"
"#endif\n"
" }\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" u32 sum = 0;\n"
" for(int i=0; i<GET_GROUP_SIZE; i++)\n"
" {\n"
" sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n"
" }\n"
" histogramOut[lIdx*nWGs+wgIdx] = sum;\n"
" }\n"
"}\n"
"#define nPerLane (nPerWI/4)\n"
"// NUM_BUCKET*nWGs < 128*nPerWI\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(128,1,1)))\n"
"void PrefixScanKernel( __global u32* wHistogram1, int4 cb )\n"
"{\n"
" __local u32 ldsTopScanData[128*2];\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" const int nWGs = cb.m_nWGs;\n"
" u32 data[nPerWI];\n"
" for(int i=0; i<nPerWI; i++)\n"
" {\n"
" data[i] = 0;\n"
" if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs )\n"
" data[i] = wHistogram1[nPerWI*lIdx+i];\n"
" }\n"
" uint4 myData = make_uint4(0,0,0,0);\n"
" for(int i=0; i<nPerLane; i++)\n"
" {\n"
" myData.x += data[nPerLane*0+i];\n"
" myData.y += data[nPerLane*1+i];\n"
" myData.z += data[nPerLane*2+i];\n"
" myData.w += data[nPerLane*3+i];\n"
" }\n"
" uint totalSum;\n"
" uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData );\n"
"// for(int j=0; j<4; j++) // somehow it introduces a lot of branches\n"
" { int j = 0;\n"
" u32 sum = 0;\n"
" for(int i=0; i<nPerLane; i++)\n"
" {\n"
" u32 tmp = data[nPerLane*j+i];\n"
" data[nPerLane*j+i] = sum;\n"
" sum += tmp;\n"
" }\n"
" }\n"
" { int j = 1;\n"
" u32 sum = 0;\n"
" for(int i=0; i<nPerLane; i++)\n"
" {\n"
" u32 tmp = data[nPerLane*j+i];\n"
" data[nPerLane*j+i] = sum;\n"
" sum += tmp;\n"
" }\n"
" }\n"
" { int j = 2;\n"
" u32 sum = 0;\n"
" for(int i=0; i<nPerLane; i++)\n"
" {\n"
" u32 tmp = data[nPerLane*j+i];\n"
" data[nPerLane*j+i] = sum;\n"
" sum += tmp;\n"
" }\n"
" }\n"
" { int j = 3;\n"
" u32 sum = 0;\n"
" for(int i=0; i<nPerLane; i++)\n"
" {\n"
" u32 tmp = data[nPerLane*j+i];\n"
" data[nPerLane*j+i] = sum;\n"
" sum += tmp;\n"
" }\n"
" }\n"
" for(int i=0; i<nPerLane; i++)\n"
" {\n"
" data[nPerLane*0+i] += scanned.x;\n"
" data[nPerLane*1+i] += scanned.y;\n"
" data[nPerLane*2+i] += scanned.z;\n"
" data[nPerLane*3+i] += scanned.w;\n"
" }\n"
" for(int i=0; i<nPerWI; i++)\n"
" {\n"
" int index = nPerWI*lIdx+i;\n"
" if (index < NUM_BUCKET*nWGs)\n"
" wHistogram1[nPerWI*lIdx+i] = data[i];\n"
" }\n"
"}\n"
"// 4 scan, 4 exchange\n"
"void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n"
"{\n"
" for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++)\n"
" {\n"
" u32 mask = (1<<bitIdx);\n"
" uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask );\n"
" uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n"
" u32 total;\n"
" prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData );\n"
" {\n"
" uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
" uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n"
" dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n"
" GROUP_LDS_BARRIER;\n"
" ldsSortData[dstAddr.x] = sortData[0];\n"
" ldsSortData[dstAddr.y] = sortData[1];\n"
" ldsSortData[dstAddr.z] = sortData[2];\n"
" ldsSortData[dstAddr.w] = sortData[3];\n"
" GROUP_LDS_BARRIER;\n"
" sortData[0] = ldsSortData[localAddr.x];\n"
" sortData[1] = ldsSortData[localAddr.y];\n"
" sortData[2] = ldsSortData[localAddr.z];\n"
" sortData[3] = ldsSortData[localAddr.w];\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" }\n"
"}\n"
"// 2 scan, 2 exchange\n"
"void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n"
"{\n"
" for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n"
" {\n"
" uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n"
" (sortData[1]>>(startBit+ibit)) & 0x3, \n"
" (sortData[2]>>(startBit+ibit)) & 0x3, \n"
" (sortData[3]>>(startBit+ibit)) & 0x3);\n"
" u32 key4;\n"
" u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n"
" {\n"
" sKeyPacked[0] |= 1<<(8*b.x);\n"
" sKeyPacked[1] |= 1<<(8*b.y);\n"
" sKeyPacked[2] |= 1<<(8*b.z);\n"
" sKeyPacked[3] |= 1<<(8*b.w);\n"
" key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n"
" }\n"
" u32 rankPacked;\n"
" u32 sumPacked;\n"
" {\n"
" rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" u32 newOffset[4] = { 0,0,0,0 };\n"
" {\n"
" u32 sumScanned = bit8Scan( sumPacked );\n"
" u32 scannedKeys[4];\n"
" scannedKeys[0] = 1<<(8*b.x);\n"
" scannedKeys[1] = 1<<(8*b.y);\n"
" scannedKeys[2] = 1<<(8*b.z);\n"
" scannedKeys[3] = 1<<(8*b.w);\n"
" { // 4 scans at once\n"
" u32 sum4 = 0;\n"
" for(int ie=0; ie<4; ie++)\n"
" {\n"
" u32 tmp = scannedKeys[ie];\n"
" scannedKeys[ie] = sum4;\n"
" sum4 += tmp;\n"
" }\n"
" }\n"
" {\n"
" u32 sumPlusRank = sumScanned + rankPacked;\n"
" { u32 ie = b.x;\n"
" scannedKeys[0] += sumPlusRank;\n"
" newOffset[0] = unpack4Key( scannedKeys[0], ie );\n"
" }\n"
" { u32 ie = b.y;\n"
" scannedKeys[1] += sumPlusRank;\n"
" newOffset[1] = unpack4Key( scannedKeys[1], ie );\n"
" }\n"
" { u32 ie = b.z;\n"
" scannedKeys[2] += sumPlusRank;\n"
" newOffset[2] = unpack4Key( scannedKeys[2], ie );\n"
" }\n"
" { u32 ie = b.w;\n"
" scannedKeys[3] += sumPlusRank;\n"
" newOffset[3] = unpack4Key( scannedKeys[3], ie );\n"
" }\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" {\n"
" ldsSortData[newOffset[0]] = sortData[0];\n"
" ldsSortData[newOffset[1]] = sortData[1];\n"
" ldsSortData[newOffset[2]] = sortData[2];\n"
" ldsSortData[newOffset[3]] = sortData[3];\n"
" GROUP_LDS_BARRIER;\n"
" u32 dstAddr = 4*lIdx;\n"
" sortData[0] = ldsSortData[dstAddr+0];\n"
" sortData[1] = ldsSortData[dstAddr+1];\n"
" sortData[2] = ldsSortData[dstAddr+2];\n"
" sortData[3] = ldsSortData[dstAddr+3];\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" }\n"
"}\n"
"#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n"
"{\n"
" __local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
" __local u32 localHistogramToCarry[NUM_BUCKET];\n"
" __local u32 localHistogram[NUM_BUCKET*2];\n"
" u32 gIdx = GET_GLOBAL_IDX;\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 wgSize = GET_GROUP_SIZE;\n"
" const int n = cb.m_n;\n"
" const int nWGs = cb.m_nWGs;\n"
" const int startBit = cb.m_startBit;\n"
" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
" if( lIdx < (NUM_BUCKET) )\n"
" {\n"
" localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
" int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n"
" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
" {\n"
" u32 myHistogram = 0;\n"
" u32 sortData[ELEMENTS_PER_WORK_ITEM];\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
"#if defined(CHECK_BOUNDARY)\n"
" sortData[i] = ( addr+i < n )? gSrc[ addr+i ] : 0xffffffff;\n"
"#else\n"
" sortData[i] = gSrc[ addr+i ];\n"
"#endif\n"
" sort4Bits(sortData, startBit, lIdx, ldsSortData);\n"
" u32 keys[ELEMENTS_PER_WORK_ITEM];\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
" keys[i] = (sortData[i]>>startBit) & 0xf;\n"
" { // create histogram\n"
" u32 setIdx = lIdx/16;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogram[lIdx] = 0;\n"
" }\n"
" ldsSortData[lIdx] = 0;\n"
" GROUP_LDS_BARRIER;\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
"#if defined(CHECK_BOUNDARY)\n"
" if( addr+i < n )\n"
"#endif\n"
"#if defined(NV_GPU)\n"
" SET_HISTOGRAM( setIdx, keys[i] )++;\n"
"#else\n"
" AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n"
"#endif\n"
" \n"
" GROUP_LDS_BARRIER;\n"
" \n"
" uint hIdx = NUM_BUCKET+lIdx;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" u32 sum = 0;\n"
" for(int i=0; i<WG_SIZE/16; i++)\n"
" {\n"
" sum += SET_HISTOGRAM( i, lIdx );\n"
" }\n"
" myHistogram = sum;\n"
" localHistogram[hIdx] = sum;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
"#if defined(USE_2LEVEL_REDUCE)\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogram[hIdx] = localHistogram[hIdx-1];\n"
" GROUP_MEM_FENCE;\n"
" u32 u0, u1, u2;\n"
" u0 = localHistogram[hIdx-3];\n"
" u1 = localHistogram[hIdx-2];\n"
" u2 = localHistogram[hIdx-1];\n"
" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
" GROUP_MEM_FENCE;\n"
" u0 = localHistogram[hIdx-12];\n"
" u1 = localHistogram[hIdx-8];\n"
" u2 = localHistogram[hIdx-4];\n"
" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
" GROUP_MEM_FENCE;\n"
" }\n"
"#else\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogram[hIdx] = localHistogram[hIdx-1];\n"
" GROUP_MEM_FENCE;\n"
" localHistogram[hIdx] += localHistogram[hIdx-1];\n"
" GROUP_MEM_FENCE;\n"
" localHistogram[hIdx] += localHistogram[hIdx-2];\n"
" GROUP_MEM_FENCE;\n"
" localHistogram[hIdx] += localHistogram[hIdx-4];\n"
" GROUP_MEM_FENCE;\n"
" localHistogram[hIdx] += localHistogram[hIdx-8];\n"
" GROUP_MEM_FENCE;\n"
" }\n"
"#endif\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" {\n"
" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
" {\n"
" int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n"
" int binIdx = keys[ie];\n"
" int groupOffset = localHistogramToCarry[binIdx];\n"
" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
"#if defined(CHECK_BOUNDARY)\n"
" if( addr+ie < n )\n"
"#endif\n"
" gDst[ groupOffset + myIdx ] = sortData[ie];\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogramToCarry[lIdx] += myHistogram;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
"}\n"
"// 2 scan, 2 exchange\n"
"void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal)\n"
"{\n"
" for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n"
" {\n"
" uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n"
" (sortData[1]>>(startBit+ibit)) & 0x3, \n"
" (sortData[2]>>(startBit+ibit)) & 0x3, \n"
" (sortData[3]>>(startBit+ibit)) & 0x3);\n"
" u32 key4;\n"
" u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n"
" {\n"
" sKeyPacked[0] |= 1<<(8*b.x);\n"
" sKeyPacked[1] |= 1<<(8*b.y);\n"
" sKeyPacked[2] |= 1<<(8*b.z);\n"
" sKeyPacked[3] |= 1<<(8*b.w);\n"
" key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n"
" }\n"
" u32 rankPacked;\n"
" u32 sumPacked;\n"
" {\n"
" rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" u32 newOffset[4] = { 0,0,0,0 };\n"
" {\n"
" u32 sumScanned = bit8Scan( sumPacked );\n"
" u32 scannedKeys[4];\n"
" scannedKeys[0] = 1<<(8*b.x);\n"
" scannedKeys[1] = 1<<(8*b.y);\n"
" scannedKeys[2] = 1<<(8*b.z);\n"
" scannedKeys[3] = 1<<(8*b.w);\n"
" { // 4 scans at once\n"
" u32 sum4 = 0;\n"
" for(int ie=0; ie<4; ie++)\n"
" {\n"
" u32 tmp = scannedKeys[ie];\n"
" scannedKeys[ie] = sum4;\n"
" sum4 += tmp;\n"
" }\n"
" }\n"
" {\n"
" u32 sumPlusRank = sumScanned + rankPacked;\n"
" { u32 ie = b.x;\n"
" scannedKeys[0] += sumPlusRank;\n"
" newOffset[0] = unpack4Key( scannedKeys[0], ie );\n"
" }\n"
" { u32 ie = b.y;\n"
" scannedKeys[1] += sumPlusRank;\n"
" newOffset[1] = unpack4Key( scannedKeys[1], ie );\n"
" }\n"
" { u32 ie = b.z;\n"
" scannedKeys[2] += sumPlusRank;\n"
" newOffset[2] = unpack4Key( scannedKeys[2], ie );\n"
" }\n"
" { u32 ie = b.w;\n"
" scannedKeys[3] += sumPlusRank;\n"
" newOffset[3] = unpack4Key( scannedKeys[3], ie );\n"
" }\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" {\n"
" ldsSortData[newOffset[0]] = sortData[0];\n"
" ldsSortData[newOffset[1]] = sortData[1];\n"
" ldsSortData[newOffset[2]] = sortData[2];\n"
" ldsSortData[newOffset[3]] = sortData[3];\n"
" ldsSortVal[newOffset[0]] = sortVal[0];\n"
" ldsSortVal[newOffset[1]] = sortVal[1];\n"
" ldsSortVal[newOffset[2]] = sortVal[2];\n"
" ldsSortVal[newOffset[3]] = sortVal[3];\n"
" GROUP_LDS_BARRIER;\n"
" u32 dstAddr = 4*lIdx;\n"
" sortData[0] = ldsSortData[dstAddr+0];\n"
" sortData[1] = ldsSortData[dstAddr+1];\n"
" sortData[2] = ldsSortData[dstAddr+2];\n"
" sortData[3] = ldsSortData[dstAddr+3];\n"
" sortVal[0] = ldsSortVal[dstAddr+0];\n"
" sortVal[1] = ldsSortVal[dstAddr+1];\n"
" sortVal[2] = ldsSortVal[dstAddr+2];\n"
" sortVal[3] = ldsSortVal[dstAddr+3];\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n"
"{\n"
" __local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
" __local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
" __local u32 localHistogramToCarry[NUM_BUCKET];\n"
" __local u32 localHistogram[NUM_BUCKET*2];\n"
" u32 gIdx = GET_GLOBAL_IDX;\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 wgSize = GET_GROUP_SIZE;\n"
" const int n = cb.m_n;\n"
" const int nWGs = cb.m_nWGs;\n"
" const int startBit = cb.m_startBit;\n"
" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
" if( lIdx < (NUM_BUCKET) )\n"
" {\n"
" localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
" int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n"
" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
" {\n"
" u32 myHistogram = 0;\n"
" int sortData[ELEMENTS_PER_WORK_ITEM];\n"
" int sortVal[ELEMENTS_PER_WORK_ITEM];\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
"#if defined(CHECK_BOUNDARY)\n"
" {\n"
" sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff;\n"
" sortVal[i] = ( addr+i < n )? gSrc[ addr+i ].m_value : 0xffffffff;\n"
" }\n"
"#else\n"
" {\n"
" sortData[i] = gSrc[ addr+i ].m_key;\n"
" sortVal[i] = gSrc[ addr+i ].m_value;\n"
" }\n"
"#endif\n"
" sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal);\n"
" u32 keys[ELEMENTS_PER_WORK_ITEM];\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
" keys[i] = (sortData[i]>>startBit) & 0xf;\n"
" { // create histogram\n"
" u32 setIdx = lIdx/16;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogram[lIdx] = 0;\n"
" }\n"
" ldsSortData[lIdx] = 0;\n"
" GROUP_LDS_BARRIER;\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
"#if defined(CHECK_BOUNDARY)\n"
" if( addr+i < n )\n"
"#endif\n"
"#if defined(NV_GPU)\n"
" SET_HISTOGRAM( setIdx, keys[i] )++;\n"
"#else\n"
" AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n"
"#endif\n"
" \n"
" GROUP_LDS_BARRIER;\n"
" \n"
" uint hIdx = NUM_BUCKET+lIdx;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" u32 sum = 0;\n"
" for(int i=0; i<WG_SIZE/16; i++)\n"
" {\n"
" sum += SET_HISTOGRAM( i, lIdx );\n"
" }\n"
" myHistogram = sum;\n"
" localHistogram[hIdx] = sum;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
"#if defined(USE_2LEVEL_REDUCE)\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogram[hIdx] = localHistogram[hIdx-1];\n"
" GROUP_MEM_FENCE;\n"
" u32 u0, u1, u2;\n"
" u0 = localHistogram[hIdx-3];\n"
" u1 = localHistogram[hIdx-2];\n"
" u2 = localHistogram[hIdx-1];\n"
" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
" GROUP_MEM_FENCE;\n"
" u0 = localHistogram[hIdx-12];\n"
" u1 = localHistogram[hIdx-8];\n"
" u2 = localHistogram[hIdx-4];\n"
" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
" GROUP_MEM_FENCE;\n"
" }\n"
"#else\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogram[hIdx] = localHistogram[hIdx-1];\n"
" GROUP_MEM_FENCE;\n"
" localHistogram[hIdx] += localHistogram[hIdx-1];\n"
" GROUP_MEM_FENCE;\n"
" localHistogram[hIdx] += localHistogram[hIdx-2];\n"
" GROUP_MEM_FENCE;\n"
" localHistogram[hIdx] += localHistogram[hIdx-4];\n"
" GROUP_MEM_FENCE;\n"
" localHistogram[hIdx] += localHistogram[hIdx-8];\n"
" GROUP_MEM_FENCE;\n"
" }\n"
"#endif\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" {\n"
" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
" {\n"
" int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n"
" int binIdx = keys[ie];\n"
" int groupOffset = localHistogramToCarry[binIdx];\n"
" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
"#if defined(CHECK_BOUNDARY)\n"
" if( addr+ie < n )\n"
" {\n"
" if ((groupOffset + myIdx)<n)\n"
" {\n"
" if (sortData[ie]==sortVal[ie])\n"
" {\n"
" \n"
" SortDataCL tmp;\n"
" tmp.m_key = sortData[ie];\n"
" tmp.m_value = sortVal[ie];\n"
" if (tmp.m_key == tmp.m_value)\n"
" gDst[groupOffset + myIdx ] = tmp;\n"
" }\n"
" \n"
" }\n"
" }\n"
"#else\n"
" if ((groupOffset + myIdx)<n)\n"
" {\n"
" gDst[ groupOffset + myIdx ].m_key = sortData[ie];\n"
" gDst[ groupOffset + myIdx ].m_value = sortVal[ie];\n"
" }\n"
"#endif\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogramToCarry[lIdx] += myHistogram;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n"
"{\n"
" \n"
" u32 gIdx = GET_GLOBAL_IDX;\n"
" u32 realLocalIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 wgSize = GET_GROUP_SIZE;\n"
" const int startBit = cb.m_startBit;\n"
" const int n = cb.m_n;\n"
" const int nWGs = cb.m_nWGs;\n"
" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
" int counter[NUM_BUCKET];\n"
" \n"
" if (realLocalIdx>0)\n"
" return;\n"
" \n"
" for (int c=0;c<NUM_BUCKET;c++)\n"
" counter[c]=0;\n"
" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
" \n"
" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n"
" {\n"
" for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n"
" {\n"
" int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
" \n"
" for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n"
" {\n"
" int i = addr2+j;\n"
" if( i < n )\n"
" {\n"
" int tableIdx;\n"
" tableIdx = (gSrc[i].m_key>>startBit) & 0xf;//0xf = NUM_TABLES-1\n"
" gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n"
" counter[tableIdx] ++;\n"
" }\n"
" }\n"
" }\n"
" }\n"
" \n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n"
"{\n"
" \n"
" u32 gIdx = GET_GLOBAL_IDX;\n"
" u32 realLocalIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 wgSize = GET_GROUP_SIZE;\n"
" const int startBit = cb.m_startBit;\n"
" const int n = cb.m_n;\n"
" const int nWGs = cb.m_nWGs;\n"
" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
" int counter[NUM_BUCKET];\n"
" \n"
" if (realLocalIdx>0)\n"
" return;\n"
" \n"
" for (int c=0;c<NUM_BUCKET;c++)\n"
" counter[c]=0;\n"
" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
" \n"
" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n"
" {\n"
" for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n"
" {\n"
" int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
" \n"
" for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n"
" {\n"
" int i = addr2+j;\n"
" if( i < n )\n"
" {\n"
" int tableIdx;\n"
" tableIdx = (gSrc[i]>>startBit) & 0xf;//0xf = NUM_TABLES-1\n"
" gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n"
" counter[tableIdx] ++;\n"
" }\n"
" }\n"
" }\n"
" }\n"
" \n"
"}\n";

View file

@ -1,374 +0,0 @@
#include "b3GpuRaycast.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h"
#include "Bullet3OpenCL/Raycast/kernels/rayCastKernels.h"
#define B3_RAYCAST_PATH "src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl"
///Internal (pimpl) state for b3GpuRaycast: OpenCL handles, helper modules,
///and the GPU-side buffers used by the broadphase-accelerated raycast path.
struct b3GpuRaycastInternalData
{
// OpenCL context/device/queue supplied by the caller; see the destructor —
// they are released elsewhere, not by this class.
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_q;
// Kernels extracted from rayCastKernels.cl in the constructor.
cl_kernel m_raytraceKernel;
cl_kernel m_raytracePairsKernel;
cl_kernel m_findRayRigidPairIndexRanges;
// Helper modules owned by this struct (created/destroyed with it).
b3GpuParallelLinearBvh* m_plbvh;
b3RadixSort32CL* m_radixSorter;
b3FillCL* m_fill;
//1 element per ray
b3OpenCLArray<b3RayInfo>* m_gpuRays;
b3OpenCLArray<b3RayHit>* m_gpuHitResults;
b3OpenCLArray<int>* m_firstRayRigidPairIndexPerRay;
b3OpenCLArray<int>* m_numRayRigidPairsPerRay;
//1 element per (ray index, rigid index) pair, where the ray intersects with the rigid's AABB
b3OpenCLArray<int>* m_gpuNumRayRigidPairs;
b3OpenCLArray<b3Int2>* m_gpuRayRigidPairs; //x == ray index, y == rigid index
int m_test;
};
///Stores the OpenCL handles, builds the helper modules and empty GPU arrays,
///then compiles the raycast program once and extracts its three kernels.
///The cl_program is released immediately after kernel extraction (the kernels
///keep it alive); each compile step is checked with b3Assert.
b3GpuRaycast::b3GpuRaycast(cl_context ctx, cl_device_id device, cl_command_queue q)
{
m_data = new b3GpuRaycastInternalData;
m_data->m_context = ctx;
m_data->m_device = device;
m_data->m_q = q;
// Kernels start null; they are filled in below after the program compiles.
m_data->m_raytraceKernel = 0;
m_data->m_raytracePairsKernel = 0;
m_data->m_findRayRigidPairIndexRanges = 0;
m_data->m_plbvh = new b3GpuParallelLinearBvh(ctx, device, q);
m_data->m_radixSorter = new b3RadixSort32CL(ctx, device, q);
m_data->m_fill = new b3FillCL(ctx, device, q);
// GPU arrays are created empty; castRays() resizes them per invocation.
m_data->m_gpuRays = new b3OpenCLArray<b3RayInfo>(ctx, q);
m_data->m_gpuHitResults = new b3OpenCLArray<b3RayHit>(ctx, q);
m_data->m_firstRayRigidPairIndexPerRay = new b3OpenCLArray<int>(ctx, q);
m_data->m_numRayRigidPairsPerRay = new b3OpenCLArray<int>(ctx, q);
m_data->m_gpuNumRayRigidPairs = new b3OpenCLArray<int>(ctx, q);
m_data->m_gpuRayRigidPairs = new b3OpenCLArray<b3Int2>(ctx, q);
{
cl_int errNum = 0;
// Compile the embedded kernel source (B3_RAYCAST_PATH is used as a
// fallback/disk reference for the stringified source).
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, &errNum, "", B3_RAYCAST_PATH);
b3Assert(errNum == CL_SUCCESS);
m_data->m_raytraceKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, "rayCastKernel", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
m_data->m_raytracePairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, "rayCastPairsKernel", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
m_data->m_findRayRigidPairIndexRanges = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, "findRayRigidPairIndexRanges", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
// Kernels hold their own reference to the program, so it can go now.
clReleaseProgram(prog);
}
}
///Releases the three OpenCL kernels, then deletes all owned helper modules
///and GPU arrays, and finally the pimpl itself. The context/queue passed to
///the constructor are deliberately NOT released here (not owned).
b3GpuRaycast::~b3GpuRaycast()
{
clReleaseKernel(m_data->m_raytraceKernel);
clReleaseKernel(m_data->m_raytracePairsKernel);
clReleaseKernel(m_data->m_findRayRigidPairIndexRanges);
delete m_data->m_plbvh;
delete m_data->m_radixSorter;
delete m_data->m_fill;
delete m_data->m_gpuRays;
delete m_data->m_gpuHitResults;
delete m_data->m_firstRayRigidPairIndexPerRay;
delete m_data->m_numRayRigidPairsPerRay;
delete m_data->m_gpuNumRayRigidPairs;
delete m_data->m_gpuRayRigidPairs;
delete m_data;
}
bool sphere_intersect(const b3Vector3& spherePos, b3Scalar radius, const b3Vector3& rayFrom, const b3Vector3& rayTo, float& hitFraction)
{
b3Vector3 rs = rayFrom - spherePos;
b3Vector3 rayDir = rayTo - rayFrom;
float A = b3Dot(rayDir, rayDir);
float B = b3Dot(rs, rayDir);
float C = b3Dot(rs, rs) - (radius * radius);
float D = B * B - A * C;
if (D > 0.0)
{
float t = (-B - sqrt(D)) / A;
if ((t >= 0.0f) && (t < hitFraction))
{
hitFraction = t;
return true;
}
}
return false;
}
///Ray vs. convex polyhedron test in the shape's local space, done by clipping
///the segment [rayFromLocal, rayToLocal] against every face plane
///(Kay/Kajiya-style slab clipping generalized to arbitrary planes).
///On a hit, writes the entering fraction into hitFraction and the entering
///face's plane normal into hitNormal, and returns true.
bool rayConvex(const b3Vector3& rayFromLocal, const b3Vector3& rayToLocal, const b3ConvexPolyhedronData& poly,
const b3AlignedObjectArray<b3GpuFace>& faces, float& hitFraction, b3Vector3& hitNormal)
{
// exitFraction shrinks toward the closest exit; enterFraction grows toward
// the farthest entry. A valid hit requires enter < exit at the end.
float exitFraction = hitFraction;
float enterFraction = -0.1f;
b3Vector3 curHitNormal = b3MakeVector3(0, 0, 0);
for (int i = 0; i < poly.m_numFaces; i++)
{
const b3GpuFace& face = faces[poly.m_faceOffset + i];
// Signed distances of both segment endpoints to this face plane.
float fromPlaneDist = b3Dot(rayFromLocal, face.m_plane) + face.m_plane.w;
float toPlaneDist = b3Dot(rayToLocal, face.m_plane) + face.m_plane.w;
if (fromPlaneDist < 0.f)
{
// Segment starts inside this half-space; if it ends outside, it exits here.
if (toPlaneDist >= 0.f)
{
float fraction = fromPlaneDist / (fromPlaneDist - toPlaneDist);
if (exitFraction > fraction)
{
exitFraction = fraction;
}
}
}
else
{
// Segment starts outside this half-space.
if (toPlaneDist < 0.f)
{
// Entering through this face; keep the latest entry and its normal.
float fraction = fromPlaneDist / (fromPlaneDist - toPlaneDist);
if (enterFraction <= fraction)
{
enterFraction = fraction;
curHitNormal = face.m_plane;
curHitNormal.w = 0.f;
}
}
else
{
// Entirely outside one half-space -> cannot intersect the convex.
return false;
}
}
// Entry after exit means the clipped segment is empty.
if (exitFraction <= enterFraction)
return false;
}
// Starting inside the convex (never entered) does not count as a hit.
if (enterFraction < 0.f)
return false;
hitFraction = enterFraction;
hitNormal = curHitNormal;
return true;
}
///CPU reference implementation: brute-force test of every ray against every
///body (spheres and convex hulls). For each ray, any hit closer than the
///incoming hitResults[r].m_hitFraction updates that entry's fraction, point,
///normal and body index; rays with no closer hit are left untouched.
///
///@param rays            input rays (from/to endpoints)
///@param hitResults      in/out hit records, one per ray
///@param bodies          rigid body array indexed by body index
///@param collidables     collidable array indexed by body's m_collidableIdx
///@param narrowphaseData provides convex polyhedra and face data for hull tests
void b3GpuRaycast::castRaysHost(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
	int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, const struct b3GpuNarrowPhaseInternalData* narrowphaseData)
{
	B3_PROFILE("castRaysHost");
	for (int r = 0; r < rays.size(); r++)
	{
		b3Vector3 rayFrom = rays[r].m_from;
		b3Vector3 rayTo = rays[r].m_to;
		float hitFraction = hitResults[r].m_hitFraction;
		int hitBodyIndex = -1;
		b3Vector3 hitNormal;
		for (int b = 0; b < numBodies; b++)
		{
			const b3Vector3& pos = bodies[b].m_pos;
			//const b3Quaternion& orn = bodies[b].m_quat;
			switch (collidables[bodies[b].m_collidableIdx].m_shapeType)
			{
				case SHAPE_SPHERE:
				{
					b3Scalar radius = collidables[bodies[b].m_collidableIdx].m_radius;
					if (sphere_intersect(pos, radius, rayFrom, rayTo, hitFraction))
					{
						hitBodyIndex = b;
						b3Vector3 hitPoint;
						hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to, hitFraction);
						hitNormal = (hitPoint - bodies[b].m_pos).normalize();
					}
					// BUGFIX: this case previously fell through into
					// SHAPE_CONVEX_HULL, which then indexed
					// narrowphaseData->m_convexPolyhedra with a sphere's
					// m_shapeIndex.
					break;
				}
				case SHAPE_CONVEX_HULL:
				{
					// Transform the ray into the hull's local space before clipping.
					b3Transform convexWorldTransform;
					convexWorldTransform.setIdentity();
					convexWorldTransform.setOrigin(bodies[b].m_pos);
					convexWorldTransform.setRotation(bodies[b].m_quat);
					b3Transform convexWorld2Local = convexWorldTransform.inverse();
					b3Vector3 rayFromLocal = convexWorld2Local(rayFrom);
					b3Vector3 rayToLocal = convexWorld2Local(rayTo);
					int shapeIndex = collidables[bodies[b].m_collidableIdx].m_shapeIndex;
					const b3ConvexPolyhedronData& poly = narrowphaseData->m_convexPolyhedra[shapeIndex];
					if (rayConvex(rayFromLocal, rayToLocal, poly, narrowphaseData->m_convexFaces, hitFraction, hitNormal))
					{
						hitBodyIndex = b;
					}
					break;
				}
				default:
				{
					// Warn once (not per body/ray) about unsupported shapes.
					static bool once = true;
					if (once)
					{
						once = false;
						b3Warning("Raytest: unsupported shape type\n");
					}
				}
			}
		}
		if (hitBodyIndex >= 0)
		{
			hitResults[r].m_hitFraction = hitFraction;
			hitResults[r].m_hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to, hitFraction);
			hitResults[r].m_hitNormal = hitNormal;
			hitResults[r].m_hitBody = hitBodyIndex;
		}
	}
}
///GPU raycast pipeline. Two paths: a brute-force kernel (rays x bodies), and
///the default BVH-accelerated path that (1) builds a parallel linear BVH over
///the broadphase AABBs, (2) collects candidate (ray, rigid) AABB pairs,
///(3) sorts pairs by ray index, (4) computes each ray's pair range, and
///(5) runs exact ray-vs-shape intersection only on those candidate pairs.
void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase)
{
//castRaysHost(rays,hitResults,numBodies,bodies,numCollidables,collidables,narrowphaseData);
B3_PROFILE("castRaysGPU");
{
B3_PROFILE("raycast copyFromHost");
m_data->m_gpuRays->copyFromHost(rays);
m_data->m_gpuHitResults->copyFromHost(hitResults);
}
int numRays = hitResults.size();
{
m_data->m_firstRayRigidPairIndexPerRay->resize(numRays);
m_data->m_numRayRigidPairsPerRay->resize(numRays);
m_data->m_gpuNumRayRigidPairs->resize(1);
// Capacity heuristic: up to 16 candidate rigid AABBs per ray; overflow is
// clamped below.
m_data->m_gpuRayRigidPairs->resize(numRays * 16);
}
//run kernel
const bool USE_BRUTE_FORCE_RAYCAST = false;
if (USE_BRUTE_FORCE_RAYCAST)
{
// O(numRays * numBodies): every work item tests its ray against all bodies.
B3_PROFILE("raycast launch1D");
b3LauncherCL launcher(m_data->m_q, m_data->m_raytraceKernel, "m_raytraceKernel");
int numRays = rays.size();
launcher.setConst(numRays);
launcher.setBuffer(m_data->m_gpuRays->getBufferCL());
launcher.setBuffer(m_data->m_gpuHitResults->getBufferCL());
launcher.setConst(numBodies);
launcher.setBuffer(narrowphaseData->m_bodyBufferGPU->getBufferCL());
launcher.setBuffer(narrowphaseData->m_collidablesGPU->getBufferCL());
launcher.setBuffer(narrowphaseData->m_convexFacesGPU->getBufferCL());
launcher.setBuffer(narrowphaseData->m_convexPolyhedraGPU->getBufferCL());
launcher.launch1D(numRays);
clFinish(m_data->m_q);
}
else
{
// Build the BVH from the broadphase AABBs and gather candidate pairs.
m_data->m_plbvh->build(broadphase->getAllAabbsGPU(), broadphase->getSmallAabbIndicesGPU(), broadphase->getLargeAabbIndicesGPU());
m_data->m_plbvh->testRaysAgainstBvhAabbs(*m_data->m_gpuRays, *m_data->m_gpuNumRayRigidPairs, *m_data->m_gpuRayRigidPairs);
int numRayRigidPairs = -1;
m_data->m_gpuNumRayRigidPairs->copyToHostPointer(&numRayRigidPairs, 1);
// Clamp overflow if more pairs were produced than the buffer can hold.
if (numRayRigidPairs > m_data->m_gpuRayRigidPairs->size())
{
numRayRigidPairs = m_data->m_gpuRayRigidPairs->size();
m_data->m_gpuNumRayRigidPairs->copyFromHostPointer(&numRayRigidPairs, 1);
}
m_data->m_gpuRayRigidPairs->resize(numRayRigidPairs); //Radix sort needs b3OpenCLArray::size() to be correct
//Sort ray-rigid pairs by ray index
{
B3_PROFILE("sort ray-rigid pairs");
// b3Int2 pairs are reinterpreted as b3SortData (key = ray index x,
// value = rigid index y) for the radix sort.
m_data->m_radixSorter->execute(*reinterpret_cast<b3OpenCLArray<b3SortData>*>(m_data->m_gpuRayRigidPairs));
}
//detect start,count of each ray pair
{
B3_PROFILE("detect ray-rigid pair index ranges");
{
B3_PROFILE("reset ray-rigid pair index ranges");
m_data->m_fill->execute(*m_data->m_firstRayRigidPairIndexPerRay, numRayRigidPairs, numRays); //atomic_min used to find first index
m_data->m_fill->execute(*m_data->m_numRayRigidPairsPerRay, 0, numRays);
clFinish(m_data->m_q);
}
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_data->m_gpuRayRigidPairs->getBufferCL()),
b3BufferInfoCL(m_data->m_firstRayRigidPairIndexPerRay->getBufferCL()),
b3BufferInfoCL(m_data->m_numRayRigidPairsPerRay->getBufferCL())};
b3LauncherCL launcher(m_data->m_q, m_data->m_findRayRigidPairIndexRanges, "m_findRayRigidPairIndexRanges");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numRayRigidPairs);
launcher.launch1D(numRayRigidPairs);
clFinish(m_data->m_q);
}
{
// Exact narrow-phase intersection on the candidate pairs only.
B3_PROFILE("ray-rigid intersection");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_data->m_gpuRays->getBufferCL()),
b3BufferInfoCL(m_data->m_gpuHitResults->getBufferCL()),
b3BufferInfoCL(m_data->m_firstRayRigidPairIndexPerRay->getBufferCL()),
b3BufferInfoCL(m_data->m_numRayRigidPairsPerRay->getBufferCL()),
b3BufferInfoCL(narrowphaseData->m_bodyBufferGPU->getBufferCL()),
b3BufferInfoCL(narrowphaseData->m_collidablesGPU->getBufferCL()),
b3BufferInfoCL(narrowphaseData->m_convexFacesGPU->getBufferCL()),
b3BufferInfoCL(narrowphaseData->m_convexPolyhedraGPU->getBufferCL()),
b3BufferInfoCL(m_data->m_gpuRayRigidPairs->getBufferCL())};
b3LauncherCL launcher(m_data->m_q, m_data->m_raytracePairsKernel, "m_raytracePairsKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numRays);
launcher.launch1D(numRays);
clFinish(m_data->m_q);
}
}
//copy results
{
B3_PROFILE("raycast copyToHost");
m_data->m_gpuHitResults->copyToHost(hitResults);
}
}

View file

@ -1,28 +0,0 @@
#ifndef B3_GPU_RAYCAST_H
#define B3_GPU_RAYCAST_H
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h"
///OpenCL-accelerated raycaster against rigid bodies (spheres and convex
///hulls). Offers a CPU reference path (castRaysHost) and a GPU path
///(castRays) that uses the broadphase AABBs for acceleration.
class b3GpuRaycast
{
protected:
// Pimpl: all OpenCL handles and GPU buffers live behind this pointer.
struct b3GpuRaycastInternalData* m_data;
public:
// Does not take ownership of the OpenCL context/device/queue.
b3GpuRaycast(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~b3GpuRaycast();
///CPU brute-force reference implementation; updates hitResults in place.
void castRaysHost(const b3AlignedObjectArray<b3RayInfo>& raysIn, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData);
///GPU path; the broadphase supplies the AABBs used to cull candidates.
void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase);
};
#endif //B3_GPU_RAYCAST_H

View file

@ -1,439 +0,0 @@
// Shape type codes — must stay numerically identical to the host-side
// collidable shape enum used when filling the Collidable buffer.
#define SHAPE_CONVEX_HULL 3
#define SHAPE_PLANE 4
#define SHAPE_CONCAVE_TRIMESH 5
#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6
#define SHAPE_SPHERE 7
// Device-side mirrors of the host structs. Field order/size must stay
// bit-compatible with the host layouts (the buffers are copied verbatim).
// Ray segment endpoints.
typedef struct
{
float4 m_from;
float4 m_to;
} b3RayInfo;
// Per-ray hit record. m_hitResult0 receives the hit body index;
// m_hitResult2 is read as a body index to exclude from testing.
typedef struct
{
float m_hitFraction;
int m_hitResult0;
int m_hitResult1;
int m_hitResult2;
float4 m_hitPoint;
float4 m_hitNormal;
} b3RayHit;
// Rigid body state (mirrors the host rigid body data).
typedef struct
{
float4 m_pos;
float4 m_quat;
float4 m_linVel;
float4 m_angVel;
unsigned int m_collidableIdx;
float m_invMass;
float m_restituitionCoeff;
float m_frictionCoeff;
} Body;
// Collision shape descriptor; m_shapeIndex selects into the
// ConvexPolyhedronCL array for SHAPE_CONVEX_HULL, m_radius is used for
// SHAPE_SPHERE.
typedef struct Collidable
{
union {
int m_numChildShapes;
int m_bvhIndex;
};
float m_radius;
int m_shapeType;
int m_shapeIndex;
} Collidable;
// Convex hull geometry header; m_faceOffset/m_numFaces index the shared
// b3GpuFace array.
typedef struct
{
float4 m_localCenter;
float4 m_extents;
float4 mC;
float4 mE;
float m_radius;
int m_faceOffset;
int m_numFaces;
int m_numVertices;
int m_vertexOffset;
int m_uniqueEdgesOffset;
int m_numUniqueEdges;
int m_unused;
} ConvexPolyhedronCL;
// Face plane (xyz = normal, w = plane constant) plus index range.
typedef struct
{
float4 m_plane;
int m_indexOffset;
int m_numIndices;
} b3GpuFace;
///////////////////////////////////////
// Quaternion
///////////////////////////////////////
// Quaternions are stored as float4: xyz = vector part, w = scalar part.
typedef float4 Quaternion;
__inline
Quaternion qtMul(Quaternion a, Quaternion b);
__inline
Quaternion qtNormalize(Quaternion in);
__inline
Quaternion qtInvert(Quaternion q);
__inline
float dot3F4(float4 a, float4 b)
{
	// 3-component dot product: the w components are explicitly zeroed out
	// so they cannot contribute to the result.
	float4 lhs = (float4)(a.xyz, 0.f);
	float4 rhs = (float4)(b.xyz, 0.f);
	return dot(lhs, rhs);
}
__inline
Quaternion qtMul(Quaternion a, Quaternion b)
{
	// Hamilton product: vector part = a x b + w_a*vec(b) + w_b*vec(a),
	// scalar part = w_a*w_b - dot(vec(a), vec(b)).
	Quaternion result = cross(a, b);
	result += a.w * b + b.w * a;
	result.w = a.w * b.w - dot3F4(a, b);
	return result;
}
__inline
Quaternion qtNormalize(Quaternion in)
{
	// Unit-length quaternion via the fast (reduced-precision) hardware
	// normalize built-in.
	return fast_normalize(in);
}
__inline
float4 qtRotate(Quaternion q, float4 vec)
{
	// Rotate vec by q using the sandwich product q * v * q^-1.
	// The w component is forced to zero so vec is treated as a pure vector.
	float4 v = vec;
	v.w = 0.f;
	float4 rotated = qtMul(qtMul(q, v), qtInvert(q));
	return rotated;
}
__inline
Quaternion qtInvert(Quaternion q)
{
	// Conjugate (negated vector part) — equals the inverse for unit quaternions.
	Quaternion conjugate;
	conjugate.xyz = -q.xyz;
	conjugate.w = q.w;
	return conjugate;
}
__inline
float4 qtInvRotate(const Quaternion q, float4 vec)
{
	// Apply the opposite rotation: rotate vec by the inverse of q.
	return qtRotate(qtInvert(q), vec);
}
// Invert a rigid transform (translation t, orientation q):
// q' = q^-1 and t' = q' * (-t).
void trInverse(float4 translationIn, Quaternion orientationIn,
	float4* translationOut, Quaternion* orientationOut)
{
	Quaternion invOrn = qtInvert(orientationIn);
	*orientationOut = invOrn;
	*translationOut = qtRotate(invOrn, -translationIn);
}
// Ray vs. convex polyhedron test in the shape's local space: clips the
// segment [rayFromLocal, rayToLocal] against each face plane, tracking the
// farthest entering fraction and closest exiting fraction. On a hit, writes
// the entering fraction into hitFraction[0] and the entering face's plane
// normal into hitNormal[0], and returns true.
bool rayConvex(float4 rayFromLocal, float4 rayToLocal, int numFaces, int faceOffset,
__global const b3GpuFace* faces, float* hitFraction, float4* hitNormal)
{
rayFromLocal.w = 0.f;
rayToLocal.w = 0.f;
bool result = true;
// enter/exit fractions of the clipped segment; valid hit needs enter < exit.
float exitFraction = hitFraction[0];
float enterFraction = -0.3f;
float4 curHitNormal = (float4)(0,0,0,0);
for (int i=0;i<numFaces && result;i++)
{
b3GpuFace face = faces[faceOffset+i];
// Signed distances of both segment endpoints to this face plane.
float fromPlaneDist = dot(rayFromLocal,face.m_plane)+face.m_plane.w;
float toPlaneDist = dot(rayToLocal,face.m_plane)+face.m_plane.w;
if (fromPlaneDist<0.f)
{
// Starts inside this half-space; if it ends outside, the ray exits here.
if (toPlaneDist >= 0.f)
{
float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);
if (exitFraction>fraction)
{
exitFraction = fraction;
}
}
} else
{
if (toPlaneDist<0.f)
{
// Entering through this face; keep the latest entry and its normal.
float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);
if (enterFraction <= fraction)
{
enterFraction = fraction;
curHitNormal = face.m_plane;
curHitNormal.w = 0.f;
}
} else
{
// Entirely outside one half-space: no intersection possible.
result = false;
}
}
// Entry after exit means the clipped segment is empty.
if (exitFraction <= enterFraction)
result = false;
}
// Starting inside the convex (never entered) does not count as a hit.
if (enterFraction < 0.f)
{
result = false;
}
if (result)
{
hitFraction[0] = enterFraction;
hitNormal[0] = curHitNormal;
}
return result;
}
// Ray/sphere test: solves |(rayFrom - spherePos) + t*(rayTo - rayFrom)|^2
// = radius^2; if the nearer root lies in [0, *hitFraction), stores it in
// *hitFraction and returns true.
bool sphere_intersect(float4 spherePos, float radius, float4 rayFrom, float4 rayTo, float* hitFraction)
{
	float4 relPos = rayFrom - spherePos;
	relPos.w = 0.f;
	float4 segDir = rayTo - rayFrom;
	segDir.w = 0.f;
	// Quadratic coefficients (half-coefficient form for B).
	float a = dot(segDir, segDir);
	float halfB = dot(relPos, segDir);
	float c = dot(relPos, relPos) - (radius * radius);
	float disc = halfB * halfB - a * c;
	if (!(disc > 0.0f))
		return false;
	// Nearer root along the segment parameterization.
	float t = (-halfB - sqrt(disc)) / a;
	if ((t < 0.0f) || (t >= (*hitFraction)))
		return false;
	*hitFraction = t;
	return true;
}
// Linear interpolation between 'from' and 'to' at parameter t; the w
// component of the result is zeroed.
float4 setInterpolate3(float4 from, float4 to, float t)
{
	float4 lerped = (1.0f - t) * from + t * to;
	lerped.w = 0.f;
	return lerped;
}
// Brute-force raycast: one work item per ray, testing against every body.
// Writes the closest hit (fraction, point, normalized normal, body index)
// into hitResults[i]; m_hitResult2 names a body index to skip (e.g. the
// ray's owner). m_hitFraction is reset to 1 even when nothing is hit.
__kernel void rayCastKernel(
	int numRays,
	const __global b3RayInfo* rays,
	__global b3RayHit* hitResults,
	const int numBodies,
	__global Body* bodies,
	__global Collidable* collidables,
	__global const b3GpuFace* faces,
	__global const ConvexPolyhedronCL* convexShapes )
{
	int i = get_global_id(0);
	if (i>=numRays)
		return;
	hitResults[i].m_hitFraction = 1.f;
	float4 rayFrom = rays[i].m_from;
	float4 rayTo = rays[i].m_to;
	float hitFraction = 1.f;
	float4 hitPoint;
	float4 hitNormal;
	int hitBodyIndex= -1;
	// Cache the last collidable looked up, so consecutive bodies sharing a
	// collidable avoid a redundant global load.
	int cachedCollidableIndex = -1;
	Collidable cachedCollidable;
	for (int b=0;b<numBodies;b++)
	{
		// Caller may exclude one body via m_hitResult2.
		if (hitResults[i].m_hitResult2==b)
			continue;
		Body body = bodies[b];
		float4 pos = body.m_pos;
		float4 orn = body.m_quat;
		if (cachedCollidableIndex != body.m_collidableIdx)
		{
			cachedCollidableIndex = body.m_collidableIdx;
			cachedCollidable = collidables[cachedCollidableIndex];
		}
		if (cachedCollidable.m_shapeType == SHAPE_CONVEX_HULL)
		{
			// Transform the ray into the hull's local space.
			float4 invPos = (float4)(0,0,0,0);
			float4 invOrn = (float4)(0,0,0,0);
			float4 rayFromLocal = (float4)(0,0,0,0);
			float4 rayToLocal = (float4)(0,0,0,0);
			invOrn = qtInvert(orn);
			invPos = qtRotate(invOrn, -pos);
			rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;
			rayToLocal = qtRotate( invOrn, rayTo) + invPos;
			rayFromLocal.w = 0.f;
			rayToLocal.w = 0.f;
			int numFaces = convexShapes[cachedCollidable.m_shapeIndex].m_numFaces;
			int faceOffset = convexShapes[cachedCollidable.m_shapeIndex].m_faceOffset;
			if (numFaces)
			{
				// rayConvex writes hitFraction/hitNormal on a closer hit.
				if (rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))
				{
					hitBodyIndex = b;
				}
			}
		}
		if (cachedCollidable.m_shapeType == SHAPE_SPHERE)
		{
			float radius = cachedCollidable.m_radius;
			if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))
			{
				hitBodyIndex = b;
				// BUGFIX: compute the hit point before deriving the sphere
				// normal; previously hitPoint was read here while still
				// uninitialized (compare rayCastPairsKernel, which computes
				// it first).
				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);
				hitNormal = (float4) (hitPoint-bodies[b].m_pos);
			}
		}
	}
	if (hitBodyIndex>=0)
	{
		// Recompute the hit point from the final (closest) fraction.
		hitPoint = setInterpolate3(rayFrom, rayTo,hitFraction);
		hitResults[i].m_hitFraction = hitFraction;
		hitResults[i].m_hitPoint = hitPoint;
		hitResults[i].m_hitNormal = normalize(hitNormal);
		hitResults[i].m_hitResult0 = hitBodyIndex;
	}
}
// Given (ray, rigid) pairs sorted by ray index, computes each ray's
// contiguous range in the pair array: the first pair index (via atomic_min,
// with the buffer pre-filled to numRayRigidPairs by the host) and the pair
// count (via atomic_inc, pre-filled to 0). One work item per pair.
__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs,
											__global int* out_firstRayRigidPairIndexPerRay,
											__global int* out_numRayRigidPairsPerRay,
											int numRayRigidPairs)
{
	int pairIdx = get_global_id(0);
	if (pairIdx >= numRayRigidPairs)
		return;

	int rayIdx = rayRigidPairs[pairIdx].x;

	atomic_min(&out_firstRayRigidPairIndexPerRay[rayIdx], pairIdx);
	atomic_inc(&out_numRayRigidPairsPerRay[rayIdx]);
}
// BVH-accelerated raycast: one work item per ray, but each ray only tests
// the candidate bodies listed in its range of the sorted (ray, rigid) pair
// array (computed by findRayRigidPairIndexRanges). Writes the closest hit
// into hitResults[i]; m_hitResult2 names a body index to skip.
__kernel void rayCastPairsKernel(const __global b3RayInfo* rays,
__global b3RayHit* hitResults,
__global int* firstRayRigidPairIndexPerRay,
__global int* numRayRigidPairsPerRay,
__global Body* bodies,
__global Collidable* collidables,
__global const b3GpuFace* faces,
__global const ConvexPolyhedronCL* convexShapes,
__global int2* rayRigidPairs,
int numRays)
{
int i = get_global_id(0);
if (i >= numRays) return;
float4 rayFrom = rays[i].m_from;
float4 rayTo = rays[i].m_to;
hitResults[i].m_hitFraction = 1.f;
float hitFraction = 1.f;
float4 hitPoint;
float4 hitNormal;
int hitBodyIndex = -1;
// Walk this ray's candidate pairs only.
for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)
{
int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];
// y component of the pair is the rigid body index.
int b = rayRigidPairs[rayRigidPairIndex].y;
// Caller may exclude one body via m_hitResult2.
if (hitResults[i].m_hitResult2 == b) continue;
Body body = bodies[b];
Collidable rigidCollidable = collidables[body.m_collidableIdx];
float4 pos = body.m_pos;
float4 orn = body.m_quat;
if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)
{
// Transform the ray into the hull's local space before clipping.
float4 invPos = (float4)(0,0,0,0);
float4 invOrn = (float4)(0,0,0,0);
float4 rayFromLocal = (float4)(0,0,0,0);
float4 rayToLocal = (float4)(0,0,0,0);
invOrn = qtInvert(orn);
invPos = qtRotate(invOrn, -pos);
rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;
rayToLocal = qtRotate( invOrn, rayTo) + invPos;
rayFromLocal.w = 0.f;
rayToLocal.w = 0.f;
int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;
int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;
if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))
{
hitBodyIndex = b;
hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);
}
}
if (rigidCollidable.m_shapeType == SHAPE_SPHERE)
{
float radius = rigidCollidable.m_radius;
if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))
{
hitBodyIndex = b;
hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);
hitNormal = (float4) (hitPoint - bodies[b].m_pos);
}
}
}
if (hitBodyIndex >= 0)
{
hitResults[i].m_hitFraction = hitFraction;
hitResults[i].m_hitPoint = hitPoint;
hitResults[i].m_hitNormal = normalize(hitNormal);
hitResults[i].m_hitResult0 = hitBodyIndex;
}
}

View file

@ -1,380 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Embedded OpenCL source for the GPU ray-cast kernels (rayCastKernel,
// findRayRigidPairIndexRanges, rayCastPairsKernel) plus the shared types and
// quaternion/intersection helpers they use. Do NOT edit the string by hand —
// regenerate it from the .cl source via premake --stringify, otherwise this
// copy and the standalone kernel file will diverge.
static const char* rayCastKernelCL =
"#define SHAPE_CONVEX_HULL 3\n"
"#define SHAPE_PLANE 4\n"
"#define SHAPE_CONCAVE_TRIMESH 5\n"
"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
"#define SHAPE_SPHERE 7\n"
"typedef struct\n"
"{\n"
" float4 m_from;\n"
" float4 m_to;\n"
"} b3RayInfo;\n"
"typedef struct\n"
"{\n"
" float m_hitFraction;\n"
" int m_hitResult0;\n"
" int m_hitResult1;\n"
" int m_hitResult2;\n"
" float4 m_hitPoint;\n"
" float4 m_hitNormal;\n"
"} b3RayHit;\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" float4 m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
" unsigned int m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} Body;\n"
"typedef struct Collidable\n"
"{\n"
" union {\n"
" int m_numChildShapes;\n"
" int m_bvhIndex;\n"
" };\n"
" float m_radius;\n"
" int m_shapeType;\n"
" int m_shapeIndex;\n"
"} Collidable;\n"
"typedef struct \n"
"{\n"
" float4 m_localCenter;\n"
" float4 m_extents;\n"
" float4 mC;\n"
" float4 mE;\n"
" float m_radius;\n"
" int m_faceOffset;\n"
" int m_numFaces;\n"
" int m_numVertices;\n"
" int m_vertexOffset;\n"
" int m_uniqueEdgesOffset;\n"
" int m_numUniqueEdges;\n"
" int m_unused;\n"
"} ConvexPolyhedronCL;\n"
"typedef struct\n"
"{\n"
" float4 m_plane;\n"
" int m_indexOffset;\n"
" int m_numIndices;\n"
"} b3GpuFace;\n"
"///////////////////////////////////////\n"
"// Quaternion\n"
"///////////////////////////////////////\n"
"typedef float4 Quaternion;\n"
"__inline\n"
" Quaternion qtMul(Quaternion a, Quaternion b);\n"
"__inline\n"
" Quaternion qtNormalize(Quaternion in);\n"
"__inline\n"
" Quaternion qtInvert(Quaternion q);\n"
"__inline\n"
" float dot3F4(float4 a, float4 b)\n"
"{\n"
" float4 a1 = (float4)(a.xyz,0.f);\n"
" float4 b1 = (float4)(b.xyz,0.f);\n"
" return dot(a1, b1);\n"
"}\n"
"__inline\n"
" Quaternion qtMul(Quaternion a, Quaternion b)\n"
"{\n"
" Quaternion ans;\n"
" ans = cross( a, b );\n"
" ans += a.w*b+b.w*a;\n"
" // ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
" ans.w = a.w*b.w - dot3F4(a, b);\n"
" return ans;\n"
"}\n"
"__inline\n"
" Quaternion qtNormalize(Quaternion in)\n"
"{\n"
" return fast_normalize(in);\n"
" // in /= length( in );\n"
" // return in;\n"
"}\n"
"__inline\n"
" float4 qtRotate(Quaternion q, float4 vec)\n"
"{\n"
" Quaternion qInv = qtInvert( q );\n"
" float4 vcpy = vec;\n"
" vcpy.w = 0.f;\n"
" float4 out = qtMul(q,vcpy);\n"
" out = qtMul(out,qInv);\n"
" return out;\n"
"}\n"
"__inline\n"
" Quaternion qtInvert(Quaternion q)\n"
"{\n"
" return (Quaternion)(-q.xyz, q.w);\n"
"}\n"
"__inline\n"
" float4 qtInvRotate(const Quaternion q, float4 vec)\n"
"{\n"
" return qtRotate( qtInvert( q ), vec );\n"
"}\n"
"void trInverse(float4 translationIn, Quaternion orientationIn,\n"
" float4* translationOut, Quaternion* orientationOut)\n"
"{\n"
" *orientationOut = qtInvert(orientationIn);\n"
" *translationOut = qtRotate(*orientationOut, -translationIn);\n"
"}\n"
"bool rayConvex(float4 rayFromLocal, float4 rayToLocal, int numFaces, int faceOffset,\n"
" __global const b3GpuFace* faces, float* hitFraction, float4* hitNormal)\n"
"{\n"
" rayFromLocal.w = 0.f;\n"
" rayToLocal.w = 0.f;\n"
" bool result = true;\n"
" float exitFraction = hitFraction[0];\n"
" float enterFraction = -0.3f;\n"
" float4 curHitNormal = (float4)(0,0,0,0);\n"
" for (int i=0;i<numFaces && result;i++)\n"
" {\n"
" b3GpuFace face = faces[faceOffset+i];\n"
" float fromPlaneDist = dot(rayFromLocal,face.m_plane)+face.m_plane.w;\n"
" float toPlaneDist = dot(rayToLocal,face.m_plane)+face.m_plane.w;\n"
" if (fromPlaneDist<0.f)\n"
" {\n"
" if (toPlaneDist >= 0.f)\n"
" {\n"
" float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n"
" if (exitFraction>fraction)\n"
" {\n"
" exitFraction = fraction;\n"
" }\n"
" } \n"
" } else\n"
" {\n"
" if (toPlaneDist<0.f)\n"
" {\n"
" float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n"
" if (enterFraction <= fraction)\n"
" {\n"
" enterFraction = fraction;\n"
" curHitNormal = face.m_plane;\n"
" curHitNormal.w = 0.f;\n"
" }\n"
" } else\n"
" {\n"
" result = false;\n"
" }\n"
" }\n"
" if (exitFraction <= enterFraction)\n"
" result = false;\n"
" }\n"
" if (enterFraction < 0.f)\n"
" {\n"
" result = false;\n"
" }\n"
" if (result)\n"
" { \n"
" hitFraction[0] = enterFraction;\n"
" hitNormal[0] = curHitNormal;\n"
" }\n"
" return result;\n"
"}\n"
"bool sphere_intersect(float4 spherePos, float radius, float4 rayFrom, float4 rayTo, float* hitFraction)\n"
"{\n"
" float4 rs = rayFrom - spherePos;\n"
" rs.w = 0.f;\n"
" float4 rayDir = rayTo-rayFrom;\n"
" rayDir.w = 0.f;\n"
" float A = dot(rayDir,rayDir);\n"
" float B = dot(rs, rayDir);\n"
" float C = dot(rs, rs) - (radius * radius);\n"
" float D = B * B - A*C;\n"
" if (D > 0.0f)\n"
" {\n"
" float t = (-B - sqrt(D))/A;\n"
" if ( (t >= 0.0f) && (t < (*hitFraction)) )\n"
" {\n"
" *hitFraction = t;\n"
" return true;\n"
" }\n"
" }\n"
" return false;\n"
"}\n"
"float4 setInterpolate3(float4 from, float4 to, float t)\n"
"{\n"
" float s = 1.0f - t;\n"
" float4 result;\n"
" result = s * from + t * to;\n"
" result.w = 0.f; \n"
" return result; \n"
"}\n"
"__kernel void rayCastKernel( \n"
" int numRays, \n"
" const __global b3RayInfo* rays, \n"
" __global b3RayHit* hitResults, \n"
" const int numBodies, \n"
" __global Body* bodies,\n"
" __global Collidable* collidables,\n"
" __global const b3GpuFace* faces,\n"
" __global const ConvexPolyhedronCL* convexShapes )\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numRays)\n"
" return;\n"
" hitResults[i].m_hitFraction = 1.f;\n"
" float4 rayFrom = rays[i].m_from;\n"
" float4 rayTo = rays[i].m_to;\n"
" float hitFraction = 1.f;\n"
" float4 hitPoint;\n"
" float4 hitNormal;\n"
" int hitBodyIndex= -1;\n"
" int cachedCollidableIndex = -1;\n"
" Collidable cachedCollidable;\n"
" for (int b=0;b<numBodies;b++)\n"
" {\n"
" if (hitResults[i].m_hitResult2==b)\n"
" continue;\n"
" Body body = bodies[b];\n"
" float4 pos = body.m_pos;\n"
" float4 orn = body.m_quat;\n"
" if (cachedCollidableIndex != body.m_collidableIdx)\n"
" {\n"
" cachedCollidableIndex = body.m_collidableIdx;\n"
" cachedCollidable = collidables[cachedCollidableIndex];\n"
" }\n"
" if (cachedCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
" {\n"
" float4 invPos = (float4)(0,0,0,0);\n"
" float4 invOrn = (float4)(0,0,0,0);\n"
" float4 rayFromLocal = (float4)(0,0,0,0);\n"
" float4 rayToLocal = (float4)(0,0,0,0);\n"
" invOrn = qtInvert(orn);\n"
" invPos = qtRotate(invOrn, -pos);\n"
" rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n"
" rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n"
" rayFromLocal.w = 0.f;\n"
" rayToLocal.w = 0.f;\n"
" int numFaces = convexShapes[cachedCollidable.m_shapeIndex].m_numFaces;\n"
" int faceOffset = convexShapes[cachedCollidable.m_shapeIndex].m_faceOffset;\n"
" if (numFaces)\n"
" {\n"
" if (rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n"
" {\n"
" hitBodyIndex = b;\n"
" \n"
" }\n"
" }\n"
" }\n"
" if (cachedCollidable.m_shapeType == SHAPE_SPHERE)\n"
" {\n"
" float radius = cachedCollidable.m_radius;\n"
" \n"
" if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n"
" {\n"
" hitBodyIndex = b;\n"
" hitNormal = (float4) (hitPoint-bodies[b].m_pos);\n"
" }\n"
" }\n"
" }\n"
" if (hitBodyIndex>=0)\n"
" {\n"
" hitPoint = setInterpolate3(rayFrom, rayTo,hitFraction);\n"
" hitResults[i].m_hitFraction = hitFraction;\n"
" hitResults[i].m_hitPoint = hitPoint;\n"
" hitResults[i].m_hitNormal = normalize(hitNormal);\n"
" hitResults[i].m_hitResult0 = hitBodyIndex;\n"
" }\n"
"}\n"
"__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, \n"
" __global int* out_firstRayRigidPairIndexPerRay,\n"
" __global int* out_numRayRigidPairsPerRay,\n"
" int numRayRigidPairs)\n"
"{\n"
" int rayRigidPairIndex = get_global_id(0);\n"
" if (rayRigidPairIndex >= numRayRigidPairs) return;\n"
" \n"
" int rayIndex = rayRigidPairs[rayRigidPairIndex].x;\n"
" \n"
" atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex);\n"
" atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]);\n"
"}\n"
"__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, \n"
" __global b3RayHit* hitResults, \n"
" __global int* firstRayRigidPairIndexPerRay,\n"
" __global int* numRayRigidPairsPerRay,\n"
" \n"
" __global Body* bodies,\n"
" __global Collidable* collidables,\n"
" __global const b3GpuFace* faces,\n"
" __global const ConvexPolyhedronCL* convexShapes,\n"
" \n"
" __global int2* rayRigidPairs,\n"
" int numRays)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i >= numRays) return;\n"
" \n"
" float4 rayFrom = rays[i].m_from;\n"
" float4 rayTo = rays[i].m_to;\n"
" \n"
" hitResults[i].m_hitFraction = 1.f;\n"
" \n"
" float hitFraction = 1.f;\n"
" float4 hitPoint;\n"
" float4 hitNormal;\n"
" int hitBodyIndex = -1;\n"
" \n"
" //\n"
" for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)\n"
" {\n"
" int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];\n"
" int b = rayRigidPairs[rayRigidPairIndex].y;\n"
" \n"
" if (hitResults[i].m_hitResult2 == b) continue;\n"
" \n"
" Body body = bodies[b];\n"
" Collidable rigidCollidable = collidables[body.m_collidableIdx];\n"
" \n"
" float4 pos = body.m_pos;\n"
" float4 orn = body.m_quat;\n"
" \n"
" if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
" {\n"
" float4 invPos = (float4)(0,0,0,0);\n"
" float4 invOrn = (float4)(0,0,0,0);\n"
" float4 rayFromLocal = (float4)(0,0,0,0);\n"
" float4 rayToLocal = (float4)(0,0,0,0);\n"
" invOrn = qtInvert(orn);\n"
" invPos = qtRotate(invOrn, -pos);\n"
" rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n"
" rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n"
" rayFromLocal.w = 0.f;\n"
" rayToLocal.w = 0.f;\n"
" int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;\n"
" int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;\n"
" \n"
" if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n"
" {\n"
" hitBodyIndex = b;\n"
" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
" }\n"
" }\n"
" \n"
" if (rigidCollidable.m_shapeType == SHAPE_SPHERE)\n"
" {\n"
" float radius = rigidCollidable.m_radius;\n"
" \n"
" if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n"
" {\n"
" hitBodyIndex = b;\n"
" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
" hitNormal = (float4) (hitPoint - bodies[b].m_pos);\n"
" }\n"
" }\n"
" }\n"
" \n"
" if (hitBodyIndex >= 0)\n"
" {\n"
" hitResults[i].m_hitFraction = hitFraction;\n"
" hitResults[i].m_hitPoint = hitPoint;\n"
" hitResults[i].m_hitNormal = normalize(hitNormal);\n"
" hitResults[i].m_hitResult0 = hitBodyIndex;\n"
" }\n"
" \n"
"}\n";

View file

@ -1,17 +0,0 @@
#ifndef B3_CONSTRAINT4_h
#define B3_CONSTRAINT4_h
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Dynamics/shared/b3ContactConstraint4.h"
// GPU-side contact constraint: extends the shared b3ContactConstraint4 layout
// with an aligned allocator and accessors for the friction coefficient, which
// is packed into the otherwise-unused w component (index 3) of m_linear.
B3_ATTRIBUTE_ALIGNED16(struct)
b3GpuConstraint4 : public b3ContactConstraint4
{
B3_DECLARE_ALIGNED_ALLOCATOR();
// Friction coefficient lives in m_linear[3] to avoid an extra field.
inline void setFrictionCoeff(float value) { m_linear[3] = value; }
inline float getFrictionCoeff() const { return m_linear[3]; }
};
#endif //B3_CONSTRAINT4_h

View file

@ -1,134 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#include "b3GpuGenericConstraint.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include <new>
#include "Bullet3Common/b3Transform.h"
/// Report the number of solver rows this constraint contributes.
/// @param info   out: row count (3 for a point-to-point joint — one row per
///               locked translational axis)
/// @param bodies rigid-body array (unused for the supported joint type)
/// Asserts on any constraint type this dispatcher does not handle.
void b3GpuGenericConstraint::getInfo1(unsigned int* info, const b3RigidBodyData* bodies)
{
	if (m_constraintType == B3_GPU_POINT2POINT_CONSTRAINT_TYPE)
	{
		*info = 3;
	}
	else
	{
		b3Assert(0);  // unsupported constraint type
	}
}
// Fill the solver rows (Jacobians, error terms) for a point-to-point (ball
// joint) constraint: three rows locking relative translation at the pivot.
// @param constraint joint being solved (pivots are in body-local space)
// @param info       out: row-major Jacobian/error buffers; rowskip is the
//                   stride between rows
// @param bodies     rigid-body array indexed by constraint->m_rbA/m_rbB
void getInfo2Point2Point(b3GpuGenericConstraint* constraint, b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies)
{
// Rebuild the two world transforms from the GPU body data.
b3Transform trA;
trA.setIdentity();
trA.setOrigin(bodies[constraint->m_rbA].m_pos);
trA.setRotation(bodies[constraint->m_rbA].m_quat);
b3Transform trB;
trB.setIdentity();
trB.setOrigin(bodies[constraint->m_rbB].m_pos);
trB.setRotation(bodies[constraint->m_rbB].m_quat);
// anchor points in global coordinates with respect to body PORs.
// set jacobian
// Linear part for body A: identity (one unit axis per row).
info->m_J1linearAxis[0] = 1;
info->m_J1linearAxis[info->rowskip + 1] = 1;
info->m_J1linearAxis[2 * info->rowskip + 2] = 1;
// Pivot of A rotated into world space (lever arm a1).
b3Vector3 a1 = trA.getBasis() * constraint->getPivotInA();
//b3Vector3 a1a = b3QuatRotate(trA.getRotation(),constraint->getPivotInA());
{
// Angular part for body A: rows of the skew-symmetric matrix of -a1
// (cross-product form of the lever arm).
b3Vector3* angular0 = (b3Vector3*)(info->m_J1angularAxis);
b3Vector3* angular1 = (b3Vector3*)(info->m_J1angularAxis + info->rowskip);
b3Vector3* angular2 = (b3Vector3*)(info->m_J1angularAxis + 2 * info->rowskip);
b3Vector3 a1neg = -a1;
a1neg.getSkewSymmetricMatrix(angular0, angular1, angular2);
}
// Body B gets the negated linear part (may be absent for a fixed body).
if (info->m_J2linearAxis)
{
info->m_J2linearAxis[0] = -1;
info->m_J2linearAxis[info->rowskip + 1] = -1;
info->m_J2linearAxis[2 * info->rowskip + 2] = -1;
}
// Pivot of B rotated into world space (lever arm a2).
b3Vector3 a2 = trB.getBasis() * constraint->getPivotInB();
{
// b3Vector3 a2n = -a2;
b3Vector3* angular0 = (b3Vector3*)(info->m_J2angularAxis);
b3Vector3* angular1 = (b3Vector3*)(info->m_J2angularAxis + info->rowskip);
b3Vector3* angular2 = (b3Vector3*)(info->m_J2angularAxis + 2 * info->rowskip);
a2.getSkewSymmetricMatrix(angular0, angular1, angular2);
}
// set right hand side
// b3Scalar currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp;
b3Scalar currERP = info->erp;
// Baumgarte term: positional drift between the two world-space anchors,
// scaled by fps * ERP, pushed into the constraint error per row.
b3Scalar k = info->fps * currERP;
int j;
for (j = 0; j < 3; j++)
{
info->m_constraintError[j * info->rowskip] = k * (a2[j] + trB.getOrigin()[j] - a1[j] - trA.getOrigin()[j]);
//printf("info->m_constraintError[%d]=%f\n",j,info->m_constraintError[j]);
}
// Disabled CFM / impulse-clamp paths kept from the CPU implementation for
// reference; the GPU port does not expose these settings.
#if 0
if(m_flags & B3_P2P_FLAGS_CFM)
{
for (j=0; j<3; j++)
{
info->cfm[j*info->rowskip] = m_cfm;
}
}
#endif
#if 0
b3Scalar impulseClamp = m_setting.m_impulseClamp;//
for (j=0; j<3; j++)
{
if (m_setting.m_impulseClamp > 0)
{
info->m_lowerLimit[j*info->rowskip] = -impulseClamp;
info->m_upperLimit[j*info->rowskip] = impulseClamp;
}
}
info->m_damping = m_setting.m_damping;
#endif
}
/// Fill the solver rows for this constraint by dispatching on the joint type.
/// @param info   out: Jacobian/error buffers prepared by the solver
/// @param bodies rigid-body array the pivots are resolved against
/// Asserts on any constraint type this dispatcher does not handle.
void b3GpuGenericConstraint::getInfo2(b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies)
{
	if (m_constraintType == B3_GPU_POINT2POINT_CONSTRAINT_TYPE)
	{
		getInfo2Point2Point(this, info, bodies);
	}
	else
	{
		b3Assert(0);  // unsupported constraint type
	}
}

View file

@ -1,128 +0,0 @@
/*
Copyright (c) 2013 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef B3_GPU_GENERIC_CONSTRAINT_H
#define B3_GPU_GENERIC_CONSTRAINT_H
#include "Bullet3Common/b3Quaternion.h"
struct b3RigidBodyData;
// Bit flags stored in b3GpuGenericConstraint::m_flags.
enum B3_CONSTRAINT_FLAGS
{
B3_CONSTRAINT_FLAG_ENABLED = 1,
};
// Joint types understood by the GPU constraint solver. Only point-to-point
// and fixed are implemented; the commented entries mirror the CPU solver's
// enum for future parity.
enum b3GpuGenericConstraintType
{
B3_GPU_POINT2POINT_CONSTRAINT_TYPE = 3,
B3_GPU_FIXED_CONSTRAINT_TYPE = 4,
// B3_HINGE_CONSTRAINT_TYPE,
// B3_CONETWIST_CONSTRAINT_TYPE,
// B3_D6_CONSTRAINT_TYPE,
// B3_SLIDER_CONSTRAINT_TYPE,
// B3_CONTACT_CONSTRAINT_TYPE,
// B3_D6_SPRING_CONSTRAINT_TYPE,
// B3_GEAR_CONSTRAINT_TYPE,
B3_GPU_MAX_CONSTRAINT_TYPE
};
// Scratch buffers and parameters handed to getInfo2() when building solver
// rows; layout matches the CPU b3TypedConstraint::b3ConstraintInfo2.
struct b3GpuConstraintInfo2
{
// integrator parameters: frames per second (1/stepsize), default error
// reduction parameter (0..1).
b3Scalar fps, erp;
// for the first and second body, pointers to two (linear and angular)
// n*3 jacobian sub matrices, stored by rows. these matrices will have
// been initialized to 0 on entry. if the second body is zero then the
// J2xx pointers may be 0.
b3Scalar *m_J1linearAxis, *m_J1angularAxis, *m_J2linearAxis, *m_J2angularAxis;
// elements to jump from one row to the next in J's
int rowskip;
// right hand sides of the equation J*v = c + cfm * lambda. cfm is the
// "constraint force mixing" vector. c is set to zero on entry, cfm is
// set to a constant value (typically very small or zero) value on entry.
b3Scalar *m_constraintError, *cfm;
// lo and hi limits for variables (set to -/+ infinity on entry).
b3Scalar *m_lowerLimit, *m_upperLimit;
// findex vector for variables. see the LCP solver interface for a
// description of what this does. this is set to -1 on entry.
// note that the returned indexes are relative to the first index of
// the constraint.
int* findex;
// number of solver iterations
int m_numIterations;
//damping of the velocity
b3Scalar m_damping;
};
// POD joint description consumed by the GPU solver. Bodies are referenced by
// index into the rigid-body array; pivots are in body-local space.
B3_ATTRIBUTE_ALIGNED16(struct)
b3GpuGenericConstraint
{
int m_constraintType;  // one of b3GpuGenericConstraintType
int m_rbA;             // index of body A
int m_rbB;             // index of body B
float m_breakingImpulseThreshold;
b3Vector3 m_pivotInA;  // anchor in A's local frame
b3Vector3 m_pivotInB;  // anchor in B's local frame
// NOTE(review): presumably the target relative orientation of B w.r.t. A
// used by the fixed-constraint path — confirm against the solver kernels.
b3Quaternion m_relTargetAB;
int m_flags;           // B3_CONSTRAINT_FLAGS bits
int m_uid;             // user/application identifier
int m_padding[2];      // keeps the struct 16-byte aligned
int getRigidBodyA() const
{
return m_rbA;
}
int getRigidBodyB() const
{
return m_rbB;
}
const b3Vector3& getPivotInA() const
{
return m_pivotInA;
}
const b3Vector3& getPivotInB() const
{
return m_pivotInB;
}
int isEnabled() const
{
return m_flags & B3_CONSTRAINT_FLAG_ENABLED;
}
float getBreakingImpulseThreshold() const
{
return m_breakingImpulseThreshold;
}
///internal method used by the constraint solver, don't use them directly
void getInfo1(unsigned int* info, const b3RigidBodyData* bodies);
///internal method used by the constraint solver, don't use them directly
void getInfo2(b3GpuConstraintInfo2 * info, const b3RigidBodyData* bodies);
};
#endif //B3_GPU_GENERIC_CONSTRAINT_H

View file

@ -1,56 +0,0 @@
#ifndef B3_GPU_JACOBI_CONTACT_SOLVER_H
#define B3_GPU_JACOBI_CONTACT_SOLVER_H
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
//#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
//struct b3InertiaData;
//b3InertiaData
class b3TypedConstraint;
// Tuning parameters for the Jacobi contact solver, with conventional defaults.
struct b3JacobiSolverInfo
{
// Index treated as the immovable "world" body.
int m_fixedBodyIndex;
// Simulation timestep in seconds (default 1/60).
float m_deltaTime;
// Allowed penetration before position correction kicks in.
float m_positionDrift;
// Strength of the positional (split-impulse style) correction, 0..1.
float m_positionConstraintCoeff;
// Number of Jacobi iterations per solve.
int m_numIterations;
b3JacobiSolverInfo()
: m_fixedBodyIndex(0),
m_deltaTime(1. / 60.f),
m_positionDrift(0.005f),
m_positionConstraintCoeff(0.99f),
m_numIterations(7)
{
}
};
// OpenCL Jacobi-iteration contact solver. Unlike the PGS (Gauss-Seidel)
// solvers, Jacobi updates all contacts from the same snapshot per iteration,
// which parallelizes trivially at the cost of slower convergence.
class b3GpuJacobiContactSolver
{
protected:
// Opaque per-instance buffers/kernels (defined in the .cpp).
struct b3GpuJacobiSolverInternalData* m_data;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
public:
b3GpuJacobiContactSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity);
virtual ~b3GpuJacobiContactSolver();
// GPU path: solves numContacts contacts against the GPU body/inertia buffers.
// static0Index marks the static "world" body.
void solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index);
// CPU reference path operating on host-side arrays; useful for debugging.
void solveGroupHost(b3RigidBodyData* bodies, b3InertiaData* inertias, int numBodies, struct b3Contact4* manifoldPtr, int numManifolds, const b3JacobiSolverInfo& solverInfo);
//void solveGroupHost(btRigidBodyCL* bodies,b3InertiaData* inertias,int numBodies,btContact4* manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btJacobiSolverInfo& solverInfo);
//b3Scalar solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal);
//void solveGroup(btOpenCLArray<btRigidBodyCL>* bodies,btOpenCLArray<btInertiaCL>* inertias,btOpenCLArray<btContact4>* manifoldPtr,const btJacobiSolverInfo& solverInfo);
//void solveGroupMixed(btOpenCLArray<btRigidBodyCL>* bodies,btOpenCLArray<btInertiaCL>* inertias,btOpenCLArray<btContact4>* manifoldPtr,const btJacobiSolverInfo& solverInfo);
};
#endif //B3_GPU_JACOBI_CONTACT_SOLVER_H

View file

@ -1,101 +0,0 @@
#ifndef B3_GPU_NARROWPHASE_H
#define B3_GPU_NARROWPHASE_H
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Common/b3Vector3.h"
// GPU narrowphase: owns the collision-shape registry (convex hulls, spheres,
// planes, compounds, concave meshes), the rigid-body buffers mirrored between
// CPU and GPU, and the contact-generation kernels. register* methods return
// an index identifying the shape/collidable/body for later reference.
class b3GpuNarrowPhase
{
protected:
// Opaque CPU/GPU buffer bundle (see b3GpuNarrowPhaseInternalData).
struct b3GpuNarrowPhaseInternalData* m_data;
int m_acceleratedCompanionShapeIndex;
int m_planeBodyIndex;
// Index of the shared static "world" body.
int m_static0Index;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
int registerConvexHullShapeInternal(class b3ConvexUtility* convexPtr, b3Collidable& col);
int registerConcaveMeshShape(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, b3Collidable& col, const float* scaling);
public:
b3GpuNarrowPhase(cl_context vtx, cl_device_id dev, cl_command_queue q, const struct b3Config& config);
virtual ~b3GpuNarrowPhase(void);
// --- shape registration (returns a collidable/shape index) ---
int registerSphereShape(float radius);
int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant);
int registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes);
int registerFace(const b3Vector3& faceNormal, float faceConstant);
int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, const float* scaling);
//do they need to be merged?
int registerConvexHullShape(b3ConvexUtility* utilPtr);
int registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
// --- body registration and transform sync ---
int registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation, const float* aabbMin, const float* aabbMax, bool writeToGpu);
void setObjectTransform(const float* position, const float* orientation, int bodyIndex);
// Upload all host-side body data to the GPU (call after registration/edits).
void writeAllBodiesToGpu();
void reset();
// Download body data back to the host for CPU-side queries.
void readbackAllBodiesToCpu();
bool getObjectTransformFromCpu(float* position, float* orientation, int bodyIndex) const;
void setObjectTransformCpu(float* position, float* orientation, int bodyIndex);
void setObjectVelocityCpu(float* linVel, float* angVel, int bodyIndex);
// Run contact generation for the broadphase's overlapping pairs.
virtual void computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbsWorldSpace, int numObjects);
// --- buffer accessors (GPU handles and CPU mirrors) ---
cl_mem getBodiesGpu();
const struct b3RigidBodyData* getBodiesCpu() const;
//struct b3RigidBodyData* getBodiesCpu();
int getNumBodiesGpu() const;
cl_mem getBodyInertiasGpu();
int getNumBodyInertiasGpu() const;
cl_mem getCollidablesGpu();
const struct b3Collidable* getCollidablesCpu() const;
int getNumCollidablesGpu() const;
const struct b3SapAabb* getLocalSpaceAabbsCpu() const;
const struct b3Contact4* getContactsCPU() const;
cl_mem getContactsGpu();
int getNumContactsGpu() const;
cl_mem getAabbLocalSpaceBufferGpu();
int getNumRigidBodies() const;
int allocateCollidable();
int getStatic0Index() const
{
return m_static0Index;
}
b3Collidable& getCollidableCpu(int collidableIndex);
const b3Collidable& getCollidableCpu(int collidableIndex) const;
const b3GpuNarrowPhaseInternalData* getInternalData() const
{
return m_data;
}
b3GpuNarrowPhaseInternalData* getInternalData()
{
return m_data;
}
const struct b3SapAabb& getLocalSpaceAabb(int collidableIndex) const;
};
#endif //B3_GPU_NARROWPHASE_H

View file

@ -1,89 +0,0 @@
#ifndef B3_GPU_NARROWPHASE_INTERNAL_DATA_H
#define B3_GPU_NARROWPHASE_INTERNAL_DATA_H
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
#include "Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h"
#include "Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h"
#include "Bullet3Common/shared/b3Int4.h"
#include "Bullet3Common/shared/b3Int2.h"
class b3ConvexUtility;
// All CPU-side arrays and their GPU mirrors owned by b3GpuNarrowPhase.
// Convention: plain b3AlignedObjectArray members are host copies, matching
// b3OpenCLArray pointers are the device buffers.
struct b3GpuNarrowPhaseInternalData
{
// --- convex hull geometry (host + device) ---
b3AlignedObjectArray<b3ConvexUtility*>* m_convexData;
b3AlignedObjectArray<b3ConvexPolyhedronData> m_convexPolyhedra;
b3AlignedObjectArray<b3Vector3> m_uniqueEdges;
b3AlignedObjectArray<b3Vector3> m_convexVertices;
b3AlignedObjectArray<int> m_convexIndices;
b3OpenCLArray<b3ConvexPolyhedronData>* m_convexPolyhedraGPU;
b3OpenCLArray<b3Vector3>* m_uniqueEdgesGPU;
b3OpenCLArray<b3Vector3>* m_convexVerticesGPU;
b3OpenCLArray<int>* m_convexIndicesGPU;
// Scratch buffers for SAT contact clipping.
b3OpenCLArray<b3Vector3>* m_worldVertsB1GPU;
b3OpenCLArray<b3Int4>* m_clippingFacesOutGPU;
b3OpenCLArray<b3Vector3>* m_worldNormalsAGPU;
b3OpenCLArray<b3Vector3>* m_worldVertsA1GPU;
b3OpenCLArray<b3Vector3>* m_worldVertsB2GPU;
// Compound shape children.
b3AlignedObjectArray<b3GpuChildShape> m_cpuChildShapes;
b3OpenCLArray<b3GpuChildShape>* m_gpuChildShapes;
b3AlignedObjectArray<b3GpuFace> m_convexFaces;
b3OpenCLArray<b3GpuFace>* m_convexFacesGPU;
// SAT collision kernels wrapper.
struct GpuSatCollision* m_gpuSatCollision;
b3OpenCLArray<b3Int4>* m_triangleConvexPairs;
// Double-buffered contact output; m_currentContactBuffer selects the
// active entry of the two GPU buffers.
b3OpenCLArray<b3Contact4>* m_pBufContactBuffersGPU[2];
int m_currentContactBuffer;
b3AlignedObjectArray<b3Contact4>* m_pBufContactOutCPU;
// --- rigid bodies and inertias ---
b3AlignedObjectArray<b3RigidBodyData>* m_bodyBufferCPU;
b3OpenCLArray<b3RigidBodyData>* m_bodyBufferGPU;
b3AlignedObjectArray<b3InertiaData>* m_inertiaBufferCPU;
b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU;
int m_numAcceleratedShapes;
int m_numAcceleratedRigidBodies;
b3AlignedObjectArray<b3Collidable> m_collidablesCPU;
b3OpenCLArray<b3Collidable>* m_collidablesGPU;
// Shape-local-space AABBs.
b3OpenCLArray<b3SapAabb>* m_localShapeAABBGPU;
b3AlignedObjectArray<b3SapAabb>* m_localShapeAABBCPU;
// --- concave mesh BVH data ---
b3AlignedObjectArray<class b3OptimizedBvh*> m_bvhData;
b3AlignedObjectArray<class b3TriangleIndexVertexArray*> m_meshInterfaces;
b3AlignedObjectArray<b3QuantizedBvhNode> m_treeNodesCPU;
b3AlignedObjectArray<b3BvhSubtreeInfo> m_subTreesCPU;
b3AlignedObjectArray<b3BvhInfo> m_bvhInfoCPU;
b3OpenCLArray<b3BvhInfo>* m_bvhInfoGPU;
b3OpenCLArray<b3QuantizedBvhNode>* m_treeNodesGPU;
b3OpenCLArray<b3BvhSubtreeInfo>* m_subTreesGPU;
b3Config m_config;
};
#endif //B3_GPU_NARROWPHASE_INTERNAL_DATA_H

View file

@ -1,76 +0,0 @@
/*
Copyright (c) 2013 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef B3_GPU_PGS_CONSTRAINT_SOLVER_H
#define B3_GPU_PGS_CONSTRAINT_SOLVER_H
struct b3Contact4;
struct b3ContactPoint;
class b3Dispatcher;
#include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h"
#include "Bullet3Dynamics/ConstraintSolver/b3ContactSolverInfo.h"
#include "b3GpuSolverBody.h"
#include "b3GpuSolverConstraint.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
struct b3RigidBodyData;
struct b3InertiaData;
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "b3GpuGenericConstraint.h"
// GPU projected Gauss-Seidel (PGS) solver for generic (joint) constraints.
// Follows the CPU sequential-impulse solver's setup/iterate/finish phases,
// batching independent constraints so each batch can be solved in parallel.
class b3GpuPgsConstraintSolver
{
protected:
// Index treated as the static "world" body during batching.
int m_staticIdx;
// Opaque GPU buffers/kernels (defined in the .cpp).
struct b3GpuPgsJacobiSolverInternalData* m_gpuData;
protected:
// Host-side solver pools mirroring the CPU solver layout.
b3AlignedObjectArray<b3GpuSolverBody> m_tmpSolverBodyPool;
b3GpuConstraintArray m_tmpSolverContactConstraintPool;
b3GpuConstraintArray m_tmpSolverNonContactConstraintPool;
b3GpuConstraintArray m_tmpSolverContactFrictionConstraintPool;
b3GpuConstraintArray m_tmpSolverContactRollingFrictionConstraintPool;
b3AlignedObjectArray<unsigned int> m_tmpConstraintSizesPool;
// true: Gauss-Seidel style updates; false: averaged (Jacobi-like) velocities.
bool m_usePgs;
void averageVelocities();
int m_maxOverrideNumSolverIterations;
int m_numSplitImpulseRecoveries;
// int getOrInitSolverBody(int bodyIndex, b3RigidBodyData* bodies,b3InertiaData* inertias);
void initSolverBody(int bodyIndex, b3GpuSolverBody* solverBody, b3RigidBodyData* rb);
public:
b3GpuPgsConstraintSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, bool usePgs);
virtual ~b3GpuPgsConstraintSolver();
// Solve phases (setup -> iterations -> finish); solveGroup drives all three.
virtual b3Scalar solveGroupCacheFriendlyIterations(b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints1, int numConstraints, const b3ContactSolverInfo& infoGlobal);
virtual b3Scalar solveGroupCacheFriendlySetup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal);
b3Scalar solveGroupCacheFriendlyFinish(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal);
b3Scalar solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal);
// Convenience entry point used by the rigid-body pipeline.
void solveJoints(int numBodies, b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias,
int numConstraints, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints);
// Greedy batching: groups constraints so no two in a batch share a body.
int sortConstraintByBatch3(struct b3BatchConstraint* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies);
void recomputeBatches();
};
#endif //B3_GPU_PGS_CONSTRAINT_SOLVER_H

View file

@ -1,37 +0,0 @@
#ifndef B3_GPU_BATCHING_PGS_SOLVER_H
#define B3_GPU_BATCHING_PGS_SOLVER_H
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
#include "b3GpuConstraint4.h"
// GPU batching PGS solver for contact constraints (b3Contact4).
// Declaration only; method bodies live in the matching .cpp.
class b3GpuPgsContactSolver
{
protected:
int m_debugOutput;
// opaque internal data (OpenCL buffers/kernels), defined in the .cpp
struct b3GpuBatchingPgsSolverInternalData* m_data;
// split contacts into independent batches so they can be solved in parallel
void batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx);
// alternative host-side batching strategies (kept for comparison/debugging)
inline int sortConstraintByBatch(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies);
inline int sortConstraintByBatch2(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies);
inline int sortConstraintByBatch3(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies, int* batchSizes);
void solveContactConstraintBatchSizes(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes); //const b3OpenCLArray<int>* gpuBatchSizes);
void solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes); //const b3OpenCLArray<int>* gpuBatchSizes);
public:
b3GpuPgsContactSolver(cl_context ctx, cl_device_id device, cl_command_queue q, int pairCapacity);
virtual ~b3GpuPgsContactSolver();
// solve all contacts between the given GPU body/inertia/contact buffers
void solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index);
};
#endif //B3_GPU_BATCHING_PGS_SOLVER_H

View file

@ -1,677 +0,0 @@
/*
Copyright (c) 2013 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#include "b3GpuRigidBodyPipeline.h"
#include "b3GpuRigidBodyPipelineInternalData.h"
#include "kernels/integrateKernel.h"
#include "kernels/updateAabbsKernel.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "b3GpuNarrowPhase.h"
#include "Bullet3Geometry/b3AabbUtil.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3Dynamics/ConstraintSolver/b3PgsJacobiSolver.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3UpdateAabbs.h"
#include "Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.h"
//#define TEST_OTHER_GPU_SOLVER
#define B3_RIGIDBODY_INTEGRATE_PATH "src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl"
#define B3_RIGIDBODY_UPDATEAABB_PATH "src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl"
// Global debug/tuning toggles for the GPU rigid-body pipeline.
// when true, joint constraints may fall back to the CPU b3PgsJacobiSolver
bool useBullet2CpuSolver = true;
//choice of contact solver
bool gUseJacobi = false;
// use the CPU dynamic-BVH broadphase instead of the GPU SAP broadphase
bool gUseDbvt = false;
// print per-step contact/point counts (see stepSimulation)
bool gDumpContactStats = false;
// compute world-space AABBs on the host instead of the GPU kernel
bool gCalcWorldSpaceAabbOnCpu = false;
// run the SAP pair search on the host instead of the GPU
bool gUseCalculateOverlappingPairsHost = false;
// integrate transforms on the host instead of the GPU kernel
bool gIntegrateOnCpu = false;
// clear pair caches with a GPU kernel rather than a host round-trip
bool gClearPairsOnGpu = true;
#define TEST_OTHER_GPU_SOLVER 1
#ifdef TEST_OTHER_GPU_SOLVER
#include "b3GpuJacobiContactSolver.h"
#endif //TEST_OTHER_GPU_SOLVER
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
#include "Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.h"
#include "b3GpuPgsContactSolver.h"
#include "b3Solver.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
#include "Bullet3OpenCL/Raycast/b3GpuRaycast.h"
#include "Bullet3Dynamics/shared/b3IntegrateTransforms.h"
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h"
// Constructs the GPU rigid-body pipeline: stores the supplied narrowphase and
// broadphases (not owned), allocates the solvers and GPU-side arrays (sized
// from config), and compiles the integrate / update-AABB OpenCL kernels.
b3GpuRigidBodyPipeline::b3GpuRigidBodyPipeline(cl_context ctx, cl_device_id device, cl_command_queue q, class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap, struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config)
{
m_data = new b3GpuRigidBodyPipelineInternalData;
m_data->m_constraintUid = 0;
m_data->m_config = config;
m_data->m_context = ctx;
m_data->m_device = device;
m_data->m_queue = q;
// CPU joint solver plus GPU PGS joint solver (selected at step time)
m_data->m_solver = new b3PgsJacobiSolver(true); //new b3PgsJacobiSolver(true);
m_data->m_gpuSolver = new b3GpuPgsConstraintSolver(ctx, device, q, true); //new b3PgsJacobiSolver(true);
// device arrays sized from the configured capacities
m_data->m_allAabbsGPU = new b3OpenCLArray<b3SapAabb>(ctx, q, config.m_maxConvexBodies);
m_data->m_overlappingPairsGPU = new b3OpenCLArray<b3BroadphasePair>(ctx, q, config.m_maxBroadphasePairs);
m_data->m_gpuConstraints = new b3OpenCLArray<b3GpuGenericConstraint>(ctx, q);
#ifdef TEST_OTHER_GPU_SOLVER
m_data->m_solver3 = new b3GpuJacobiContactSolver(ctx, device, q, config.m_maxBroadphasePairs);
#endif // TEST_OTHER_GPU_SOLVER
m_data->m_solver2 = new b3GpuPgsContactSolver(ctx, device, q, config.m_maxBroadphasePairs);
m_data->m_raycaster = new b3GpuRaycast(ctx, device, q);
m_data->m_broadphaseDbvt = broadphaseDbvt;
m_data->m_broadphaseSap = broadphaseSap;
m_data->m_narrowphase = narrowphase;
// default gravity; can be changed later via setGravity()
m_data->m_gravity.setValue(0.f, -9.8f, 0.f);
cl_int errNum = 0;
{
// compile the transform-integration kernel from its embedded source string
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context, m_data->m_device, integrateKernelCL, &errNum, "", B3_RIGIDBODY_INTEGRATE_PATH);
b3Assert(errNum == CL_SUCCESS);
m_data->m_integrateTransformsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, integrateKernelCL, "integrateTransformsKernel", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
// release the program handle; created kernels stay valid (OpenCL refcounting)
clReleaseProgram(prog);
}
{
// compile the AABB-update program: full AABB refresh + pair-cache clearing
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context, m_data->m_device, updateAabbsKernelCL, &errNum, "", B3_RIGIDBODY_UPDATEAABB_PATH);
b3Assert(errNum == CL_SUCCESS);
m_data->m_updateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, updateAabbsKernelCL, "initializeGpuAabbsFull", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
m_data->m_clearOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, updateAabbsKernelCL, "clearOverlappingPairsKernel", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
clReleaseProgram(prog);
}
}
// Destructor: releases the compiled OpenCL kernels (guarded: compilation may
// have failed), destroys the owned solvers and GPU arrays, then frees the
// internal data block. The narrowphase and the broadphases passed to the
// constructor are NOT owned by the pipeline and are left untouched.
b3GpuRigidBodyPipeline::~b3GpuRigidBodyPipeline()
{
	if (m_data->m_integrateTransformsKernel)
		clReleaseKernel(m_data->m_integrateTransformsKernel);

	if (m_data->m_updateAabbsKernel)
		clReleaseKernel(m_data->m_updateAabbsKernel);

	if (m_data->m_clearOverlappingPairsKernel)
		clReleaseKernel(m_data->m_clearOverlappingPairsKernel);
	delete m_data->m_raycaster;
	delete m_data->m_solver;
	// BUGFIX: m_gpuSolver is allocated in the constructor but was never freed (leak)
	delete m_data->m_gpuSolver;
	delete m_data->m_allAabbsGPU;
	delete m_data->m_gpuConstraints;
	delete m_data->m_overlappingPairsGPU;

#ifdef TEST_OTHER_GPU_SOLVER
	delete m_data->m_solver3;
#endif  //TEST_OTHER_GPU_SOLVER

	delete m_data->m_solver2;
	delete m_data;
}
// Clears the pipeline's world state: all constraints and all world-space
// AABBs, on both host and device. Registered bodies in the narrowphase and
// broadphase proxies are not affected here.
void b3GpuRigidBodyPipeline::reset()
{
	// constraints: host mirror and GPU array
	m_data->m_cpuConstraints.resize(0);
	m_data->m_gpuConstraints->resize(0);
	// world-space AABBs: host mirror and GPU array
	m_data->m_allAabbsCPU.resize(0);
	m_data->m_allAabbsGPU->resize(0);
}
// Registers a Bullet2-style typed constraint; these are solved on the CPU by
// m_solver in stepSimulation (the pointer is stored, not copied or owned).
void b3GpuRigidBodyPipeline::addConstraint(b3TypedConstraint* constraint)
{
m_data->m_joints.push_back(constraint);
}
// Unregisters a previously added typed constraint (does not delete it).
void b3GpuRigidBodyPipeline::removeConstraint(b3TypedConstraint* constraint)
{
m_data->m_joints.remove(constraint);
}
// Removes the GPU generic constraint with the given uid (as returned by
// createPoint2PointConstraint / createFixedConstraint). Round-trips the
// constraint array through the host, removes by swap-and-pop, and writes the
// result back to the GPU.
void b3GpuRigidBodyPipeline::removeConstraintByUid(int uid)
{
	// the constraint set changes, so previously computed solver batches are stale
	m_data->m_gpuSolver->recomputeBatches();

	// bring constraints to the host for a linear search by uid
	m_data->m_gpuConstraints->copyToHost(m_data->m_cpuConstraints);

	for (int idx = 0; idx < m_data->m_cpuConstraints.size(); idx++)
	{
		if (m_data->m_cpuConstraints[idx].m_uid != uid)
			continue;
		// swap-and-pop removal (order of remaining constraints may change)
		m_data->m_cpuConstraints.swap(idx, m_data->m_cpuConstraints.size() - 1);
		m_data->m_cpuConstraints.pop_back();
		break;
	}

	if (m_data->m_cpuConstraints.size())
	{
		m_data->m_gpuConstraints->copyFromHost(m_data->m_cpuConstraints);
	}
	else
	{
		m_data->m_gpuConstraints->resize(0);
	}
}
int b3GpuRigidBodyPipeline::createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, float breakingThreshold)
{
m_data->m_gpuSolver->recomputeBatches();
b3GpuGenericConstraint c;
c.m_uid = m_data->m_constraintUid;
m_data->m_constraintUid++;
c.m_flags = B3_CONSTRAINT_FLAG_ENABLED;
c.m_rbA = bodyA;
c.m_rbB = bodyB;
c.m_pivotInA.setValue(pivotInA[0], pivotInA[1], pivotInA[2]);
c.m_pivotInB.setValue(pivotInB[0], pivotInB[1], pivotInB[2]);
c.m_breakingImpulseThreshold = breakingThreshold;
c.m_constraintType = B3_GPU_POINT2POINT_CONSTRAINT_TYPE;
m_data->m_cpuConstraints.push_back(c);
return c.m_uid;
}
int b3GpuRigidBodyPipeline::createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB, float breakingThreshold)
{
m_data->m_gpuSolver->recomputeBatches();
b3GpuGenericConstraint c;
c.m_uid = m_data->m_constraintUid;
m_data->m_constraintUid++;
c.m_flags = B3_CONSTRAINT_FLAG_ENABLED;
c.m_rbA = bodyA;
c.m_rbB = bodyB;
c.m_pivotInA.setValue(pivotInA[0], pivotInA[1], pivotInA[2]);
c.m_pivotInB.setValue(pivotInB[0], pivotInB[1], pivotInB[2]);
c.m_relTargetAB.setValue(relTargetAB[0], relTargetAB[1], relTargetAB[2], relTargetAB[3]);
c.m_breakingImpulseThreshold = breakingThreshold;
c.m_constraintType = B3_GPU_FIXED_CONSTRAINT_TYPE;
m_data->m_cpuConstraints.push_back(c);
return c.m_uid;
}
// Advances the simulation by deltaTime:
//   1. refresh world-space AABBs (setupGpuAabbsFull)
//   2. broadphase: overlapping pairs via DBVT (CPU) or SAP (GPU)
//   3. narrowphase: contact generation on the GPU
//   4. solve joints (CPU or GPU PGS) and contacts (PGS or Jacobi)
//   5. integrate transforms
void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
{
	//update worldspace AABBs from local AABB/worldtransform
	{
		B3_PROFILE("setupGpuAabbs");
		setupGpuAabbsFull();
	}

	int numPairs = 0;

	//compute overlapping pairs
	{
		if (gUseDbvt)
		{
			{
				B3_PROFILE("setAabb");
				// DBVT runs on the host: pull AABBs back and feed them in one by one
				m_data->m_allAabbsGPU->copyToHost(m_data->m_allAabbsCPU);
				for (int i = 0; i < m_data->m_allAabbsCPU.size(); i++)
				{
					b3Vector3 aabbMin = b3MakeVector3(m_data->m_allAabbsCPU[i].m_min[0], m_data->m_allAabbsCPU[i].m_min[1], m_data->m_allAabbsCPU[i].m_min[2]);
					b3Vector3 aabbMax = b3MakeVector3(m_data->m_allAabbsCPU[i].m_max[0], m_data->m_allAabbsCPU[i].m_max[1], m_data->m_allAabbsCPU[i].m_max[2]);
					m_data->m_broadphaseDbvt->setAabb(i, aabbMin, aabbMax, 0);
				}
			}

			{
				B3_PROFILE("calculateOverlappingPairs");
				m_data->m_broadphaseDbvt->calculateOverlappingPairs();
			}
			numPairs = m_data->m_broadphaseDbvt->getOverlappingPairCache()->getNumOverlappingPairs();
		}
		else
		{
			if (gUseCalculateOverlappingPairsHost)
			{
				m_data->m_broadphaseSap->calculateOverlappingPairsHost(m_data->m_config.m_maxBroadphasePairs);
			}
			else
			{
				m_data->m_broadphaseSap->calculateOverlappingPairs(m_data->m_config.m_maxBroadphasePairs);
			}
			numPairs = m_data->m_broadphaseSap->getNumOverlap();
		}
	}

	//compute contact points
	int numContacts = 0;
	int numBodies = m_data->m_narrowphase->getNumRigidBodies();

	if (numPairs)
	{
		cl_mem pairs = 0;
		cl_mem aabbsWS = 0;
		if (gUseDbvt)
		{
			B3_PROFILE("m_overlappingPairsGPU->copyFromHost");
			m_data->m_overlappingPairsGPU->copyFromHost(m_data->m_broadphaseDbvt->getOverlappingPairCache()->getOverlappingPairArray());
			pairs = m_data->m_overlappingPairsGPU->getBufferCL();
			aabbsWS = m_data->m_allAabbsGPU->getBufferCL();
		}
		else
		{
			pairs = m_data->m_broadphaseSap->getOverlappingPairBuffer();
			aabbsWS = m_data->m_broadphaseSap->getAabbBufferWS();
		}

		m_data->m_overlappingPairsGPU->resize(numPairs);

		//mark the contacts for each pair as 'unused'
		if (numPairs)
		{
			b3OpenCLArray<b3BroadphasePair> gpuPairs(this->m_data->m_context, m_data->m_queue);
			gpuPairs.setFromOpenCLBuffer(pairs, numPairs);

			if (gClearPairsOnGpu)
			{
				b3LauncherCL launcher(m_data->m_queue, m_data->m_clearOverlappingPairsKernel, "clearOverlappingPairsKernel");
				launcher.setBuffer(pairs);
				launcher.setConst(numPairs);
				launcher.launch1D(numPairs);
			}
			else
			{
				// host fallback: the 'z' component caches the contact slot; 0xffffffff = unused
				b3AlignedObjectArray<b3BroadphasePair> hostPairs;
				gpuPairs.copyToHost(hostPairs);
				for (int i = 0; i < hostPairs.size(); i++)
				{
					hostPairs[i].z = 0xffffffff;
				}
				gpuPairs.copyFromHost(hostPairs);
			}
		}

		m_data->m_narrowphase->computeContacts(pairs, numPairs, aabbsWS, numBodies);
		numContacts = m_data->m_narrowphase->getNumContactsGpu();

		if (gUseDbvt)
		{
			///store the cached information (contact locations in the 'z' component)
			B3_PROFILE("m_overlappingPairsGPU->copyToHost");
			m_data->m_overlappingPairsGPU->copyToHost(m_data->m_broadphaseDbvt->getOverlappingPairCache()->getOverlappingPairArray());
		}

		if (gDumpContactStats && numContacts)
		{
			m_data->m_narrowphase->getContactsGpu();
			printf("numContacts = %d\n", numContacts);

			int totalPoints = 0;
			const b3Contact4* contacts = m_data->m_narrowphase->getContactsCPU();
			for (int i = 0; i < numContacts; i++)
			{
				// BUGFIX: index the i-th contact; the original summed
				// contacts->getNPoints() (always contact 0), so totalPoints
				// was numContacts * points-of-first-contact.
				totalPoints += contacts[i].getNPoints();
			}
			printf("totalPoints=%d\n", totalPoints);
		}
	}

	//convert contact points to contact constraints, then solve
	// wrap the narrowphase's device buffers without taking ownership
	b3OpenCLArray<b3RigidBodyData> gpuBodies(m_data->m_context, m_data->m_queue, 0, true);
	gpuBodies.setFromOpenCLBuffer(m_data->m_narrowphase->getBodiesGpu(), m_data->m_narrowphase->getNumRigidBodies());
	b3OpenCLArray<b3InertiaData> gpuInertias(m_data->m_context, m_data->m_queue, 0, true);
	gpuInertias.setFromOpenCLBuffer(m_data->m_narrowphase->getBodyInertiasGpu(), m_data->m_narrowphase->getNumRigidBodies());
	b3OpenCLArray<b3Contact4> gpuContacts(m_data->m_context, m_data->m_queue, 0, true);
	gpuContacts.setFromOpenCLBuffer(m_data->m_narrowphase->getContactsGpu(), m_data->m_narrowphase->getNumContactsGpu());

	int numJoints = m_data->m_joints.size() ? m_data->m_joints.size() : m_data->m_cpuConstraints.size();
	if (useBullet2CpuSolver && numJoints)
	{
		{
			// GPU joint solver only when no Bullet2-style CPU joints are registered
			bool useGpu = m_data->m_joints.size() == 0;
			if (useGpu)
			{
				m_data->m_gpuSolver->solveJoints(m_data->m_narrowphase->getNumRigidBodies(), &gpuBodies, &gpuInertias, numJoints, m_data->m_gpuConstraints);
			}
			else
			{
				// CPU path: round-trip bodies/inertias through host memory
				b3AlignedObjectArray<b3RigidBodyData> hostBodies;
				gpuBodies.copyToHost(hostBodies);
				b3AlignedObjectArray<b3InertiaData> hostInertias;
				gpuInertias.copyToHost(hostInertias);

				b3TypedConstraint** joints = numJoints ? &m_data->m_joints[0] : 0;
				m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumRigidBodies(), &hostBodies[0], &hostInertias[0], 0, 0, numJoints, joints);
				gpuBodies.copyFromHost(hostBodies);
			}
		}
	}

	if (numContacts)
	{
#ifdef TEST_OTHER_GPU_SOLVER
		if (gUseJacobi)
		{
			bool useGpu = true;
			if (useGpu)
			{
				bool forceHost = false;
				if (forceHost)
				{
					// debug path: run the Jacobi solver entirely on the host
					b3AlignedObjectArray<b3RigidBodyData> hostBodies;
					b3AlignedObjectArray<b3InertiaData> hostInertias;
					b3AlignedObjectArray<b3Contact4> hostContacts;
					{
						B3_PROFILE("copyToHost");
						gpuBodies.copyToHost(hostBodies);
						gpuInertias.copyToHost(hostInertias);
						gpuContacts.copyToHost(hostContacts);
					}
					{
						b3JacobiSolverInfo solverInfo;
						m_data->m_solver3->solveGroupHost(&hostBodies[0], &hostInertias[0], hostBodies.size(), &hostContacts[0], hostContacts.size(), solverInfo);
					}
					{
						B3_PROFILE("copyFromHost");
						gpuBodies.copyFromHost(hostBodies);
					}
				}
				else
				{
					int static0Index = m_data->m_narrowphase->getStatic0Index();
					b3JacobiSolverInfo solverInfo;
					m_data->m_solver3->solveContacts(numBodies, gpuBodies.getBufferCL(), gpuInertias.getBufferCL(), numContacts, gpuContacts.getBufferCL(), m_data->m_config, static0Index);
				}
			}
			else
			{
				// disabled host path kept for reference; only round-trips the data
				b3AlignedObjectArray<b3RigidBodyData> hostBodies;
				gpuBodies.copyToHost(hostBodies);
				b3AlignedObjectArray<b3InertiaData> hostInertias;
				gpuInertias.copyToHost(hostInertias);
				b3AlignedObjectArray<b3Contact4> hostContacts;
				gpuContacts.copyToHost(hostContacts);
				{
					//m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],numContacts,&hostContacts[0]);
				}
				gpuBodies.copyFromHost(hostBodies);
			}
		}
		else
#endif  //TEST_OTHER_GPU_SOLVER
		{
			// default: GPU batching PGS contact solver
			int static0Index = m_data->m_narrowphase->getStatic0Index();
			m_data->m_solver2->solveContacts(numBodies, gpuBodies.getBufferCL(), gpuInertias.getBufferCL(), numContacts, gpuContacts.getBufferCL(), m_data->m_config, static0Index);
		}
	}

	integrate(deltaTime);
}
// Integrates all rigid-body transforms by timeStep, applying gravity and a
// fixed angular damping factor. Runs on the host (gIntegrateOnCpu) or via the
// integrateTransformsKernel on the GPU.
void b3GpuRigidBodyPipeline::integrate(float timeStep)
{
	const int numBodies = m_data->m_narrowphase->getNumRigidBodies();
	const float angularDamping = 0.99f;

	if (gIntegrateOnCpu)
	{
		if (numBodies)
		{
			// host path: pull bodies down, integrate each, push them back
			b3GpuNarrowPhaseInternalData* npData = m_data->m_narrowphase->getInternalData();
			npData->m_bodyBufferGPU->copyToHost(*npData->m_bodyBufferCPU);

			b3RigidBodyData_t* bodies = &npData->m_bodyBufferCPU->at(0);
			for (int bodyId = 0; bodyId < numBodies; bodyId++)
			{
				integrateSingleTransform(bodies, bodyId, timeStep, angularDamping, m_data->m_gravity);
			}

			npData->m_bodyBufferGPU->copyFromHost(*npData->m_bodyBufferCPU);
		}
	}
	else
	{
		// GPU path: one work item per body
		b3LauncherCL launcher(m_data->m_queue, m_data->m_integrateTransformsKernel, "m_integrateTransformsKernel");
		launcher.setBuffer(m_data->m_narrowphase->getBodiesGpu());
		launcher.setConst(numBodies);
		launcher.setConst(timeStep);
		launcher.setConst(angularDamping);
		launcher.setConst(m_data->m_gravity);
		launcher.launch1D(numBodies);
	}
}
// Recomputes every body's world-space AABB from its local-space AABB and
// current transform, either on the host (gCalcWorldSpaceAabbOnCpu) or with
// the initializeGpuAabbsFull kernel. The destination buffer depends on the
// active broadphase (DBVT: m_allAabbsGPU, SAP: the broadphase's own buffer).
void b3GpuRigidBodyPipeline::setupGpuAabbsFull()
{
cl_int ciErrNum = 0;
int numBodies = m_data->m_narrowphase->getNumRigidBodies();
if (!numBodies)
return;
if (gCalcWorldSpaceAabbOnCpu)
{
if (numBodies)
{
if (gUseDbvt)
{
// host path for the DBVT broadphase: fill the pipeline's own AABB arrays
m_data->m_allAabbsCPU.resize(numBodies);
m_data->m_narrowphase->readbackAllBodiesToCpu();
for (int i = 0; i < numBodies; i++)
{
b3ComputeWorldAabb(i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(), &m_data->m_allAabbsCPU[0]);
}
m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU);
}
else
{
// host path for the SAP broadphase: fill the broadphase's AABB arrays
m_data->m_broadphaseSap->getAllAabbsCPU().resize(numBodies);
m_data->m_narrowphase->readbackAllBodiesToCpu();
for (int i = 0; i < numBodies; i++)
{
b3ComputeWorldAabb(i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(), &m_data->m_broadphaseSap->getAllAabbsCPU()[0]);
}
m_data->m_broadphaseSap->getAllAabbsGPU().copyFromHost(m_data->m_broadphaseSap->getAllAabbsCPU());
//m_data->m_broadphaseSap->writeAabbsToGpu();
}
}
}
else
{
// GPU path; argument order must match the kernel signature below
//__kernel void initializeGpuAabbsFull( const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global b3AABBCL* plocalShapeAABB, __global b3AABBCL* pAABB)
b3LauncherCL launcher(m_data->m_queue, m_data->m_updateAabbsKernel, "m_updateAabbsKernel");
launcher.setConst(numBodies);
cl_mem bodies = m_data->m_narrowphase->getBodiesGpu();
launcher.setBuffer(bodies);
cl_mem collidables = m_data->m_narrowphase->getCollidablesGpu();
launcher.setBuffer(collidables);
cl_mem localAabbs = m_data->m_narrowphase->getAabbLocalSpaceBufferGpu();
launcher.setBuffer(localAabbs);
cl_mem worldAabbs = 0;
if (gUseDbvt)
{
worldAabbs = m_data->m_allAabbsGPU->getBufferCL();
}
else
{
worldAabbs = m_data->m_broadphaseSap->getAabbBufferWS();
}
launcher.setBuffer(worldAabbs);
launcher.launch1D(numBodies);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
/*
b3AlignedObjectArray<b3SapAabb> aabbs;
m_data->m_broadphaseSap->m_allAabbsGPU.copyToHost(aabbs);
printf("numAabbs = %d\n", aabbs.size());
for (int i=0;i<aabbs.size();i++)
{
printf("aabb[%d].m_min=%f,%f,%f,%d\n",i,aabbs[i].m_minVec[0],aabbs[i].m_minVec[1],aabbs[i].m_minVec[2],aabbs[i].m_minIndices[3]);
printf("aabb[%d].m_max=%f,%f,%f,%d\n",i,aabbs[i].m_maxVec[0],aabbs[i].m_maxVec[1],aabbs[i].m_maxVec[2],aabbs[i].m_signedMaxIndices[3]);
};
*/
}
// Returns the OpenCL device buffer holding all rigid-body data.
cl_mem b3GpuRigidBodyPipeline::getBodyBuffer()
{
return m_data->m_narrowphase->getBodiesGpu();
}
// Returns the number of rigid bodies registered with the narrowphase.
int b3GpuRigidBodyPipeline::getNumBodies() const
{
return m_data->m_narrowphase->getNumRigidBodies();
}
// Sets the global gravity vector; grav must point to at least 3 floats.
void b3GpuRigidBodyPipeline::setGravity(const float* grav)
{
m_data->m_gravity.setValue(grav[0], grav[1], grav[2]);
}
// Refreshes the host-side mirror of the GPU generic-constraint array.
void b3GpuRigidBodyPipeline::copyConstraintsToHost()
{
m_data->m_gpuConstraints->copyToHost(m_data->m_cpuConstraints);
}
// Uploads the host-side AABBs and constraints to the GPU. Call this once
// after registering instances with writeInstanceToGpu == false.
void b3GpuRigidBodyPipeline::writeAllInstancesToGpu()
{
m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU);
m_data->m_gpuConstraints->copyFromHost(m_data->m_cpuConstraints);
}
// Registers one rigid-body instance for an already-registered collidable.
// Computes the world-space AABB from the collidable's local AABB and the
// given position/orientation, registers the body with the narrowphase, and
// creates a proxy in the active broadphase (DBVT, or SAP where zero-mass
// bodies go into the 'large proxy' set). Returns the new body index, or -1
// when collidableIndex is invalid. Note: userIndex is currently unused.
int b3GpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* position, const float* orientation, int collidableIndex, int userIndex, bool writeInstanceToGpu)
{
	b3Vector3 aabbMin = b3MakeVector3(0, 0, 0), aabbMax = b3MakeVector3(0, 0, 0);

	if (collidableIndex >= 0)
	{
		b3SapAabb localAabb = m_data->m_narrowphase->getLocalSpaceAabb(collidableIndex);
		b3Vector3 localAabbMin = b3MakeVector3(localAabb.m_min[0], localAabb.m_min[1], localAabb.m_min[2]);
		b3Vector3 localAabbMax = b3MakeVector3(localAabb.m_max[0], localAabb.m_max[1], localAabb.m_max[2]);

		// transform the local AABB into world space, padded by a small margin
		b3Scalar margin = 0.01f;
		b3Transform t;
		t.setIdentity();
		t.setOrigin(b3MakeVector3(position[0], position[1], position[2]));
		t.setRotation(b3Quaternion(orientation[0], orientation[1], orientation[2], orientation[3]));
		b3TransformAabb(localAabbMin, localAabbMax, margin, t, aabbMin, aabbMax);
	}
	else
	{
		b3Error("registerPhysicsInstance using invalid collidableIndex\n");
		return -1;
	}

	// body data is uploaded later (writeAllInstancesToGpu), not here
	bool writeToGpu = false;
	// (the original also pre-read getNumRigidBodies() into bodyIndex — a dead
	// store immediately overwritten by registerRigidBody; removed)
	int bodyIndex = m_data->m_narrowphase->registerRigidBody(collidableIndex, mass, position, orientation, &aabbMin.getX(), &aabbMax.getX(), writeToGpu);

	if (bodyIndex >= 0)
	{
		if (gUseDbvt)
		{
			m_data->m_broadphaseDbvt->createProxy(aabbMin, aabbMax, bodyIndex, 0, 1, 1);
			b3SapAabb aabb;
			for (int i = 0; i < 3; i++)
			{
				aabb.m_min[i] = aabbMin[i];
				aabb.m_max[i] = aabbMax[i];
			}
			// body index rides in the 4th min component (hoisted: the original
			// re-assigned this loop-invariant value on every iteration)
			aabb.m_minIndices[3] = bodyIndex;
			m_data->m_allAabbsCPU.push_back(aabb);
			if (writeInstanceToGpu)
			{
				m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU);
			}
		}
		else
		{
			if (mass)
			{
				m_data->m_broadphaseSap->createProxy(aabbMin, aabbMax, bodyIndex, 1, 1);  //m_dispatcher);
			}
			else
			{
				m_data->m_broadphaseSap->createLargeProxy(aabbMin, aabbMax, bodyIndex, 1, 1);  //m_dispatcher);
			}
		}
	}

	return bodyIndex;
}
// Casts a batch of rays against the registered bodies using the GPU
// raycaster; results are written into hitResults.
void b3GpuRigidBodyPipeline::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults)
{
this->m_data->m_raycaster->castRays(rays, hitResults,
getNumBodies(), this->m_data->m_narrowphase->getBodiesCpu(),
m_data->m_narrowphase->getNumCollidablesGpu(), m_data->m_narrowphase->getCollidablesCpu(),
m_data->m_narrowphase->getInternalData(), m_data->m_broadphaseSap);
}

View file

@ -1,70 +0,0 @@
/*
Copyright (c) 2013 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef B3_GPU_RIGIDBODY_PIPELINE_H
#define B3_GPU_RIGIDBODY_PIPELINE_H
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h"
// Top-level GPU rigid-body simulation pipeline: owns the solvers, raycaster
// and GPU arrays; borrows the narrowphase and broadphases passed to the
// constructor. Drive it with stepSimulation().
class b3GpuRigidBodyPipeline
{
protected:
// opaque internal state (defined in b3GpuRigidBodyPipelineInternalData.h)
struct b3GpuRigidBodyPipelineInternalData* m_data;
int allocateCollidable();
public:
b3GpuRigidBodyPipeline(cl_context ctx, cl_device_id device, cl_command_queue q, class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap, struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config);
virtual ~b3GpuRigidBodyPipeline();
// advance the simulation: AABBs, broadphase, narrowphase, solve, integrate
void stepSimulation(float deltaTime);
void integrate(float timeStep);
void setupGpuAabbsFull();
int registerConvexPolyhedron(class b3ConvexUtility* convex);
//int registerConvexPolyhedron(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
//int registerSphereShape(float radius);
//int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant);
//int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, const float* scaling);
//int registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes);
int registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, int userData, bool writeInstanceToGpu);
//if you passed "writeInstanceToGpu" false in the registerPhysicsInstance method (for performance) you need to call writeAllInstancesToGpu after all instances are registered
void writeAllInstancesToGpu();
void copyConstraintsToHost();
void setGravity(const float* grav);
void reset();
// GPU generic constraints; both return the new constraint's uid
int createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, float breakingThreshold);
int createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB, float breakingThreshold);
void removeConstraintByUid(int uid);
// Bullet2-style typed constraints, solved on the CPU
void addConstraint(class b3TypedConstraint* constraint);
void removeConstraint(b3TypedConstraint* constraint);
void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults);
cl_mem getBodyBuffer();
int getNumBodies() const;
};
#endif //B3_GPU_RIGIDBODY_PIPELINE_H

View file

@ -1,68 +0,0 @@
/*
Copyright (c) 2013 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H
#define B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
#include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
#include "Bullet3Collision/BroadPhaseCollision/b3OverlappingPair.h"
#include "Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h"
// Internal state bag for b3GpuRigidBodyPipeline (pimpl). Owned objects are
// created/destroyed by the pipeline's ctor/dtor; the narrowphase and the two
// broadphases are borrowed.
struct b3GpuRigidBodyPipelineInternalData
{
// OpenCL execution context
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
// compiled kernels (released in the pipeline destructor)
cl_kernel m_integrateTransformsKernel;
cl_kernel m_updateAabbsKernel;
cl_kernel m_clearOverlappingPairsKernel;
// solvers: CPU joints, GPU joints, GPU PGS contacts, GPU Jacobi contacts
class b3PgsJacobiSolver* m_solver;
class b3GpuPgsConstraintSolver* m_gpuSolver;
class b3GpuPgsContactSolver* m_solver2;
class b3GpuJacobiContactSolver* m_solver3;
class b3GpuRaycast* m_raycaster;
// broadphases (borrowed): GPU SAP and CPU dynamic BVH
class b3GpuBroadphaseInterface* m_broadphaseSap;
struct b3DynamicBvhBroadphase* m_broadphaseDbvt;
// world-space AABBs, device array + host mirror
b3OpenCLArray<b3SapAabb>* m_allAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
b3OpenCLArray<b3BroadphasePair>* m_overlappingPairsGPU;
// GPU generic constraints, device array + host mirror
b3OpenCLArray<b3GpuGenericConstraint>* m_gpuConstraints;
b3AlignedObjectArray<b3GpuGenericConstraint> m_cpuConstraints;
// Bullet2-style typed constraints (borrowed pointers)
b3AlignedObjectArray<b3TypedConstraint*> m_joints;
// monotonically increasing uid source for generic constraints
int m_constraintUid;
class b3GpuNarrowPhase* m_narrowphase;
b3Vector3 m_gravity;
b3Config m_config;
};
#endif //B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H

View file

@ -1,210 +0,0 @@
/*
Copyright (c) 2013 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef B3_GPU_SOLVER_BODY_H
#define B3_GPU_SOLVER_BODY_H
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Common/b3Matrix3x3.h"
#include "Bullet3Common/b3AlignedAllocator.h"
#include "Bullet3Common/b3TransformUtil.h"
///Until we get other contributions, only use SIMD on Windows, when using Visual Studio 2008 or later, and not double precision
#ifdef B3_USE_SSE
#define USE_SIMD 1
#endif //
///The b3GpuSolverBody is an internal datastructure for the constraint solver. Only necessary data is packed to increase cache coherence/performance.
///Velocity corrections are accumulated in m_deltaLinearVelocity/m_deltaAngularVelocity and folded into the
///body velocities by writebackVelocity()/writebackVelocityAndTransform().
B3_ATTRIBUTE_ALIGNED16(struct)
b3GpuSolverBody
{
	B3_DECLARE_ALIGNED_ALLOCATOR();
	//	b3Transform m_worldTransformUnused;

	b3Vector3 m_deltaLinearVelocity;   //accumulated linear velocity correction, applied at writeback
	b3Vector3 m_deltaAngularVelocity;  //accumulated angular velocity correction, applied at writeback
	b3Vector3 m_angularFactor;         //per-axis scale applied to angular impulses (0 locks an axis)
	b3Vector3 m_linearFactor;          //per-axis scale applied to linear impulses
	b3Vector3 m_invMass;
	b3Vector3 m_pushVelocity;  //split-impulse linear velocity used for penetration recovery
	b3Vector3 m_turnVelocity;  //split-impulse angular velocity used for penetration recovery
	b3Vector3 m_linearVelocity;
	b3Vector3 m_angularVelocity;

	union {
		void* m_originalBody;  //null/0 marks a static body: the non-internal queries below return zero velocity
		int m_originalBodyIndex;
	};
	int padding[3];  //keep sizeof(b3GpuSolverBody) a 16-byte multiple for the aligned allocator

	/*
	void setWorldTransform(const b3Transform& worldTransform)
	{
		m_worldTransform = worldTransform;
	}

	const b3Transform& getWorldTransform() const
	{
		return m_worldTransform;
	}
	*/

	///Velocity of the point at rel_pos (relative to the center of mass), including the pending solver deltas.
	///Returns zero for static bodies (m_originalBody == 0).
	B3_FORCE_INLINE void getVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity) const
	{
		if (m_originalBody)
			velocity = m_linearVelocity + m_deltaLinearVelocity + (m_angularVelocity + m_deltaAngularVelocity).cross(rel_pos);
		else
			velocity.setValue(0, 0, 0);
	}

	///Angular velocity including the pending solver delta; zero for static bodies.
	B3_FORCE_INLINE void getAngularVelocity(b3Vector3 & angVel) const
	{
		if (m_originalBody)
			angVel = m_angularVelocity + m_deltaAngularVelocity;
		else
			angVel.setValue(0, 0, 0);
	}

	//Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position
	///No-op for static bodies.
	B3_FORCE_INLINE void applyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, const b3Scalar impulseMagnitude)
	{
		if (m_originalBody)
		{
			m_deltaLinearVelocity += linearComponent * impulseMagnitude * m_linearFactor;
			m_deltaAngularVelocity += angularComponent * (impulseMagnitude * m_angularFactor);
		}
	}

	///Accumulate a split (position-recovery) impulse into the push/turn velocities. No-op for static bodies.
	B3_FORCE_INLINE void internalApplyPushImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, b3Scalar impulseMagnitude)
	{
		if (m_originalBody)
		{
			m_pushVelocity += linearComponent * impulseMagnitude * m_linearFactor;
			m_turnVelocity += angularComponent * (impulseMagnitude * m_angularFactor);
		}
	}

	const b3Vector3& getDeltaLinearVelocity() const
	{
		return m_deltaLinearVelocity;
	}

	const b3Vector3& getDeltaAngularVelocity() const
	{
		return m_deltaAngularVelocity;
	}

	const b3Vector3& getPushVelocity() const
	{
		return m_pushVelocity;
	}

	const b3Vector3& getTurnVelocity() const
	{
		return m_turnVelocity;
	}

	////////////////////////////////////////////////
	///some internal methods, don't use them
	///Unlike the public accessors above, these skip the static-body (m_originalBody) check.

	b3Vector3& internalGetDeltaLinearVelocity()
	{
		return m_deltaLinearVelocity;
	}

	b3Vector3& internalGetDeltaAngularVelocity()
	{
		return m_deltaAngularVelocity;
	}

	const b3Vector3& internalGetAngularFactor() const
	{
		return m_angularFactor;
	}

	const b3Vector3& internalGetInvMass() const
	{
		return m_invMass;
	}

	void internalSetInvMass(const b3Vector3& invMass)
	{
		m_invMass = invMass;
	}

	b3Vector3& internalGetPushVelocity()
	{
		return m_pushVelocity;
	}

	b3Vector3& internalGetTurnVelocity()
	{
		return m_turnVelocity;
	}

	B3_FORCE_INLINE void internalGetVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity) const
	{
		velocity = m_linearVelocity + m_deltaLinearVelocity + (m_angularVelocity + m_deltaAngularVelocity).cross(rel_pos);
	}

	B3_FORCE_INLINE void internalGetAngularVelocity(b3Vector3 & angVel) const
	{
		angVel = m_angularVelocity + m_deltaAngularVelocity;
	}

	//Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position
	B3_FORCE_INLINE void internalApplyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, const b3Scalar impulseMagnitude)
	{
		//if (m_originalBody)
		{
			m_deltaLinearVelocity += linearComponent * impulseMagnitude * m_linearFactor;
			m_deltaAngularVelocity += angularComponent * (impulseMagnitude * m_angularFactor);
		}
	}

	///Fold the accumulated deltas into the body velocities (no static-body check).
	void writebackVelocity()
	{
		//if (m_originalBody>=0)
		{
			m_linearVelocity += m_deltaLinearVelocity;
			m_angularVelocity += m_deltaAngularVelocity;

			//m_originalBody->setCompanionId(-1);
		}
	}

	///Fold the accumulated deltas into the body velocities and (optionally) correct the
	///position/orientation using the split-impulse push/turn velocities.
	///NOTE: the transform integration is currently disabled (commented out), so timeStep and
	///splitImpulseTurnErp are unused for now.
	void writebackVelocityAndTransform(b3Scalar timeStep, b3Scalar splitImpulseTurnErp)
	{
		(void)timeStep;
		(void)splitImpulseTurnErp;  //only referenced by the disabled integration below

		if (m_originalBody)
		{
			m_linearVelocity += m_deltaLinearVelocity;
			m_angularVelocity += m_deltaAngularVelocity;

			//correct the position/orientation based on push/turn recovery
			if (m_pushVelocity[0] != 0.f || m_pushVelocity[1] != 0.f || m_pushVelocity[2] != 0.f || m_turnVelocity[0] != 0.f || m_turnVelocity[1] != 0.f || m_turnVelocity[2] != 0.f)
			{
				//	b3Quaternion orn = m_worldTransform.getRotation();
				//	b3Transform newTransform;
				//	b3TransformUtil::integrateTransform(m_worldTransform,m_pushVelocity,m_turnVelocity*splitImpulseTurnErp,timeStep,newTransform);
				//	m_worldTransform = newTransform;
			}
			//m_worldTransform.setRotation(orn);
			//m_originalBody->setCompanionId(-1);
		}
	}
};
#endif  //B3_GPU_SOLVER_BODY_H

Some files were not shown because too many files have changed in this diff Show more