Revert "Updated SDL, Bullet and OpenAL soft libs"

This reverts commit 370161cfb1.
This commit is contained in:
AzaezelX 2019-07-08 09:49:44 -05:00
parent 63be684474
commit bc77ff0833
1102 changed files with 62741 additions and 204988 deletions

View file

@ -1,42 +0,0 @@
#ifndef B3_GPU_BROADPHASE_INTERFACE_H
#define B3_GPU_BROADPHASE_INTERFACE_H
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "Bullet3Common/b3Vector3.h"
#include "b3SapAabb.h"
#include "Bullet3Common/shared/b3Int2.h"
#include "Bullet3Common/shared/b3Int4.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
//Abstract interface for OpenCL GPU broadphase implementations.
//Expected usage: add proxies with createProxy()/createLargeProxy() on the host,
//upload once via writeAabbsToGpu(), then call calculateOverlappingPairs() each step
//and read the results through getOverlappingPairBuffer()/getNumOverlap().
class b3GpuBroadphaseInterface
{
public:
//Factory signature: concrete implementations expose a static function of this
//type so callers can pick a broadphase at runtime (see b3GpuGridBroadphase::CreateFunc).
typedef class b3GpuBroadphaseInterface*(CreateFunc)(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~b3GpuBroadphaseInterface()
{
}
//Register a small AABB. userPtr is an int user value stored with the AABB.
//NOTE(review): collisionFilterGroup/Mask appear in the signature but concrete
//implementations visible here do not store them — confirm filtering happens elsewhere.
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0;
//Register a large AABB; large AABBs are typically tested brute-force against all small ones.
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0;
//GPU pair search; writes at most maxPairs pairs into the overlapping-pair buffer.
virtual void calculateOverlappingPairs(int maxPairs) = 0;
//CPU reference pair search with the same result buffer contract.
virtual void calculateOverlappingPairsHost(int maxPairs) = 0;
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu() = 0;
//World-space AABB buffer (device memory).
virtual cl_mem getAabbBufferWS() = 0;
//Number of pairs produced by the last pair search.
virtual int getNumOverlap() = 0;
//Device buffer of b3Int4 pairs (x,y are the unsorted AABB user indices).
virtual cl_mem getOverlappingPairBuffer() = 0;
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() = 0;
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() = 0;
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() = 0;
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() = 0;
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() = 0;
};
#endif //B3_GPU_BROADPHASE_INTERFACE_H

View file

@ -1,338 +0,0 @@
#include "b3GpuGridBroadphase.h"
#include "Bullet3Geometry/b3AabbUtil.h"
#include "kernels/gridBroadphaseKernels.h"
#include "kernels/sapKernels.h"
//#include "kernels/gridBroadphase.cl"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
#define B3_GRID_BROADPHASE_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl"
//File-scope OpenCL kernel handles, compiled in the b3GpuGridBroadphase constructor
//and released in its destructor.
//NOTE(review): these are globals rather than members — creating two broadphase
//instances would overwrite them and double-release on destruction; confirm
//single-instance usage.
cl_kernel kCalcHashAABB;
cl_kernel kClearCellStart;
cl_kernel kFindCellStart;
cl_kernel kFindOverlappingPairs;
cl_kernel m_copyAabbsKernel;
cl_kernel m_sap2Kernel;
//int maxPairsPerBody = 64;
int maxBodiesPerCell = 256;  //cap on bodies hashed into one grid cell (stored in m_gridSize[3]); tuning value
//Constructor: initializes all device arrays on the given context/queue,
//fills in the grid parameters, compiles the SAP and uniform-grid kernels,
//and creates the radix sorter used to sort cell hashes.
b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q)
: m_context(ctx),
m_device(device),
m_queue(q),
m_allAabbsGPU1(ctx, q),
m_smallAabbsMappingGPU(ctx, q),
m_largeAabbsMappingGPU(ctx, q),
m_gpuPairs(ctx, q),
m_hashGpu(ctx, q),
m_cellStartGpu(ctx, q),
m_paramsGPU(ctx, q)
{
//Grid cell extent is 3 world units per axis (invCellSize = 1/3), while the
//hash grid resolution is hard-coded to 128^3 cells below.
b3Vector3 gridSize = b3MakeVector3(3, 3, 3);
b3Vector3 invGridSize = b3MakeVector3(1.f / gridSize[0], 1.f / gridSize[1], 1.f / gridSize[2]);
m_paramsCPU.m_gridSize[0] = 128;
m_paramsCPU.m_gridSize[1] = 128;
m_paramsCPU.m_gridSize[2] = 128;
//m_gridSize[3] doubles as the max-bodies-per-cell slot (see b3ParamsGridBroadphaseCL);
//the next line overwrites this same slot via the setter.
m_paramsCPU.m_gridSize[3] = maxBodiesPerCell;
m_paramsCPU.setMaxBodiesPerCell(maxBodiesPerCell);
m_paramsCPU.m_invCellSize[0] = invGridSize[0];
m_paramsCPU.m_invCellSize[1] = invGridSize[1];
m_paramsCPU.m_invCellSize[2] = invGridSize[2];
m_paramsCPU.m_invCellSize[3] = 0.f;
//Single-element GPU array mirroring m_paramsCPU, passed to the kernels.
m_paramsGPU.push_back(m_paramsCPU);
cl_int errNum = 0;
{
//Compile the SAP kernels (used for the large-vs-small AABB pass).
const char* sapSrc = sapCL;
cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH);
b3Assert(errNum == CL_SUCCESS);
//Note: errNum from the copyAabbsKernel compile is only checked after the next call.
m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg);
m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg);
b3Assert(errNum == CL_SUCCESS);
}
{
//Compile the uniform-grid kernels (hashing, cell bookkeeping, overlap test).
cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, gridBroadphaseCL, &errNum, "", B3_GRID_BROADPHASE_PATH);
b3Assert(errNum == CL_SUCCESS);
kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kCalcHashAABB", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kClearCellStart", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindCellStart", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindOverlappingPairs", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
}
//Radix sorter for sorting (cellHash, aabbIndex) pairs; freed in the destructor.
m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue);
}
//Destructor: releases the (file-scope) OpenCL kernel handles created in the
//constructor and deletes the radix sorter. The cl_programs themselves are not
//released here.
b3GpuGridBroadphase::~b3GpuGridBroadphase()
{
clReleaseKernel(kCalcHashAABB);
clReleaseKernel(kClearCellStart);
clReleaseKernel(kFindCellStart);
clReleaseKernel(kFindOverlappingPairs);
clReleaseKernel(m_sap2Kernel);
clReleaseKernel(m_copyAabbsKernel);
delete m_sorter;
}
void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr;
m_smallAabbsMappingCPU.push_back(m_allAabbsCPU1.size());
m_allAabbsCPU1.push_back(aabb);
}
void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr;
m_largeAabbsMappingCPU.push_back(m_allAabbsCPU1.size());
m_allAabbsCPU1.push_back(aabb);
}
//GPU pair search. Two passes:
// 1) large-vs-small AABBs via a brute-force 2D SAP kernel;
// 2) small-vs-small AABBs via a uniform grid (hash -> radix sort -> cell start -> overlap test).
//Both passes append into m_gpuPairs through a shared atomic pair counter, capped at maxPairs.
void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
{
B3_PROFILE("b3GpuGridBroadphase::calculateOverlappingPairs");
//Disabled debug path: fall back to the CPU reference implementation.
if (0)
{
calculateOverlappingPairsHost(maxPairs);
/*
b3AlignedObjectArray<b3Int4> cpuPairs;
m_gpuPairs.copyToHost(cpuPairs);
printf("host m_gpuPairs.size()=%d\n",m_gpuPairs.size());
for (int i=0;i<m_gpuPairs.size();i++)
{
printf("host pair %d = %d,%d\n",i,cpuPairs[i].x,cpuPairs[i].y);
}
*/
return;
}
int numSmallAabbs = m_smallAabbsMappingGPU.size();
//Single-element device counter, initialized to zero; both kernels increment it.
b3OpenCLArray<int> pairCount(m_context, m_queue);
pairCount.push_back(0);
m_gpuPairs.resize(maxPairs);  //numSmallAabbs*maxPairsPerBody);
{
//Pass 1: every large AABB against every small AABB.
int numLargeAabbs = m_largeAabbsMappingGPU.size();
if (numLargeAabbs && numSmallAabbs)
{
B3_PROFILE("sap2Kernel");
b3BufferInfoCL bInfo[] = {
b3BufferInfoCL(m_allAabbsGPU1.getBufferCL()),
b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()),
b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()),
b3BufferInfoCL(m_gpuPairs.getBufferCL()),
b3BufferInfoCL(pairCount.getBufferCL())};
b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLargeAabbs);
launcher.setConst(numSmallAabbs);
launcher.setConst(0);  //axis is not used
launcher.setConst(maxPairs);
//@todo: use actual maximum work item sizes of the device instead of hardcoded values
launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64);
//Read back the counter to detect overflow (blocking read).
int numPairs = pairCount.at(0);
if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs = maxPairs;
}
}
}
//Pass 2: small-vs-small via the uniform grid.
//NOTE(review): if numSmallAabbs == 0 this branch is skipped and m_gpuPairs keeps
//its maxPairs size from the resize above — getNumOverlap() would then over-report;
//confirm callers never hit that combination.
if (numSmallAabbs)
{
B3_PROFILE("gridKernel");
m_hashGpu.resize(numSmallAabbs);
{
//Compute a (cellHash, aabbIndex) sort key per small AABB.
B3_PROFILE("kCalcHashAABB");
b3LauncherCL launch(m_queue, kCalcHashAABB, "kCalcHashAABB");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_allAabbsGPU1.getBufferCL());
launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
launch.setBuffer(m_hashGpu.getBufferCL());
launch.setBuffer(this->m_paramsGPU.getBufferCL());
launch.launch1D(numSmallAabbs);
}
//Sort by cell hash so AABBs in the same cell become contiguous.
m_sorter->execute(m_hashGpu);
int numCells = this->m_paramsCPU.m_gridSize[0] * this->m_paramsCPU.m_gridSize[1] * this->m_paramsCPU.m_gridSize[2];
m_cellStartGpu.resize(numCells);
//b3AlignedObjectArray<int > cellStartCpu;
{
//Reset the per-cell start indices before rebuilding them.
B3_PROFILE("kClearCellStart");
b3LauncherCL launch(m_queue, kClearCellStart, "kClearCellStart");
launch.setConst(numCells);
launch.setBuffer(m_cellStartGpu.getBufferCL());
launch.launch1D(numCells);
//m_cellStartGpu.copyToHost(cellStartCpu);
//printf("??\n");
}
{
//Record, for each cell, where its run of sorted hashes begins.
B3_PROFILE("kFindCellStart");
b3LauncherCL launch(m_queue, kFindCellStart, "kFindCellStart");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_hashGpu.getBufferCL());
launch.setBuffer(m_cellStartGpu.getBufferCL());
launch.launch1D(numSmallAabbs);
//m_cellStartGpu.copyToHost(cellStartCpu);
//printf("??\n");
}
{
//Overlap test within/between neighboring cells; appends to the shared pair buffer.
B3_PROFILE("kFindOverlappingPairs");
b3LauncherCL launch(m_queue, kFindOverlappingPairs, "kFindOverlappingPairs");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_allAabbsGPU1.getBufferCL());
launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
launch.setBuffer(m_hashGpu.getBufferCL());
launch.setBuffer(m_cellStartGpu.getBufferCL());
launch.setBuffer(m_paramsGPU.getBufferCL());
//launch.setBuffer(0);
launch.setBuffer(pairCount.getBufferCL());
launch.setBuffer(m_gpuPairs.getBufferCL());
launch.setConst(maxPairs);
launch.launch1D(numSmallAabbs);
//Clamp to maxPairs on overflow, then shrink the pair buffer to the final count.
int numPairs = pairCount.at(0);
if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs = maxPairs;
}
m_gpuPairs.resize(numPairs);
//Disabled debug dump of the resulting pairs.
if (0)
{
b3AlignedObjectArray<b3Int4> pairsCpu;
m_gpuPairs.copyToHost(pairsCpu);
int sz = m_gpuPairs.size();
printf("m_gpuPairs.size()=%d\n", sz);
for (int i = 0; i < m_gpuPairs.size(); i++)
{
printf("pair %d = %d,%d\n", i, pairsCpu[i].x, pairsCpu[i].y);
}
printf("?!?\n");
}
}
}
//calculateOverlappingPairsHost(maxPairs);
}
//CPU reference pair search: brute-force O(n^2) test of every AABB against every
//other (small and large alike), writing at most maxPairs pairs, then uploading
//the result to m_gpuPairs.
//Each pair stores the two original (unsorted) user indices with pair.x <= pair.y.
//Fix: the original kept scanning all remaining AABB pairs after the buffer was
//full, even though nothing more could be recorded; we now break out early.
//The recorded pairs are identical either way (same scan order, same cap).
void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
{
	m_hostPairs.resize(0);
	//Pull the current world-space AABBs back from the device.
	m_allAabbsGPU1.copyToHost(m_allAabbsCPU1);
	const int numAabbs = m_allAabbsCPU1.size();
	for (int i = 0; i < numAabbs; i++)
	{
		if (m_hostPairs.size() >= maxPairs)
			break;  //pair buffer full; no further test can record anything
		for (int j = i + 1; j < numAabbs; j++)
		{
			if (m_hostPairs.size() >= maxPairs)
				break;
			if (b3TestAabbAgainstAabb2(m_allAabbsCPU1[i].m_minVec, m_allAabbsCPU1[i].m_maxVec,
									   m_allAabbsCPU1[j].m_minVec, m_allAabbsCPU1[j].m_maxVec))
			{
				b3Int4 pair;
				int a = m_allAabbsCPU1[j].m_minIndices[3];
				int b = m_allAabbsCPU1[i].m_minIndices[3];
				if (a <= b)
				{
					pair.x = a;
					pair.y = b;  //store the original index in the unsorted aabb array
				}
				else
				{
					pair.x = b;
					pair.y = a;  //store the original index in the unsorted aabb array
				}
				m_hostPairs.push_back(pair);
			}
		}
	}
	m_gpuPairs.copyFromHost(m_hostPairs);
}
//call writeAabbsToGpu after done making all changes (createProxy etc)
//Uploads the host-side AABB array and both small/large index mappings to the device.
void b3GpuGridBroadphase::writeAabbsToGpu()
{
m_allAabbsGPU1.copyFromHost(m_allAabbsCPU1);
m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU);
m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU);
}
//Device buffer holding all world-space AABBs (small and large).
cl_mem b3GpuGridBroadphase::getAabbBufferWS()
{
return this->m_allAabbsGPU1.getBufferCL();
}
//Number of pairs found by the last pair search (size of the GPU pair array).
int b3GpuGridBroadphase::getNumOverlap()
{
return m_gpuPairs.size();
}
//Device buffer of b3Int4 overlapping pairs from the last pair search.
cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer()
{
return m_gpuPairs.getBufferCL();
}
//Device-side AABB array (valid after writeAabbsToGpu()).
b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU()
{
return m_allAabbsGPU1;
}
//Host-side AABB array (the source of truth until writeAabbsToGpu()).
b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU()
{
return m_allAabbsCPU1;
}
//Typed accessor for the device pair array (same storage as getOverlappingPairBuffer()).
b3OpenCLArray<b3Int4>& b3GpuGridBroadphase::getOverlappingPairsGPU()
{
return m_gpuPairs;
}
//Device array mapping small-proxy slots to indices into the full AABB array.
b3OpenCLArray<int>& b3GpuGridBroadphase::getSmallAabbIndicesGPU()
{
return m_smallAabbsMappingGPU;
}
//Device array mapping large-proxy slots to indices into the full AABB array.
b3OpenCLArray<int>& b3GpuGridBroadphase::getLargeAabbIndicesGPU()
{
return m_largeAabbsMappingGPU;
}

View file

@ -1,80 +0,0 @@
#ifndef B3_GPU_GRID_BROADPHASE_H
#define B3_GPU_GRID_BROADPHASE_H
#include "b3GpuBroadphaseInterface.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
//Uniform-grid parameters uploaded to the device as-is.
//NOTE(review): this struct is presumably mirrored by a struct in gridBroadphase.cl —
//keep the field order/layout in sync with the kernel source if either changes.
struct b3ParamsGridBroadphaseCL
{
//Reciprocal of the world-space cell extent per axis; [3] is unused padding.
float m_invCellSize[4];
//Number of grid cells per axis in [0..2]; [3] is overloaded to hold maxBodiesPerCell.
int m_gridSize[4];
int getMaxBodiesPerCell() const
{
return m_gridSize[3];
}
void setMaxBodiesPerCell(int maxOverlap)
{
m_gridSize[3] = maxOverlap;
}
};
//Uniform-grid GPU broadphase: small AABBs are hashed into a fixed-resolution grid
//and tested cell-by-cell; large AABBs are tested brute-force against all small ones.
class b3GpuGridBroadphase : public b3GpuBroadphaseInterface
{
protected:
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
//All AABBs (small and large) on device/host; kept in sync by writeAabbsToGpu().
b3OpenCLArray<b3SapAabb> m_allAabbsGPU1;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1;
//Index mappings: proxy slot -> index into the full AABB arrays above.
b3OpenCLArray<int> m_smallAabbsMappingGPU;
b3AlignedObjectArray<int> m_smallAabbsMappingCPU;
b3OpenCLArray<int> m_largeAabbsMappingGPU;
b3AlignedObjectArray<int> m_largeAabbsMappingCPU;
//Pair results: host scratch (reference path) and device result buffer.
b3AlignedObjectArray<b3Int4> m_hostPairs;
b3OpenCLArray<b3Int4> m_gpuPairs;
//Per-AABB (cellHash, index) sort keys and per-cell start offsets.
b3OpenCLArray<b3SortData> m_hashGpu;
b3OpenCLArray<int> m_cellStartGpu;
//Grid parameters, mirrored on the device as a single-element array.
b3ParamsGridBroadphaseCL m_paramsCPU;
b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU;
class b3RadixSort32CL* m_sorter;
public:
b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~b3GpuGridBroadphase();
//Factory matching b3GpuBroadphaseInterface::CreateFunc.
static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuGridBroadphase(ctx, device, q);
}
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu();
virtual cl_mem getAabbBufferWS();
virtual int getNumOverlap();
virtual cl_mem getOverlappingPairBuffer();
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU();
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU();
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
};
#endif //B3_GPU_GRID_BROADPHASE_H

View file

@ -1,557 +0,0 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "b3GpuParallelLinearBvh.h"
//Constructor: allocates the fixed single-element device scratch arrays and
//compiles every kernel used by the parallel linear BVH (build, pair query, ray query).
//Each compile is followed by a b3Assert on the returned handle.
b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : m_queue(queue),
m_radixSorter(context, device, queue),
m_rootNodeIndex(context, queue),
m_maxDistanceFromRoot(context, queue),
m_temp(context, queue),
m_internalNodeAabbs(context, queue),
m_internalNodeLeafIndexRanges(context, queue),
m_internalNodeChildNodes(context, queue),
m_internalNodeParentNodes(context, queue),
m_commonPrefixes(context, queue),
m_commonPrefixLengths(context, queue),
m_distanceFromRoot(context, queue),
m_leafNodeParentNodes(context, queue),
m_mortonCodesAndAabbIndicies(context, queue),
m_mergedAabb(context, queue),
m_leafNodeAabbs(context, queue),
m_largeAabbs(context, queue)
{
//Single-element device scalars: root index, tree depth, and a generic int temp.
m_rootNodeIndex.resize(1);
m_maxDistanceFromRoot.resize(1);
m_temp.resize(1);
//
const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl";
const char* kernelSource = parallelLinearBvhCL;  //parallelLinearBvhCL.h
cl_int error;
char* additionalMacros = 0;
m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH);
b3Assert(m_parallelLinearBvhProgram);
m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_separateAabbsKernel);
m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findAllNodesMergedAabbKernel);
m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_assignMortonCodesAndAabbIndiciesKernel);
m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_computeAdjacentPairCommonPrefixKernel);
m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeLeafNodesKernel);
m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeInternalNodesKernel);
m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findDistanceFromRootKernel);
m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel);
m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findLeafIndexRangesKernel);
m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhCalculateOverlappingPairsKernel);
m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhRayTraverseKernel);
m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhLargeAabbAabbTestKernel);
m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhLargeAabbRayTestKernel);
}
//Destructor: releases every kernel compiled in the constructor and then the program.
b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
{
clReleaseKernel(m_separateAabbsKernel);
clReleaseKernel(m_findAllNodesMergedAabbKernel);
clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel);
clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel);
clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel);
clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel);
clReleaseKernel(m_findDistanceFromRootKernel);
clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel);
clReleaseKernel(m_findLeafIndexRangesKernel);
clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel);
clReleaseKernel(m_plbvhRayTraverseKernel);
clReleaseKernel(m_plbvhLargeAabbAabbTestKernel);
clReleaseKernel(m_plbvhLargeAabbRayTestKernel);
clReleaseProgram(m_parallelLinearBvhProgram);
}
//Build the BVH over the small AABBs on the GPU:
// 1) separate small/large AABBs into m_leafNodeAabbs / m_largeAabbs;
// 2) merge all small AABBs to get the scene bounds;
// 3) assign a Morton code per leaf and radix-sort leaves by it;
// 4) construct the binary radix tree (constructBinaryRadixTree());
// 5) compute each internal node's contiguous leaf-index range.
//Large AABBs are stored separately and never enter the tree.
void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
const b3OpenCLArray<int>& largeAabbIndices)
{
B3_PROFILE("b3ParallelLinearBvh::build()");
int numLargeAabbs = largeAabbIndices.size();
int numSmallAabbs = smallAabbIndices.size();
//Since all AABBs(both large and small) are input as a contiguous array,
//with 2 additional arrays used to indicate the indices of large and small AABBs,
//it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH.
{
B3_PROFILE("Separate large and small AABBs");
m_largeAabbs.resize(numLargeAabbs);
m_leafNodeAabbs.resize(numSmallAabbs);
//Write large AABBs into m_largeAabbs
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
b3BufferInfoCL(largeAabbIndices.getBufferCL()),
b3BufferInfoCL(m_largeAabbs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLargeAabbs);
launcher.launch1D(numLargeAabbs);
}
//Write small AABBs into m_leafNodeAabbs
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
b3BufferInfoCL(smallAabbIndices.getBufferCL()),
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numSmallAabbs);
launcher.launch1D(numSmallAabbs);
}
clFinish(m_queue);
}
//
int numLeaves = numSmallAabbs;  //Number of leaves in the BVH == Number of rigid bodies with small AABBs
int numInternalNodes = numLeaves - 1;
//Degenerate trees (0 or 1 leaf) need no internal nodes; set up the minimum
//state required by the query kernels and return early.
if (numLeaves < 2)
{
//Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(),
//so it does not matter if numLeaves == 0 and rootNodeIndex == -1
int rootNodeIndex = numLeaves - 1;
m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1);
//Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm,
//m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index
//instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work.
//( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs )
if (numLeaves == 1)
{
b3SortData leaf;
leaf.m_value = 0;  //1 leaf so index is always 0; leaf.m_key does not need to be set
m_mortonCodesAndAabbIndicies.resize(1);
m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1);
}
return;
}
//Size all per-node arrays for this tree (N leaves -> N-1 internal nodes).
{
m_internalNodeAabbs.resize(numInternalNodes);
m_internalNodeLeafIndexRanges.resize(numInternalNodes);
m_internalNodeChildNodes.resize(numInternalNodes);
m_internalNodeParentNodes.resize(numInternalNodes);
m_commonPrefixes.resize(numInternalNodes);
m_commonPrefixLengths.resize(numInternalNodes);
m_distanceFromRoot.resize(numInternalNodes);
m_leafNodeParentNodes.resize(numLeaves);
m_mortonCodesAndAabbIndicies.resize(numLeaves);
m_mergedAabb.resize(numLeaves);
}
//Find the merged AABB of all small AABBs; this is used to define the size of
//each cell in the virtual grid for the next kernel(2^10 cells in each dimension).
{
B3_PROFILE("Find AABB of merged nodes");
m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs);  //Need to make a copy since the kernel modifies the array
//Parallel pairwise reduction: each pass halves (rounding up) the number of
//AABBs still to merge until only the scene AABB remains in m_mergedAabb[0].
for (int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_mergedAabb.getBufferCL())  //Resulting AABB is stored in m_mergedAabb[0]
};
b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numAabbsNeedingMerge);
launcher.launch1D(numAabbsNeedingMerge);
}
clFinish(m_queue);
}
//Insert the center of the AABBs into a virtual grid,
//then convert the discrete grid coordinates into a morton code
//For each element in m_mortonCodesAndAabbIndicies, set
//	m_key == morton code (value to sort by)
//	m_value == small AABB index
{
B3_PROFILE("Assign morton codes");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_mergedAabb.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL())};
b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLeaves);
launcher.launch1D(numLeaves);
clFinish(m_queue);
}
//
{
B3_PROFILE("Sort leaves by morton codes");
m_radixSorter.execute(m_mortonCodesAndAabbIndicies);
clFinish(m_queue);
}
//
constructBinaryRadixTree();
//Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices.
//The root node contains leaf node indices in the range [0, numLeafNodes - 1].
//The child nodes of each node split their parent's index range into 2 contiguous halves.
//
//For example, if the root has indices [0, 31], its children might partition that range into [0, 11] and [12, 31].
//The next level in the tree could then split those ranges into [0, 2], [3, 11], [12, 22], and [23, 31].
//
//This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice
{
B3_PROFILE("m_findLeafIndexRangesKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL())};
b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
}
//Find all overlapping AABB pairs using the built BVH. out_overlappingPairs must be
//pre-sized to the desired pair capacity (its size() is read as maxPairs); on return
//it is resized to the number of pairs actually found (clamped to the capacity).
//Two passes share one device pair counter: BVH traversal for small-small pairs,
//then brute-force large-vs-small tests.
void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs)
{
int maxPairs = out_overlappingPairs.size();
//Reuse the single-element scratch array as the atomic pair counter, reset to 0.
b3OpenCLArray<int>& numPairsGpu = m_temp;
int reset = 0;
numPairsGpu.copyFromHostPointer(&reset, 1);
//
//Pass 1: small-small pairs via BVH traversal (needs at least 2 leaves).
if (m_leafNodeAabbs.size() > 1)
{
B3_PROFILE("PLBVH small-small AABB test");
int numQueryAabbs = m_leafNodeAabbs.size();
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
b3BufferInfoCL(numPairsGpu.getBufferCL()),
b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxPairs);
launcher.setConst(numQueryAabbs);
launcher.launch1D(numQueryAabbs);
clFinish(m_queue);
}
//Pass 2: every large AABB against every small AABB (large AABBs are not in the tree).
int numLargeAabbRigids = m_largeAabbs.size();
if (numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0)
{
B3_PROFILE("PLBVH large-small AABB test");
int numQueryAabbs = m_leafNodeAabbs.size();
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_largeAabbs.getBufferCL()),
b3BufferInfoCL(numPairsGpu.getBufferCL()),
b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxPairs);
launcher.setConst(numLargeAabbRigids);
launcher.setConst(numQueryAabbs);
launcher.launch1D(numQueryAabbs);
clFinish(m_queue);
}
//
//Read the final pair count; on overflow, clamp both the host value and the
//device counter, and report the error.
int numPairs = -1;
numPairsGpu.copyToHostPointer(&numPairs, 1);
if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs = maxPairs;
numPairsGpu.copyFromHostPointer(&maxPairs, 1);
}
out_overlappingPairs.resize(numPairs);
}
//Test a batch of rays against the BVH AABBs, producing (ray, rigid) index pairs.
//out_rayRigidPairs must be pre-sized to the desired capacity (its size() is the cap);
//out_numRayRigidPairs receives the device-side pair count.
//Like calculateOverlappingPairs(): BVH traversal for small AABBs, brute force for large.
//NOTE(review): unlike the pair query, the count is only clamped on the host via
//b3Error — out_rayRigidPairs is not resized here; confirm callers read the count.
void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)
{
B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()");
int numRays = rays.size();
int maxRayRigidPairs = out_rayRigidPairs.size();
//Zero the device pair counter before either kernel runs.
int reset = 0;
out_numRayRigidPairs.copyFromHostPointer(&reset, 1);
//
if (m_leafNodeAabbs.size() > 0)
{
B3_PROFILE("PLBVH ray test small AABB");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
b3BufferInfoCL(rays.getBufferCL()),
b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxRayRigidPairs);
launcher.setConst(numRays);
launcher.launch1D(numRays);
clFinish(m_queue);
}
//Large AABBs are outside the tree: test every ray against every large AABB.
int numLargeAabbRigids = m_largeAabbs.size();
if (numLargeAabbRigids > 0)
{
B3_PROFILE("PLBVH ray test large AABB");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_largeAabbs.getBufferCL()),
b3BufferInfoCL(rays.getBufferCL()),
b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLargeAabbRigids);
launcher.setConst(maxRayRigidPairs);
launcher.setConst(numRays);
launcher.launch1D(numRays);
clFinish(m_queue);
}
//
//Overflow detection only — the caller is expected to clamp when reading results.
int numRayRigidPairs = -1;
out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1);
if (numRayRigidPairs > maxRayRigidPairs)
b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs);
}
///Builds the binary radix tree from the sorted morton codes in m_mortonCodesAndAabbIndicies.
///Assumes the leaf node AABBs and sorted morton codes are already on the GPU; writes the
///tree topology (parent/child links), the root node index, and the internal node AABBs.
void b3GpuParallelLinearBvh::constructBinaryRadixTree()
{
	B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()");

	int numLeaves = m_leafNodeAabbs.size();
	int numInternalNodes = numLeaves - 1;  //A binary radix tree with N leaves always has exactly N - 1 internal nodes

	//Each internal node is placed in between 2 leaf nodes.
	//By using this arrangement and computing the common prefix between
	//these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree.
	{
		B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel");

		b3BufferInfoCL bufferInfo[] =
			{
				b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
				b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
				b3BufferInfoCL(m_commonPrefixLengths.getBufferCL())};

		b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel");
		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
		launcher.setConst(numInternalNodes);

		launcher.launch1D(numInternalNodes);
		clFinish(m_queue);
	}

	//For each leaf node, select its parent node by
	//comparing the 2 nearest internal nodes and assign child node indices
	{
		B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel");

		b3BufferInfoCL bufferInfo[] =
			{
				b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
				b3BufferInfoCL(m_leafNodeParentNodes.getBufferCL()),
				b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL())};

		b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel");
		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
		launcher.setConst(numLeaves);

		launcher.launch1D(numLeaves);
		clFinish(m_queue);
	}

	//For each internal node, perform 2 binary searches among the other internal nodes
	//to its left and right to find its potential parent nodes and assign child node indices
	{
		B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel");

		b3BufferInfoCL bufferInfo[] =
			{
				b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
				b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
				b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
				b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
				b3BufferInfoCL(m_rootNodeIndex.getBufferCL())};

		b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel");
		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
		launcher.setConst(numInternalNodes);

		launcher.launch1D(numInternalNodes);
		clFinish(m_queue);
	}

	//Find the number of nodes separating each internal node and the root node
	//so that the AABBs can be set using the next kernel.
	//Also determine the maximum number of nodes separating an internal node and the root node.
	{
		B3_PROFILE("m_findDistanceFromRootKernel");

		b3BufferInfoCL bufferInfo[] =
			{
				b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
				b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
				b3BufferInfoCL(m_maxDistanceFromRoot.getBufferCL()),
				b3BufferInfoCL(m_distanceFromRoot.getBufferCL())};

		b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel");
		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
		launcher.setConst(numInternalNodes);

		launcher.launch1D(numInternalNodes);
		clFinish(m_queue);
	}

	//Starting from the internal nodes nearest to the leaf nodes, recursively move up
	//the tree towards the root to set the AABBs of each internal node; each internal node
	//checks its children and merges their AABBs
	{
		B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel");

		int maxDistanceFromRoot = -1;
		{
			B3_PROFILE("copy maxDistanceFromRoot to CPU");
			//Blocking read back to the host; this value controls the number of launches below
			m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1);
			clFinish(m_queue);
		}

		//One launch per tree level, deepest level first, so that each internal node's
		//children are always processed before the node itself
		for (int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)
		{
			b3BufferInfoCL bufferInfo[] =
				{
					b3BufferInfoCL(m_distanceFromRoot.getBufferCL()),
					b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
					b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
					b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
					b3BufferInfoCL(m_internalNodeAabbs.getBufferCL())};

			b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel");
			launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
			launcher.setConst(maxDistanceFromRoot);
			launcher.setConst(distanceFromRoot);
			launcher.setConst(numInternalNodes);

			//It may seem inefficient to launch a thread for each internal node when a
			//much smaller number of nodes is actually processed, but this is actually
			//faster than determining the exact nodes that are ready to merge their child AABBs.
			launcher.launch1D(numInternalNodes);
		}

		clFinish(m_queue);
	}
}

View file

@ -1,125 +0,0 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
#ifndef B3_GPU_PARALLEL_LINEAR_BVH_H
#define B3_GPU_PARALLEL_LINEAR_BVH_H
//#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
#include "Bullet3Common/shared/b3Int2.h"
#include "Bullet3Common/shared/b3Int4.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
#include "Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h"
#define b3Int64 cl_long
///@brief GPU Parallel Linearized Bounding Volume Hierarchy (LBVH) that is reconstructed every frame
///@remarks
///See presentation in docs/b3GpuParallelLinearBvh.pdf for algorithm details.
///@par
///Related papers: \n
///"Fast BVH Construction on GPUs" [Lauterbach et al. 2009] \n
///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n
///@par
///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages:
/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid)
/// - [fully parallel] Sort morton codes
/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH)
/// - [somewhat parallel] Set internal node AABBs
///@par
///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages.
///The BVH implementation here shares many concepts with [Karras 2012], but a different method is used for constructing the tree.
///Instead of searching for the child nodes of each internal node, we search for the parent node of each node.
///Additionally, a non-atomic traversal that starts from the leaf nodes and moves towards the root node is used to set the AABBs.
class b3GpuParallelLinearBvh
{
	cl_command_queue m_queue;
	cl_program m_parallelLinearBvhProgram;

	//Kernels used by build()
	cl_kernel m_separateAabbsKernel;
	cl_kernel m_findAllNodesMergedAabbKernel;
	cl_kernel m_assignMortonCodesAndAabbIndiciesKernel;

	//Binary radix tree construction kernels
	cl_kernel m_computeAdjacentPairCommonPrefixKernel;
	cl_kernel m_buildBinaryRadixTreeLeafNodesKernel;
	cl_kernel m_buildBinaryRadixTreeInternalNodesKernel;
	cl_kernel m_findDistanceFromRootKernel;
	cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel;

	cl_kernel m_findLeafIndexRangesKernel;

	//Traversal kernels
	cl_kernel m_plbvhCalculateOverlappingPairsKernel;
	cl_kernel m_plbvhRayTraverseKernel;
	cl_kernel m_plbvhLargeAabbAabbTestKernel;
	cl_kernel m_plbvhLargeAabbRayTestKernel;

	b3RadixSort32CL m_radixSorter;  //Used to sort the morton codes during build()

	//1 element
	b3OpenCLArray<int> m_rootNodeIndex;        //Most significant bit(0x80000000) is set to indicate internal node
	b3OpenCLArray<int> m_maxDistanceFromRoot;  //Max number of internal nodes between an internal node and the root node
	b3OpenCLArray<int> m_temp;                 //Used to hold the number of pairs in calculateOverlappingPairs()

	//1 element per internal node (number_of_internal_nodes == number_of_leaves - 1)
	b3OpenCLArray<b3SapAabb> m_internalNodeAabbs;
	b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges;  //x == min leaf index, y == max leaf index
	b3OpenCLArray<b3Int2> m_internalNodeChildNodes;       //x == left child, y == right child; msb(0x80000000) is set to indicate internal node
	b3OpenCLArray<int> m_internalNodeParentNodes;         //For parent node index, msb(0x80000000) is not set since it is always internal

	//1 element per internal node; for binary radix tree construction
	b3OpenCLArray<b3Int64> m_commonPrefixes;
	b3OpenCLArray<int> m_commonPrefixLengths;
	b3OpenCLArray<int> m_distanceFromRoot;  //Number of internal nodes between this node and the root

	//1 element per leaf node (leaf nodes only include small AABBs)
	b3OpenCLArray<int> m_leafNodeParentNodes;                //For parent node index, msb(0x80000000) is not set since it is always internal
	b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies;  //m_key == morton code, m_value == aabb index in m_leafNodeAabbs
	b3OpenCLArray<b3SapAabb> m_mergedAabb;                   //m_mergedAabb[0] contains the merged AABB of all leaf nodes
	b3OpenCLArray<b3SapAabb> m_leafNodeAabbs;                //Contains only small AABBs

	//1 element per large AABB, which is not stored in the BVH
	b3OpenCLArray<b3SapAabb> m_largeAabbs;

public:
	b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue);
	virtual ~b3GpuParallelLinearBvh();

	///Must be called before any other function
	///@param worldSpaceAabbs All AABBs in world space; indexed by the following 2 parameters
	///@param smallAabbIndices Indices of AABBs that are stored inside the BVH
	///@param largeAabbIndices Indices of AABBs that are kept outside the BVH and tested separately
	void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
			   const b3OpenCLArray<int>& largeAabbIndices);

	///calculateOverlappingPairs() uses the worldSpaceAabbs parameter of b3GpuParallelLinearBvh::build() as the query AABBs.
	///@param out_overlappingPairs The size() of this array is used to determine the max number of pairs.
	///If the number of overlapping pairs is < out_overlappingPairs.size(), out_overlappingPairs is resized.
	void calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs);

	///@param rays Rays to test against all rigid body AABBs
	///@param out_numRayRigidPairs Array of length 1; contains the number of detected ray-rigid AABB intersections;
	///this value may be greater than out_rayRigidPairs.size() if out_rayRigidPairs is not large enough.
	///@param out_rayRigidPairs Contains an array of rays intersecting rigid AABBs; x == ray index, y == rigid body index.
	///If the size of this array is insufficient to hold all ray-rigid AABB intersections, additional intersections are discarded.
	void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
								 b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs);

private:
	///Assigns parent/child links and internal node AABBs from the sorted morton codes
	void constructBinaryRadixTree();
};
#endif

View file

@ -1,76 +0,0 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
#include "b3GpuParallelLinearBvhBroadphase.h"
///Creates the LBVH and all GPU-side arrays on the given OpenCL context/device/queue.
b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : m_plbvh(context, device, queue),
																																	  m_overlappingPairsGpu(context, queue),
																																	  m_aabbsGpu(context, queue),
																																	  m_smallAabbsMappingGpu(context, queue),
																																	  m_largeAabbsMappingGpu(context, queue)
{
}
void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
int newAabbIndex = m_aabbsCpu.size();
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = newAabbIndex;
m_smallAabbsMappingCpu.push_back(newAabbIndex);
m_aabbsCpu.push_back(aabb);
}
void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
int newAabbIndex = m_aabbsCpu.size();
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = newAabbIndex;
m_largeAabbsMappingCpu.push_back(newAabbIndex);
m_aabbsCpu.push_back(aabb);
}
///Rebuilds the BVH from the GPU AABB arrays and computes overlapping pairs on the GPU.
///writeAabbsToGpu() must have been called after the last createProxy()/createLargeProxy()
///so that the GPU arrays are up to date.
///@param maxPairs The pair array is resized to this capacity before traversal.
void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairs(int maxPairs)
{
	//Reconstruct BVH
	m_plbvh.build(m_aabbsGpu, m_smallAabbsMappingGpu, m_largeAabbsMappingGpu);

	//
	m_overlappingPairsGpu.resize(maxPairs);
	m_plbvh.calculateOverlappingPairs(m_overlappingPairsGpu);
}
///Host (CPU) pair computation is not implemented for this broadphase; asserts if called.
void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairsHost(int maxPairs)
{
	b3Assert(0);  //CPU version not implemented
}
///Uploads the CPU-side AABBs and small/large index mappings to the GPU.
///Call once after all createProxy()/createLargeProxy() calls are done.
void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu()
{
	m_aabbsGpu.copyFromHost(m_aabbsCpu);
	m_smallAabbsMappingGpu.copyFromHost(m_smallAabbsMappingCpu);
	m_largeAabbsMappingGpu.copyFromHost(m_largeAabbsMappingCpu);
}

View file

@ -1,66 +0,0 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
#ifndef B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H
#define B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h"
#include "b3GpuParallelLinearBvh.h"
///Broadphase implementation backed by a GPU parallel linear BVH (see b3GpuParallelLinearBvh)
class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface
{
	b3GpuParallelLinearBvh m_plbvh;

	b3OpenCLArray<b3Int4> m_overlappingPairsGpu;

	//GPU mirrors of the CPU-side arrays below; refreshed by writeAabbsToGpu()
	b3OpenCLArray<b3SapAabb> m_aabbsGpu;
	b3OpenCLArray<int> m_smallAabbsMappingGpu;
	b3OpenCLArray<int> m_largeAabbsMappingGpu;

	b3AlignedObjectArray<b3SapAabb> m_aabbsCpu;
	b3AlignedObjectArray<int> m_smallAabbsMappingCpu;  //Indices into m_aabbsCpu of AABBs stored inside the BVH
	b3AlignedObjectArray<int> m_largeAabbsMappingCpu;  //Indices into m_aabbsCpu of AABBs tested outside the BVH

public:
	b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue);
	virtual ~b3GpuParallelLinearBvhBroadphase() {}

	virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
	virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);

	virtual void calculateOverlappingPairs(int maxPairs);
	virtual void calculateOverlappingPairsHost(int maxPairs);  //Not implemented; asserts if called

	//call writeAabbsToGpu after done making all changes (createProxy etc)
	virtual void writeAabbsToGpu();

	virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); }
	virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); }

	virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); }
	virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; }

	virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() { return m_overlappingPairsGpu; }
	virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() { return m_smallAabbsMappingGpu; }
	virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() { return m_largeAabbsMappingGpu; }

	virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_aabbsCpu; }

	///Factory function matching b3GpuBroadphaseInterface::CreateFunc
	static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue)
	{
		return new b3GpuParallelLinearBvhBroadphase(context, device, queue);
	}
};
#endif

View file

@ -1,143 +0,0 @@
#ifndef B3_GPU_SAP_BROADPHASE_H
#define B3_GPU_SAP_BROADPHASE_H
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
class b3Vector3;
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
#include "b3SapAabb.h"
#include "Bullet3Common/shared/b3Int2.h"
#include "b3GpuBroadphaseInterface.h"
///GPU sweep-and-prune (SAP) broadphase; the kernel used for the pair search is
///selected by b3GpuSapKernelType at construction time.
class b3GpuSapBroadphase : public b3GpuBroadphaseInterface
{
	cl_context m_context;
	cl_device_id m_device;
	cl_command_queue m_queue;

	//Kernels used by the various calculateOverlappingPairs implementations
	cl_kernel m_flipFloatKernel;
	cl_kernel m_scatterKernel;
	cl_kernel m_copyAabbsKernel;
	cl_kernel m_sapKernel;
	cl_kernel m_sap2Kernel;
	cl_kernel m_prepareSumVarianceKernel;

	class b3RadixSort32CL* m_sorter;

	///test for 3d SAP
	//Per-axis sorted endpoint data; the second index selects the current/previous buffer
	b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2];
	b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2];

	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0;
	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1;
	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2;
	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev;
	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev;
	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev;

	b3OpenCLArray<b3SortData> m_sortedAxisGPU0;
	b3OpenCLArray<b3SortData> m_sortedAxisGPU1;
	b3OpenCLArray<b3SortData> m_sortedAxisGPU2;
	b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev;
	b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev;
	b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev;

	//Incremental 3-axis SAP bookkeeping: pairs added/removed since the previous frame
	b3OpenCLArray<b3Int4> m_addedHostPairsGPU;
	b3OpenCLArray<b3Int4> m_removedHostPairsGPU;
	b3OpenCLArray<int> m_addedCountGPU;
	b3OpenCLArray<int> m_removedCountGPU;

	int m_currentBuffer;  //Selects which of the double-buffered axis arrays is current

public:
	b3OpenCLArray<int> m_pairCount;

	b3OpenCLArray<b3SapAabb> m_allAabbsGPU;
	b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;

	virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()
	{
		return m_allAabbsGPU;
	}

	virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()
	{
		return m_allAabbsCPU;
	}

	//Workspace for computing the axis with the largest AABB-center variance
	b3OpenCLArray<b3Vector3> m_sum;
	b3OpenCLArray<b3Vector3> m_sum2;
	b3OpenCLArray<b3Vector3> m_dst;

	b3OpenCLArray<int> m_smallAabbsMappingGPU;
	b3AlignedObjectArray<int> m_smallAabbsMappingCPU;

	b3OpenCLArray<int> m_largeAabbsMappingGPU;
	b3AlignedObjectArray<int> m_largeAabbsMappingCPU;

	b3OpenCLArray<b3Int4> m_overlappingPairs;

	//temporary gpu work memory
	b3OpenCLArray<b3SortData> m_gpuSmallSortData;
	b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs;

	class b3PrefixScanFloat4CL* m_prefixScanFloat4;

	///Selects which pair-search implementation calculateOverlappingPairs() uses
	enum b3GpuSapKernelType
	{
		B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU = 1,
		B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU,
		B3_GPU_SAP_KERNEL_ORIGINAL,
		B3_GPU_SAP_KERNEL_BARRIER,
		B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY
	};

	b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType = B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
	virtual ~b3GpuSapBroadphase();

	//Factory functions matching b3GpuBroadphaseInterface::CreateFunc, one per kernel type
	static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx, cl_device_id device, cl_command_queue q)
	{
		return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU);
	}

	static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx, cl_device_id device, cl_command_queue q)
	{
		return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU);
	}

	static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx, cl_device_id device, cl_command_queue q)
	{
		return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_ORIGINAL);
	}

	static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx, cl_device_id device, cl_command_queue q)
	{
		return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BARRIER);
	}

	static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx, cl_device_id device, cl_command_queue q)
	{
		return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
	}

	virtual void calculateOverlappingPairs(int maxPairs);
	virtual void calculateOverlappingPairsHost(int maxPairs);

	void reset();

	void init3dSap();
	virtual void calculateOverlappingPairsHostIncremental3Sap();

	virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
	virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);

	//call writeAabbsToGpu after done making all changes (createProxy etc)
	virtual void writeAabbsToGpu();

	virtual cl_mem getAabbBufferWS();
	virtual int getNumOverlap();
	virtual cl_mem getOverlappingPairBuffer();

	virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
	virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
	virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
};
#endif //B3_GPU_SAP_BROADPHASE_H

View file

@ -1,13 +0,0 @@
#ifndef B3_SAP_AABB_H
#define B3_SAP_AABB_H
#include "Bullet3Common/b3Scalar.h"
#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
///just make sure that the b3Aabb is 16-byte aligned
B3_ATTRIBUTE_ALIGNED16(struct)
b3SapAabb : public b3Aabb{
	//No additional members; this type exists only to guarantee 16-byte alignment of b3Aabb
};
#endif //B3_SAP_AABB_H

View file

@ -1,216 +0,0 @@
//Maps a (wrapped) grid cell coordinate to a flat 1D cell index.
//pParams[1] holds the grid dimensions as an int4; the bitwise AND only behaves
//as a modulo if each dimension is a power of two.
int getPosHash(int4 gridPos, __global float4* pParams)
{
	int4 gridDim = *((__global int4*)(pParams + 1));
	gridPos.x &= gridDim.x - 1;
	gridPos.y &= gridDim.y - 1;
	gridPos.z &= gridDim.z - 1;
	int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;
	return hash;
}
//Converts a world-space position to integer grid cell coordinates.
//pParams[0] holds the inverse cell size per axis; coordinates wrap via the
//same power-of-two mask used in getPosHash().
int4 getGridPos(float4 worldPos, __global float4* pParams)
{
	int4 gridPos;
	int4 gridDim = *((__global int4*)(pParams + 1));
	gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);
	gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);
	gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);
	return gridPos;
}
// calculate grid hash value for each body using its AABB
// calculate grid hash value for each body using its AABB
//One work-item per small AABB: computes the AABB center, finds its grid cell,
//and writes (cell hash, object index) into pHash for later sorting by hash.
__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )
{
	int index = get_global_id(0);
	if(index >= numObjects)
	{
		return;
	}

	//AABBs are stored as min/max float4 pairs; smallAabbMapping redirects to the shared AABB array
	float4 bbMin = allpAABB[smallAabbMapping[index]*2];
	float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];

	//AABB center
	float4 pos;
	pos.x = (bbMin.x + bbMax.x) * 0.5f;
	pos.y = (bbMin.y + bbMax.y) * 0.5f;
	pos.z = (bbMin.z + bbMax.z) * 0.5f;
	pos.w = 0.f;

	// get address in grid
	int4 gridPos = getGridPos(pos, pParams);
	int gridHash = getPosHash(gridPos, pParams);

	// store grid hash and body index
	int2 hashVal;
	hashVal.x = gridHash;
	hashVal.y = index;
	pHash[index] = hashVal;
}
//Resets every cell's start index to -1 (marker for "cell empty"),
//consumed by kFindCellStart and findPairsInCell.
__kernel void kClearCellStart( int numCells,
							   __global int* pCellStart )
{
	int index = get_global_id(0);
	if(index >= numCells)
	{
		return;
	}
	pCellStart[index] = -1;
}
//Given pHash sorted by cell hash, records for each cell the index of its first
//entry in cellStart. A cell boundary exists wherever an entry's hash differs
//from its predecessor's.
__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )
{
	//Holds this work-group's hashes plus the last hash of the previous group;
	//sized for a work-group of up to 512 work-items (TODO confirm launch config)
	__local int sharedHash[513];
	int index = get_global_id(0);
	int2 sortedData;

	if(index < numObjects)
	{
		sortedData = pHash[index];

		// Load hash data into shared memory so that we can look
		// at neighboring body's hash value without loading
		// two hash values per thread
		sharedHash[get_local_id(0) + 1] = sortedData.x;
		if((index > 0) && (get_local_id(0) == 0))
		{
			// first thread in block must load neighbor body hash
			sharedHash[0] = pHash[index-1].x;
		}
	}

	barrier(CLK_LOCAL_MEM_FENCE);

	if(index < numObjects)
	{
		//First entry overall, or hash changed relative to the previous entry:
		//this index starts a new cell
		if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))
		{
			cellStart[sortedData.x] = index;
		}
	}
}
//Returns nonzero if the two AABBs overlap on all 3 axes (touching counts as overlap).
//Only x/y/z are compared; the w components carry packed indices, not coordinates.
int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)
{
	return (min0.x <= max1.x)&& (min1.x <= max0.x) &&
		   (min0.y <= max1.y)&& (min1.y <= max0.y) &&
		   (min0.z <= max1.z)&& (min1.z <= max0.z);
}
//search for AABB 'index' against other AABBs' in this cell
//search for AABB 'index' against other AABBs' in this cell
//Appends overlapping pairs to pPairBuff2 via an atomic counter; pairs are
//emitted only when handleIndex < handleIndex2 so each pair is reported once.
//Pairs beyond maxPairs are counted but not stored.
void findPairsInCell( int numObjects,
					  int4 gridPos,
					  int index,
					  __global int2* pHash,
					  __global int* pCellStart,
					  __global float4* allpAABB,
					  __global const int* smallAabbMapping,
					  __global float4* pParams,
					  volatile __global int* pairCount,
					  __global int4* pPairBuff2,
					  int maxPairs
					  )
{
	//pParams[1].w limits how many bodies per cell are examined
	int4 pGridDim = *((__global int4*)(pParams + 1));
	int maxBodiesPerCell = pGridDim.w;

	int gridHash = getPosHash(gridPos, pParams);

	// get start of bucket for this cell
	int bucketStart = pCellStart[gridHash];
	if (bucketStart == -1)
	{
		return; // cell empty
	}

	// iterate over bodies in this cell
	int2 sortedData = pHash[index];
	int unsorted_indx = sortedData.y;
	float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];
	float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];
	int handleIndex = as_int(min0.w);  //Object handle packed into the min vector's w component

	int bucketEnd = bucketStart + maxBodiesPerCell;
	bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;

	for(int index2 = bucketStart; index2 < bucketEnd; index2++)
	{
		int2 cellData = pHash[index2];
		if (cellData.x != gridHash)
		{
			break; // no longer in same bucket
		}

		int unsorted_indx2 = cellData.y;
		//if (unsorted_indx2 < unsorted_indx) // check not colliding with self
		if (unsorted_indx2 != unsorted_indx) // check not colliding with self
		{
			float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];
			float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];

			if(testAABBOverlap(min0, max0, min1, max1))
			{
				if (pairCount)
				{
					int handleIndex2 = as_int(min1.w);
					//Only the lower handle emits the pair, to avoid duplicates
					if (handleIndex<handleIndex2)
					{
						int curPair = atomic_add(pairCount,1);
						if (curPair<maxPairs)
						{
							int4 newpair;
							newpair.x = handleIndex;
							newpair.y = handleIndex2;
							newpair.z = -1;
							newpair.w = -1;
							pPairBuff2[curPair] = newpair;
						}
					}
				}
			}
		}
	}
}
//One work-item per small AABB: locates the AABB's grid cell from its center and
//searches the surrounding 3x3x3 block of cells for overlapping pairs.
__kernel void kFindOverlappingPairs(	int numObjects,
										__global float4* allpAABB,
										__global const int* smallAabbMapping,
										__global int2* pHash,
										__global int* pCellStart,
										__global float4* pParams ,
										volatile __global int* pairCount,
										__global int4* pPairBuff2,
										int maxPairs
										)
{
	int index = get_global_id(0);
	if(index >= numObjects)
	{
		return;
	}

	int2 sortedData = pHash[index];
	int unsorted_indx = sortedData.y;
	float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];
	float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];

	//AABB center determines the home cell
	float4 pos;
	pos.x = (bbMin.x + bbMax.x) * 0.5f;
	pos.y = (bbMin.y + bbMax.y) * 0.5f;
	pos.z = (bbMin.z + bbMax.z) * 0.5f;

	// get address in grid
	int4 gridPosA = getGridPos(pos, pParams);
	int4 gridPosB;

	// examine only neighbouring cells
	for(int z=-1; z<=1; z++)
	{
		gridPosB.z = gridPosA.z + z;
		for(int y=-1; y<=1; y++)
		{
			gridPosB.y = gridPosA.y + y;
			for(int x=-1; x<=1; x++)
			{
				gridPosB.x = gridPosA.x + x;
				findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);
			}
		}
	}
}

View file

@ -1,198 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* gridBroadphaseCL =
"int getPosHash(int4 gridPos, __global float4* pParams)\n"
"{\n"
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
" gridPos.x &= gridDim.x - 1;\n"
" gridPos.y &= gridDim.y - 1;\n"
" gridPos.z &= gridDim.z - 1;\n"
" int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n"
" return hash;\n"
"} \n"
"int4 getGridPos(float4 worldPos, __global float4* pParams)\n"
"{\n"
" int4 gridPos;\n"
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
" gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n"
" gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n"
" gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n"
" return gridPos;\n"
"}\n"
"// calculate grid hash value for each body using its AABB\n"
"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numObjects)\n"
" {\n"
" return;\n"
" }\n"
" float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n"
" float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n"
" float4 pos;\n"
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
" pos.w = 0.f;\n"
" // get address in grid\n"
" int4 gridPos = getGridPos(pos, pParams);\n"
" int gridHash = getPosHash(gridPos, pParams);\n"
" // store grid hash and body index\n"
" int2 hashVal;\n"
" hashVal.x = gridHash;\n"
" hashVal.y = index;\n"
" pHash[index] = hashVal;\n"
"}\n"
"__kernel void kClearCellStart( int numCells, \n"
" __global int* pCellStart )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numCells)\n"
" {\n"
" return;\n"
" }\n"
" pCellStart[index] = -1;\n"
"}\n"
"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n"
"{\n"
" __local int sharedHash[513];\n"
" int index = get_global_id(0);\n"
" int2 sortedData;\n"
" if(index < numObjects)\n"
" {\n"
" sortedData = pHash[index];\n"
" // Load hash data into shared memory so that we can look \n"
" // at neighboring body's hash value without loading\n"
" // two hash values per thread\n"
" sharedHash[get_local_id(0) + 1] = sortedData.x;\n"
" if((index > 0) && (get_local_id(0) == 0))\n"
" {\n"
" // first thread in block must load neighbor body hash\n"
" sharedHash[0] = pHash[index-1].x;\n"
" }\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(index < numObjects)\n"
" {\n"
" if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n"
" {\n"
" cellStart[sortedData.x] = index;\n"
" }\n"
" }\n"
"}\n"
"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n"
"{\n"
" return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n"
" (min0.y <= max1.y)&& (min1.y <= max0.y) && \n"
" (min0.z <= max1.z)&& (min1.z <= max0.z); \n"
"}\n"
"//search for AABB 'index' against other AABBs' in this cell\n"
"void findPairsInCell( int numObjects,\n"
" int4 gridPos,\n"
" int index,\n"
" __global int2* pHash,\n"
" __global int* pCellStart,\n"
" __global float4* allpAABB, \n"
" __global const int* smallAabbMapping,\n"
" __global float4* pParams,\n"
" volatile __global int* pairCount,\n"
" __global int4* pPairBuff2,\n"
" int maxPairs\n"
" )\n"
"{\n"
" int4 pGridDim = *((__global int4*)(pParams + 1));\n"
" int maxBodiesPerCell = pGridDim.w;\n"
" int gridHash = getPosHash(gridPos, pParams);\n"
" // get start of bucket for this cell\n"
" int bucketStart = pCellStart[gridHash];\n"
" if (bucketStart == -1)\n"
" {\n"
" return; // cell empty\n"
" }\n"
" // iterate over bodies in this cell\n"
" int2 sortedData = pHash[index];\n"
" int unsorted_indx = sortedData.y;\n"
" float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n"
" float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
" int handleIndex = as_int(min0.w);\n"
" \n"
" int bucketEnd = bucketStart + maxBodiesPerCell;\n"
" bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n"
" for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n"
" {\n"
" int2 cellData = pHash[index2];\n"
" if (cellData.x != gridHash)\n"
" {\n"
" break; // no longer in same bucket\n"
" }\n"
" int unsorted_indx2 = cellData.y;\n"
" //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n"
" if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n"
" { \n"
" float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n"
" float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n"
" if(testAABBOverlap(min0, max0, min1, max1))\n"
" {\n"
" if (pairCount)\n"
" {\n"
" int handleIndex2 = as_int(min1.w);\n"
" if (handleIndex<handleIndex2)\n"
" {\n"
" int curPair = atomic_add(pairCount,1);\n"
" if (curPair<maxPairs)\n"
" {\n"
" int4 newpair;\n"
" newpair.x = handleIndex;\n"
" newpair.y = handleIndex2;\n"
" newpair.z = -1;\n"
" newpair.w = -1;\n"
" pPairBuff2[curPair] = newpair;\n"
" }\n"
" }\n"
" \n"
" }\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void kFindOverlappingPairs( int numObjects,\n"
" __global float4* allpAABB, \n"
" __global const int* smallAabbMapping,\n"
" __global int2* pHash, \n"
" __global int* pCellStart, \n"
" __global float4* pParams ,\n"
" volatile __global int* pairCount,\n"
" __global int4* pPairBuff2,\n"
" int maxPairs\n"
" )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numObjects)\n"
" {\n"
" return;\n"
" }\n"
" int2 sortedData = pHash[index];\n"
" int unsorted_indx = sortedData.y;\n"
" float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n"
" float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
" float4 pos;\n"
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
" // get address in grid\n"
" int4 gridPosA = getGridPos(pos, pParams);\n"
" int4 gridPosB; \n"
" // examine only neighbouring cells\n"
" for(int z=-1; z<=1; z++) \n"
" {\n"
" gridPosB.z = gridPosA.z + z;\n"
" for(int y=-1; y<=1; y++) \n"
" {\n"
" gridPosB.y = gridPosA.y + y;\n"
" for(int x=-1; x<=1; x++) \n"
" {\n"
" gridPosB.x = gridPosA.x + x;\n"
" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n"
" }\n"
" }\n"
" }\n"
"}\n";

View file

@ -1,767 +0,0 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
typedef float b3Scalar;
typedef float4 b3Vector3;
#define b3Max max
#define b3Min min
#define b3Sqrt sqrt
//Key-value pair for the parallel radix sort.
//In this file m_key holds a leaf AABB's morton code and m_value the index of
//the leaf/AABB it was computed from (see assignMortonCodesAndAabbIndicies).
typedef struct
{
unsigned int m_key;
unsigned int m_value;
} SortDataCL;
//Axis-aligned bounding box with min/max corners stored as float4.
//The unions expose each corner as floats or ints; the otherwise-unused w slot
//(m_minIndices[3]) is read elsewhere in this file as the rigid body index
//owning the AABB (see e.g. plbvhCalculateOverlappingPairs pair output).
typedef struct
{
union
{
float4 m_min;
float m_minElems[4];
int m_minIndices[4];
};
union
{
float4 m_max;
float m_maxElems[4];
int m_maxIndices[4];
};
} b3AabbCL;
unsigned int interleaveBits(unsigned int x)
{
	//Spread the low 10 bits of x three positions apart so that source bit k
	//ends up at bit 3k, leaving room for two other interleaved axes:
	//........ ........ ......12 3456789A  (input, after masking)
	//....1..2 ..3..4.. 5..6..7. .8..9..A  (result)
	//Each step ORs in a shifted copy of the bits and masks the result; the
	//kept bit groups never overlap the shifted ones, so OR is equivalent to
	//the usual XOR formulation of morton-code bit spreading.
	x &= 0x000003FF; //Clear all bits above bit 10
	x = (x | (x << 16)) & 0xFF0000FF;
	x = (x | (x << 8)) & 0x0300F00F;
	x = (x | (x << 4)) & 0x030C30C3;
	x = (x | (x << 2)) & 0x09249249;
	return x;
}
unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)
{
	//30-bit morton code (z-curve): x occupies bits 0,3,6,..., y occupies
	//bits 1,4,7,..., and z occupies bits 2,5,8,...
	unsigned int code = interleaveBits(z) << 2;
	code |= interleaveBits(y) << 1;
	code |= interleaveBits(x);
	return code;
}
__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)
{
	//Gather kernel: out_aabbs[i] = unseparatedAabbs[ aabbIndices[i] ],
	//one work-item per separated AABB.
	int separatedIndex = get_global_id(0);
	if(separatedIndex < numAabbsToSeparate)
	{
		int sourceIndex = aabbIndices[separatedIndex];
		out_aabbs[separatedIndex] = unseparatedAabbs[sourceIndex];
	}
}
//Should replace with an optimized parallel reduction
__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)
{
	//One pairwise AABB merge pass; the host enqueues this kernel repeatedly,
	//halving the count each time, until out_mergedAabb[0] encloses all AABBs.
	//Example with 159 AABBs: 159 / 2 + 159 % 2 == 80 remain, and the
	//159 - 80 == 79 merged slots are [0, 78] combined with [80, 158].
	int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;
	int aabbIndex = get_global_id(0);
	if(aabbIndex >= numAabbsNeedingMerge - numRemainingAabbs) return;
	b3AabbCL first = out_mergedAabb[aabbIndex];
	b3AabbCL second = out_mergedAabb[aabbIndex + numRemainingAabbs];
	b3AabbCL merged;
	merged.m_min = b3Min(first.m_min, second.m_min);
	merged.m_max = b3Max(first.m_max, second.m_max);
	out_mergedAabb[aabbIndex] = merged;
}
//Assigns each leaf AABB a 30-bit morton code by quantizing its center into a
//1024^3 grid spanning the merged AABB of all nodes (mergedAabbOfAllNodes[0]),
//and pairs the code with the AABB index for the subsequent radix sort.
__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes,
__global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)
{
int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index
if(leafNodeIndex >= numAabbs) return;
b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];
b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;
b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;
b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];
b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;
b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;
//Quantize into integer coordinates
//floor() is needed to prevent the center cell, at (0,0,0) from being twice the size
//(a plain float-to-int cast would truncate toward zero on both sides of 0)
b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;
int4 discretePosition;
discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );
discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );
discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );
//Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]
//(10 bits per axis, matching the 10 bits consumed by interleaveBits())
discretePosition = b3Max( -512, b3Min(discretePosition, 511) );
discretePosition += 512;
//Interleave bits(assign a morton code, also known as a z-curve)
unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);
//
SortDataCL mortonCodeIndexPair;
mortonCodeIndexPair.m_key = mortonCode;
mortonCodeIndexPair.m_value = leafNodeIndex;
out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;
}
#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128
//Node indices pack a type tag into the top bit: the most significant bit
//(0x80000000) is set for internal nodes and clear for leaf nodes. Strip the
//marker (getIndexWithInternalNodeMarkerRemoved) before using the value as an
//array index.
int isLeafNode(int index) { return (index & 0x80000000) == 0; }
int getIndexWithInternalNodeMarkerRemoved(int index) { return index & 0x7FFFFFFF; }
int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return isLeaf ? index : (index | 0x80000000); }
//From sap.cl
#define NEW_PAIR_MARKER -1
bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)
{
	//Per-axis interval test: the boxes are disjoint if they are separated
	//on any axis; otherwise they overlap.
	if (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) return false;
	if (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) return false;
	if (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) return false;
	return true;
}
//From sap.cl
//Broad-phase query: one thread per leaf (in z-curve order) walks the BVH with
//an explicit stack and appends an int4 pair of rigid body indices (tagged
//NEW_PAIR_MARKER in z/w) for every other leaf AABB it overlaps.
//out_numPairs counts all detected pairs, but only the first maxPairs are
//written, so the caller can detect overflow.
__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs,
__global int* rootNodeIndex,
__global int2* internalNodeChildIndices,
__global b3AabbCL* internalNodeAabbs,
__global int2* internalNodeLeafIndexRanges,
__global SortDataCL* mortonCodesAndAabbIndices,
__global int* out_numPairs, __global int4* out_overlappingPairs,
int maxPairs, int numQueryAabbs)
{
//Using get_group_id()/get_local_id() is Faster than get_global_id(0) since
//mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)
int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);
if(queryBvhNodeIndex >= numQueryAabbs) return;
int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;
b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];
int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];
int stackSize = 1;
stack[0] = *rootNodeIndex;
while(stackSize)
{
//Pop the next node; indices carry the internal-node marker bit
int internalOrLeafNodeIndex = stack[ stackSize - 1 ];
--stackSize;
int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false
int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);
//Optimization - if the BVH is structured as a binary radix tree, then
//each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).
//This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.
{
int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;
if(highestLeafIndex <= queryBvhNodeIndex) continue;
}
//bvhRigidIndex is not used if internal node
int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;
b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];
if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )
{
if(isLeaf)
{
//Rigid body indices are stored in the AABB's m_minIndices[3] slot
int4 pair;
pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];
pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];
pair.z = NEW_PAIR_MARKER;
pair.w = NEW_PAIR_MARKER;
int pairIndex = atomic_inc(out_numPairs);
if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;
}
if(!isLeaf) //Internal node
{
//NOTE(review): on stack overflow the children (and so the whole
//subtree) are silently dropped; possible missed pairs
if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)
{
//Error
}
else
{
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;
}
}
}
}
}
//From rayCastKernels.cl
//Ray query input: a segment with endpoints m_from and m_to.
//Stored as float4; the traversal code derives direction and length from the
//endpoints and only uses the xyz components.
typedef struct
{
float4 m_from;
float4 m_to;
} b3RayInfo;
//From rayCastKernels.cl
b3Vector3 b3Vector3_normalize(b3Vector3 v)
{
	//Zero the w component first so the float4 normalize() builtin
	//effectively normalizes a pure 3D vector.
	v.w = 0.f;
	return normalize(v); //OpenCL normalize == vector4 normalize
}
//3-component dot product; the w components are ignored.
b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }
//Squared length of the xyz part, i.e. dot(v, v).
b3Scalar b3Vector3_length2(b3Vector3 v) { return b3Vector3_dot(v, v); }
//Segment-vs-AABB slab test. Returns nonzero if the segment starting at
//rayOrigin, with direction rayNormalizedDirection and length rayLength,
//intersects aabb; the intersection interval is clipped to [0, rayLength].
int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)
{
//AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).
//t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.
//
//if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane
//and min.x will be the far plane; otherwise, it is reversed.
//
//In order for there to be a collision, the t_min and t_max of each pair must overlap.
//This can be tested for by selecting the highest t_min and lowest t_max and comparing them.
int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) ); //isless(x,y) returns (x < y)
//When using vector types, the select() function checks the most signficant bit,
//but isless() sets the least significant bit.
isNegative <<= 31;
//select(b, a, condition) == condition ? a : b
//When using select() with vector types, (condition[i]) is true if its most significant bit is 1
b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;
b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;
b3Scalar t_min_final = 0.0f;
b3Scalar t_max_final = rayLength;
//Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned.
//Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])
//Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.
t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );
t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );
return (t_min_final <= t_max_final);
}
//BVH ray query: one thread per ray walks the tree with an explicit stack and
//appends a (rayIndex, rigidBodyIndex) pair for every leaf AABB the segment
//intersects. out_numRayRigidPairs counts all hits, but only the first
//maxRayRigidPairs are written, so the caller can detect overflow.
//NOTE(review): the internalNodeLeafIndexRanges parameter is unused here.
__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,
__global int* rootNodeIndex,
__global int2* internalNodeChildIndices,
__global b3AabbCL* internalNodeAabbs,
__global int2* internalNodeLeafIndexRanges,
__global SortDataCL* mortonCodesAndAabbIndices,
__global b3RayInfo* rays,
__global int* out_numRayRigidPairs,
__global int2* out_rayRigidPairs,
int maxRayRigidPairs, int numRays)
{
int rayIndex = get_global_id(0);
if(rayIndex >= numRays) return;
//
b3Vector3 rayFrom = rays[rayIndex].m_from;
b3Vector3 rayTo = rays[rayIndex].m_to;
b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);
b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );
//
int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];
int stackSize = 1;
stack[0] = *rootNodeIndex;
while(stackSize)
{
int internalOrLeafNodeIndex = stack[ stackSize - 1 ];
--stackSize;
int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false
int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);
//bvhRigidIndex is not used if internal node
int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;
b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];
if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) )
{
if(isLeaf)
{
//Rigid body index is stored in the AABB's m_minIndices[3] slot
int2 rayRigidPair;
rayRigidPair.x = rayIndex;
rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];
int pairIndex = atomic_inc(out_numRayRigidPairs);
if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;
}
if(!isLeaf) //Internal node
{
//NOTE(review): on stack overflow the subtree is silently skipped
if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)
{
//Error
}
else
{
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;
}
}
}
}
}
__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs,
__global int* out_numPairs, __global int4* out_overlappingPairs,
int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)
{
	//Brute-force pass for the few oversized AABBs that are excluded from the
	//BVH: one thread per small AABB, tested against every large AABB.
	//out_numPairs counts all hits; only the first maxPairs are written.
	int smallAabbIndex = get_global_id(0);
	if(smallAabbIndex >= numSmallAabbRigids) return;
	b3AabbCL smallAabb = smallAabbs[smallAabbIndex];
	for(int largeAabbIndex = 0; largeAabbIndex < numLargeAabbRigids; ++largeAabbIndex)
	{
		b3AabbCL largeAabb = largeAabbs[largeAabbIndex];
		if( !TestAabbAgainstAabb2(&smallAabb, &largeAabb) ) continue;
		//Overlap: emit a (large, small) pair of the indices stored in m_minIndices[3]
		int4 pair;
		pair.x = largeAabb.m_minIndices[3];
		pair.y = smallAabb.m_minIndices[3];
		pair.z = NEW_PAIR_MARKER;
		pair.w = NEW_PAIR_MARKER;
		int pairIndex = atomic_inc(out_numPairs);
		if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;
	}
}
__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,
__global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs,
int numLargeAabbRigids, int maxRayRigidPairs, int numRays)
{
	//Brute-force ray pass for oversized AABBs excluded from the BVH:
	//one thread per ray, tested against every large AABB.
	//out_numRayRigidPairs counts all hits; only the first maxRayRigidPairs are written.
	int rayIndex = get_global_id(0);
	if(rayIndex >= numRays) return;
	b3Vector3 rayFrom = rays[rayIndex].m_from;
	b3Vector3 rayTo = rays[rayIndex].m_to;
	b3Vector3 rayDelta = rayTo - rayFrom;
	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayDelta);
	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayDelta) );
	for(int largeAabbIndex = 0; largeAabbIndex < numLargeAabbRigids; ++largeAabbIndex)
	{
		b3AabbCL rigidAabb = largeRigidAabbs[largeAabbIndex];
		if( !rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) ) continue;
		//Hit: emit (rayIndex, rigid body index stored in m_minIndices[3])
		int2 rayRigidPair;
		rayRigidPair.x = rayIndex;
		rayRigidPair.y = rigidAabb.m_minIndices[3];
		int pairIndex = atomic_inc(out_numRayRigidPairs);
		if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;
	}
}
//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.
//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.
//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.
//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).
#define B3_PLBVH_INVALID_COMMON_PREFIX 128
#define B3_PLBVH_ROOT_NODE_MARKER -1
#define b3Int64 long
//Number of identical leading bits in the 64-bit keys i and j.
int computeCommonPrefixLength(b3Int64 i, b3Int64 j)
{
	b3Int64 differingBits = i ^ j;
	return (int)clz(differingBits);
}
b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j)
{
	//Returns the bits shared by i and j with everything below the common
	//prefix forced to 0. Only (i & j) is strictly required by the algorithm;
	//masking the low bits just makes values easier to inspect when debugging.
	int prefixLength = computeCommonPrefixLength(i, j);
	b3Int64 highBitsMask = ((b3Int64)(~0)) << (64 - prefixLength); //Set all bits after the common prefix to 0
	return (i & j) & highBitsMask;
}
//Same as computeCommonPrefixLength(), but clamps the result to the shorter
//of the two operands' own prefix lengths.
int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)
{
	int sharedBits = computeCommonPrefixLength(prefixA, prefixB);
	int shorterPrefix = b3Min(prefixLengthA, prefixLengthB);
	return b3Min(sharedBits, shorterPrefix);
}
__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,
__global b3Int64* out_commonPrefixes,
__global int* out_commonPrefixLengths,
int numInternalNodes)
{
	//One thread per internal node i computes the common prefix of the morton
	//codes of adjacent leaves i and i+1.
	int internalNodeIndex = get_global_id(0);
	if (internalNodeIndex >= numInternalNodes) return;
	//(internalNodeIndex + 1) is always in bounds: these are leaf indices and
	//numLeafNodes == numInternalNodes + 1.
	int leftLeafIndex = internalNodeIndex;
	int rightLeafIndex = internalNodeIndex + 1;
	int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;
	int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;
	//The binary radix tree build breaks on duplicate morton codes, so append
	//each leaf's index as the low 32 bits to make every key unique. Sorted
	//ascending order is preserved since leftLeafIndex < rightLeafIndex.
	//
	//upsample(a, b) == ( ((b3Int64)a) << 32) | b
	b3Int64 uniqueLeftKey = upsample(leftLeafMortonCode, leftLeafIndex);
	b3Int64 uniqueRightKey = upsample(rightLeafMortonCode, rightLeafIndex);
	out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(uniqueLeftKey, uniqueRightKey);
	out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(uniqueLeftKey, uniqueRightKey);
}
//Links each leaf node into the binary radix tree: of the two adjacent splits
//(left at leafNodeIndex - 1, right at leafNodeIndex), the one with the longer
//common prefix becomes the leaf's parent, and the leaf is written into that
//internal node's corresponding child slot with the leaf marker set.
__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,
__global int2* out_childNodes, int numLeafNodes)
{
int leafNodeIndex = get_global_id(0);
if (leafNodeIndex >= numLeafNodes) return;
int numInternalNodes = numLeafNodes - 1;
int leftSplitIndex = leafNodeIndex - 1;
int rightSplitIndex = leafNodeIndex;
//Out-of-range splits (first/last leaf) get the invalid sentinel
int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
//Parent node is the highest adjacent common prefix that is lower than the node's common prefix
//Leaf nodes are considered as having the highest common prefix
int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);
//Handle cases for the edge nodes; the first and last node
//For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX
if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;
if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;
int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;
out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;
int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa
//out_childNodesAsInt[0] == int2.x == left child
//out_childNodesAsInt[1] == int2.y == right child
int isLeaf = 1;
__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);
out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);
}
//Links each internal node into the binary radix tree. For node i it finds the
//nearest node to the left and to the right whose common prefix is shorter
//than node i's (binary search by default, linear search behind an #ifdef);
//of those two candidates, the one with the longer prefix becomes i's parent.
//A node with no such neighbor on either side is the root.
__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,
__global int2* out_childNodes,
__global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,
int numInternalNodes)
{
int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);
if(internalNodeIndex >= numInternalNodes) return;
b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];
int nodePrefixLength = commonPrefixLengths[internalNodeIndex];
//#define USE_LINEAR_SEARCH
#ifdef USE_LINEAR_SEARCH
int leftIndex = -1;
int rightIndex = -1;
//Find nearest element to left with a lower common prefix
for(int i = internalNodeIndex - 1; i >= 0; --i)
{
int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);
if(nodeLeftSharedPrefixLength < nodePrefixLength)
{
leftIndex = i;
break;
}
}
//Find nearest element to right with a lower common prefix
for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)
{
int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);
if(nodeRightSharedPrefixLength < nodePrefixLength)
{
rightIndex = i;
break;
}
}
#else //Use binary search
//Find nearest element to left with a lower common prefix
int leftIndex = -1;
{
int lower = 0;
int upper = internalNodeIndex - 1;
while(lower <= upper)
{
int mid = (lower + upper) / 2;
b3Int64 midPrefix = commonPrefixes[mid];
int midPrefixLength = commonPrefixLengths[mid];
int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);
if(nodeMidSharedPrefixLength < nodePrefixLength)
{
//mid qualifies; check whether a closer (righter) element also qualifies
int right = mid + 1;
if(right < internalNodeIndex)
{
b3Int64 rightPrefix = commonPrefixes[right];
int rightPrefixLength = commonPrefixLengths[right];
int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);
if(nodeRightSharedPrefixLength < nodePrefixLength)
{
lower = right;
leftIndex = right;
}
else
{
leftIndex = mid;
break;
}
}
else
{
leftIndex = mid;
break;
}
}
else upper = mid - 1;
}
}
//Find nearest element to right with a lower common prefix
int rightIndex = -1;
{
int lower = internalNodeIndex + 1;
int upper = numInternalNodes - 1;
while(lower <= upper)
{
int mid = (lower + upper) / 2;
b3Int64 midPrefix = commonPrefixes[mid];
int midPrefixLength = commonPrefixLengths[mid];
int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);
if(nodeMidSharedPrefixLength < nodePrefixLength)
{
//mid qualifies; check whether a closer (lefter) element also qualifies
int left = mid - 1;
if(left > internalNodeIndex)
{
b3Int64 leftPrefix = commonPrefixes[left];
int leftPrefixLength = commonPrefixLengths[left];
int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);
if(nodeLeftSharedPrefixLength < nodePrefixLength)
{
upper = left;
rightIndex = left;
}
else
{
rightIndex = mid;
break;
}
}
else
{
rightIndex = mid;
break;
}
}
else lower = mid + 1;
}
}
#endif
//Select parent
{
int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);
if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;
else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;
int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;
int isRootNode = (leftIndex == -1 && rightIndex == -1);
out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;
int isLeaf = 0;
if(!isRootNode)
{
int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa
//out_childNodesAsInt[0] == int2.x == left child
//out_childNodesAsInt[1] == int2.y == right child
__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);
out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);
}
else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);
}
}
//Computes, for every internal node, its depth (number of parent hops) from
//the root by chasing parent links, and reduces the maximum depth into
//out_maxDistanceFromRoot (per-work-group reduction in local memory, then a
//global atomic_max).
//NOTE(review): the reset of out_maxDistanceFromRoot by global thread 0 is not
//synchronized with the atomic_max() calls of other work-groups; presumably
//the buffer is zeroed or ordering is guaranteed by the host — confirm.
__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,
__global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)
{
if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);
int internalNodeIndex = get_global_id(0);
if(internalNodeIndex >= numInternalNodes) return;
//
//Walk up the parent chain; the root (parent == B3_PLBVH_ROOT_NODE_MARKER)
//ends at distance 0
int distanceFromRoot = 0;
{
int parentIndex = internalNodeParentNodes[internalNodeIndex];
while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)
{
parentIndex = internalNodeParentNodes[parentIndex];
++distanceFromRoot;
}
}
out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;
//
//Reduce to the work-group max in local memory, then fold into the global max
__local int localMaxDistanceFromRoot;
if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;
barrier(CLK_LOCAL_MEM_FENCE);
atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);
barrier(CLK_LOCAL_MEM_FENCE);
if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);
}
//One bottom-up AABB propagation pass: only internal nodes whose distance from
//the root equals processedDistance merge their children's AABBs this launch.
//Presumably enqueued once per level, deepest level first down to 0, so that
//children are finalized before their parents — confirm against host code.
//NOTE(review): the maxDistanceFromRoot parameter is unused in this kernel.
__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,
__global int2* childNodes,
__global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,
int maxDistanceFromRoot, int processedDistance, int numInternalNodes)
{
int internalNodeIndex = get_global_id(0);
if(internalNodeIndex >= numInternalNodes) return;
int distance = distanceFromRoot[internalNodeIndex];
if(distance == processedDistance)
{
int leftChildIndex = childNodes[internalNodeIndex].x;
int rightChildIndex = childNodes[internalNodeIndex].y;
int isLeftChildLeaf = isLeafNode(leftChildIndex);
int isRightChildLeaf = isLeafNode(rightChildIndex);
leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);
rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);
//leftRigidIndex/rightRigidIndex is not used if internal node
int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;
int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;
b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];
b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];
b3AabbCL mergedAabb;
mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);
mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);
internalNodeAabbs[internalNodeIndex] = mergedAabb;
}
}
__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)
{
	//For each internal node, compute the [min, max] leaf indices it covers.
	//In a binary radix tree the leaves under a node are contiguous, so it
	//suffices to follow left children to the leftmost leaf and right
	//children to the rightmost leaf.
	int internalNodeIndex = get_global_id(0);
	if(internalNodeIndex >= numInternalNodes) return;
	int2 childNodes = internalNodeChildNodes[internalNodeIndex];
	//Descend the left spine to the lowest covered leaf index
	int lowestIndex = childNodes.x; //childNodes.x == Left child
	while( !isLeafNode(lowestIndex) )
	{
		lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;
	}
	//Descend the right spine to the highest covered leaf index
	int highestIndex = childNodes.y; //childNodes.y == Right child
	while( !isLeafNode(highestIndex) )
	{
		highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;
	}
	int2 leafIndexRange; //x == min leaf index, y == max leaf index
	leafIndexRange.x = lowestIndex;
	leafIndexRange.y = highestIndex;
	out_leafIndexRanges[internalNodeIndex] = leafIndexRange;
}

View file

@ -1,728 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
//NOTE(review): this string is a verbatim embedded copy of parallelLinearBvh.cl,
//compiled at runtime as an OpenCL program. Do not edit it by hand — regenerate
//it from the .cl source so the two stay byte-identical.
static const char* parallelLinearBvhCL =
	"/*\n"
	"This software is provided 'as-is', without any express or implied warranty.\n"
	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
	"Permission is granted to anyone to use this software for any purpose,\n"
	"including commercial applications, and to alter it and redistribute it freely,\n"
	"subject to the following restrictions:\n"
	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
	"3. This notice may not be removed or altered from any source distribution.\n"
	"*/\n"
	"//Initial Author Jackson Lee, 2014\n"
	"typedef float b3Scalar;\n"
	"typedef float4 b3Vector3;\n"
	"#define b3Max max\n"
	"#define b3Min min\n"
	"#define b3Sqrt sqrt\n"
	"typedef struct\n"
	"{\n"
	"	unsigned int m_key;\n"
	"	unsigned int m_value;\n"
	"} SortDataCL;\n"
	"typedef struct \n"
	"{\n"
	"	union\n"
	"	{\n"
	"		float4 m_min;\n"
	"		float m_minElems[4];\n"
	"		int m_minIndices[4];\n"
	"	};\n"
	"	union\n"
	"	{\n"
	"		float4 m_max;\n"
	"		float m_maxElems[4];\n"
	"		int m_maxIndices[4];\n"
	"	};\n"
	"} b3AabbCL;\n"
	"unsigned int interleaveBits(unsigned int x)\n"
	"{\n"
	"	//........ ........ ......12 3456789A	//x\n"
	"	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x after interleaving bits\n"
	"	\n"
	"	//......12 3456789A ......12 3456789A	//x ^ (x << 16)\n"
	"	//11111111 ........ ........ 11111111	//0x FF 00 00 FF\n"
	"	//......12 ........ ........ 3456789A	//x = (x ^ (x << 16)) & 0xFF0000FF;\n"
	"	\n"
	"	//......12 ........ 3456789A 3456789A	//x ^ (x << 8)\n"
	"	//......11 ........ 1111.... ....1111	//0x 03 00 F0 0F\n"
	"	//......12 ........ 3456.... ....789A	//x = (x ^ (x << 8)) & 0x0300F00F;\n"
	"	\n"
	"	//..12..12 ....3456 3456.... 789A789A	//x ^ (x << 4)\n"
	"	//......11 ....11.. ..11.... 11....11	//0x 03 0C 30 C3\n"
	"	//......12 ....34.. ..56.... 78....9A	//x = (x ^ (x << 4)) & 0x030C30C3;\n"
	"	\n"
	"	//....1212 ..3434.. 5656..78 78..9A9A	//x ^ (x << 2)\n"
	"	//....1..1 ..1..1.. 1..1..1. .1..1..1	//0x 09 24 92 49\n"
	"	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x = (x ^ (x << 2)) & 0x09249249;\n"
	"	\n"
	"	//........ ........ ......11 11111111	//0x000003FF\n"
	"	x &= 0x000003FF;		//Clear all bits above bit 10\n"
	"	\n"
	"	x = (x ^ (x << 16)) & 0xFF0000FF;\n"
	"	x = (x ^ (x << 8)) & 0x0300F00F;\n"
	"	x = (x ^ (x << 4)) & 0x030C30C3;\n"
	"	x = (x ^ (x << 2)) & 0x09249249;\n"
	"	\n"
	"	return x;\n"
	"}\n"
	"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n"
	"{\n"
	"	return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n"
	"}\n"
	"__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n"
	"{\n"
	"	int separatedAabbIndex = get_global_id(0);\n"
	"	if(separatedAabbIndex >= numAabbsToSeparate) return;\n"
	"	int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n"
	"	out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n"
	"}\n"
	"//Should replace with an optimized parallel reduction\n"
	"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n"
	"{\n"
	"	//Each time this kernel is added to the command queue, \n"
	"	//the number of AABBs needing to be merged is halved\n"
	"	//\n"
	"	//Example with 159 AABBs:\n"
	"	//	numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n"
	"	//	numMergedAabbs == 159 - 80 == 79\n"
	"	//So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n"
	"	\n"
	"	int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n"
	"	int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n"
	"	\n"
	"	int aabbIndex = get_global_id(0);\n"
	"	if(aabbIndex >= numMergedAabbs) return;\n"
	"	\n"
	"	int otherAabbIndex = aabbIndex + numRemainingAabbs;\n"
	"	\n"
	"	b3AabbCL aabb = out_mergedAabb[aabbIndex];\n"
	"	b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n"
	"	\n"
	"	b3AabbCL mergedAabb;\n"
	"	mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n"
	"	mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n"
	"	out_mergedAabb[aabbIndex] = mergedAabb;\n"
	"}\n"
	"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n"
	"								__global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n"
	"{\n"
	"	int leafNodeIndex = get_global_id(0);	//Leaf node index == AABB index\n"
	"	if(leafNodeIndex >= numAabbs) return;\n"
	"	\n"
	"	b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n"
	"	b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n"
	"	b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n"
	"	\n"
	"	b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n"
	"	b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n"
	"	b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n"
	"	\n"
	"	//Quantize into integer coordinates\n"
	"	//floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n"
	"	b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n"
	"	\n"
	"	int4 discretePosition;\n"
	"	discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n"
	"	discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n"
	"	discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n"
	"	\n"
	"	//Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n"
	"	discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n"
	"	discretePosition += 512;\n"
	"	\n"
	"	//Interleave bits(assign a morton code, also known as a z-curve)\n"
	"	unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n"
	"	\n"
	"	//\n"
	"	SortDataCL mortonCodeIndexPair;\n"
	"	mortonCodeIndexPair.m_key = mortonCode;\n"
	"	mortonCodeIndexPair.m_value = leafNodeIndex;\n"
	"	\n"
	"	out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n"
	"}\n"
	"#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n"
	"//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n"
	"//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n"
	"//In both cases, the bit should be cleared to access the actual node index.\n"
	"int isLeafNode(int index) { return (index >> 31 == 0); }\n"
	"int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n"
	"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n"
	"//From sap.cl\n"
	"#define NEW_PAIR_MARKER -1\n"
	"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n"
	"{\n"
	"	bool overlap = true;\n"
	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
	"	return overlap;\n"
	"}\n"
	"//From sap.cl\n"
	"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n"
	"											__global int* rootNodeIndex, \n"
	"											__global int2* internalNodeChildIndices, \n"
	"											__global b3AabbCL* internalNodeAabbs,\n"
	"											__global int2* internalNodeLeafIndexRanges,\n"
	"											\n"
	"											__global SortDataCL* mortonCodesAndAabbIndices,\n"
	"											__global int* out_numPairs, __global int4* out_overlappingPairs, \n"
	"											int maxPairs, int numQueryAabbs)\n"
	"{\n"
	"	//Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n"
	"	//mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n"
	"	int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
	"	if(queryBvhNodeIndex >= numQueryAabbs) return;\n"
	"	\n"
	"	int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n"
	"	b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n"
	"	\n"
	"	int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
	"	\n"
	"	int stackSize = 1;\n"
	"	stack[0] = *rootNodeIndex;\n"
	"	\n"
	"	while(stackSize)\n"
	"	{\n"
	"		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
	"		--stackSize;\n"
	"		\n"
	"		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false\n"
	"		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
	"		\n"
	"		//Optimization - if the BVH is structured as a binary radix tree, then\n"
	"		//each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n"
	"		//This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n"
	"		{\n"
	"			int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n"
	"			if(highestLeafIndex <= queryBvhNodeIndex) continue;\n"
	"		}\n"
	"		\n"
	"		//bvhRigidIndex is not used if internal node\n"
	"		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
	"	\n"
	"		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
	"		if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n"
	"		{\n"
	"			if(isLeaf)\n"
	"			{\n"
	"				int4 pair;\n"
	"				pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n"
	"				pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
	"				pair.z = NEW_PAIR_MARKER;\n"
	"				pair.w = NEW_PAIR_MARKER;\n"
	"				\n"
	"				int pairIndex = atomic_inc(out_numPairs);\n"
	"				if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
	"			}\n"
	"			\n"
	"			if(!isLeaf)	//Internal node\n"
	"			{\n"
	"				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
	"				{\n"
	"					//Error\n"
	"				}\n"
	"				else\n"
	"				{\n"
	"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
	"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"		\n"
	"	}\n"
	"}\n"
	"//From rayCastKernels.cl\n"
	"typedef struct\n"
	"{\n"
	"	float4 m_from;\n"
	"	float4 m_to;\n"
	"} b3RayInfo;\n"
	"//From rayCastKernels.cl\n"
	"b3Vector3 b3Vector3_normalize(b3Vector3 v)\n"
	"{\n"
	"	b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n"
	"	return normalize(normal);	//OpenCL normalize == vector4 normalize\n"
	"}\n"
	"b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n"
	"b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n"
	"int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n"
	"{\n"
	"	//AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n"
	"	//t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.\n"
	"	//\n"
	"	//if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n"
	"	//and min.x will be the far plane; otherwise, it is reversed.\n"
	"	//\n"
	"	//In order for there to be a collision, the t_min and t_max of each pair must overlap.\n"
	"	//This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n"
	"	\n"
	"	int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) );	//isless(x,y) returns (x < y)\n"
	"	\n"
	"	//When using vector types, the select() function checks the most signficant bit, \n"
	"	//but isless() sets the least significant bit.\n"
	"	isNegative <<= 31;\n"
	"	//select(b, a, condition) == condition ? a : b\n"
	"	//When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n"
	"	b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
	"	b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
	"	\n"
	"	b3Scalar t_min_final = 0.0f;\n"
	"	b3Scalar t_max_final = rayLength;\n"
	"	\n"
	"	//Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n"
	"	//Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n"
	"	//Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n"
	"	t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n"
	"	t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n"
	"	\n"
	"	return (t_min_final <= t_max_final);\n"
	"}\n"
	"__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n"
	"								__global int* rootNodeIndex, \n"
	"								__global int2* internalNodeChildIndices, \n"
	"								__global b3AabbCL* internalNodeAabbs,\n"
	"								__global int2* internalNodeLeafIndexRanges,\n"
	"								__global SortDataCL* mortonCodesAndAabbIndices,\n"
	"								\n"
	"								__global b3RayInfo* rays,\n"
	"								\n"
	"								__global int* out_numRayRigidPairs, \n"
	"								__global int2* out_rayRigidPairs,\n"
	"								int maxRayRigidPairs, int numRays)\n"
	"{\n"
	"	int rayIndex = get_global_id(0);\n"
	"	if(rayIndex >= numRays) return;\n"
	"	\n"
	"	//\n"
	"	b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
	"	b3Vector3 rayTo = rays[rayIndex].m_to;\n"
	"	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
	"	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
	"	\n"
	"	//\n"
	"	int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
	"	\n"
	"	int stackSize = 1;\n"
	"	stack[0] = *rootNodeIndex;\n"
	"	\n"
	"	while(stackSize)\n"
	"	{\n"
	"		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
	"		--stackSize;\n"
	"		\n"
	"		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false\n"
	"		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
	"		\n"
	"		//bvhRigidIndex is not used if internal node\n"
	"		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
	"	\n"
	"		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
	"		if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb)  )\n"
	"		{\n"
	"			if(isLeaf)\n"
	"			{\n"
	"				int2 rayRigidPair;\n"
	"				rayRigidPair.x = rayIndex;\n"
	"				rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
	"				\n"
	"				int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
	"				if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
	"			}\n"
	"			\n"
	"			if(!isLeaf)	//Internal node\n"
	"			{\n"
	"				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
	"				{\n"
	"					//Error\n"
	"				}\n"
	"				else\n"
	"				{\n"
	"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
	"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"	}\n"
	"}\n"
	"__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n"
	"									__global int* out_numPairs, __global int4* out_overlappingPairs, \n"
	"									int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n"
	"{\n"
	"	int smallAabbIndex = get_global_id(0);\n"
	"	if(smallAabbIndex >= numSmallAabbRigids) return;\n"
	"	\n"
	"	b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n"
	"	for(int i = 0; i < numLargeAabbRigids; ++i)\n"
	"	{\n"
	"		b3AabbCL largeAabb = largeAabbs[i];\n"
	"		if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n"
	"		{\n"
	"			int4 pair;\n"
	"			pair.x = largeAabb.m_minIndices[3];\n"
	"			pair.y = smallAabb.m_minIndices[3];\n"
	"			pair.z = NEW_PAIR_MARKER;\n"
	"			pair.w = NEW_PAIR_MARKER;\n"
	"			\n"
	"			int pairIndex = atomic_inc(out_numPairs);\n"
	"			if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
	"		}\n"
	"	}\n"
	"}\n"
	"__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n"
	"									__global int* out_numRayRigidPairs,  __global int2* out_rayRigidPairs,\n"
	"									int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n"
	"{\n"
	"	int rayIndex = get_global_id(0);\n"
	"	if(rayIndex >= numRays) return;\n"
	"	\n"
	"	b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
	"	b3Vector3 rayTo = rays[rayIndex].m_to;\n"
	"	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
	"	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
	"	\n"
	"	for(int i = 0; i < numLargeAabbRigids; ++i)\n"
	"	{\n"
	"		b3AabbCL rigidAabb = largeRigidAabbs[i];\n"
	"		if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n"
	"		{\n"
	"			int2 rayRigidPair;\n"
	"			rayRigidPair.x = rayIndex;\n"
	"			rayRigidPair.y = rigidAabb.m_minIndices[3];\n"
	"			\n"
	"			int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
	"			if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
	"		}\n"
	"	}\n"
	"}\n"
	"//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n"
	"//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n"
	"//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n"
	"//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n"
	"#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n"
	"#define B3_PLBVH_ROOT_NODE_MARKER -1\n"
	"#define b3Int64 long\n"
	"int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n"
	"b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n"
	"{\n"
	"	//This function only needs to return (i & j) in order for the algorithm to work,\n"
	"	//but it may help with debugging to mask out the lower bits.\n"
	"	b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n"
	"	b3Int64 sharedBits = i & j;\n"
	"	b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength);	//Set all bits after the common prefix to 0\n"
	"	\n"
	"	return sharedBits & bitmask;\n"
	"}\n"
	"//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths\n"
	"int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n"
	"{\n"
	"	return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n"
	"}\n"
	"__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n"
	"											__global b3Int64* out_commonPrefixes,\n"
	"											__global int* out_commonPrefixLengths,\n"
	"											int numInternalNodes)\n"
	"{\n"
	"	int internalNodeIndex = get_global_id(0);\n"
	"	if (internalNodeIndex >= numInternalNodes) return;\n"
	"	\n"
	"	//Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n"
	"	//and the number of internal nodes is always numLeafNodes - 1\n"
	"	int leftLeafIndex = internalNodeIndex;\n"
	"	int rightLeafIndex = internalNodeIndex + 1;\n"
	"	\n"
	"	int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n"
	"	int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n"
	"	\n"
	"	//Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n"
	"	//Append the index of each leaf node to each morton code so that there are no duplicates.\n"
	"	//The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n"
	"	//is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n"
	"	//\n"
	"	//upsample(a, b) == ( ((b3Int64)a) << 32) | b\n"
	"	b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n"
	"	b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n"
	"	\n"
	"	out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
	"	out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
	"}\n"
	"__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n"
	"											__global int2* out_childNodes, int numLeafNodes)\n"
	"{\n"
	"	int leafNodeIndex = get_global_id(0);\n"
	"	if (leafNodeIndex >= numLeafNodes) return;\n"
	"	\n"
	"	int numInternalNodes = numLeafNodes - 1;\n"
	"	\n"
	"	int leftSplitIndex = leafNodeIndex - 1;\n"
	"	int rightSplitIndex = leafNodeIndex;\n"
	"	\n"
	"	int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
	"	int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
	"	\n"
	"	//Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n"
	"	//Leaf nodes are considered as having the highest common prefix\n"
	"	int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n"
	"	\n"
	"	//Handle cases for the edge nodes; the first and last node\n"
	"	//For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n"
	"	if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n"
	"	if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n"
	"	\n"
	"	int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;\n"
	"	out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n"
	"	\n"
	"	int isRightChild = (isLeftHigherCommonPrefix);	//If the left node is the parent, then this node is its right child and vice versa\n"
	"	\n"
	"	//out_childNodesAsInt[0] == int2.x == left child\n"
	"	//out_childNodesAsInt[1] == int2.y == right child\n"
	"	int isLeaf = 1;\n"
	"	__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
	"	out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n"
	"}\n"
	"__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n"
	"												__global int2* out_childNodes,\n"
	"												__global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n"
	"												int numInternalNodes)\n"
	"{\n"
	"	int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
	"	if(internalNodeIndex >= numInternalNodes) return;\n"
	"	\n"
	"	b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n"
	"	int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n"
	"	\n"
	"//#define USE_LINEAR_SEARCH\n"
	"#ifdef USE_LINEAR_SEARCH\n"
	"	int leftIndex = -1;\n"
	"	int rightIndex = -1;\n"
	"	\n"
	"	//Find nearest element to left with a lower common prefix\n"
	"	for(int i = internalNodeIndex - 1; i >= 0; --i)\n"
	"	{\n"
	"		int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
	"		if(nodeLeftSharedPrefixLength < nodePrefixLength)\n"
	"		{\n"
	"			leftIndex = i;\n"
	"			break;\n"
	"		}\n"
	"	}\n"
	"	\n"
	"	//Find nearest element to right with a lower common prefix\n"
	"	for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n"
	"	{\n"
	"		int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
	"		if(nodeRightSharedPrefixLength < nodePrefixLength)\n"
	"		{\n"
	"			rightIndex = i;\n"
	"			break;\n"
	"		}\n"
	"	}\n"
	"	\n"
	"#else //Use binary search\n"
	"	//Find nearest element to left with a lower common prefix\n"
	"	int leftIndex = -1;\n"
	"	{\n"
	"		int lower = 0;\n"
	"		int upper = internalNodeIndex - 1;\n"
	"		\n"
	"		while(lower <= upper)\n"
	"		{\n"
	"			int mid = (lower + upper) / 2;\n"
	"			b3Int64 midPrefix = commonPrefixes[mid];\n"
	"			int midPrefixLength = commonPrefixLengths[mid];\n"
	"			\n"
	"			int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
	"			if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
	"			{\n"
	"				int right = mid + 1;\n"
	"				if(right < internalNodeIndex)\n"
	"				{\n"
	"					b3Int64 rightPrefix = commonPrefixes[right];\n"
	"					int rightPrefixLength = commonPrefixLengths[right];\n"
	"					\n"
	"					int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n"
	"					if(nodeRightSharedPrefixLength < nodePrefixLength) \n"
	"					{\n"
	"						lower = right;\n"
	"						leftIndex = right;\n"
	"					}\n"
	"					else \n"
	"					{\n"
	"						leftIndex = mid;\n"
	"						break;\n"
	"					}\n"
	"				}\n"
	"				else \n"
	"				{\n"
	"					leftIndex = mid;\n"
	"					break;\n"
	"				}\n"
	"			}\n"
	"			else upper = mid - 1;\n"
	"		}\n"
	"	}\n"
	"	\n"
	"	//Find nearest element to right with a lower common prefix\n"
	"	int rightIndex = -1;\n"
	"	{\n"
	"		int lower = internalNodeIndex + 1;\n"
	"		int upper = numInternalNodes - 1;\n"
	"		\n"
	"		while(lower <= upper)\n"
	"		{\n"
	"			int mid = (lower + upper) / 2;\n"
	"			b3Int64 midPrefix = commonPrefixes[mid];\n"
	"			int midPrefixLength = commonPrefixLengths[mid];\n"
	"			\n"
	"			int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
	"			if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
	"			{\n"
	"				int left = mid - 1;\n"
	"				if(left > internalNodeIndex)\n"
	"				{\n"
	"					b3Int64 leftPrefix = commonPrefixes[left];\n"
	"					int leftPrefixLength = commonPrefixLengths[left];\n"
	"					\n"
	"					int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n"
	"					if(nodeLeftSharedPrefixLength < nodePrefixLength) \n"
	"					{\n"
	"						upper = left;\n"
	"						rightIndex = left;\n"
	"					}\n"
	"					else \n"
	"					{\n"
	"						rightIndex = mid;\n"
	"						break;\n"
	"					}\n"
	"				}\n"
	"				else \n"
	"				{\n"
	"					rightIndex = mid;\n"
	"					break;\n"
	"				}\n"
	"			}\n"
	"			else lower = mid + 1;\n"
	"		}\n"
	"	}\n"
	"#endif\n"
	"	\n"
	"	//Select parent\n"
	"	{\n"
	"		int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
	"		int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
	"		\n"
	"		int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n"
	"		\n"
	"		if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n"
	"		else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n"
	"		\n"
	"		int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n"
	"		\n"
	"		int isRootNode = (leftIndex == -1 && rightIndex == -1);\n"
	"		out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n"
	"		\n"
	"		int isLeaf = 0;\n"
	"		if(!isRootNode)\n"
	"		{\n"
	"			int isRightChild = (isLeftHigherPrefixLength);	//If the left node is the parent, then this node is its right child and vice versa\n"
	"			\n"
	"			//out_childNodesAsInt[0] == int2.x == left child\n"
	"			//out_childNodesAsInt[1] == int2.y == right child\n"
	"			__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
	"			out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
	"		}\n"
	"		else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
	"	}\n"
	"}\n"
	"__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n"
	"									__global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n"
	"{\n"
	"	if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n"
	"	int internalNodeIndex = get_global_id(0);\n"
	"	if(internalNodeIndex >= numInternalNodes) return;\n"
	"	\n"
	"	//\n"
	"	int distanceFromRoot = 0;\n"
	"	{\n"
	"		int parentIndex = internalNodeParentNodes[internalNodeIndex];\n"
	"		while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n"
	"		{\n"
	"			parentIndex = internalNodeParentNodes[parentIndex];\n"
	"			++distanceFromRoot;\n"
	"		}\n"
	"	}\n"
	"	out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n"
	"	\n"
	"	//\n"
	"	__local int localMaxDistanceFromRoot;\n"
	"	if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	\n"
	"	atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	\n"
	"	if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n"
	"}\n"
	"__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n"
	"												__global int2* childNodes,\n"
	"												__global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n"
	"												int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n"
	"{\n"
	"	int internalNodeIndex = get_global_id(0);\n"
	"	if(internalNodeIndex >= numInternalNodes) return;\n"
	"	\n"
	"	int distance = distanceFromRoot[internalNodeIndex];\n"
	"	\n"
	"	if(distance == processedDistance)\n"
	"	{\n"
	"		int leftChildIndex = childNodes[internalNodeIndex].x;\n"
	"		int rightChildIndex = childNodes[internalNodeIndex].y;\n"
	"		\n"
	"		int isLeftChildLeaf = isLeafNode(leftChildIndex);\n"
	"		int isRightChildLeaf = isLeafNode(rightChildIndex);\n"
	"		\n"
	"		leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n"
	"		rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n"
	"		\n"
	"		//leftRigidIndex/rightRigidIndex is not used if internal node\n"
	"		int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n"
	"		int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n"
	"		\n"
	"		b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n"
	"		b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n"
	"		\n"
	"		b3AabbCL mergedAabb;\n"
	"		mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n"
	"		mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n"
	"		internalNodeAabbs[internalNodeIndex] = mergedAabb;\n"
	"	}\n"
	"}\n"
	"__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n"
	"{\n"
	"	int internalNodeIndex = get_global_id(0);\n"
	"	if(internalNodeIndex >= numInternalNodes) return;\n"
	"	\n"
	"	int numLeafNodes = numInternalNodes + 1;\n"
	"	\n"
	"	int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n"
	"	\n"
	"	int2 leafIndexRange;	//x == min leaf index, y == max leaf index\n"
	"	\n"
	"	//Find lowest leaf index covered by this internal node\n"
	"	{\n"
	"		int lowestIndex = childNodes.x;		//childNodes.x == Left child\n"
	"		while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n"
	"		leafIndexRange.x = lowestIndex;\n"
	"	}\n"
	"	\n"
	"	//Find highest leaf index covered by this internal node\n"
	"	{\n"
	"		int highestIndex = childNodes.y;	//childNodes.y == Right child\n"
	"		while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n"
	"		leafIndexRange.y = highestIndex;\n"
	"	}\n"
	"	\n"
	"	//\n"
	"	out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n"
	"}\n";

View file

@ -1,389 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
//Marker stored in the .z/.w components of a newly generated overlapping pair,
//indicating the pair has no associated narrowphase/cached data yet.
#define NEW_PAIR_MARKER -1
//Axis-aligned bounding box in world space.
//The unions let the unused 4th component of m_min/m_max double as integer
//storage; the BVH/SAP kernels in this file read m_minIndices[3] as the
//rigid body index (see e.g. plbvhCalculateOverlappingPairs above).
typedef struct
{
union
{
float4 m_min;
float m_minElems[4];
int m_minIndices[4];
};
union
{
float4 m_max;
float m_maxElems[4];
int m_maxIndices[4];
};
} btAabbCL;
/// conservative test for overlap between two aabbs
/// (second box resides in __local memory; w components are ignored)
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
{
	//The boxes overlap iff they are not separated along any of the x, z, y axes.
	bool separatedX = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x);
	bool separatedZ = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z);
	bool separatedY = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y);
	return !(separatedX || separatedZ || separatedY);
}
bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);
bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)
{
bool overlap = true;
overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
return overlap;
}
bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);
bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)
{
bool overlap = true;
overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
return overlap;
}
//2D brute-force pass over two index sets that share one unsorted AABB array:
//work item (i,j) tests unsortedAabbMapping[i] against unsortedAabbMapping2[j].
//Overlapping pairs are appended to pairsOut through the atomic pairCount; pairs
//past maxPairs are still counted but dropped, so the host can detect overflow.
__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)
{
int i = get_global_id(0);
if (i>=numUnsortedAabbs)
return;
int j = get_global_id(1);
if (j>=numUnSortedAabbs2)
return;
__global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];
__global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];
if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))
{
int4 myPair;
int xIndex = unsortedAabbPtr[0].m_minIndices[3];
int yIndex = unsortedAabbPtr2[0].m_minIndices[3];
//canonicalize pair order so (a,b) and (b,a) produce the same entry
if (xIndex>yIndex)
{
int tmp = xIndex;
xIndex=yIndex;
yIndex=tmp;
}
myPair.x = xIndex;
myPair.y = yIndex;
myPair.z = NEW_PAIR_MARKER;
myPair.w = NEW_PAIR_MARKER;
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
//O(n^2) reference kernel: work item i tests its AABB against every j>i.
//'axis' is unused here; kept so all computePairs* kernels share one signature.
__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
if (i>=numObjects)
return;
for (int j=i+1;j<numObjects;j++)
{
if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
{
int4 myPair;
myPair.x = aabbs[i].m_minIndices[3];
myPair.y = aabbs[j].m_minIndices[3];
myPair.z = NEW_PAIR_MARKER;
myPair.w = NEW_PAIR_MARKER;
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
}
//1-axis sweep-and-prune: expects 'aabbs' sorted by m_minElems[axis] (produced by
//flipFloatKernel + radix sort + scatterKernel), so the inner loop can stop as soon
//as the next box starts beyond this box's max on that axis.
__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
if (i>=numObjects)
return;
for (int j=i+1;j<numObjects;j++)
{
//sorted on 'axis': no later box can overlap once this holds
if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis]))
{
break;
}
if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
{
int4 myPair;
myPair.x = aabbs[i].m_minIndices[3];
myPair.y = aabbs[j].m_minIndices[3];
myPair.z = NEW_PAIR_MARKER;
myPair.w = NEW_PAIR_MARKER;
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
}
//Sweep-and-prune variant where the whole work-group iterates j in lockstep.
//Each item that has passed the end of its sweep "votes" once via breakRequest;
//the group keeps looping until every active item (numActiveWgItems) has voted.
//All items must reach every barrier, hence the vote flags instead of 'break'.
__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
int localId = get_local_id(0);
__local int numActiveWgItems[1];
__local int breakRequest[1];
if (localId==0)
{
numActiveWgItems[0] = 0;
breakRequest[0] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
//count how many items in this group participate in the vote
atomic_inc(numActiveWgItems);
barrier(CLK_LOCAL_MEM_FENCE);
int localBreak = 0;
int j=i+1;
do
{
barrier(CLK_LOCAL_MEM_FENCE);
if (j<numObjects)
{
//sorted sweep termination for this item: vote once, then idle
if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis]))
{
if (!localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//ran off the end of the array: also counts as done
if (j>=numObjects && !localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (!localBreak)
{
if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
{
int4 myPair;
myPair.x = aabbs[i].m_minIndices[3];
myPair.y = aabbs[j].m_minIndices[3];
myPair.z = NEW_PAIR_MARKER;
myPair.w = NEW_PAIR_MARKER;
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
j++;
} while (breakRequest[0]<numActiveWgItems[0]);
}
//Same lockstep sweep, but candidate AABBs are streamed through a 128-entry
//__local cache in blocks of 64 to reduce redundant global-memory reads.
//Assumes a work-group size of 64 (localId in [0,64), two cached entries each).
__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
int localId = get_local_id(0);
__local int numActiveWgItems[1];
__local int breakRequest[1];
__local btAabbCL localAabbs[128];// = aabbs[i];
btAabbCL myAabb;
//out-of-range items clamp to element 0 so every lane can still execute the loads
myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
float testValue = myAabb.m_maxElems[axis];
if (localId==0)
{
numActiveWgItems[0] = 0;
breakRequest[0] = 0;
}
int localCount=0;
int block=0;
//prime the two 64-entry halves of the local cache
localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
barrier(CLK_LOCAL_MEM_FENCE);
atomic_inc(numActiveWgItems);
barrier(CLK_LOCAL_MEM_FENCE);
int localBreak = 0;
int j=i+1;
do
{
barrier(CLK_LOCAL_MEM_FENCE);
if (j<numObjects)
{
//sweep termination against the cached candidate
if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis]))
{
if (!localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (j>=numObjects && !localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (!localBreak)
{
if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
{
int4 myPair;
myPair.x = myAabb.m_minIndices[3];
myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
myPair.z = NEW_PAIR_MARKER;
myPair.w = NEW_PAIR_MARKER;
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
localCount++;
//refill the cache with the next block of 64 candidates
if (localCount==64)
{
localCount = 0;
block+=64;
localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
}
j++;
} while (breakRequest[0]<numActiveWgItems[0]);
}
//http://stereopsis.com/radix.html
//Maps IEEE-754 float bits to an unsigned key with the same total ordering, so
//floats can be radix-sorted as plain integers. IFloatFlip is the inverse.
unsigned int FloatFlip(float fl);
unsigned int FloatFlip(float fl)
{
unsigned int f = *(unsigned int*)&fl;
unsigned int mask = -(int)(f >> 31) | 0x80000000;
return f ^ mask;
}
float IFloatFlip(unsigned int f);
float IFloatFlip(unsigned int f)
{
unsigned int mask = ((f >> 31) - 1) | 0x80000000;
unsigned int fl = f ^ mask;
return *(float*)&fl;
}
//Gathers allAabbs[src] into destAabbs[i], where src is the original index the
//host stored in destAabbs[i].m_maxIndices[3]; that index is preserved in the copy.
__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)
{
int i = get_global_id(0);
if (i>=numObjects)
return;
int src = destAabbs[i].m_maxIndices[3];
destAabbs[i] = allAabbs[src];
destAabbs[i].m_maxIndices[3] = src;
}
//Builds (sortable key, original index) pairs from each small AABB's min
//coordinate on 'axis', ready for the radix sort.
__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)
{
int i = get_global_id(0);
if (i>=numObjects)
return;
sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);
sortData[i].y = i;
}
//Writes AABBs out in sorted order using the permutation produced by the sort.
__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)
{
int i = get_global_id(0);
if (i>=numObjects)
return;
sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];
}
//Emits each small AABB's center c (sum) and c*c (sum2) so a reduction can
//compute the per-axis mean and variance of the centers (used to pick a sweep axis).
__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)
{
int i = get_global_id(0);
if (i>=numAabbs)
return;
btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];
float4 s;
s = (smallAabb.m_max+smallAabb.m_min)*0.5f;
sum[i]=s;
sum2[i]=s*s;
}

View file

@ -1,341 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
//NOTE: do not edit this string by hand - change the corresponding sap .cl kernel source and re-run stringify.
static const char* sapCL =
	"/*\n"
	"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
	"This software is provided 'as-is', without any express or implied warranty.\n"
	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
	"Permission is granted to anyone to use this software for any purpose, \n"
	"including commercial applications, and to alter it and redistribute it freely, \n"
	"subject to the following restrictions:\n"
	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
	"3. This notice may not be removed or altered from any source distribution.\n"
	"*/\n"
	"//Originally written by Erwin Coumans\n"
	"#define NEW_PAIR_MARKER -1\n"
	"typedef struct \n"
	"{\n"
	"	union\n"
	"	{\n"
	"		float4	m_min;\n"
	"		float   m_minElems[4];\n"
	"		int			m_minIndices[4];\n"
	"	};\n"
	"	union\n"
	"	{\n"
	"		float4	m_max;\n"
	"		float   m_maxElems[4];\n"
	"		int			m_maxIndices[4];\n"
	"	};\n"
	"} btAabbCL;\n"
	"/// conservative test for overlap between two aabbs\n"
	"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
	"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
	"{\n"
	"	bool overlap = true;\n"
	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
	"	return overlap;\n"
	"}\n"
	"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
	"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
	"{\n"
	"	bool overlap = true;\n"
	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
	"	return overlap;\n"
	"}\n"
	"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
	"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
	"{\n"
	"	bool overlap = true;\n"
	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
	"	return overlap;\n"
	"}\n"
	"__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping,  __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numUnsortedAabbs)\n"
	"		return;\n"
	"	int j = get_global_id(1);\n"
	"	if (j>=numUnSortedAabbs2)\n"
	"		return;\n"
	"	__global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n"
	"	__global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n"
	"	if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n"
	"	{\n"
	"		int4 myPair;\n"
	"		\n"
	"		int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n"
	"		int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n"
	"		if (xIndex>yIndex)\n"
	"		{\n"
	"			int tmp = xIndex;\n"
	"			xIndex=yIndex;\n"
	"			yIndex=tmp;\n"
	"		}\n"
	"		\n"
	"		myPair.x = xIndex;\n"
	"		myPair.y = yIndex;\n"
	"		myPair.z = NEW_PAIR_MARKER;\n"
	"		myPair.w = NEW_PAIR_MARKER;\n"
	"		int curPair = atomic_inc (pairCount);\n"
	"		if (curPair<maxPairs)\n"
	"		{\n"
	"			pairsOut[curPair] = myPair; //flush to main memory\n"
	"		}\n"
	"	}\n"
	"}\n"
	"__kernel void   computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	for (int j=i+1;j<numObjects;j++)\n"
	"	{\n"
	"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
	"		{\n"
	"			int4 myPair;\n"
	"			myPair.x = aabbs[i].m_minIndices[3];\n"
	"			myPair.y = aabbs[j].m_minIndices[3];\n"
	"			myPair.z = NEW_PAIR_MARKER;\n"
	"			myPair.w = NEW_PAIR_MARKER;\n"
	"			int curPair = atomic_inc (pairCount);\n"
	"			if (curPair<maxPairs)\n"
	"			{\n"
	"				pairsOut[curPair] = myPair; //flush to main memory\n"
	"			}\n"
	"		}\n"
	"	}\n"
	"}\n"
	"__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	for (int j=i+1;j<numObjects;j++)\n"
	"	{\n"
	"		if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
	"		{\n"
	"			break;\n"
	"		}\n"
	"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
	"		{\n"
	"			int4 myPair;\n"
	"			myPair.x = aabbs[i].m_minIndices[3];\n"
	"			myPair.y = aabbs[j].m_minIndices[3];\n"
	"			myPair.z = NEW_PAIR_MARKER;\n"
	"			myPair.w = NEW_PAIR_MARKER;\n"
	"			int curPair = atomic_inc (pairCount);\n"
	"			if (curPair<maxPairs)\n"
	"			{\n"
	"				pairsOut[curPair] = myPair; //flush to main memory\n"
	"			}\n"
	"		}\n"
	"	}\n"
	"}\n"
	"__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	int localId = get_local_id(0);\n"
	"	__local int numActiveWgItems[1];\n"
	"	__local int breakRequest[1];\n"
	"	if (localId==0)\n"
	"	{\n"
	"		numActiveWgItems[0] = 0;\n"
	"		breakRequest[0] = 0;\n"
	"	}\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	atomic_inc(numActiveWgItems);\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	int localBreak = 0;\n"
	"	int j=i+1;\n"
	"	do\n"
	"	{\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	\n"
	"		if (j<numObjects)\n"
	"		{\n"
	"	  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
	"			{\n"
	"				if (!localBreak)\n"
	"				{\n"
	"					atomic_inc(breakRequest);\n"
	"					localBreak = 1;\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"	\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		\n"
	"		if (j>=numObjects && !localBreak)\n"
	"		{\n"
	"			atomic_inc(breakRequest);\n"
	"			localBreak = 1;\n"
	"		}\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		\n"
	"		if (!localBreak)\n"
	"		{\n"
	"			if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
	"			{\n"
	"				int4 myPair;\n"
	"				myPair.x = aabbs[i].m_minIndices[3];\n"
	"				myPair.y = aabbs[j].m_minIndices[3];\n"
	"				myPair.z = NEW_PAIR_MARKER;\n"
	"				myPair.w = NEW_PAIR_MARKER;\n"
	"				int curPair = atomic_inc (pairCount);\n"
	"				if (curPair<maxPairs)\n"
	"				{\n"
	"					pairsOut[curPair] = myPair; //flush to main memory\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"		j++;\n"
	"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
	"}\n"
	"__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	int localId = get_local_id(0);\n"
	"	__local int numActiveWgItems[1];\n"
	"	__local int breakRequest[1];\n"
	"	__local btAabbCL localAabbs[128];// = aabbs[i];\n"
	"	\n"
	"	btAabbCL myAabb;\n"
	"	\n"
	"	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
	"	float testValue = 	myAabb.m_maxElems[axis];\n"
	"	\n"
	"	if (localId==0)\n"
	"	{\n"
	"		numActiveWgItems[0] = 0;\n"
	"		breakRequest[0] = 0;\n"
	"	}\n"
	"	int localCount=0;\n"
	"	int block=0;\n"
	"	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
	"	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
	"	\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	atomic_inc(numActiveWgItems);\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	int localBreak = 0;\n"
	"	\n"
	"	int j=i+1;\n"
	"	do\n"
	"	{\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	\n"
	"		if (j<numObjects)\n"
	"		{\n"
	"	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
	"			{\n"
	"				if (!localBreak)\n"
	"				{\n"
	"					atomic_inc(breakRequest);\n"
	"					localBreak = 1;\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"	\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		\n"
	"		if (j>=numObjects && !localBreak)\n"
	"		{\n"
	"			atomic_inc(breakRequest);\n"
	"			localBreak = 1;\n"
	"		}\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		\n"
	"		if (!localBreak)\n"
	"		{\n"
	"			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
	"			{\n"
	"				int4 myPair;\n"
	"				myPair.x = myAabb.m_minIndices[3];\n"
	"				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
	"				myPair.z = NEW_PAIR_MARKER;\n"
	"				myPair.w = NEW_PAIR_MARKER;\n"
	"				int curPair = atomic_inc (pairCount);\n"
	"				if (curPair<maxPairs)\n"
	"				{\n"
	"					pairsOut[curPair] = myPair; //flush to main memory\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"		\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		localCount++;\n"
	"		if (localCount==64)\n"
	"		{\n"
	"			localCount = 0;\n"
	"			block+=64;			\n"
	"			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
	"			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
	"		}\n"
	"		j++;\n"
	"		\n"
	"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
	"	\n"
	"}\n"
	"//http://stereopsis.com/radix.html\n"
	"unsigned int FloatFlip(float fl);\n"
	"unsigned int FloatFlip(float fl)\n"
	"{\n"
	"	unsigned int f = *(unsigned int*)&fl;\n"
	"	unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
	"	return f ^ mask;\n"
	"}\n"
	"float IFloatFlip(unsigned int f);\n"
	"float IFloatFlip(unsigned int f)\n"
	"{\n"
	"	unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
	"	unsigned int fl = f ^ mask;\n"
	"	return *(float*)&fl;\n"
	"}\n"
	"__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	int src = destAabbs[i].m_maxIndices[3];\n"
	"	destAabbs[i] = allAabbs[src];\n"
	"	destAabbs[i].m_maxIndices[3] = src;\n"
	"}\n"
	"__kernel void   flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	\n"
	"	\n"
	"	sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n"
	"	sortData[i].y = i;\n"
	"		\n"
	"}\n"
	"__kernel void   scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	\n"
	"	sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n"
	"}\n"
	"__kernel void   prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numAabbs)\n"
	"		return;\n"
	"	\n"
	"	btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n"
	"	\n"
	"	float4 s;\n"
	"	s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n"
	"	sum[i]=s;\n"
	"	sum2[i]=s*s;	\n"
	"}\n";

View file

@ -1,77 +0,0 @@
# Build script for the Bullet3OpenCL_clew library: the GPU (OpenCL) broadphase,
# narrowphase and rigid-body pipeline, loading OpenCL at runtime through clew.
INCLUDE_DIRECTORIES( ${BULLET_PHYSICS_SOURCE_DIR}/src )
# Use the clew loader instead of linking OpenCL directly (see b3OpenCLInclude.h).
ADD_DEFINITIONS(-DB3_USE_CLEW)
SET(Bullet3OpenCL_clew_SRCS
../clew/clew.c
BroadphaseCollision/b3GpuGridBroadphase.cpp
BroadphaseCollision/b3GpuSapBroadphase.cpp
BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp
BroadphaseCollision/b3GpuParallelLinearBvh.cpp
Initialize/b3OpenCLUtils.cpp
NarrowphaseCollision/b3ContactCache.cpp
NarrowphaseCollision/b3ConvexHullContact.cpp
NarrowphaseCollision/b3GjkEpa.cpp
NarrowphaseCollision/b3OptimizedBvh.cpp
NarrowphaseCollision/b3QuantizedBvh.cpp
NarrowphaseCollision/b3StridingMeshInterface.cpp
NarrowphaseCollision/b3TriangleCallback.cpp
NarrowphaseCollision/b3TriangleIndexVertexArray.cpp
NarrowphaseCollision/b3VoronoiSimplexSolver.cpp
ParallelPrimitives/b3BoundSearchCL.cpp
ParallelPrimitives/b3FillCL.cpp
ParallelPrimitives/b3LauncherCL.cpp
ParallelPrimitives/b3PrefixScanCL.cpp
ParallelPrimitives/b3PrefixScanFloat4CL.cpp
ParallelPrimitives/b3RadixSort32CL.cpp
Raycast/b3GpuRaycast.cpp
RigidBody/b3GpuGenericConstraint.cpp
RigidBody/b3GpuJacobiContactSolver.cpp
RigidBody/b3GpuNarrowPhase.cpp
RigidBody/b3GpuPgsConstraintSolver.cpp
RigidBody/b3GpuPgsContactSolver.cpp
RigidBody/b3GpuRigidBodyPipeline.cpp
RigidBody/b3Solver.cpp
)
SET(Bullet3OpenCL_clew_HDRS
# ${Root_HDRS}
)
ADD_LIBRARY(Bullet3OpenCL_clew ${Bullet3OpenCL_clew_SRCS} ${Bullet3OpenCL_clew_HDRS})
SET_TARGET_PROPERTIES(Bullet3OpenCL_clew PROPERTIES VERSION ${BULLET_VERSION})
SET_TARGET_PROPERTIES(Bullet3OpenCL_clew PROPERTIES SOVERSION ${BULLET_VERSION})
IF (BUILD_SHARED_LIBS)
# CMAKE_DL_LIBS is needed because clew loads the OpenCL library at runtime.
TARGET_LINK_LIBRARIES(Bullet3OpenCL_clew LinearMath Bullet3Dynamics ${CMAKE_DL_LIBS})
ENDIF (BUILD_SHARED_LIBS)
IF (INSTALL_LIBS)
IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
#INSTALL of other files requires CMake 2.6
IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
INSTALL(TARGETS Bullet3OpenCL_clew DESTINATION .)
ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
INSTALL(TARGETS Bullet3OpenCL_clew RUNTIME DESTINATION bin
LIBRARY DESTINATION lib${LIB_SUFFIX}
ARCHIVE DESTINATION lib${LIB_SUFFIX})
INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DESTINATION ${INCLUDE_INSTALL_DIR} FILES_MATCHING PATTERN "*.h" PATTERN ".svn" EXCLUDE PATTERN "CMakeFiles" EXCLUDE)
# INSTALL(FILES ../btBullet3OpenCL_clewCommon.h
#DESTINATION ${INCLUDE_INSTALL_DIR}/Bullet3OpenCL_clew)
ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
SET_TARGET_PROPERTIES(Bullet3OpenCL_clew PROPERTIES FRAMEWORK true)
SET_TARGET_PROPERTIES(Bullet3OpenCL_clew PROPERTIES PUBLIC_HEADER "${Root_HDRS}")
# Have to list out sub-directories manually:
SET_PROPERTY(SOURCE ${BroadphaseCollision_HDRS} PROPERTY MACOSX_PACKAGE_LOCATION Headers/BroadphaseCollision)
ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
ENDIF (INSTALL_LIBS)

View file

@ -1,51 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_OPENCL_INCLUDE_H
#define B3_OPENCL_INCLUDE_H
//Central OpenCL header selection: clew (runtime loader) takes priority, then
//MiniCL, then the platform SDK headers (Apple framework vs. standard CL/cl.h).
#ifdef B3_USE_CLEW
#include "clew/clew.h"
#else
#ifdef __APPLE__
#ifdef USE_MINICL
#include <MiniCL/cl.h>
#else
#include <OpenCL/cl.h>
#include <OpenCL/cl_ext.h> //clLogMessagesToStderrAPPLE
#endif
#else
#ifdef USE_MINICL
#include <MiniCL/cl.h>
#else
#include <CL/cl.h>
#ifdef _WIN32
#include "CL/cl_gl.h"
#endif //_WIN32
#endif
#endif //__APPLE__
#endif //B3_USE_CLEW
#include <assert.h>
#include <stdio.h>
//Debug helper: prints and asserts when an OpenCL status code 'a' differs from
//the expected value 'b' (typically CL_SUCCESS). Evaluates 'a' multiple times.
#define oclCHECKERROR(a, b) \
if ((a) != (b)) \
{ \
printf("OCL Error : %d\n", (a)); \
assert((a) == (b)); \
}
#endif //B3_OPENCL_INCLUDE_H

View file

@ -1,963 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Original author: Roman Ponomarev
//Mostly Reimplemented by Erwin Coumans
//Debug switches (referenced by the kernel-compilation path): force compiling
//kernels from source, and skip loading previously cached program binaries.
bool gDebugForceLoadingFromSource = false;
bool gDebugSkipLoadingBinary = false;
#include "Bullet3Common/b3Logging.h"
#include <string.h>
#ifdef _WIN32
#pragma warning(disable : 4996)
#endif
#include "b3OpenCLUtils.h"
//#include "b3OpenCLInclude.h"
#include <stdio.h>
#include <stdlib.h>
#define B3_MAX_CL_DEVICES 16 //who needs 16 devices?
#ifdef _WIN32
#include <windows.h>
#endif
#include <assert.h>
#define b3Assert assert
#ifndef _WIN32
#include <sys/stat.h>
#endif
//Directory used for cached compiled kernel binaries; overridable at runtime
//via b3OpenCLUtils_setCachePath().
static const char* sCachedBinaryPath = "cache";
//Set the preferred platform vendor using the OpenCL SDK
//(compile-time choice; used to move the matching platform to the front of the
//candidate list in b3OpenCLUtils_createContextFromType).
static const char* spPlatformVendor =
#if defined(CL_PLATFORM_MINI_CL)
"MiniCL, SCEA";
#elif defined(CL_PLATFORM_AMD)
"Advanced Micro Devices, Inc.";
#elif defined(CL_PLATFORM_NVIDIA)
"NVIDIA Corporation";
#elif defined(CL_PLATFORM_INTEL)
"Intel(R) Corporation";
#elif defined(B3_USE_CLEW)
"clew (OpenCL Extension Wrangler library)";
#else
"Unknown Vendor";
#endif
#ifndef CL_PLATFORM_MINI_CL
#ifdef _WIN32
#ifndef B3_USE_CLEW
#include "CL/cl_gl.h"
#endif //B3_USE_CLEW
#endif //_WIN32
#endif
void MyFatalBreakAPPLE(const char* errstr,
const void* private_info,
size_t cb,
void* user_data)
{
const char* patloc = strstr(errstr, "Warning");
//find out if it is a warning or error, exit if error
if (patloc)
{
b3Warning("Warning: %s\n", errstr);
}
else
{
b3Error("Error: %s\n", errstr);
b3Assert(0);
}
}
#ifdef B3_USE_CLEW
//Loads the OpenCL dynamic library through clew. Returns CLEW_SUCCESS (0) on
//success, or the clew error code. On Linux it first probes "libOpenCL.so.1"
//and falls back to "libOpenCL.so" when that probe fails.
int b3OpenCLUtils_clewInit()
{
int result = -1;
#ifdef _WIN32
const char* cl = "OpenCL.dll";
#elif defined __APPLE__
const char* cl = "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL";
#else //presumable Linux? \
//linux (tested on Ubuntu 12.10 with Catalyst 13.4 beta drivers, not that there is no symbolic link from libOpenCL.so
const char* cl = "libOpenCL.so.1";
//probe the versioned name first; on success unload so the final clewInit below reloads it
result = clewInit(cl);
if (result != CLEW_SUCCESS)
{
cl = "libOpenCL.so";
}
else
{
clewExit();
}
#endif
result = clewInit(cl);
if (result != CLEW_SUCCESS)
{
b3Error("clewInit failed with error code %d\n", result);
}
else
{
b3Printf("clewInit succesfull using %s\n", cl);
}
return result;
}
#endif
//Returns the number of available OpenCL platforms (0 when none or on failure).
//On failure *pErrNum (if non-NULL) receives the OpenCL error code.
int b3OpenCLUtils_getNumPlatforms(cl_int* pErrNum)
{
#ifdef B3_USE_CLEW
b3OpenCLUtils_clewInit();
#endif
cl_platform_id pPlatforms[10] = {0};
cl_uint numPlatforms = 0;
//numPlatforms receives the total available count even if more than 10 exist;
//the pPlatforms array contents are not used.
cl_int ciErrNum = clGetPlatformIDs(10, pPlatforms, &numPlatforms);
//cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
if (ciErrNum != CL_SUCCESS)
{
if (pErrNum != NULL)
*pErrNum = ciErrNum;
}
return numPlatforms;
}
//Returns the compile-time preferred platform vendor string (spPlatformVendor).
const char* b3OpenCLUtils_getSdkVendorName()
{
return spPlatformVendor;
}
//Overrides the kernel-binary cache directory (default "cache").
//NOTE: the pointer is stored as-is; the caller must keep the string alive.
void b3OpenCLUtils_setCachePath(const char* path)
{
sCachedBinaryPath = path;
}
//Returns the cl_platform_id at index platformIndex0, or 0 when the index is
//out of range or the platform query fails. On failure *pErrNum (if non-NULL)
//receives the OpenCL error code.
cl_platform_id b3OpenCLUtils_getPlatform(int platformIndex0, cl_int* pErrNum)
{
#ifdef B3_USE_CLEW
	b3OpenCLUtils_clewInit();
#endif

	cl_platform_id platform = 0;
	unsigned int platformIndex = (unsigned int)platformIndex0;
	cl_uint numPlatforms;
	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);

	if (platformIndex < numPlatforms)
	{
		cl_platform_id* platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * numPlatforms);
		ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
		if (ciErrNum != CL_SUCCESS)
		{
			if (pErrNum != NULL)
				*pErrNum = ciErrNum;
			//bugfix: this early-out previously leaked 'platforms'
			free(platforms);
			return platform;
		}

		platform = platforms[platformIndex];

		free(platforms);
	}
	return platform;
}
//Fills platformInfo with the vendor, name and version strings of 'platform'.
//Asserts (via oclCHECKERROR) if any query fails.
void b3OpenCLUtils::getPlatformInfo(cl_platform_id platform, b3OpenCLPlatformInfo* platformInfo)
{
b3Assert(platform);
cl_int ciErrNum;
ciErrNum = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, B3_MAX_STRING_LENGTH, platformInfo->m_platformVendor, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
ciErrNum = clGetPlatformInfo(platform, CL_PLATFORM_NAME, B3_MAX_STRING_LENGTH, platformInfo->m_platformName, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
ciErrNum = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, B3_MAX_STRING_LENGTH, platformInfo->m_platformVersion, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
//Queries and prints the vendor/name/version of 'platform' through b3Printf.
void b3OpenCLUtils_printPlatformInfo(cl_platform_id platform)
{
b3OpenCLPlatformInfo platformInfo;
b3OpenCLUtils::getPlatformInfo(platform, &platformInfo);
b3Printf("Platform info:\n");
b3Printf("  CL_PLATFORM_VENDOR: \t\t\t%s\n", platformInfo.m_platformVendor);
b3Printf("  CL_PLATFORM_NAME: \t\t\t%s\n", platformInfo.m_platformName);
b3Printf("  CL_PLATFORM_VERSION: \t\t\t%s\n", platformInfo.m_platformVersion);
}
//Creates an OpenCL context on 'platform' for devices of 'deviceType'.
//If pGLContext/pGLDC are provided (Windows, non-clew builds), GL-sharing
//properties are added and the first device that accepts them is used.
//Otherwise either the preferred device index or all devices are used.
//Returns 0 on failure; *pErrNum (if non-NULL) receives the last OpenCL status.
cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex)
{
cl_context retContext = 0;
cl_int ciErrNum = 0;
cl_uint num_entries;
cl_device_id devices[B3_MAX_CL_DEVICES];
cl_uint num_devices;
cl_context_properties* cprops;
/*
* If we could find our platform, use it. Otherwise pass a NULL and get whatever the
* implementation thinks we should be using.
*/
cl_context_properties cps[7] = {0, 0, 0, 0, 0, 0, 0};
cps[0] = CL_CONTEXT_PLATFORM;
cps[1] = (cl_context_properties)platform;
#ifdef _WIN32
#ifndef B3_USE_CLEW
//request OpenGL interop when a GL context and device context are supplied
if (pGLContext && pGLDC)
{
cps[2] = CL_GL_CONTEXT_KHR;
cps[3] = (cl_context_properties)pGLContext;
cps[4] = CL_WGL_HDC_KHR;
cps[5] = (cl_context_properties)pGLDC;
}
#endif //B3_USE_CLEW
#endif //_WIN32
num_entries = B3_MAX_CL_DEVICES;
//NOTE(review): -1 assigned to an unsigned cl_uint; acts as a sentinel overwritten below
num_devices = -1;
ciErrNum = clGetDeviceIDs(
platform,
deviceType,
num_entries,
devices,
&num_devices);
if (ciErrNum < 0)
{
b3Printf("clGetDeviceIDs returned %d\n", ciErrNum);
return 0;
}
cprops = (NULL == platform) ? NULL : cps;
if (!num_devices)
return 0;
if (pGLContext)
{
//search for the GPU that relates to the OpenCL context
unsigned int i;
for (i = 0; i < num_devices; i++)
{
retContext = clCreateContext(cprops, 1, &devices[i], NULL, NULL, &ciErrNum);
if (ciErrNum == CL_SUCCESS)
break;
}
}
else
{
if (preferredDeviceIndex >= 0 && (unsigned int)preferredDeviceIndex < num_devices)
{
//create a context of the preferred device index
retContext = clCreateContext(cprops, 1, &devices[preferredDeviceIndex], NULL, NULL, &ciErrNum);
}
else
{
//create a context of all devices
#if defined(__APPLE__)
//install the fatal-break callback so context errors are logged/asserted
retContext = clCreateContext(cprops, num_devices, devices, MyFatalBreakAPPLE, NULL, &ciErrNum);
#else
b3Printf("numDevices=%d\n", num_devices);
retContext = clCreateContext(cprops, num_devices, devices, NULL, NULL, &ciErrNum);
#endif
}
}
if (pErrNum != NULL)
{
*pErrNum = ciErrNum;
};
return retContext;
}
//Creates an OpenCL context for the first platform that yields one, after
//moving the preferred platform to the front of the candidate list. An explicit
//preferredPlatformIndex wins over a vendor-string match against spPlatformVendor.
//Optionally returns the chosen platform via retPlatformId. Returns NULL on
//failure, with *pErrNum (if non-NULL) set to the last OpenCL status.
cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex, cl_platform_id* retPlatformId)
{
#ifdef B3_USE_CLEW
	b3OpenCLUtils_clewInit();
#endif

	cl_uint numPlatforms;
	cl_context retContext = 0;
	unsigned int i;

	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
	if (ciErrNum != CL_SUCCESS)
	{
		if (pErrNum != NULL) *pErrNum = ciErrNum;
		return NULL;
	}
	if (numPlatforms > 0)
	{
		cl_platform_id* platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * numPlatforms);
		ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
		if (ciErrNum != CL_SUCCESS)
		{
			if (pErrNum != NULL)
				*pErrNum = ciErrNum;
			free(platforms);
			return NULL;
		}

		//move the preferred platform (by index, else by vendor match) to slot 0
		for (i = 0; i < numPlatforms; ++i)
		{
			char pbuf[128];
			ciErrNum = clGetPlatformInfo(platforms[i],
										 CL_PLATFORM_VENDOR,
										 sizeof(pbuf),
										 pbuf,
										 NULL);
			if (ciErrNum != CL_SUCCESS)
			{
				if (pErrNum != NULL) *pErrNum = ciErrNum;
				//bugfix: this early-out previously leaked 'platforms'
				free(platforms);
				return NULL;
			}

			//cast avoids a signed/unsigned comparison; preferredPlatformIndex >= 0 here
			if (preferredPlatformIndex >= 0 && (int)i == preferredPlatformIndex)
			{
				cl_platform_id tmpPlatform = platforms[0];
				platforms[0] = platforms[i];
				platforms[i] = tmpPlatform;
				break;
			}
			else
			{
				if (!strcmp(pbuf, spPlatformVendor))
				{
					cl_platform_id tmpPlatform = platforms[0];
					platforms[0] = platforms[i];
					platforms[i] = tmpPlatform;
				}
			}
		}

		//try each platform in order until one produces a usable context
		for (i = 0; i < numPlatforms; ++i)
		{
			cl_platform_id platform = platforms[i];
			assert(platform);

			retContext = b3OpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLContext, pGLDC, preferredDeviceIndex, preferredPlatformIndex);

			if (retContext)
			{
				//			printf("OpenCL platform details:\n");
				b3OpenCLPlatformInfo platformInfo;

				b3OpenCLUtils::getPlatformInfo(platform, &platformInfo);

				if (retPlatformId)
					*retPlatformId = platform;

				break;
			}
		}

		free(platforms);
	}
	return retContext;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when out of range
//! @param cxMainContext OpenCL context
//! @param device_idx index of the device of interest
//////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id, or (cl_device_id)-1 when deviceIndex is out of range
//! @param cxMainContext OpenCL context
//! @param deviceIndex   index of the device of interest
//////////////////////////////////////////////////////////////////////////////
cl_device_id b3OpenCLUtils_getDevice(cl_context cxMainContext, int deviceIndex)
{
	assert(cxMainContext);

	size_t szParmDataBytes;
	cl_device_id* cdDevices;
	cl_device_id device;

	// get the size of the device list associated with the context
	clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);

	// fix: the original check used '<', which accepted deviceIndex == deviceCount
	// and then read one element past the end of cdDevices below
	if (deviceIndex < 0 || szParmDataBytes / sizeof(cl_device_id) <= (size_t)deviceIndex)
	{
		return (cl_device_id)-1;
	}

	cdDevices = (cl_device_id*)malloc(szParmDataBytes);
	clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);

	device = cdDevices[deviceIndex];
	free(cdDevices);

	return device;
}
//! Returns the number of devices associated with the given OpenCL context.
int b3OpenCLUtils_getNumDevices(cl_context cxMainContext)
{
	// query only the byte size of the device list, then convert to a count
	size_t deviceListBytes = 0;
	clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &deviceListBytes);
	return (int)(deviceListBytes / sizeof(cl_device_id));
}
///Fills every field of 'info' with the capabilities of 'device', one
///clGetDeviceInfo query per field (name, vendor, driver version, memory
///sizes, work-group limits, image limits, preferred vector widths, extensions).
///Note: the return codes of the individual queries are not checked; on a
///failed query the corresponding field is simply left unmodified.
void b3OpenCLUtils::getDeviceInfo(cl_device_id device, b3OpenCLDeviceInfo* info)
{
	// CL_DEVICE_NAME
	clGetDeviceInfo(device, CL_DEVICE_NAME, B3_MAX_STRING_LENGTH, &info->m_deviceName, NULL);
	// CL_DEVICE_VENDOR
	clGetDeviceInfo(device, CL_DEVICE_VENDOR, B3_MAX_STRING_LENGTH, &info->m_deviceVendor, NULL);
	// CL_DRIVER_VERSION
	clGetDeviceInfo(device, CL_DRIVER_VERSION, B3_MAX_STRING_LENGTH, &info->m_driverVersion, NULL);
	// CL_DEVICE_INFO
	clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &info->m_deviceType, NULL);
	// CL_DEVICE_MAX_COMPUTE_UNITS
	clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(info->m_computeUnits), &info->m_computeUnits, NULL);
	// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(info->m_workitemDims), &info->m_workitemDims, NULL);
	// CL_DEVICE_MAX_WORK_ITEM_SIZES
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(info->m_workItemSize), &info->m_workItemSize, NULL);
	// CL_DEVICE_MAX_WORK_GROUP_SIZE
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(info->m_workgroupSize), &info->m_workgroupSize, NULL);
	// CL_DEVICE_MAX_CLOCK_FREQUENCY
	clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(info->m_clockFrequency), &info->m_clockFrequency, NULL);
	// CL_DEVICE_ADDRESS_BITS
	clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(info->m_addressBits), &info->m_addressBits, NULL);
	// CL_DEVICE_MAX_MEM_ALLOC_SIZE
	clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(info->m_maxMemAllocSize), &info->m_maxMemAllocSize, NULL);
	// CL_DEVICE_GLOBAL_MEM_SIZE
	clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(info->m_globalMemSize), &info->m_globalMemSize, NULL);
	// CL_DEVICE_ERROR_CORRECTION_SUPPORT
	clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(info->m_errorCorrectionSupport), &info->m_errorCorrectionSupport, NULL);
	// CL_DEVICE_LOCAL_MEM_TYPE
	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(info->m_localMemType), &info->m_localMemType, NULL);
	// CL_DEVICE_LOCAL_MEM_SIZE
	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(info->m_localMemSize), &info->m_localMemSize, NULL);
	// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
	clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(info->m_constantBufferSize), &info->m_constantBufferSize, NULL);
	// CL_DEVICE_QUEUE_PROPERTIES
	clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(info->m_queueProperties), &info->m_queueProperties, NULL);
	// CL_DEVICE_IMAGE_SUPPORT
	clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(info->m_imageSupport), &info->m_imageSupport, NULL);
	// CL_DEVICE_MAX_READ_IMAGE_ARGS
	clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(info->m_maxReadImageArgs), &info->m_maxReadImageArgs, NULL);
	// CL_DEVICE_MAX_WRITE_IMAGE_ARGS
	clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(info->m_maxWriteImageArgs), &info->m_maxWriteImageArgs, NULL);
	// CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH
	clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &info->m_image2dMaxWidth, NULL);
	clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &info->m_image2dMaxHeight, NULL);
	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &info->m_image3dMaxWidth, NULL);
	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &info->m_image3dMaxHeight, NULL);
	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &info->m_image3dMaxDepth, NULL);
	// CL_DEVICE_EXTENSIONS: get device extensions, and if any then parse & log the string onto separate lines
	clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, B3_MAX_STRING_LENGTH, &info->m_deviceExtensions, NULL);
	// CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &info->m_vecWidthChar, NULL);
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &info->m_vecWidthShort, NULL);
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &info->m_vecWidthInt, NULL);
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &info->m_vecWidthLong, NULL);
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &info->m_vecWidthFloat, NULL);
	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &info->m_vecWidthDouble, NULL);
}
///Queries 'device' via b3OpenCLUtils::getDeviceInfo and dumps every field to
///the b3Printf log in the usual "CL_DEVICE_*" report format.
void b3OpenCLUtils_printDeviceInfo(cl_device_id device)
{
	b3OpenCLDeviceInfo info;
	b3OpenCLUtils::getDeviceInfo(device, &info);
	b3Printf("Device Info:\n");
	b3Printf("  CL_DEVICE_NAME: \t\t\t%s\n", info.m_deviceName);
	b3Printf("  CL_DEVICE_VENDOR: \t\t\t%s\n", info.m_deviceVendor);
	b3Printf("  CL_DRIVER_VERSION: \t\t\t%s\n", info.m_driverVersion);

	//the device type is a bitfield; several of these may print
	if (info.m_deviceType & CL_DEVICE_TYPE_CPU)
		b3Printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
	if (info.m_deviceType & CL_DEVICE_TYPE_GPU)
		b3Printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
	if (info.m_deviceType & CL_DEVICE_TYPE_ACCELERATOR)
		b3Printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
	if (info.m_deviceType & CL_DEVICE_TYPE_DEFAULT)
		b3Printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");

	b3Printf("  CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", info.m_computeUnits);
	b3Printf("  CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", info.m_workitemDims);
	b3Printf("  CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", info.m_workItemSize[0], info.m_workItemSize[1], info.m_workItemSize[2]);
	b3Printf("  CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", info.m_workgroupSize);
	b3Printf("  CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", info.m_clockFrequency);
	b3Printf("  CL_DEVICE_ADDRESS_BITS:\t\t%u\n", info.m_addressBits);
	b3Printf("  CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_maxMemAllocSize / (1024 * 1024)));
	b3Printf("  CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_globalMemSize / (1024 * 1024)));
	b3Printf("  CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", info.m_errorCorrectionSupport == CL_TRUE ? "yes" : "no");
	b3Printf("  CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", info.m_localMemType == 1 ? "local" : "global");
	b3Printf("  CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(info.m_localMemSize / 1024));
	b3Printf("  CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(info.m_constantBufferSize / 1024));

	//queue properties are also a bitfield
	if (info.m_queueProperties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
		b3Printf("  CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");
	if (info.m_queueProperties & CL_QUEUE_PROFILING_ENABLE)
		b3Printf("  CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");

	b3Printf("  CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", info.m_imageSupport);
	b3Printf("  CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", info.m_maxReadImageArgs);
	b3Printf("  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", info.m_maxWriteImageArgs);
	b3Printf("\n  CL_DEVICE_IMAGE <dim>");
	b3Printf("\t\t\t2D_MAX_WIDTH\t %u\n", info.m_image2dMaxWidth);
	b3Printf("\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", info.m_image2dMaxHeight);
	b3Printf("\t\t\t\t\t3D_MAX_WIDTH\t %u\n", info.m_image3dMaxWidth);
	b3Printf("\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", info.m_image3dMaxHeight);
	b3Printf("\t\t\t\t\t3D_MAX_DEPTH\t %u\n", info.m_image3dMaxDepth);
	if (*info.m_deviceExtensions != 0)
	{
		b3Printf("\n  CL_DEVICE_EXTENSIONS:%s\n", info.m_deviceExtensions);
	}
	else
	{
		b3Printf("  CL_DEVICE_EXTENSIONS: None\n");
	}
	b3Printf("  CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t");
	b3Printf("CHAR %u, SHORT %u, INT %u,LONG %u, FLOAT %u, DOUBLE %u\n\n\n",
			 info.m_vecWidthChar, info.m_vecWidthShort, info.m_vecWidthInt, info.m_vecWidthLong, info.m_vecWidthFloat, info.m_vecWidthDouble);
}
//! Returns a pointer into 'name' just past the last occurrence of 'pattern'
//! (or 'name' itself if the pattern never occurs). Used above to strip
//! directory components ("\\" and "/") from a source file path.
//! Fix: removed the dead 'patcnt' occurrence counter, which was computed
//! but never used.
static const char* strip2(const char* name, const char* pattern)
{
	size_t const patlen = strlen(pattern);
	const char* oriptr = name;
	const char* patloc;

	// advance past every occurrence of the pattern
	while ((patloc = strstr(oriptr, pattern)) != NULL)
	{
		oriptr = patloc + patlen;
	}
	return oriptr;
}
///Compiles an OpenCL program for 'device', with an on-disk binary cache.
///When clFileNameForCaching is given (and caching is enabled) the function
///first tries to load a cached binary named
///  <sCachedBinaryPath>/<srcName>.<deviceName>.<driverVersion>.bin
///that is newer than the kernel source (timestamp check on Windows only);
///otherwise it compiles from kernelSourceOrg (or reads the .cl file from disk
///when kernelSourceOrg is NULL or gDebugForceLoadingFromSource is set) and
///writes the resulting binary back to the cache.
///@return the built cl_program, or 0 on failure (pErrNum receives the error)
cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSourceOrg, cl_int* pErrNum, const char* additionalMacrosArg, const char* clFileNameForCaching, bool disableBinaryCaching)
{
	const char* additionalMacros = additionalMacrosArg ? additionalMacrosArg : "";

	if (disableBinaryCaching)
	{
		//kernelSourceOrg = 0;
	}

	cl_program m_cpProgram = 0;
	cl_int status;

	char binaryFileName[B3_MAX_STRING_LENGTH];

	char deviceName[256];
	char driverVersion[256];
	const char* strippedName;
	int fileUpToDate = 0;
#ifdef _WIN32
	int binaryFileValid = 0;
#endif

	//build the cache file name from the stripped source name, device and driver version
	if (!disableBinaryCaching && clFileNameForCaching)
	{
		clGetDeviceInfo(device, CL_DEVICE_NAME, 256, &deviceName, NULL);
		clGetDeviceInfo(device, CL_DRIVER_VERSION, 256, &driverVersion, NULL);

		strippedName = strip2(clFileNameForCaching, "\\");
		strippedName = strip2(strippedName, "/");

#ifdef _MSC_VER
		sprintf_s(binaryFileName, B3_MAX_STRING_LENGTH, "%s/%s.%s.%s.bin", sCachedBinaryPath, strippedName, deviceName, driverVersion);
#else
		sprintf(binaryFileName, "%s/%s.%s.%s.bin", sCachedBinaryPath, strippedName, deviceName, driverVersion);
#endif
	}

	//decide whether an existing cached binary may be used
	if (clFileNameForCaching && !(disableBinaryCaching || gDebugSkipLoadingBinary || gDebugForceLoadingFromSource))
	{
#ifdef _WIN32
		//printf("searching for %s\n", binaryFileName);

		FILETIME modtimeBinary;
		CreateDirectoryA(sCachedBinaryPath, 0);
		{
			HANDLE binaryFileHandle = CreateFileA(binaryFileName, GENERIC_READ, 0, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
			if (binaryFileHandle == INVALID_HANDLE_VALUE)
			{
				DWORD errorCode;
				errorCode = GetLastError();
				switch (errorCode)
				{
					case ERROR_FILE_NOT_FOUND:
					{
						b3Warning("\nCached file not found %s\n", binaryFileName);
						break;
					}
					case ERROR_PATH_NOT_FOUND:
					{
						b3Warning("\nCached file path not found %s\n", binaryFileName);
						break;
					}
					default:
					{
						b3Warning("\nFailed reading cached file with errorCode = %d\n", errorCode);
					}
				}
			}
			else
			{
				if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary) == 0)
				{
					DWORD errorCode;
					errorCode = GetLastError();
					b3Warning("\nGetFileTime errorCode = %d\n", errorCode);
				}
				else
				{
					binaryFileValid = 1;
				}
				CloseHandle(binaryFileHandle);
			}

			if (binaryFileValid)
			{
				HANDLE srcFileHandle = CreateFileA(clFileNameForCaching, GENERIC_READ, 0, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
				if (srcFileHandle == INVALID_HANDLE_VALUE)
				{
					//the source often lives a few directory levels up; probe relative paths
					const char* prefix[] = {"./", "../", "../../", "../../../", "../../../../"};
					for (int i = 0; (srcFileHandle == INVALID_HANDLE_VALUE) && i < 5; i++)
					{
						char relativeFileName[1024];
						sprintf(relativeFileName, "%s%s", prefix[i], clFileNameForCaching);
						srcFileHandle = CreateFileA(relativeFileName, GENERIC_READ, 0, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
					}
				}

				if (srcFileHandle != INVALID_HANDLE_VALUE)
				{
					FILETIME modtimeSrc;
					if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc) == 0)
					{
						DWORD errorCode;
						errorCode = GetLastError();
						b3Warning("\nGetFileTime errorCode = %d\n", errorCode);
					}
					//the binary is usable only if it is at least as new as the source
					if ((modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime) || ((modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime) && (modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime)))
					{
						fileUpToDate = 1;
					}
					else
					{
						b3Warning("\nCached binary file out-of-date (%s)\n", binaryFileName);
					}
					CloseHandle(srcFileHandle);
				}
				else
				{
#ifdef _DEBUG
					DWORD errorCode;
					errorCode = GetLastError();
					switch (errorCode)
					{
						case ERROR_FILE_NOT_FOUND:
						{
							b3Warning("\nSrc file not found %s\n", clFileNameForCaching);
							break;
						}
						case ERROR_PATH_NOT_FOUND:
						{
							b3Warning("\nSrc path not found %s\n", clFileNameForCaching);
							break;
						}
						default:
						{
							b3Warning("\nnSrc file reading errorCode = %d\n", errorCode);
						}
					}

					//we should make sure the src file exists so we can verify the timestamp with binary
					//	assert(0);
					b3Warning("Warning: cannot find OpenCL kernel %s to verify timestamp of binary cached kernel %s\n", clFileNameForCaching, binaryFileName);
					fileUpToDate = true;
#else
					//if we cannot find the source, assume it is OK in release builds
					fileUpToDate = true;
#endif
				}
			}
		}

#else
		//non-Windows: no timestamp check, always trust an existing cached binary
		fileUpToDate = true;
		if (mkdir(sCachedBinaryPath, 0777) == -1)
		{
		}
		else
		{
			b3Printf("Succesfully created cache directory: %s\n", sCachedBinaryPath);
		}
#endif  //_WIN32
	}

	//try to load and build the cached binary
	if (fileUpToDate)
	{
#ifdef _MSC_VER
		FILE* file;
		if (fopen_s(&file, binaryFileName, "rb") != 0)
			file = 0;
#else
		FILE* file = fopen(binaryFileName, "rb");
#endif

		if (file)
		{
			size_t binarySize = 0;
			char* binary = 0;

			fseek(file, 0L, SEEK_END);
			binarySize = ftell(file);
			rewind(file);
			binary = (char*)malloc(sizeof(char) * binarySize);
			int bytesRead;
			bytesRead = fread(binary, sizeof(char), binarySize, file);
			(void)bytesRead;
			fclose(file);

			m_cpProgram = clCreateProgramWithBinary(clContext, 1, &device, &binarySize, (const unsigned char**)&binary, 0, &status);
			b3Assert(status == CL_SUCCESS);
			status = clBuildProgram(m_cpProgram, 1, &device, additionalMacros, 0, 0);
			b3Assert(status == CL_SUCCESS);

			if (status != CL_SUCCESS)
			{
				char* build_log;
				size_t ret_val_size;
				clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
				build_log = (char*)malloc(sizeof(char) * (ret_val_size + 1));
				clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
				build_log[ret_val_size] = '\0';
				b3Error("%s\n", build_log);
				free(build_log);
				b3Assert(0);
				m_cpProgram = 0;

				b3Warning("clBuildProgram reported failure on cached binary: %s\n", binaryFileName);
			}
			else
			{
				b3Printf("clBuildProgram successfully compiled cached binary: %s\n", binaryFileName);
			}
			free(binary);
		}
		else
		{
			b3Warning("Cannot open cached binary: %s\n", binaryFileName);
		}
	}

	//fall back to compiling from source
	if (!m_cpProgram)
	{
		cl_int localErrNum;
		char* compileFlags;
		int flagsize;

		const char* kernelSource = kernelSourceOrg;

		if (!kernelSourceOrg || gDebugForceLoadingFromSource)
		{
			if (clFileNameForCaching)
			{
				FILE* file = fopen(clFileNameForCaching, "rb");
				//in many cases the relative path is a few levels up the directory hierarchy, so try it
				if (!file)
				{
					const char* prefix[] = {"../", "../../", "../../../", "../../../../"};
					//fix: iterate over all 4 prefixes; the original loop used 'i < 3'
					//and never tried "../../../../" (the Windows probe above
					//correctly covers its whole prefix array)
					for (int i = 0; !file && i < 4; i++)
					{
						char relativeFileName[1024];
						sprintf(relativeFileName, "%s%s", prefix[i], clFileNameForCaching);
						file = fopen(relativeFileName, "rb");
					}
				}
				if (file)
				{
					char* kernelSrc = 0;
					fseek(file, 0L, SEEK_END);
					int kernelSize = ftell(file);
					rewind(file);
					kernelSrc = (char*)malloc(kernelSize + 1);
					int readBytes;
					readBytes = fread((void*)kernelSrc, 1, kernelSize, file);
					(void)readBytes;
					kernelSrc[kernelSize] = 0;
					fclose(file);
					kernelSource = kernelSrc;
				}
			}
		}

		size_t program_length = kernelSource ? strlen(kernelSource) : 0;
#ifdef MAC  //or __APPLE__?
		//fix: a string literal must bind to a const pointer
		const char* flags = "-cl-mad-enable -DMAC ";
#else
		const char* flags = "";
#endif

		m_cpProgram = clCreateProgramWithSource(clContext, 1, (const char**)&kernelSource, &program_length, &localErrNum);
		if (localErrNum != CL_SUCCESS)
		{
			if (pErrNum)
				*pErrNum = localErrNum;
			return 0;
		}

		// Build the program with 'mad' Optimization option
		flagsize = sizeof(char) * (strlen(additionalMacros) + strlen(flags) + 5);
		compileFlags = (char*)malloc(flagsize);
#ifdef _MSC_VER
		sprintf_s(compileFlags, flagsize, "%s %s", flags, additionalMacros);
#else
		sprintf(compileFlags, "%s %s", flags, additionalMacros);
#endif
		localErrNum = clBuildProgram(m_cpProgram, 1, &device, compileFlags, NULL, NULL);
		if (localErrNum != CL_SUCCESS)
		{
			char* build_log;
			size_t ret_val_size;
			clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
			build_log = (char*)malloc(sizeof(char) * (ret_val_size + 1));
			clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);

			// to be carefully, terminate with \0
			// there's no information in the reference whether the string is 0 terminated or not
			build_log[ret_val_size] = '\0';

			b3Error("Error in clBuildProgram, Line %u in file %s, Log: \n%s\n !!!\n\n", __LINE__, __FILE__, build_log);
			free(build_log);
			if (pErrNum)
				*pErrNum = localErrNum;
			return 0;
		}

		//write the freshly compiled binary back into the cache
		if (!disableBinaryCaching && clFileNameForCaching)
		{  // write to binary
			cl_uint numAssociatedDevices;
			status = clGetProgramInfo(m_cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0);
			b3Assert(status == CL_SUCCESS);
			if (numAssociatedDevices == 1)
			{
				size_t binarySize;
				char* binary;

				status = clGetProgramInfo(m_cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0);
				b3Assert(status == CL_SUCCESS);
				binary = (char*)malloc(sizeof(char) * binarySize);
				status = clGetProgramInfo(m_cpProgram, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0);
				b3Assert(status == CL_SUCCESS);

				{
					FILE* file = 0;
#ifdef _MSC_VER
					if (fopen_s(&file, binaryFileName, "wb") != 0)
						file = 0;
#else
					file = fopen(binaryFileName, "wb");
#endif
					if (file)
					{
						fwrite(binary, sizeof(char), binarySize, file);
						fclose(file);
					}
					else
					{
						b3Warning("cannot write file %s\n", binaryFileName);
					}
				}
				free(binary);
			}
		}
		free(compileFlags);
	}
	return m_cpProgram;
}
///Creates the kernel 'kernelName' from 'prog', or — when prog is 0 — from an
///ad-hoc program compiled out of 'kernelSource'. On success *pErrNum is set
///to CL_SUCCESS; on failure 0 is returned and *pErrNum holds the error.
cl_kernel b3OpenCLUtils_compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros)
{
	b3Printf("compiling kernel %s ", kernelName);

	// compile a temporary program when the caller did not supply one
	cl_program program = prog;
	if (!program)
	{
		program = b3OpenCLUtils_compileCLProgramFromString(clContext, device, kernelSource, pErrNum, additionalMacros, 0, false);
	}

	// Create the kernel
	cl_int localErrNum;
	cl_kernel kernel = clCreateKernel(program, kernelName, &localErrNum);
	if (localErrNum != CL_SUCCESS)
	{
		b3Error("Error in clCreateKernel, Line %u in file %s, cannot find kernel function %s !!!\n\n", __LINE__, __FILE__, kernelName);
		assert(0);
		if (pErrNum)
			*pErrNum = localErrNum;
		return 0;
	}

	// drop the temporary program; the kernel keeps its own reference
	if (!prog && program)
	{
		clReleaseProgram(program);
	}
	b3Printf("ready. \n");

	if (pErrNum)
		*pErrNum = CL_SUCCESS;
	return kernel;
}

View file

@ -1,190 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//original author: Roman Ponomarev
//cleanup by Erwin Coumans
#ifndef B3_OPENCL_UTILS_H
#define B3_OPENCL_UTILS_H
#include "b3OpenCLInclude.h"
#ifdef __cplusplus
extern "C"
{
#endif
///C API for OpenCL utilities: convenience functions, see below for C++ API
/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex, cl_platform_id* platformId);
int b3OpenCLUtils_getNumDevices(cl_context cxMainContext);
cl_device_id b3OpenCLUtils_getDevice(cl_context cxMainContext, int nr);
void b3OpenCLUtils_printDeviceInfo(cl_device_id device);
cl_kernel b3OpenCLUtils_compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros);
//optional
cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSource, cl_int* pErrNum, const char* additionalMacros, const char* srcFileNameForCaching, bool disableBinaryCaching);
//the following optional APIs provide access using specific platform information
int b3OpenCLUtils_getNumPlatforms(cl_int* pErrNum);
///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
cl_platform_id b3OpenCLUtils_getPlatform(int nr, cl_int* pErrNum);
void b3OpenCLUtils_printPlatformInfo(cl_platform_id platform);
const char* b3OpenCLUtils_getSdkVendorName();
///set the path (directory/folder) where the compiled OpenCL kernel are stored
void b3OpenCLUtils_setCachePath(const char* path);
cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex);
#ifdef __cplusplus
}
#define B3_MAX_STRING_LENGTH 1024
///Capability snapshot of a single OpenCL device, filled in by
///b3OpenCLUtils::getDeviceInfo. Each field mirrors the CL_DEVICE_* /
///CL_DRIVER_* query it is populated from.
typedef struct
{
	char m_deviceName[B3_MAX_STRING_LENGTH];        //CL_DEVICE_NAME
	char m_deviceVendor[B3_MAX_STRING_LENGTH];      //CL_DEVICE_VENDOR
	char m_driverVersion[B3_MAX_STRING_LENGTH];     //CL_DRIVER_VERSION
	char m_deviceExtensions[B3_MAX_STRING_LENGTH];  //CL_DEVICE_EXTENSIONS (space-separated list)

	cl_device_type m_deviceType;                      //CL_DEVICE_TYPE (CPU/GPU/accelerator bitfield)
	cl_uint m_computeUnits;                           //CL_DEVICE_MAX_COMPUTE_UNITS
	size_t m_workitemDims;                            //CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
	size_t m_workItemSize[3];                         //CL_DEVICE_MAX_WORK_ITEM_SIZES
	size_t m_image2dMaxWidth;                         //CL_DEVICE_IMAGE2D_MAX_WIDTH
	size_t m_image2dMaxHeight;                        //CL_DEVICE_IMAGE2D_MAX_HEIGHT
	size_t m_image3dMaxWidth;                         //CL_DEVICE_IMAGE3D_MAX_WIDTH
	size_t m_image3dMaxHeight;                        //CL_DEVICE_IMAGE3D_MAX_HEIGHT
	size_t m_image3dMaxDepth;                         //CL_DEVICE_IMAGE3D_MAX_DEPTH
	size_t m_workgroupSize;                           //CL_DEVICE_MAX_WORK_GROUP_SIZE
	cl_uint m_clockFrequency;                         //CL_DEVICE_MAX_CLOCK_FREQUENCY (MHz)
	cl_ulong m_constantBufferSize;                    //CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE (bytes)
	cl_ulong m_localMemSize;                          //CL_DEVICE_LOCAL_MEM_SIZE (bytes)
	cl_ulong m_globalMemSize;                         //CL_DEVICE_GLOBAL_MEM_SIZE (bytes)
	cl_bool m_errorCorrectionSupport;                 //CL_DEVICE_ERROR_CORRECTION_SUPPORT
	cl_device_local_mem_type m_localMemType;          //CL_DEVICE_LOCAL_MEM_TYPE
	cl_uint m_maxReadImageArgs;                       //CL_DEVICE_MAX_READ_IMAGE_ARGS
	cl_uint m_maxWriteImageArgs;                      //CL_DEVICE_MAX_WRITE_IMAGE_ARGS
	cl_uint m_addressBits;                            //CL_DEVICE_ADDRESS_BITS
	cl_ulong m_maxMemAllocSize;                       //CL_DEVICE_MAX_MEM_ALLOC_SIZE (bytes)
	cl_command_queue_properties m_queueProperties;    //CL_DEVICE_QUEUE_PROPERTIES (bitfield)
	cl_bool m_imageSupport;                           //CL_DEVICE_IMAGE_SUPPORT

	//CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
	cl_uint m_vecWidthChar;
	cl_uint m_vecWidthShort;
	cl_uint m_vecWidthInt;
	cl_uint m_vecWidthLong;
	cl_uint m_vecWidthFloat;
	cl_uint m_vecWidthDouble;
} b3OpenCLDeviceInfo;
///Vendor/name/version strings describing an OpenCL platform, filled in by
///b3OpenCLUtils::getPlatformInfo.
struct b3OpenCLPlatformInfo
{
	char m_platformVendor[B3_MAX_STRING_LENGTH];
	char m_platformName[B3_MAX_STRING_LENGTH];
	char m_platformVersion[B3_MAX_STRING_LENGTH];

	///all strings start out empty
	b3OpenCLPlatformInfo()
	{
		m_platformVendor[0] = 0;
		m_platformName[0] = 0;
		m_platformVersion[0] = 0;
	}
};
///C++ API for OpenCL utilities: convenience functions
///Thin static inline wrappers around the C-level b3OpenCLUtils_* functions
///declared above, adding C++ default arguments. Only getDeviceInfo and
///getPlatformInfo have out-of-line definitions.
struct b3OpenCLUtils
{
	/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
	/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
	static inline cl_context createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex = -1, cl_platform_id* platformId = 0)
	{
		return b3OpenCLUtils_createContextFromType(deviceType, pErrNum, pGLCtx, pGLDC, preferredDeviceIndex, preferredPlatformIndex, platformId);
	}

	static inline int getNumDevices(cl_context cxMainContext)
	{
		return b3OpenCLUtils_getNumDevices(cxMainContext);
	}

	static inline cl_device_id getDevice(cl_context cxMainContext, int nr)
	{
		return b3OpenCLUtils_getDevice(cxMainContext, nr);
	}

	///fills 'info' with the capabilities of 'device' (defined out-of-line)
	static void getDeviceInfo(cl_device_id device, b3OpenCLDeviceInfo* info);

	static inline void printDeviceInfo(cl_device_id device)
	{
		b3OpenCLUtils_printDeviceInfo(device);
	}

	static inline cl_kernel compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum = 0, cl_program prog = 0, const char* additionalMacros = "")
	{
		return b3OpenCLUtils_compileCLKernelFromString(clContext, device, kernelSource, kernelName, pErrNum, prog, additionalMacros);
	}

	//optional
	static inline cl_program compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSource, cl_int* pErrNum = 0, const char* additionalMacros = "", const char* srcFileNameForCaching = 0, bool disableBinaryCaching = false)
	{
		return b3OpenCLUtils_compileCLProgramFromString(clContext, device, kernelSource, pErrNum, additionalMacros, srcFileNameForCaching, disableBinaryCaching);
	}

	//the following optional APIs provide access using specific platform information
	static inline int getNumPlatforms(cl_int* pErrNum = 0)
	{
		return b3OpenCLUtils_getNumPlatforms(pErrNum);
	}

	///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
	static inline cl_platform_id getPlatform(int nr, cl_int* pErrNum = 0)
	{
		return b3OpenCLUtils_getPlatform(nr, pErrNum);
	}

	///fills 'platformInfo' with vendor/name/version strings (defined out-of-line)
	static void getPlatformInfo(cl_platform_id platform, b3OpenCLPlatformInfo* platformInfo);

	static inline void printPlatformInfo(cl_platform_id platform)
	{
		b3OpenCLUtils_printPlatformInfo(platform);
	}

	static inline const char* getSdkVendorName()
	{
		return b3OpenCLUtils_getSdkVendorName();
	}

	static inline cl_context createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex = -1)
	{
		return b3OpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLCtx, pGLDC, preferredDeviceIndex, preferredPlatformIndex);
	}

	///set the path (directory/folder) where the compiled OpenCL kernels are stored
	static void setCachePath(const char* path)
	{
		b3OpenCLUtils_setCachePath(path);
	}
};
#endif //__cplusplus
#endif // B3_OPENCL_UTILS_H

View file

@ -1,17 +0,0 @@
#ifndef B3_BVH_INFO_H
#define B3_BVH_INFO_H
#include "Bullet3Common/b3Vector3.h"
///Compact description of one bounding volume hierarchy (BVH):
///its overall AABB, a quantization vector, and node/subtree counts plus
///offsets. NOTE(review): the offsets presumably index into externally
///stored, shared node/subtree arrays -- confirm against the code that
///fills and consumes this struct.
struct b3BvhInfo
{
	b3Vector3 m_aabbMin;        //lower bound of the whole hierarchy
	b3Vector3 m_aabbMax;        //upper bound of the whole hierarchy
	b3Vector3 m_quantization;   //scale used when quantizing node coordinates -- TODO confirm
	int m_numNodes;             //number of nodes belonging to this BVH
	int m_numSubTrees;          //number of subtree headers belonging to this BVH
	int m_nodeOffset;           //start index of this BVH's nodes -- TODO confirm target array
	int m_subTreeOffset;        //start index of this BVH's subtree headers -- TODO confirm
};
#endif //B3_BVH_INFO_H

View file

@ -1,253 +0,0 @@
#if 0
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3ContactCache.h"
#include "Bullet3Common/b3Transform.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
///separation distance above which a contact point is no longer considered
///valid (compared against the point's w component in validContactDistance)
b3Scalar gContactBreakingThreshold = b3Scalar(0.02);

///gContactCalcArea3Points will approximate the convex hull area using 3 points
///when setting it to false, it will use 4 points to compute the area: it is more accurate but slower
bool gContactCalcArea3Points = true;
// Returns the largest squared parallelogram area among the three possible
// pairings of the four given points into two edge vectors; used as a cheap
// proxy for the area of the quad the points span.
static inline b3Scalar calcArea4Points(const b3Vector3 &p0, const b3Vector3 &p1, const b3Vector3 &p2, const b3Vector3 &p3)
{
	// three candidate edge-vector pairings of the four points
	const b3Vector3 edgesA[3] = {p0 - p1, p0 - p2, p0 - p3};
	const b3Vector3 edgesB[3] = {p2 - p3, p1 - p3, p1 - p2};

	//todo: Following 3 cross production can be easily optimized by SIMD.
	const b3Scalar area0 = edgesA[0].cross(edgesB[0]).length2();
	const b3Scalar area1 = edgesA[1].cross(edgesB[1]).length2();
	const b3Scalar area2 = edgesA[2].cross(edgesB[2]).length2();

	// keep the biggest of the three candidate areas
	return b3Max(b3Max(area0, area1), area2);
}
#if 0
//using localPointA for all points
//! Chooses which of the four cached points should be replaced by 'pt':
//! the point whose removal keeps the largest contact area, while never
//! selecting the deepest (most penetrating) point for removal.
//! NOTE(review): this function sits inside an '#if 0' block; 'pt' is typed
//! b3Vector3 yet is accessed via getDistance()/m_localPointA, members a
//! plain b3Vector3 does not appear to provide -- this looks like an
//! unported copy of the btPersistentManifold version. Confirm before
//! re-enabling.
int b3ContactCache::sortCachedPoints(const b3Vector3& pt)
{
	//calculate 4 possible cases areas, and take biggest area
	//also need to keep 'deepest'

	int maxPenetrationIndex = -1;
#define KEEP_DEEPEST_POINT 1
#ifdef KEEP_DEEPEST_POINT
	//find the deepest cached point so it is excluded from replacement below
	b3Scalar maxPenetration = pt.getDistance();
	for (int i = 0; i < 4; i++)
	{
		if (m_pointCache[i].getDistance() < maxPenetration)
		{
			maxPenetrationIndex = i;
			maxPenetration = m_pointCache[i].getDistance();
		}
	}
#endif  //KEEP_DEEPEST_POINT

	//res<k> = contact area obtained when cached point k is replaced by 'pt'
	b3Scalar res0(b3Scalar(0.)), res1(b3Scalar(0.)), res2(b3Scalar(0.)), res3(b3Scalar(0.));
	if (gContactCalcArea3Points)
	{
		//fast path: approximate each candidate area with a single cross product
		if (maxPenetrationIndex != 0)
		{
			b3Vector3 a0 = pt.m_localPointA - m_pointCache[1].m_localPointA;
			b3Vector3 b0 = m_pointCache[3].m_localPointA - m_pointCache[2].m_localPointA;
			b3Vector3 cross = a0.cross(b0);
			res0 = cross.length2();
		}
		if (maxPenetrationIndex != 1)
		{
			b3Vector3 a1 = pt.m_localPointA - m_pointCache[0].m_localPointA;
			b3Vector3 b1 = m_pointCache[3].m_localPointA - m_pointCache[2].m_localPointA;
			b3Vector3 cross = a1.cross(b1);
			res1 = cross.length2();
		}
		if (maxPenetrationIndex != 2)
		{
			b3Vector3 a2 = pt.m_localPointA - m_pointCache[0].m_localPointA;
			b3Vector3 b2 = m_pointCache[3].m_localPointA - m_pointCache[1].m_localPointA;
			b3Vector3 cross = a2.cross(b2);
			res2 = cross.length2();
		}
		if (maxPenetrationIndex != 3)
		{
			b3Vector3 a3 = pt.m_localPointA - m_pointCache[0].m_localPointA;
			b3Vector3 b3 = m_pointCache[2].m_localPointA - m_pointCache[1].m_localPointA;
			b3Vector3 cross = a3.cross(b3);
			res3 = cross.length2();
		}
	}
	else
	{
		//accurate path: evaluate the full 4-point area for each candidate
		if (maxPenetrationIndex != 0)
		{
			res0 = calcArea4Points(pt.m_localPointA, m_pointCache[1].m_localPointA, m_pointCache[2].m_localPointA, m_pointCache[3].m_localPointA);
		}
		if (maxPenetrationIndex != 1)
		{
			res1 = calcArea4Points(pt.m_localPointA, m_pointCache[0].m_localPointA, m_pointCache[2].m_localPointA, m_pointCache[3].m_localPointA);
		}
		if (maxPenetrationIndex != 2)
		{
			res2 = calcArea4Points(pt.m_localPointA, m_pointCache[0].m_localPointA, m_pointCache[1].m_localPointA, m_pointCache[3].m_localPointA);
		}
		if (maxPenetrationIndex != 3)
		{
			res3 = calcArea4Points(pt.m_localPointA, m_pointCache[0].m_localPointA, m_pointCache[1].m_localPointA, m_pointCache[2].m_localPointA);
		}
	}
	//pick the replacement slot that yields the biggest area
	b3Vector4 maxvec(res0, res1, res2, res3);
	int biggestarea = maxvec.closestAxis4();
	return biggestarea;
}
/// Finds the cached point closest (in squared local-space distance on body A)
/// to newPoint, considering only points within the contact breaking threshold.
/// Returns the index of that point, or -1 if none is close enough.
/// NOTE(review): this reads mp.m_localPointA / newPoint.m_localPointA on values
/// typed b3Vector3, which has no such member in the visible headers; this
/// function appears to live in a preprocessor-disabled region (closed by the
/// #endif below) — confirm before enabling it.
int b3ContactCache::getCacheEntry(const b3Vector3& newPoint) const
{
// Compare squared distances against the squared threshold to avoid a sqrt.
b3Scalar shortestDist = getContactBreakingThreshold() * getContactBreakingThreshold();
int size = getNumContacts();
int nearestPoint = -1;
for( int i = 0; i < size; i++ )
{
const b3Vector3 &mp = m_pointCache[i];
b3Vector3 diffA = mp.m_localPointA- newPoint.m_localPointA;
const b3Scalar distToManiPoint = diffA.dot(diffA);
if( distToManiPoint < shortestDist )
{
// New best candidate: remember it and tighten the search radius.
shortestDist = distToManiPoint;
nearestPoint = i;
}
}
return nearestPoint;
}
/// Inserts newPoint into the point cache and returns the slot index used.
/// When the cache is full (MANIFOLD_CACHE_SIZE points), an existing slot is
/// reclaimed — chosen by sortCachedPoints so the deepest point and the largest
/// covered area are preserved.
/// NOTE(review): uses clearUserCache/m_pointCache/m_cachedPoints, which are not
/// declared in the visible b3ContactCache class; like getCacheEntry above this
/// appears to be in a preprocessor-disabled region — confirm before enabling.
int b3ContactCache::addManifoldPoint(const b3Vector3& newPoint)
{
b3Assert(validContactDistance(newPoint));
int insertIndex = getNumContacts();
if (insertIndex == MANIFOLD_CACHE_SIZE)
{
#if MANIFOLD_CACHE_SIZE >= 4
//sort cache so best points come first, based on area
insertIndex = sortCachedPoints(newPoint);
#else
insertIndex = 0;
#endif
// Reclaiming a slot: drop any user data attached to the evicted point.
clearUserCache(m_pointCache[insertIndex]);
} else
{
m_cachedPoints++;
}
// sortCachedPoints may return -1; clamp to a valid slot.
if (insertIndex<0)
insertIndex=0;
//b3Assert(m_pointCache[insertIndex].m_userPersistentData==0);
m_pointCache[insertIndex] = newPoint;
return insertIndex;
}
#endif
bool b3ContactCache::validContactDistance(const b3Vector3& pt)
{
return pt.w <= gContactBreakingThreshold;
}
/// Removes point i from the contact set by swapping it with the last cached
/// point and shrinking the count by one. The relative order of the remaining
/// points is not preserved.
void b3ContactCache::removeContactPoint(struct b3Contact4Data& newContactCache,int i)
{
	const int lastIndex = b3Contact4Data_getNumPoints(&newContactCache) - 1;
	if (i != lastIndex)
	{
		// Move the last point's data into slot i so the tail can be dropped.
		b3Swap(newContactCache.m_localPosA[i], newContactCache.m_localPosA[lastIndex]);
		b3Swap(newContactCache.m_localPosB[i], newContactCache.m_localPosB[lastIndex]);
		b3Swap(newContactCache.m_worldPosB[i], newContactCache.m_worldPosB[lastIndex]);
	}
	b3Contact4Data_setNumPoints(&newContactCache, lastIndex);
}
void b3ContactCache::refreshContactPoints(const b3Transform& trA,const b3Transform& trB, struct b3Contact4Data& contacts)
{
int numContacts = b3Contact4Data_getNumPoints(&contacts);
int i;
/// first refresh worldspace positions and distance
for (i=numContacts-1;i>=0;i--)
{
b3Vector3 worldPosA = trA( contacts.m_localPosA[i]);
b3Vector3 worldPosB = trB( contacts.m_localPosB[i]);
contacts.m_worldPosB[i] = worldPosB;
float distance = (worldPosA - worldPosB).dot(contacts.m_worldNormalOnB);
contacts.m_worldPosB[i].w = distance;
}
/// then
b3Scalar distance2d;
b3Vector3 projectedDifference,projectedPoint;
for (i=numContacts-1;i>=0;i--)
{
b3Vector3 worldPosA = trA( contacts.m_localPosA[i]);
b3Vector3 worldPosB = trB( contacts.m_localPosB[i]);
b3Vector3&pt = contacts.m_worldPosB[i];
//contact becomes invalid when signed distance exceeds margin (projected on contactnormal direction)
if (!validContactDistance(pt))
{
removeContactPoint(contacts,i);
} else
{
//contact also becomes invalid when relative movement orthogonal to normal exceeds margin
projectedPoint = worldPosA - contacts.m_worldNormalOnB * contacts.m_worldPosB[i].w;
projectedDifference = contacts.m_worldPosB[i] - projectedPoint;
distance2d = projectedDifference.dot(projectedDifference);
if (distance2d > gContactBreakingThreshold*gContactBreakingThreshold )
{
removeContactPoint(contacts,i);
} else
{
////contact point processed callback
//if (gContactProcessedCallback)
// (*gContactProcessedCallback)(manifoldPoint,(void*)m_body0,(void*)m_body1);
}
}
}
}
#endif

View file

@ -1,62 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2013 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_CONTACT_CACHE_H
#define B3_CONTACT_CACHE_H
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Common/b3Transform.h"
#include "Bullet3Common/b3AlignedAllocator.h"
///maximum contact breaking and merging threshold
extern b3Scalar gContactBreakingThreshold;
#define MANIFOLD_CACHE_SIZE 4
///b3ContactCache is a contact point cache, it stays persistent as long as objects are overlapping in the broadphase.
///Those contact points are created by the collision narrow phase.
///The cache can be empty, or hold 1,2,3 or 4 points. Some collision algorithms (GJK) might only add one point at a time.
///updates/refreshes old contact points, and throw them away if necessary (distance becomes too large)
///reduces the cache to 4 points, when more than 4 points are added, using the following rules:
///the contact point with deepest penetration is always kept, and it tries to maximize the area covered by the points
///note that some pairs of objects might have more than one contact manifold.
B3_ATTRIBUTE_ALIGNED16(class)
b3ContactCache
{
/// sort cached points so most isolated points come first
int sortCachedPoints(const b3Vector3& pt);
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
/// Inserts a point into the cache, evicting the worst existing point if the
/// cache is full. Returns the slot index used.
int addManifoldPoint(const b3Vector3& newPoint);
/*void replaceContactPoint(const b3Vector3& newPoint,int insertIndex)
{
b3Assert(validContactDistance(newPoint));
m_pointCache[insertIndex] = newPoint;
}
*/
/// True while the point's signed distance (w component) is within the global
/// contact breaking threshold.
static bool validContactDistance(const b3Vector3& pt);
/// calculates new worldspace coordinates and depth, and rejects points that exceed the collision margin
static void refreshContactPoints(const b3Transform& trA, const b3Transform& trB, struct b3Contact4Data& newContactCache);
/// Removes point i by swap-with-last; point order is not preserved.
static void removeContactPoint(struct b3Contact4Data & newContactCache, int i);
};
#endif //B3_CONTACT_CACHE_H

View file

@ -1,106 +0,0 @@
#ifndef _CONVEX_HULL_CONTACT_H
#define _CONVEX_HULL_CONTACT_H
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
#include "Bullet3Common/shared/b3Int2.h"
#include "Bullet3Common/shared/b3Int4.h"
#include "b3OptimizedBvh.h"
#include "b3BvhInfo.h"
#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
//#include "../../dynamics/basic_demo/Stubs/ChNarrowPhase.h"
/// GPU separating-axis-test (SAT) narrowphase: owns the OpenCL kernels and
/// device buffers used to generate contact points between convex, compound,
/// and concave (triangle-mesh) shape pairs.
struct GpuSatCollision
{
// OpenCL execution context/device/queue shared by all kernels below.
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
// Separating-axis search kernels (convex-convex and concave variants).
cl_kernel m_findSeparatingAxisKernel;
cl_kernel m_mprPenetrationKernel;
cl_kernel m_findSeparatingAxisUnitSphereKernel;
cl_kernel m_findSeparatingAxisVertexFaceKernel;
cl_kernel m_findSeparatingAxisEdgeEdgeKernel;
cl_kernel m_findConcaveSeparatingAxisKernel;
cl_kernel m_findConcaveSeparatingAxisVertexFaceKernel;
cl_kernel m_findConcaveSeparatingAxisEdgeEdgeKernel;
// Compound (multi-child) shape pair handling.
cl_kernel m_findCompoundPairsKernel;
cl_kernel m_processCompoundPairsKernel;
// Clipping / contact-generation kernels.
cl_kernel m_clipHullHullKernel;
cl_kernel m_clipCompoundsHullHullKernel;
cl_kernel m_clipFacesAndFindContacts;
cl_kernel m_findClippingFacesKernel;
cl_kernel m_clipHullHullConcaveConvexKernel;
// cl_kernel m_extractManifoldAndAddContactKernel;
cl_kernel m_newContactReductionKernel;
// BVH traversal and primitive (sphere/plane) contact kernels.
cl_kernel m_bvhTraversalKernel;
cl_kernel m_primitiveContactsKernel;
cl_kernel m_findConcaveSphereContactsKernel;
cl_kernel m_processCompoundPairsPrimitivesKernel;
// Device-side scratch buffers reused across invocations.
b3OpenCLArray<b3Vector3> m_unitSphereDirections;
b3OpenCLArray<int> m_totalContactsOut;
b3OpenCLArray<b3Vector3> m_sepNormals;
b3OpenCLArray<float> m_dmins;
b3OpenCLArray<int> m_hasSeparatingNormals;
b3OpenCLArray<b3Vector3> m_concaveSepNormals;
b3OpenCLArray<int> m_concaveHasSeparatingNormals;
b3OpenCLArray<int> m_numConcavePairsOut;
b3OpenCLArray<b3CompoundOverlappingPair> m_gpuCompoundPairs;
b3OpenCLArray<b3Vector3> m_gpuCompoundSepNormals;
b3OpenCLArray<int> m_gpuHasCompoundSepNormals;
b3OpenCLArray<int> m_numCompoundPairsOut;
GpuSatCollision(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~GpuSatCollision();
/// Entry point: computes contacts for the given broadphase pairs entirely on
/// the GPU, appending results to contactOut (nContacts updated in place).
void computeConvexConvexContactsGPUSAT(b3OpenCLArray<b3Int4>* pairs, int nPairs,
const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
b3OpenCLArray<b3Contact4>* contactOut, int& nContacts,
const b3OpenCLArray<b3Contact4>* oldContacts,
int maxContactCapacity,
int compoundPairCapacity,
const b3OpenCLArray<b3ConvexPolyhedronData>& hostConvexData,
const b3OpenCLArray<b3Vector3>& vertices,
const b3OpenCLArray<b3Vector3>& uniqueEdges,
const b3OpenCLArray<b3GpuFace>& faces,
const b3OpenCLArray<int>& indices,
const b3OpenCLArray<b3Collidable>& gpuCollidables,
const b3OpenCLArray<b3GpuChildShape>& gpuChildShapes,
const b3OpenCLArray<b3Aabb>& clAabbsWorldSpace,
const b3OpenCLArray<b3Aabb>& clAabbsLocalSpace,
b3OpenCLArray<b3Vector3>& worldVertsB1GPU,
b3OpenCLArray<b3Int4>& clippingFacesOutGPU,
b3OpenCLArray<b3Vector3>& worldNormalsAGPU,
b3OpenCLArray<b3Vector3>& worldVertsA1GPU,
b3OpenCLArray<b3Vector3>& worldVertsB2GPU,
b3AlignedObjectArray<class b3OptimizedBvh*>& bvhData,
b3OpenCLArray<b3QuantizedBvhNode>* treeNodesGPU,
b3OpenCLArray<b3BvhSubtreeInfo>* subTreesGPU,
b3OpenCLArray<b3BvhInfo>* bvhInfo,
int numObjects,
int maxTriConvexPairCapacity,
b3OpenCLArray<b3Int4>& triangleConvexPairs,
int& numTriConvexPairsOut);
};

View file

@ -1,7 +0,0 @@
#ifndef CONVEX_POLYHEDRON_CL
#define CONVEX_POLYHEDRON_CL
#include "Bullet3Common/b3Transform.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
#endif //CONVEX_POLYHEDRON_CL

View file

@ -1,79 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2008 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the
use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software in a
product, an acknowledgment in the product documentation would be appreciated
but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
/*
GJK-EPA collision solver by Nathanael Presson, 2008
*/
#ifndef B3_GJK_EPA2_H
#define B3_GJK_EPA2_H
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Common/b3Transform.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
///btGjkEpaSolver contributed under zlib by Nathanael Presson
///btGjkEpaSolver contributed under zlib by Nathanael Presson
/// GJK/EPA convex-convex query solver: computes separation distance (GJK) or
/// penetration depth and witness points (EPA) for a pair of convex hulls.
struct b3GjkEpaSolver2
{
struct sResults
{
enum eStatus
{
Separated, /* Shapes don't penetrate */
Penetrating, /* Shapes are penetrating */
GJK_Failed, /* GJK phase fail, no big issue, shapes are probably just 'touching' */
EPA_Failed /* EPA phase fail, bigger problem, need to save parameters, and debug */
} status;
// Closest/deepest points on shape A and B, the contact normal, and the
// signed distance (negative when penetrating).
b3Vector3 witnesses[2];
b3Vector3 normal;
b3Scalar distance;
};
static int StackSizeRequirement();
/// GJK distance query; fills results and returns true on success.
static bool Distance(const b3Transform& transA, const b3Transform& transB,
const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB,
const b3AlignedObjectArray<b3Vector3>& verticesA,
const b3AlignedObjectArray<b3Vector3>& verticesB,
const b3Vector3& guess,
sResults& results);
/// EPA penetration query; usemargins inflates the hulls by their margins.
static bool Penetration(const b3Transform& transA, const b3Transform& transB,
const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB,
const b3AlignedObjectArray<b3Vector3>& verticesA,
const b3AlignedObjectArray<b3Vector3>& verticesB,
const b3Vector3& guess,
sResults& results,
bool usemargins = true);
#if 0
static b3Scalar SignedDistance( const b3Vector3& position,
b3Scalar margin,
const btConvexShape* shape,
const btTransform& wtrs,
sResults& results);
static bool SignedDistance( const btConvexShape* shape0,const btTransform& wtrs0,
const btConvexShape* shape1,const btTransform& wtrs1,
const b3Vector3& guess,
sResults& results);
#endif
};
#endif //B3_GJK_EPA2_H

View file

@ -1,358 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3OptimizedBvh.h"
#include "b3StridingMeshInterface.h"
#include "Bullet3Geometry/b3AabbUtil.h"
// Nothing beyond the b3QuantizedBvh base-class defaults to set up here.
b3OptimizedBvh::b3OptimizedBvh()
{
}
// Node arrays are aligned-object-array members and release themselves.
b3OptimizedBvh::~b3OptimizedBvh()
{
}
/// Builds the AABB tree over all triangles exposed by the mesh interface.
/// When useQuantizedAabbCompression is true, leaf AABBs are quantized into the
/// [bvhAabbMin, bvhAabbMax] space (16-byte nodes); otherwise full-precision
/// nodes are used. Temporary leaf arrays are cleared at the end.
void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax)
{
m_useQuantization = useQuantizedAabbCompression;
// NodeArray triangleNodes;
// Triangle callback producing one full-precision leaf node per triangle.
struct NodeTriangleCallback : public b3InternalTriangleIndexCallback
{
NodeArray& m_triangleNodes;
NodeTriangleCallback& operator=(NodeTriangleCallback& other)
{
m_triangleNodes.copyFromArray(other.m_triangleNodes);
return *this;
}
NodeTriangleCallback(NodeArray& triangleNodes)
: m_triangleNodes(triangleNodes)
{
}
virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex)
{
b3OptimizedBvhNode node;
b3Vector3 aabbMin, aabbMax;
// Start from an inverted AABB, then grow it over the three vertices.
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
aabbMin.setMin(triangle[0]);
aabbMax.setMax(triangle[0]);
aabbMin.setMin(triangle[1]);
aabbMax.setMax(triangle[1]);
aabbMin.setMin(triangle[2]);
aabbMax.setMax(triangle[2]);
//with quantization?
node.m_aabbMinOrg = aabbMin;
node.m_aabbMaxOrg = aabbMax;
node.m_escapeIndex = -1;
//for child nodes
node.m_subPart = partId;
node.m_triangleIndex = triangleIndex;
m_triangleNodes.push_back(node);
}
};
// Triangle callback producing one quantized leaf node per triangle; it needs
// the tree pointer for the quantization parameters.
struct QuantizedNodeTriangleCallback : public b3InternalTriangleIndexCallback
{
QuantizedNodeArray& m_triangleNodes;
const b3QuantizedBvh* m_optimizedTree; // for quantization
QuantizedNodeTriangleCallback& operator=(QuantizedNodeTriangleCallback& other)
{
m_triangleNodes.copyFromArray(other.m_triangleNodes);
m_optimizedTree = other.m_optimizedTree;
return *this;
}
QuantizedNodeTriangleCallback(QuantizedNodeArray& triangleNodes, const b3QuantizedBvh* tree)
: m_triangleNodes(triangleNodes), m_optimizedTree(tree)
{
}
virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex)
{
// The partId and triangle index must fit in the same (positive) integer
b3Assert(partId < (1 << MAX_NUM_PARTS_IN_BITS));
b3Assert(triangleIndex < (1 << (31 - MAX_NUM_PARTS_IN_BITS)));
//negative indices are reserved for escapeIndex
b3Assert(triangleIndex >= 0);
b3QuantizedBvhNode node;
b3Vector3 aabbMin, aabbMax;
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
aabbMin.setMin(triangle[0]);
aabbMax.setMax(triangle[0]);
aabbMin.setMin(triangle[1]);
aabbMax.setMax(triangle[1]);
aabbMin.setMin(triangle[2]);
aabbMax.setMax(triangle[2]);
//PCK: add these checks for zero dimensions of aabb
// Degenerate (flat) triangle AABBs would quantize to zero extent, so pad
// each axis up to a small minimum dimension.
const b3Scalar MIN_AABB_DIMENSION = b3Scalar(0.002);
const b3Scalar MIN_AABB_HALF_DIMENSION = b3Scalar(0.001);
if (aabbMax.getX() - aabbMin.getX() < MIN_AABB_DIMENSION)
{
aabbMax.setX(aabbMax.getX() + MIN_AABB_HALF_DIMENSION);
aabbMin.setX(aabbMin.getX() - MIN_AABB_HALF_DIMENSION);
}
if (aabbMax.getY() - aabbMin.getY() < MIN_AABB_DIMENSION)
{
aabbMax.setY(aabbMax.getY() + MIN_AABB_HALF_DIMENSION);
aabbMin.setY(aabbMin.getY() - MIN_AABB_HALF_DIMENSION);
}
if (aabbMax.getZ() - aabbMin.getZ() < MIN_AABB_DIMENSION)
{
aabbMax.setZ(aabbMax.getZ() + MIN_AABB_HALF_DIMENSION);
aabbMin.setZ(aabbMin.getZ() - MIN_AABB_HALF_DIMENSION);
}
// Quantize min rounding down (0) and max rounding up (1), then pack
// part id and triangle index into one non-negative integer.
m_optimizedTree->quantize(&node.m_quantizedAabbMin[0], aabbMin, 0);
m_optimizedTree->quantize(&node.m_quantizedAabbMax[0], aabbMax, 1);
node.m_escapeIndexOrTriangleIndex = (partId << (31 - MAX_NUM_PARTS_IN_BITS)) | triangleIndex;
m_triangleNodes.push_back(node);
}
};
int numLeafNodes = 0;
if (m_useQuantization)
{
//initialize quantization values
setQuantizationValues(bvhAabbMin, bvhAabbMax);
QuantizedNodeTriangleCallback callback(m_quantizedLeafNodes, this);
triangles->InternalProcessAllTriangles(&callback, m_bvhAabbMin, m_bvhAabbMax);
//now we have an array of leafnodes in m_leafNodes
numLeafNodes = m_quantizedLeafNodes.size();
// A binary tree over n leaves needs at most 2n contiguous nodes.
m_quantizedContiguousNodes.resize(2 * numLeafNodes);
}
else
{
NodeTriangleCallback callback(m_leafNodes);
b3Vector3 aabbMin = b3MakeVector3(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
b3Vector3 aabbMax = b3MakeVector3(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
triangles->InternalProcessAllTriangles(&callback, aabbMin, aabbMax);
//now we have an array of leafnodes in m_leafNodes
numLeafNodes = m_leafNodes.size();
m_contiguousNodes.resize(2 * numLeafNodes);
}
m_curNodeIndex = 0;
buildTree(0, numLeafNodes);
///if the entire tree is smaller than the subtree size, we need to create a header info for the tree
if (m_useQuantization && !m_SubtreeHeaders.size())
{
b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand();
subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[0]);
subtree.m_rootNodeIndex = 0;
subtree.m_subtreeSize = m_quantizedContiguousNodes[0].isLeafNode() ? 1 : m_quantizedContiguousNodes[0].getEscapeIndex();
}
//PCK: update the copy of the size
m_subtreeHeaderCount = m_SubtreeHeaders.size();
//PCK: clear m_quantizedLeafNodes and m_leafNodes, they are temporary
m_quantizedLeafNodes.clear();
m_leafNodes.clear();
}
/// Re-fits the whole tree to updated mesh vertex positions, re-quantizing into
/// the new [aabbMin, aabbMax] space. Only implemented for the quantized
/// representation; the non-quantized path is intentionally a no-op.
void b3OptimizedBvh::refit(b3StridingMeshInterface* meshInterface, const b3Vector3& aabbMin, const b3Vector3& aabbMax)
{
	if (!m_useQuantization)
	{
		return;
	}

	setQuantizationValues(aabbMin, aabbMax);
	updateBvhNodes(meshInterface, 0, m_curNodeIndex, 0);

	// Re-derive every subtree header's AABB from its freshly updated root node.
	for (int headerIndex = 0; headerIndex < m_SubtreeHeaders.size(); headerIndex++)
	{
		b3BvhSubtreeInfo& subtree = m_SubtreeHeaders[headerIndex];
		subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[subtree.m_rootNodeIndex]);
	}
}
/// Re-fits only the subtrees whose quantized AABB overlaps [aabbMin, aabbMax].
/// Requires quantization, and the refit region must lie strictly inside the
/// tree's overall AABB so the existing quantization values stay valid.
void b3OptimizedBvh::refitPartial(b3StridingMeshInterface* meshInterface, const b3Vector3& aabbMin, const b3Vector3& aabbMax)
{
	b3Assert(m_useQuantization);
	b3Assert(aabbMin.getX() > m_bvhAabbMin.getX());
	b3Assert(aabbMin.getY() > m_bvhAabbMin.getY());
	b3Assert(aabbMin.getZ() > m_bvhAabbMin.getZ());
	b3Assert(aabbMax.getX() < m_bvhAabbMax.getX());
	b3Assert(aabbMax.getY() < m_bvhAabbMax.getY());
	b3Assert(aabbMax.getZ() < m_bvhAabbMax.getZ());

	///we should update all quantization values, using updateBvhNodes(meshInterface);
	///but we only update chunks that overlap the given aabb
	// Quantize the query box once, then test each subtree header against it.
	unsigned short quantizedQueryAabbMin[3];
	unsigned short quantizedQueryAabbMax[3];
	quantize(&quantizedQueryAabbMin[0], aabbMin, 0);
	quantize(&quantizedQueryAabbMax[0], aabbMax, 1);

	for (int headerIndex = 0; headerIndex < this->m_SubtreeHeaders.size(); headerIndex++)
	{
		b3BvhSubtreeInfo& subtree = m_SubtreeHeaders[headerIndex];
		//PCK: unsigned instead of bool
		const unsigned overlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin, quantizedQueryAabbMax, subtree.m_quantizedAabbMin, subtree.m_quantizedAabbMax);
		if (overlap != 0)
		{
			updateBvhNodes(meshInterface, subtree.m_rootNodeIndex, subtree.m_rootNodeIndex + subtree.m_subtreeSize, headerIndex);
			subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[subtree.m_rootNodeIndex]);
		}
	}
}
void b3OptimizedBvh::updateBvhNodes(b3StridingMeshInterface* meshInterface, int firstNode, int endNode, int index)
{
(void)index;
b3Assert(m_useQuantization);
int curNodeSubPart = -1;
//get access info to trianglemesh data
const unsigned char* vertexbase = 0;
int numverts = 0;
PHY_ScalarType type = PHY_INTEGER;
int stride = 0;
const unsigned char* indexbase = 0;
int indexstride = 0;
int numfaces = 0;
PHY_ScalarType indicestype = PHY_INTEGER;
b3Vector3 triangleVerts[3];
b3Vector3 aabbMin, aabbMax;
const b3Vector3& meshScaling = meshInterface->getScaling();
int i;
for (i = endNode - 1; i >= firstNode; i--)
{
b3QuantizedBvhNode& curNode = m_quantizedContiguousNodes[i];
if (curNode.isLeafNode())
{
//recalc aabb from triangle data
int nodeSubPart = curNode.getPartId();
int nodeTriangleIndex = curNode.getTriangleIndex();
if (nodeSubPart != curNodeSubPart)
{
if (curNodeSubPart >= 0)
meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
meshInterface->getLockedReadOnlyVertexIndexBase(&vertexbase, numverts, type, stride, &indexbase, indexstride, numfaces, indicestype, nodeSubPart);
curNodeSubPart = nodeSubPart;
b3Assert(indicestype == PHY_INTEGER || indicestype == PHY_SHORT);
}
//triangles->getLockedReadOnlyVertexIndexBase(vertexBase,numVerts,
unsigned int* gfxbase = (unsigned int*)(indexbase + nodeTriangleIndex * indexstride);
for (int j = 2; j >= 0; j--)
{
int graphicsindex = indicestype == PHY_SHORT ? ((unsigned short*)gfxbase)[j] : gfxbase[j];
if (type == PHY_FLOAT)
{
float* graphicsbase = (float*)(vertexbase + graphicsindex * stride);
triangleVerts[j] = b3MakeVector3(
graphicsbase[0] * meshScaling.getX(),
graphicsbase[1] * meshScaling.getY(),
graphicsbase[2] * meshScaling.getZ());
}
else
{
double* graphicsbase = (double*)(vertexbase + graphicsindex * stride);
triangleVerts[j] = b3MakeVector3(b3Scalar(graphicsbase[0] * meshScaling.getX()), b3Scalar(graphicsbase[1] * meshScaling.getY()), b3Scalar(graphicsbase[2] * meshScaling.getZ()));
}
}
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
aabbMin.setMin(triangleVerts[0]);
aabbMax.setMax(triangleVerts[0]);
aabbMin.setMin(triangleVerts[1]);
aabbMax.setMax(triangleVerts[1]);
aabbMin.setMin(triangleVerts[2]);
aabbMax.setMax(triangleVerts[2]);
quantize(&curNode.m_quantizedAabbMin[0], aabbMin, 0);
quantize(&curNode.m_quantizedAabbMax[0], aabbMax, 1);
}
else
{
//combine aabb from both children
b3QuantizedBvhNode* leftChildNode = &m_quantizedContiguousNodes[i + 1];
b3QuantizedBvhNode* rightChildNode = leftChildNode->isLeafNode() ? &m_quantizedContiguousNodes[i + 2] : &m_quantizedContiguousNodes[i + 1 + leftChildNode->getEscapeIndex()];
{
for (int i = 0; i < 3; i++)
{
curNode.m_quantizedAabbMin[i] = leftChildNode->m_quantizedAabbMin[i];
if (curNode.m_quantizedAabbMin[i] > rightChildNode->m_quantizedAabbMin[i])
curNode.m_quantizedAabbMin[i] = rightChildNode->m_quantizedAabbMin[i];
curNode.m_quantizedAabbMax[i] = leftChildNode->m_quantizedAabbMax[i];
if (curNode.m_quantizedAabbMax[i] < rightChildNode->m_quantizedAabbMax[i])
curNode.m_quantizedAabbMax[i] = rightChildNode->m_quantizedAabbMax[i];
}
}
}
}
if (curNodeSubPart >= 0)
meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
}
///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
/// Loads and initializes a BVH from an in-memory buffer without copying it.
b3OptimizedBvh* b3OptimizedBvh::deSerializeInPlace(void* i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian)
{
	// The base class does all the work; b3OptimizedBvh adds no data members,
	// so a static downcast of the result is safe.
	b3QuantizedBvh* const deserialized = b3QuantizedBvh::deSerializeInPlace(i_alignedDataBuffer, i_dataBufferSize, i_swapEndian);
	return static_cast<b3OptimizedBvh*>(deserialized);
}

View file

@ -1,56 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
///Contains contributions from Disney Studio's
#ifndef B3_OPTIMIZED_BVH_H
#define B3_OPTIMIZED_BVH_H
#include "b3QuantizedBvh.h"
class b3StridingMeshInterface;
///The b3OptimizedBvh extends the b3QuantizedBvh to create AABB tree for triangle meshes, through the b3StridingMeshInterface.
B3_ATTRIBUTE_ALIGNED16(class)
b3OptimizedBvh : public b3QuantizedBvh
{
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
protected:
public:
b3OptimizedBvh();
virtual ~b3OptimizedBvh();
/// Builds the (optionally quantized) tree over all triangles of the mesh.
void build(b3StridingMeshInterface * triangles, bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax);
/// Full refit with new quantization bounds (quantized trees only).
void refit(b3StridingMeshInterface * triangles, const b3Vector3& aabbMin, const b3Vector3& aabbMax);
/// Refits only subtrees overlapping the given AABB; bounds must lie strictly
/// inside the tree's overall AABB.
void refitPartial(b3StridingMeshInterface * triangles, const b3Vector3& aabbMin, const b3Vector3& aabbMax);
/// Re-quantizes nodes [firstNode, endNode) from current mesh vertex data.
void updateBvhNodes(b3StridingMeshInterface * meshInterface, int firstNode, int endNode, int index);
/// Data buffer MUST be 16 byte aligned
virtual bool serializeInPlace(void* o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const
{
return b3QuantizedBvh::serialize(o_alignedDataBuffer, i_dataBufferSize, i_swapEndian);
}
///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
static b3OptimizedBvh* deSerializeInPlace(void* i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
};
#endif //B3_OPTIMIZED_BVH_H

View file

@ -1,511 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_QUANTIZED_BVH_H
#define B3_QUANTIZED_BVH_H
class b3Serializer;
//#define DEBUG_CHECK_DEQUANTIZATION 1
#ifdef DEBUG_CHECK_DEQUANTIZATION
#ifdef __SPU__
#define printf spu_printf
#endif //__SPU__
#include <stdio.h>
#include <stdlib.h>
#endif //DEBUG_CHECK_DEQUANTIZATION
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Common/b3AlignedAllocator.h"
#ifdef B3_USE_DOUBLE_PRECISION
#define b3QuantizedBvhData b3QuantizedBvhDoubleData
#define b3OptimizedBvhNodeData b3OptimizedBvhNodeDoubleData
#define b3QuantizedBvhDataName "b3QuantizedBvhDoubleData"
#else
#define b3QuantizedBvhData b3QuantizedBvhFloatData
#define b3OptimizedBvhNodeData b3OptimizedBvhNodeFloatData
#define b3QuantizedBvhDataName "b3QuantizedBvhFloatData"
#endif
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h"
//http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclang/html/vclrf__m128.asp
//Note: currently we have 16 bytes per quantized node
#define MAX_SUBTREE_SIZE_IN_BYTES 2048
// 10 bits give the potential for 1024 parts, with at most 2^21 - 1 (2097151)
// triangles each (the sign bit of the 32-bit index is reserved for escape indices)
#define MAX_NUM_PARTS_IN_BITS 10
///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.
///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).
///b3QuantizedBvhNode is a compressed 16-byte AABB node usable as either a leaf
///or an internal node. A leaf packs (partId | triangleIndex) into one
///non-negative integer; an internal node stores the negated escape index.
B3_ATTRIBUTE_ALIGNED16(struct)
b3QuantizedBvhNode : public b3QuantizedBvhNodeData
{
	B3_DECLARE_ALIGNED_ALLOCATOR();

	bool isLeafNode() const
	{
		// Non-negative means leaf (triangle index); negative means internal
		// node (negated escape index).
		return (m_escapeIndexOrTriangleIndex >= 0);
	}
	int getEscapeIndex() const
	{
		b3Assert(!isLeafNode());
		return -m_escapeIndexOrTriangleIndex;
	}
	int getTriangleIndex() const
	{
		b3Assert(isLeafNode());
		// Mask off the high MAX_NUM_PARTS_IN_BITS bits that hold the part id.
		const unsigned int partBits = (~0u) << (31 - MAX_NUM_PARTS_IN_BITS);
		return (m_escapeIndexOrTriangleIndex & ~(partBits));
	}
	int getPartId() const
	{
		b3Assert(isLeafNode());
		// The part id occupies the highest bits below the sign bit.
		return (m_escapeIndexOrTriangleIndex >> (31 - MAX_NUM_PARTS_IN_BITS));
	}
};
/// b3OptimizedBvhNode contains both internal and leaf node information.
/// Total node size is 44 bytes / node. You can use the compressed version of 16 bytes.
B3_ATTRIBUTE_ALIGNED16(struct)
b3OptimizedBvhNode
{
B3_DECLARE_ALIGNED_ALLOCATOR();
//32 bytes: full-precision AABB of this node.
b3Vector3 m_aabbMinOrg;
b3Vector3 m_aabbMaxOrg;
//4 bytes: skip offset used when traversal rejects this internal node.
int m_escapeIndex;
//8 bytes
//for child nodes: which mesh sub-part and triangle this leaf refers to.
int m_subPart;
int m_triangleIndex;
//pad the size to 64 bytes
char m_padding[20];
};
///b3BvhSubtreeInfo provides info to gather a subtree of limited size
///b3BvhSubtreeInfo describes one size-limited subtree: its quantized AABB plus
///(in the base data) the root node index and subtree size.
B3_ATTRIBUTE_ALIGNED16(class)
b3BvhSubtreeInfo : public b3BvhSubtreeInfoData
{
public:
	B3_DECLARE_ALIGNED_ALLOCATOR();

	b3BvhSubtreeInfo()
	{
		//memset(&m_padding[0], 0, sizeof(m_padding));
	}
	/// Copies the quantized AABB of the given node into this header.
	void setAabbFromQuantizeNode(const b3QuantizedBvhNode& quantizedNode)
	{
		for (int axis = 0; axis < 3; axis++)
		{
			m_quantizedAabbMin[axis] = quantizedNode.m_quantizedAabbMin[axis];
			m_quantizedAabbMax[axis] = quantizedNode.m_quantizedAabbMax[axis];
		}
	}
};
/// Callback interface invoked for every leaf (sub-part, triangle) pair that a
/// tree query reports as overlapping.
class b3NodeOverlapCallback
{
public:
virtual ~b3NodeOverlapCallback(){};
virtual void processNode(int subPart, int triangleIndex) = 0;
};
#include "Bullet3Common/b3AlignedAllocator.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
///for code readability:
typedef b3AlignedObjectArray<b3OptimizedBvhNode> NodeArray;
typedef b3AlignedObjectArray<b3QuantizedBvhNode> QuantizedNodeArray;
typedef b3AlignedObjectArray<b3BvhSubtreeInfo> BvhSubtreeInfoArray;
///The b3QuantizedBvh class stores an AABB tree that can be quickly traversed on CPU and Cell SPU.
///It is used by the b3BvhTriangleMeshShape as midphase
///It is recommended to use quantization for better performance and lower memory requirements.
B3_ATTRIBUTE_ALIGNED16(class)
b3QuantizedBvh
{
public:
	///Traversal strategies; see setTraversalMode (only implemented for quantized trees).
	enum b3TraversalMode
	{
		TRAVERSAL_STACKLESS = 0,
		TRAVERSAL_STACKLESS_CACHE_FRIENDLY,
		TRAVERSAL_RECURSIVE
	};

	b3Vector3 m_bvhAabbMin;       //world-space bounds covered by the quantization grid
	b3Vector3 m_bvhAabbMax;
	b3Vector3 m_bvhQuantization;  //per-axis scale mapping world space into 16-bit quantized space

protected:
	int m_bulletVersion;  //for serialization versioning. It could also be used to detect endianess.

	int m_curNodeIndex;
	//quantization data
	bool m_useQuantization;  //selects between the quantized and the full-precision node arrays below

	NodeArray m_leafNodes;
	NodeArray m_contiguousNodes;
	QuantizedNodeArray m_quantizedLeafNodes;
	QuantizedNodeArray m_quantizedContiguousNodes;

	b3TraversalMode m_traversalMode;
	BvhSubtreeInfoArray m_SubtreeHeaders;

	//This is only used for serialization so we don't have to add serialization directly to b3AlignedObjectArray
	mutable int m_subtreeHeaderCount;

	///two versions, one for quantized and normal nodes. This allows code-reuse while maintaining readability (no template/macro!)
	///this might be refactored into a virtual, it is usually not calculated at run-time
	void setInternalNodeAabbMin(int nodeIndex, const b3Vector3& aabbMin)
	{
		if (m_useQuantization)
		{
			quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0], aabbMin, 0);
		}
		else
		{
			m_contiguousNodes[nodeIndex].m_aabbMinOrg = aabbMin;
		}
	}
	void setInternalNodeAabbMax(int nodeIndex, const b3Vector3& aabbMax)
	{
		if (m_useQuantization)
		{
			quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0], aabbMax, 1);
		}
		else
		{
			m_contiguousNodes[nodeIndex].m_aabbMaxOrg = aabbMax;
		}
	}

	//NOTE: the getters below read from the LEAF node arrays, while the setters above write the contiguous (internal) arrays.
	b3Vector3 getAabbMin(int nodeIndex) const
	{
		if (m_useQuantization)
		{
			return unQuantize(&m_quantizedLeafNodes[nodeIndex].m_quantizedAabbMin[0]);
		}
		//non-quantized
		return m_leafNodes[nodeIndex].m_aabbMinOrg;
	}
	b3Vector3 getAabbMax(int nodeIndex) const
	{
		if (m_useQuantization)
		{
			return unQuantize(&m_quantizedLeafNodes[nodeIndex].m_quantizedAabbMax[0]);
		}
		//non-quantized
		return m_leafNodes[nodeIndex].m_aabbMaxOrg;
	}

	void setInternalNodeEscapeIndex(int nodeIndex, int escapeIndex)
	{
		if (m_useQuantization)
		{
			//quantized nodes store the escape index negated, to distinguish it from a leaf triangle index
			m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = -escapeIndex;
		}
		else
		{
			m_contiguousNodes[nodeIndex].m_escapeIndex = escapeIndex;
		}
	}

	///Grow the node's AABB so it also encloses [newAabbMin, newAabbMax].
	void mergeInternalNodeAabb(int nodeIndex, const b3Vector3& newAabbMin, const b3Vector3& newAabbMax)
	{
		if (m_useQuantization)
		{
			unsigned short int quantizedAabbMin[3];
			unsigned short int quantizedAabbMax[3];
			quantize(quantizedAabbMin, newAabbMin, 0);
			quantize(quantizedAabbMax, newAabbMax, 1);
			for (int i = 0; i < 3; i++)
			{
				if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] > quantizedAabbMin[i])
					m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] = quantizedAabbMin[i];

				if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] < quantizedAabbMax[i])
					m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] = quantizedAabbMax[i];
			}
		}
		else
		{
			//non-quantized
			m_contiguousNodes[nodeIndex].m_aabbMinOrg.setMin(newAabbMin);
			m_contiguousNodes[nodeIndex].m_aabbMaxOrg.setMax(newAabbMax);
		}
	}

	void swapLeafNodes(int firstIndex, int secondIndex);

	void assignInternalNodeFromLeafNode(int internalNode, int leafNodeIndex);

protected:
	void buildTree(int startIndex, int endIndex);

	int calcSplittingAxis(int startIndex, int endIndex);

	int sortAndCalcSplittingIndex(int startIndex, int endIndex, int splitAxis);

	void walkStacklessTree(b3NodeOverlapCallback * nodeCallback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;

	void walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex, int endNodeIndex) const;
	void walkStacklessQuantizedTree(b3NodeOverlapCallback * nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax, int startNodeIndex, int endNodeIndex) const;
	void walkStacklessTreeAgainstRay(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex, int endNodeIndex) const;

	///tree traversal designed for small-memory processors like PS3 SPU
	void walkStacklessQuantizedTreeCacheFriendly(b3NodeOverlapCallback * nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax) const;

	///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal
	void walkRecursiveQuantizedTreeAgainstQueryAabb(const b3QuantizedBvhNode* currentNode, b3NodeOverlapCallback* nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax) const;

	///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal
	void walkRecursiveQuantizedTreeAgainstQuantizedTree(const b3QuantizedBvhNode* treeNodeA, const b3QuantizedBvhNode* treeNodeB, b3NodeOverlapCallback* nodeCallback) const;

	void updateSubtreeHeaders(int leftChildNodexIndex, int rightChildNodexIndex);

public:
	B3_DECLARE_ALIGNED_ALLOCATOR();

	b3QuantizedBvh();

	virtual ~b3QuantizedBvh();

	///***************************************** expert/internal use only *************************
	void setQuantizationValues(const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax, b3Scalar quantizationMargin = b3Scalar(1.0));
	QuantizedNodeArray& getLeafNodeArray() { return m_quantizedLeafNodes; }
	///buildInternal is expert use only: assumes that setQuantizationValues and LeafNodeArray are initialized
	void buildInternal();
	///***************************************** expert/internal use only *************************

	void reportAabbOverlappingNodex(b3NodeOverlapCallback * nodeCallback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;
	void reportRayOverlappingNodex(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget) const;
	void reportBoxCastOverlappingNodex(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;

	///Quantize 'point' into 16-bit grid coordinates. The point MUST lie inside [m_bvhAabbMin, m_bvhAabbMax]
	///(asserted below); use quantizeWithClamp for unclamped input.
	B3_FORCE_INLINE void quantize(unsigned short* out, const b3Vector3& point, int isMax) const
	{
		b3Assert(m_useQuantization);

		b3Assert(point.getX() <= m_bvhAabbMax.getX());
		b3Assert(point.getY() <= m_bvhAabbMax.getY());
		b3Assert(point.getZ() <= m_bvhAabbMax.getZ());

		b3Assert(point.getX() >= m_bvhAabbMin.getX());
		b3Assert(point.getY() >= m_bvhAabbMin.getY());
		b3Assert(point.getZ() >= m_bvhAabbMin.getZ());

		b3Vector3 v = (point - m_bvhAabbMin) * m_bvhQuantization;
		///Make sure rounding is done in a way that unQuantize(quantizeWithClamp(...)) is conservative
		///end-points always set the first bit, so that they are sorted properly (so that neighbouring AABBs overlap properly)
		///@todo: double-check this
		if (isMax)
		{
			//max bounds: round up and force the low bit to 1
			out[0] = (unsigned short)(((unsigned short)(v.getX() + b3Scalar(1.)) | 1));
			out[1] = (unsigned short)(((unsigned short)(v.getY() + b3Scalar(1.)) | 1));
			out[2] = (unsigned short)(((unsigned short)(v.getZ() + b3Scalar(1.)) | 1));
		}
		else
		{
			//min bounds: truncate and force the low bit to 0
			out[0] = (unsigned short)(((unsigned short)(v.getX()) & 0xfffe));
			out[1] = (unsigned short)(((unsigned short)(v.getY()) & 0xfffe));
			out[2] = (unsigned short)(((unsigned short)(v.getZ()) & 0xfffe));
		}

#ifdef DEBUG_CHECK_DEQUANTIZATION
		b3Vector3 newPoint = unQuantize(out);
		if (isMax)
		{
			if (newPoint.getX() < point.getX())
			{
				printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n", newPoint.getX() - point.getX(), newPoint.getX(), point.getX());
			}
			if (newPoint.getY() < point.getY())
			{
				printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n", newPoint.getY() - point.getY(), newPoint.getY(), point.getY());
			}
			if (newPoint.getZ() < point.getZ())
			{
				printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n", newPoint.getZ() - point.getZ(), newPoint.getZ(), point.getZ());
			}
		}
		else
		{
			if (newPoint.getX() > point.getX())
			{
				printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n", newPoint.getX() - point.getX(), newPoint.getX(), point.getX());
			}
			if (newPoint.getY() > point.getY())
			{
				printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n", newPoint.getY() - point.getY(), newPoint.getY(), point.getY());
			}
			if (newPoint.getZ() > point.getZ())
			{
				printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n", newPoint.getZ() - point.getZ(), newPoint.getZ(), point.getZ());
			}
		}
#endif  //DEBUG_CHECK_DEQUANTIZATION
	}

	///Like quantize(), but first clamps the point into the BVH bounds, so any input is valid.
	B3_FORCE_INLINE void quantizeWithClamp(unsigned short* out, const b3Vector3& point2, int isMax) const
	{
		b3Assert(m_useQuantization);

		b3Vector3 clampedPoint(point2);
		clampedPoint.setMax(m_bvhAabbMin);
		clampedPoint.setMin(m_bvhAabbMax);

		quantize(out, clampedPoint, isMax);
	}

	///Inverse of quantize(): maps 16-bit grid coordinates back to a world-space position.
	B3_FORCE_INLINE b3Vector3 unQuantize(const unsigned short* vecIn) const
	{
		b3Vector3 vecOut;
		vecOut.setValue(
			(b3Scalar)(vecIn[0]) / (m_bvhQuantization.getX()),
			(b3Scalar)(vecIn[1]) / (m_bvhQuantization.getY()),
			(b3Scalar)(vecIn[2]) / (m_bvhQuantization.getZ()));
		vecOut += m_bvhAabbMin;
		return vecOut;
	}

	///setTraversalMode let's you choose between stackless, recursive or stackless cache friendly tree traversal. Note this is only implemented for quantized trees.
	void setTraversalMode(b3TraversalMode traversalMode)
	{
		m_traversalMode = traversalMode;
	}

	B3_FORCE_INLINE QuantizedNodeArray& getQuantizedNodeArray()
	{
		return m_quantizedContiguousNodes;
	}

	B3_FORCE_INLINE BvhSubtreeInfoArray& getSubtreeInfoArray()
	{
		return m_SubtreeHeaders;
	}

	////////////////////////////////////////////////////////////////////

	/////Calculate space needed to store BVH for serialization
	unsigned calculateSerializeBufferSize() const;

	/// Data buffer MUST be 16 byte aligned
	virtual bool serialize(void* o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const;

	///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
	static b3QuantizedBvh* deSerializeInPlace(void* i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);

	static unsigned int getAlignmentSerializationPadding();
	//////////////////////////////////////////////////////////////////////

	virtual int calculateSerializeBufferSizeNew() const;

	///fills the dataBuffer and returns the struct name (and 0 on failure)
	virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const;

	virtual void deSerializeFloat(struct b3QuantizedBvhFloatData & quantizedBvhFloatData);

	virtual void deSerializeDouble(struct b3QuantizedBvhDoubleData & quantizedBvhDoubleData);

	////////////////////////////////////////////////////////////////////

	B3_FORCE_INLINE bool isQuantized()
	{
		return m_useQuantization;
	}

private:
	// Special "copy" constructor that allows for in-place deserialization
	// Prevents b3Vector3's default constructor from being called, but doesn't inialize much else
	// ownsMemory should most likely be false if deserializing, and if you are not, don't call this (it also changes the function signature, which we need)
	b3QuantizedBvh(b3QuantizedBvh & other, bool ownsMemory);
};
///Float-precision serialization layout for b3OptimizedBvhNode.
///NOTE(review): presumably must stay in sync with the serialized DNA — do not reorder fields.
struct b3OptimizedBvhNodeFloatData
{
	b3Vector3FloatData m_aabbMinOrg;
	b3Vector3FloatData m_aabbMaxOrg;
	int m_escapeIndex;
	int m_subPart;
	int m_triangleIndex;
	char m_pad[4];
};
///Double-precision serialization layout for b3OptimizedBvhNode (mirrors the float variant above).
struct b3OptimizedBvhNodeDoubleData
{
	b3Vector3DoubleData m_aabbMinOrg;
	b3Vector3DoubleData m_aabbMaxOrg;
	int m_escapeIndex;
	int m_subPart;
	int m_triangleIndex;
	char m_pad[4];
};
///Float-precision serialization snapshot of a b3QuantizedBvh (see deSerializeFloat).
struct b3QuantizedBvhFloatData
{
	b3Vector3FloatData m_bvhAabbMin;
	b3Vector3FloatData m_bvhAabbMax;
	b3Vector3FloatData m_bvhQuantization;
	int m_curNodeIndex;
	int m_useQuantization;
	int m_numContiguousLeafNodes;
	int m_numQuantizedContiguousNodes;
	b3OptimizedBvhNodeFloatData* m_contiguousNodesPtr;
	b3QuantizedBvhNodeData* m_quantizedContiguousNodesPtr;
	b3BvhSubtreeInfoData* m_subTreeInfoPtr;
	int m_traversalMode;
	int m_numSubtreeHeaders;
};
///Double-precision serialization snapshot of a b3QuantizedBvh (see deSerializeDouble).
///NOTE(review): field order intentionally differs slightly from the float variant
///(m_subTreeInfoPtr is last here) — do not "fix" without updating the serialized layout.
struct b3QuantizedBvhDoubleData
{
	b3Vector3DoubleData m_bvhAabbMin;
	b3Vector3DoubleData m_bvhAabbMax;
	b3Vector3DoubleData m_bvhQuantization;
	int m_curNodeIndex;
	int m_useQuantization;
	int m_numContiguousLeafNodes;
	int m_numQuantizedContiguousNodes;
	b3OptimizedBvhNodeDoubleData* m_contiguousNodesPtr;
	b3QuantizedBvhNodeData* m_quantizedContiguousNodesPtr;
	int m_traversalMode;
	int m_numSubtreeHeaders;
	b3BvhSubtreeInfoData* m_subTreeInfoPtr;
};
///Size in bytes of the serialization struct (b3QuantizedBvhData resolves to the
///float or double variant depending on the build's precision).
B3_FORCE_INLINE int b3QuantizedBvh::calculateSerializeBufferSizeNew() const
{
	return sizeof(b3QuantizedBvhData);
}
#endif //B3_QUANTIZED_BVH_H

View file

@ -1,207 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3StridingMeshInterface.h"
///Out-of-line virtual destructor (anchors the vtable in this translation unit).
b3StridingMeshInterface::~b3StridingMeshInterface()
{
}
void b3StridingMeshInterface::InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const
{
(void)aabbMin;
(void)aabbMax;
int numtotalphysicsverts = 0;
int part, graphicssubparts = getNumSubParts();
const unsigned char* vertexbase;
const unsigned char* indexbase;
int indexstride;
PHY_ScalarType type;
PHY_ScalarType gfxindextype;
int stride, numverts, numtriangles;
int gfxindex;
b3Vector3 triangle[3];
b3Vector3 meshScaling = getScaling();
///if the number of parts is big, the performance might drop due to the innerloop switch on indextype
for (part = 0; part < graphicssubparts; part++)
{
getLockedReadOnlyVertexIndexBase(&vertexbase, numverts, type, stride, &indexbase, indexstride, numtriangles, gfxindextype, part);
numtotalphysicsverts += numtriangles * 3; //upper bound
///unlike that developers want to pass in double-precision meshes in single-precision Bullet build
///so disable this feature by default
///see patch http://code.google.com/p/bullet/issues/detail?id=213
switch (type)
{
case PHY_FLOAT:
{
float* graphicsbase;
switch (gfxindextype)
{
case PHY_INTEGER:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned int* tri_indices = (unsigned int*)(indexbase + gfxindex * indexstride);
graphicsbase = (float*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
case PHY_SHORT:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned short int* tri_indices = (unsigned short int*)(indexbase + gfxindex * indexstride);
graphicsbase = (float*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
case PHY_UCHAR:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned char* tri_indices = (unsigned char*)(indexbase + gfxindex * indexstride);
graphicsbase = (float*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
default:
b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));
}
break;
}
case PHY_DOUBLE:
{
double* graphicsbase;
switch (gfxindextype)
{
case PHY_INTEGER:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned int* tri_indices = (unsigned int*)(indexbase + gfxindex * indexstride);
graphicsbase = (double*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
case PHY_SHORT:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned short int* tri_indices = (unsigned short int*)(indexbase + gfxindex * indexstride);
graphicsbase = (double*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
case PHY_UCHAR:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned char* tri_indices = (unsigned char*)(indexbase + gfxindex * indexstride);
graphicsbase = (double*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
default:
b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));
}
break;
}
default:
b3Assert((type == PHY_FLOAT) || (type == PHY_DOUBLE));
}
unLockReadOnlyVertexBase(part);
}
}
///Computes a tight AABB for the whole mesh by visiting every triangle once.
///On return, aabbMin/aabbMax hold the accumulated bounds.
void b3StridingMeshInterface::calculateAabbBruteForce(b3Vector3& aabbMin, b3Vector3& aabbMax)
{
	//local visitor that grows its bounds with every triangle it sees
	struct BoundsAccumulator : public b3InternalTriangleIndexCallback
	{
		b3Vector3 m_aabbMin;
		b3Vector3 m_aabbMax;

		BoundsAccumulator()
		{
			//start inverted so the first vertex initializes the bounds
			m_aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
			m_aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
		}

		virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex)
		{
			(void)partId;
			(void)triangleIndex;
			for (int v = 0; v < 3; v++)
			{
				m_aabbMin.setMin(triangle[v]);
				m_aabbMax.setMax(triangle[v]);
			}
		}
	};

	//first calculate the total aabb for all triangles
	BoundsAccumulator bounds;

	//pass an unbounded query AABB so that every triangle is visited
	aabbMin.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
	aabbMax.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));

	InternalProcessAllTriangles(&bounds, aabbMin, aabbMax);

	aabbMin = bounds.m_aabbMin;
	aabbMax = bounds.m_aabbMax;
}

View file

@ -1,158 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_STRIDING_MESHINTERFACE_H
#define B3_STRIDING_MESHINTERFACE_H
#include "Bullet3Common/b3Vector3.h"
#include "b3TriangleCallback.h"
//#include "b3ConcaveShape.h"
///Element types for vertex and index streams passed through b3StridingMeshInterface.
enum PHY_ScalarType
{
	PHY_FLOAT,         //32-bit float vertices
	PHY_DOUBLE,        //64-bit double vertices
	PHY_INTEGER,       //32-bit indices
	PHY_SHORT,         //16-bit indices
	PHY_FIXEDPOINT88,  //8.8 fixed point
	PHY_UCHAR          //8-bit indices
};
/// The b3StridingMeshInterface is the interface class for high performance generic access to triangle meshes, used in combination with b3BvhTriangleMeshShape and some other collision shapes.
/// Using index striding of 3*sizeof(integer) it can use triangle arrays, using index striding of 1*sizeof(integer) it can handle triangle strips.
/// It allows for sharing graphics and collision meshes. Also it provides locking/unlocking of graphics meshes that are in gpu memory.
B3_ATTRIBUTE_ALIGNED16(class)
b3StridingMeshInterface
{
protected:
	b3Vector3 m_scaling;  //per-axis scale applied to vertices when triangles are processed

public:
	B3_DECLARE_ALIGNED_ALLOCATOR();

	b3StridingMeshInterface() : m_scaling(b3MakeVector3(b3Scalar(1.), b3Scalar(1.), b3Scalar(1.)))
	{
	}

	virtual ~b3StridingMeshInterface();

	///Visits all triangles of all subparts via 'callback' (the AABB arguments are
	///passed through but this base implementation does not cull with them).
	virtual void InternalProcessAllTriangles(b3InternalTriangleIndexCallback * callback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;

	///brute force method to calculate aabb
	void calculateAabbBruteForce(b3Vector3 & aabbMin, b3Vector3 & aabbMax);

	/// get read and write access to a subpart of a triangle mesh
	/// this subpart has a continuous array of vertices and indices
	/// in this way the mesh can be handled as chunks of memory with striding
	/// very similar to OpenGL vertexarray support
	/// make a call to unLockVertexBase when the read and write access is finished
	virtual void getLockedVertexIndexBase(unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& stride, unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0) = 0;

	virtual void getLockedReadOnlyVertexIndexBase(const unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& stride, const unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0) const = 0;

	/// unLockVertexBase finishes the access to a subpart of the triangle mesh
	/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
	virtual void unLockVertexBase(int subpart) = 0;

	virtual void unLockReadOnlyVertexBase(int subpart) const = 0;

	/// getNumSubParts returns the number of seperate subparts
	/// each subpart has a continuous array of vertices and indices
	virtual int getNumSubParts() const = 0;

	virtual void preallocateVertices(int numverts) = 0;
	virtual void preallocateIndices(int numindices) = 0;

	///optional cached AABB support; base implementation stores nothing
	virtual bool hasPremadeAabb() const { return false; }
	virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax) const
	{
		(void)aabbMin;
		(void)aabbMax;
	}
	virtual void getPremadeAabb(b3Vector3 * aabbMin, b3Vector3 * aabbMax) const
	{
		(void)aabbMin;
		(void)aabbMax;
	}

	const b3Vector3& getScaling() const
	{
		return m_scaling;
	}
	void setScaling(const b3Vector3& scaling)
	{
		m_scaling = scaling;
	}

	virtual int calculateSerializeBufferSize() const;

	///fills the dataBuffer and returns the struct name (and 0 on failure)
	//virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const;
};
///Serialization wrapper for a single 32-bit index.
struct b3IntIndexData
{
	int m_value;
};
///Serialization wrapper for a single 16-bit index (padded to 4 bytes).
struct b3ShortIntIndexData
{
	short m_value;
	char m_pad[2];
};
///Serialization wrapper for one triangle's three 16-bit indices (padded to 8 bytes).
struct b3ShortIntIndexTripletData
{
	short m_values[3];
	char m_pad[2];
};
///Serialization wrapper for one triangle's three 8-bit indices (padded to 4 bytes).
struct b3CharIndexTripletData
{
	unsigned char m_values[3];
	char m_pad;
};
///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
///One mesh subpart: exactly one vertex pointer (float or double) and one index
///pointer (32/16/8-bit) is expected to be non-null.
struct b3MeshPartData
{
	b3Vector3FloatData* m_vertices3f;
	b3Vector3DoubleData* m_vertices3d;

	b3IntIndexData* m_indices32;
	b3ShortIntIndexTripletData* m_3indices16;
	b3CharIndexTripletData* m_3indices8;

	b3ShortIntIndexData* m_indices16;  //backwards compatibility

	int m_numTriangles;  //length of m_indices = m_numTriangles
	int m_numVertices;
};
///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
struct b3StridingMeshInterfaceData
{
	b3MeshPartData* m_meshPartsPtr;  //array of m_numMeshParts subparts
	b3Vector3FloatData m_scaling;
	int m_numMeshParts;
	char m_padding[4];
};
///Size in bytes of the serialization header struct for this interface.
B3_FORCE_INLINE int b3StridingMeshInterface::calculateSerializeBufferSize() const
{
	return sizeof(b3StridingMeshInterfaceData);
}
#endif //B3_STRIDING_MESHINTERFACE_H

View file

@ -1,34 +0,0 @@
#ifndef B3_SUPPORT_MAPPINGS_H
#define B3_SUPPORT_MAPPINGS_H
#include "Bullet3Common/b3Transform.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "b3VectorFloat4.h"
struct b3GjkPairDetector;
///Returns the hull vertex with the largest dot product against 'supportVec'
///(the support point of the convex hull in that direction), or the zero vector
///for an empty hull.
///NOTE(review): the 'margin' parameter is accepted but not applied here — confirm
///callers expect the un-inflated support point.
inline b3Vector3 localGetSupportVertexWithMargin(const float4& supportVec, const struct b3ConvexPolyhedronData* hull,
												 const b3AlignedObjectArray<b3Vector3>& verticesA, b3Scalar margin)
{
	// Here we take advantage of dot(a, b*c) = dot(a*b, c). Note: This is true mathematically, but not numerically.
	if (hull->m_numVertices > 0)
	{
		b3Scalar bestDot = b3Scalar(-B3_LARGE_FLOAT);
		const b3Vector3 dir = supportVec;
		const int bestIndex = (int)dir.maxDot(&verticesA[hull->m_vertexOffset], hull->m_numVertices, bestDot);
		return verticesA[hull->m_vertexOffset + bestIndex];
	}

	return b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
}
///Convenience wrapper: support point with a zero margin.
inline b3Vector3 localGetSupportVertexWithoutMargin(const float4& supportVec, const struct b3ConvexPolyhedronData* hull,
													const b3AlignedObjectArray<b3Vector3>& verticesA)
{
	return localGetSupportVertexWithMargin(supportVec, hull, verticesA, 0.f);
}
#endif //B3_SUPPORT_MAPPINGS_H

View file

@ -1,24 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3TriangleCallback.h"
///Out-of-line virtual destructor (anchors the vtable in this translation unit).
b3TriangleCallback::~b3TriangleCallback()
{
}
///Out-of-line virtual destructor (anchors the vtable in this translation unit).
b3InternalTriangleIndexCallback::~b3InternalTriangleIndexCallback()
{
}

View file

@ -1,37 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_TRIANGLE_CALLBACK_H
#define B3_TRIANGLE_CALLBACK_H
#include "Bullet3Common/b3Vector3.h"
///The b3TriangleCallback provides a callback for each overlapping triangle when calling processAllTriangles.
///This callback is called by processAllTriangles for all b3ConcaveShape derived class, such as b3BvhTriangleMeshShape, b3StaticPlaneShape and b3HeightfieldTerrainShape.
class b3TriangleCallback
{
public:
	virtual ~b3TriangleCallback();
	///'triangle' points at the three world/local-space vertices of one triangle.
	virtual void processTriangle(b3Vector3* triangle, int partId, int triangleIndex) = 0;
};
///Internal per-triangle visitor used by b3StridingMeshInterface::InternalProcessAllTriangles.
class b3InternalTriangleIndexCallback
{
public:
	virtual ~b3InternalTriangleIndexCallback();
	///'triangle' holds the three scaled vertices; (partId, triangleIndex) identify the source triangle.
	virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex) = 0;
};
#endif //B3_TRIANGLE_CALLBACK_H

View file

@ -1,90 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3TriangleIndexVertexArray.h"
///Convenience constructor: wraps one raw triangle-index/vertex pair as a
///single-subpart indexed mesh. No cached AABB yet (m_hasAabb = 0).
b3TriangleIndexVertexArray::b3TriangleIndexVertexArray(int numTriangles, int* triangleIndexBase, int triangleIndexStride, int numVertices, b3Scalar* vertexBase, int vertexStride)
	: m_hasAabb(0)
{
	b3IndexedMesh singlePart;

	//index stream
	singlePart.m_numTriangles = numTriangles;
	singlePart.m_triangleIndexBase = (const unsigned char*)triangleIndexBase;
	singlePart.m_triangleIndexStride = triangleIndexStride;

	//vertex stream
	singlePart.m_numVertices = numVertices;
	singlePart.m_vertexBase = (const unsigned char*)vertexBase;
	singlePart.m_vertexStride = vertexStride;

	addIndexedMesh(singlePart);
}
///The array does not own the index/vertex memory it references, so nothing to free.
b3TriangleIndexVertexArray::~b3TriangleIndexVertexArray()
{
}
void b3TriangleIndexVertexArray::getLockedVertexIndexBase(unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart)
{
b3Assert(subpart < getNumSubParts());
b3IndexedMesh& mesh = m_indexedMeshes[subpart];
numverts = mesh.m_numVertices;
(*vertexbase) = (unsigned char*)mesh.m_vertexBase;
type = mesh.m_vertexType;
vertexStride = mesh.m_vertexStride;
numfaces = mesh.m_numTriangles;
(*indexbase) = (unsigned char*)mesh.m_triangleIndexBase;
indexstride = mesh.m_triangleIndexStride;
indicestype = mesh.m_indexType;
}
void b3TriangleIndexVertexArray::getLockedReadOnlyVertexIndexBase(const unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, const unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart) const
{
const b3IndexedMesh& mesh = m_indexedMeshes[subpart];
numverts = mesh.m_numVertices;
(*vertexbase) = (const unsigned char*)mesh.m_vertexBase;
type = mesh.m_vertexType;
vertexStride = mesh.m_vertexStride;
numfaces = mesh.m_numTriangles;
(*indexbase) = (const unsigned char*)mesh.m_triangleIndexBase;
indexstride = mesh.m_triangleIndexStride;
indicestype = mesh.m_indexType;
}
// True once setPremadeAabb has cached bounds (m_hasAabb is an int flag, see header).
bool b3TriangleIndexVertexArray::hasPremadeAabb() const
{
	return (m_hasAabb == 1);
}
// Caches a precomputed AABB so later queries can skip recomputation.
// Members are mutable, hence the const qualifier.
void b3TriangleIndexVertexArray::setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax) const
{
	m_aabbMin = aabbMin;
	m_aabbMax = aabbMax;
	m_hasAabb = 1;  // this is intentionally an int see notes in header
}
// Returns the cached AABB. Only meaningful after hasPremadeAabb() is true;
// otherwise the out-params receive default-initialized vectors.
void b3TriangleIndexVertexArray::getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax) const
{
	*aabbMin = m_aabbMin;
	*aabbMax = m_aabbMax;
}

View file

@ -1,128 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_TRIANGLE_INDEX_VERTEX_ARRAY_H
#define B3_TRIANGLE_INDEX_VERTEX_ARRAY_H
#include "b3StridingMeshInterface.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Common/b3Scalar.h"
///The b3IndexedMesh indexes a single vertex and index array. Multiple b3IndexedMesh objects can be passed into a b3TriangleIndexVertexArray using addIndexedMesh.
///Instead of the number of indices, we pass the number of triangles.
///All pointers are non-owning views into caller-managed memory.
B3_ATTRIBUTE_ALIGNED16(struct)
b3IndexedMesh
{
	B3_DECLARE_ALIGNED_ALLOCATOR();

	int m_numTriangles;
	const unsigned char* m_triangleIndexBase;
	// Size in byte of the indices for one triangle (3*sizeof(index_type) if the indices are tightly packed)
	int m_triangleIndexStride;
	int m_numVertices;
	const unsigned char* m_vertexBase;
	// Size of a vertex, in bytes
	int m_vertexStride;

	// The index type is set when adding an indexed mesh to the
	// b3TriangleIndexVertexArray, do not set it manually
	PHY_ScalarType m_indexType;

	// The vertex type has a default type similar to Bullet's precision mode (float or double)
	// but can be set manually if you for example run Bullet with double precision but have
	// mesh data in single precision..
	PHY_ScalarType m_vertexType;

	b3IndexedMesh()
		: m_indexType(PHY_INTEGER),
#ifdef B3_USE_DOUBLE_PRECISION
		  m_vertexType(PHY_DOUBLE)
#else  // B3_USE_DOUBLE_PRECISION
		  m_vertexType(PHY_FLOAT)
#endif  // B3_USE_DOUBLE_PRECISION
	{
	}
};
typedef b3AlignedObjectArray<b3IndexedMesh> IndexedMeshArray;
///The b3TriangleIndexVertexArray allows to access multiple triangle meshes, by indexing into existing triangle/index arrays.
///Additional meshes can be added using addIndexedMesh
///No duplcate is made of the vertex/index data, it only indexes into external vertex/index arrays.
///So keep those arrays around during the lifetime of this b3TriangleIndexVertexArray.
B3_ATTRIBUTE_ALIGNED16(class)
b3TriangleIndexVertexArray : public b3StridingMeshInterface
{
protected:
	IndexedMeshArray m_indexedMeshes;
	int m_pad[2];
	mutable int m_hasAabb;  // using int instead of bool to maintain alignment
	mutable b3Vector3 m_aabbMin;
	mutable b3Vector3 m_aabbMax;

public:
	B3_DECLARE_ALIGNED_ALLOCATOR();

	b3TriangleIndexVertexArray() : m_hasAabb(0)
	{
	}

	virtual ~b3TriangleIndexVertexArray();

	//just to be backwards compatible
	b3TriangleIndexVertexArray(int numTriangles, int* triangleIndexBase, int triangleIndexStride, int numVertices, b3Scalar* vertexBase, int vertexStride);

	/// Appends a mesh part; the part's index type is overwritten with indexType.
	void addIndexedMesh(const b3IndexedMesh& mesh, PHY_ScalarType indexType = PHY_INTEGER)
	{
		m_indexedMeshes.push_back(mesh);
		m_indexedMeshes[m_indexedMeshes.size() - 1].m_indexType = indexType;
	}

	virtual void getLockedVertexIndexBase(unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0);

	virtual void getLockedReadOnlyVertexIndexBase(const unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, const unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0) const;

	/// unLockVertexBase finishes the access to a subpart of the triangle mesh
	/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
	/// No-op here: the data is plain memory, nothing to lock.
	virtual void unLockVertexBase(int subpart) { (void)subpart; }

	virtual void unLockReadOnlyVertexBase(int subpart) const { (void)subpart; }

	/// getNumSubParts returns the number of seperate subparts
	/// each subpart has a continuous array of vertices and indices
	virtual int getNumSubParts() const
	{
		return (int)m_indexedMeshes.size();
	}

	IndexedMeshArray& getIndexedMeshArray()
	{
		return m_indexedMeshes;
	}

	const IndexedMeshArray& getIndexedMeshArray() const
	{
		return m_indexedMeshes;
	}

	// Preallocation hints are ignored: this class never copies mesh data.
	virtual void preallocateVertices(int numverts) { (void)numverts; }
	virtual void preallocateIndices(int numindices) { (void)numindices; }

	virtual bool hasPremadeAabb() const;
	virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;
	virtual void getPremadeAabb(b3Vector3 * aabbMin, b3Vector3 * aabbMax) const;
};
#endif //B3_TRIANGLE_INDEX_VERTEX_ARRAY_H

View file

@ -1,10 +0,0 @@
#ifndef B3_VECTOR_FLOAT4_H
#define B3_VECTOR_FLOAT4_H
#include "Bullet3Common/b3Transform.h"
//#define cross3(a,b) (a.cross(b))
#define float4 b3Vector3
//#define make_float4(x,y,z,w) b3Vector4(x,y,z,w)
#endif //B3_VECTOR_FLOAT4_H

View file

@ -1,574 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Elsevier CDROM license agreements grants nonexclusive license to use the software
for any purpose, commercial or non-commercial as long as the following credit is included
identifying the original source of the software:
Parts of the source are "from the book Real-Time Collision Detection by
Christer Ericson, published by Morgan Kaufmann Publishers,
(c) 2005 Elsevier Inc."
*/
#include "b3VoronoiSimplexSolver.h"
#define VERTA 0
#define VERTB 1
#define VERTC 2
#define VERTD 3
#define B3_CATCH_DEGENERATE_TETRAHEDRON 1
/// Removes simplex vertex `index` in O(1) by swap-with-last: the final
/// vertex overwrites the removed slot and the count shrinks by one.
/// Remaining vertex order is not preserved.
void b3VoronoiSimplexSolver::removeVertex(int index)
{
	b3Assert(m_numVertices > 0);
	const int last = m_numVertices - 1;
	m_simplexVectorW[index] = m_simplexVectorW[last];
	m_simplexPointsP[index] = m_simplexPointsP[last];
	m_simplexPointsQ[index] = m_simplexPointsQ[last];
	m_numVertices = last;
}
/// Drops every simplex vertex not flagged as used by the latest closest-point
/// query. Slots are processed from highest (D) to lowest (A) so that the
/// swap-with-last removal never moves a slot that is still to be examined.
void b3VoronoiSimplexSolver::reduceVertices(const b3UsageBitfield& usedVerts)
{
	const bool keep[4] = {usedVerts.usedVertexA != 0, usedVerts.usedVertexB != 0,
						  usedVerts.usedVertexC != 0, usedVerts.usedVertexD != 0};
	for (int slot = 3; slot >= 0; slot--)
	{
		if ((numVertices() >= slot + 1) && !keep[slot])
			removeVertex(slot);
	}
}
//clear the simplex, remove all the vertices
//clear the simplex, remove all the vertices and invalidate cached results
void b3VoronoiSimplexSolver::reset()
{
	m_numVertices = 0;
	m_cachedValidClosest = false;
	m_needsUpdate = true;
	m_cachedBC.reset();
	// Sentinel "last witness" far away so inSimplex() cannot spuriously match.
	m_lastW = b3MakeVector3(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
}
//add a vertex
//add a vertex: w is the support point (p - q), p and q the witness points
//on the two objects. Marks the cached closest-point data stale.
void b3VoronoiSimplexSolver::addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q)
{
	const int slot = m_numVertices++;
	m_needsUpdate = true;
	m_lastW = w;
	m_simplexVectorW[slot] = w;
	m_simplexPointsP[slot] = p;
	m_simplexPointsQ[slot] = q;
}
/// Recomputes the point of the current simplex closest to the origin, the
/// matching witness points m_cachedP1/m_cachedP2 on each object, and
/// m_cachedV = P1 - P2. Results are cached; work is skipped unless
/// m_needsUpdate is set. Also shrinks the simplex to the vertices actually
/// supporting the closest feature (reduceVertices).
/// @return true if the cached barycentric coordinates are valid.
bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints()
{
	if (m_needsUpdate)
	{
		m_cachedBC.reset();

		m_needsUpdate = false;

		switch (numVertices())
		{
			case 0:
				// Empty simplex: nothing to project.
				m_cachedValidClosest = false;
				break;
			case 1:
			{
				// Single vertex: it is trivially the closest feature.
				m_cachedP1 = m_simplexPointsP[0];
				m_cachedP2 = m_simplexPointsQ[0];
				m_cachedV = m_cachedP1 - m_cachedP2;  //== m_simplexVectorW[0]
				m_cachedBC.reset();
				m_cachedBC.setBarycentricCoordinates(b3Scalar(1.), b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
				m_cachedValidClosest = m_cachedBC.isValid();
				break;
			};
			case 2:
			{
				//closest point origin from line segment
				// Project the origin onto segment [from,to]; t is clamped to
				// [0,1], and the usage bitfield records which endpoints support
				// the projection so the simplex can be reduced.
				const b3Vector3& from = m_simplexVectorW[0];
				const b3Vector3& to = m_simplexVectorW[1];
				b3Vector3 nearest;

				b3Vector3 p = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
				b3Vector3 diff = p - from;
				b3Vector3 v = to - from;
				b3Scalar t = v.dot(diff);

				if (t > 0)
				{
					b3Scalar dotVV = v.dot(v);
					if (t < dotVV)
					{
						// Interior of the segment.
						t /= dotVV;
						diff -= t * v;
						m_cachedBC.m_usedVertices.usedVertexA = true;
						m_cachedBC.m_usedVertices.usedVertexB = true;
					}
					else
					{
						t = 1;
						diff -= v;
						//reduce to 1 point
						m_cachedBC.m_usedVertices.usedVertexB = true;
					}
				}
				else
				{
					t = 0;
					//reduce to 1 point
					m_cachedBC.m_usedVertices.usedVertexA = true;
				}
				m_cachedBC.setBarycentricCoordinates(1 - t, t);
				nearest = from + t * v;

				m_cachedP1 = m_simplexPointsP[0] + t * (m_simplexPointsP[1] - m_simplexPointsP[0]);
				m_cachedP2 = m_simplexPointsQ[0] + t * (m_simplexPointsQ[1] - m_simplexPointsQ[0]);
				m_cachedV = m_cachedP1 - m_cachedP2;

				reduceVertices(m_cachedBC.m_usedVertices);

				m_cachedValidClosest = m_cachedBC.isValid();
				break;
			}
			case 3:
			{
				//closest point origin from triangle
				// Delegates region classification to closestPtPointTriangle,
				// then interpolates the witness points with the barycentric
				// coordinates it produced.
				b3Vector3 p = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));

				const b3Vector3& a = m_simplexVectorW[0];
				const b3Vector3& b = m_simplexVectorW[1];
				const b3Vector3& c = m_simplexVectorW[2];

				closestPtPointTriangle(p, a, b, c, m_cachedBC);
				m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] +
							 m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] +
							 m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2];

				m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] +
							 m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] +
							 m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2];

				m_cachedV = m_cachedP1 - m_cachedP2;

				reduceVertices(m_cachedBC.m_usedVertices);
				m_cachedValidClosest = m_cachedBC.isValid();

				break;
			}
			case 4:
			{
				// Tetrahedron: if the origin lies inside, the objects
				// penetrate and the closest vector is zero.
				b3Vector3 p = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));

				const b3Vector3& a = m_simplexVectorW[0];
				const b3Vector3& b = m_simplexVectorW[1];
				const b3Vector3& c = m_simplexVectorW[2];
				const b3Vector3& d = m_simplexVectorW[3];

				bool hasSeperation = closestPtPointTetrahedron(p, a, b, c, d, m_cachedBC);

				if (hasSeperation)
				{
					m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] +
								 m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] +
								 m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2] +
								 m_simplexPointsP[3] * m_cachedBC.m_barycentricCoords[3];

					m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] +
								 m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] +
								 m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2] +
								 m_simplexPointsQ[3] * m_cachedBC.m_barycentricCoords[3];

					m_cachedV = m_cachedP1 - m_cachedP2;
					reduceVertices(m_cachedBC.m_usedVertices);
				}
				else
				{
					//					printf("sub distance got penetration\n");

					if (m_cachedBC.m_degenerate)
					{
						// Degenerate (near-flat) tetrahedron: result unusable.
						m_cachedValidClosest = false;
					}
					else
					{
						m_cachedValidClosest = true;
						//degenerate case == false, penetration = true + zero
						m_cachedV.setValue(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
					}
					break;
				}

				m_cachedValidClosest = m_cachedBC.isValid();

				//closest point origin from tetrahedron
				break;
			}
			default:
			{
				m_cachedValidClosest = false;
			}
		};
	}

	return m_cachedValidClosest;
}
//return/calculate the closest vertex
//return/calculate the closest vector from the simplex to the origin
bool b3VoronoiSimplexSolver::closest(b3Vector3& v)
{
	const bool valid = updateClosestVectorAndPoints();
	v = m_cachedV;
	return valid;
}
/// Returns the largest squared length among the current simplex vertices
/// (0 for an empty simplex).
b3Scalar b3VoronoiSimplexSolver::maxVertex()
{
	b3Scalar longest2 = b3Scalar(0.);
	const int count = numVertices();
	for (int idx = 0; idx < count; idx++)
	{
		const b3Scalar len2 = m_simplexVectorW[idx].length2();
		if (len2 > longest2)
			longest2 = len2;
	}
	return longest2;
}
//return the current simplex
//return the current simplex: copies the support vectors into yBuf and the
//witness points into pBuf/qBuf; returns the vertex count. Buffers must hold
//at least numVertices() entries.
int b3VoronoiSimplexSolver::getSimplex(b3Vector3* pBuf, b3Vector3* qBuf, b3Vector3* yBuf) const
{
	const int count = numVertices();
	for (int idx = 0; idx < count; idx++)
	{
		yBuf[idx] = m_simplexVectorW[idx];
		pBuf[idx] = m_simplexPointsP[idx];
		qBuf[idx] = m_simplexPointsQ[idx];
	}
	return count;
}
bool b3VoronoiSimplexSolver::inSimplex(const b3Vector3& w)
{
bool found = false;
int i, numverts = numVertices();
//b3Scalar maxV = b3Scalar(0.);
//w is in the current (reduced) simplex
for (i = 0; i < numverts; i++)
{
#ifdef BT_USE_EQUAL_VERTEX_THRESHOLD
if (m_simplexVectorW[i].distance2(w) <= m_equalVertexThreshold)
#else
if (m_simplexVectorW[i] == w)
#endif
found = true;
}
//check in case lastW is already removed
if (w == m_lastW)
return true;
return found;
}
// Hands out the cached closest vector without recomputation; used as a
// fallback when the caller wants the last known-good result.
void b3VoronoiSimplexSolver::backup_closest(b3Vector3& v)
{
	v = m_cachedV;
}
// True when no vertices have been added (or all were removed).
bool b3VoronoiSimplexSolver::emptySimplex() const
{
	return (numVertices() == 0);
}
// Refreshes the cached closest-point data if stale, then returns the witness
// points on object 1 (p1) and object 2 (p2).
void b3VoronoiSimplexSolver::compute_points(b3Vector3& p1, b3Vector3& p2)
{
	updateClosestVectorAndPoints();
	p1 = m_cachedP1;
	p2 = m_cachedP2;
}
/// Closest point on triangle (a,b,c) to point p, via Voronoi-region
/// classification (the Ericson "Real-Time Collision Detection" formulation):
/// test the three vertex regions, the three edge regions, then fall through
/// to the face interior. Fills result with the closest point, barycentric
/// coordinates, and which vertices support the closest feature.
/// Always returns true.
bool b3VoronoiSimplexSolver::closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, b3SubSimplexClosestResult& result)
{
	result.m_usedVertices.reset();

	// Check if P in vertex region outside A
	b3Vector3 ab = b - a;
	b3Vector3 ac = c - a;
	b3Vector3 ap = p - a;
	b3Scalar d1 = ab.dot(ap);
	b3Scalar d2 = ac.dot(ap);
	if (d1 <= b3Scalar(0.0) && d2 <= b3Scalar(0.0))
	{
		result.m_closestPointOnSimplex = a;
		result.m_usedVertices.usedVertexA = true;
		result.setBarycentricCoordinates(1, 0, 0);
		return true;  // a; // barycentric coordinates (1,0,0)
	}

	// Check if P in vertex region outside B
	b3Vector3 bp = p - b;
	b3Scalar d3 = ab.dot(bp);
	b3Scalar d4 = ac.dot(bp);
	if (d3 >= b3Scalar(0.0) && d4 <= d3)
	{
		result.m_closestPointOnSimplex = b;
		result.m_usedVertices.usedVertexB = true;
		result.setBarycentricCoordinates(0, 1, 0);

		return true;  // b; // barycentric coordinates (0,1,0)
	}
	// Check if P in edge region of AB, if so return projection of P onto AB
	b3Scalar vc = d1 * d4 - d3 * d2;
	if (vc <= b3Scalar(0.0) && d1 >= b3Scalar(0.0) && d3 <= b3Scalar(0.0))
	{
		b3Scalar v = d1 / (d1 - d3);
		result.m_closestPointOnSimplex = a + v * ab;
		result.m_usedVertices.usedVertexA = true;
		result.m_usedVertices.usedVertexB = true;
		result.setBarycentricCoordinates(1 - v, v, 0);
		return true;
		//return a + v * ab; // barycentric coordinates (1-v,v,0)
	}

	// Check if P in vertex region outside C
	b3Vector3 cp = p - c;
	b3Scalar d5 = ab.dot(cp);
	b3Scalar d6 = ac.dot(cp);
	if (d6 >= b3Scalar(0.0) && d5 <= d6)
	{
		result.m_closestPointOnSimplex = c;
		result.m_usedVertices.usedVertexC = true;
		result.setBarycentricCoordinates(0, 0, 1);
		return true;  //c; // barycentric coordinates (0,0,1)
	}

	// Check if P in edge region of AC, if so return projection of P onto AC
	b3Scalar vb = d5 * d2 - d1 * d6;
	if (vb <= b3Scalar(0.0) && d2 >= b3Scalar(0.0) && d6 <= b3Scalar(0.0))
	{
		b3Scalar w = d2 / (d2 - d6);
		result.m_closestPointOnSimplex = a + w * ac;
		result.m_usedVertices.usedVertexA = true;
		result.m_usedVertices.usedVertexC = true;
		result.setBarycentricCoordinates(1 - w, 0, w);
		return true;
		//return a + w * ac; // barycentric coordinates (1-w,0,w)
	}

	// Check if P in edge region of BC, if so return projection of P onto BC
	b3Scalar va = d3 * d6 - d5 * d4;
	if (va <= b3Scalar(0.0) && (d4 - d3) >= b3Scalar(0.0) && (d5 - d6) >= b3Scalar(0.0))
	{
		b3Scalar w = (d4 - d3) / ((d4 - d3) + (d5 - d6));

		result.m_closestPointOnSimplex = b + w * (c - b);
		result.m_usedVertices.usedVertexB = true;
		result.m_usedVertices.usedVertexC = true;
		result.setBarycentricCoordinates(0, 1 - w, w);
		return true;
		// return b + w * (c - b); // barycentric coordinates (0,1-w,w)
	}

	// P inside face region. Compute Q through its barycentric coordinates (u,v,w)
	b3Scalar denom = b3Scalar(1.0) / (va + vb + vc);
	b3Scalar v = vb * denom;
	b3Scalar w = vc * denom;

	result.m_closestPointOnSimplex = a + ab * v + ac * w;
	result.m_usedVertices.usedVertexA = true;
	result.m_usedVertices.usedVertexB = true;
	result.m_usedVertices.usedVertexC = true;
	result.setBarycentricCoordinates(1 - v - w, v, w);

	return true;
	// return a + ab * v + ac * w; // = u*a + v*b + w*c, u = va * denom = b3Scalar(1.0) - v - w
}
/// Test if point p and d lie on opposite sides of plane through abc.
/// @return 1 if p and d are on opposite sides, 0 if on the same side,
///         -1 if the tetrahedron is (numerically) degenerate — d lies in or
///         nearly in the plane of abc, so the sign test is unreliable.
int b3VoronoiSimplexSolver::pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d)
{
	b3Vector3 normal = (b - a).cross(c - a);

	b3Scalar signp = (p - a).dot(normal);  // [AP AB AC]
	b3Scalar signd = (d - a).dot(normal);  // [AD AB AC]

#ifdef B3_CATCH_DEGENERATE_TETRAHEDRON
// Threshold scales with precision mode; compares squared value to stay sign-free.
#ifdef BT_USE_DOUBLE_PRECISION
	if (signd * signd < (b3Scalar(1e-8) * b3Scalar(1e-8)))
	{
		return -1;
	}
#else
	if (signd * signd < (b3Scalar(1e-4) * b3Scalar(1e-4)))
	{
		//		printf("affine dependent/degenerate\n");//
		return -1;
	}
#endif

#endif
	// Points on opposite sides if expression signs are opposite
	return signp * signd < b3Scalar(0.);
}
/// Closest point on tetrahedron (a,b,c,d) to point p. Tests each face whose
/// plane separates p from the opposite vertex, keeps the nearest candidate,
/// and remaps each face's triangle result (vertex usage + barycentric coords)
/// back into tetrahedron vertex order.
/// @return false if p is inside the tetrahedron (penetration) or the
///         tetrahedron is degenerate (finalResult.m_degenerate is then set);
///         true otherwise.
bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult)
{
	b3SubSimplexClosestResult tempResult;

	// Start out assuming point inside all halfspaces, so closest to itself
	finalResult.m_closestPointOnSimplex = p;
	finalResult.m_usedVertices.reset();
	finalResult.m_usedVertices.usedVertexA = true;
	finalResult.m_usedVertices.usedVertexB = true;
	finalResult.m_usedVertices.usedVertexC = true;
	finalResult.m_usedVertices.usedVertexD = true;

	int pointOutsideABC = pointOutsideOfPlane(p, a, b, c, d);
	int pointOutsideACD = pointOutsideOfPlane(p, a, c, d, b);
	int pointOutsideADB = pointOutsideOfPlane(p, a, d, b, c);
	int pointOutsideBDC = pointOutsideOfPlane(p, b, d, c, a);

	// Any -1 means a degenerate (near-flat) tetrahedron: bail out.
	if (pointOutsideABC < 0 || pointOutsideACD < 0 || pointOutsideADB < 0 || pointOutsideBDC < 0)
	{
		finalResult.m_degenerate = true;
		return false;
	}

	// Inside all four faces: penetration, no separating closest point.
	if (!pointOutsideABC && !pointOutsideACD && !pointOutsideADB && !pointOutsideBDC)
	{
		return false;
	}

	b3Scalar bestSqDist = FLT_MAX;
	// If point outside face abc then compute closest point on abc
	if (pointOutsideABC)
	{
		closestPtPointTriangle(p, a, b, c, tempResult);
		b3Vector3 q = tempResult.m_closestPointOnSimplex;

		b3Scalar sqDist = (q - p).dot(q - p);
		// Update best closest point if (squared) distance is less than current best
		if (sqDist < bestSqDist)
		{
			bestSqDist = sqDist;
			finalResult.m_closestPointOnSimplex = q;
			//convert result bitmask!
			finalResult.m_usedVertices.reset();
			finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA;
			finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexB;
			finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexC;
			finalResult.setBarycentricCoordinates(
				tempResult.m_barycentricCoords[VERTA],
				tempResult.m_barycentricCoords[VERTB],
				tempResult.m_barycentricCoords[VERTC],
				0);
		}
	}

	// Repeat test for face acd
	// (triangle verts A,B,C here correspond to tetrahedron verts a,c,d)
	if (pointOutsideACD)
	{
		closestPtPointTriangle(p, a, c, d, tempResult);
		b3Vector3 q = tempResult.m_closestPointOnSimplex;
		//convert result bitmask!

		b3Scalar sqDist = (q - p).dot(q - p);
		if (sqDist < bestSqDist)
		{
			bestSqDist = sqDist;
			finalResult.m_closestPointOnSimplex = q;
			finalResult.m_usedVertices.reset();
			finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA;

			finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexB;
			finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexC;
			finalResult.setBarycentricCoordinates(
				tempResult.m_barycentricCoords[VERTA],
				0,
				tempResult.m_barycentricCoords[VERTB],
				tempResult.m_barycentricCoords[VERTC]);
		}
	}

	// Repeat test for face adb
	// (triangle verts A,B,C correspond to tetrahedron verts a,d,b)
	if (pointOutsideADB)
	{
		closestPtPointTriangle(p, a, d, b, tempResult);
		b3Vector3 q = tempResult.m_closestPointOnSimplex;
		//convert result bitmask!

		b3Scalar sqDist = (q - p).dot(q - p);
		if (sqDist < bestSqDist)
		{
			bestSqDist = sqDist;
			finalResult.m_closestPointOnSimplex = q;
			finalResult.m_usedVertices.reset();
			finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA;
			finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexC;

			finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB;
			finalResult.setBarycentricCoordinates(
				tempResult.m_barycentricCoords[VERTA],
				tempResult.m_barycentricCoords[VERTC],
				0,
				tempResult.m_barycentricCoords[VERTB]);
		}
	}
	// Repeat test for face bdc
	// (triangle verts A,B,C correspond to tetrahedron verts b,d,c)
	if (pointOutsideBDC)
	{
		closestPtPointTriangle(p, b, d, c, tempResult);
		b3Vector3 q = tempResult.m_closestPointOnSimplex;
		//convert result bitmask!
		b3Scalar sqDist = (q - p).dot(q - p);
		if (sqDist < bestSqDist)
		{
			bestSqDist = sqDist;
			finalResult.m_closestPointOnSimplex = q;
			finalResult.m_usedVertices.reset();
			//
			finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexA;
			finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexC;
			finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB;

			finalResult.setBarycentricCoordinates(
				0,
				tempResult.m_barycentricCoords[VERTA],
				tempResult.m_barycentricCoords[VERTC],
				tempResult.m_barycentricCoords[VERTB]);
		}
	}

	//help! we ended up full !

	if (finalResult.m_usedVertices.usedVertexA &&
		finalResult.m_usedVertices.usedVertexB &&
		finalResult.m_usedVertices.usedVertexC &&
		finalResult.m_usedVertices.usedVertexD)
	{
		return true;
	}

	return true;
}

View file

@ -1,164 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_VORONOI_SIMPLEX_SOLVER_H
#define B3_VORONOI_SIMPLEX_SOLVER_H
#include "Bullet3Common/b3Vector3.h"
#define VORONOI_SIMPLEX_MAX_VERTS 5
///disable next define, or use defaultCollisionConfiguration->getSimplexSolver()->setEqualVertexThreshold(0.f) to disable/configure
//#define BT_USE_EQUAL_VERTEX_THRESHOLD
#define VORONOI_DEFAULT_EQUAL_VERTEX_THRESHOLD 0.0001f
/// Per-vertex usage flags for a simplex of up to 4 vertices (A..D), packed
/// into single bits. Records which vertices support the closest feature so
/// the simplex can be reduced.
struct b3UsageBitfield
{
	b3UsageBitfield()
	{
		reset();
	}

	// Clear all four usage flags (the unused padding bits are left untouched).
	void reset()
	{
		usedVertexA = false;
		usedVertexB = false;
		usedVertexC = false;
		usedVertexD = false;
	}
	unsigned short usedVertexA : 1;
	unsigned short usedVertexB : 1;
	unsigned short usedVertexC : 1;
	unsigned short usedVertexD : 1;
	// Padding bits, unused.
	unsigned short unused1 : 1;
	unsigned short unused2 : 1;
	unsigned short unused3 : 1;
	unsigned short unused4 : 1;
};
/// Result of a closest-point-on-sub-simplex query: the closest point itself,
/// which simplex vertices support it, the barycentric coordinates (slot i
/// corresponds to vertex i, unused slots are 0), and a degeneracy flag.
struct b3SubSimplexClosestResult
{
	b3Vector3 m_closestPointOnSimplex;
	//MASK for m_usedVertices
	//stores the simplex vertex-usage, using the MASK,
	// if m_usedVertices & MASK then the related vertex is used
	b3UsageBitfield m_usedVertices;
	b3Scalar m_barycentricCoords[4];
	bool m_degenerate;

	void reset()
	{
		m_degenerate = false;
		setBarycentricCoordinates();
		m_usedVertices.reset();
	}
	// Valid iff every barycentric coordinate is non-negative.
	bool isValid()
	{
		bool valid = (m_barycentricCoords[0] >= b3Scalar(0.)) &&
					 (m_barycentricCoords[1] >= b3Scalar(0.)) &&
					 (m_barycentricCoords[2] >= b3Scalar(0.)) &&
					 (m_barycentricCoords[3] >= b3Scalar(0.));

		return valid;
	}
	// Defaults of 0 allow callers to set only the leading coordinates.
	void setBarycentricCoordinates(b3Scalar a = b3Scalar(0.), b3Scalar b = b3Scalar(0.), b3Scalar c = b3Scalar(0.), b3Scalar d = b3Scalar(0.))
	{
		m_barycentricCoords[0] = a;
		m_barycentricCoords[1] = b;
		m_barycentricCoords[2] = c;
		m_barycentricCoords[3] = d;
	}
};
/// b3VoronoiSimplexSolver is an implementation of the closest point distance algorithm from a 1-4 points simplex to the origin.
/// Can be used with GJK, as an alternative to Johnson distance algorithm.
/// Caches the last computed closest point / witness points; recomputation is
/// triggered lazily via m_needsUpdate.
B3_ATTRIBUTE_ALIGNED16(class)
b3VoronoiSimplexSolver
{
public:
	B3_DECLARE_ALIGNED_ALLOCATOR();

	int m_numVertices;

	// Parallel arrays: support vectors W = P - Q and the witness points on
	// each object, indexed 0..m_numVertices-1.
	b3Vector3 m_simplexVectorW[VORONOI_SIMPLEX_MAX_VERTS];
	b3Vector3 m_simplexPointsP[VORONOI_SIMPLEX_MAX_VERTS];
	b3Vector3 m_simplexPointsQ[VORONOI_SIMPLEX_MAX_VERTS];

	// Cached results of the last updateClosestVectorAndPoints() run.
	b3Vector3 m_cachedP1;
	b3Vector3 m_cachedP2;
	b3Vector3 m_cachedV;
	b3Vector3 m_lastW;

	b3Scalar m_equalVertexThreshold;
	bool m_cachedValidClosest;

	b3SubSimplexClosestResult m_cachedBC;

	bool m_needsUpdate;

	void removeVertex(int index);
	void reduceVertices(const b3UsageBitfield& usedVerts);
	bool updateClosestVectorAndPoints();

	bool closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult);
	int pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d);
	bool closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, b3SubSimplexClosestResult& result);

public:
	b3VoronoiSimplexSolver()
		: m_equalVertexThreshold(VORONOI_DEFAULT_EQUAL_VERTEX_THRESHOLD)
	{
	}
	void reset();

	void addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q);

	// Tolerance used by inSimplex() when BT_USE_EQUAL_VERTEX_THRESHOLD is on.
	void setEqualVertexThreshold(b3Scalar threshold)
	{
		m_equalVertexThreshold = threshold;
	}

	b3Scalar getEqualVertexThreshold() const
	{
		return m_equalVertexThreshold;
	}

	bool closest(b3Vector3 & v);

	b3Scalar maxVertex();

	bool fullSimplex() const
	{
		return (m_numVertices == 4);
	}

	int getSimplex(b3Vector3 * pBuf, b3Vector3 * qBuf, b3Vector3 * yBuf) const;

	bool inSimplex(const b3Vector3& w);

	void backup_closest(b3Vector3 & v);

	bool emptySimplex() const;

	void compute_points(b3Vector3 & p1, b3Vector3 & p2);

	int numVertices() const
	{
		return m_numVertices;
	}
};
#endif //B3_VORONOI_SIMPLEX_SOLVER_H

View file

@ -1,283 +0,0 @@
//keep this enum in sync with the CPU version (in btCollidable.h)
//written by Erwin Coumans
#define SHAPE_CONVEX_HULL 3
#define SHAPE_CONCAVE_TRIMESH 5
#define TRIANGLE_NUM_CONVEX_FACES 5
#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6
#define SHAPE_SPHERE 7
typedef unsigned int u32;
#define MAX_NUM_PARTS_IN_BITS 10
///btQuantizedBvhNode is a compressed aabb node, 16 bytes.
///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).
typedef struct
{
	//12 bytes: AABB extents quantized to unsigned shorts
	unsigned short int m_quantizedAabbMin[3];
	unsigned short int m_quantizedAabbMax[3];
	//4 bytes: >= 0 means leaf (triangle index), < 0 means internal (negated escape index)
	int m_escapeIndexOrTriangleIndex;
} btQuantizedBvhNode;
// Per-BVH metadata: world bounds, the quantization scale used to compress
// node AABBs, and offsets into the shared node/subtree-header arrays.
typedef struct
{
	float4 m_aabbMin;
	float4 m_aabbMax;
	float4 m_quantization;
	int m_numNodes;
	int m_numSubTrees;
	int m_nodeOffset;     // first node of this BVH in the global node array
	int m_subTreeOffset;  // first subtree header in the global header array
} b3BvhInfo;
// Extracts the triangle index from a leaf node's packed word: the top
// MAX_NUM_PARTS_IN_BITS bits (sign bit excluded) hold the part id, the low
// bits hold the triangle index. Only valid when isLeaf(rootNode) is true.
int getTriangleIndex(const btQuantizedBvhNode* rootNode)
{
	// Fix: the original built the part mask via a dead variable and a no-op
	// expression (unsigned x=0; ~(x&0)); (x&0) is always 0 so this is just
	// an all-ones word shifted left. Build the mask directly.
	unsigned int partMask = (~((unsigned int)0)) << (31 - MAX_NUM_PARTS_IN_BITS);
	// Get only the lower bits where the triangle index is stored
	return (rootNode->m_escapeIndexOrTriangleIndex & ~(partMask));
}
// A node is a leaf iff its packed word is non-negative (it then stores a
// triangle index); internal nodes store a negated escape index.
int isLeaf(const btQuantizedBvhNode* rootNode)
{
	//skipindex is negative (internal node), triangleindex >=0 (leafnode)
	if (rootNode->m_escapeIndexOrTriangleIndex < 0)
		return 0;
	return 1;
}
// For an internal node, the escape index (how many nodes to skip on a miss)
// is stored negated; undo the negation. Only valid when !isLeaf(rootNode).
int getEscapeIndex(const btQuantizedBvhNode* rootNode)
{
	return -rootNode->m_escapeIndexOrTriangleIndex;
}
// Header describing one cache-friendly subtree slice of the quantized BVH.
typedef struct
{
	//12 bytes: quantized AABB of the whole subtree
	unsigned short int m_quantizedAabbMin[3];
	unsigned short int m_quantizedAabbMax[3];
	//4 bytes, points to the root of the subtree
	int m_rootNodeIndex;
	//4 bytes
	int m_subtreeSize;
	int m_padding[3];
} btBvhSubtreeInfo;
///keep this in sync with btCollidable.h
typedef struct
{
	// For concave trimeshes this field is reused as the BVH info index
	// (see bvhTraversalKernel below).
	int m_numChildShapes;
	int blaat2;
	int m_shapeType;   // one of the SHAPE_* defines above
	int m_shapeIndex;  // index into the per-type shape array
} btCollidableGpu;
// Child of a compound shape: local transform (position + quaternion
// orientation) plus the child's shape index; padded to 16-byte multiples.
typedef struct
{
	float4 m_childPosition;
	float4 m_childOrientation;
	int m_shapeIndex;
	int m_unused0;
	int m_unused1;
	int m_unused2;
} btGpuChildShape;
// GPU-side rigid body state: pose, velocities, collidable reference and
// mass/material parameters. m_invMass == 0 marks a static body.
typedef struct
{
	float4 m_pos;
	float4 m_quat;
	float4 m_linVel;
	float4 m_angVel;

	u32 m_collidableIdx;
	float m_invMass;
	float m_restituitionCoeff;
	float m_frictionCoeff;
} BodyData;
// World-space AABB with the min/max corners viewable as vectors, scalar
// arrays, or ints (the .w/4th lane is commonly used to carry an index).
typedef struct
{
	union {
		float4 m_min;
		float m_minElems[4];
		int m_minIndices[4];
	};
	union {
		float4 m_max;
		float m_maxElems[4];
		int m_maxIndices[4];
	};
} btAabbCL;
// Overlap test between two quantized AABBs: a separating-axis check on the
// three component axes. Touching extents (==) count as overlap.
// Returns 1 on overlap, 0 when separated along any axis.
int testQuantizedAabbAgainstQuantizedAabb(
	const unsigned short int* aabbMin1,
	const unsigned short int* aabbMax1,
	const unsigned short int* aabbMin2,
	const unsigned short int* aabbMax2)
{
	int axis;
	for (axis = 0; axis < 3; axis++)
	{
		if (aabbMin1[axis] > aabbMax2[axis] || aabbMax1[axis] < aabbMin2[axis])
			return 0;
	}
	return 1;
}
// Quantizes a world-space point into unsigned-short grid coordinates, first
// clamping it into [bvhAabbMin, bvhAabbMax]. Max corners are rounded up and
// given an odd value (| 1), min corners rounded down to an even value
// (& 0xfffe), so quantized min/max comparisons stay conservative.
void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization)
{
	float4 clampedPoint = max(point2,bvhAabbMin);
	clampedPoint = min (clampedPoint, bvhAabbMax);

	float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization;
	if (isMax)
	{
		out[0] = (unsigned short) (((unsigned short)(v.x+1.f) | 1));
		out[1] = (unsigned short) (((unsigned short)(v.y+1.f) | 1));
		out[2] = (unsigned short) (((unsigned short)(v.z+1.f) | 1));
	} else
	{
		out[0] = (unsigned short) (((unsigned short)(v.x) & 0xfffe));
		out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe));
		out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe));
	}
}
// work-in-progress
//Midphase kernel: for each broadphase pair where body A is a concave trimesh
//and body B is a convex hull, sphere or compound, walks A's quantized BVH
//against B's world-space AABB and appends candidate
//(bodyIndexA, bodyIndexB, triangleIndex[, childShapeIndexB]) entries to
//concavePairsOut. numConcavePairsOut is advanced atomically; entries beyond
//maxNumConcavePairsCapacity are counted but dropped.
__kernel void bvhTraversalKernel( __global const int4* pairs,
__global const BodyData* rigidBodies,
__global const btCollidableGpu* collidables,
__global btAabbCL* aabbs,
__global int4* concavePairsOut,
__global volatile int* numConcavePairsOut,
__global const btBvhSubtreeInfo* subtreeHeadersRoot,
__global const btQuantizedBvhNode* quantizedNodesRoot,
__global const b3BvhInfo* bvhInfos,
int numPairs,
int maxNumConcavePairsCapacity)
{
int id = get_global_id(0);
if (id>=numPairs)
return;
int bodyIndexA = pairs[id].x;
int bodyIndexB = pairs[id].y;
int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
//once the broadphase avoids static-static pairs, we can remove this test
if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
{
return;
}
if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH)
return;
int shapeTypeB = collidables[collidableIndexB].m_shapeType;
if (shapeTypeB!=SHAPE_CONVEX_HULL &&
shapeTypeB!=SHAPE_SPHERE &&
shapeTypeB!=SHAPE_COMPOUND_OF_CONVEX_HULLS
)
return;
//NOTE(review): m_numChildShapes appears to double as the BVH-info index for
//concave trimesh collidables -- confirm against btCollidable.h
b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];
float4 bvhAabbMin = bvhInfo.m_aabbMin;
float4 bvhAabbMax = bvhInfo.m_aabbMax;
float4 bvhQuantization = bvhInfo.m_quantization;
int numSubtreeHeaders = bvhInfo.m_numSubTrees;
__global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];
__global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];
//quantize B's world-space AABB into A's BVH grid so all node tests are integer
unsigned short int quantizedQueryAabbMin[3];
unsigned short int quantizedQueryAabbMax[3];
quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization);
quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);
for (int i=0;i<numSubtreeHeaders;i++)
{
btBvhSubtreeInfo subtree = subtreeHeaders[i];
int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
if (overlap != 0)
{
//stackless traversal of this subtree: leaves advance by 1, rejected
//internal nodes skip their whole subtree via the escape index
int startNodeIndex = subtree.m_rootNodeIndex;
int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;
int curIndex = startNodeIndex;
int escapeIndex;
int isLeafNode;
int aabbOverlap;
while (curIndex < endNodeIndex)
{
btQuantizedBvhNode rootNode = quantizedNodes[curIndex];
aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);
isLeafNode = isLeaf(&rootNode);
if (aabbOverlap)
{
if (isLeafNode)
{
int triangleIndex = getTriangleIndex(&rootNode);
if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)
{
//emit one pair per child shape of the compound body B
int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
int pairIdx = atomic_add(numConcavePairsOut,numChildrenB);
for (int b=0;b<numChildrenB;b++)
{
if ((pairIdx+b)<maxNumConcavePairsCapacity)
{
int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);
concavePairsOut[pairIdx+b] = newPair;
}
}
} else
{
int pairIdx = atomic_inc(numConcavePairsOut);
if (pairIdx<maxNumConcavePairsCapacity)
{
int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0);
concavePairsOut[pairIdx] = newPair;
}
}
}
curIndex++;
} else
{
if (isLeafNode)
{
curIndex++;
} else
{
escapeIndex = getEscapeIndex(&rootNode);
curIndex += escapeIndex;
}
}
}
}
}
}

View file

@ -1,257 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
//do not edit this string by hand: modify the corresponding .cl kernel source and re-run stringify
static const char* bvhTraversalKernelCL =
"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
"//written by Erwin Coumans\n"
"#define SHAPE_CONVEX_HULL 3\n"
"#define SHAPE_CONCAVE_TRIMESH 5\n"
"#define TRIANGLE_NUM_CONVEX_FACES 5\n"
"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
"#define SHAPE_SPHERE 7\n"
"typedef unsigned int u32;\n"
"#define MAX_NUM_PARTS_IN_BITS 10\n"
"///btQuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n"
"typedef struct\n"
"{\n"
" //12 bytes\n"
" unsigned short int m_quantizedAabbMin[3];\n"
" unsigned short int m_quantizedAabbMax[3];\n"
" //4 bytes\n"
" int m_escapeIndexOrTriangleIndex;\n"
"} btQuantizedBvhNode;\n"
"typedef struct\n"
"{\n"
" float4 m_aabbMin;\n"
" float4 m_aabbMax;\n"
" float4 m_quantization;\n"
" int m_numNodes;\n"
" int m_numSubTrees;\n"
" int m_nodeOffset;\n"
" int m_subTreeOffset;\n"
"} b3BvhInfo;\n"
"int getTriangleIndex(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" unsigned int x=0;\n"
" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
" // Get only the lower bits where the triangle index is stored\n"
" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
"}\n"
"int isLeaf(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
"}\n"
" \n"
"int getEscapeIndex(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" return -rootNode->m_escapeIndexOrTriangleIndex;\n"
"}\n"
"typedef struct\n"
"{\n"
" //12 bytes\n"
" unsigned short int m_quantizedAabbMin[3];\n"
" unsigned short int m_quantizedAabbMax[3];\n"
" //4 bytes, points to the root of the subtree\n"
" int m_rootNodeIndex;\n"
" //4 bytes\n"
" int m_subtreeSize;\n"
" int m_padding[3];\n"
"} btBvhSubtreeInfo;\n"
"///keep this in sync with btCollidable.h\n"
"typedef struct\n"
"{\n"
" int m_numChildShapes;\n"
" int blaat2;\n"
" int m_shapeType;\n"
" int m_shapeIndex;\n"
" \n"
"} btCollidableGpu;\n"
"typedef struct\n"
"{\n"
" float4 m_childPosition;\n"
" float4 m_childOrientation;\n"
" int m_shapeIndex;\n"
" int m_unused0;\n"
" int m_unused1;\n"
" int m_unused2;\n"
"} btGpuChildShape;\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" float4 m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
" u32 m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} BodyData;\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} btAabbCL;\n"
"int testQuantizedAabbAgainstQuantizedAabb(\n"
" const unsigned short int* aabbMin1,\n"
" const unsigned short int* aabbMax1,\n"
" const unsigned short int* aabbMin2,\n"
" const unsigned short int* aabbMax2)\n"
"{\n"
" //int overlap = 1;\n"
" if (aabbMin1[0] > aabbMax2[0])\n"
" return 0;\n"
" if (aabbMax1[0] < aabbMin2[0])\n"
" return 0;\n"
" if (aabbMin1[1] > aabbMax2[1])\n"
" return 0;\n"
" if (aabbMax1[1] < aabbMin2[1])\n"
" return 0;\n"
" if (aabbMin1[2] > aabbMax2[2])\n"
" return 0;\n"
" if (aabbMax1[2] < aabbMin2[2])\n"
" return 0;\n"
" return 1;\n"
" //overlap = ((aabbMin1[0] > aabbMax2[0]) || (aabbMax1[0] < aabbMin2[0])) ? 0 : overlap;\n"
" //overlap = ((aabbMin1[2] > aabbMax2[2]) || (aabbMax1[2] < aabbMin2[2])) ? 0 : overlap;\n"
" //overlap = ((aabbMin1[1] > aabbMax2[1]) || (aabbMax1[1] < aabbMin2[1])) ? 0 : overlap;\n"
" //return overlap;\n"
"}\n"
"void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization)\n"
"{\n"
" float4 clampedPoint = max(point2,bvhAabbMin);\n"
" clampedPoint = min (clampedPoint, bvhAabbMax);\n"
" float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization;\n"
" if (isMax)\n"
" {\n"
" out[0] = (unsigned short) (((unsigned short)(v.x+1.f) | 1));\n"
" out[1] = (unsigned short) (((unsigned short)(v.y+1.f) | 1));\n"
" out[2] = (unsigned short) (((unsigned short)(v.z+1.f) | 1));\n"
" } else\n"
" {\n"
" out[0] = (unsigned short) (((unsigned short)(v.x) & 0xfffe));\n"
" out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe));\n"
" out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe));\n"
" }\n"
"}\n"
"// work-in-progress\n"
"__kernel void bvhTraversalKernel( __global const int4* pairs, \n"
" __global const BodyData* rigidBodies, \n"
" __global const btCollidableGpu* collidables,\n"
" __global btAabbCL* aabbs,\n"
" __global int4* concavePairsOut,\n"
" __global volatile int* numConcavePairsOut,\n"
" __global const btBvhSubtreeInfo* subtreeHeadersRoot,\n"
" __global const btQuantizedBvhNode* quantizedNodesRoot,\n"
" __global const b3BvhInfo* bvhInfos,\n"
" int numPairs,\n"
" int maxNumConcavePairsCapacity)\n"
"{\n"
" int id = get_global_id(0);\n"
" if (id>=numPairs)\n"
" return;\n"
" \n"
" int bodyIndexA = pairs[id].x;\n"
" int bodyIndexB = pairs[id].y;\n"
" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
" \n"
" //once the broadphase avoids static-static pairs, we can remove this test\n"
" if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"
" {\n"
" return;\n"
" }\n"
" \n"
" if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH)\n"
" return;\n"
" int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
" \n"
" if (shapeTypeB!=SHAPE_CONVEX_HULL &&\n"
" shapeTypeB!=SHAPE_SPHERE &&\n"
" shapeTypeB!=SHAPE_COMPOUND_OF_CONVEX_HULLS\n"
" )\n"
" return;\n"
" b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];\n"
" float4 bvhAabbMin = bvhInfo.m_aabbMin;\n"
" float4 bvhAabbMax = bvhInfo.m_aabbMax;\n"
" float4 bvhQuantization = bvhInfo.m_quantization;\n"
" int numSubtreeHeaders = bvhInfo.m_numSubTrees;\n"
" __global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n"
" __global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n"
" \n"
" unsigned short int quantizedQueryAabbMin[3];\n"
" unsigned short int quantizedQueryAabbMax[3];\n"
" quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
" quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
" \n"
" for (int i=0;i<numSubtreeHeaders;i++)\n"
" {\n"
" btBvhSubtreeInfo subtree = subtreeHeaders[i];\n"
" \n"
" int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);\n"
" if (overlap != 0)\n"
" {\n"
" int startNodeIndex = subtree.m_rootNodeIndex;\n"
" int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;\n"
" int curIndex = startNodeIndex;\n"
" int escapeIndex;\n"
" int isLeafNode;\n"
" int aabbOverlap;\n"
" while (curIndex < endNodeIndex)\n"
" {\n"
" btQuantizedBvhNode rootNode = quantizedNodes[curIndex];\n"
" aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);\n"
" isLeafNode = isLeaf(&rootNode);\n"
" if (aabbOverlap)\n"
" {\n"
" if (isLeafNode)\n"
" {\n"
" int triangleIndex = getTriangleIndex(&rootNode);\n"
" if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
" {\n"
" int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n"
" int pairIdx = atomic_add(numConcavePairsOut,numChildrenB);\n"
" for (int b=0;b<numChildrenB;b++)\n"
" {\n"
" if ((pairIdx+b)<maxNumConcavePairsCapacity)\n"
" {\n"
" int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n"
" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);\n"
" concavePairsOut[pairIdx+b] = newPair;\n"
" }\n"
" }\n"
" } else\n"
" {\n"
" int pairIdx = atomic_inc(numConcavePairsOut);\n"
" if (pairIdx<maxNumConcavePairsCapacity)\n"
" {\n"
" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0);\n"
" concavePairsOut[pairIdx] = newPair;\n"
" }\n"
" }\n"
" } \n"
" curIndex++;\n"
" } else\n"
" {\n"
" if (isLeafNode)\n"
" {\n"
" curIndex++;\n"
" } else\n"
" {\n"
" escapeIndex = getEscapeIndex(&rootNode);\n"
" curIndex += escapeIndex;\n"
" }\n"
" }\n"
" }\n"
" }\n"
" }\n"
"}\n";

View file

@ -1,311 +0,0 @@
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3MprPenetration.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
//Atomically reserves the next free output slot; 'out' receives the index.
#define AppendInc(x, out) out = atomic_inc(x)
//The contact-point count is packed into the w component of m_worldNormalOnB.
#define GET_NPOINTS(x) (x).m_worldNormalOnB.w
#ifdef cl_ext_atomic_counters_32
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
#else
//fall back to a plain global int pointer when the counters extension is absent
#define counter32_t volatile __global int*
#endif
//Runs Minkowski Portal Refinement penetration computation for each
//convex-hull vs convex-hull pair and, when b3MprPenetration reports contact
//(res==0), appends a single-point contact to globalContactsOut, guarded by
//the nGlobalContactsOut counter and contactCapacity.
__kernel void mprPenetrationKernel( __global int4* pairs,
__global const b3RigidBodyData_t* rigidBodies,
__global const b3Collidable_t* collidables,
__global const b3ConvexPolyhedronData_t* convexShapes,
__global const float4* vertices,
__global float4* separatingNormals,
__global int* hasSeparatingAxis,
__global struct b3Contact4Data* restrict globalContactsOut,
counter32_t nGlobalContactsOut,
int contactCapacity,
int numPairs)
{
int i = get_global_id(0);
int pairIndex = i;
if (i<numPairs)
{
int bodyIndexA = pairs[i].x;
int bodyIndexB = pairs[i].y;
int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
//once the broadphase avoids static-static pairs, we can remove this test
if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
{
return;
}
if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))
{
return;
}
float depthOut;
b3Float4 dirOut;
b3Float4 posOut;
int res = b3MprPenetration(pairIndex, bodyIndexA, bodyIndexB,rigidBodies,convexShapes,collidables,vertices,separatingNormals,hasSeparatingAxis,&depthOut, &dirOut, &posOut);
if (res==0)
{
//add a contact
int dstIdx;
AppendInc( nGlobalContactsOut, dstIdx );
if (dstIdx<contactCapacity)
{
//remember the contact slot on the pair for later stages
pairs[pairIndex].z = dstIdx;
__global struct b3Contact4Data* c = globalContactsOut + dstIdx;
c->m_worldNormalOnB = -dirOut;//normal;
//coefficients compressed to 16-bit fixed point
c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
c->m_batchIdx = pairIndex;
int bodyA = pairs[pairIndex].x;
int bodyB = pairs[pairIndex].y;
//static bodies (invMass==0) are flagged with a negated index
c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA;
c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;
c->m_childIndexA = -1;
c->m_childIndexB = -1;
//for (int i=0;i<nContacts;i++)
//penetration depth rides in the w component of the contact point
posOut.w = -depthOut;
c->m_worldPosB[0] = posOut;//localPoints[contactIdx[i]];
GET_NPOINTS(*c) = 1;//nContacts;
}
}
}
}
//Quaternions are stored as float4: xyz = imaginary part, w = real part.
typedef float4 Quaternion;
//CUDA-style float4 constructor via an OpenCL vector cast
#define make_float4 (float4)
//3-component dot product: w components are zeroed so they cannot contribute
__inline
float dot3F4(float4 a, float4 b)
{
float4 a1 = make_float4(a.xyz,0.f);
float4 b1 = make_float4(b.xyz,0.f);
return dot(a1, b1);
}
//3D cross product (thin wrapper over the OpenCL builtin)
__inline
float4 cross3(float4 a, float4 b)
{
return cross(a,b);
}
//Hamilton product of two quaternions (xyz imaginary, w real)
__inline
Quaternion qtMul(Quaternion a, Quaternion b)
{
Quaternion ans;
ans = cross3( a, b );
ans += a.w*b+b.w*a;
// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
ans.w = a.w*b.w - dot3F4(a, b);
return ans;
}
//Quaternion conjugate; equals the inverse for unit quaternions
__inline
Quaternion qtInvert(Quaternion q)
{
return (Quaternion)(-q.xyz, q.w);
}
//Rotates vector 'vec' by quaternion 'q' via q * v * q^-1 (vec.w is ignored)
__inline
float4 qtRotate(Quaternion q, float4 vec)
{
Quaternion qInv = qtInvert( q );
float4 vcpy = vec;
vcpy.w = 0.f;
float4 out = qtMul(qtMul(q,vcpy),qInv);
return out;
}
//Applies a rigid transform: rotate *p by *orientation, then translate
__inline
float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)
{
return qtRotate( *orientation, *p ) + (*translation);
}
//Rotates 'vec' by the inverse of 'q' (world -> local for unit quaternions)
__inline
float4 qtInvRotate(const Quaternion q, float4 vec)
{
return qtRotate( qtInvert( q ), vec );
}
//Projects every vertex of 'hull' (posed at pos/orn) onto the world-space axis
//*dir, returning the projection interval through the min/max out-pointers.
inline void project(__global const b3ConvexPolyhedronData_t* hull, const float4 pos, const float4 orn,
const float4* dir, __global const float4* vertices, float* min, float* max)
{
min[0] = FLT_MAX;
max[0] = -FLT_MAX;
int numVerts = hull->m_numVertices;
//rotate the axis into hull-local space once instead of rotating every vertex
const float4 localDir = qtInvRotate(orn,*dir);
float offset = dot(pos,*dir);
for(int i=0;i<numVerts;i++)
{
float dp = dot(vertices[hull->m_vertexOffset+i],localDir);
if(dp < min[0])
min[0] = dp;
if(dp > max[0])
max[0] = dp;
}
if(min[0]>max[0])
{
float tmp = min[0];
min[0] = max[0];
max[0] = tmp;
}
//shift the interval from hull-local to world space along dir
min[0] += offset;
max[0] += offset;
}
//SAT fallback: instead of testing all edge-edge cross products, tests a
//precomputed set of unit-sphere directions. Returns false as soon as a
//separating axis is found; otherwise keeps the axis of minimum penetration
//depth in *sep / *dmin and returns true.
bool findSeparatingAxisUnitSphere(	__global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB,
const float4 posA1,
const float4 ornA,
const float4 posB1,
const float4 ornB,
const float4 DeltaC2,
__global const float4* vertices,
__global const float4* unitSphereDirections,
int numUnitSphereDirections,
float4* sep,
float* dmin)
{
float4 posA = posA1;
posA.w = 0.f;
float4 posB = posB1;
posB.w = 0.f;
//NOTE(review): curPlaneTests/curEdgeEdge are never updated below;
//presumably leftover instrumentation
int curPlaneTests=0;
int curEdgeEdge = 0;
// Test unit sphere directions
for (int i=0;i<numUnitSphereDirections;i++)
{
float4 crossje;
crossje = unitSphereDirections[i];
//orient the candidate axis against the center-to-center direction
if (dot3F4(DeltaC2,crossje)>0)
crossje *= -1.f;
{
float dist;
bool result = true;
float Min0,Max0;
float Min1,Max1;
project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);
project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);
//disjoint projection intervals -> separating axis found
if(Max0<Min1 || Max1<Min0)
return false;
float d0 = Max0 - Min1;
float d1 = Max1 - Min0;
dist = d0<d1 ? d0:d1;
result = true;
//track the axis of least overlap (minimum penetration depth)
if(dist<*dmin)
{
*dmin = dist;
*sep = crossje;
}
}
}
//make the returned normal point from B towards A
if((dot3F4(-DeltaC2,*sep))>0.0f)
{
*sep = -(*sep);
}
return true;
}
//Per-pair SAT fallback kernel: when a pair has more edge-edge axis candidates
//than the precomputed unit-sphere direction count, approximate the edge-edge
//phase by sampling the unit-sphere directions, refining the stored separating
//normal and penetration depth in place.
__kernel void findSeparatingAxisUnitSphereKernel( __global const int4* pairs,
__global const b3RigidBodyData_t* rigidBodies,
__global const b3Collidable_t* collidables,
__global const b3ConvexPolyhedronData_t* convexShapes,
__global const float4* vertices,
__global const float4* unitSphereDirections,
__global float4* separatingNormals,
__global int* hasSeparatingAxis,
__global float* dmins,
int numUnitSphereDirections,
int numPairs
)
{
int i = get_global_id(0);
if (i<numPairs)
{
//only refine pairs that still have a candidate separating axis
if (hasSeparatingAxis[i])
{
int bodyIndexA = pairs[i].x;
int bodyIndexB = pairs[i].y;
int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
int numFacesA = convexShapes[shapeIndexA].m_numFaces;
float dmin = dmins[i];
float4 posA = rigidBodies[bodyIndexA].m_pos;
posA.w = 0.f;
float4 posB = rigidBodies[bodyIndexB].m_pos;
posB.w = 0.f;
//world-space centers of both hulls, for orienting candidate axes
float4 c0local = convexShapes[shapeIndexA].m_localCenter;
float4 ornA = rigidBodies[bodyIndexA].m_quat;
float4 c0 = transform(&c0local, &posA, &ornA);
float4 c1local = convexShapes[shapeIndexB].m_localCenter;
float4 ornB =rigidBodies[bodyIndexB].m_quat;
float4 c1 = transform(&c1local,&posB,&ornB);
const float4 DeltaC2 = c0 - c1;
float4 sepNormal = separatingNormals[i];
//only use the sampled fallback when the exact edge-edge sweep would be costlier
int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;
if (numEdgeEdgeDirections>numUnitSphereDirections)
{
bool sepEE = findSeparatingAxisUnitSphere( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
posB,ornB,
DeltaC2,
vertices,unitSphereDirections,numUnitSphereDirections,&sepNormal,&dmin);
if (!sepEE)
{
hasSeparatingAxis[i] = 0;
} else
{
hasSeparatingAxis[i] = 1;
separatingNormals[i] = sepNormal;
}
}
} //if (hasSeparatingAxis[i])
}//(i<numPairs)
}

View file

@ -1,203 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//Host-code rewritten by Erwin Coumans
//Path used as a caching/debug hint when compiling the embedded kernel string
#define BOUNDSEARCH_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl"
//NOTE(review): KERNEL0..KERNEL2 are not referenced in the visible code (the
//kernel names are passed as literals in the constructor); verify before removing
#define KERNEL0 "SearchSortDataLowerKernel"
#define KERNEL1 "SearchSortDataUpperKernel"
#define KERNEL2 "SubtractKernel"
#include "b3BoundSearchCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "b3LauncherCL.h"
#include "kernels/BoundSearchKernelsCL.h"
///Compiles the bound-search kernels from the embedded CL source. When
///maxSize==0 the COUNT option is unavailable: the SubtractKernel and the
///m_lower/m_upper scratch arrays are not created (they stay null).
b3BoundSearchCL::b3BoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize)
: m_context(ctx),
m_device(device),
m_queue(queue)
{
const char* additionalMacros = "";
//const char* srcFileNameForCaching="";
cl_int pErrNum;
const char* kernelSource = boundSearchKernelsCL;
//NOTE(review): boundSearchProg is never released here; harmless for the usual
//one-instance lifetime but leaks if instances are created repeatedly
cl_program boundSearchProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, BOUNDSEARCH_PATH);
b3Assert(boundSearchProg);
m_lowerSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg, additionalMacros);
b3Assert(m_lowerSortDataKernel);
m_upperSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg, additionalMacros);
b3Assert(m_upperSortDataKernel);
m_subtractKernel = 0;
if (maxSize)
{
m_subtractKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg, additionalMacros);
b3Assert(m_subtractKernel);
}
//m_constBuffer = new b3OpenCLArray<b3Int4>( device, 1, BufferBase::BUFFER_CONST );
m_lower = (maxSize == 0) ? 0 : new b3OpenCLArray<unsigned int>(ctx, queue, maxSize);
m_upper = (maxSize == 0) ? 0 : new b3OpenCLArray<unsigned int>(ctx, queue, maxSize);
m_filler = new b3FillCL(ctx, device, queue);
}
///Releases the GPU scratch arrays and the compiled kernels.
b3BoundSearchCL::~b3BoundSearchCL()
{
	//m_lower/m_upper may be null (maxSize==0 construction); delete handles that
	delete m_lower;
	delete m_upper;
	delete m_filler;

	clReleaseKernel(m_lowerSortDataKernel);
	clReleaseKernel(m_upperSortDataKernel);
	//fix: m_subtractKernel is only compiled when maxSize!=0; calling
	//clReleaseKernel(0) is invalid per the OpenCL spec (CL_INVALID_KERNEL)
	if (m_subtractKernel)
	{
		clReleaseKernel(m_subtractKernel);
	}
}
///Runs the bound-search kernels on the GPU. src must be sorted ascending by
///m_key. BOUND_LOWER/BOUND_UPPER write per-key bound indices into dst; COUNT
///writes per-key element counts (upper - lower) and requires construction
///with maxSize > 0 so m_lower/m_upper and the subtract kernel exist.
void b3BoundSearchCL::execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option)
{
//NOTE(review): constBuffer is filled in but never handed to a launcher below;
//presumably leftover from an earlier argument-passing scheme
b3Int4 constBuffer;
constBuffer.x = nSrc;
constBuffer.y = nDst;
if (option == BOUND_LOWER)
{
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};
b3LauncherCL launcher(m_queue, m_lowerSortDataKernel, "m_lowerSortDataKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(nSrc);
launcher.setConst(nDst);
launcher.launch1D(nSrc, 64);
}
else if (option == BOUND_UPPER)
{
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};
b3LauncherCL launcher(m_queue, m_upperSortDataKernel, "m_upperSortDataKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(nSrc);
launcher.setConst(nDst);
launcher.launch1D(nSrc, 64);
}
else if (option == COUNT)
{
b3Assert(m_lower);
b3Assert(m_upper);
//NOTE(review): the comparison direction looks inverted -- one would expect
//capacity() >= nDst here; confirm intent before changing
b3Assert(m_lower->capacity() <= (int)nDst);
b3Assert(m_upper->capacity() <= (int)nDst);
int zero = 0;
m_filler->execute(*m_lower, zero, nDst);
m_filler->execute(*m_upper, zero, nDst);
execute(src, nSrc, *m_lower, nDst, BOUND_LOWER);
execute(src, nSrc, *m_upper, nDst, BOUND_UPPER);
{
//count per key = upper bound - lower bound
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_upper->getBufferCL(), true), b3BufferInfoCL(m_lower->getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};
b3LauncherCL launcher(m_queue, m_subtractKernel, "m_subtractKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(nSrc);
launcher.setConst(nDst);
launcher.launch1D(nDst, 64);
}
}
else
{
b3Assert(0);
}
}
///CPU reference implementation of execute(); same contract.
///src must be sorted ascending by m_key; dst is indexed by key.
void b3BoundSearchCL::executeHost(b3AlignedObjectArray<b3SortData>& src, int nSrc,
b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option)
{
//validate the sortedness precondition
for (int i = 0; i < nSrc - 1; i++)
b3Assert(src[i].m_key <= src[i + 1].m_key);
b3SortData minData, zeroData, maxData;
minData.m_key = -1;
minData.m_value = -1;
//NOTE(review): zeroData is initialized but never used below
zeroData.m_key = 0;
zeroData.m_value = 0;
maxData.m_key = nDst;
maxData.m_value = nDst;
if (option == BOUND_LOWER)
{
for (int i = 0; i < nSrc; i++)
{
//sentinel stands in for the element before index 0
b3SortData& iData = (i == 0) ? minData : src[i - 1];
//NOTE(review): (i == nSrc) can never hold inside this loop, so jData is
//always src[i] -- the maxData branch looks like dead code; confirm upstream
b3SortData& jData = (i == nSrc) ? maxData : src[i];
//a key change means i is the first (lower-bound) index of jData's key
if (iData.m_key != jData.m_key)
{
int k = jData.m_key;
{
dst[k] = i;
}
}
}
}
else if (option == BOUND_UPPER)
{
for (int i = 1; i < nSrc + 1; i++)
{
b3SortData& iData = src[i - 1];
//at i==nSrc the sentinel closes out the run of the last key
b3SortData& jData = (i == nSrc) ? maxData : src[i];
//a key change means i is one past the last index of iData's key
if (iData.m_key != jData.m_key)
{
int k = iData.m_key;
{
dst[k] = i;
}
}
}
}
else if (option == COUNT)
{
//count per key = upper bound - lower bound
b3AlignedObjectArray<unsigned int> lower;
lower.resize(nDst);
b3AlignedObjectArray<unsigned int> upper;
upper.resize(nDst);
for (int i = 0; i < nDst; i++)
{
lower[i] = upper[i] = 0;
}
executeHost(src, nSrc, lower, nDst, BOUND_LOWER);
executeHost(src, nSrc, upper, nDst, BOUND_UPPER);
for (int i = 0; i < nDst; i++)
{
dst[i] = upper[i] - lower[i];
}
}
else
{
b3Assert(0);
}
}

View file

@ -1,64 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef B3_BOUNDSEARCH_H
#define B3_BOUNDSEARCH_H
#pragma once
/*#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
#include <AdlPrimitives/Sort/SortData.h>
#include <AdlPrimitives/Fill/Fill.h>
*/
#include "b3OpenCLArray.h"
#include "b3FillCL.h"
#include "b3RadixSort32CL.h" //for b3SortData (perhaps move it?)
///GPU lower/upper-bound search over key/value pairs (b3SortData) sorted by
///key, with a CPU reference implementation (executeHost).
class b3BoundSearchCL
{
public:
//BOUND_LOWER/BOUND_UPPER write per-key bound indices into dst;
//COUNT writes per-key element counts (requires construction with size > 0)
enum Option
{
BOUND_LOWER,
BOUND_UPPER,
COUNT,
};
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_lowerSortDataKernel;
cl_kernel m_upperSortDataKernel;
//null when constructed with size==0 (COUNT unavailable)
cl_kernel m_subtractKernel;
//NOTE(review): declared but not initialized in the visible constructor;
//verify before dereferencing
b3OpenCLArray<b3Int4>* m_constbtOpenCLArray;
//scratch arrays for COUNT; null when constructed with size==0
b3OpenCLArray<unsigned int>* m_lower;
b3OpenCLArray<unsigned int>* m_upper;
b3FillCL* m_filler;
b3BoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);
virtual ~b3BoundSearchCL();
// src has to be src[i].m_key <= src[i+1].m_key
void execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
void executeHost(b3AlignedObjectArray<b3SortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
};
#endif //B3_BOUNDSEARCH_H

View file

@ -1,18 +0,0 @@
#ifndef B3_BUFFER_INFO_CL_H
#define B3_BUFFER_INFO_CL_H
#include "b3OpenCLArray.h"
///Pairs an OpenCL memory object with a read-only flag, for passing buffer
///argument lists to b3LauncherCL::setBuffers.
struct b3BufferInfoCL
{
//b3BufferInfoCL(){}
// template<typename T>
b3BufferInfoCL(cl_mem buff, bool isReadOnly = false) : m_clBuffer(buff), m_isReadOnly(isReadOnly) {}
cl_mem m_clBuffer;
//NOTE(review): not consulted by the launcher code visible in this file
bool m_isReadOnly;
};
#endif //B3_BUFFER_INFO_CL_H

View file

@ -1,119 +0,0 @@
#include "b3FillCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "b3BufferInfoCL.h"
#include "b3LauncherCL.h"
#define FILL_CL_PROGRAM_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl"
#include "kernels/FillKernelsCL.h"
///Compiles the four typed fill kernels (int, unsigned int, float, b3Int2)
///from the embedded FillKernelsCL source on the given context/device.
b3FillCL::b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue)
	: m_commandQueue(queue)
{
	const char* kernelSource = fillKernelsCL;
	cl_int pErrNum;
	const char* additionalMacros = "";

	cl_program fillProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, FILL_CL_PROGRAM_PATH);
	b3Assert(fillProg);

	m_fillIntKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg, additionalMacros);
	b3Assert(m_fillIntKernel);

	m_fillUnsignedIntKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg, additionalMacros);
	//fix: previously re-asserted m_fillIntKernel (copy-paste), which left a
	//failed FillUnsignedIntKernel compilation undetected
	b3Assert(m_fillUnsignedIntKernel);

	m_fillFloatKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg, additionalMacros);
	b3Assert(m_fillFloatKernel);

	m_fillKernelInt2 = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg, additionalMacros);
	b3Assert(m_fillKernelInt2);
}
///Releases the four compiled fill kernels.
b3FillCL::~b3FillCL()
{
clReleaseKernel(m_fillKernelInt2);
clReleaseKernel(m_fillIntKernel);
clReleaseKernel(m_fillUnsignedIntKernel);
clReleaseKernel(m_fillFloatKernel);
}
///Fills n floats of 'src' with 'value' on the GPU, starting at 'offset'.
void b3FillCL::execute(b3OpenCLArray<float>& src, const float value, int n, int offset)
{
	b3Assert(n > 0);

	b3LauncherCL fillLauncher(m_commandQueue, m_fillFloatKernel, "m_fillFloatKernel");
	fillLauncher.setBuffer(src.getBufferCL());
	fillLauncher.setConst(n);
	fillLauncher.setConst(value);
	fillLauncher.setConst(offset);
	fillLauncher.launch1D(n);
}
///Fills n ints of 'src' with 'value' on the GPU, starting at 'offset'.
void b3FillCL::execute(b3OpenCLArray<int>& src, const int value, int n, int offset)
{
	b3Assert(n > 0);

	b3LauncherCL fillLauncher(m_commandQueue, m_fillIntKernel, "m_fillIntKernel");
	fillLauncher.setBuffer(src.getBufferCL());
	fillLauncher.setConst(n);
	fillLauncher.setConst(value);
	fillLauncher.setConst(offset);
	fillLauncher.launch1D(n);
}
///Fills n unsigned ints of 'src' with 'value' on the GPU, starting at 'offset'.
void b3FillCL::execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset)
{
	b3Assert(n > 0);

	b3LauncherCL fillLauncher(m_commandQueue, m_fillUnsignedIntKernel, "m_fillUnsignedIntKernel");
	fillLauncher.setBuffer(src.getBufferCL());
	fillLauncher.setConst(n);
	fillLauncher.setConst(value);
	fillLauncher.setConst(offset);
	fillLauncher.launch1D(n);
}
///CPU fallback: assigns 'value' to n consecutive b3Int2 elements of 'src',
///starting at 'offset'.
void b3FillCL::executeHost(b3AlignedObjectArray<b3Int2>& src, const b3Int2& value, int n, int offset)
{
	for (int idx = offset; idx < offset + n; idx++)
	{
		src[idx] = value;
	}
}
///CPU fallback: assigns 'value' to n consecutive int elements of 'src',
///starting at 'offset'.
void b3FillCL::executeHost(b3AlignedObjectArray<int>& src, const int value, int n, int offset)
{
	for (int idx = offset; idx < offset + n; idx++)
	{
		src[idx] = value;
	}
}
///Fills n b3Int2 elements of 'src' with 'value' on the GPU, starting at 'offset'.
void b3FillCL::execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset)
{
	b3Assert(n > 0);

	b3LauncherCL fillLauncher(m_commandQueue, m_fillKernelInt2, "m_fillKernelInt2");
	fillLauncher.setBuffer(src.getBufferCL());
	fillLauncher.setConst(n);
	fillLauncher.setConst(value);
	fillLauncher.setConst(offset);
	fillLauncher.launch1D(n);
}

View file

@ -1,52 +0,0 @@
#ifndef B3_FILL_CL_H
#define B3_FILL_CL_H
#include "b3OpenCLArray.h"
#include "Bullet3Common/b3Scalar.h"
#include "Bullet3Common/shared/b3Int2.h"
#include "Bullet3Common/shared/b3Int4.h"
class b3FillCL
{
cl_command_queue m_commandQueue;
cl_kernel m_fillKernelInt2;
cl_kernel m_fillIntKernel;
cl_kernel m_fillUnsignedIntKernel;
cl_kernel m_fillFloatKernel;
public:
struct b3ConstData
{
union {
b3Int4 m_data;
b3UnsignedInt4 m_UnsignedData;
};
int m_offset;
int m_n;
int m_padding[2];
};
protected:
public:
b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
virtual ~b3FillCL();
void execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);
void execute(b3OpenCLArray<int>& src, const int value, int n, int offset = 0);
void execute(b3OpenCLArray<float>& src, const float value, int n, int offset = 0);
void execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset = 0);
void executeHost(b3AlignedObjectArray<b3Int2>& src, const b3Int2& value, int n, int offset);
void executeHost(b3AlignedObjectArray<int>& src, const int value, int n, int offset);
// void execute(b3OpenCLArray<b3Int4>& src, const b3Int4& value, int n, int offset = 0);
};
#endif //B3_FILL_CL_H

View file

@ -1,296 +0,0 @@
#include "b3LauncherCL.h"
//Set to true to printf every kernel launch (prepare/finish) for debugging.
bool gDebugLauncherCL = false;
///Prepares a launch of 'kernel' on 'queue'; arguments are bound afterwards via
///setBuffer/setBuffers/setConst, in kernel-argument order. 'name' is used for
///debug logging and serialization.
b3LauncherCL::b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name)
: m_commandQueue(queue),
m_kernel(kernel),
m_idx(0),
m_enableSerialization(false),
m_name(name)
{
if (gDebugLauncherCL)
{
//NOTE(review): function-local static counter is not synchronized; fine for
//single-threaded debugging only
static int counter = 0;
printf("[%d] Prepare to launch OpenCL kernel %s\n", counter++, name);
}
//the serialized form starts with room for one leading int
m_serializationSizeInBytes = sizeof(int);
}
b3LauncherCL::~b3LauncherCL()
{
	// Release any temporary GPU arrays created by deserializeArgs().
	while (m_arrays.size())
	{
		delete m_arrays[m_arrays.size() - 1];
		m_arrays.pop_back();
	}

	if (gDebugLauncherCL)
	{
		// Shared across all launcher instances; trace output only.
		static int counter = 0;
		printf("[%d] Finished launching OpenCL kernel %s\n", counter++, m_name);
	}
}
// Binds 'clBuffer' as the next kernel argument. When serialization is enabled,
// also records the argument (with the buffer's byte size) for later replay.
void b3LauncherCL::setBuffer(cl_mem clBuffer)
{
	if (m_enableSerialization)
	{
		b3KernelArgData arg;
		arg.m_argIndex = m_idx;
		arg.m_isBuffer = 1;
		arg.m_clBuffer = clBuffer;

		// Query the buffer size so its full contents can be serialized later.
		size_t bufferSize = 0;
		size_t bytesReturned = 0;
		cl_int err = clGetMemObjectInfo(arg.m_clBuffer,
										CL_MEM_SIZE,
										sizeof(size_t),
										&bufferSize,
										&bytesReturned);
		b3Assert(err == CL_SUCCESS);
		arg.m_argSizeInBytes = bufferSize;

		m_kernelArguments.push_back(arg);
		m_serializationSizeInBytes += sizeof(b3KernelArgData);
		m_serializationSizeInBytes += bufferSize;
	}

	cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
	b3Assert(status == CL_SUCCESS);
}
// Binds 'n' buffers (from 'buffInfo') as consecutive kernel arguments,
// recording each one when serialization is enabled.
void b3LauncherCL::setBuffers(b3BufferInfoCL* buffInfo, int n)
{
	for (int bufferIndex = 0; bufferIndex < n; bufferIndex++)
	{
		cl_mem clBuffer = buffInfo[bufferIndex].m_clBuffer;

		if (m_enableSerialization)
		{
			b3KernelArgData arg;
			arg.m_argIndex = m_idx;
			arg.m_isBuffer = 1;
			arg.m_clBuffer = clBuffer;

			// Query the buffer size so its full contents can be serialized later.
			size_t bufferSize = 0;
			size_t bytesReturned = 0;
			cl_int err = clGetMemObjectInfo(arg.m_clBuffer,
											CL_MEM_SIZE,
											sizeof(size_t),
											&bufferSize,
											&bytesReturned);
			b3Assert(err == CL_SUCCESS);
			arg.m_argSizeInBytes = bufferSize;

			m_kernelArguments.push_back(arg);
			m_serializationSizeInBytes += sizeof(b3KernelArgData);
			m_serializationSizeInBytes += bufferSize;
		}

		cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
		b3Assert(status == CL_SUCCESS);
	}
}
// Unaligned twin of b3KernelArgData, used to read argument records out of a
// packed serialization byte stream (which carries no alignment guarantees).
// Field layout must stay identical to b3KernelArgData: deserializeArgs()
// memcpy's between the two types.
struct b3KernelArgDataUnaligned
{
int m_isBuffer;
int m_argIndex;
int m_argSizeInBytes;
int m_unusedPadding;
union {
cl_mem m_clBuffer;
unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
};
};
#include <string.h>
// Rebuilds kernel arguments from a serialized byte stream (as produced by
// serializeArguments): [int numArguments][b3KernelArgData (+ raw buffer bytes
// for buffer args)]... . Buffer arguments get fresh GPU arrays (owned by
// m_arrays, freed in the destructor). Returns the number of bytes consumed.
int b3LauncherCL::deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx)
{
int index = 0;
int numArguments = *(int*)&buf[index];
index += sizeof(int);
for (int i = 0; i < numArguments; i++)
{
// Read through the unaligned view: 'buf + index' may not be aligned.
b3KernelArgDataUnaligned* arg = (b3KernelArgDataUnaligned*)&buf[index];
index += sizeof(b3KernelArgData);
if (arg->m_isBuffer)
{
// Upload the serialized buffer contents into a new GPU array and bind it.
b3OpenCLArray<unsigned char>* clData = new b3OpenCLArray<unsigned char>(ctx, m_commandQueue, arg->m_argSizeInBytes);
clData->resize(arg->m_argSizeInBytes);
clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes);
arg->m_clBuffer = clData->getBufferCL();
m_arrays.push_back(clData);
cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
b3Assert(status == CL_SUCCESS);
index += arg->m_argSizeInBytes;
}
else
{
// Constant argument: pass the stored bytes directly.
cl_int status = clSetKernelArg(m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData);
b3Assert(status == CL_SUCCESS);
}
// Keep an aligned copy of the record for later validation/serialization.
b3KernelArgData b;
memcpy(&b, arg, sizeof(b3KernelArgDataUnaligned));
m_kernelArguments.push_back(b);
}
m_serializationSizeInBytes = index;
return index;
}
// Compares the current kernel arguments (including GPU buffer contents read
// back to the host) against a previously serialized "gold" byte stream.
// Returns the number of bytes consumed on success, or a negative error code
// on the first mismatch (-1 arg count, -2 size, -3 kind, -4 buffer bytes,
// -5 constant bytes).
int b3LauncherCL::validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx)
{
	int index = 0;

	int numArguments = *(int*)&goldBuffer[index];
	index += sizeof(int);

	if (numArguments != m_kernelArguments.size())
	{
		printf("failed validation: expected %d arguments, found %d\n", numArguments, m_kernelArguments.size());
		return -1;
	}

	for (int ii = 0; ii < numArguments; ii++)
	{
		b3KernelArgData* argGold = (b3KernelArgData*)&goldBuffer[index];

		if (m_kernelArguments[ii].m_argSizeInBytes != argGold->m_argSizeInBytes)
		{
			printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n", ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes);
			return -2;
		}

		{
			int expected = argGold->m_isBuffer;
			int found = m_kernelArguments[ii].m_isBuffer;

			if (expected != found)
			{
				printf("failed validation: argument %d isBuffer expected: %d, found %d\n", ii, expected, found);
				return -3;
			}
		}
		index += sizeof(b3KernelArgData);

		if (argGold->m_isBuffer)
		{
			int argSize = m_kernelArguments[ii].m_argSizeInBytes;
			unsigned char* goldBuf = &goldBuffer[index];

			unsigned char* memBuf = (unsigned char*)malloc(argSize);
			// BUGFIX: the original loop was bounded by m_kernelArguments[j]
			// (indexed by its own counter j), reading out of range and
			// initializing the wrong number of bytes.
			for (int j = 0; j < argSize; j++)
			{
				memBuf[j] = 0xaa;
			}

			// Blocking read-back of the GPU buffer for byte-wise comparison.
			cl_int status = clEnqueueReadBuffer(m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, argSize,
												memBuf, 0, 0, 0);
			b3Assert(status == CL_SUCCESS);
			clFinish(m_commandQueue);

			for (int b = 0; b < argSize; b++)
			{
				int expected = goldBuf[b];
				int found = memBuf[b];
				if (expected != found)
				{
					printf("failed validation: argument %d OpenCL data at byte position %d expected: %d, found %d\n",
						   ii, b, expected, found);
					// BUGFIX: release the staging buffer on the error path.
					free(memBuf);
					return -4;
				}
			}
			// BUGFIX: the original leaked memBuf for every buffer argument.
			free(memBuf);

			index += argGold->m_argSizeInBytes;
		}
		else
		{
			//compare content
			for (int b = 0; b < m_kernelArguments[ii].m_argSizeInBytes; b++)
			{
				int expected = argGold->m_argData[b];
				int found = m_kernelArguments[ii].m_argData[b];
				if (expected != found)
				{
					printf("failed validation: argument %d const data at byte position %d expected: %d, found %d\n",
						   ii, b, expected, found);
					return -5;
				}
			}
		}
	}
	return index;
}
// Writes the recorded kernel arguments into destBuffer and returns the number
// of bytes written. Layout: [int numArguments] then, per argument, a
// b3KernelArgData record followed (for buffer arguments) by the raw buffer
// contents read back from the GPU.
int b3LauncherCL::serializeArguments(unsigned char* destBuffer, int destBufferCapacity)
{
//initialize to known values
for (int i = 0; i < destBufferCapacity; i++)
destBuffer[i] = 0xec;
assert(destBufferCapacity >= m_serializationSizeInBytes);
//todo: use the b3Serializer for this to allow for 32/64bit, endianness etc
int numArguments = m_kernelArguments.size();
int curBufferSize = 0;
int* dest = (int*)&destBuffer[curBufferSize];
*dest = numArguments;
curBufferSize += sizeof(int);
for (int i = 0; i < this->m_kernelArguments.size(); i++)
{
// The argument record is written in place, then the buffer payload follows.
b3KernelArgData* arg = (b3KernelArgData*)&destBuffer[curBufferSize];
*arg = m_kernelArguments[i];
curBufferSize += sizeof(b3KernelArgData);
if (arg->m_isBuffer == 1)
{
//copy the OpenCL buffer content
// Non-blocking enqueue followed by clFinish, so the bytes are in place
// before the offset advances.
cl_int status = 0;
status = clEnqueueReadBuffer(m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes,
&destBuffer[curBufferSize], 0, 0, 0);
b3Assert(status == CL_SUCCESS);
clFinish(m_commandQueue);
curBufferSize += arg->m_argSizeInBytes;
}
}
return curBufferSize;
}
void b3LauncherCL::serializeToFile(const char* fileName, int numWorkItems)
{
int num = numWorkItems;
int buffSize = getSerializationBufferSize();
unsigned char* buf = new unsigned char[buffSize + sizeof(int)];
for (int i = 0; i < buffSize + 1; i++)
{
unsigned char* ptr = (unsigned char*)&buf[i];
*ptr = 0xff;
}
// int actualWrite = serializeArguments(buf,buffSize);
// unsigned char* cptr = (unsigned char*)&buf[buffSize];
// printf("buf[buffSize] = %d\n",*cptr);
assert(buf[buffSize] == 0xff); //check for buffer overrun
int* ptr = (int*)&buf[buffSize];
*ptr = num;
FILE* f = fopen(fileName, "wb");
fwrite(buf, buffSize + sizeof(int), 1, f);
fclose(f);
delete[] buf;
}

View file

@ -1,128 +0,0 @@
#ifndef B3_LAUNCHER_CL_H
#define B3_LAUNCHER_CL_H
#include "b3BufferInfoCL.h"
#include "Bullet3Common/b3MinMax.h"
#include "b3OpenCLArray.h"
#include <stdio.h>
#define B3_DEBUG_SERIALIZE_CL
#ifdef _WIN32
#pragma warning(disable : 4996)
#endif
#define B3_CL_MAX_ARG_SIZE 16
// One recorded kernel argument: either a cl_mem buffer handle or up to
// B3_CL_MAX_ARG_SIZE bytes of constant data. 16-byte aligned; layout must
// match b3KernelArgDataUnaligned in b3LauncherCL.cpp, which is memcpy'd into
// this type during deserialization.
B3_ATTRIBUTE_ALIGNED16(struct)
b3KernelArgData
{
int m_isBuffer;
int m_argIndex;
int m_argSizeInBytes;
int m_unusedPadding;
union {
cl_mem m_clBuffer;
unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
};
};
// Convenience wrapper for setting kernel arguments and enqueueing an OpenCL
// kernel, with optional record/replay serialization of all arguments
// (buffers and constants) for offline debugging.
class b3LauncherCL
{
cl_command_queue m_commandQueue;
cl_kernel m_kernel;
// Index of the next kernel argument to set.
int m_idx;
b3AlignedObjectArray<b3KernelArgData> m_kernelArguments;
int m_serializationSizeInBytes;
bool m_enableSerialization;
const char* m_name;
public:
// GPU arrays created during deserializeArgs(); owned and freed by the destructor.
b3AlignedObjectArray<b3OpenCLArray<unsigned char>*> m_arrays;
b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name);
virtual ~b3LauncherCL();
// Bind a single buffer / several buffers as the next kernel argument(s).
void setBuffer(cl_mem clBuffer);
void setBuffers(b3BufferInfoCL* buffInfo, int n);
int getSerializationBufferSize() const
{
return m_serializationSizeInBytes;
}
// Replay/verification of a serialized argument stream (see b3LauncherCL.cpp).
int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx);
inline int validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx);
int serializeArguments(unsigned char* destBuffer, int destBufferCapacity);
int getNumArguments() const
{
return m_kernelArguments.size();
}
b3KernelArgData getArgument(int index)
{
return m_kernelArguments[index];
}
void serializeToFile(const char* fileName, int numWorkItems);
// Bind a constant (by-value) argument; at most B3_CL_MAX_ARG_SIZE bytes.
template <typename T>
inline void setConst(const T& consts)
{
int sz = sizeof(T);
b3Assert(sz <= B3_CL_MAX_ARG_SIZE);
if (m_enableSerialization)
{
b3KernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 0;
T* destArg = (T*)kernelArg.m_argData;
*destArg = consts;
kernelArg.m_argSizeInBytes = sizeof(T);
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes += sizeof(b3KernelArgData);
}
cl_int status = clSetKernelArg(m_kernel, m_idx++, sz, &consts);
b3Assert(status == CL_SUCCESS);
}
inline void launch1D(int numThreads, int localSize = 64)
{
launch2D(numThreads, 1, localSize, 1);
}
// Enqueue the kernel on a 2D range; the global range is rounded up to a
// multiple of the local (work-group) size in each dimension.
inline void launch2D(int numThreadsX, int numThreadsY, int localSizeX, int localSizeY)
{
size_t gRange[3] = {1, 1, 1};
size_t lRange[3] = {1, 1, 1};
lRange[0] = localSizeX;
lRange[1] = localSizeY;
gRange[0] = b3Max((size_t)1, (numThreadsX / lRange[0]) + (!(numThreadsX % lRange[0]) ? 0 : 1));
gRange[0] *= lRange[0];
gRange[1] = b3Max((size_t)1, (numThreadsY / lRange[1]) + (!(numThreadsY % lRange[1]) ? 0 : 1));
gRange[1] *= lRange[1];
cl_int status = clEnqueueNDRangeKernel(m_commandQueue,
m_kernel, 2, NULL, gRange, lRange, 0, 0, 0);
if (status != CL_SUCCESS)
{
printf("Error: OpenCL status = %d\n", status);
}
b3Assert(status == CL_SUCCESS);
}
void enableSerialization(bool serialize)
{
m_enableSerialization = serialize;
}
};
#endif //B3_LAUNCHER_CL_H

View file

@ -1,300 +0,0 @@
#ifndef B3_OPENCL_ARRAY_H
#define B3_OPENCL_ARRAY_H
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
// GPU-resident dynamic array backed by a cl_mem buffer, mirroring the
// b3AlignedObjectArray API (size/capacity/resize/reserve) plus host<->device
// and device<->device copy helpers. Element reads/writes go through the
// command queue, so per-element access (at/forcedAt/push_back) is expensive.
template <typename T>
class b3OpenCLArray
{
size_t m_size;
size_t m_capacity;
cl_mem m_clBuffer;
cl_context m_clContext;
cl_command_queue m_commandQueue;
// False when wrapping a foreign buffer via setFromOpenCLBuffer().
bool m_ownsMemory;
bool m_allowGrowingCapacity;
// Release the cl_mem if we own it; leaves m_size untouched (callers reset it).
void deallocate()
{
if (m_clBuffer && m_ownsMemory)
{
clReleaseMemObject(m_clBuffer);
}
m_clBuffer = 0;
m_capacity = 0;
}
// Deliberately undefined: prevents accidental deep copies.
b3OpenCLArray<T>& operator=(const b3OpenCLArray<T>& src);
// Growth policy: double the requested size (minimum 1 element).
B3_FORCE_INLINE size_t allocSize(size_t size)
{
return (size ? size * 2 : 1);
}
public:
b3OpenCLArray(cl_context ctx, cl_command_queue queue, size_t initialCapacity = 0, bool allowGrowingCapacity = true)
: m_size(0), m_capacity(0), m_clBuffer(0), m_clContext(ctx), m_commandQueue(queue), m_ownsMemory(true), m_allowGrowingCapacity(true)
{
if (initialCapacity)
{
reserve(initialCapacity);
}
m_allowGrowingCapacity = allowGrowingCapacity;
}
///this is an error-prone method with no error checking, be careful!
// Wraps an externally owned buffer: this array will neither grow nor release it.
void setFromOpenCLBuffer(cl_mem buffer, size_t sizeInElements)
{
deallocate();
m_ownsMemory = false;
m_allowGrowingCapacity = false;
m_clBuffer = buffer;
m_size = sizeInElements;
m_capacity = sizeInElements;
}
// we could enable this assignment, but need to make sure to avoid accidental deep copies
//	b3OpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src)
//	{
//		copyFromArray(src);
//		return *this;
//	}
cl_mem getBufferCL() const
{
return m_clBuffer;
}
virtual ~b3OpenCLArray()
{
deallocate();
m_size = 0;
m_capacity = 0;
}
// Appends one element; involves a host->device transfer per call.
B3_FORCE_INLINE bool push_back(const T& _Val, bool waitForCompletion = true)
{
bool result = true;
size_t sz = size();
if (sz == capacity())
{
result = reserve(allocSize(size()));
}
copyFromHostPointer(&_Val, 1, sz, waitForCompletion);
m_size++;
return result;
}
// Reads element n even if it lies beyond size() (but within capacity()).
B3_FORCE_INLINE T forcedAt(size_t n) const
{
b3Assert(n >= 0);
b3Assert(n < capacity());
T elem;
copyToHostPointer(&elem, 1, n, true);
return elem;
}
// Blocking single-element read from the GPU.
B3_FORCE_INLINE T at(size_t n) const
{
b3Assert(n >= 0);
b3Assert(n < size());
T elem;
copyToHostPointer(&elem, 1, n, true);
return elem;
}
// Shrinking never releases GPU memory; growing may reallocate (see reserve).
// New elements are left uninitialized.
B3_FORCE_INLINE bool resize(size_t newsize, bool copyOldContents = true)
{
bool result = true;
size_t curSize = size();
if (newsize < curSize)
{
//leave the OpenCL memory for now
}
else
{
if (newsize > size())
{
result = reserve(newsize, copyOldContents);
}
//leave new data uninitialized (init in debug mode?)
//for (size_t i=curSize;i<newsize;i++) ...
}
if (result)
{
m_size = newsize;
}
else
{
m_size = 0;
}
return result;
}
B3_FORCE_INLINE size_t size() const
{
return m_size;
}
B3_FORCE_INLINE size_t capacity() const
{
return m_capacity;
}
// Ensures capacity for _Count elements, optionally copying the old contents
// into the new buffer. Fails (returns false) on allocation error or when
// growth is disallowed.
B3_FORCE_INLINE bool reserve(size_t _Count, bool copyOldContents = true)
{
bool result = true;
// determine new minimum length of allocated storage
if (capacity() < _Count)
{  // not enough room, reallocate
if (m_allowGrowingCapacity)
{
cl_int ciErrNum;
//create a new OpenCL buffer
size_t memSizeInBytes = sizeof(T) * _Count;
cl_mem buf = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, memSizeInBytes, NULL, &ciErrNum);
if (ciErrNum != CL_SUCCESS)
{
b3Error("OpenCL out-of-memory\n");
_Count = 0;
result = false;
}
//#define B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
#ifdef B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
unsigned char* src = (unsigned char*)malloc(memSizeInBytes);
for (size_t i = 0; i < memSizeInBytes; i++)
src[i] = 0xbb;
ciErrNum = clEnqueueWriteBuffer(m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0, 0, 0);
b3Assert(ciErrNum == CL_SUCCESS);
clFinish(m_commandQueue);
free(src);
#endif  //B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
if (result)
{
if (copyOldContents)
copyToCL(buf, size());
}
//deallocate the old buffer
deallocate();
m_clBuffer = buf;
m_capacity = _Count;
}
else
{
//fail: assert and
b3Assert(0);
deallocate();
result = false;
}
}
return result;
}
// Device-to-device copy of numElements elements into 'destination'.
void copyToCL(cl_mem destination, size_t numElements, size_t firstElem = 0, size_t dstOffsetInElems = 0) const
{
if (numElements <= 0)
return;
b3Assert(m_clBuffer);
b3Assert(destination);
//likely some error, destination is same as source
b3Assert(m_clBuffer != destination);
b3Assert((firstElem + numElements) <= m_size);
cl_int status = 0;
b3Assert(numElements > 0);
b3Assert(numElements <= m_size);
size_t srcOffsetBytes = sizeof(T) * firstElem;
size_t dstOffsetInBytes = sizeof(T) * dstOffsetInElems;
status = clEnqueueCopyBuffer(m_commandQueue, m_clBuffer, destination,
srcOffsetBytes, dstOffsetInBytes, sizeof(T) * numElements, 0, 0, 0);
b3Assert(status == CL_SUCCESS);
}
// Resizes to match srcArray and uploads its contents (old contents dropped).
void copyFromHost(const b3AlignedObjectArray<T>& srcArray, bool waitForCompletion = true)
{
size_t newSize = srcArray.size();
bool copyOldContents = false;
resize(newSize, copyOldContents);
if (newSize)
copyFromHostPointer(&srcArray[0], newSize, 0, waitForCompletion);
}
// Uploads numElems elements from host memory starting at element destFirstElem.
void copyFromHostPointer(const T* src, size_t numElems, size_t destFirstElem = 0, bool waitForCompletion = true)
{
b3Assert(numElems + destFirstElem <= capacity());
if (numElems + destFirstElem)
{
cl_int status = 0;
size_t sizeInBytes = sizeof(T) * numElems;
status = clEnqueueWriteBuffer(m_commandQueue, m_clBuffer, 0, sizeof(T) * destFirstElem, sizeInBytes,
src, 0, 0, 0);
b3Assert(status == CL_SUCCESS);
if (waitForCompletion)
clFinish(m_commandQueue);
}
else
{
b3Error("copyFromHostPointer invalid range\n");
}
}
// Downloads the whole array into destArray (resized to match).
void copyToHost(b3AlignedObjectArray<T>& destArray, bool waitForCompletion = true) const
{
destArray.resize(this->size());
if (size())
copyToHostPointer(&destArray[0], size(), 0, waitForCompletion);
}
// Downloads numElem elements starting at srcFirstElem into host memory.
void copyToHostPointer(T* destPtr, size_t numElem, size_t srcFirstElem = 0, bool waitForCompletion = true) const
{
b3Assert(numElem + srcFirstElem <= capacity());
if (numElem + srcFirstElem <= capacity())
{
cl_int status = 0;
status = clEnqueueReadBuffer(m_commandQueue, m_clBuffer, 0, sizeof(T) * srcFirstElem, sizeof(T) * numElem,
destPtr, 0, 0, 0);
b3Assert(status == CL_SUCCESS);
if (waitForCompletion)
clFinish(m_commandQueue);
}
else
{
b3Error("copyToHostPointer invalid range\n");
}
}
// Device-to-device clone of another array's contents (resizes this array).
void copyFromOpenCLArray(const b3OpenCLArray& src)
{
size_t newSize = src.size();
resize(newSize);
if (size())
{
src.copyToCL(m_clBuffer, size());
}
}
};
#endif //B3_OPENCL_ARRAY_H

View file

@ -1,120 +0,0 @@
#include "b3PrefixScanCL.h"
#include "b3FillCL.h"
#define B3_PREFIXSCAN_PROG_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl"
#include "b3LauncherCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "kernels/PrefixScanKernelsCL.h"
// Compiles the three scan kernels (local scan, top-level block-sum scan,
// offset propagation) and allocates a scratch buffer of 'size' elements.
b3PrefixScanCL::b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
: m_commandQueue(queue)
{
const char* scanKernelSource = prefixScanKernelsCL;
cl_int pErrNum;
char* additionalMacros = 0;
// Holds per-work-group partial sums between the scan passes.
m_workBuffer = new b3OpenCLArray<unsigned int>(ctx, queue, size);
cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, scanKernelSource, &pErrNum, additionalMacros, B3_PREFIXSCAN_PROG_PATH);
b3Assert(scanProg);
m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_localScanKernel);
m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_blockSumKernel);
m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_propagationKernel);
}
b3PrefixScanCL::~b3PrefixScanCL()
{
	// Release the three compiled kernels, then the scratch buffer.
	clReleaseKernel(m_localScanKernel);
	clReleaseKernel(m_blockSumKernel);
	clReleaseKernel(m_propagationKernel);
	delete m_workBuffer;
}
template <class T>
T b3NextPowerOf2(T n)
{
n -= 1;
for (int i = 0; i < sizeof(T) * 8; i++)
n = n | (n >> i);
return n + 1;
}
// GPU exclusive prefix scan: dst[i] = src[0] + ... + src[i-1]. If 'sum' is
// non-null it receives dst[n-1] (the sum of the first n-1 elements).
// Three passes: per-block local scan, scan of the per-block sums, then
// propagation of the block offsets.
void b3PrefixScanCL::execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum)
{
//	b3Assert( data->m_option == EXCLUSIVE );
// Each work-group scans BLOCK_SIZE*2 elements.
const unsigned int numBlocks = (const unsigned int)((n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2));
dst.resize(src.size());
m_workBuffer->resize(src.size());
b3Int4 constBuffer;
constBuffer.x = n;
constBuffer.y = numBlocks;
constBuffer.z = (int)b3NextPowerOf2(numBlocks);
b3OpenCLArray<unsigned int>* srcNative = &src;
b3OpenCLArray<unsigned int>* dstNative = &dst;
{
// Pass 1: scan each block locally; per-block totals go to m_workBuffer.
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(srcNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_localScanKernel, "m_localScanKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D(numBlocks * BLOCK_SIZE, BLOCK_SIZE);
}
{
// Pass 2: scan the per-block totals in a single work-group.
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_blockSumKernel, "m_blockSumKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D(BLOCK_SIZE, BLOCK_SIZE);
}
if (numBlocks > 1)
{
// Pass 3: add each block's scanned offset to its elements.
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_propagationKernel, "m_propagationKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D((numBlocks - 1) * BLOCK_SIZE, BLOCK_SIZE);
}
if (sum)
{
// Blocking read of the last scanned value.
clFinish(m_commandQueue);
dstNative->copyToHostPointer(sum, 1, n - 1, true);
}
}
// CPU exclusive prefix scan: dst[i] = src[0] + ... + src[i-1].
// When 'sum' is non-null it receives dst[n-1] (matching the GPU path).
void b3PrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum)
{
	unsigned int runningTotal = 0;
	for (int i = 0; i < n; i++)
	{
		unsigned int current = src[i];
		dst[i] = runningTotal;
		runningTotal += current;
	}
	if (sum)
	{
		*sum = dst[n - 1];
	}
}

View file

@ -1,35 +0,0 @@
#ifndef B3_PREFIX_SCAN_CL_H
#define B3_PREFIX_SCAN_CL_H
#include "b3OpenCLArray.h"
#include "b3BufferInfoCL.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
// GPU exclusive prefix scan over unsigned int elements, with a CPU fallback.
class b3PrefixScanCL
{
enum
{
// Work-group size; each group scans BLOCK_SIZE*2 elements per pass.
BLOCK_SIZE = 128
};
//	Option	m_option;
cl_command_queue m_commandQueue;
cl_kernel m_localScanKernel;
cl_kernel m_blockSumKernel;
cl_kernel m_propagationKernel;
// Scratch buffer for per-work-group partial sums.
b3OpenCLArray<unsigned int>* m_workBuffer;
public:
b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size = 0);
virtual ~b3PrefixScanCL();
// Exclusive scan on GPU / CPU; 'sum' (optional) receives dst[n-1].
void execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0);
void executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum = 0);
};
#endif //B3_PREFIX_SCAN_CL_H

View file

@ -1,120 +0,0 @@
#include "b3PrefixScanFloat4CL.h"
#include "b3FillCL.h"
#define B3_PREFIXSCAN_FLOAT4_PROG_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl"
#include "b3LauncherCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "kernels/PrefixScanKernelsFloat4CL.h"
// Compiles the three float4 scan kernels (local scan, top-level block-sum
// scan, offset propagation) and allocates a scratch buffer of 'size' elements.
b3PrefixScanFloat4CL::b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
: m_commandQueue(queue)
{
const char* scanKernelSource = prefixScanKernelsFloat4CL;
cl_int pErrNum;
char* additionalMacros = 0;
// Holds per-work-group partial sums between the scan passes.
m_workBuffer = new b3OpenCLArray<b3Vector3>(ctx, queue, size);
cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, scanKernelSource, &pErrNum, additionalMacros, B3_PREFIXSCAN_FLOAT4_PROG_PATH);
b3Assert(scanProg);
m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_localScanKernel);
m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_blockSumKernel);
m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_propagationKernel);
}
b3PrefixScanFloat4CL::~b3PrefixScanFloat4CL()
{
	// Release the three compiled kernels, then the scratch buffer.
	clReleaseKernel(m_localScanKernel);
	clReleaseKernel(m_blockSumKernel);
	clReleaseKernel(m_propagationKernel);
	delete m_workBuffer;
}
template <class T>
T b3NextPowerOf2(T n)
{
n -= 1;
for (int i = 0; i < sizeof(T) * 8; i++)
n = n | (n >> i);
return n + 1;
}
// GPU exclusive prefix scan over b3Vector3 (float4) elements:
// dst[i] = src[0] + ... + src[i-1]. If 'sum' is non-null it receives
// dst[n-1]. Three passes: per-block local scan, scan of the per-block sums,
// then propagation of the block offsets.
void b3PrefixScanFloat4CL::execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<b3Vector3>& dst, int n, b3Vector3* sum)
{
//	b3Assert( data->m_option == EXCLUSIVE );
// Each work-group scans BLOCK_SIZE*2 elements.
const unsigned int numBlocks = (const unsigned int)((n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2));
dst.resize(src.size());
m_workBuffer->resize(src.size());
b3Int4 constBuffer;
constBuffer.x = n;
constBuffer.y = numBlocks;
constBuffer.z = (int)b3NextPowerOf2(numBlocks);
b3OpenCLArray<b3Vector3>* srcNative = &src;
b3OpenCLArray<b3Vector3>* dstNative = &dst;
{
// Pass 1: scan each block locally; per-block totals go to m_workBuffer.
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(srcNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_localScanKernel, "m_localScanKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D(numBlocks * BLOCK_SIZE, BLOCK_SIZE);
}
{
// Pass 2: scan the per-block totals in a single work-group.
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_blockSumKernel, "m_blockSumKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D(BLOCK_SIZE, BLOCK_SIZE);
}
if (numBlocks > 1)
{
// Pass 3: add each block's scanned offset to its elements.
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_propagationKernel, "m_propagationKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D((numBlocks - 1) * BLOCK_SIZE, BLOCK_SIZE);
}
if (sum)
{
// Blocking read of the last scanned value.
clFinish(m_commandQueue);
dstNative->copyToHostPointer(sum, 1, n - 1, true);
}
}
// CPU exclusive prefix scan over b3Vector3: dst[i] = src[0] + ... + src[i-1].
// When 'sum' is non-null it receives dst[n-1] (matching the GPU path).
void b3PrefixScanFloat4CL::executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum)
{
	b3Vector3 runningTotal = b3MakeVector3(0, 0, 0);
	for (int i = 0; i < n; i++)
	{
		b3Vector3 current = src[i];
		dst[i] = runningTotal;
		runningTotal += current;
	}
	if (sum)
	{
		*sum = dst[n - 1];
	}
}

View file

@ -1,36 +0,0 @@
// BUGFIX: the original guard (B3_PREFIX_SCAN_CL_H) collided with the guard in
// b3PrefixScanCL.h, so whichever of the two headers was included second in a
// translation unit was silently skipped.
#ifndef B3_PREFIX_SCAN_FLOAT4_CL_H
#define B3_PREFIX_SCAN_FLOAT4_CL_H

#include "b3OpenCLArray.h"
#include "b3BufferInfoCL.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Common/b3Vector3.h"

// GPU exclusive prefix scan over b3Vector3 (float4) elements, with a CPU fallback.
class b3PrefixScanFloat4CL
{
	enum
	{
		// Work-group size; each group scans BLOCK_SIZE*2 elements per pass.
		BLOCK_SIZE = 128
	};

	cl_command_queue m_commandQueue;
	cl_kernel m_localScanKernel;
	cl_kernel m_blockSumKernel;
	cl_kernel m_propagationKernel;
	// Scratch buffer for per-work-group partial sums.
	b3OpenCLArray<b3Vector3>* m_workBuffer;

public:
	b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size = 0);
	virtual ~b3PrefixScanFloat4CL();
	// Exclusive scan on GPU / CPU; 'sum' (optional) receives dst[n-1].
	void execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<b3Vector3>& dst, int n, b3Vector3* sum = 0);
	void executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum);
};
#endif  //B3_PREFIX_SCAN_FLOAT4_CL_H

View file

@ -1,646 +0,0 @@
#include "b3RadixSort32CL.h"
#include "b3LauncherCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "b3PrefixScanCL.h"
#include "b3FillCL.h"
#define RADIXSORT32_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl"
#include "kernels/RadixSort32KernelsCL.h"
// Compiles the radix-sort kernels (choosing serial variants on CPU devices),
// creates the scratch buffers and the prefix-scan/fill helpers. If
// 'initialCapacity' > 0 the scratch buffers are pre-sized to avoid
// reallocation on the first sort.
b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
: m_commandQueue(queue)
{
b3OpenCLDeviceInfo info;
b3OpenCLUtils::getDeviceInfo(device, &info);
// CPU devices get serial sort/scatter kernel variants (selected below).
m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU) != 0;
m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx, queue);
m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx, queue);
m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx, queue);
m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx, queue);
m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx, queue);
m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx, queue);
if (initialCapacity > 0)
{
m_workBuffer1->resize(initialCapacity);
m_workBuffer3->resize(initialCapacity);
m_workBuffer3a->resize(initialCapacity);
m_workBuffer4->resize(initialCapacity);
m_workBuffer4a->resize(initialCapacity);
}
m_scan = new b3PrefixScanCL(ctx, device, queue);
m_fill = new b3FillCL(ctx, device, queue);
const char* additionalMacros = "";
cl_int pErrNum;
const char* kernelSource = radixSort32KernelsCL;
cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, RADIXSORT32_PATH);
b3Assert(sortProg);
m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_streamCountSortDataKernel);
m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_streamCountKernel);
if (m_deviceCPU)
{
m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg, additionalMacros);
b3Assert(m_sortAndScatterSortDataKernel);
m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg, additionalMacros);
b3Assert(m_sortAndScatterKernel);
}
else
{
m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_sortAndScatterSortDataKernel);
m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_sortAndScatterKernel);
}
m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_prefixScanKernel);
}
b3RadixSort32CL::~b3RadixSort32CL()
{
	// Release the compiled kernels, then the helpers and scratch buffers.
	clReleaseKernel(m_streamCountSortDataKernel);
	clReleaseKernel(m_streamCountKernel);
	clReleaseKernel(m_sortAndScatterSortDataKernel);
	clReleaseKernel(m_sortAndScatterKernel);
	clReleaseKernel(m_prefixScanKernel);

	delete m_scan;
	delete m_fill;
	delete m_workBuffer1;
	delete m_workBuffer2;
	delete m_workBuffer3;
	delete m_workBuffer3a;
	delete m_workBuffer4;
	delete m_workBuffer4a;
}
// CPU LSD radix sort of key/value pairs by m_key, 8 bits per pass, stable
// within each pass. 'sortBits' selects how many low-order key bits to sort on.
void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */)
{
	int n = inout.size();
	const int BITS_PER_PASS = 8;
	const int NUM_TABLES = (1 << BITS_PER_PASS);

	int tables[NUM_TABLES];
	int counter[NUM_TABLES];

	b3SortData* src = &inout[0];
	b3AlignedObjectArray<b3SortData> workbuffer;
	workbuffer.resize(inout.size());
	b3SortData* dst = &workbuffer[0];

	int count = 0;
	for (int startBit = 0; startBit < sortBits; startBit += BITS_PER_PASS)
	{
		// Histogram of the current 8-bit digit.
		for (int i = 0; i < NUM_TABLES; i++)
		{
			tables[i] = 0;
		}

		for (int i = 0; i < n; i++)
		{
			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
			tables[tableIdx]++;
		}
//#define TEST
#ifdef TEST
		printf("histogram size=%d\n", NUM_TABLES);
		for (int i = 0; i < NUM_TABLES; i++)
		{
			if (tables[i] != 0)
			{
				printf("tables[%d]=%d]\n", i, tables[i]);
			}
		}
// NOTE: the original '#endif //TEST \' carried a trailing backslash that
// spliced the following comment line into this one; removed.
#endif  //TEST
		// Exclusive prefix scan turns digit counts into destination offsets.
		int sum = 0;
		for (int i = 0; i < NUM_TABLES; i++)
		{
			int iData = tables[i];
			tables[i] = sum;
			sum += iData;
			counter[i] = 0;
		}

		// Stable distribution into the other buffer.
		for (int i = 0; i < n; i++)
		{
			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
			dst[tables[tableIdx] + counter[tableIdx]] = src[i];
			counter[tableIdx]++;
		}

		b3Swap(src, dst);
		count++;
	}

	// BUGFIX: after an odd number of passes the sorted data lives in the
	// scratch buffer ('src' after the final swap); copy it back into 'inout'
	// ('dst') instead of asserting as the original did.
	if (count & 1)
	{
		for (int i = 0; i < n; i++)
		{
			dst[i] = src[i];
		}
	}
}
// CPU sort of a GPU-resident array: round-trip through host memory and reuse
// the host implementation.
void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
{
	b3AlignedObjectArray<b3SortData> hostData;
	keyValuesInOut.copyToHost(hostData);
	executeHost(hostData, sortBits);
	keyValuesInOut.copyFromHost(hostData);
}
// Intentionally unimplemented stub: the separate key/value-array overload is
// not supported; use the b3SortData (packed key/value pair) overload instead.
void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
{
}
//#define DEBUG_RADIXSORT
//#define DEBUG_RADIXSORT2
//In-place GPU LSD radix sort of key/value pairs, processing 4 bits per pass
//(BITS_PER_PASS). Each pass runs three stages on the device:
// 1) per-workgroup histogram of the current 4-bit digit (stream count),
// 2) exclusive prefix scan of the histograms (fast kernel on GPU, fallback
// b3PrefixScanCL on CPU devices / OSX),
// 3) local sort + scatter into the destination buffer.
//The input is first padded up to a multiple of DATA_ALIGNMENT with
//0xffffffff keys so the padding sorts to the very end and can be trimmed
//off when the result is copied back.
void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
{
int originalSize = keyValuesInOut.size();
int workingSize = originalSize;
int dataAlignment = DATA_ALIGNMENT;
#ifdef DEBUG_RADIXSORT2
b3AlignedObjectArray<b3SortData> test2;
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n", test2.size());
for (int i = 0; i < test2.size(); i++)
{
printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
}
#endif //DEBUG_RADIXSORT2
b3OpenCLArray<b3SortData>* src = 0;
//Pad the working copy to DATA_ALIGNMENT. Padded entries get key 0xffffffff
//so they land after all real data once sorted.
if (workingSize % dataAlignment)
{
workingSize += dataAlignment - (workingSize % dataAlignment);
m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
m_workBuffer4->resize(workingSize);
b3SortData fillValue;
fillValue.m_key = 0xffffffff;
fillValue.m_value = 0xffffffff;
#define USE_BTFILL
#ifdef USE_BTFILL
m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4, (b3Int2&)fillValue, workingSize - originalSize, originalSize);
#else
//fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
for (int i = originalSize; i < workingSize; i++)
{
m_workBuffer4->copyFromHostPointer(&fillValue, 1, i);
}
#endif //USE_BTFILL
src = m_workBuffer4;
}
else
{
//Already aligned: sort directly in the caller's buffer. m_workBuffer4 is
//emptied so the copy-back step at the end knows no padding was used.
src = &keyValuesInOut;
m_workBuffer4->resize(0);
}
b3Assert(workingSize % DATA_ALIGNMENT == 0);
int minCap = NUM_BUCKET * NUM_WGS;
int n = workingSize;
m_workBuffer1->resize(minCap);
m_workBuffer3->resize(workingSize);
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
b3Assert(BITS_PER_PASS == 4);
b3Assert(WG_SIZE == 64);
b3Assert((sortBits & 0x3) == 0);
b3OpenCLArray<b3SortData>* dst = m_workBuffer3;
b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
int nWGs = NUM_WGS;
b3ConstData cdata;
//Work distribution: each of the (up to) NUM_WGS workgroups processes
//m_nBlocksPerWG blocks of ELEMENTS_PER_WORK_ITEM*WG_SIZE elements.
{
int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256
int nBlocks = (n + blockSize - 1) / (blockSize);
cdata.m_n = n;
cdata.m_nWGs = NUM_WGS;
cdata.m_startBit = 0;
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
if (nBlocks < NUM_WGS)
{
cdata.m_nBlocksPerWG = 1;
nWGs = nBlocks;
}
}
int count = 0;
//One iteration per 4-bit digit, from the least significant bit upward.
//src and dst ping-pong each pass via the b3Swap calls at the bottom.
for (int ib = 0; ib < sortBits; ib += 4)
{
#ifdef DEBUG_RADIXSORT2
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n", test2.size());
for (int i = 0; i < test2.size(); i++)
{
if (test2[i].m_key != test2[i].m_value)
{
printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
}
}
#endif //DEBUG_RADIXSORT2
cdata.m_startBit = ib;
//Stage 1: per-workgroup digit histogram into srcHisto.
if (src->size())
{
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel, "m_streamCountSortDataKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
int num = NUM_WGS * WG_SIZE;
launcher.launch1D(num, WG_SIZE);
}
#ifdef DEBUG_RADIXSORT
b3AlignedObjectArray<unsigned int> testHist;
srcHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
for (int i = 0; i < testHist.size(); i++)
{
if (testHist[i] != 0)
printf("testHist[%d]=%d\n", i, testHist[i]);
}
#endif //DEBUG_RADIXSORT
//Stage 2: exclusive prefix scan of the histogram.
//fast prefix scan is not working properly on Mac OSX yet
#ifdef __APPLE__
bool fastScan = false;
#else
bool fastScan = !m_deviceCPU; //only use fast scan on GPU
#endif
if (fastScan)
{ // prefix scan group histogram
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
launcher.launch1D(128, 128);
//The fast kernel scans in place, so the scanned histogram stays in srcHisto.
destHisto = srcHisto;
}
else
{
//unsigned int sum; //for debugging
m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum);
}
#ifdef DEBUG_RADIXSORT
destHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
for (int i = 0; i < testHist.size(); i++)
{
if (testHist[i] != 0)
printf("testHist[%d]=%d\n", i, testHist[i]);
}
for (int i = 0; i < testHist.size(); i += NUM_WGS)
{
printf("testHist[%d]=%d\n", i / NUM_WGS, testHist[i]);
}
#endif //DEBUG_RADIXSORT
//Stage 3: local sort and scatter. USE_GPU is the production path; the
//#else branch is a host-side reference implementation used to validate
//the kernel's scatter addressing.
#define USE_GPU
#ifdef USE_GPU
if (src->size())
{ // local sort and distribute
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_sortAndScatterSortDataKernel, "m_sortAndScatterSortDataKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
}
#else
{
#define NUM_TABLES 16
//#define SEQUENTIAL
#ifdef SEQUENTIAL
int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int tables[NUM_TABLES];
int startBit = ib;
destHisto->copyToHost(testHist);
b3AlignedObjectArray<b3SortData> srcHost;
b3AlignedObjectArray<b3SortData> dstHost;
dstHost.resize(src->size());
src->copyToHost(srcHost);
for (int i = 0; i < NUM_TABLES; i++)
{
tables[i] = testHist[i * NUM_WGS];
}
// distribute
for (int i = 0; i < n; i++)
{
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
counter2[tableIdx]++;
}
#else
int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int tables[NUM_TABLES];
b3AlignedObjectArray<b3SortData> dstHostOK;
dstHostOK.resize(src->size());
destHisto->copyToHost(testHist);
b3AlignedObjectArray<b3SortData> srcHost;
src->copyToHost(srcHost);
int blockSize = 256;
int nBlocksPerWG = cdata.m_nBlocksPerWG;
int startBit = ib;
//Reference scatter (sequential over the whole array) used as ground truth.
{
for (int i = 0; i < NUM_TABLES; i++)
{
tables[i] = testHist[i * NUM_WGS];
}
// distribute
for (int i = 0; i < n; i++)
{
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
counter2[tableIdx]++;
}
}
//Emulate the per-workgroup scatter the GPU kernel performs and compare
//every element against the reference result above.
b3AlignedObjectArray<b3SortData> dstHost;
dstHost.resize(src->size());
int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
for (int wgIdx = 0; wgIdx < NUM_WGS; wgIdx++)
{
int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int nBlocks = (n) / blockSize - nBlocksPerWG * wgIdx;
for (int iblock = 0; iblock < b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
{
for (int lIdx = 0; lIdx < 64; lIdx++)
{
int addr = iblock * blockSize + blockSize * cdata.m_nBlocksPerWG * wgIdx + ELEMENTS_PER_WORK_ITEM * lIdx;
// MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
// Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
// AMD: AtomInc performs better while NV prefers ++
for (int j = 0; j < ELEMENTS_PER_WORK_ITEM; j++)
{
if (addr + j < n)
{
// printf ("addr+j=%d\n", addr+j);
int i = addr + j;
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
int destIndex = testHist[tableIdx * NUM_WGS + wgIdx] + counter[tableIdx];
b3SortData ok = dstHostOK[destIndex];
if (ok.m_key != srcHost[i].m_key)
{
printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key, srcHost[i].m_key);
printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value, srcHost[i].m_value);
}
if (ok.m_value != srcHost[i].m_value)
{
printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value, srcHost[i].m_value);
printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key, srcHost[i].m_key);
}
dstHost[destIndex] = srcHost[i];
counter[tableIdx]++;
}
}
}
}
}
#endif //SEQUENTIAL
dst->copyFromHost(dstHost);
}
#endif //USE_GPU
#ifdef DEBUG_RADIXSORT
destHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
for (int i = 0; i < testHist.size(); i++)
{
if (testHist[i] != 0)
printf("testHist[%d]=%d\n", i, testHist[i]);
}
#endif //DEBUG_RADIXSORT
//Ping-pong the data and histogram buffers for the next pass.
b3Swap(src, dst);
b3Swap(srcHisto, destHisto);
#ifdef DEBUG_RADIXSORT2
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n", test2.size());
for (int i = 0; i < test2.size(); i++)
{
if (test2[i].m_key != test2[i].m_value)
{
printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
}
}
#endif //DEBUG_RADIXSORT2
count++;
}
//NOTE(review): an odd number of passes (sortBits == 4, 12, 20, 28) leaves
//the result in the temp buffer instead of the caller's array; that case is
//not supported and simply asserts here.
if (count & 1)
{
b3Assert(0); //need to copy from workbuffer to keyValuesInOut
}
//If we sorted the padded copy, trim the 0xffffffff padding (now at the end)
//and copy the sorted prefix back into the caller's buffer.
if (m_workBuffer4->size())
{
m_workBuffer4->resize(originalSize);
keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
}
#ifdef DEBUG_RADIXSORT
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n", test2.size());
for (int i = 0; i < test2.size(); i++)
{
printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
}
#endif
}
//Keys-only variant of the in-place GPU radix sort (same three-stage,
//4-bits-per-pass pipeline as the b3SortData overload, but using the
//keys-only histogram/scatter kernels and the unsigned-int work buffers).
void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
{
int originalSize = keysInOut.size();
int workingSize = originalSize;
int dataAlignment = DATA_ALIGNMENT;
b3OpenCLArray<unsigned int>* src = 0;
//Pad to DATA_ALIGNMENT with 0xffffffff so padding sorts last and can be
//trimmed off after the sort.
if (workingSize % dataAlignment)
{
workingSize += dataAlignment - (workingSize % dataAlignment);
m_workBuffer4a->copyFromOpenCLArray(keysInOut);
m_workBuffer4a->resize(workingSize);
unsigned int fillValue = 0xffffffff;
m_fill->execute(*m_workBuffer4a, fillValue, workingSize - originalSize, originalSize);
src = m_workBuffer4a;
}
else
{
//Already aligned: sort in place; empty m_workBuffer4a signals "no padding".
src = &keysInOut;
m_workBuffer4a->resize(0);
}
b3Assert(workingSize % DATA_ALIGNMENT == 0);
int minCap = NUM_BUCKET * NUM_WGS;
int n = workingSize;
m_workBuffer1->resize(minCap);
m_workBuffer3->resize(workingSize);
m_workBuffer3a->resize(workingSize);
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
b3Assert(BITS_PER_PASS == 4);
b3Assert(WG_SIZE == 64);
b3Assert((sortBits & 0x3) == 0);
b3OpenCLArray<unsigned int>* dst = m_workBuffer3a;
b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
int nWGs = NUM_WGS;
b3ConstData cdata;
//Work distribution: each workgroup handles m_nBlocksPerWG blocks of
//ELEMENTS_PER_WORK_ITEM*WG_SIZE elements.
{
int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256
int nBlocks = (n + blockSize - 1) / (blockSize);
cdata.m_n = n;
cdata.m_nWGs = NUM_WGS;
cdata.m_startBit = 0;
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
if (nBlocks < NUM_WGS)
{
cdata.m_nBlocksPerWG = 1;
nWGs = nBlocks;
}
}
int count = 0;
//One pass per 4-bit digit; src/dst ping-pong each iteration.
for (int ib = 0; ib < sortBits; ib += 4)
{
cdata.m_startBit = ib;
//Stage 1: per-workgroup digit histogram.
if (src->size())
{
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_streamCountKernel, "m_streamCountKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
int num = NUM_WGS * WG_SIZE;
launcher.launch1D(num, WG_SIZE);
}
//Stage 2: exclusive prefix scan of the histograms.
//fast prefix scan is not working properly on Mac OSX yet
#ifdef __APPLE__
bool fastScan = false;
#else
bool fastScan = !m_deviceCPU;
#endif
if (fastScan)
{ // prefix scan group histogram
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
launcher.launch1D(128, 128);
//Fast kernel scans in place.
destHisto = srcHisto;
}
else
{
//unsigned int sum; //for debugging
m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum);
}
//Stage 3: local sort and scatter into dst.
if (src->size())
{ // local sort and distribute
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_sortAndScatterKernel, "m_sortAndScatterKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
}
b3Swap(src, dst);
b3Swap(srcHisto, destHisto);
count++;
}
//NOTE(review): an odd pass count would leave the result in the temp buffer;
//not supported -- asserts instead of copying back.
if (count & 1)
{
b3Assert(0); //need to copy from workbuffer to keyValuesInOut
}
//Trim the padding and copy the sorted data back if a padded copy was used.
if (m_workBuffer4a->size())
{
m_workBuffer4a->resize(originalSize);
keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
}
}

View file

@ -1,84 +0,0 @@
#ifndef B3_RADIXSORT32_H
#define B3_RADIXSORT32_H
#include "b3OpenCLArray.h"
//8-byte key/value pair sorted by b3RadixSort32CL. The anonymous unions let
//the same fields be addressed either as (m_key, m_value) or as (x, y),
//matching the uint2-style access used by the OpenCL kernels.
struct b3SortData
{
union {
unsigned int m_key;
unsigned int x;
};
union {
unsigned int m_value;
unsigned int y;
};
};
#include "b3BufferInfoCL.h"
//GPU radix sort for 32-bit keys (optionally paired with 32-bit values),
//processing BITS_PER_PASS bits per pass via OpenCL kernels.
//Based on Takahiro Harada's parallel-primitives code.
class b3RadixSort32CL
{
//Scratch buffers, ping-ponged between passes:
b3OpenCLArray<unsigned int>* m_workBuffer1; //per-workgroup digit histograms
b3OpenCLArray<unsigned int>* m_workBuffer2; //scanned histograms
b3OpenCLArray<b3SortData>* m_workBuffer3; //key/value ping-pong buffer
b3OpenCLArray<b3SortData>* m_workBuffer4; //padded copy of key/value input
b3OpenCLArray<unsigned int>* m_workBuffer3a; //keys-only ping-pong buffer
b3OpenCLArray<unsigned int>* m_workBuffer4a; //padded copy of keys-only input
cl_command_queue m_commandQueue;
cl_kernel m_streamCountSortDataKernel; //histogram, key/value variant
cl_kernel m_streamCountKernel; //histogram, keys-only variant
cl_kernel m_prefixScanKernel; //fast in-place scan (GPU only)
cl_kernel m_sortAndScatterSortDataKernel; //scatter, key/value variant
cl_kernel m_sortAndScatterKernel; //scatter, keys-only variant
bool m_deviceCPU; //true on CPU devices: use the fallback prefix scan
class b3PrefixScanCL* m_scan; //fallback scan implementation
class b3FillCL* m_fill; //GPU buffer fill, used for padding
public:
//Per-pass constants passed to every kernel.
struct b3ConstData
{
int m_n; //number of (padded) elements
int m_nWGs; //number of workgroups
int m_startBit; //first bit of the digit sorted in this pass
int m_nBlocksPerWG; //blocks of BLOCK_SIZE elements per workgroup
};
enum
{
DATA_ALIGNMENT = 256,
WG_SIZE = 64,
BLOCK_SIZE = 256,
ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE / WG_SIZE),
BITS_PER_PASS = 4,
NUM_BUCKET = (1 << BITS_PER_PASS),
// if you change this, change nPerWI in kernel as well
NUM_WGS = 20 * 6, // cypress
// NUM_WGS = 24*6, // cayman
// NUM_WGS = 32*4, // nv
};
private:
public:
b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity = 0);
virtual ~b3RadixSort32CL();
//NOTE(review): this overload is an empty stub in the .cpp -- see there.
void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
///keys only
void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32);
void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
//Host-side reference implementations (useful for validation/debugging).
void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
};
#endif //B3_RADIXSORT32_H

View file

@ -1,106 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
typedef struct
{
u32 m_key;
u32 m_value;
}SortData;
typedef struct
{
u32 m_nSrc;
u32 m_nDst;
u32 m_padding[2];
} ConstBuffer;
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
//For sorted src, writes into dst the lower-bound index for each key value
//that appears: at every boundary where the key changes between consecutive
//elements, dst[newKey] = index of the first element with that key.
//Entries of dst for keys that never appear are left untouched.
void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst,
unsigned int nSrc, unsigned int nDst)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < nSrc )
{
//Sentinels: virtual element before index 0 and after index nSrc-1.
SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);
SortData end; end.m_key = nDst; end.m_value = nDst;
SortData iData = (gIdx==0)? first: src[gIdx-1];
SortData jData = (gIdx==nSrc)? end: src[gIdx];
if( iData.m_key != jData.m_key )
{
// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)
u32 k = jData.m_key;
{
dst[k] = gIdx;
}
}
}
}
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
//Counterpart of SearchSortDataLowerKernel: writes upper-bound indices.
//At every key boundary, dst[oldKey] = one-past-the-last index of that key.
void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst,
unsigned int nSrc, unsigned int nDst)
{
//Shift by one so each work item compares element gIdx-1 with element gIdx.
int gIdx = GET_GLOBAL_IDX+1;
if( gIdx < nSrc+1 )
{
SortData first; first.m_key = 0; first.m_value = 0;
SortData end; end.m_key = nDst; end.m_value = nDst;
SortData iData = src[gIdx-1];
SortData jData = (gIdx==nSrc)? end: src[gIdx];
if( iData.m_key != jData.m_key )
{
u32 k = iData.m_key;
{
dst[k] = gIdx;
}
}
}
}
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
//Element-wise C[i] = A[i] - B[i] for i in [0, nDst). Typically used to turn
//upper-bound minus lower-bound into per-key counts. nSrc is unused here.
void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C,
unsigned int nSrc, unsigned int nDst)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < nDst )
{
C[gIdx] = A[gIdx] - B[gIdx];
}
}

View file

@ -1,86 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
//Embedded OpenCL source for the bound-search kernels. Do not edit this
//string by hand: regenerate it from boundSearchKernels.cl so the two copies
//stay in sync.
static const char* boundSearchKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"typedef struct\n"
"{\n"
"	u32 m_key; \n"
"	u32 m_value;\n"
"}SortData;\n"
"typedef struct\n"
"{\n"
"	u32 m_nSrc;\n"
"	u32 m_nDst;\n"
"	u32 m_padding[2];\n"
"} ConstBuffer;\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
"					unsigned int nSrc, unsigned int nDst)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	if( gIdx < nSrc )\n"
"	{\n"
"		SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
"		SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
"		if( iData.m_key != jData.m_key )\n"
"		{\n"
"//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
"			u32 k = jData.m_key;\n"
"			{\n"
"				dst[k] = gIdx;\n"
"			}\n"
"		}\n"
"	}\n"
"}\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
"					unsigned int nSrc, unsigned int nDst)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX+1;\n"
"	if( gIdx < nSrc+1 )\n"
"	{\n"
"		SortData first; first.m_key = 0; first.m_value = 0;\n"
"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
"		SortData iData = src[gIdx-1];\n"
"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
"		if( iData.m_key != jData.m_key )\n"
"		{\n"
"			u32 k = iData.m_key;\n"
"			{\n"
"				dst[k] = gIdx;\n"
"			}\n"
"		}\n"
"	}\n"
"}\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
"					unsigned int nSrc, unsigned int nDst)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	\n"
"	if( gIdx < nDst )\n"
"	{\n"
"		C[gIdx] = A[gIdx] - B[gIdx];\n"
"	}\n"
"}\n";

View file

@ -1,128 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define make_uint4 (uint4)
#define make_uint2 (uint2)
#define make_int2 (int2)
typedef struct
{
int m_n;
int m_padding[3];
} ConstBuffer;
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Copies cb.m_n float4 elements from src to dst, one element per work item.
void Copy1F4Kernel(__global float4* dst, __global float4* src,
ConstBuffer cb)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < cb.m_n )
{
float4 a0 = src[gIdx];
dst[ gIdx ] = a0;
}
}
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Copies float4 elements two at a time (each work item handles indices
//2*gIdx and 2*gIdx+1).
//NOTE(review): the guard uses `<=`, so when 2*gIdx == cb.m_n this reads and
//writes indices m_n and m_n+1 -- past m_n elements. Presumably callers size
//the buffers with slack for this; confirm before tightening the bound.
void Copy2F4Kernel(__global float4* dst, __global float4* src,
ConstBuffer cb)
{
int gIdx = GET_GLOBAL_IDX;
if( 2*gIdx <= cb.m_n )
{
float4 a0 = src[gIdx*2+0];
float4 a1 = src[gIdx*2+1];
dst[ gIdx*2+0 ] = a0;
dst[ gIdx*2+1 ] = a1;
}
}
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Copies float4 elements four at a time (each work item handles indices
//4*gIdx .. 4*gIdx+3).
//NOTE(review): same `<=` boundary as Copy2F4Kernel -- when 4*gIdx == cb.m_n
//this accesses up to index m_n+3; verify callers allocate the slack.
void Copy4F4Kernel(__global float4* dst, __global float4* src,
ConstBuffer cb)
{
int gIdx = GET_GLOBAL_IDX;
if( 4*gIdx <= cb.m_n )
{
int idx0 = gIdx*4+0;
int idx1 = gIdx*4+1;
int idx2 = gIdx*4+2;
int idx3 = gIdx*4+3;
float4 a0 = src[idx0];
float4 a1 = src[idx1];
float4 a2 = src[idx2];
float4 a3 = src[idx3];
dst[ idx0 ] = a0;
dst[ idx1 ] = a1;
dst[ idx2 ] = a2;
dst[ idx3 ] = a3;
}
}
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Copies cb.m_n scalar floats from srcF1 to dstF1, one per work item.
void CopyF1Kernel(__global float* dstF1, __global float* srcF1,
ConstBuffer cb)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < cb.m_n )
{
float a0 = srcF1[gIdx];
dstF1[ gIdx ] = a0;
}
}
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Copies cb.m_n float2 elements from srcF2 to dstF2, one per work item.
void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2,
ConstBuffer cb)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < cb.m_n )
{
float2 a0 = srcF2[gIdx];
dstF2[ gIdx ] = a0;
}
}

View file

@ -1,131 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
//Embedded OpenCL source for the buffer-copy kernels. Do not edit this string
//by hand: regenerate it from copyKernels.cl so the two copies stay in sync.
static const char* copyKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"typedef struct\n"
"{\n"
"	int m_n;\n"
"	int m_padding[3];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < cb.m_n )\n"
"	{\n"
"		float4 a0 = src[gIdx];\n"
"\n"
"		dst[ gIdx ] = a0;\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( 2*gIdx <= cb.m_n )\n"
"	{\n"
"		float4 a0 = src[gIdx*2+0];\n"
"		float4 a1 = src[gIdx*2+1];\n"
"\n"
"		dst[ gIdx*2+0 ] = a0;\n"
"		dst[ gIdx*2+1 ] = a1;\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( 4*gIdx <= cb.m_n )\n"
"	{\n"
"		int idx0 = gIdx*4+0;\n"
"		int idx1 = gIdx*4+1;\n"
"		int idx2 = gIdx*4+2;\n"
"		int idx3 = gIdx*4+3;\n"
"\n"
"		float4 a0 = src[idx0];\n"
"		float4 a1 = src[idx1];\n"
"		float4 a2 = src[idx2];\n"
"		float4 a3 = src[idx3];\n"
"\n"
"		dst[ idx0 ] = a0;\n"
"		dst[ idx1 ] = a1;\n"
"		dst[ idx2 ] = a2;\n"
"		dst[ idx3 ] = a3;\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < cb.m_n )\n"
"	{\n"
"		float a0 = srcF1[gIdx];\n"
"\n"
"		dstF1[ gIdx ] = a0;\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < cb.m_n )\n"
"	{\n"
"		float2 a0 = srcF2[gIdx];\n"
"\n"
"		dstF2[ gIdx ] = a0;\n"
"	}\n"
"}\n"
"\n"
"\n";

View file

@ -1,107 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define make_uint4 (uint4)
#define make_uint2 (uint2)
#define make_int2 (int2)
typedef struct
{
union
{
int4 m_data;
uint4 m_unsignedData;
float m_floatData;
};
int m_offset;
int m_n;
int m_padding[2];
} ConstBuffer;
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Writes `value` into dstInt[offset .. offset+num_elements-1].
void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < num_elements )
{
dstInt[ offset+gIdx ] = value;
}
}
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Writes `value` into dstFloat[offset .. offset+num_elements-1].
void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < num_elements )
{
dstFloat[ offset+gIdx ] = value;
}
}
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Writes `value` into dstInt[offset .. offset+num-1] (unsigned variant).
void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < num )
{
dstInt[ offset+gIdx ] = value;
}
}
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Writes the int2 `value` into dstInt2[offset .. offset+num-1].
void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < num )
{
dstInt2[ gIdx + offset] = make_int2( value.x, value.y );
}
}
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
//Writes the int4 `value` into dstInt4[offset .. offset+num-1].
void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < num )
{
dstInt4[ offset+gIdx ] = value;
}
}

View file

@ -1,90 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
//Embedded OpenCL source for the buffer-fill kernels. Do not edit this string
//by hand: regenerate it from fillKernels.cl so the two copies stay in sync.
static const char* fillKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"typedef struct\n"
"{\n"
"	union\n"
"	{\n"
"		int4 m_data;\n"
"		uint4 m_unsignedData;\n"
"		float m_floatData;\n"
"	};\n"
"	int m_offset;\n"
"	int m_n;\n"
"	int m_padding[2];\n"
"} ConstBuffer;\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	if( gIdx < num_elements )\n"
"	{\n"
"		dstInt[ offset+gIdx ] = value;\n"
"	}\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	if( gIdx < num_elements )\n"
"	{\n"
"		dstFloat[ offset+gIdx ] = value;\n"
"	}\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	if( gIdx < num )\n"
"	{\n"
"		dstInt[ offset+gIdx ] = value;\n"
"	}\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	if( gIdx < num )\n"
"	{\n"
"		dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n"
"	}\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	if( gIdx < num )\n"
"	{\n"
"		dstInt4[ offset+gIdx ] = value;\n"
"	}\n"
"}\n";

View file

@ -1,154 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
// Shorthand aliases for OpenCL work-item / work-group queries (style kept from
// Takahiro Harada's original codebase).
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
// takahiro end
// Required work-group size for every kernel in this file.
#define WG_SIZE 128
// The constant buffer is passed as a packed uint4 named 'cb'; these macros name
// its components (layout documented by the commented-out ConstBuffer below).
#define m_numElems x
#define m_numBlocks y
#define m_numScanBlocks z
/*typedef struct
{
uint m_numElems;
uint m_numBlocks;
uint m_numScanBlocks;
uint m_padding[1];
} ConstBuffer;
*/
// Work-efficient (Blelloch-style) exclusive prefix scan of n float4 elements in
// local memory, performed cooperatively by the whole work group: an up-sweep
// (reduce) phase followed by a down-sweep phase that turns the reduction tree
// into exclusive prefix sums in place.
// Returns the total sum of the input; note it is only assigned on the work-item
// with lIdx == 0 — other lanes return an uninitialized value, so callers must
// only use the result on lane 0 (see LocalScanKernel below).
float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)
{
float4 blocksum;
int offset = 1;
// Up-sweep: build partial sums in a binary tree, doubling the stride per pass.
for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)
{
GROUP_LDS_BARRIER;
for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)
{
int ai = offset*(2*iIdx+1)-1;
int bi = offset*(2*iIdx+2)-1;
data[bi] += data[ai];
}
}
GROUP_LDS_BARRIER;
// Lane 0 captures the grand total, then clears the root to seed the down-sweep.
if( lIdx == 0 )
{
blocksum = data[ n-1 ];
data[ n-1 ] = 0;
}
GROUP_LDS_BARRIER;
offset >>= 1;
// Down-sweep: propagate partial sums back down, halving the stride per pass.
for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )
{
GROUP_LDS_BARRIER;
for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )
{
int ai = offset*(2*iIdx+1)-1;
int bi = offset*(2*iIdx+2)-1;
float4 temp = data[ai];
data[ai] = data[bi];
data[bi] += temp;
}
}
GROUP_LDS_BARRIER;
return blocksum;
}
// Stage 1 of the multi-block scan: each work group stages 2*WG_SIZE float4
// elements into local memory (zero-padding reads past cb.m_numElems), scans
// them with ScanExclusiveFloat4, and writes the per-block exclusive scan to
// dst; the block's total sum is written to sumBuffer[group] for the top-level
// scan (stage 2).
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb)
{
__local float4 ldsData[WG_SIZE*2];
int gIdx = GET_GLOBAL_IDX;
int lIdx = GET_LOCAL_IDX;
// Each work item loads two consecutive elements; out-of-range slots read 0.
ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;
ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;
float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);
// ScanExclusiveFloat4 returns the block total on lane 0 only.
if( lIdx == 0 )
sumBuffer[GET_GROUP_IDX] = sum;
// Write back only the in-range scanned elements.
if( (2*gIdx) < cb.m_numElems )
{
dst[2*gIdx] = ldsData[2*lIdx];
}
if( (2*gIdx + 1) < cb.m_numElems )
{
dst[2*gIdx + 1] = ldsData[2*lIdx + 1];
}
}
// Stage 3 of the multi-block scan: adds the scanned per-block offset
// (blockSum[group]) to every element of the corresponding block in dst.
// Block 0 needs no offset, hence myIdx starts at GET_GROUP_IDX+1.
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)
{
const u32 blockSize = WG_SIZE*2;
int myIdx = GET_GROUP_IDX+1;
int lIdx = GET_LOCAL_IDX;
float4 iBlockSum = blockSum[myIdx];
// Clamp the block's end to the element count for the final, partial block.
int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);
for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)
{
dst[i] += iBlockSum;
}
}
// Stage 2 of the multi-block scan: a single work group scans the per-block sums
// produced by LocalScanKernel, in place in dst. Up to 2048 entries are staged
// in local memory, zero-padded from m_numBlocks up to m_numScanBlocks (the
// padded power-of-two scan size), and the grand total is appended at
// dst[m_numBlocks].
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void TopLevelScanKernel(__global float4* dst, uint4 cb)
{
__local float4 ldsData[2048];
int gIdx = GET_GLOBAL_IDX;
int lIdx = GET_LOCAL_IDX;
int lSize = GET_GROUP_SIZE;
for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )
{
ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;
}
GROUP_LDS_BARRIER;
float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);
// Write back the scanned block offsets in place.
for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )
{
dst[i] = ldsData[i];
}
// One work item appends the total sum just past the block-offset array.
if( gIdx == 0 )
{
dst[cb.m_numBlocks] = sum;
}
}

View file

@ -1,154 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
// Shorthand aliases for OpenCL work-item / work-group queries (style kept from
// Takahiro Harada's original codebase).
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
// takahiro end
// Required work-group size for every kernel in this file.
#define WG_SIZE 128
// The constant buffer is passed as a packed uint4 named 'cb'; these macros name
// its components (layout documented by the commented-out ConstBuffer below).
#define m_numElems x
#define m_numBlocks y
#define m_numScanBlocks z
/*typedef struct
{
uint m_numElems;
uint m_numBlocks;
uint m_numScanBlocks;
uint m_padding[1];
} ConstBuffer;
*/
// Work-efficient (Blelloch-style) exclusive prefix scan of n u32 elements in
// local memory, performed cooperatively by the whole work group: an up-sweep
// (reduce) phase followed by a down-sweep phase that turns the reduction tree
// into exclusive prefix sums in place.
// Returns the total sum of the input; note it is only assigned on the work-item
// with lIdx == 0 — other lanes return an uninitialized value, so callers must
// only use the result on lane 0 (see LocalScanKernel below).
u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)
{
u32 blocksum;
int offset = 1;
// Up-sweep: build partial sums in a binary tree, doubling the stride per pass.
for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)
{
GROUP_LDS_BARRIER;
for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)
{
int ai = offset*(2*iIdx+1)-1;
int bi = offset*(2*iIdx+2)-1;
data[bi] += data[ai];
}
}
GROUP_LDS_BARRIER;
// Lane 0 captures the grand total, then clears the root to seed the down-sweep.
if( lIdx == 0 )
{
blocksum = data[ n-1 ];
data[ n-1 ] = 0;
}
GROUP_LDS_BARRIER;
offset >>= 1;
// Down-sweep: propagate partial sums back down, halving the stride per pass.
for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )
{
GROUP_LDS_BARRIER;
for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )
{
int ai = offset*(2*iIdx+1)-1;
int bi = offset*(2*iIdx+2)-1;
u32 temp = data[ai];
data[ai] = data[bi];
data[bi] += temp;
}
}
GROUP_LDS_BARRIER;
return blocksum;
}
// Stage 1 of the multi-block scan: each work group stages 2*WG_SIZE u32
// elements into local memory (zero-padding reads past cb.m_numElems), scans
// them with ScanExclusive, and writes the per-block exclusive scan to dst; the
// block's total sum is written to sumBuffer[group] for the top-level scan
// (stage 2).
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,
uint4 cb)
{
__local u32 ldsData[WG_SIZE*2];
int gIdx = GET_GLOBAL_IDX;
int lIdx = GET_LOCAL_IDX;
// Each work item loads two consecutive elements; out-of-range slots read 0.
ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;
ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;
u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);
// ScanExclusive returns the block total on lane 0 only.
if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;
// Write back only the in-range scanned elements.
if( (2*gIdx) < cb.m_numElems )
{
dst[2*gIdx] = ldsData[2*lIdx];
}
if( (2*gIdx + 1) < cb.m_numElems )
{
dst[2*gIdx + 1] = ldsData[2*lIdx + 1];
}
}
// Stage 3 of the multi-block scan: adds the scanned per-block offset
// (blockSum[group]) to every element of the corresponding block in dst.
// Block 0 needs no offset, hence myIdx starts at GET_GROUP_IDX+1.
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)
{
const u32 blockSize = WG_SIZE*2;
int myIdx = GET_GROUP_IDX+1;
int lIdx = GET_LOCAL_IDX;
u32 iBlockSum = blockSum[myIdx];
// Clamp the block's end to the element count for the final, partial block.
int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);
for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)
{
dst[i] += iBlockSum;
}
}
// Stage 2 of the multi-block scan: a single work group scans the per-block sums
// produced by LocalScanKernel, in place in dst. Up to 2048 entries are staged
// in local memory, zero-padded from m_numBlocks up to m_numScanBlocks (the
// padded power-of-two scan size), and the grand total is appended at
// dst[m_numBlocks].
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void TopLevelScanKernel(__global u32* dst, uint4 cb)
{
__local u32 ldsData[2048];
int gIdx = GET_GLOBAL_IDX;
int lIdx = GET_LOCAL_IDX;
int lSize = GET_GROUP_SIZE;
for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )
{
ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;
}
GROUP_LDS_BARRIER;
u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);
// Write back the scanned block offsets in place.
for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )
{
dst[i] = ldsData[i];
}
// One work item appends the total sum just past the block-offset array.
if( gIdx == 0 )
{
dst[cb.m_numBlocks] = sum;
}
}

View file

@ -1,128 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Embedded OpenCL source for the u32 exclusive prefix-scan kernels
// (ScanExclusive helper plus the LocalScanKernel / AddOffsetKernel /
// TopLevelScanKernel entry points). Do not edit the string by hand — it must
// stay byte-identical to PrefixScanKernels.cl; regenerate with premake --stringify.
static const char* prefixScanKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"// takahiro end\n"
"#define WG_SIZE 128 \n"
"#define m_numElems x\n"
"#define m_numBlocks y\n"
"#define m_numScanBlocks z\n"
"/*typedef struct\n"
"{\n"
" uint m_numElems;\n"
" uint m_numBlocks;\n"
" uint m_numScanBlocks;\n"
" uint m_padding[1];\n"
"} ConstBuffer;\n"
"*/\n"
"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
"{\n"
" u32 blocksum;\n"
" int offset = 1;\n"
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" data[bi] += data[ai];\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 )\n"
" {\n"
" blocksum = data[ n-1 ];\n"
" data[ n-1 ] = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" offset >>= 1;\n"
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" u32 temp = data[ai];\n"
" data[ai] = data[bi];\n"
" data[bi] += temp;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" return blocksum;\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
" uint4 cb)\n"
"{\n"
" __local u32 ldsData[WG_SIZE*2];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
" u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
" if( (2*gIdx) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx] = ldsData[2*lIdx];\n"
" }\n"
" if( (2*gIdx + 1) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n"
"{\n"
" const u32 blockSize = WG_SIZE*2;\n"
" int myIdx = GET_GROUP_IDX+1;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" u32 iBlockSum = blockSum[myIdx];\n"
" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
" {\n"
" dst[i] += iBlockSum;\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n"
"{\n"
" __local u32 ldsData[2048];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int lSize = GET_GROUP_SIZE;\n"
" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
" {\n"
" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
" {\n"
" dst[i] = ldsData[i];\n"
" }\n"
" if( gIdx == 0 )\n"
" {\n"
" dst[cb.m_numBlocks] = sum;\n"
" }\n"
"}\n";

View file

@ -1,128 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Embedded OpenCL source for the float4 exclusive prefix-scan kernels
// (ScanExclusiveFloat4 helper plus the LocalScanKernel / AddOffsetKernel /
// TopLevelScanKernel entry points). Do not edit the string by hand — it must
// stay byte-identical to PrefixScanFloat4.cl; regenerate with premake --stringify.
static const char* prefixScanKernelsFloat4CL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"// takahiro end\n"
"#define WG_SIZE 128 \n"
"#define m_numElems x\n"
"#define m_numBlocks y\n"
"#define m_numScanBlocks z\n"
"/*typedef struct\n"
"{\n"
" uint m_numElems;\n"
" uint m_numBlocks;\n"
" uint m_numScanBlocks;\n"
" uint m_padding[1];\n"
"} ConstBuffer;\n"
"*/\n"
"float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n"
"{\n"
" float4 blocksum;\n"
" int offset = 1;\n"
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" data[bi] += data[ai];\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 )\n"
" {\n"
" blocksum = data[ n-1 ];\n"
" data[ n-1 ] = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" offset >>= 1;\n"
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" float4 temp = data[ai];\n"
" data[ai] = data[bi];\n"
" data[bi] += temp;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" return blocksum;\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb)\n"
"{\n"
" __local float4 ldsData[WG_SIZE*2];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
" float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" if( lIdx == 0 ) \n"
" sumBuffer[GET_GROUP_IDX] = sum;\n"
" if( (2*gIdx) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx] = ldsData[2*lIdx];\n"
" }\n"
" if( (2*gIdx + 1) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n"
"{\n"
" const u32 blockSize = WG_SIZE*2;\n"
" int myIdx = GET_GROUP_IDX+1;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" float4 iBlockSum = blockSum[myIdx];\n"
" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
" {\n"
" dst[i] += iBlockSum;\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global float4* dst, uint4 cb)\n"
"{\n"
" __local float4 ldsData[2048];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int lSize = GET_GROUP_SIZE;\n"
" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
" {\n"
" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
" {\n"
" dst[i] = ldsData[i];\n"
" }\n"
" if( gIdx == 0 )\n"
" {\n"
" dst[cb.m_numBlocks] = sum;\n"
" }\n"
"}\n";

View file

@ -1,909 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* radixSort32KernelsCL =
"/*\n"
"Bullet Continuous Collision Detection and Physics Library\n"
"Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Author Takahiro Harada\n"
"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"#define WG_SIZE 64\n"
"#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE)\n"
"#define BITS_PER_PASS 4\n"
"#define NUM_BUCKET (1<<BITS_PER_PASS)\n"
"typedef uchar u8;\n"
"// this isn't optimization for VLIW. But just reducing writes. \n"
"#define USE_2LEVEL_REDUCE 1\n"
"//#define CHECK_BOUNDARY 1\n"
"//#define NV_GPU 1\n"
"// Cypress\n"
"#define nPerWI 16\n"
"// Cayman\n"
"//#define nPerWI 20\n"
"#define m_n x\n"
"#define m_nWGs y\n"
"#define m_startBit z\n"
"#define m_nBlocksPerWG w\n"
"/*\n"
"typedef struct\n"
"{\n"
" int m_n;\n"
" int m_nWGs;\n"
" int m_startBit;\n"
" int m_nBlocksPerWG;\n"
"} ConstBuffer;\n"
"*/\n"
"typedef struct\n"
"{\n"
" unsigned int m_key;\n"
" unsigned int m_value;\n"
"} SortDataCL;\n"
"uint prefixScanVectorEx( uint4* data )\n"
"{\n"
" u32 sum = 0;\n"
" u32 tmp = data[0].x;\n"
" data[0].x = sum;\n"
" sum += tmp;\n"
" tmp = data[0].y;\n"
" data[0].y = sum;\n"
" sum += tmp;\n"
" tmp = data[0].z;\n"
" data[0].z = sum;\n"
" sum += tmp;\n"
" tmp = data[0].w;\n"
" data[0].w = sum;\n"
" sum += tmp;\n"
" return sum;\n"
"}\n"
"u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ )\n"
"{\n"
" { // Set data\n"
" sorterSharedMemory[lIdx] = 0;\n"
" sorterSharedMemory[lIdx+wgSize] = pData;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" { // Prefix sum\n"
" int idx = 2*lIdx + (wgSize+1);\n"
"#if defined(USE_2LEVEL_REDUCE)\n"
" if( lIdx < 64 )\n"
" {\n"
" u32 u0, u1, u2;\n"
" u0 = sorterSharedMemory[idx-3];\n"
" u1 = sorterSharedMemory[idx-2];\n"
" u2 = sorterSharedMemory[idx-1];\n"
" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n"
" GROUP_MEM_FENCE;\n"
" u0 = sorterSharedMemory[idx-12];\n"
" u1 = sorterSharedMemory[idx-8];\n"
" u2 = sorterSharedMemory[idx-4];\n"
" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n"
" GROUP_MEM_FENCE;\n"
" u0 = sorterSharedMemory[idx-48];\n"
" u1 = sorterSharedMemory[idx-32];\n"
" u2 = sorterSharedMemory[idx-16];\n"
" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n"
" GROUP_MEM_FENCE;\n"
" if( wgSize > 64 )\n"
" {\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
" GROUP_MEM_FENCE;\n"
" }\n"
" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
" GROUP_MEM_FENCE;\n"
" }\n"
"#else\n"
" if( lIdx < 64 )\n"
" {\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-32];\n"
" GROUP_MEM_FENCE;\n"
" if( wgSize > 64 )\n"
" {\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
" GROUP_MEM_FENCE;\n"
" }\n"
" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
" GROUP_MEM_FENCE;\n"
" }\n"
"#endif\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" *totalSum = sorterSharedMemory[wgSize*2-1];\n"
" u32 addValue = sorterSharedMemory[lIdx+wgSize-1];\n"
" return addValue;\n"
"}\n"
"//__attribute__((reqd_work_group_size(128,1,1)))\n"
"uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n"
"{\n"
" u32 s4 = prefixScanVectorEx( &pData );\n"
" u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 );\n"
" return pData + make_uint4( rank, rank, rank, rank );\n"
"}\n"
"//__attribute__((reqd_work_group_size(64,1,1)))\n"
"uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n"
"{\n"
" u32 s4 = prefixScanVectorEx( &pData );\n"
" u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 );\n"
" return pData + make_uint4( rank, rank, rank, rank );\n"
"}\n"
"u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;}\n"
"u32 bit8Scan(u32 v)\n"
"{\n"
" return (v<<8) + (v<<16) + (v<<24);\n"
"}\n"
"//===\n"
"#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )\n"
"{\n"
" __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n"
" u32 gIdx = GET_GLOBAL_IDX;\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 wgSize = GET_GROUP_SIZE;\n"
" const int startBit = cb.m_startBit;\n"
" const int n = cb.m_n;\n"
" const int nWGs = cb.m_nWGs;\n"
" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
" for(int i=0; i<NUM_BUCKET; i++)\n"
" {\n"
" MY_HISTOGRAM(i) = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
" u32 localKey;\n"
" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
" {\n"
" // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n"
" // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops\n"
" // AMD: AtomInc performs better while NV prefers ++\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
" {\n"
"#if defined(CHECK_BOUNDARY)\n"
" if( addr+i < n )\n"
"#endif\n"
" {\n"
" localKey = (gSrc[addr+i]>>startBit) & 0xf;\n"
"#if defined(NV_GPU)\n"
" MY_HISTOGRAM( localKey )++;\n"
"#else\n"
" AtomInc( MY_HISTOGRAM( localKey ) );\n"
"#endif\n"
" }\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" u32 sum = 0;\n"
" for(int i=0; i<GET_GROUP_SIZE; i++)\n"
" {\n"
" sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n"
" }\n"
" histogramOut[lIdx*nWGs+wgIdx] = sum;\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4 cb )\n"
"{\n"
" __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n"
" u32 gIdx = GET_GLOBAL_IDX;\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 wgSize = GET_GROUP_SIZE;\n"
" const int startBit = cb.m_startBit;\n"
" const int n = cb.m_n;\n"
" const int nWGs = cb.m_nWGs;\n"
" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
" for(int i=0; i<NUM_BUCKET; i++)\n"
" {\n"
" MY_HISTOGRAM(i) = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
" u32 localKey;\n"
" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
" {\n"
" // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n"
" // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops\n"
" // AMD: AtomInc performs better while NV prefers ++\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
" {\n"
"#if defined(CHECK_BOUNDARY)\n"
" if( addr+i < n )\n"
"#endif\n"
" {\n"
" localKey = (gSrc[addr+i].m_key>>startBit) & 0xf;\n"
"#if defined(NV_GPU)\n"
" MY_HISTOGRAM( localKey )++;\n"
"#else\n"
" AtomInc( MY_HISTOGRAM( localKey ) );\n"
"#endif\n"
" }\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" u32 sum = 0;\n"
" for(int i=0; i<GET_GROUP_SIZE; i++)\n"
" {\n"
" sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n"
" }\n"
" histogramOut[lIdx*nWGs+wgIdx] = sum;\n"
" }\n"
"}\n"
"#define nPerLane (nPerWI/4)\n"
"// NUM_BUCKET*nWGs < 128*nPerWI\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(128,1,1)))\n"
"void PrefixScanKernel( __global u32* wHistogram1, int4 cb )\n"
"{\n"
" __local u32 ldsTopScanData[128*2];\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" const int nWGs = cb.m_nWGs;\n"
" u32 data[nPerWI];\n"
" for(int i=0; i<nPerWI; i++)\n"
" {\n"
" data[i] = 0;\n"
" if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs )\n"
" data[i] = wHistogram1[nPerWI*lIdx+i];\n"
" }\n"
" uint4 myData = make_uint4(0,0,0,0);\n"
" for(int i=0; i<nPerLane; i++)\n"
" {\n"
" myData.x += data[nPerLane*0+i];\n"
" myData.y += data[nPerLane*1+i];\n"
" myData.z += data[nPerLane*2+i];\n"
" myData.w += data[nPerLane*3+i];\n"
" }\n"
" uint totalSum;\n"
" uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData );\n"
"// for(int j=0; j<4; j++) // somehow it introduces a lot of branches\n"
" { int j = 0;\n"
" u32 sum = 0;\n"
" for(int i=0; i<nPerLane; i++)\n"
" {\n"
" u32 tmp = data[nPerLane*j+i];\n"
" data[nPerLane*j+i] = sum;\n"
" sum += tmp;\n"
" }\n"
" }\n"
" { int j = 1;\n"
" u32 sum = 0;\n"
" for(int i=0; i<nPerLane; i++)\n"
" {\n"
" u32 tmp = data[nPerLane*j+i];\n"
" data[nPerLane*j+i] = sum;\n"
" sum += tmp;\n"
" }\n"
" }\n"
" { int j = 2;\n"
" u32 sum = 0;\n"
" for(int i=0; i<nPerLane; i++)\n"
" {\n"
" u32 tmp = data[nPerLane*j+i];\n"
" data[nPerLane*j+i] = sum;\n"
" sum += tmp;\n"
" }\n"
" }\n"
" { int j = 3;\n"
" u32 sum = 0;\n"
" for(int i=0; i<nPerLane; i++)\n"
" {\n"
" u32 tmp = data[nPerLane*j+i];\n"
" data[nPerLane*j+i] = sum;\n"
" sum += tmp;\n"
" }\n"
" }\n"
" for(int i=0; i<nPerLane; i++)\n"
" {\n"
" data[nPerLane*0+i] += scanned.x;\n"
" data[nPerLane*1+i] += scanned.y;\n"
" data[nPerLane*2+i] += scanned.z;\n"
" data[nPerLane*3+i] += scanned.w;\n"
" }\n"
" for(int i=0; i<nPerWI; i++)\n"
" {\n"
" int index = nPerWI*lIdx+i;\n"
" if (index < NUM_BUCKET*nWGs)\n"
" wHistogram1[nPerWI*lIdx+i] = data[i];\n"
" }\n"
"}\n"
"// 4 scan, 4 exchange\n"
"void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n"
"{\n"
" for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++)\n"
" {\n"
" u32 mask = (1<<bitIdx);\n"
" uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask );\n"
" uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n"
" u32 total;\n"
" prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData );\n"
" {\n"
" uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
" uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n"
" dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n"
" GROUP_LDS_BARRIER;\n"
" ldsSortData[dstAddr.x] = sortData[0];\n"
" ldsSortData[dstAddr.y] = sortData[1];\n"
" ldsSortData[dstAddr.z] = sortData[2];\n"
" ldsSortData[dstAddr.w] = sortData[3];\n"
" GROUP_LDS_BARRIER;\n"
" sortData[0] = ldsSortData[localAddr.x];\n"
" sortData[1] = ldsSortData[localAddr.y];\n"
" sortData[2] = ldsSortData[localAddr.z];\n"
" sortData[3] = ldsSortData[localAddr.w];\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" }\n"
"}\n"
"// 2 scan, 2 exchange\n"
"void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n"
"{\n"
" for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n"
" {\n"
" uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n"
" (sortData[1]>>(startBit+ibit)) & 0x3, \n"
" (sortData[2]>>(startBit+ibit)) & 0x3, \n"
" (sortData[3]>>(startBit+ibit)) & 0x3);\n"
" u32 key4;\n"
" u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n"
" {\n"
" sKeyPacked[0] |= 1<<(8*b.x);\n"
" sKeyPacked[1] |= 1<<(8*b.y);\n"
" sKeyPacked[2] |= 1<<(8*b.z);\n"
" sKeyPacked[3] |= 1<<(8*b.w);\n"
" key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n"
" }\n"
" u32 rankPacked;\n"
" u32 sumPacked;\n"
" {\n"
" rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" u32 newOffset[4] = { 0,0,0,0 };\n"
" {\n"
" u32 sumScanned = bit8Scan( sumPacked );\n"
" u32 scannedKeys[4];\n"
" scannedKeys[0] = 1<<(8*b.x);\n"
" scannedKeys[1] = 1<<(8*b.y);\n"
" scannedKeys[2] = 1<<(8*b.z);\n"
" scannedKeys[3] = 1<<(8*b.w);\n"
" { // 4 scans at once\n"
" u32 sum4 = 0;\n"
" for(int ie=0; ie<4; ie++)\n"
" {\n"
" u32 tmp = scannedKeys[ie];\n"
" scannedKeys[ie] = sum4;\n"
" sum4 += tmp;\n"
" }\n"
" }\n"
" {\n"
" u32 sumPlusRank = sumScanned + rankPacked;\n"
" { u32 ie = b.x;\n"
" scannedKeys[0] += sumPlusRank;\n"
" newOffset[0] = unpack4Key( scannedKeys[0], ie );\n"
" }\n"
" { u32 ie = b.y;\n"
" scannedKeys[1] += sumPlusRank;\n"
" newOffset[1] = unpack4Key( scannedKeys[1], ie );\n"
" }\n"
" { u32 ie = b.z;\n"
" scannedKeys[2] += sumPlusRank;\n"
" newOffset[2] = unpack4Key( scannedKeys[2], ie );\n"
" }\n"
" { u32 ie = b.w;\n"
" scannedKeys[3] += sumPlusRank;\n"
" newOffset[3] = unpack4Key( scannedKeys[3], ie );\n"
" }\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" {\n"
" ldsSortData[newOffset[0]] = sortData[0];\n"
" ldsSortData[newOffset[1]] = sortData[1];\n"
" ldsSortData[newOffset[2]] = sortData[2];\n"
" ldsSortData[newOffset[3]] = sortData[3];\n"
" GROUP_LDS_BARRIER;\n"
" u32 dstAddr = 4*lIdx;\n"
" sortData[0] = ldsSortData[dstAddr+0];\n"
" sortData[1] = ldsSortData[dstAddr+1];\n"
" sortData[2] = ldsSortData[dstAddr+2];\n"
" sortData[3] = ldsSortData[dstAddr+3];\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" }\n"
"}\n"
"#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n"
"{\n"
" __local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
" __local u32 localHistogramToCarry[NUM_BUCKET];\n"
" __local u32 localHistogram[NUM_BUCKET*2];\n"
" u32 gIdx = GET_GLOBAL_IDX;\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 wgSize = GET_GROUP_SIZE;\n"
" const int n = cb.m_n;\n"
" const int nWGs = cb.m_nWGs;\n"
" const int startBit = cb.m_startBit;\n"
" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
" if( lIdx < (NUM_BUCKET) )\n"
" {\n"
" localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
" int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n"
" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
" {\n"
" u32 myHistogram = 0;\n"
" u32 sortData[ELEMENTS_PER_WORK_ITEM];\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
"#if defined(CHECK_BOUNDARY)\n"
" sortData[i] = ( addr+i < n )? gSrc[ addr+i ] : 0xffffffff;\n"
"#else\n"
" sortData[i] = gSrc[ addr+i ];\n"
"#endif\n"
" sort4Bits(sortData, startBit, lIdx, ldsSortData);\n"
" u32 keys[ELEMENTS_PER_WORK_ITEM];\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
" keys[i] = (sortData[i]>>startBit) & 0xf;\n"
" { // create histogram\n"
" u32 setIdx = lIdx/16;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogram[lIdx] = 0;\n"
" }\n"
" ldsSortData[lIdx] = 0;\n"
" GROUP_LDS_BARRIER;\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
"#if defined(CHECK_BOUNDARY)\n"
" if( addr+i < n )\n"
"#endif\n"
"#if defined(NV_GPU)\n"
" SET_HISTOGRAM( setIdx, keys[i] )++;\n"
"#else\n"
" AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n"
"#endif\n"
" \n"
" GROUP_LDS_BARRIER;\n"
" \n"
" uint hIdx = NUM_BUCKET+lIdx;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" u32 sum = 0;\n"
" for(int i=0; i<WG_SIZE/16; i++)\n"
" {\n"
" sum += SET_HISTOGRAM( i, lIdx );\n"
" }\n"
" myHistogram = sum;\n"
" localHistogram[hIdx] = sum;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
"#if defined(USE_2LEVEL_REDUCE)\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogram[hIdx] = localHistogram[hIdx-1];\n"
" GROUP_MEM_FENCE;\n"
" u32 u0, u1, u2;\n"
" u0 = localHistogram[hIdx-3];\n"
" u1 = localHistogram[hIdx-2];\n"
" u2 = localHistogram[hIdx-1];\n"
" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
" GROUP_MEM_FENCE;\n"
" u0 = localHistogram[hIdx-12];\n"
" u1 = localHistogram[hIdx-8];\n"
" u2 = localHistogram[hIdx-4];\n"
" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
" GROUP_MEM_FENCE;\n"
" }\n"
"#else\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogram[hIdx] = localHistogram[hIdx-1];\n"
" GROUP_MEM_FENCE;\n"
" localHistogram[hIdx] += localHistogram[hIdx-1];\n"
" GROUP_MEM_FENCE;\n"
" localHistogram[hIdx] += localHistogram[hIdx-2];\n"
" GROUP_MEM_FENCE;\n"
" localHistogram[hIdx] += localHistogram[hIdx-4];\n"
" GROUP_MEM_FENCE;\n"
" localHistogram[hIdx] += localHistogram[hIdx-8];\n"
" GROUP_MEM_FENCE;\n"
" }\n"
"#endif\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" {\n"
" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
" {\n"
" int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n"
" int binIdx = keys[ie];\n"
" int groupOffset = localHistogramToCarry[binIdx];\n"
" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
"#if defined(CHECK_BOUNDARY)\n"
" if( addr+ie < n )\n"
"#endif\n"
" gDst[ groupOffset + myIdx ] = sortData[ie];\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogramToCarry[lIdx] += myHistogram;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
"}\n"
"// 2 scan, 2 exchange\n"
"void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal)\n"
"{\n"
" for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n"
" {\n"
" uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n"
" (sortData[1]>>(startBit+ibit)) & 0x3, \n"
" (sortData[2]>>(startBit+ibit)) & 0x3, \n"
" (sortData[3]>>(startBit+ibit)) & 0x3);\n"
" u32 key4;\n"
" u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n"
" {\n"
" sKeyPacked[0] |= 1<<(8*b.x);\n"
" sKeyPacked[1] |= 1<<(8*b.y);\n"
" sKeyPacked[2] |= 1<<(8*b.z);\n"
" sKeyPacked[3] |= 1<<(8*b.w);\n"
" key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n"
" }\n"
" u32 rankPacked;\n"
" u32 sumPacked;\n"
" {\n"
" rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" u32 newOffset[4] = { 0,0,0,0 };\n"
" {\n"
" u32 sumScanned = bit8Scan( sumPacked );\n"
" u32 scannedKeys[4];\n"
" scannedKeys[0] = 1<<(8*b.x);\n"
" scannedKeys[1] = 1<<(8*b.y);\n"
" scannedKeys[2] = 1<<(8*b.z);\n"
" scannedKeys[3] = 1<<(8*b.w);\n"
" { // 4 scans at once\n"
" u32 sum4 = 0;\n"
" for(int ie=0; ie<4; ie++)\n"
" {\n"
" u32 tmp = scannedKeys[ie];\n"
" scannedKeys[ie] = sum4;\n"
" sum4 += tmp;\n"
" }\n"
" }\n"
" {\n"
" u32 sumPlusRank = sumScanned + rankPacked;\n"
" { u32 ie = b.x;\n"
" scannedKeys[0] += sumPlusRank;\n"
" newOffset[0] = unpack4Key( scannedKeys[0], ie );\n"
" }\n"
" { u32 ie = b.y;\n"
" scannedKeys[1] += sumPlusRank;\n"
" newOffset[1] = unpack4Key( scannedKeys[1], ie );\n"
" }\n"
" { u32 ie = b.z;\n"
" scannedKeys[2] += sumPlusRank;\n"
" newOffset[2] = unpack4Key( scannedKeys[2], ie );\n"
" }\n"
" { u32 ie = b.w;\n"
" scannedKeys[3] += sumPlusRank;\n"
" newOffset[3] = unpack4Key( scannedKeys[3], ie );\n"
" }\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" {\n"
" ldsSortData[newOffset[0]] = sortData[0];\n"
" ldsSortData[newOffset[1]] = sortData[1];\n"
" ldsSortData[newOffset[2]] = sortData[2];\n"
" ldsSortData[newOffset[3]] = sortData[3];\n"
" ldsSortVal[newOffset[0]] = sortVal[0];\n"
" ldsSortVal[newOffset[1]] = sortVal[1];\n"
" ldsSortVal[newOffset[2]] = sortVal[2];\n"
" ldsSortVal[newOffset[3]] = sortVal[3];\n"
" GROUP_LDS_BARRIER;\n"
" u32 dstAddr = 4*lIdx;\n"
" sortData[0] = ldsSortData[dstAddr+0];\n"
" sortData[1] = ldsSortData[dstAddr+1];\n"
" sortData[2] = ldsSortData[dstAddr+2];\n"
" sortData[3] = ldsSortData[dstAddr+3];\n"
" sortVal[0] = ldsSortVal[dstAddr+0];\n"
" sortVal[1] = ldsSortVal[dstAddr+1];\n"
" sortVal[2] = ldsSortVal[dstAddr+2];\n"
" sortVal[3] = ldsSortVal[dstAddr+3];\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n"
"{\n"
" __local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
" __local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
" __local u32 localHistogramToCarry[NUM_BUCKET];\n"
" __local u32 localHistogram[NUM_BUCKET*2];\n"
" u32 gIdx = GET_GLOBAL_IDX;\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 wgSize = GET_GROUP_SIZE;\n"
" const int n = cb.m_n;\n"
" const int nWGs = cb.m_nWGs;\n"
" const int startBit = cb.m_startBit;\n"
" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
" if( lIdx < (NUM_BUCKET) )\n"
" {\n"
" localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
" int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n"
" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
" {\n"
" u32 myHistogram = 0;\n"
" int sortData[ELEMENTS_PER_WORK_ITEM];\n"
" int sortVal[ELEMENTS_PER_WORK_ITEM];\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
"#if defined(CHECK_BOUNDARY)\n"
" {\n"
" sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff;\n"
" sortVal[i] = ( addr+i < n )? gSrc[ addr+i ].m_value : 0xffffffff;\n"
" }\n"
"#else\n"
" {\n"
" sortData[i] = gSrc[ addr+i ].m_key;\n"
" sortVal[i] = gSrc[ addr+i ].m_value;\n"
" }\n"
"#endif\n"
" sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal);\n"
" u32 keys[ELEMENTS_PER_WORK_ITEM];\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
" keys[i] = (sortData[i]>>startBit) & 0xf;\n"
" { // create histogram\n"
" u32 setIdx = lIdx/16;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogram[lIdx] = 0;\n"
" }\n"
" ldsSortData[lIdx] = 0;\n"
" GROUP_LDS_BARRIER;\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
"#if defined(CHECK_BOUNDARY)\n"
" if( addr+i < n )\n"
"#endif\n"
"#if defined(NV_GPU)\n"
" SET_HISTOGRAM( setIdx, keys[i] )++;\n"
"#else\n"
" AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n"
"#endif\n"
" \n"
" GROUP_LDS_BARRIER;\n"
" \n"
" uint hIdx = NUM_BUCKET+lIdx;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" u32 sum = 0;\n"
" for(int i=0; i<WG_SIZE/16; i++)\n"
" {\n"
" sum += SET_HISTOGRAM( i, lIdx );\n"
" }\n"
" myHistogram = sum;\n"
" localHistogram[hIdx] = sum;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
"#if defined(USE_2LEVEL_REDUCE)\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogram[hIdx] = localHistogram[hIdx-1];\n"
" GROUP_MEM_FENCE;\n"
" u32 u0, u1, u2;\n"
" u0 = localHistogram[hIdx-3];\n"
" u1 = localHistogram[hIdx-2];\n"
" u2 = localHistogram[hIdx-1];\n"
" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
" GROUP_MEM_FENCE;\n"
" u0 = localHistogram[hIdx-12];\n"
" u1 = localHistogram[hIdx-8];\n"
" u2 = localHistogram[hIdx-4];\n"
" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
" GROUP_MEM_FENCE;\n"
" }\n"
"#else\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogram[hIdx] = localHistogram[hIdx-1];\n"
" GROUP_MEM_FENCE;\n"
" localHistogram[hIdx] += localHistogram[hIdx-1];\n"
" GROUP_MEM_FENCE;\n"
" localHistogram[hIdx] += localHistogram[hIdx-2];\n"
" GROUP_MEM_FENCE;\n"
" localHistogram[hIdx] += localHistogram[hIdx-4];\n"
" GROUP_MEM_FENCE;\n"
" localHistogram[hIdx] += localHistogram[hIdx-8];\n"
" GROUP_MEM_FENCE;\n"
" }\n"
"#endif\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" {\n"
" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
" {\n"
" int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n"
" int binIdx = keys[ie];\n"
" int groupOffset = localHistogramToCarry[binIdx];\n"
" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
"#if defined(CHECK_BOUNDARY)\n"
" if( addr+ie < n )\n"
" {\n"
" if ((groupOffset + myIdx)<n)\n"
" {\n"
" if (sortData[ie]==sortVal[ie])\n"
" {\n"
" \n"
" SortDataCL tmp;\n"
" tmp.m_key = sortData[ie];\n"
" tmp.m_value = sortVal[ie];\n"
" if (tmp.m_key == tmp.m_value)\n"
" gDst[groupOffset + myIdx ] = tmp;\n"
" }\n"
" \n"
" }\n"
" }\n"
"#else\n"
" if ((groupOffset + myIdx)<n)\n"
" {\n"
" gDst[ groupOffset + myIdx ].m_key = sortData[ie];\n"
" gDst[ groupOffset + myIdx ].m_value = sortVal[ie];\n"
" }\n"
"#endif\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogramToCarry[lIdx] += myHistogram;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n"
"{\n"
" \n"
" u32 gIdx = GET_GLOBAL_IDX;\n"
" u32 realLocalIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 wgSize = GET_GROUP_SIZE;\n"
" const int startBit = cb.m_startBit;\n"
" const int n = cb.m_n;\n"
" const int nWGs = cb.m_nWGs;\n"
" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
" int counter[NUM_BUCKET];\n"
" \n"
" if (realLocalIdx>0)\n"
" return;\n"
" \n"
" for (int c=0;c<NUM_BUCKET;c++)\n"
" counter[c]=0;\n"
" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
" \n"
" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n"
" {\n"
" for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n"
" {\n"
" int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
" \n"
" for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n"
" {\n"
" int i = addr2+j;\n"
" if( i < n )\n"
" {\n"
" int tableIdx;\n"
" tableIdx = (gSrc[i].m_key>>startBit) & 0xf;//0xf = NUM_TABLES-1\n"
" gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n"
" counter[tableIdx] ++;\n"
" }\n"
" }\n"
" }\n"
" }\n"
" \n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n"
"{\n"
" \n"
" u32 gIdx = GET_GLOBAL_IDX;\n"
" u32 realLocalIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 wgSize = GET_GROUP_SIZE;\n"
" const int startBit = cb.m_startBit;\n"
" const int n = cb.m_n;\n"
" const int nWGs = cb.m_nWGs;\n"
" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
" int counter[NUM_BUCKET];\n"
" \n"
" if (realLocalIdx>0)\n"
" return;\n"
" \n"
" for (int c=0;c<NUM_BUCKET;c++)\n"
" counter[c]=0;\n"
" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
" \n"
" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n"
" {\n"
" for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n"
" {\n"
" int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
" \n"
" for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n"
" {\n"
" int i = addr2+j;\n"
" if( i < n )\n"
" {\n"
" int tableIdx;\n"
" tableIdx = (gSrc[i]>>startBit) & 0xf;//0xf = NUM_TABLES-1\n"
" gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n"
" counter[tableIdx] ++;\n"
" }\n"
" }\n"
" }\n"
" }\n"
" \n"
"}\n";

View file

@ -1,374 +0,0 @@
#include "b3GpuRaycast.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h"
#include "Bullet3OpenCL/Raycast/kernels/rayCastKernels.h"
#define B3_RAYCAST_PATH "src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl"
///Internal (pimpl) state for b3GpuRaycast: OpenCL handles, helper modules,
///and the GPU-side buffers used by the broadphase-accelerated raycast path.
struct b3GpuRaycastInternalData
{
// OpenCL context/device/queue supplied by the caller; see the destructor —
// they are released elsewhere, not by this class.
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_q;
// Kernels extracted from rayCastKernels.cl in the constructor.
cl_kernel m_raytraceKernel;
cl_kernel m_raytracePairsKernel;
cl_kernel m_findRayRigidPairIndexRanges;
// Helper modules owned by this struct (created/destroyed with it).
b3GpuParallelLinearBvh* m_plbvh;
b3RadixSort32CL* m_radixSorter;
b3FillCL* m_fill;
//1 element per ray
b3OpenCLArray<b3RayInfo>* m_gpuRays;
b3OpenCLArray<b3RayHit>* m_gpuHitResults;
b3OpenCLArray<int>* m_firstRayRigidPairIndexPerRay;
b3OpenCLArray<int>* m_numRayRigidPairsPerRay;
//1 element per (ray index, rigid index) pair, where the ray intersects with the rigid's AABB
b3OpenCLArray<int>* m_gpuNumRayRigidPairs;
b3OpenCLArray<b3Int2>* m_gpuRayRigidPairs; //x == ray index, y == rigid index
int m_test;
};
///Stores the OpenCL handles, builds the helper modules and empty GPU arrays,
///then compiles the raycast program once and extracts its three kernels.
///The cl_program is released immediately after kernel extraction (the kernels
///keep it alive); each compile step is checked with b3Assert.
b3GpuRaycast::b3GpuRaycast(cl_context ctx, cl_device_id device, cl_command_queue q)
{
m_data = new b3GpuRaycastInternalData;
m_data->m_context = ctx;
m_data->m_device = device;
m_data->m_q = q;
// Kernels start null; they are filled in below after the program compiles.
m_data->m_raytraceKernel = 0;
m_data->m_raytracePairsKernel = 0;
m_data->m_findRayRigidPairIndexRanges = 0;
m_data->m_plbvh = new b3GpuParallelLinearBvh(ctx, device, q);
m_data->m_radixSorter = new b3RadixSort32CL(ctx, device, q);
m_data->m_fill = new b3FillCL(ctx, device, q);
// GPU arrays are created empty; castRays() resizes them per invocation.
m_data->m_gpuRays = new b3OpenCLArray<b3RayInfo>(ctx, q);
m_data->m_gpuHitResults = new b3OpenCLArray<b3RayHit>(ctx, q);
m_data->m_firstRayRigidPairIndexPerRay = new b3OpenCLArray<int>(ctx, q);
m_data->m_numRayRigidPairsPerRay = new b3OpenCLArray<int>(ctx, q);
m_data->m_gpuNumRayRigidPairs = new b3OpenCLArray<int>(ctx, q);
m_data->m_gpuRayRigidPairs = new b3OpenCLArray<b3Int2>(ctx, q);
{
cl_int errNum = 0;
// Compile the embedded kernel source (B3_RAYCAST_PATH is used as a
// fallback/disk reference for the stringified source).
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, &errNum, "", B3_RAYCAST_PATH);
b3Assert(errNum == CL_SUCCESS);
m_data->m_raytraceKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, "rayCastKernel", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
m_data->m_raytracePairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, "rayCastPairsKernel", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
m_data->m_findRayRigidPairIndexRanges = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, "findRayRigidPairIndexRanges", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
// Kernels hold their own reference to the program, so it can go now.
clReleaseProgram(prog);
}
}
///Releases the three OpenCL kernels, then deletes all owned helper modules
///and GPU arrays, and finally the pimpl itself. The context/queue passed to
///the constructor are deliberately NOT released here (not owned).
b3GpuRaycast::~b3GpuRaycast()
{
clReleaseKernel(m_data->m_raytraceKernel);
clReleaseKernel(m_data->m_raytracePairsKernel);
clReleaseKernel(m_data->m_findRayRigidPairIndexRanges);
delete m_data->m_plbvh;
delete m_data->m_radixSorter;
delete m_data->m_fill;
delete m_data->m_gpuRays;
delete m_data->m_gpuHitResults;
delete m_data->m_firstRayRigidPairIndexPerRay;
delete m_data->m_numRayRigidPairsPerRay;
delete m_data->m_gpuNumRayRigidPairs;
delete m_data->m_gpuRayRigidPairs;
delete m_data;
}
bool sphere_intersect(const b3Vector3& spherePos, b3Scalar radius, const b3Vector3& rayFrom, const b3Vector3& rayTo, float& hitFraction)
{
b3Vector3 rs = rayFrom - spherePos;
b3Vector3 rayDir = rayTo - rayFrom;
float A = b3Dot(rayDir, rayDir);
float B = b3Dot(rs, rayDir);
float C = b3Dot(rs, rs) - (radius * radius);
float D = B * B - A * C;
if (D > 0.0)
{
float t = (-B - sqrt(D)) / A;
if ((t >= 0.0f) && (t < hitFraction))
{
hitFraction = t;
return true;
}
}
return false;
}
///Ray vs. convex polyhedron test in the shape's local space, done by clipping
///the segment [rayFromLocal, rayToLocal] against every face plane
///(Kay/Kajiya-style slab clipping generalized to arbitrary planes).
///On a hit, writes the entering fraction into hitFraction and the entering
///face's plane normal into hitNormal, and returns true.
bool rayConvex(const b3Vector3& rayFromLocal, const b3Vector3& rayToLocal, const b3ConvexPolyhedronData& poly,
const b3AlignedObjectArray<b3GpuFace>& faces, float& hitFraction, b3Vector3& hitNormal)
{
// exitFraction shrinks toward the closest exit; enterFraction grows toward
// the farthest entry. A valid hit requires enter < exit at the end.
float exitFraction = hitFraction;
float enterFraction = -0.1f;
b3Vector3 curHitNormal = b3MakeVector3(0, 0, 0);
for (int i = 0; i < poly.m_numFaces; i++)
{
const b3GpuFace& face = faces[poly.m_faceOffset + i];
// Signed distances of both segment endpoints to this face plane.
float fromPlaneDist = b3Dot(rayFromLocal, face.m_plane) + face.m_plane.w;
float toPlaneDist = b3Dot(rayToLocal, face.m_plane) + face.m_plane.w;
if (fromPlaneDist < 0.f)
{
// Segment starts inside this half-space; if it ends outside, it exits here.
if (toPlaneDist >= 0.f)
{
float fraction = fromPlaneDist / (fromPlaneDist - toPlaneDist);
if (exitFraction > fraction)
{
exitFraction = fraction;
}
}
}
else
{
// Segment starts outside this half-space.
if (toPlaneDist < 0.f)
{
// Entering through this face; keep the latest entry and its normal.
float fraction = fromPlaneDist / (fromPlaneDist - toPlaneDist);
if (enterFraction <= fraction)
{
enterFraction = fraction;
curHitNormal = face.m_plane;
curHitNormal.w = 0.f;
}
}
else
{
// Entirely outside one half-space -> cannot intersect the convex.
return false;
}
}
// Entry after exit means the clipped segment is empty.
if (exitFraction <= enterFraction)
return false;
}
// Starting inside the convex (never entered) does not count as a hit.
if (enterFraction < 0.f)
return false;
hitFraction = enterFraction;
hitNormal = curHitNormal;
return true;
}
///CPU reference implementation: brute-force test of every ray against every
///body (spheres and convex hulls). For each ray, any hit closer than the
///incoming hitResults[r].m_hitFraction updates that entry's fraction, point,
///normal and body index; rays with no closer hit are left untouched.
///
///@param rays            input rays (from/to endpoints)
///@param hitResults      in/out hit records, one per ray
///@param bodies          rigid body array indexed by body index
///@param collidables     collidable array indexed by body's m_collidableIdx
///@param narrowphaseData provides convex polyhedra and face data for hull tests
void b3GpuRaycast::castRaysHost(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
	int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, const struct b3GpuNarrowPhaseInternalData* narrowphaseData)
{
	B3_PROFILE("castRaysHost");
	for (int r = 0; r < rays.size(); r++)
	{
		b3Vector3 rayFrom = rays[r].m_from;
		b3Vector3 rayTo = rays[r].m_to;
		float hitFraction = hitResults[r].m_hitFraction;
		int hitBodyIndex = -1;
		b3Vector3 hitNormal;
		for (int b = 0; b < numBodies; b++)
		{
			const b3Vector3& pos = bodies[b].m_pos;
			//const b3Quaternion& orn = bodies[b].m_quat;
			switch (collidables[bodies[b].m_collidableIdx].m_shapeType)
			{
				case SHAPE_SPHERE:
				{
					b3Scalar radius = collidables[bodies[b].m_collidableIdx].m_radius;
					if (sphere_intersect(pos, radius, rayFrom, rayTo, hitFraction))
					{
						hitBodyIndex = b;
						b3Vector3 hitPoint;
						hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to, hitFraction);
						hitNormal = (hitPoint - bodies[b].m_pos).normalize();
					}
					// BUGFIX: this case previously fell through into
					// SHAPE_CONVEX_HULL, which then indexed
					// narrowphaseData->m_convexPolyhedra with a sphere's
					// m_shapeIndex.
					break;
				}
				case SHAPE_CONVEX_HULL:
				{
					// Transform the ray into the hull's local space before clipping.
					b3Transform convexWorldTransform;
					convexWorldTransform.setIdentity();
					convexWorldTransform.setOrigin(bodies[b].m_pos);
					convexWorldTransform.setRotation(bodies[b].m_quat);
					b3Transform convexWorld2Local = convexWorldTransform.inverse();
					b3Vector3 rayFromLocal = convexWorld2Local(rayFrom);
					b3Vector3 rayToLocal = convexWorld2Local(rayTo);
					int shapeIndex = collidables[bodies[b].m_collidableIdx].m_shapeIndex;
					const b3ConvexPolyhedronData& poly = narrowphaseData->m_convexPolyhedra[shapeIndex];
					if (rayConvex(rayFromLocal, rayToLocal, poly, narrowphaseData->m_convexFaces, hitFraction, hitNormal))
					{
						hitBodyIndex = b;
					}
					break;
				}
				default:
				{
					// Warn once (not per body/ray) about unsupported shapes.
					static bool once = true;
					if (once)
					{
						once = false;
						b3Warning("Raytest: unsupported shape type\n");
					}
				}
			}
		}
		if (hitBodyIndex >= 0)
		{
			hitResults[r].m_hitFraction = hitFraction;
			hitResults[r].m_hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to, hitFraction);
			hitResults[r].m_hitNormal = hitNormal;
			hitResults[r].m_hitBody = hitBodyIndex;
		}
	}
}
///GPU raycast pipeline. Two paths: a brute-force kernel (rays x bodies), and
///the default BVH-accelerated path that (1) builds a parallel linear BVH over
///the broadphase AABBs, (2) collects candidate (ray, rigid) AABB pairs,
///(3) sorts pairs by ray index, (4) computes each ray's pair range, and
///(5) runs exact ray-vs-shape intersection only on those candidate pairs.
void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase)
{
//castRaysHost(rays,hitResults,numBodies,bodies,numCollidables,collidables,narrowphaseData);
B3_PROFILE("castRaysGPU");
{
B3_PROFILE("raycast copyFromHost");
m_data->m_gpuRays->copyFromHost(rays);
m_data->m_gpuHitResults->copyFromHost(hitResults);
}
int numRays = hitResults.size();
{
m_data->m_firstRayRigidPairIndexPerRay->resize(numRays);
m_data->m_numRayRigidPairsPerRay->resize(numRays);
m_data->m_gpuNumRayRigidPairs->resize(1);
// Capacity heuristic: up to 16 candidate rigid AABBs per ray; overflow is
// clamped below.
m_data->m_gpuRayRigidPairs->resize(numRays * 16);
}
//run kernel
const bool USE_BRUTE_FORCE_RAYCAST = false;
if (USE_BRUTE_FORCE_RAYCAST)
{
// O(numRays * numBodies): every work item tests its ray against all bodies.
B3_PROFILE("raycast launch1D");
b3LauncherCL launcher(m_data->m_q, m_data->m_raytraceKernel, "m_raytraceKernel");
int numRays = rays.size();
launcher.setConst(numRays);
launcher.setBuffer(m_data->m_gpuRays->getBufferCL());
launcher.setBuffer(m_data->m_gpuHitResults->getBufferCL());
launcher.setConst(numBodies);
launcher.setBuffer(narrowphaseData->m_bodyBufferGPU->getBufferCL());
launcher.setBuffer(narrowphaseData->m_collidablesGPU->getBufferCL());
launcher.setBuffer(narrowphaseData->m_convexFacesGPU->getBufferCL());
launcher.setBuffer(narrowphaseData->m_convexPolyhedraGPU->getBufferCL());
launcher.launch1D(numRays);
clFinish(m_data->m_q);
}
else
{
// Build the BVH from the broadphase AABBs and gather candidate pairs.
m_data->m_plbvh->build(broadphase->getAllAabbsGPU(), broadphase->getSmallAabbIndicesGPU(), broadphase->getLargeAabbIndicesGPU());
m_data->m_plbvh->testRaysAgainstBvhAabbs(*m_data->m_gpuRays, *m_data->m_gpuNumRayRigidPairs, *m_data->m_gpuRayRigidPairs);
int numRayRigidPairs = -1;
m_data->m_gpuNumRayRigidPairs->copyToHostPointer(&numRayRigidPairs, 1);
// Clamp overflow if more pairs were produced than the buffer can hold.
if (numRayRigidPairs > m_data->m_gpuRayRigidPairs->size())
{
numRayRigidPairs = m_data->m_gpuRayRigidPairs->size();
m_data->m_gpuNumRayRigidPairs->copyFromHostPointer(&numRayRigidPairs, 1);
}
m_data->m_gpuRayRigidPairs->resize(numRayRigidPairs); //Radix sort needs b3OpenCLArray::size() to be correct
//Sort ray-rigid pairs by ray index
{
B3_PROFILE("sort ray-rigid pairs");
// b3Int2 pairs are reinterpreted as b3SortData (key = ray index x,
// value = rigid index y) for the radix sort.
m_data->m_radixSorter->execute(*reinterpret_cast<b3OpenCLArray<b3SortData>*>(m_data->m_gpuRayRigidPairs));
}
//detect start,count of each ray pair
{
B3_PROFILE("detect ray-rigid pair index ranges");
{
B3_PROFILE("reset ray-rigid pair index ranges");
m_data->m_fill->execute(*m_data->m_firstRayRigidPairIndexPerRay, numRayRigidPairs, numRays); //atomic_min used to find first index
m_data->m_fill->execute(*m_data->m_numRayRigidPairsPerRay, 0, numRays);
clFinish(m_data->m_q);
}
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_data->m_gpuRayRigidPairs->getBufferCL()),
b3BufferInfoCL(m_data->m_firstRayRigidPairIndexPerRay->getBufferCL()),
b3BufferInfoCL(m_data->m_numRayRigidPairsPerRay->getBufferCL())};
b3LauncherCL launcher(m_data->m_q, m_data->m_findRayRigidPairIndexRanges, "m_findRayRigidPairIndexRanges");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numRayRigidPairs);
launcher.launch1D(numRayRigidPairs);
clFinish(m_data->m_q);
}
{
// Exact narrow-phase intersection on the candidate pairs only.
B3_PROFILE("ray-rigid intersection");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_data->m_gpuRays->getBufferCL()),
b3BufferInfoCL(m_data->m_gpuHitResults->getBufferCL()),
b3BufferInfoCL(m_data->m_firstRayRigidPairIndexPerRay->getBufferCL()),
b3BufferInfoCL(m_data->m_numRayRigidPairsPerRay->getBufferCL()),
b3BufferInfoCL(narrowphaseData->m_bodyBufferGPU->getBufferCL()),
b3BufferInfoCL(narrowphaseData->m_collidablesGPU->getBufferCL()),
b3BufferInfoCL(narrowphaseData->m_convexFacesGPU->getBufferCL()),
b3BufferInfoCL(narrowphaseData->m_convexPolyhedraGPU->getBufferCL()),
b3BufferInfoCL(m_data->m_gpuRayRigidPairs->getBufferCL())};
b3LauncherCL launcher(m_data->m_q, m_data->m_raytracePairsKernel, "m_raytracePairsKernel");
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numRays);
launcher.launch1D(numRays);
clFinish(m_data->m_q);
}
}
//copy results
{
B3_PROFILE("raycast copyToHost");
m_data->m_gpuHitResults->copyToHost(hitResults);
}
}

View file

@ -1,28 +0,0 @@
#ifndef B3_GPU_RAYCAST_H
#define B3_GPU_RAYCAST_H
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h"
///OpenCL-accelerated raycaster against rigid bodies (spheres and convex
///hulls). Offers a CPU reference path (castRaysHost) and a GPU path
///(castRays) that uses the broadphase AABBs for acceleration.
class b3GpuRaycast
{
protected:
// Pimpl: all OpenCL handles and GPU buffers live behind this pointer.
struct b3GpuRaycastInternalData* m_data;
public:
// Does not take ownership of the OpenCL context/device/queue.
b3GpuRaycast(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~b3GpuRaycast();
///CPU brute-force reference implementation; updates hitResults in place.
void castRaysHost(const b3AlignedObjectArray<b3RayInfo>& raysIn, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData);
///GPU path; the broadphase supplies the AABBs used to cull candidates.
void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase);
};
#endif //B3_GPU_RAYCAST_H

View file

@ -1,439 +0,0 @@
// Shape type codes — must stay numerically identical to the host-side
// collidable shape enum used when filling the Collidable buffer.
#define SHAPE_CONVEX_HULL 3
#define SHAPE_PLANE 4
#define SHAPE_CONCAVE_TRIMESH 5
#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6
#define SHAPE_SPHERE 7
// Device-side mirrors of the host structs. Field order/size must stay
// bit-compatible with the host layouts (the buffers are copied verbatim).
// Ray segment endpoints.
typedef struct
{
float4 m_from;
float4 m_to;
} b3RayInfo;
// Per-ray hit record. m_hitResult0 receives the hit body index;
// m_hitResult2 is read as a body index to exclude from testing.
typedef struct
{
float m_hitFraction;
int m_hitResult0;
int m_hitResult1;
int m_hitResult2;
float4 m_hitPoint;
float4 m_hitNormal;
} b3RayHit;
// Rigid body state (mirrors the host rigid body data).
typedef struct
{
float4 m_pos;
float4 m_quat;
float4 m_linVel;
float4 m_angVel;
unsigned int m_collidableIdx;
float m_invMass;
float m_restituitionCoeff;
float m_frictionCoeff;
} Body;
// Collision shape descriptor; m_shapeIndex selects into the
// ConvexPolyhedronCL array for SHAPE_CONVEX_HULL, m_radius is used for
// SHAPE_SPHERE.
typedef struct Collidable
{
union {
int m_numChildShapes;
int m_bvhIndex;
};
float m_radius;
int m_shapeType;
int m_shapeIndex;
} Collidable;
// Convex hull geometry header; m_faceOffset/m_numFaces index the shared
// b3GpuFace array.
typedef struct
{
float4 m_localCenter;
float4 m_extents;
float4 mC;
float4 mE;
float m_radius;
int m_faceOffset;
int m_numFaces;
int m_numVertices;
int m_vertexOffset;
int m_uniqueEdgesOffset;
int m_numUniqueEdges;
int m_unused;
} ConvexPolyhedronCL;
// Face plane (xyz = normal, w = plane constant) plus index range.
typedef struct
{
float4 m_plane;
int m_indexOffset;
int m_numIndices;
} b3GpuFace;
///////////////////////////////////////
// Quaternion
///////////////////////////////////////
// Quaternions are stored as float4: xyz = vector part, w = scalar part.
typedef float4 Quaternion;
__inline
Quaternion qtMul(Quaternion a, Quaternion b);
__inline
Quaternion qtNormalize(Quaternion in);
__inline
Quaternion qtInvert(Quaternion q);
__inline
float dot3F4(float4 a, float4 b)
{
	// 3-component dot product: the w components are explicitly zeroed out
	// so they cannot contribute to the result.
	float4 lhs = (float4)(a.xyz, 0.f);
	float4 rhs = (float4)(b.xyz, 0.f);
	return dot(lhs, rhs);
}
__inline
Quaternion qtMul(Quaternion a, Quaternion b)
{
	// Hamilton product: vector part = a x b + w_a*vec(b) + w_b*vec(a),
	// scalar part = w_a*w_b - dot(vec(a), vec(b)).
	Quaternion result = cross(a, b);
	result += a.w * b + b.w * a;
	result.w = a.w * b.w - dot3F4(a, b);
	return result;
}
__inline
Quaternion qtNormalize(Quaternion in)
{
	// Unit-length quaternion via the fast (reduced-precision) hardware
	// normalize built-in.
	return fast_normalize(in);
}
__inline
float4 qtRotate(Quaternion q, float4 vec)
{
	// Rotate vec by q using the sandwich product q * v * q^-1.
	// The w component is forced to zero so vec is treated as a pure vector.
	float4 v = vec;
	v.w = 0.f;
	float4 rotated = qtMul(qtMul(q, v), qtInvert(q));
	return rotated;
}
__inline
Quaternion qtInvert(Quaternion q)
{
	// Conjugate (negated vector part) — equals the inverse for unit quaternions.
	Quaternion conjugate;
	conjugate.xyz = -q.xyz;
	conjugate.w = q.w;
	return conjugate;
}
__inline
float4 qtInvRotate(const Quaternion q, float4 vec)
{
	// Apply the opposite rotation: rotate vec by the inverse of q.
	return qtRotate(qtInvert(q), vec);
}
// Invert a rigid transform (translation t, orientation q):
// q' = q^-1 and t' = q' * (-t).
void trInverse(float4 translationIn, Quaternion orientationIn,
	float4* translationOut, Quaternion* orientationOut)
{
	Quaternion invOrn = qtInvert(orientationIn);
	*orientationOut = invOrn;
	*translationOut = qtRotate(invOrn, -translationIn);
}
// Ray vs. convex polyhedron test in the shape's local space: clips the
// segment [rayFromLocal, rayToLocal] against each face plane, tracking the
// farthest entering fraction and closest exiting fraction. On a hit, writes
// the entering fraction into hitFraction[0] and the entering face's plane
// normal into hitNormal[0], and returns true.
bool rayConvex(float4 rayFromLocal, float4 rayToLocal, int numFaces, int faceOffset,
__global const b3GpuFace* faces, float* hitFraction, float4* hitNormal)
{
rayFromLocal.w = 0.f;
rayToLocal.w = 0.f;
bool result = true;
// enter/exit fractions of the clipped segment; valid hit needs enter < exit.
float exitFraction = hitFraction[0];
float enterFraction = -0.3f;
float4 curHitNormal = (float4)(0,0,0,0);
for (int i=0;i<numFaces && result;i++)
{
b3GpuFace face = faces[faceOffset+i];
// Signed distances of both segment endpoints to this face plane.
float fromPlaneDist = dot(rayFromLocal,face.m_plane)+face.m_plane.w;
float toPlaneDist = dot(rayToLocal,face.m_plane)+face.m_plane.w;
if (fromPlaneDist<0.f)
{
// Starts inside this half-space; if it ends outside, the ray exits here.
if (toPlaneDist >= 0.f)
{
float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);
if (exitFraction>fraction)
{
exitFraction = fraction;
}
}
} else
{
if (toPlaneDist<0.f)
{
// Entering through this face; keep the latest entry and its normal.
float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);
if (enterFraction <= fraction)
{
enterFraction = fraction;
curHitNormal = face.m_plane;
curHitNormal.w = 0.f;
}
} else
{
// Entirely outside one half-space: no intersection possible.
result = false;
}
}
// Entry after exit means the clipped segment is empty.
if (exitFraction <= enterFraction)
result = false;
}
// Starting inside the convex (never entered) does not count as a hit.
if (enterFraction < 0.f)
{
result = false;
}
if (result)
{
hitFraction[0] = enterFraction;
hitNormal[0] = curHitNormal;
}
return result;
}
// Ray/sphere test: solves |(rayFrom - spherePos) + t*(rayTo - rayFrom)|^2
// = radius^2; if the nearer root lies in [0, *hitFraction), stores it in
// *hitFraction and returns true.
bool sphere_intersect(float4 spherePos, float radius, float4 rayFrom, float4 rayTo, float* hitFraction)
{
	float4 relPos = rayFrom - spherePos;
	relPos.w = 0.f;
	float4 segDir = rayTo - rayFrom;
	segDir.w = 0.f;
	// Quadratic coefficients (half-coefficient form for B).
	float a = dot(segDir, segDir);
	float halfB = dot(relPos, segDir);
	float c = dot(relPos, relPos) - (radius * radius);
	float disc = halfB * halfB - a * c;
	if (!(disc > 0.0f))
		return false;
	// Nearer root along the segment parameterization.
	float t = (-halfB - sqrt(disc)) / a;
	if ((t < 0.0f) || (t >= (*hitFraction)))
		return false;
	*hitFraction = t;
	return true;
}
// Linear interpolation between 'from' and 'to' at parameter t; the w
// component of the result is zeroed.
float4 setInterpolate3(float4 from, float4 to, float t)
{
	float4 lerped = (1.0f - t) * from + t * to;
	lerped.w = 0.f;
	return lerped;
}
// Brute-force raycast: one work item per ray, testing against every body.
// Writes the closest hit (fraction, point, normalized normal, body index)
// into hitResults[i]; m_hitResult2 names a body index to skip (e.g. the
// ray's owner). m_hitFraction is reset to 1 even when nothing is hit.
__kernel void rayCastKernel(
	int numRays,
	const __global b3RayInfo* rays,
	__global b3RayHit* hitResults,
	const int numBodies,
	__global Body* bodies,
	__global Collidable* collidables,
	__global const b3GpuFace* faces,
	__global const ConvexPolyhedronCL* convexShapes )
{
	int i = get_global_id(0);
	if (i>=numRays)
		return;
	hitResults[i].m_hitFraction = 1.f;
	float4 rayFrom = rays[i].m_from;
	float4 rayTo = rays[i].m_to;
	float hitFraction = 1.f;
	float4 hitPoint;
	float4 hitNormal;
	int hitBodyIndex= -1;
	// Cache the last collidable looked up, so consecutive bodies sharing a
	// collidable avoid a redundant global load.
	int cachedCollidableIndex = -1;
	Collidable cachedCollidable;
	for (int b=0;b<numBodies;b++)
	{
		// Caller may exclude one body via m_hitResult2.
		if (hitResults[i].m_hitResult2==b)
			continue;
		Body body = bodies[b];
		float4 pos = body.m_pos;
		float4 orn = body.m_quat;
		if (cachedCollidableIndex != body.m_collidableIdx)
		{
			cachedCollidableIndex = body.m_collidableIdx;
			cachedCollidable = collidables[cachedCollidableIndex];
		}
		if (cachedCollidable.m_shapeType == SHAPE_CONVEX_HULL)
		{
			// Transform the ray into the hull's local space.
			float4 invPos = (float4)(0,0,0,0);
			float4 invOrn = (float4)(0,0,0,0);
			float4 rayFromLocal = (float4)(0,0,0,0);
			float4 rayToLocal = (float4)(0,0,0,0);
			invOrn = qtInvert(orn);
			invPos = qtRotate(invOrn, -pos);
			rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;
			rayToLocal = qtRotate( invOrn, rayTo) + invPos;
			rayFromLocal.w = 0.f;
			rayToLocal.w = 0.f;
			int numFaces = convexShapes[cachedCollidable.m_shapeIndex].m_numFaces;
			int faceOffset = convexShapes[cachedCollidable.m_shapeIndex].m_faceOffset;
			if (numFaces)
			{
				// rayConvex writes hitFraction/hitNormal on a closer hit.
				if (rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))
				{
					hitBodyIndex = b;
				}
			}
		}
		if (cachedCollidable.m_shapeType == SHAPE_SPHERE)
		{
			float radius = cachedCollidable.m_radius;
			if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))
			{
				hitBodyIndex = b;
				// BUGFIX: compute the hit point before deriving the sphere
				// normal; previously hitPoint was read here while still
				// uninitialized (compare rayCastPairsKernel, which computes
				// it first).
				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);
				hitNormal = (float4) (hitPoint-bodies[b].m_pos);
			}
		}
	}
	if (hitBodyIndex>=0)
	{
		// Recompute the hit point from the final (closest) fraction.
		hitPoint = setInterpolate3(rayFrom, rayTo,hitFraction);
		hitResults[i].m_hitFraction = hitFraction;
		hitResults[i].m_hitPoint = hitPoint;
		hitResults[i].m_hitNormal = normalize(hitNormal);
		hitResults[i].m_hitResult0 = hitBodyIndex;
	}
}
// Given (ray, rigid) pairs sorted by ray index, computes each ray's
// contiguous range in the pair array: the first pair index (via atomic_min,
// with the buffer pre-filled to numRayRigidPairs by the host) and the pair
// count (via atomic_inc, pre-filled to 0). One work item per pair.
__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs,
											__global int* out_firstRayRigidPairIndexPerRay,
											__global int* out_numRayRigidPairsPerRay,
											int numRayRigidPairs)
{
	int pairIdx = get_global_id(0);
	if (pairIdx >= numRayRigidPairs)
		return;

	int rayIdx = rayRigidPairs[pairIdx].x;

	atomic_min(&out_firstRayRigidPairIndexPerRay[rayIdx], pairIdx);
	atomic_inc(&out_numRayRigidPairsPerRay[rayIdx]);
}
// BVH-accelerated raycast: one work item per ray, but each ray only tests
// the candidate bodies listed in its range of the sorted (ray, rigid) pair
// array (computed by findRayRigidPairIndexRanges). Writes the closest hit
// into hitResults[i]; m_hitResult2 names a body index to skip.
__kernel void rayCastPairsKernel(const __global b3RayInfo* rays,
__global b3RayHit* hitResults,
__global int* firstRayRigidPairIndexPerRay,
__global int* numRayRigidPairsPerRay,
__global Body* bodies,
__global Collidable* collidables,
__global const b3GpuFace* faces,
__global const ConvexPolyhedronCL* convexShapes,
__global int2* rayRigidPairs,
int numRays)
{
int i = get_global_id(0);
if (i >= numRays) return;
float4 rayFrom = rays[i].m_from;
float4 rayTo = rays[i].m_to;
hitResults[i].m_hitFraction = 1.f;
float hitFraction = 1.f;
float4 hitPoint;
float4 hitNormal;
int hitBodyIndex = -1;
// Walk this ray's candidate pairs only.
for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)
{
int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];
// y component of the pair is the rigid body index.
int b = rayRigidPairs[rayRigidPairIndex].y;
// Caller may exclude one body via m_hitResult2.
if (hitResults[i].m_hitResult2 == b) continue;
Body body = bodies[b];
Collidable rigidCollidable = collidables[body.m_collidableIdx];
float4 pos = body.m_pos;
float4 orn = body.m_quat;
if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)
{
// Transform the ray into the hull's local space before clipping.
float4 invPos = (float4)(0,0,0,0);
float4 invOrn = (float4)(0,0,0,0);
float4 rayFromLocal = (float4)(0,0,0,0);
float4 rayToLocal = (float4)(0,0,0,0);
invOrn = qtInvert(orn);
invPos = qtRotate(invOrn, -pos);
rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;
rayToLocal = qtRotate( invOrn, rayTo) + invPos;
rayFromLocal.w = 0.f;
rayToLocal.w = 0.f;
int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;
int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;
if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))
{
hitBodyIndex = b;
hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);
}
}
if (rigidCollidable.m_shapeType == SHAPE_SPHERE)
{
float radius = rigidCollidable.m_radius;
if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))
{
hitBodyIndex = b;
hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);
hitNormal = (float4) (hitPoint - bodies[b].m_pos);
}
}
}
if (hitBodyIndex >= 0)
{
hitResults[i].m_hitFraction = hitFraction;
hitResults[i].m_hitPoint = hitPoint;
hitResults[i].m_hitNormal = normalize(hitNormal);
hitResults[i].m_hitResult0 = hitBodyIndex;
}
}

View file

@ -1,380 +0,0 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Embedded OpenCL source for the GPU ray-cast kernels (rayCastKernel,
// findRayRigidPairIndexRanges, rayCastPairsKernel) plus the shared types and
// quaternion/intersection helpers they use. Do NOT edit the string by hand —
// regenerate it from the .cl source via premake --stringify, otherwise this
// copy and the standalone kernel file will diverge.
static const char* rayCastKernelCL =
"#define SHAPE_CONVEX_HULL 3\n"
"#define SHAPE_PLANE 4\n"
"#define SHAPE_CONCAVE_TRIMESH 5\n"
"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
"#define SHAPE_SPHERE 7\n"
"typedef struct\n"
"{\n"
" float4 m_from;\n"
" float4 m_to;\n"
"} b3RayInfo;\n"
"typedef struct\n"
"{\n"
" float m_hitFraction;\n"
" int m_hitResult0;\n"
" int m_hitResult1;\n"
" int m_hitResult2;\n"
" float4 m_hitPoint;\n"
" float4 m_hitNormal;\n"
"} b3RayHit;\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" float4 m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
" unsigned int m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} Body;\n"
"typedef struct Collidable\n"
"{\n"
" union {\n"
" int m_numChildShapes;\n"
" int m_bvhIndex;\n"
" };\n"
" float m_radius;\n"
" int m_shapeType;\n"
" int m_shapeIndex;\n"
"} Collidable;\n"
"typedef struct \n"
"{\n"
" float4 m_localCenter;\n"
" float4 m_extents;\n"
" float4 mC;\n"
" float4 mE;\n"
" float m_radius;\n"
" int m_faceOffset;\n"
" int m_numFaces;\n"
" int m_numVertices;\n"
" int m_vertexOffset;\n"
" int m_uniqueEdgesOffset;\n"
" int m_numUniqueEdges;\n"
" int m_unused;\n"
"} ConvexPolyhedronCL;\n"
"typedef struct\n"
"{\n"
" float4 m_plane;\n"
" int m_indexOffset;\n"
" int m_numIndices;\n"
"} b3GpuFace;\n"
"///////////////////////////////////////\n"
"// Quaternion\n"
"///////////////////////////////////////\n"
"typedef float4 Quaternion;\n"
"__inline\n"
" Quaternion qtMul(Quaternion a, Quaternion b);\n"
"__inline\n"
" Quaternion qtNormalize(Quaternion in);\n"
"__inline\n"
" Quaternion qtInvert(Quaternion q);\n"
"__inline\n"
" float dot3F4(float4 a, float4 b)\n"
"{\n"
" float4 a1 = (float4)(a.xyz,0.f);\n"
" float4 b1 = (float4)(b.xyz,0.f);\n"
" return dot(a1, b1);\n"
"}\n"
"__inline\n"
" Quaternion qtMul(Quaternion a, Quaternion b)\n"
"{\n"
" Quaternion ans;\n"
" ans = cross( a, b );\n"
" ans += a.w*b+b.w*a;\n"
" // ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
" ans.w = a.w*b.w - dot3F4(a, b);\n"
" return ans;\n"
"}\n"
"__inline\n"
" Quaternion qtNormalize(Quaternion in)\n"
"{\n"
" return fast_normalize(in);\n"
" // in /= length( in );\n"
" // return in;\n"
"}\n"
"__inline\n"
" float4 qtRotate(Quaternion q, float4 vec)\n"
"{\n"
" Quaternion qInv = qtInvert( q );\n"
" float4 vcpy = vec;\n"
" vcpy.w = 0.f;\n"
" float4 out = qtMul(q,vcpy);\n"
" out = qtMul(out,qInv);\n"
" return out;\n"
"}\n"
"__inline\n"
" Quaternion qtInvert(Quaternion q)\n"
"{\n"
" return (Quaternion)(-q.xyz, q.w);\n"
"}\n"
"__inline\n"
" float4 qtInvRotate(const Quaternion q, float4 vec)\n"
"{\n"
" return qtRotate( qtInvert( q ), vec );\n"
"}\n"
"void trInverse(float4 translationIn, Quaternion orientationIn,\n"
" float4* translationOut, Quaternion* orientationOut)\n"
"{\n"
" *orientationOut = qtInvert(orientationIn);\n"
" *translationOut = qtRotate(*orientationOut, -translationIn);\n"
"}\n"
"bool rayConvex(float4 rayFromLocal, float4 rayToLocal, int numFaces, int faceOffset,\n"
" __global const b3GpuFace* faces, float* hitFraction, float4* hitNormal)\n"
"{\n"
" rayFromLocal.w = 0.f;\n"
" rayToLocal.w = 0.f;\n"
" bool result = true;\n"
" float exitFraction = hitFraction[0];\n"
" float enterFraction = -0.3f;\n"
" float4 curHitNormal = (float4)(0,0,0,0);\n"
" for (int i=0;i<numFaces && result;i++)\n"
" {\n"
" b3GpuFace face = faces[faceOffset+i];\n"
" float fromPlaneDist = dot(rayFromLocal,face.m_plane)+face.m_plane.w;\n"
" float toPlaneDist = dot(rayToLocal,face.m_plane)+face.m_plane.w;\n"
" if (fromPlaneDist<0.f)\n"
" {\n"
" if (toPlaneDist >= 0.f)\n"
" {\n"
" float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n"
" if (exitFraction>fraction)\n"
" {\n"
" exitFraction = fraction;\n"
" }\n"
" } \n"
" } else\n"
" {\n"
" if (toPlaneDist<0.f)\n"
" {\n"
" float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n"
" if (enterFraction <= fraction)\n"
" {\n"
" enterFraction = fraction;\n"
" curHitNormal = face.m_plane;\n"
" curHitNormal.w = 0.f;\n"
" }\n"
" } else\n"
" {\n"
" result = false;\n"
" }\n"
" }\n"
" if (exitFraction <= enterFraction)\n"
" result = false;\n"
" }\n"
" if (enterFraction < 0.f)\n"
" {\n"
" result = false;\n"
" }\n"
" if (result)\n"
" { \n"
" hitFraction[0] = enterFraction;\n"
" hitNormal[0] = curHitNormal;\n"
" }\n"
" return result;\n"
"}\n"
"bool sphere_intersect(float4 spherePos, float radius, float4 rayFrom, float4 rayTo, float* hitFraction)\n"
"{\n"
" float4 rs = rayFrom - spherePos;\n"
" rs.w = 0.f;\n"
" float4 rayDir = rayTo-rayFrom;\n"
" rayDir.w = 0.f;\n"
" float A = dot(rayDir,rayDir);\n"
" float B = dot(rs, rayDir);\n"
" float C = dot(rs, rs) - (radius * radius);\n"
" float D = B * B - A*C;\n"
" if (D > 0.0f)\n"
" {\n"
" float t = (-B - sqrt(D))/A;\n"
" if ( (t >= 0.0f) && (t < (*hitFraction)) )\n"
" {\n"
" *hitFraction = t;\n"
" return true;\n"
" }\n"
" }\n"
" return false;\n"
"}\n"
"float4 setInterpolate3(float4 from, float4 to, float t)\n"
"{\n"
" float s = 1.0f - t;\n"
" float4 result;\n"
" result = s * from + t * to;\n"
" result.w = 0.f; \n"
" return result; \n"
"}\n"
"__kernel void rayCastKernel( \n"
" int numRays, \n"
" const __global b3RayInfo* rays, \n"
" __global b3RayHit* hitResults, \n"
" const int numBodies, \n"
" __global Body* bodies,\n"
" __global Collidable* collidables,\n"
" __global const b3GpuFace* faces,\n"
" __global const ConvexPolyhedronCL* convexShapes )\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numRays)\n"
" return;\n"
" hitResults[i].m_hitFraction = 1.f;\n"
" float4 rayFrom = rays[i].m_from;\n"
" float4 rayTo = rays[i].m_to;\n"
" float hitFraction = 1.f;\n"
" float4 hitPoint;\n"
" float4 hitNormal;\n"
" int hitBodyIndex= -1;\n"
" int cachedCollidableIndex = -1;\n"
" Collidable cachedCollidable;\n"
" for (int b=0;b<numBodies;b++)\n"
" {\n"
" if (hitResults[i].m_hitResult2==b)\n"
" continue;\n"
" Body body = bodies[b];\n"
" float4 pos = body.m_pos;\n"
" float4 orn = body.m_quat;\n"
" if (cachedCollidableIndex != body.m_collidableIdx)\n"
" {\n"
" cachedCollidableIndex = body.m_collidableIdx;\n"
" cachedCollidable = collidables[cachedCollidableIndex];\n"
" }\n"
" if (cachedCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
" {\n"
" float4 invPos = (float4)(0,0,0,0);\n"
" float4 invOrn = (float4)(0,0,0,0);\n"
" float4 rayFromLocal = (float4)(0,0,0,0);\n"
" float4 rayToLocal = (float4)(0,0,0,0);\n"
" invOrn = qtInvert(orn);\n"
" invPos = qtRotate(invOrn, -pos);\n"
" rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n"
" rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n"
" rayFromLocal.w = 0.f;\n"
" rayToLocal.w = 0.f;\n"
" int numFaces = convexShapes[cachedCollidable.m_shapeIndex].m_numFaces;\n"
" int faceOffset = convexShapes[cachedCollidable.m_shapeIndex].m_faceOffset;\n"
" if (numFaces)\n"
" {\n"
" if (rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n"
" {\n"
" hitBodyIndex = b;\n"
" \n"
" }\n"
" }\n"
" }\n"
" if (cachedCollidable.m_shapeType == SHAPE_SPHERE)\n"
" {\n"
" float radius = cachedCollidable.m_radius;\n"
" \n"
" if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n"
" {\n"
" hitBodyIndex = b;\n"
" hitNormal = (float4) (hitPoint-bodies[b].m_pos);\n"
" }\n"
" }\n"
" }\n"
" if (hitBodyIndex>=0)\n"
" {\n"
" hitPoint = setInterpolate3(rayFrom, rayTo,hitFraction);\n"
" hitResults[i].m_hitFraction = hitFraction;\n"
" hitResults[i].m_hitPoint = hitPoint;\n"
" hitResults[i].m_hitNormal = normalize(hitNormal);\n"
" hitResults[i].m_hitResult0 = hitBodyIndex;\n"
" }\n"
"}\n"
"__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, \n"
" __global int* out_firstRayRigidPairIndexPerRay,\n"
" __global int* out_numRayRigidPairsPerRay,\n"
" int numRayRigidPairs)\n"
"{\n"
" int rayRigidPairIndex = get_global_id(0);\n"
" if (rayRigidPairIndex >= numRayRigidPairs) return;\n"
" \n"
" int rayIndex = rayRigidPairs[rayRigidPairIndex].x;\n"
" \n"
" atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex);\n"
" atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]);\n"
"}\n"
"__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, \n"
" __global b3RayHit* hitResults, \n"
" __global int* firstRayRigidPairIndexPerRay,\n"
" __global int* numRayRigidPairsPerRay,\n"
" \n"
" __global Body* bodies,\n"
" __global Collidable* collidables,\n"
" __global const b3GpuFace* faces,\n"
" __global const ConvexPolyhedronCL* convexShapes,\n"
" \n"
" __global int2* rayRigidPairs,\n"
" int numRays)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i >= numRays) return;\n"
" \n"
" float4 rayFrom = rays[i].m_from;\n"
" float4 rayTo = rays[i].m_to;\n"
" \n"
" hitResults[i].m_hitFraction = 1.f;\n"
" \n"
" float hitFraction = 1.f;\n"
" float4 hitPoint;\n"
" float4 hitNormal;\n"
" int hitBodyIndex = -1;\n"
" \n"
" //\n"
" for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)\n"
" {\n"
" int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];\n"
" int b = rayRigidPairs[rayRigidPairIndex].y;\n"
" \n"
" if (hitResults[i].m_hitResult2 == b) continue;\n"
" \n"
" Body body = bodies[b];\n"
" Collidable rigidCollidable = collidables[body.m_collidableIdx];\n"
" \n"
" float4 pos = body.m_pos;\n"
" float4 orn = body.m_quat;\n"
" \n"
" if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
" {\n"
" float4 invPos = (float4)(0,0,0,0);\n"
" float4 invOrn = (float4)(0,0,0,0);\n"
" float4 rayFromLocal = (float4)(0,0,0,0);\n"
" float4 rayToLocal = (float4)(0,0,0,0);\n"
" invOrn = qtInvert(orn);\n"
" invPos = qtRotate(invOrn, -pos);\n"
" rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n"
" rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n"
" rayFromLocal.w = 0.f;\n"
" rayToLocal.w = 0.f;\n"
" int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;\n"
" int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;\n"
" \n"
" if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n"
" {\n"
" hitBodyIndex = b;\n"
" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
" }\n"
" }\n"
" \n"
" if (rigidCollidable.m_shapeType == SHAPE_SPHERE)\n"
" {\n"
" float radius = rigidCollidable.m_radius;\n"
" \n"
" if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n"
" {\n"
" hitBodyIndex = b;\n"
" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
" hitNormal = (float4) (hitPoint - bodies[b].m_pos);\n"
" }\n"
" }\n"
" }\n"
" \n"
" if (hitBodyIndex >= 0)\n"
" {\n"
" hitResults[i].m_hitFraction = hitFraction;\n"
" hitResults[i].m_hitPoint = hitPoint;\n"
" hitResults[i].m_hitNormal = normalize(hitNormal);\n"
" hitResults[i].m_hitResult0 = hitBodyIndex;\n"
" }\n"
" \n"
"}\n";

View file

@ -1,17 +0,0 @@
#ifndef B3_CONSTRAINT4_h
#define B3_CONSTRAINT4_h
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Dynamics/shared/b3ContactConstraint4.h"
// GPU-side contact constraint: extends the shared b3ContactConstraint4 layout
// with an aligned allocator and accessors for the friction coefficient, which
// is packed into the otherwise-unused w component (index 3) of m_linear.
B3_ATTRIBUTE_ALIGNED16(struct)
b3GpuConstraint4 : public b3ContactConstraint4
{
B3_DECLARE_ALIGNED_ALLOCATOR();
// Friction coefficient lives in m_linear[3] to avoid an extra field.
inline void setFrictionCoeff(float value) { m_linear[3] = value; }
inline float getFrictionCoeff() const { return m_linear[3]; }
};
#endif //B3_CONSTRAINT4_h

View file

@ -1,134 +0,0 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#include "b3GpuGenericConstraint.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include <new>
#include "Bullet3Common/b3Transform.h"
/// Report the number of solver rows this constraint contributes.
/// @param info   out: row count (3 for a point-to-point joint — one row per
///               locked translational axis)
/// @param bodies rigid-body array (unused for the supported joint type)
/// Asserts on any constraint type this dispatcher does not handle.
void b3GpuGenericConstraint::getInfo1(unsigned int* info, const b3RigidBodyData* bodies)
{
	if (m_constraintType == B3_GPU_POINT2POINT_CONSTRAINT_TYPE)
	{
		*info = 3;
	}
	else
	{
		b3Assert(0);  // unsupported constraint type
	}
}
// Fill the solver rows (Jacobians, error terms) for a point-to-point (ball
// joint) constraint: three rows locking relative translation at the pivot.
// @param constraint joint being solved (pivots are in body-local space)
// @param info       out: row-major Jacobian/error buffers; rowskip is the
//                   stride between rows
// @param bodies     rigid-body array indexed by constraint->m_rbA/m_rbB
void getInfo2Point2Point(b3GpuGenericConstraint* constraint, b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies)
{
// Rebuild the two world transforms from the GPU body data.
b3Transform trA;
trA.setIdentity();
trA.setOrigin(bodies[constraint->m_rbA].m_pos);
trA.setRotation(bodies[constraint->m_rbA].m_quat);
b3Transform trB;
trB.setIdentity();
trB.setOrigin(bodies[constraint->m_rbB].m_pos);
trB.setRotation(bodies[constraint->m_rbB].m_quat);
// anchor points in global coordinates with respect to body PORs.
// set jacobian
// Linear part for body A: identity (one unit axis per row).
info->m_J1linearAxis[0] = 1;
info->m_J1linearAxis[info->rowskip + 1] = 1;
info->m_J1linearAxis[2 * info->rowskip + 2] = 1;
// Pivot of A rotated into world space (lever arm a1).
b3Vector3 a1 = trA.getBasis() * constraint->getPivotInA();
//b3Vector3 a1a = b3QuatRotate(trA.getRotation(),constraint->getPivotInA());
{
// Angular part for body A: rows of the skew-symmetric matrix of -a1
// (cross-product form of the lever arm).
b3Vector3* angular0 = (b3Vector3*)(info->m_J1angularAxis);
b3Vector3* angular1 = (b3Vector3*)(info->m_J1angularAxis + info->rowskip);
b3Vector3* angular2 = (b3Vector3*)(info->m_J1angularAxis + 2 * info->rowskip);
b3Vector3 a1neg = -a1;
a1neg.getSkewSymmetricMatrix(angular0, angular1, angular2);
}
// Body B gets the negated linear part (may be absent for a fixed body).
if (info->m_J2linearAxis)
{
info->m_J2linearAxis[0] = -1;
info->m_J2linearAxis[info->rowskip + 1] = -1;
info->m_J2linearAxis[2 * info->rowskip + 2] = -1;
}
// Pivot of B rotated into world space (lever arm a2).
b3Vector3 a2 = trB.getBasis() * constraint->getPivotInB();
{
// b3Vector3 a2n = -a2;
b3Vector3* angular0 = (b3Vector3*)(info->m_J2angularAxis);
b3Vector3* angular1 = (b3Vector3*)(info->m_J2angularAxis + info->rowskip);
b3Vector3* angular2 = (b3Vector3*)(info->m_J2angularAxis + 2 * info->rowskip);
a2.getSkewSymmetricMatrix(angular0, angular1, angular2);
}
// set right hand side
// b3Scalar currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp;
b3Scalar currERP = info->erp;
// Baumgarte term: positional drift between the two world-space anchors,
// scaled by fps * ERP, pushed into the constraint error per row.
b3Scalar k = info->fps * currERP;
int j;
for (j = 0; j < 3; j++)
{
info->m_constraintError[j * info->rowskip] = k * (a2[j] + trB.getOrigin()[j] - a1[j] - trA.getOrigin()[j]);
//printf("info->m_constraintError[%d]=%f\n",j,info->m_constraintError[j]);
}
// Disabled CFM / impulse-clamp paths kept from the CPU implementation for
// reference; the GPU port does not expose these settings.
#if 0
if(m_flags & B3_P2P_FLAGS_CFM)
{
for (j=0; j<3; j++)
{
info->cfm[j*info->rowskip] = m_cfm;
}
}
#endif
#if 0
b3Scalar impulseClamp = m_setting.m_impulseClamp;//
for (j=0; j<3; j++)
{
if (m_setting.m_impulseClamp > 0)
{
info->m_lowerLimit[j*info->rowskip] = -impulseClamp;
info->m_upperLimit[j*info->rowskip] = impulseClamp;
}
}
info->m_damping = m_setting.m_damping;
#endif
}
/// Fill the solver rows for this constraint by dispatching on the joint type.
/// @param info   out: Jacobian/error buffers prepared by the solver
/// @param bodies rigid-body array the pivots are resolved against
/// Asserts on any constraint type this dispatcher does not handle.
void b3GpuGenericConstraint::getInfo2(b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies)
{
	if (m_constraintType == B3_GPU_POINT2POINT_CONSTRAINT_TYPE)
	{
		getInfo2Point2Point(this, info, bodies);
	}
	else
	{
		b3Assert(0);  // unsupported constraint type
	}
}

View file

@ -1,128 +0,0 @@
/*
Copyright (c) 2013 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef B3_GPU_GENERIC_CONSTRAINT_H
#define B3_GPU_GENERIC_CONSTRAINT_H
#include "Bullet3Common/b3Quaternion.h"
struct b3RigidBodyData;
// Bit flags stored in b3GpuGenericConstraint::m_flags.
enum B3_CONSTRAINT_FLAGS
{
B3_CONSTRAINT_FLAG_ENABLED = 1,
};
// Joint types understood by the GPU constraint solver. Only point-to-point
// and fixed are implemented; the commented entries mirror the CPU solver's
// enum for future parity.
enum b3GpuGenericConstraintType
{
B3_GPU_POINT2POINT_CONSTRAINT_TYPE = 3,
B3_GPU_FIXED_CONSTRAINT_TYPE = 4,
// B3_HINGE_CONSTRAINT_TYPE,
// B3_CONETWIST_CONSTRAINT_TYPE,
// B3_D6_CONSTRAINT_TYPE,
// B3_SLIDER_CONSTRAINT_TYPE,
// B3_CONTACT_CONSTRAINT_TYPE,
// B3_D6_SPRING_CONSTRAINT_TYPE,
// B3_GEAR_CONSTRAINT_TYPE,
B3_GPU_MAX_CONSTRAINT_TYPE
};
// Scratch buffers and parameters handed to getInfo2() when building solver
// rows; layout matches the CPU b3TypedConstraint::b3ConstraintInfo2.
struct b3GpuConstraintInfo2
{
// integrator parameters: frames per second (1/stepsize), default error
// reduction parameter (0..1).
b3Scalar fps, erp;
// for the first and second body, pointers to two (linear and angular)
// n*3 jacobian sub matrices, stored by rows. these matrices will have
// been initialized to 0 on entry. if the second body is zero then the
// J2xx pointers may be 0.
b3Scalar *m_J1linearAxis, *m_J1angularAxis, *m_J2linearAxis, *m_J2angularAxis;
// elements to jump from one row to the next in J's
int rowskip;
// right hand sides of the equation J*v = c + cfm * lambda. cfm is the
// "constraint force mixing" vector. c is set to zero on entry, cfm is
// set to a constant value (typically very small or zero) value on entry.
b3Scalar *m_constraintError, *cfm;
// lo and hi limits for variables (set to -/+ infinity on entry).
b3Scalar *m_lowerLimit, *m_upperLimit;
// findex vector for variables. see the LCP solver interface for a
// description of what this does. this is set to -1 on entry.
// note that the returned indexes are relative to the first index of
// the constraint.
int* findex;
// number of solver iterations
int m_numIterations;
//damping of the velocity
b3Scalar m_damping;
};
// POD joint description consumed by the GPU solver. Bodies are referenced by
// index into the rigid-body array; pivots are in body-local space.
B3_ATTRIBUTE_ALIGNED16(struct)
b3GpuGenericConstraint
{
int m_constraintType;  // one of b3GpuGenericConstraintType
int m_rbA;             // index of body A
int m_rbB;             // index of body B
float m_breakingImpulseThreshold;
b3Vector3 m_pivotInA;  // anchor in A's local frame
b3Vector3 m_pivotInB;  // anchor in B's local frame
// NOTE(review): presumably the target relative orientation of B w.r.t. A
// used by the fixed-constraint path — confirm against the solver kernels.
b3Quaternion m_relTargetAB;
int m_flags;           // B3_CONSTRAINT_FLAGS bits
int m_uid;             // user/application identifier
int m_padding[2];      // keeps the struct 16-byte aligned
int getRigidBodyA() const
{
return m_rbA;
}
int getRigidBodyB() const
{
return m_rbB;
}
const b3Vector3& getPivotInA() const
{
return m_pivotInA;
}
const b3Vector3& getPivotInB() const
{
return m_pivotInB;
}
int isEnabled() const
{
return m_flags & B3_CONSTRAINT_FLAG_ENABLED;
}
float getBreakingImpulseThreshold() const
{
return m_breakingImpulseThreshold;
}
///internal method used by the constraint solver, don't use them directly
void getInfo1(unsigned int* info, const b3RigidBodyData* bodies);
///internal method used by the constraint solver, don't use them directly
void getInfo2(b3GpuConstraintInfo2 * info, const b3RigidBodyData* bodies);
};
#endif //B3_GPU_GENERIC_CONSTRAINT_H

View file

@ -1,56 +0,0 @@
#ifndef B3_GPU_JACOBI_CONTACT_SOLVER_H
#define B3_GPU_JACOBI_CONTACT_SOLVER_H
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
//#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
//struct b3InertiaData;
//b3InertiaData
class b3TypedConstraint;
// Tuning parameters for the Jacobi contact solver, with conventional defaults.
struct b3JacobiSolverInfo
{
// Index treated as the immovable "world" body.
int m_fixedBodyIndex;
// Simulation timestep in seconds (default 1/60).
float m_deltaTime;
// Allowed penetration before position correction kicks in.
float m_positionDrift;
// Strength of the positional (split-impulse style) correction, 0..1.
float m_positionConstraintCoeff;
// Number of Jacobi iterations per solve.
int m_numIterations;
b3JacobiSolverInfo()
: m_fixedBodyIndex(0),
m_deltaTime(1. / 60.f),
m_positionDrift(0.005f),
m_positionConstraintCoeff(0.99f),
m_numIterations(7)
{
}
};
// OpenCL Jacobi-iteration contact solver. Unlike the PGS (Gauss-Seidel)
// solvers, Jacobi updates all contacts from the same snapshot per iteration,
// which parallelizes trivially at the cost of slower convergence.
class b3GpuJacobiContactSolver
{
protected:
// Opaque per-instance buffers/kernels (defined in the .cpp).
struct b3GpuJacobiSolverInternalData* m_data;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
public:
b3GpuJacobiContactSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity);
virtual ~b3GpuJacobiContactSolver();
// GPU path: solves numContacts contacts against the GPU body/inertia buffers.
// static0Index marks the static "world" body.
void solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index);
// CPU reference path operating on host-side arrays; useful for debugging.
void solveGroupHost(b3RigidBodyData* bodies, b3InertiaData* inertias, int numBodies, struct b3Contact4* manifoldPtr, int numManifolds, const b3JacobiSolverInfo& solverInfo);
//void solveGroupHost(btRigidBodyCL* bodies,b3InertiaData* inertias,int numBodies,btContact4* manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btJacobiSolverInfo& solverInfo);
//b3Scalar solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal);
//void solveGroup(btOpenCLArray<btRigidBodyCL>* bodies,btOpenCLArray<btInertiaCL>* inertias,btOpenCLArray<btContact4>* manifoldPtr,const btJacobiSolverInfo& solverInfo);
//void solveGroupMixed(btOpenCLArray<btRigidBodyCL>* bodies,btOpenCLArray<btInertiaCL>* inertias,btOpenCLArray<btContact4>* manifoldPtr,const btJacobiSolverInfo& solverInfo);
};
#endif //B3_GPU_JACOBI_CONTACT_SOLVER_H

View file

@ -1,101 +0,0 @@
#ifndef B3_GPU_NARROWPHASE_H
#define B3_GPU_NARROWPHASE_H
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Common/b3Vector3.h"
// GPU narrowphase: owns the collision-shape registry (convex hulls, spheres,
// planes, compounds, concave meshes), the rigid-body buffers mirrored between
// CPU and GPU, and the contact-generation kernels. register* methods return
// an index identifying the shape/collidable/body for later reference.
class b3GpuNarrowPhase
{
protected:
// Opaque CPU/GPU buffer bundle (see b3GpuNarrowPhaseInternalData).
struct b3GpuNarrowPhaseInternalData* m_data;
int m_acceleratedCompanionShapeIndex;
int m_planeBodyIndex;
// Index of the shared static "world" body.
int m_static0Index;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
int registerConvexHullShapeInternal(class b3ConvexUtility* convexPtr, b3Collidable& col);
int registerConcaveMeshShape(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, b3Collidable& col, const float* scaling);
public:
b3GpuNarrowPhase(cl_context vtx, cl_device_id dev, cl_command_queue q, const struct b3Config& config);
virtual ~b3GpuNarrowPhase(void);
// --- shape registration (returns a collidable/shape index) ---
int registerSphereShape(float radius);
int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant);
int registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes);
int registerFace(const b3Vector3& faceNormal, float faceConstant);
int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, const float* scaling);
//do they need to be merged?
int registerConvexHullShape(b3ConvexUtility* utilPtr);
int registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
// --- body registration and transform sync ---
int registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation, const float* aabbMin, const float* aabbMax, bool writeToGpu);
void setObjectTransform(const float* position, const float* orientation, int bodyIndex);
// Upload all host-side body data to the GPU (call after registration/edits).
void writeAllBodiesToGpu();
void reset();
// Download body data back to the host for CPU-side queries.
void readbackAllBodiesToCpu();
bool getObjectTransformFromCpu(float* position, float* orientation, int bodyIndex) const;
void setObjectTransformCpu(float* position, float* orientation, int bodyIndex);
void setObjectVelocityCpu(float* linVel, float* angVel, int bodyIndex);
// Run contact generation for the broadphase's overlapping pairs.
virtual void computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbsWorldSpace, int numObjects);
// --- buffer accessors (GPU handles and CPU mirrors) ---
cl_mem getBodiesGpu();
const struct b3RigidBodyData* getBodiesCpu() const;
//struct b3RigidBodyData* getBodiesCpu();
int getNumBodiesGpu() const;
cl_mem getBodyInertiasGpu();
int getNumBodyInertiasGpu() const;
cl_mem getCollidablesGpu();
const struct b3Collidable* getCollidablesCpu() const;
int getNumCollidablesGpu() const;
const struct b3SapAabb* getLocalSpaceAabbsCpu() const;
const struct b3Contact4* getContactsCPU() const;
cl_mem getContactsGpu();
int getNumContactsGpu() const;
cl_mem getAabbLocalSpaceBufferGpu();
int getNumRigidBodies() const;
int allocateCollidable();
int getStatic0Index() const
{
return m_static0Index;
}
b3Collidable& getCollidableCpu(int collidableIndex);
const b3Collidable& getCollidableCpu(int collidableIndex) const;
const b3GpuNarrowPhaseInternalData* getInternalData() const
{
return m_data;
}
b3GpuNarrowPhaseInternalData* getInternalData()
{
return m_data;
}
const struct b3SapAabb& getLocalSpaceAabb(int collidableIndex) const;
};
#endif //B3_GPU_NARROWPHASE_H

View file

@ -1,89 +0,0 @@
#ifndef B3_GPU_NARROWPHASE_INTERNAL_DATA_H
#define B3_GPU_NARROWPHASE_INTERNAL_DATA_H
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
#include "Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h"
#include "Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h"
#include "Bullet3Common/shared/b3Int4.h"
#include "Bullet3Common/shared/b3Int2.h"
class b3ConvexUtility;
// All CPU-side arrays and their GPU mirrors owned by b3GpuNarrowPhase.
// Convention: plain b3AlignedObjectArray members are host copies, matching
// b3OpenCLArray pointers are the device buffers.
struct b3GpuNarrowPhaseInternalData
{
// --- convex hull geometry (host + device) ---
b3AlignedObjectArray<b3ConvexUtility*>* m_convexData;
b3AlignedObjectArray<b3ConvexPolyhedronData> m_convexPolyhedra;
b3AlignedObjectArray<b3Vector3> m_uniqueEdges;
b3AlignedObjectArray<b3Vector3> m_convexVertices;
b3AlignedObjectArray<int> m_convexIndices;
b3OpenCLArray<b3ConvexPolyhedronData>* m_convexPolyhedraGPU;
b3OpenCLArray<b3Vector3>* m_uniqueEdgesGPU;
b3OpenCLArray<b3Vector3>* m_convexVerticesGPU;
b3OpenCLArray<int>* m_convexIndicesGPU;
// Scratch buffers for SAT contact clipping.
b3OpenCLArray<b3Vector3>* m_worldVertsB1GPU;
b3OpenCLArray<b3Int4>* m_clippingFacesOutGPU;
b3OpenCLArray<b3Vector3>* m_worldNormalsAGPU;
b3OpenCLArray<b3Vector3>* m_worldVertsA1GPU;
b3OpenCLArray<b3Vector3>* m_worldVertsB2GPU;
// Compound shape children.
b3AlignedObjectArray<b3GpuChildShape> m_cpuChildShapes;
b3OpenCLArray<b3GpuChildShape>* m_gpuChildShapes;
b3AlignedObjectArray<b3GpuFace> m_convexFaces;
b3OpenCLArray<b3GpuFace>* m_convexFacesGPU;
// SAT collision kernels wrapper.
struct GpuSatCollision* m_gpuSatCollision;
b3OpenCLArray<b3Int4>* m_triangleConvexPairs;
// Double-buffered contact output; m_currentContactBuffer selects the
// active entry of the two GPU buffers.
b3OpenCLArray<b3Contact4>* m_pBufContactBuffersGPU[2];
int m_currentContactBuffer;
b3AlignedObjectArray<b3Contact4>* m_pBufContactOutCPU;
// --- rigid bodies and inertias ---
b3AlignedObjectArray<b3RigidBodyData>* m_bodyBufferCPU;
b3OpenCLArray<b3RigidBodyData>* m_bodyBufferGPU;
b3AlignedObjectArray<b3InertiaData>* m_inertiaBufferCPU;
b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU;
int m_numAcceleratedShapes;
int m_numAcceleratedRigidBodies;
b3AlignedObjectArray<b3Collidable> m_collidablesCPU;
b3OpenCLArray<b3Collidable>* m_collidablesGPU;
// Shape-local-space AABBs.
b3OpenCLArray<b3SapAabb>* m_localShapeAABBGPU;
b3AlignedObjectArray<b3SapAabb>* m_localShapeAABBCPU;
// --- concave mesh BVH data ---
b3AlignedObjectArray<class b3OptimizedBvh*> m_bvhData;
b3AlignedObjectArray<class b3TriangleIndexVertexArray*> m_meshInterfaces;
b3AlignedObjectArray<b3QuantizedBvhNode> m_treeNodesCPU;
b3AlignedObjectArray<b3BvhSubtreeInfo> m_subTreesCPU;
b3AlignedObjectArray<b3BvhInfo> m_bvhInfoCPU;
b3OpenCLArray<b3BvhInfo>* m_bvhInfoGPU;
b3OpenCLArray<b3QuantizedBvhNode>* m_treeNodesGPU;
b3OpenCLArray<b3BvhSubtreeInfo>* m_subTreesGPU;
b3Config m_config;
};
#endif //B3_GPU_NARROWPHASE_INTERNAL_DATA_H

View file

@ -1,76 +0,0 @@
/*
Copyright (c) 2013 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef B3_GPU_PGS_CONSTRAINT_SOLVER_H
#define B3_GPU_PGS_CONSTRAINT_SOLVER_H
struct b3Contact4;
struct b3ContactPoint;
class b3Dispatcher;
#include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h"
#include "Bullet3Dynamics/ConstraintSolver/b3ContactSolverInfo.h"
#include "b3GpuSolverBody.h"
#include "b3GpuSolverConstraint.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
struct b3RigidBodyData;
struct b3InertiaData;
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "b3GpuGenericConstraint.h"
// GPU projected Gauss-Seidel (PGS) solver for generic (joint) constraints.
// Follows the CPU sequential-impulse solver's setup/iterate/finish phases,
// batching independent constraints so each batch can be solved in parallel.
class b3GpuPgsConstraintSolver
{
protected:
// Index treated as the static "world" body during batching.
int m_staticIdx;
// Opaque GPU buffers/kernels (defined in the .cpp).
struct b3GpuPgsJacobiSolverInternalData* m_gpuData;
protected:
// Host-side solver pools mirroring the CPU solver layout.
b3AlignedObjectArray<b3GpuSolverBody> m_tmpSolverBodyPool;
b3GpuConstraintArray m_tmpSolverContactConstraintPool;
b3GpuConstraintArray m_tmpSolverNonContactConstraintPool;
b3GpuConstraintArray m_tmpSolverContactFrictionConstraintPool;
b3GpuConstraintArray m_tmpSolverContactRollingFrictionConstraintPool;
b3AlignedObjectArray<unsigned int> m_tmpConstraintSizesPool;
// true: Gauss-Seidel style updates; false: averaged (Jacobi-like) velocities.
bool m_usePgs;
void averageVelocities();
int m_maxOverrideNumSolverIterations;
int m_numSplitImpulseRecoveries;
// int getOrInitSolverBody(int bodyIndex, b3RigidBodyData* bodies,b3InertiaData* inertias);
void initSolverBody(int bodyIndex, b3GpuSolverBody* solverBody, b3RigidBodyData* rb);
public:
b3GpuPgsConstraintSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, bool usePgs);
virtual ~b3GpuPgsConstraintSolver();
// Solve phases (setup -> iterations -> finish); solveGroup drives all three.
virtual b3Scalar solveGroupCacheFriendlyIterations(b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints1, int numConstraints, const b3ContactSolverInfo& infoGlobal);
virtual b3Scalar solveGroupCacheFriendlySetup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal);
b3Scalar solveGroupCacheFriendlyFinish(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal);
b3Scalar solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal);
// Convenience entry point used by the rigid-body pipeline.
void solveJoints(int numBodies, b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias,
int numConstraints, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints);
// Greedy batching: groups constraints so no two in a batch share a body.
int sortConstraintByBatch3(struct b3BatchConstraint* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies);
void recomputeBatches();
};
#endif //B3_GPU_PGS_CONSTRAINT_SOLVER_H

View file

@ -1,37 +0,0 @@
#ifndef B3_GPU_BATCHING_PGS_SOLVER_H
#define B3_GPU_BATCHING_PGS_SOLVER_H
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
#include "b3GpuConstraint4.h"
// GPU batching PGS solver for contact constraints (b3Contact4).
// Declaration only; method bodies live in the matching .cpp.
class b3GpuPgsContactSolver
{
protected:
int m_debugOutput;
// opaque internal data (OpenCL buffers/kernels), defined in the .cpp
struct b3GpuBatchingPgsSolverInternalData* m_data;
// split contacts into independent batches so they can be solved in parallel
void batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx);
// alternative host-side batching strategies (kept for comparison/debugging)
inline int sortConstraintByBatch(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies);
inline int sortConstraintByBatch2(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies);
inline int sortConstraintByBatch3(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies, int* batchSizes);
void solveContactConstraintBatchSizes(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes); //const b3OpenCLArray<int>* gpuBatchSizes);
void solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes); //const b3OpenCLArray<int>* gpuBatchSizes);
public:
b3GpuPgsContactSolver(cl_context ctx, cl_device_id device, cl_command_queue q, int pairCapacity);
virtual ~b3GpuPgsContactSolver();
// solve all contacts between the given GPU body/inertia/contact buffers
void solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index);
};
#endif //B3_GPU_BATCHING_PGS_SOLVER_H

View file

@ -1,677 +0,0 @@
/*
Copyright (c) 2013 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#include "b3GpuRigidBodyPipeline.h"
#include "b3GpuRigidBodyPipelineInternalData.h"
#include "kernels/integrateKernel.h"
#include "kernels/updateAabbsKernel.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "b3GpuNarrowPhase.h"
#include "Bullet3Geometry/b3AabbUtil.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3Dynamics/ConstraintSolver/b3PgsJacobiSolver.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3UpdateAabbs.h"
#include "Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.h"
//#define TEST_OTHER_GPU_SOLVER
#define B3_RIGIDBODY_INTEGRATE_PATH "src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl"
#define B3_RIGIDBODY_UPDATEAABB_PATH "src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl"
// Global debug/tuning toggles for the GPU rigid-body pipeline.
// when true, joint constraints may fall back to the CPU b3PgsJacobiSolver
bool useBullet2CpuSolver = true;
//choice of contact solver
bool gUseJacobi = false;
// use the CPU dynamic-BVH broadphase instead of the GPU SAP broadphase
bool gUseDbvt = false;
// print per-step contact/point counts (see stepSimulation)
bool gDumpContactStats = false;
// compute world-space AABBs on the host instead of the GPU kernel
bool gCalcWorldSpaceAabbOnCpu = false;
// run the SAP pair search on the host instead of the GPU
bool gUseCalculateOverlappingPairsHost = false;
// integrate transforms on the host instead of the GPU kernel
bool gIntegrateOnCpu = false;
// clear pair caches with a GPU kernel rather than a host round-trip
bool gClearPairsOnGpu = true;
#define TEST_OTHER_GPU_SOLVER 1
#ifdef TEST_OTHER_GPU_SOLVER
#include "b3GpuJacobiContactSolver.h"
#endif //TEST_OTHER_GPU_SOLVER
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
#include "Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.h"
#include "b3GpuPgsContactSolver.h"
#include "b3Solver.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
#include "Bullet3OpenCL/Raycast/b3GpuRaycast.h"
#include "Bullet3Dynamics/shared/b3IntegrateTransforms.h"
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h"
// Constructs the GPU rigid-body pipeline: stores the supplied narrowphase and
// broadphases (not owned), allocates the solvers and GPU-side arrays (sized
// from config), and compiles the integrate / update-AABB OpenCL kernels.
b3GpuRigidBodyPipeline::b3GpuRigidBodyPipeline(cl_context ctx, cl_device_id device, cl_command_queue q, class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap, struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config)
{
m_data = new b3GpuRigidBodyPipelineInternalData;
m_data->m_constraintUid = 0;
m_data->m_config = config;
m_data->m_context = ctx;
m_data->m_device = device;
m_data->m_queue = q;
// CPU joint solver plus GPU PGS joint solver (selected at step time)
m_data->m_solver = new b3PgsJacobiSolver(true); //new b3PgsJacobiSolver(true);
m_data->m_gpuSolver = new b3GpuPgsConstraintSolver(ctx, device, q, true); //new b3PgsJacobiSolver(true);
// device arrays sized from the configured capacities
m_data->m_allAabbsGPU = new b3OpenCLArray<b3SapAabb>(ctx, q, config.m_maxConvexBodies);
m_data->m_overlappingPairsGPU = new b3OpenCLArray<b3BroadphasePair>(ctx, q, config.m_maxBroadphasePairs);
m_data->m_gpuConstraints = new b3OpenCLArray<b3GpuGenericConstraint>(ctx, q);
#ifdef TEST_OTHER_GPU_SOLVER
m_data->m_solver3 = new b3GpuJacobiContactSolver(ctx, device, q, config.m_maxBroadphasePairs);
#endif // TEST_OTHER_GPU_SOLVER
m_data->m_solver2 = new b3GpuPgsContactSolver(ctx, device, q, config.m_maxBroadphasePairs);
m_data->m_raycaster = new b3GpuRaycast(ctx, device, q);
m_data->m_broadphaseDbvt = broadphaseDbvt;
m_data->m_broadphaseSap = broadphaseSap;
m_data->m_narrowphase = narrowphase;
// default gravity; can be changed later via setGravity()
m_data->m_gravity.setValue(0.f, -9.8f, 0.f);
cl_int errNum = 0;
{
// compile the transform-integration kernel from its embedded source string
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context, m_data->m_device, integrateKernelCL, &errNum, "", B3_RIGIDBODY_INTEGRATE_PATH);
b3Assert(errNum == CL_SUCCESS);
m_data->m_integrateTransformsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, integrateKernelCL, "integrateTransformsKernel", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
// release the program handle; created kernels stay valid (OpenCL refcounting)
clReleaseProgram(prog);
}
{
// compile the AABB-update program: full AABB refresh + pair-cache clearing
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context, m_data->m_device, updateAabbsKernelCL, &errNum, "", B3_RIGIDBODY_UPDATEAABB_PATH);
b3Assert(errNum == CL_SUCCESS);
m_data->m_updateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, updateAabbsKernelCL, "initializeGpuAabbsFull", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
m_data->m_clearOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, updateAabbsKernelCL, "clearOverlappingPairsKernel", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
clReleaseProgram(prog);
}
}
// Destructor: releases the compiled OpenCL kernels (guarded: compilation may
// have failed), destroys the owned solvers and GPU arrays, then frees the
// internal data block. The narrowphase and the broadphases passed to the
// constructor are NOT owned by the pipeline and are left untouched.
b3GpuRigidBodyPipeline::~b3GpuRigidBodyPipeline()
{
	if (m_data->m_integrateTransformsKernel)
		clReleaseKernel(m_data->m_integrateTransformsKernel);

	if (m_data->m_updateAabbsKernel)
		clReleaseKernel(m_data->m_updateAabbsKernel);

	if (m_data->m_clearOverlappingPairsKernel)
		clReleaseKernel(m_data->m_clearOverlappingPairsKernel);
	delete m_data->m_raycaster;
	delete m_data->m_solver;
	// BUGFIX: m_gpuSolver is allocated in the constructor but was never freed (leak)
	delete m_data->m_gpuSolver;
	delete m_data->m_allAabbsGPU;
	delete m_data->m_gpuConstraints;
	delete m_data->m_overlappingPairsGPU;

#ifdef TEST_OTHER_GPU_SOLVER
	delete m_data->m_solver3;
#endif  //TEST_OTHER_GPU_SOLVER

	delete m_data->m_solver2;
	delete m_data;
}
// Clears the pipeline's world state: all constraints and all world-space
// AABBs, on both host and device. Registered bodies in the narrowphase and
// broadphase proxies are not affected here.
void b3GpuRigidBodyPipeline::reset()
{
	// constraints: host mirror and GPU array
	m_data->m_cpuConstraints.resize(0);
	m_data->m_gpuConstraints->resize(0);
	// world-space AABBs: host mirror and GPU array
	m_data->m_allAabbsCPU.resize(0);
	m_data->m_allAabbsGPU->resize(0);
}
// Registers a Bullet2-style typed constraint; these are solved on the CPU by
// m_solver in stepSimulation (the pointer is stored, not copied or owned).
void b3GpuRigidBodyPipeline::addConstraint(b3TypedConstraint* constraint)
{
m_data->m_joints.push_back(constraint);
}
// Unregisters a previously added typed constraint (does not delete it).
void b3GpuRigidBodyPipeline::removeConstraint(b3TypedConstraint* constraint)
{
m_data->m_joints.remove(constraint);
}
// Removes the GPU generic constraint with the given uid (as returned by
// createPoint2PointConstraint / createFixedConstraint). Round-trips the
// constraint array through the host, removes by swap-and-pop, and writes the
// result back to the GPU.
void b3GpuRigidBodyPipeline::removeConstraintByUid(int uid)
{
	// the constraint set changes, so previously computed solver batches are stale
	m_data->m_gpuSolver->recomputeBatches();

	// bring constraints to the host for a linear search by uid
	m_data->m_gpuConstraints->copyToHost(m_data->m_cpuConstraints);

	for (int idx = 0; idx < m_data->m_cpuConstraints.size(); idx++)
	{
		if (m_data->m_cpuConstraints[idx].m_uid != uid)
			continue;
		// swap-and-pop removal (order of remaining constraints may change)
		m_data->m_cpuConstraints.swap(idx, m_data->m_cpuConstraints.size() - 1);
		m_data->m_cpuConstraints.pop_back();
		break;
	}

	if (m_data->m_cpuConstraints.size())
	{
		m_data->m_gpuConstraints->copyFromHost(m_data->m_cpuConstraints);
	}
	else
	{
		m_data->m_gpuConstraints->resize(0);
	}
}
int b3GpuRigidBodyPipeline::createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, float breakingThreshold)
{
m_data->m_gpuSolver->recomputeBatches();
b3GpuGenericConstraint c;
c.m_uid = m_data->m_constraintUid;
m_data->m_constraintUid++;
c.m_flags = B3_CONSTRAINT_FLAG_ENABLED;
c.m_rbA = bodyA;
c.m_rbB = bodyB;
c.m_pivotInA.setValue(pivotInA[0], pivotInA[1], pivotInA[2]);
c.m_pivotInB.setValue(pivotInB[0], pivotInB[1], pivotInB[2]);
c.m_breakingImpulseThreshold = breakingThreshold;
c.m_constraintType = B3_GPU_POINT2POINT_CONSTRAINT_TYPE;
m_data->m_cpuConstraints.push_back(c);
return c.m_uid;
}
int b3GpuRigidBodyPipeline::createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB, float breakingThreshold)
{
m_data->m_gpuSolver->recomputeBatches();
b3GpuGenericConstraint c;
c.m_uid = m_data->m_constraintUid;
m_data->m_constraintUid++;
c.m_flags = B3_CONSTRAINT_FLAG_ENABLED;
c.m_rbA = bodyA;
c.m_rbB = bodyB;
c.m_pivotInA.setValue(pivotInA[0], pivotInA[1], pivotInA[2]);
c.m_pivotInB.setValue(pivotInB[0], pivotInB[1], pivotInB[2]);
c.m_relTargetAB.setValue(relTargetAB[0], relTargetAB[1], relTargetAB[2], relTargetAB[3]);
c.m_breakingImpulseThreshold = breakingThreshold;
c.m_constraintType = B3_GPU_FIXED_CONSTRAINT_TYPE;
m_data->m_cpuConstraints.push_back(c);
return c.m_uid;
}
// Advances the simulation by deltaTime:
//   1. refresh world-space AABBs (setupGpuAabbsFull)
//   2. broadphase: overlapping pairs via DBVT (CPU) or SAP (GPU)
//   3. narrowphase: contact generation on the GPU
//   4. solve joints (CPU or GPU PGS) and contacts (PGS or Jacobi)
//   5. integrate transforms
void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
{
	//update worldspace AABBs from local AABB/worldtransform
	{
		B3_PROFILE("setupGpuAabbs");
		setupGpuAabbsFull();
	}

	int numPairs = 0;

	//compute overlapping pairs
	{
		if (gUseDbvt)
		{
			{
				B3_PROFILE("setAabb");
				// DBVT runs on the host: pull AABBs back and feed them in one by one
				m_data->m_allAabbsGPU->copyToHost(m_data->m_allAabbsCPU);
				for (int i = 0; i < m_data->m_allAabbsCPU.size(); i++)
				{
					b3Vector3 aabbMin = b3MakeVector3(m_data->m_allAabbsCPU[i].m_min[0], m_data->m_allAabbsCPU[i].m_min[1], m_data->m_allAabbsCPU[i].m_min[2]);
					b3Vector3 aabbMax = b3MakeVector3(m_data->m_allAabbsCPU[i].m_max[0], m_data->m_allAabbsCPU[i].m_max[1], m_data->m_allAabbsCPU[i].m_max[2]);
					m_data->m_broadphaseDbvt->setAabb(i, aabbMin, aabbMax, 0);
				}
			}

			{
				B3_PROFILE("calculateOverlappingPairs");
				m_data->m_broadphaseDbvt->calculateOverlappingPairs();
			}
			numPairs = m_data->m_broadphaseDbvt->getOverlappingPairCache()->getNumOverlappingPairs();
		}
		else
		{
			if (gUseCalculateOverlappingPairsHost)
			{
				m_data->m_broadphaseSap->calculateOverlappingPairsHost(m_data->m_config.m_maxBroadphasePairs);
			}
			else
			{
				m_data->m_broadphaseSap->calculateOverlappingPairs(m_data->m_config.m_maxBroadphasePairs);
			}
			numPairs = m_data->m_broadphaseSap->getNumOverlap();
		}
	}

	//compute contact points
	int numContacts = 0;
	int numBodies = m_data->m_narrowphase->getNumRigidBodies();

	if (numPairs)
	{
		cl_mem pairs = 0;
		cl_mem aabbsWS = 0;
		if (gUseDbvt)
		{
			B3_PROFILE("m_overlappingPairsGPU->copyFromHost");
			m_data->m_overlappingPairsGPU->copyFromHost(m_data->m_broadphaseDbvt->getOverlappingPairCache()->getOverlappingPairArray());
			pairs = m_data->m_overlappingPairsGPU->getBufferCL();
			aabbsWS = m_data->m_allAabbsGPU->getBufferCL();
		}
		else
		{
			pairs = m_data->m_broadphaseSap->getOverlappingPairBuffer();
			aabbsWS = m_data->m_broadphaseSap->getAabbBufferWS();
		}

		m_data->m_overlappingPairsGPU->resize(numPairs);

		//mark the contacts for each pair as 'unused'
		if (numPairs)
		{
			b3OpenCLArray<b3BroadphasePair> gpuPairs(this->m_data->m_context, m_data->m_queue);
			gpuPairs.setFromOpenCLBuffer(pairs, numPairs);

			if (gClearPairsOnGpu)
			{
				b3LauncherCL launcher(m_data->m_queue, m_data->m_clearOverlappingPairsKernel, "clearOverlappingPairsKernel");
				launcher.setBuffer(pairs);
				launcher.setConst(numPairs);
				launcher.launch1D(numPairs);
			}
			else
			{
				// host fallback: the 'z' component caches the contact slot; 0xffffffff = unused
				b3AlignedObjectArray<b3BroadphasePair> hostPairs;
				gpuPairs.copyToHost(hostPairs);
				for (int i = 0; i < hostPairs.size(); i++)
				{
					hostPairs[i].z = 0xffffffff;
				}
				gpuPairs.copyFromHost(hostPairs);
			}
		}

		m_data->m_narrowphase->computeContacts(pairs, numPairs, aabbsWS, numBodies);
		numContacts = m_data->m_narrowphase->getNumContactsGpu();

		if (gUseDbvt)
		{
			///store the cached information (contact locations in the 'z' component)
			B3_PROFILE("m_overlappingPairsGPU->copyToHost");
			m_data->m_overlappingPairsGPU->copyToHost(m_data->m_broadphaseDbvt->getOverlappingPairCache()->getOverlappingPairArray());
		}

		if (gDumpContactStats && numContacts)
		{
			m_data->m_narrowphase->getContactsGpu();
			printf("numContacts = %d\n", numContacts);

			int totalPoints = 0;
			const b3Contact4* contacts = m_data->m_narrowphase->getContactsCPU();
			for (int i = 0; i < numContacts; i++)
			{
				// BUGFIX: index the i-th contact; the original summed
				// contacts->getNPoints() (always contact 0), so totalPoints
				// was numContacts * points-of-first-contact.
				totalPoints += contacts[i].getNPoints();
			}
			printf("totalPoints=%d\n", totalPoints);
		}
	}

	//convert contact points to contact constraints, then solve
	// wrap the narrowphase's device buffers without taking ownership
	b3OpenCLArray<b3RigidBodyData> gpuBodies(m_data->m_context, m_data->m_queue, 0, true);
	gpuBodies.setFromOpenCLBuffer(m_data->m_narrowphase->getBodiesGpu(), m_data->m_narrowphase->getNumRigidBodies());
	b3OpenCLArray<b3InertiaData> gpuInertias(m_data->m_context, m_data->m_queue, 0, true);
	gpuInertias.setFromOpenCLBuffer(m_data->m_narrowphase->getBodyInertiasGpu(), m_data->m_narrowphase->getNumRigidBodies());
	b3OpenCLArray<b3Contact4> gpuContacts(m_data->m_context, m_data->m_queue, 0, true);
	gpuContacts.setFromOpenCLBuffer(m_data->m_narrowphase->getContactsGpu(), m_data->m_narrowphase->getNumContactsGpu());

	int numJoints = m_data->m_joints.size() ? m_data->m_joints.size() : m_data->m_cpuConstraints.size();
	if (useBullet2CpuSolver && numJoints)
	{
		{
			// GPU joint solver only when no Bullet2-style CPU joints are registered
			bool useGpu = m_data->m_joints.size() == 0;
			if (useGpu)
			{
				m_data->m_gpuSolver->solveJoints(m_data->m_narrowphase->getNumRigidBodies(), &gpuBodies, &gpuInertias, numJoints, m_data->m_gpuConstraints);
			}
			else
			{
				// CPU path: round-trip bodies/inertias through host memory
				b3AlignedObjectArray<b3RigidBodyData> hostBodies;
				gpuBodies.copyToHost(hostBodies);
				b3AlignedObjectArray<b3InertiaData> hostInertias;
				gpuInertias.copyToHost(hostInertias);

				b3TypedConstraint** joints = numJoints ? &m_data->m_joints[0] : 0;
				m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumRigidBodies(), &hostBodies[0], &hostInertias[0], 0, 0, numJoints, joints);
				gpuBodies.copyFromHost(hostBodies);
			}
		}
	}

	if (numContacts)
	{
#ifdef TEST_OTHER_GPU_SOLVER
		if (gUseJacobi)
		{
			bool useGpu = true;
			if (useGpu)
			{
				bool forceHost = false;
				if (forceHost)
				{
					// debug path: run the Jacobi solver entirely on the host
					b3AlignedObjectArray<b3RigidBodyData> hostBodies;
					b3AlignedObjectArray<b3InertiaData> hostInertias;
					b3AlignedObjectArray<b3Contact4> hostContacts;
					{
						B3_PROFILE("copyToHost");
						gpuBodies.copyToHost(hostBodies);
						gpuInertias.copyToHost(hostInertias);
						gpuContacts.copyToHost(hostContacts);
					}
					{
						b3JacobiSolverInfo solverInfo;
						m_data->m_solver3->solveGroupHost(&hostBodies[0], &hostInertias[0], hostBodies.size(), &hostContacts[0], hostContacts.size(), solverInfo);
					}
					{
						B3_PROFILE("copyFromHost");
						gpuBodies.copyFromHost(hostBodies);
					}
				}
				else
				{
					int static0Index = m_data->m_narrowphase->getStatic0Index();
					b3JacobiSolverInfo solverInfo;
					m_data->m_solver3->solveContacts(numBodies, gpuBodies.getBufferCL(), gpuInertias.getBufferCL(), numContacts, gpuContacts.getBufferCL(), m_data->m_config, static0Index);
				}
			}
			else
			{
				// disabled host path kept for reference; only round-trips the data
				b3AlignedObjectArray<b3RigidBodyData> hostBodies;
				gpuBodies.copyToHost(hostBodies);
				b3AlignedObjectArray<b3InertiaData> hostInertias;
				gpuInertias.copyToHost(hostInertias);
				b3AlignedObjectArray<b3Contact4> hostContacts;
				gpuContacts.copyToHost(hostContacts);
				{
					//m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],numContacts,&hostContacts[0]);
				}
				gpuBodies.copyFromHost(hostBodies);
			}
		}
		else
#endif  //TEST_OTHER_GPU_SOLVER
		{
			// default: GPU batching PGS contact solver
			int static0Index = m_data->m_narrowphase->getStatic0Index();
			m_data->m_solver2->solveContacts(numBodies, gpuBodies.getBufferCL(), gpuInertias.getBufferCL(), numContacts, gpuContacts.getBufferCL(), m_data->m_config, static0Index);
		}
	}

	integrate(deltaTime);
}
// Integrates all rigid-body transforms by timeStep, applying gravity and a
// fixed angular damping factor. Runs on the host (gIntegrateOnCpu) or via the
// integrateTransformsKernel on the GPU.
void b3GpuRigidBodyPipeline::integrate(float timeStep)
{
	const int numBodies = m_data->m_narrowphase->getNumRigidBodies();
	const float angularDamping = 0.99f;

	if (gIntegrateOnCpu)
	{
		if (numBodies)
		{
			// host path: pull bodies down, integrate each, push them back
			b3GpuNarrowPhaseInternalData* npData = m_data->m_narrowphase->getInternalData();
			npData->m_bodyBufferGPU->copyToHost(*npData->m_bodyBufferCPU);

			b3RigidBodyData_t* bodies = &npData->m_bodyBufferCPU->at(0);
			for (int bodyId = 0; bodyId < numBodies; bodyId++)
			{
				integrateSingleTransform(bodies, bodyId, timeStep, angularDamping, m_data->m_gravity);
			}

			npData->m_bodyBufferGPU->copyFromHost(*npData->m_bodyBufferCPU);
		}
	}
	else
	{
		// GPU path: one work item per body
		b3LauncherCL launcher(m_data->m_queue, m_data->m_integrateTransformsKernel, "m_integrateTransformsKernel");
		launcher.setBuffer(m_data->m_narrowphase->getBodiesGpu());
		launcher.setConst(numBodies);
		launcher.setConst(timeStep);
		launcher.setConst(angularDamping);
		launcher.setConst(m_data->m_gravity);
		launcher.launch1D(numBodies);
	}
}
// Recomputes every body's world-space AABB from its local-space AABB and
// current transform, either on the host (gCalcWorldSpaceAabbOnCpu) or with
// the initializeGpuAabbsFull kernel. The destination buffer depends on the
// active broadphase (DBVT: m_allAabbsGPU, SAP: the broadphase's own buffer).
void b3GpuRigidBodyPipeline::setupGpuAabbsFull()
{
cl_int ciErrNum = 0;
int numBodies = m_data->m_narrowphase->getNumRigidBodies();
if (!numBodies)
return;
if (gCalcWorldSpaceAabbOnCpu)
{
if (numBodies)
{
if (gUseDbvt)
{
// host path for the DBVT broadphase: fill the pipeline's own AABB arrays
m_data->m_allAabbsCPU.resize(numBodies);
m_data->m_narrowphase->readbackAllBodiesToCpu();
for (int i = 0; i < numBodies; i++)
{
b3ComputeWorldAabb(i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(), &m_data->m_allAabbsCPU[0]);
}
m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU);
}
else
{
// host path for the SAP broadphase: fill the broadphase's AABB arrays
m_data->m_broadphaseSap->getAllAabbsCPU().resize(numBodies);
m_data->m_narrowphase->readbackAllBodiesToCpu();
for (int i = 0; i < numBodies; i++)
{
b3ComputeWorldAabb(i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(), &m_data->m_broadphaseSap->getAllAabbsCPU()[0]);
}
m_data->m_broadphaseSap->getAllAabbsGPU().copyFromHost(m_data->m_broadphaseSap->getAllAabbsCPU());
//m_data->m_broadphaseSap->writeAabbsToGpu();
}
}
}
else
{
// GPU path; argument order must match the kernel signature below
//__kernel void initializeGpuAabbsFull( const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global b3AABBCL* plocalShapeAABB, __global b3AABBCL* pAABB)
b3LauncherCL launcher(m_data->m_queue, m_data->m_updateAabbsKernel, "m_updateAabbsKernel");
launcher.setConst(numBodies);
cl_mem bodies = m_data->m_narrowphase->getBodiesGpu();
launcher.setBuffer(bodies);
cl_mem collidables = m_data->m_narrowphase->getCollidablesGpu();
launcher.setBuffer(collidables);
cl_mem localAabbs = m_data->m_narrowphase->getAabbLocalSpaceBufferGpu();
launcher.setBuffer(localAabbs);
cl_mem worldAabbs = 0;
if (gUseDbvt)
{
worldAabbs = m_data->m_allAabbsGPU->getBufferCL();
}
else
{
worldAabbs = m_data->m_broadphaseSap->getAabbBufferWS();
}
launcher.setBuffer(worldAabbs);
launcher.launch1D(numBodies);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
/*
b3AlignedObjectArray<b3SapAabb> aabbs;
m_data->m_broadphaseSap->m_allAabbsGPU.copyToHost(aabbs);
printf("numAabbs = %d\n", aabbs.size());
for (int i=0;i<aabbs.size();i++)
{
printf("aabb[%d].m_min=%f,%f,%f,%d\n",i,aabbs[i].m_minVec[0],aabbs[i].m_minVec[1],aabbs[i].m_minVec[2],aabbs[i].m_minIndices[3]);
printf("aabb[%d].m_max=%f,%f,%f,%d\n",i,aabbs[i].m_maxVec[0],aabbs[i].m_maxVec[1],aabbs[i].m_maxVec[2],aabbs[i].m_signedMaxIndices[3]);
};
*/
}
// Returns the OpenCL device buffer holding all rigid-body data.
cl_mem b3GpuRigidBodyPipeline::getBodyBuffer()
{
return m_data->m_narrowphase->getBodiesGpu();
}
// Returns the number of rigid bodies registered with the narrowphase.
int b3GpuRigidBodyPipeline::getNumBodies() const
{
return m_data->m_narrowphase->getNumRigidBodies();
}
// Sets the global gravity vector; grav must point to at least 3 floats.
void b3GpuRigidBodyPipeline::setGravity(const float* grav)
{
m_data->m_gravity.setValue(grav[0], grav[1], grav[2]);
}
// Refreshes the host-side mirror of the GPU generic-constraint array.
void b3GpuRigidBodyPipeline::copyConstraintsToHost()
{
m_data->m_gpuConstraints->copyToHost(m_data->m_cpuConstraints);
}
// Uploads the host-side AABBs and constraints to the GPU. Call this once
// after registering instances with writeInstanceToGpu == false.
void b3GpuRigidBodyPipeline::writeAllInstancesToGpu()
{
m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU);
m_data->m_gpuConstraints->copyFromHost(m_data->m_cpuConstraints);
}
// Registers one rigid-body instance for an already-registered collidable.
// Computes the world-space AABB from the collidable's local AABB and the
// given position/orientation, registers the body with the narrowphase, and
// creates a proxy in the active broadphase (DBVT, or SAP where zero-mass
// bodies go into the 'large proxy' set). Returns the new body index, or -1
// when collidableIndex is invalid. Note: userIndex is currently unused.
int b3GpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* position, const float* orientation, int collidableIndex, int userIndex, bool writeInstanceToGpu)
{
	b3Vector3 aabbMin = b3MakeVector3(0, 0, 0), aabbMax = b3MakeVector3(0, 0, 0);

	if (collidableIndex >= 0)
	{
		b3SapAabb localAabb = m_data->m_narrowphase->getLocalSpaceAabb(collidableIndex);
		b3Vector3 localAabbMin = b3MakeVector3(localAabb.m_min[0], localAabb.m_min[1], localAabb.m_min[2]);
		b3Vector3 localAabbMax = b3MakeVector3(localAabb.m_max[0], localAabb.m_max[1], localAabb.m_max[2]);

		// transform the local AABB into world space, padded by a small margin
		b3Scalar margin = 0.01f;
		b3Transform t;
		t.setIdentity();
		t.setOrigin(b3MakeVector3(position[0], position[1], position[2]));
		t.setRotation(b3Quaternion(orientation[0], orientation[1], orientation[2], orientation[3]));
		b3TransformAabb(localAabbMin, localAabbMax, margin, t, aabbMin, aabbMax);
	}
	else
	{
		b3Error("registerPhysicsInstance using invalid collidableIndex\n");
		return -1;
	}

	// body data is uploaded later (writeAllInstancesToGpu), not here
	bool writeToGpu = false;
	// (the original also pre-read getNumRigidBodies() into bodyIndex — a dead
	// store immediately overwritten by registerRigidBody; removed)
	int bodyIndex = m_data->m_narrowphase->registerRigidBody(collidableIndex, mass, position, orientation, &aabbMin.getX(), &aabbMax.getX(), writeToGpu);

	if (bodyIndex >= 0)
	{
		if (gUseDbvt)
		{
			m_data->m_broadphaseDbvt->createProxy(aabbMin, aabbMax, bodyIndex, 0, 1, 1);
			b3SapAabb aabb;
			for (int i = 0; i < 3; i++)
			{
				aabb.m_min[i] = aabbMin[i];
				aabb.m_max[i] = aabbMax[i];
			}
			// body index rides in the 4th min component (hoisted: the original
			// re-assigned this loop-invariant value on every iteration)
			aabb.m_minIndices[3] = bodyIndex;
			m_data->m_allAabbsCPU.push_back(aabb);
			if (writeInstanceToGpu)
			{
				m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU);
			}
		}
		else
		{
			if (mass)
			{
				m_data->m_broadphaseSap->createProxy(aabbMin, aabbMax, bodyIndex, 1, 1);  //m_dispatcher);
			}
			else
			{
				m_data->m_broadphaseSap->createLargeProxy(aabbMin, aabbMax, bodyIndex, 1, 1);  //m_dispatcher);
			}
		}
	}

	return bodyIndex;
}
// Casts a batch of rays against the registered bodies using the GPU
// raycaster; results are written into hitResults.
void b3GpuRigidBodyPipeline::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults)
{
this->m_data->m_raycaster->castRays(rays, hitResults,
getNumBodies(), this->m_data->m_narrowphase->getBodiesCpu(),
m_data->m_narrowphase->getNumCollidablesGpu(), m_data->m_narrowphase->getCollidablesCpu(),
m_data->m_narrowphase->getInternalData(), m_data->m_broadphaseSap);
}

View file

@ -1,70 +0,0 @@
/*
Copyright (c) 2013 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef B3_GPU_RIGIDBODY_PIPELINE_H
#define B3_GPU_RIGIDBODY_PIPELINE_H
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h"
// Top-level GPU rigid-body simulation pipeline: owns the solvers, raycaster
// and GPU arrays; borrows the narrowphase and broadphases passed to the
// constructor. Drive it with stepSimulation().
class b3GpuRigidBodyPipeline
{
protected:
// opaque internal state (defined in b3GpuRigidBodyPipelineInternalData.h)
struct b3GpuRigidBodyPipelineInternalData* m_data;
int allocateCollidable();
public:
b3GpuRigidBodyPipeline(cl_context ctx, cl_device_id device, cl_command_queue q, class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap, struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config);
virtual ~b3GpuRigidBodyPipeline();
// advance the simulation: AABBs, broadphase, narrowphase, solve, integrate
void stepSimulation(float deltaTime);
void integrate(float timeStep);
void setupGpuAabbsFull();
int registerConvexPolyhedron(class b3ConvexUtility* convex);
//int registerConvexPolyhedron(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
//int registerSphereShape(float radius);
//int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant);
//int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, const float* scaling);
//int registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes);
int registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, int userData, bool writeInstanceToGpu);
//if you passed "writeInstanceToGpu" false in the registerPhysicsInstance method (for performance) you need to call writeAllInstancesToGpu after all instances are registered
void writeAllInstancesToGpu();
void copyConstraintsToHost();
void setGravity(const float* grav);
void reset();
// GPU generic constraints; both return the new constraint's uid
int createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, float breakingThreshold);
int createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB, float breakingThreshold);
void removeConstraintByUid(int uid);
// Bullet2-style typed constraints, solved on the CPU
void addConstraint(class b3TypedConstraint* constraint);
void removeConstraint(b3TypedConstraint* constraint);
void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults);
cl_mem getBodyBuffer();
int getNumBodies() const;
};
#endif //B3_GPU_RIGIDBODY_PIPELINE_H

View file

@ -1,68 +0,0 @@
/*
Copyright (c) 2013 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H
#define B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
#include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
#include "Bullet3Collision/BroadPhaseCollision/b3OverlappingPair.h"
#include "Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h"
// Internal state bag for b3GpuRigidBodyPipeline (pimpl). Owned objects are
// created/destroyed by the pipeline's ctor/dtor; the narrowphase and the two
// broadphases are borrowed.
struct b3GpuRigidBodyPipelineInternalData
{
// OpenCL execution context
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
// compiled kernels (released in the pipeline destructor)
cl_kernel m_integrateTransformsKernel;
cl_kernel m_updateAabbsKernel;
cl_kernel m_clearOverlappingPairsKernel;
// solvers: CPU joints, GPU joints, GPU PGS contacts, GPU Jacobi contacts
class b3PgsJacobiSolver* m_solver;
class b3GpuPgsConstraintSolver* m_gpuSolver;
class b3GpuPgsContactSolver* m_solver2;
class b3GpuJacobiContactSolver* m_solver3;
class b3GpuRaycast* m_raycaster;
// broadphases (borrowed): GPU SAP and CPU dynamic BVH
class b3GpuBroadphaseInterface* m_broadphaseSap;
struct b3DynamicBvhBroadphase* m_broadphaseDbvt;
// world-space AABBs, device array + host mirror
b3OpenCLArray<b3SapAabb>* m_allAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
b3OpenCLArray<b3BroadphasePair>* m_overlappingPairsGPU;
// GPU generic constraints, device array + host mirror
b3OpenCLArray<b3GpuGenericConstraint>* m_gpuConstraints;
b3AlignedObjectArray<b3GpuGenericConstraint> m_cpuConstraints;
// Bullet2-style typed constraints (borrowed pointers)
b3AlignedObjectArray<b3TypedConstraint*> m_joints;
// monotonically increasing uid source for generic constraints
int m_constraintUid;
class b3GpuNarrowPhase* m_narrowphase;
b3Vector3 m_gravity;
b3Config m_config;
};
#endif //B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H

View file

@ -1,210 +0,0 @@
/*
Copyright (c) 2013 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#ifndef B3_GPU_SOLVER_BODY_H
#define B3_GPU_SOLVER_BODY_H
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Common/b3Matrix3x3.h"
#include "Bullet3Common/b3AlignedAllocator.h"
#include "Bullet3Common/b3TransformUtil.h"
///Until we get other contributions, only use SIMD on Windows, when using Visual Studio 2008 or later, and not double precision
#ifdef B3_USE_SSE
#define USE_SIMD 1
#endif //
///The b3GpuSolverBody is an internal datastructure for the constraint solver. Only necessary data is packed to increase cache coherence/performance.
///Velocity corrections are accumulated in m_deltaLinearVelocity/m_deltaAngularVelocity and folded into the
///body velocities by writebackVelocity()/writebackVelocityAndTransform().
B3_ATTRIBUTE_ALIGNED16(struct)
b3GpuSolverBody
{
	B3_DECLARE_ALIGNED_ALLOCATOR();
	//	b3Transform m_worldTransformUnused;

	b3Vector3 m_deltaLinearVelocity;   //accumulated linear velocity correction, applied at writeback
	b3Vector3 m_deltaAngularVelocity;  //accumulated angular velocity correction, applied at writeback
	b3Vector3 m_angularFactor;         //per-axis scale applied to angular impulses (0 locks an axis)
	b3Vector3 m_linearFactor;          //per-axis scale applied to linear impulses
	b3Vector3 m_invMass;
	b3Vector3 m_pushVelocity;  //split-impulse linear velocity used for penetration recovery
	b3Vector3 m_turnVelocity;  //split-impulse angular velocity used for penetration recovery
	b3Vector3 m_linearVelocity;
	b3Vector3 m_angularVelocity;

	union {
		void* m_originalBody;  //null/0 marks a static body: the non-internal queries below return zero velocity
		int m_originalBodyIndex;
	};
	int padding[3];  //keep sizeof(b3GpuSolverBody) a 16-byte multiple for the aligned allocator

	/*
	void setWorldTransform(const b3Transform& worldTransform)
	{
		m_worldTransform = worldTransform;
	}

	const b3Transform& getWorldTransform() const
	{
		return m_worldTransform;
	}
	*/

	///Velocity of the point at rel_pos (relative to the center of mass), including the pending solver deltas.
	///Returns zero for static bodies (m_originalBody == 0).
	B3_FORCE_INLINE void getVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity) const
	{
		if (m_originalBody)
			velocity = m_linearVelocity + m_deltaLinearVelocity + (m_angularVelocity + m_deltaAngularVelocity).cross(rel_pos);
		else
			velocity.setValue(0, 0, 0);
	}

	///Angular velocity including the pending solver delta; zero for static bodies.
	B3_FORCE_INLINE void getAngularVelocity(b3Vector3 & angVel) const
	{
		if (m_originalBody)
			angVel = m_angularVelocity + m_deltaAngularVelocity;
		else
			angVel.setValue(0, 0, 0);
	}

	//Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position
	///No-op for static bodies.
	B3_FORCE_INLINE void applyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, const b3Scalar impulseMagnitude)
	{
		if (m_originalBody)
		{
			m_deltaLinearVelocity += linearComponent * impulseMagnitude * m_linearFactor;
			m_deltaAngularVelocity += angularComponent * (impulseMagnitude * m_angularFactor);
		}
	}

	///Accumulate a split (position-recovery) impulse into the push/turn velocities. No-op for static bodies.
	B3_FORCE_INLINE void internalApplyPushImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, b3Scalar impulseMagnitude)
	{
		if (m_originalBody)
		{
			m_pushVelocity += linearComponent * impulseMagnitude * m_linearFactor;
			m_turnVelocity += angularComponent * (impulseMagnitude * m_angularFactor);
		}
	}

	const b3Vector3& getDeltaLinearVelocity() const
	{
		return m_deltaLinearVelocity;
	}

	const b3Vector3& getDeltaAngularVelocity() const
	{
		return m_deltaAngularVelocity;
	}

	const b3Vector3& getPushVelocity() const
	{
		return m_pushVelocity;
	}

	const b3Vector3& getTurnVelocity() const
	{
		return m_turnVelocity;
	}

	////////////////////////////////////////////////
	///some internal methods, don't use them
	///Unlike the public accessors above, these skip the static-body (m_originalBody) check.

	b3Vector3& internalGetDeltaLinearVelocity()
	{
		return m_deltaLinearVelocity;
	}

	b3Vector3& internalGetDeltaAngularVelocity()
	{
		return m_deltaAngularVelocity;
	}

	const b3Vector3& internalGetAngularFactor() const
	{
		return m_angularFactor;
	}

	const b3Vector3& internalGetInvMass() const
	{
		return m_invMass;
	}

	void internalSetInvMass(const b3Vector3& invMass)
	{
		m_invMass = invMass;
	}

	b3Vector3& internalGetPushVelocity()
	{
		return m_pushVelocity;
	}

	b3Vector3& internalGetTurnVelocity()
	{
		return m_turnVelocity;
	}

	B3_FORCE_INLINE void internalGetVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity) const
	{
		velocity = m_linearVelocity + m_deltaLinearVelocity + (m_angularVelocity + m_deltaAngularVelocity).cross(rel_pos);
	}

	B3_FORCE_INLINE void internalGetAngularVelocity(b3Vector3 & angVel) const
	{
		angVel = m_angularVelocity + m_deltaAngularVelocity;
	}

	//Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position
	B3_FORCE_INLINE void internalApplyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, const b3Scalar impulseMagnitude)
	{
		//if (m_originalBody)
		{
			m_deltaLinearVelocity += linearComponent * impulseMagnitude * m_linearFactor;
			m_deltaAngularVelocity += angularComponent * (impulseMagnitude * m_angularFactor);
		}
	}

	///Fold the accumulated deltas into the body velocities (no static-body check).
	void writebackVelocity()
	{
		//if (m_originalBody>=0)
		{
			m_linearVelocity += m_deltaLinearVelocity;
			m_angularVelocity += m_deltaAngularVelocity;

			//m_originalBody->setCompanionId(-1);
		}
	}

	///Fold the accumulated deltas into the body velocities and (optionally) correct the
	///position/orientation using the split-impulse push/turn velocities.
	///NOTE: the transform integration is currently disabled (commented out), so timeStep and
	///splitImpulseTurnErp are unused for now.
	void writebackVelocityAndTransform(b3Scalar timeStep, b3Scalar splitImpulseTurnErp)
	{
		(void)timeStep;
		(void)splitImpulseTurnErp;  //only referenced by the disabled integration below

		if (m_originalBody)
		{
			m_linearVelocity += m_deltaLinearVelocity;
			m_angularVelocity += m_deltaAngularVelocity;

			//correct the position/orientation based on push/turn recovery
			if (m_pushVelocity[0] != 0.f || m_pushVelocity[1] != 0.f || m_pushVelocity[2] != 0.f || m_turnVelocity[0] != 0.f || m_turnVelocity[1] != 0.f || m_turnVelocity[2] != 0.f)
			{
				//	b3Quaternion orn = m_worldTransform.getRotation();
				//	b3Transform newTransform;
				//	b3TransformUtil::integrateTransform(m_worldTransform,m_pushVelocity,m_turnVelocity*splitImpulseTurnErp,timeStep,newTransform);
				//	m_worldTransform = newTransform;
			}
			//m_worldTransform.setRotation(orn);
			//m_originalBody->setCompanionId(-1);
		}
	}
};
#endif  //B3_GPU_SOLVER_BODY_H

Some files were not shown because too many files have changed in this diff Show more