Bullet Library v2.81

2026-04-23 13:25:36 +00:00 · 2013-07-04 20:50:16 +02:00 · 2013-07-04 20:50:16 +02:00 · 1eb94f4828
commit 1eb94f4828
parent 64fef8b2ad
462 changed files with 59613 additions and 8036 deletions
--- a/Engine/lib/bullet/src/BulletMultiThreaded/CMakeLists.txt
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/CMakeLists.txt
@ -1,71 +1,123 @@
 INCLUDE_DIRECTORIES(
 	${BULLET_PHYSICS_SOURCE_DIR}/src
-	${BULLET_PHYSICS_SOURCE_DIR}/src/BulletMultiThreaded/vectormath/scalar/cpp
+	${VECTOR_MATH_INCLUDE}
 )

-ADD_LIBRARY(BulletMultiThreaded
-		PlatformDefinitions.h
-		SpuFakeDma.cpp
-		SpuFakeDma.h
-		SpuSync.h
-		SpuDoubleBuffer.h
-		SpuLibspe2Support.cpp
-		SpuLibspe2Support.h
-		btThreadSupportInterface.cpp
-		btThreadSupportInterface.h
-		
-		Win32ThreadSupport.cpp
-		Win32ThreadSupport.h
-		PosixThreadSupport.cpp
-		PosixThreadSupport.h
-		SequentialThreadSupport.cpp
-		SequentialThreadSupport.h
-		SpuSampleTaskProcess.h
-		SpuSampleTaskProcess.cpp
+SET(BulletMultiThreaded_SRCS
+	SpuFakeDma.cpp
+	SpuLibspe2Support.cpp
+	btThreadSupportInterface.cpp
+	Win32ThreadSupport.cpp
+	PosixThreadSupport.cpp
+	SequentialThreadSupport.cpp
+	SpuSampleTaskProcess.cpp
+	SpuCollisionObjectWrapper.cpp 
+	SpuCollisionTaskProcess.cpp
+	SpuGatheringCollisionDispatcher.cpp
+	SpuContactManifoldCollisionAlgorithm.cpp
+	btParallelConstraintSolver.cpp
+	
+	#SPURS_PEGatherScatterTask/SpuPEGatherScatterTask.cpp
+	#SpuPEGatherScatterTaskProcess.cpp

-		SpuCollisionObjectWrapper.cpp 
-		SpuCollisionObjectWrapper.h 
-		SpuCollisionTaskProcess.h
-		SpuCollisionTaskProcess.cpp
-		SpuGatheringCollisionDispatcher.h
-		SpuGatheringCollisionDispatcher.cpp
-		SpuContactManifoldCollisionAlgorithm.cpp
-		SpuContactManifoldCollisionAlgorithm.h
-		SpuNarrowPhaseCollisionTask/Box.h
-		SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp
-		SpuNarrowPhaseCollisionTask/boxBoxDistance.h
-		SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
-		SpuNarrowPhaseCollisionTask/SpuContactResult.h
-		SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp
-		SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.h
-		SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h
-		SpuNarrowPhaseCollisionTask/SpuPreferredPenetrationDirections.h
-		SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
-		SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h
-		SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
-		SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h
+	SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp
+	SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
+	SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp
+	SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
+	SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp

-
-#Some GPU related stuff, mainly CUDA and perhaps OpenCL
-		btGpu3DGridBroadphase.cpp
-		btGpu3DGridBroadphase.h
-		btGpu3DGridBroadphaseSharedCode.h
-		btGpu3DGridBroadphaseSharedDefs.h
-		btGpu3DGridBroadphaseSharedTypes.h
-		btGpuDefines.h
-		btGpuUtilsSharedCode.h
-		btGpuUtilsSharedDefs.h
-
-#MiniCL provides a small subset of OpenCL
-		MiniCLTaskScheduler.cpp
-		MiniCLTaskScheduler.h
-		MiniCLTask/MiniCLTask.cpp
-		MiniCLTask/MiniCLTask.h
-		../MiniCL/cl.h
-		../MiniCL/cl_gl.h
-		../MiniCL/cl_platform.h
+	#Some GPU related stuff, mainly CUDA and perhaps OpenCL
+	btGpu3DGridBroadphase.cpp
 )

+# Header files that live directly in the BulletMultiThreaded source directory.
+# Only headers belong in this list: besides being handed to ADD_LIBRARY it is
+# used as the PUBLIC_HEADER property of the target for Apple framework builds,
+# so a .cpp entry would be packaged as if it were a public header.
+SET(Root_HDRS
+	PlatformDefinitions.h
+	PpuAddressSpace.h
+	SpuFakeDma.h
+	SpuDoubleBuffer.h
+	SpuLibspe2Support.h
+	btThreadSupportInterface.h
+	Win32ThreadSupport.h
+	PosixThreadSupport.h
+	SequentialThreadSupport.h
+	SpuSampleTaskProcess.h
+	SpuCollisionObjectWrapper.h 
+	SpuCollisionTaskProcess.h
+	SpuGatheringCollisionDispatcher.h
+	SpuContactManifoldCollisionAlgorithm.h
+	btParallelConstraintSolver.h
+
+	#SPURS_PEGatherScatterTask/SpuPEGatherScatterTask.h
+	#SpuPEGatherScatterTaskProcess.h
+
+	#Some GPU related stuff, mainly CUDA and perhaps OpenCL
+	btGpu3DGridBroadphase.h
+	btGpu3DGridBroadphaseSharedCode.h
+	btGpu3DGridBroadphaseSharedDefs.h
+	btGpu3DGridBroadphaseSharedTypes.h
+	btGpuDefines.h
+	btGpuUtilsSharedCode.h
+	btGpuUtilsSharedDefs.h
+)
+
+# Headers of the SpuNarrowPhaseCollisionTask sub-directory. They are kept in a
+# list of their own because the framework packaging code needs to give them a
+# separate MACOSX_PACKAGE_LOCATION.
+SET(SpuNarrowPhaseCollisionTask_HDRS
+	SpuNarrowPhaseCollisionTask/Box.h
+	SpuNarrowPhaseCollisionTask/boxBoxDistance.h
+	SpuNarrowPhaseCollisionTask/SpuContactResult.h
+	SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.h
+	SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h
+	SpuNarrowPhaseCollisionTask/SpuPreferredPenetrationDirections.h
+	SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h
+	SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h
+)
+
+# All headers of the library: the root directory ones followed by the
+# narrow-phase task ones.
+set(BulletMultiThreaded_HDRS ${Root_HDRS} ${SpuNarrowPhaseCollisionTask_HDRS})
+
+# The library target; the headers are listed next to the sources.
+add_library(BulletMultiThreaded
+	${BulletMultiThreaded_SRCS}
+	${BulletMultiThreaded_HDRS}
+)
+
+# Both the build version and the API version follow BULLET_VERSION.
+set_target_properties(BulletMultiThreaded
+	PROPERTIES VERSION ${BULLET_VERSION}
+)
+set_target_properties(BulletMultiThreaded
+	PROPERTIES SOVERSION ${BULLET_VERSION}
+)
+
+
+# Also process the CMakeLists.txt of the GPU soft body solvers.
+SUBDIRS(GpuSoftBodySolvers)
+
+
 IF (BUILD_SHARED_LIBS)
-	TARGET_LINK_LIBRARIES(BulletMultiThreaded BulletCollision)
+	# Link dependencies are only declared for shared library builds.
+	# On UNIX the library additionally links against pthread.
+	IF (UNIX)
+		TARGET_LINK_LIBRARIES(BulletMultiThreaded BulletDynamics BulletCollision pthread)
+	ELSE()
+		TARGET_LINK_LIBRARIES(BulletMultiThreaded BulletDynamics BulletCollision)
+	ENDIF()
 ENDIF (BUILD_SHARED_LIBS)
+
+
+# Install rules. They apply only when INSTALL_LIBS is on and the build is not
+# generating the distributable MSVC project files.
+if(INSTALL_LIBS AND NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+	# INSTALL of other files requires CMake 2.6
+	if(${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+		# An INSTALL_EXTRA_LIBS guard used to wrap this section; it is disabled.
+		if(APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			# Framework bundle: installed at the root of the install prefix.
+			install(TARGETS BulletMultiThreaded DESTINATION .)
+		else()
+			install(TARGETS BulletMultiThreaded DESTINATION lib${LIB_SUFFIX})
+			# Copy the *.h files of this directory tree, leaving out
+			# .svn and CMakeFiles directories.
+			install(
+				DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+				DESTINATION ${INCLUDE_INSTALL_DIR}
+				FILES_MATCHING
+				PATTERN "*.h"
+				PATTERN ".svn" EXCLUDE
+				PATTERN "CMakeFiles" EXCLUDE
+			)
+		endif()
+	endif()
+
+	if(APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		set_target_properties(BulletMultiThreaded PROPERTIES
+			FRAMEWORK true
+			PUBLIC_HEADER "${Root_HDRS}"
+		)
+		# Have to list out sub-directories manually:
+		set_property(
+			SOURCE ${SpuNarrowPhaseCollisionTask_HDRS}
+			PROPERTY MACOSX_PACKAGE_LOCATION Headers/SpuNarrowPhaseCollisionTask
+		)
+	endif()
+endif()
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/CMakeLists.txt
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/CMakeLists.txt
@ -0,0 +1,13 @@
+
+# Bullet headers are included relative to the src directory.
+include_directories(${BULLET_PHYSICS_SOURCE_DIR}/src)
+
+# The OpenCL solvers are always processed.
+subdirs(OpenCL)
+
+# The DirectX 11 solvers are processed only when USE_DX11 is set.
+if(USE_DX11)
+	subdirs(DX11)
+endif()
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/CMakeLists.txt
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/CMakeLists.txt
@ -0,0 +1,83 @@
+
+# Bullet headers are included relative to the src directory.
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src
+)
+
+# DXSDK_DIR mirrors the environment variable of the same name as seen at
+# configure time.
+# NOTE(review): nothing in this file reads DXSDK_DIR afterwards, and
+# DIRECTX_SDK_BASE_DIR is not set here -- presumably it is provided by an
+# enclosing CMakeLists.txt; confirm.
+SET(DXSDK_DIR $ENV{DXSDK_DIR})
+# User-overridable cache entry. The word after CACHE is the entry type and must
+# be one of BOOL, FILEPATH, PATH, STRING or INTERNAL; the last argument is the
+# help text.
+SET(DX11_INCLUDE_PATH  "${DIRECTX_SDK_BASE_DIR}/Include" CACHE PATH "Microsoft directX SDK include path")
+
+
+# DirectX headers, the solver code shared between back ends, and the vector
+# math headers.
+INCLUDE_DIRECTORIES(
+${DX11_INCLUDE_PATH} "../Shared/"
+${VECTOR_MATH_INCLUDE}
+)
+
+# Translation units of the DX11 soft body solver library.
+SET(BulletSoftBodyDX11Solvers_SRCS
+	btSoftBodySolver_DX11.cpp
+	btSoftBodySolver_DX11SIMDAware.cpp
+)
+
+# Headers of the library; also used as the PUBLIC_HEADER set for Apple
+# framework builds. One of them lives in the ../Shared directory.
+SET(BulletSoftBodyDX11Solvers_HDRS
+	btSoftBodySolver_DX11.h
+	btSoftBodySolver_DX11SIMDAware.h
+	../Shared/btSoftBodySolverData.h
+	btSoftBodySolverVertexData_DX11.h
+	btSoftBodySolverTriangleData_DX11.h
+	btSoftBodySolverLinkData_DX11.h
+	btSoftBodySolverLinkData_DX11SIMDAware.h
+	btSoftBodySolverBuffer_DX11.h
+	btSoftBodySolverVertexBuffer_DX11.h
+
+)
+
+# OpenCL and HLSL Shaders.
+# Build rules generated to stringify these into headers
+# which are needed by some of the sources
+# The entries are base names only; the directory and the .hlsl extension are
+# added by the loop that follows the list.
+SET(BulletSoftBodyDX11Solvers_Shaders
+	OutputToVertexArray
+	UpdateNormals
+	Integrate
+	UpdatePositions
+	UpdateNodes
+	ComputeBounds
+	SolvePositions
+	SolvePositionsSIMDBatched
+	SolveCollisionsAndUpdateVelocities
+	SolveCollisionsAndUpdateVelocitiesSIMDBatched
+	UpdatePositionsFromVelocities
+	ApplyForces
+	PrepareLinks
+	VSolveLinks
+)
+
+# Turn every base name into HLSL/<name>.hlsl and collect the resulting paths in
+# BulletSoftBodyDX11Solvers_HLSL.
+foreach(f ${BulletSoftBodyDX11Solvers_Shaders})
+    LIST(APPEND BulletSoftBodyDX11Solvers_HLSL "HLSL/${f}.hlsl")
+endforeach(f) 
+
+
+
+# The DX11 solver library: sources, headers and the HLSL shader files.
+add_library(BulletSoftBodySolvers_DX11
+	${BulletSoftBodyDX11Solvers_SRCS}
+	${BulletSoftBodyDX11Solvers_HDRS}
+	${BulletSoftBodyDX11Solvers_HLSL}
+)
+
+# Both the build version and the API version follow BULLET_VERSION.
+set_target_properties(BulletSoftBodySolvers_DX11
+	PROPERTIES VERSION ${BULLET_VERSION}
+)
+set_target_properties(BulletSoftBodySolvers_DX11
+	PROPERTIES SOVERSION ${BULLET_VERSION}
+)
+
+# Link dependencies are only declared for shared library builds.
+if(BUILD_SHARED_LIBS)
+	target_link_libraries(BulletSoftBodySolvers_DX11 BulletSoftBody BulletDynamics)
+endif()
+
+
+# Install rules. They apply only when INSTALL_LIBS is on and the build is not
+# generating the distributable MSVC project files.
+if(INSTALL_LIBS AND NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+	if(${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+		if(APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			# Framework bundle: installed at the root of the install prefix.
+			install(TARGETS BulletSoftBodySolvers_DX11 DESTINATION .)
+		else()
+			# headers are already installed by BulletMultiThreaded library
+			install(TARGETS BulletSoftBodySolvers_DX11 DESTINATION lib${LIB_SUFFIX})
+		endif()
+	endif()
+
+	if(APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		set_target_properties(BulletSoftBodySolvers_DX11 PROPERTIES
+			FRAMEWORK true
+			PUBLIC_HEADER "${BulletSoftBodyDX11Solvers_HDRS}"
+		)
+	endif()
+endif()
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/ApplyForces.hlsl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/ApplyForces.hlsl
@ -0,0 +1,95 @@
+MSTRINGIFY(
+
+cbuffer ApplyForcesCB : register( b0 )
+{
+	unsigned int numNodes;
+	float solverdt;
+	float epsilon;
+	int padding3;
+};
+
+
+StructuredBuffer<int> g_vertexClothIdentifier : register( t0 );
+StructuredBuffer<float4> g_vertexNormal : register( t1 );
+StructuredBuffer<float> g_vertexArea : register( t2 );
+StructuredBuffer<float> g_vertexInverseMass : register( t3 );
+// TODO: These could be combined into a lift/drag factor array along with medium density
+StructuredBuffer<float> g_clothLiftFactor : register( t4 );
+StructuredBuffer<float> g_clothDragFactor : register( t5 );
+StructuredBuffer<float4> g_clothWindVelocity : register( t6 );
+StructuredBuffer<float4> g_clothAcceleration : register( t7 );
+StructuredBuffer<float> g_clothMediumDensity : register( t8 );
+
+RWStructuredBuffer<float4> g_vertexForceAccumulator : register( u0 );
+RWStructuredBuffer<float4> g_vertexVelocity : register( u1 );
+
+// Returns the axis a scaled by dot(v, a).
+float3 projectOnAxis( float3 v, float3 a )
+{
+	float scale = dot(v, a);
+	return (scale*a);
+}
+
+// One thread per node (index DTid.x, threads at or beyond numNodes do nothing).
+// Nodes whose inverse mass is not positive are skipped. For the others the
+// kernel adds the cloth acceleration to the node velocity and, when the node
+// moves relative to the cloth wind velocity, accumulates a lift and drag force
+// into g_vertexForceAccumulator.
+[numthreads(128, 1, 1)]
+void 
+ApplyForcesKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	unsigned int nodeID = DTid.x;
+	if( nodeID < numNodes )
+	{		
+		int clothId = g_vertexClothIdentifier[nodeID];
+		float nodeIM = g_vertexInverseMass[nodeID];
+		
+		if( nodeIM > 0.0f )
+		{
+			float3 nodeV = g_vertexVelocity[nodeID].xyz;
+			float3 normal = g_vertexNormal[nodeID].xyz;
+			float area = g_vertexArea[nodeID];
+			float3 nodeF = g_vertexForceAccumulator[nodeID].xyz;
+			
+			// Read per-cloth values
+			float3 clothAcceleration = g_clothAcceleration[clothId].xyz;
+			float3 clothWindVelocity = g_clothWindVelocity[clothId].xyz;
+			float liftFactor = g_clothLiftFactor[clothId];
+			float dragFactor = g_clothDragFactor[clothId];
+			float mediumDensity = g_clothMediumDensity[clothId];
+		
+			// Apply the acceleration to the cloth rather than do this via a force
+			nodeV += (clothAcceleration*solverdt);
+
+			// The velocity is written back before the wind test, so it is updated
+			// even when no aerodynamic force ends up being applied
+			g_vertexVelocity[nodeID] = float4(nodeV, 0.f);
+
+			float3 relativeWindVelocity = nodeV - clothWindVelocity;
+			float relativeSpeedSquared = dot(relativeWindVelocity, relativeWindVelocity);
+			
+			// epsilon comes from the constant buffer and is compared against the squared speed
+			if( relativeSpeedSquared > epsilon )
+			{
+				// Correct direction of normal relative to wind direction and get dot product
+				normal = normal * (dot(normal, relativeWindVelocity) < 0 ? -1.f : 1.f);
+				float dvNormal = dot(normal, relativeWindVelocity);
+				if( dvNormal > 0 )
+				{
+					float3 force = float3(0.f, 0.f, 0.f);
+					float c0 = area * dvNormal * relativeSpeedSquared / 2.f;
+					float c1 = c0 * mediumDensity;
+					// Lift acts against the flipped normal and drag against the relative velocity
+					force += normal * (-c1 * liftFactor);
+					force += normalize(relativeWindVelocity)*(-c1 * dragFactor);
+					
+					float dtim = solverdt * nodeIM;
+					float3 forceDTIM = force * dtim;
+					
+					float3 nodeFPlusForce = nodeF + force;
+					
+					// m_nodesf[i] -= ProjectOnAxis(m_nodesv[i], force.normalized())/dtim;	
+					float3 nodeFMinus = nodeF - (projectOnAxis(nodeV, normalize(force))/dtim);
+					
+					// Both candidates are computed up front; the second one replaces the
+					// first when force*dtim is larger in magnitude than the node velocity
+					nodeF = nodeFPlusForce;
+					if( dot(forceDTIM, forceDTIM) > dot(nodeV, nodeV) )
+						nodeF = nodeFMinus;
+									
+					g_vertexForceAccumulator[nodeID] = float4(nodeF, 0.0f);	
+				}
+			}
+		}
+	}
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/ComputeBounds.hlsl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/ComputeBounds.hlsl
@ -0,0 +1,83 @@
+MSTRINGIFY(
+
+cbuffer ComputeBoundsCB : register( b0 )
+{
+	int numNodes;
+	int numSoftBodies;
+	int padding1;
+	int padding2;
+};
+
+// Cloth identifier and position of each vertex
+StructuredBuffer<int> g_vertexClothIdentifier : register( t0 );
+StructuredBuffer<float4> g_vertexPositions : register( t1 );
+
+RWStructuredBuffer<uint4> g_clothMinBounds : register( u0 );
+RWStructuredBuffer<uint4> g_clothMaxBounds : register( u1 );
+
+groupshared uint4 clothMinBounds[256];
+groupshared uint4 clothMaxBounds[256];
+
+// Accumulates per-cloth minimum and maximum bounds of the vertex positions.
+// Each position component is reinterpreted as an unsigned integer key whose
+// ordering matches the ordering of the floats, so integer atomics can be used:
+// first on the groupshared arrays, then from those on the global buffers.
+// The groupshared arrays hold 256 cloth slots while a group has 128 threads,
+// so the slots are initialised and flushed with loops strided by the group
+// size; a plain GTid.x test would never touch slots 128 and above.
+[numthreads(128, 1, 1)]
+void 
+ComputeBoundsKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	const unsigned int UINT_MAX = 0xffffffff;
+	// Must match the numthreads attribute above
+	const unsigned int GROUP_SIZE = 128;
+	// Number of usable cloth slots, limited to the size of the groupshared arrays
+	const unsigned int numCloths = uint(clamp(numSoftBodies, 0, 256));
+
+	// Init min and max bounds arrays
+	for( uint initSlot = GTid.x; initSlot < numCloths; initSlot += GROUP_SIZE )
+	{
+		clothMinBounds[initSlot] = uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX);
+		clothMaxBounds[initSlot] = uint4(0,0,0,0);
+	}
+
+	AllMemoryBarrierWithGroupSync();
+
+	int nodeID = DTid.x;
+	if( nodeID < numNodes )
+	{	
+		int clothIdentifier = g_vertexClothIdentifier[nodeID];
+		// Identifiers outside the initialised slots are ignored; their slots are
+		// never flushed, and indexing past the arrays would be out of bounds
+		if( clothIdentifier >= 0 && clothIdentifier < int(numCloths) )
+		{
+			float3 position = g_vertexPositions[nodeID].xyz;
+
+			// Reinterpret position as uint
+			uint3 positionUInt = uint3(asuint(position.x), asuint(position.y), asuint(position.z));
+		
+			// Invert sign bit of positives and whole of negatives to allow comparison as unsigned ints
+			// (1+~s) is the two's complement negation of the sign bit s, giving an all-ones mask for negatives
+			positionUInt.x ^= (1+~(positionUInt.x >> 31) | 0x80000000);
+			positionUInt.y ^= (1+~(positionUInt.y >> 31) | 0x80000000);
+			positionUInt.z ^= (1+~(positionUInt.z >> 31) | 0x80000000);
+		
+			// Min/max with the LDS values
+			InterlockedMin(clothMinBounds[clothIdentifier].x, positionUInt.x);
+			InterlockedMin(clothMinBounds[clothIdentifier].y, positionUInt.y);
+			InterlockedMin(clothMinBounds[clothIdentifier].z, positionUInt.z);
+
+			InterlockedMax(clothMaxBounds[clothIdentifier].x, positionUInt.x);
+			InterlockedMax(clothMaxBounds[clothIdentifier].y, positionUInt.y);
+			InterlockedMax(clothMaxBounds[clothIdentifier].z, positionUInt.z);
+		}
+	}
+	
+	AllMemoryBarrierWithGroupSync();
+
+
+	// Use global atomics to update the global versions of the data
+	for( uint flushSlot = GTid.x; flushSlot < numCloths; flushSlot += GROUP_SIZE )
+	{
+		InterlockedMin(g_clothMinBounds[flushSlot].x, clothMinBounds[flushSlot].x);
+		InterlockedMin(g_clothMinBounds[flushSlot].y, clothMinBounds[flushSlot].y);
+		InterlockedMin(g_clothMinBounds[flushSlot].z, clothMinBounds[flushSlot].z);
+
+		InterlockedMax(g_clothMaxBounds[flushSlot].x, clothMaxBounds[flushSlot].x);
+		InterlockedMax(g_clothMaxBounds[flushSlot].y, clothMaxBounds[flushSlot].y);
+		InterlockedMax(g_clothMaxBounds[flushSlot].z, clothMaxBounds[flushSlot].z);
+	}
+}
+
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/Integrate.hlsl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/Integrate.hlsl
@ -0,0 +1,41 @@
+MSTRINGIFY(
+
+cbuffer IntegrateCB : register( b0 )
+{
+	int numNodes;
+	float solverdt;
+	int padding1;
+	int padding2;
+};
+
+// Inverse mass of each vertex
+StructuredBuffer<float> g_vertexInverseMasses : register( t0 );
+
+RWStructuredBuffer<float4> g_vertexPositions : register( u0 );
+RWStructuredBuffer<float4> g_vertexVelocity : register( u1 );
+RWStructuredBuffer<float4> g_vertexPreviousPositions : register( u2 );
+RWStructuredBuffer<float4> g_vertexForceAccumulator : register( u3 );
+
+// One thread per node. Stores the current position as the previous position,
+// advances the velocity by force * inverse mass * solverdt, advances the
+// position by the new velocity * solverdt and clears the force accumulator.
+[numthreads(128, 1, 1)]
+void 
+IntegrateKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	int node = DTid.x;
+	// Threads past the last node have nothing to do
+	if( node >= numNodes )
+		return;
+
+	float3 oldPosition = g_vertexPositions[node].xyz;
+	float3 newVelocity = g_vertexVelocity[node].xyz;
+	float3 accumulatedForce = g_vertexForceAccumulator[node].xyz;
+	float invMass = g_vertexInverseMasses[node];
+
+	g_vertexPreviousPositions[node] = float4(oldPosition, 0.f);
+
+	newVelocity += accumulatedForce * invMass * solverdt;
+	float3 newPosition = oldPosition;
+	newPosition += newVelocity * solverdt;
+
+	g_vertexForceAccumulator[node] = float4(0.f, 0.f, 0.f, 0.0f);
+	g_vertexPositions[node] = float4(newPosition, 0.f);
+	g_vertexVelocity[node] = float4(newVelocity, 0.f);
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/OutputToVertexArray.hlsl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/OutputToVertexArray.hlsl
@ -0,0 +1,63 @@
+MSTRINGIFY(
+
+cbuffer OutputToVertexArrayCB : register( b0 )
+{
+	int startNode;
+	int numNodes;
+	int positionOffset;
+	int positionStride;
+	
+	int normalOffset;	
+	int normalStride;
+	int padding1;
+	int padding2;
+};
+
+
+StructuredBuffer<float4> g_vertexPositions : register( t0 );
+StructuredBuffer<float4> g_vertexNormals : register( t1 );
+
+RWBuffer<float> g_vertexBuffer : register( u0 );
+
+
+// One thread per node. Copies the xyz of the position and of the normal of
+// node (startNode + thread index) into the float vertex buffer, each at its own
+// offset and stride.
+[numthreads(128, 1, 1)]
+void 
+OutputToVertexArrayWithNormalsKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	int node = DTid.x;
+	if( node >= numNodes )
+		return;
+
+	int sourceIndex = node + startNode;
+	float4 sourcePosition = g_vertexPositions[sourceIndex];
+	float4 sourceNormal = g_vertexNormals[sourceIndex];
+
+	// Stride should account for the float->float4 conversion
+	int positionBase = node * positionStride + positionOffset;
+	g_vertexBuffer[positionBase] = sourcePosition.x;
+	g_vertexBuffer[positionBase+1] = sourcePosition.y;
+	g_vertexBuffer[positionBase+2] = sourcePosition.z;
+
+	int normalBase = node * normalStride + normalOffset;
+	g_vertexBuffer[normalBase] = sourceNormal.x;
+	g_vertexBuffer[normalBase+1] = sourceNormal.y;
+	g_vertexBuffer[normalBase+2] = sourceNormal.z;
+}
+
+// One thread per node. Copies the xyz of the position of node
+// (startNode + thread index) into the float vertex buffer at the position
+// offset and stride. Normals are neither read nor written by this variant.
+[numthreads(128, 1, 1)]
+void 
+OutputToVertexArrayWithoutNormalsKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	int nodeID = DTid.x;
+	if( nodeID < numNodes )
+	{			
+		float4 position = g_vertexPositions[nodeID + startNode];
+		
+		// Stride should account for the float->float4 conversion
+		int positionDestination = nodeID * positionStride + positionOffset;		
+		g_vertexBuffer[positionDestination] = position.x;
+		g_vertexBuffer[positionDestination+1] = position.y;
+		g_vertexBuffer[positionDestination+2] = position.z;		
+	}
+}
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/PrepareLinks.hlsl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/PrepareLinks.hlsl
@ -0,0 +1,44 @@
+MSTRINGIFY(
+
+cbuffer PrepareLinksCB : register( b0 )
+{
+	int numLinks;
+	int padding0;
+	int padding1;
+	int padding2;
+};
+
+// Node indices for each link
+StructuredBuffer<int2> g_linksVertexIndices : register( t0 );
+StructuredBuffer<float> g_linksMassLSC : register( t1 );
+StructuredBuffer<float4> g_nodesPreviousPosition : register( t2 );
+
+RWStructuredBuffer<float> g_linksLengthRatio : register( u0 );
+RWStructuredBuffer<float4> g_linksCurrentLength : register( u1 );
+
+// One thread per link. Stores the vector between the previous positions of the
+// two link nodes, and the reciprocal of (squared length of that vector times
+// the link massLSC value).
+[numthreads(128, 1, 1)]
+void 
+PrepareLinksKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	int link = DTid.x;
+	if( link >= numLinks )
+		return;
+
+	int2 endpoints = g_linksVertexIndices[link];
+	float4 previousStart = g_nodesPreviousPosition[endpoints.x];
+	float4 previousEnd = g_nodesPreviousPosition[endpoints.y];
+	float linkMassLSC = g_linksMassLSC[link];
+
+	// Vector from the first node to the second one
+	float4 linkVector = previousEnd - previousStart;
+
+	float ratio = dot(linkVector, linkVector)*linkMassLSC;
+	ratio = 1./ratio;
+
+	g_linksCurrentLength[link] = linkVector;
+	g_linksLengthRatio[link] = ratio;
+}
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/SolvePositions.hlsl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/SolvePositions.hlsl
@ -0,0 +1,55 @@
+MSTRINGIFY(
+
+cbuffer SolvePositionsFromLinksKernelCB : register( b0 )
+{
+	int startLink;
+	int numLinks;
+	float kst;
+	float ti;
+};
+
+// Node indices for each link
+StructuredBuffer<int2> g_linksVertexIndices : register( t0 );
+
+StructuredBuffer<float> g_linksMassLSC : register( t1 );
+StructuredBuffer<float> g_linksRestLengthSquared : register( t2 );
+StructuredBuffer<float> g_verticesInverseMass : register( t3 );
+
+RWStructuredBuffer<float4> g_vertexPositions : register( u0 );
+
+// One thread per link of the current batch (link index = startLink + thread
+// index). Links whose massLSC value is not positive are left alone; for the
+// others both end positions are moved along the link vector, weighted by the
+// inverse mass of each node.
+[numthreads(128, 1, 1)]
+void 
+SolvePositionsFromLinksKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	int link = DTid.x + startLink;
+	// The range test uses the thread index, not the offset link index
+	if( !(DTid.x < numLinks) )
+		return;
+
+	float linkMassLSC = g_linksMassLSC[link];
+	float restLengthSq = g_linksRestLengthSquared[link];
+
+	if( !(linkMassLSC > 0.0f) )
+		return;
+
+	int2 endpoints = g_linksVertexIndices[link];
+	int first = endpoints.x;
+	int second = endpoints.y;
+
+	float3 firstPosition = g_vertexPositions[first].xyz;
+	float3 secondPosition = g_vertexPositions[second].xyz;
+
+	float firstInverseMass = g_verticesInverseMass[first];
+	float secondInverseMass = g_verticesInverseMass[second];
+
+	float3 delta = secondPosition - firstPosition;
+	float lengthSq = dot(delta, delta);
+	float correction = ((restLengthSq - lengthSq)/(linkMassLSC*(restLengthSq+lengthSq)))*kst;
+	firstPosition = firstPosition - delta*(correction*firstInverseMass);
+	secondPosition = secondPosition + delta*(correction*secondInverseMass);
+
+	g_vertexPositions[first] = float4(firstPosition, 0.f);
+	g_vertexPositions[second] = float4(secondPosition, 0.f);
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/SolvePositionsSIMDBatched.hlsl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/SolvePositionsSIMDBatched.hlsl
@ -0,0 +1,147 @@
+MSTRINGIFY(
+
+
+
+cbuffer SolvePositionsFromLinksKernelCB : register( b0 )
+{
+	int startWaveInBatch;
+	int numWaves;
+	float kst;		
+	float ti;
+};
+
+
+// Number of batches per wavefront stored one element per logical wavefront
+StructuredBuffer<int2> g_wavefrontBatchCountsVertexCounts : register( t0 );
+// Set of up to maxNumVertices vertex addresses per wavefront
+StructuredBuffer<int> g_vertexAddressesPerWavefront : register( t1 );
+
+StructuredBuffer<float> g_verticesInverseMass : register( t2 );
+
+// Per-link data layed out structured in terms of sub batches within wavefronts
+StructuredBuffer<int2> g_linksVertexIndices : register( t3 );
+StructuredBuffer<float> g_linksMassLSC : register( t4 );
+StructuredBuffer<float> g_linksRestLengthSquared : register( t5 );
+
+RWStructuredBuffer<float4> g_vertexPositions : register( u0 );
+
+// Data loaded on a per-wave basis
+groupshared int2 wavefrontBatchCountsVertexCounts[WAVEFRONT_BLOCK_MULTIPLIER];
+groupshared float4 vertexPositionSharedData[MAX_NUM_VERTICES_PER_WAVE*WAVEFRONT_BLOCK_MULTIPLIER];
+groupshared float vertexInverseMassSharedData[MAX_NUM_VERTICES_PER_WAVE*WAVEFRONT_BLOCK_MULTIPLIER];
+
+// Storing the vertex addresses actually slowed things down a little
+//groupshared int vertexAddressSharedData[MAX_NUM_VERTICES_PER_WAVE*WAVEFRONT_BLOCK_MULTIPLIER];
+
+
+// Batched variant of the link position solve. Threads are grouped into logical
+// wavefronts of WAVEFRONT_SIZE lanes. Each wavefront copies the vertices it
+// uses into groupshared memory, runs its batches of links against that copy
+// and finally writes the vertex positions back to global memory.
+// BLOCK_SIZE, WAVEFRONT_SIZE, WAVEFRONT_BLOCK_MULTIPLIER, MAX_NUM_VERTICES_PER_WAVE
+// and MAX_BATCHES_PER_WAVE are not defined in this file and have to be supplied
+// when the shader is compiled.
+[numthreads(BLOCK_SIZE, 1, 1)]
+void 
+SolvePositionsFromLinksKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	// The mask form of the lane computation assumes WAVEFRONT_SIZE is a power of two -- TODO confirm
+	const int laneInWavefront = (DTid.x & (WAVEFRONT_SIZE-1));
+	const int wavefront = startWaveInBatch + (DTid.x / WAVEFRONT_SIZE);
+	const int firstWavefrontInBlock = startWaveInBatch + Gid.x * WAVEFRONT_BLOCK_MULTIPLIER;
+	// Index of this wavefront inside its thread group, used to address groupshared memory
+	const int localWavefront = wavefront - firstWavefrontInBlock;
+
+	int batchesWithinWavefront = 0;
+	int verticesUsedByWave = 0;
+	int cond = wavefront < (startWaveInBatch + numWaves);
+
+	// Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier	
+	if( cond)
+	{
+
+		// Load the batch counts for the wavefronts
+
+		int2 batchesAndVerticesWithinWavefront = g_wavefrontBatchCountsVertexCounts[wavefront];
+
+		batchesWithinWavefront = batchesAndVerticesWithinWavefront.x;
+		verticesUsedByWave = batchesAndVerticesWithinWavefront.y;
+
+		// Load the vertices for the wavefronts
+		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
+		{
+			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
+
+			//vertexAddressSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = vertexAddress;
+			vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_vertexPositions[vertexAddress];
+			vertexInverseMassSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_verticesInverseMass[vertexAddress];
+		}
+		
+	}
+		// Ensure compiler does not re-order memory operations
+		//AllMemoryBarrier();
+	// The barrier sits outside the conditional so that every thread of the group reaches it
+	AllMemoryBarrierWithGroupSync ();
+		
+	if( cond)
+	{
+		// Loop through the batches performing the solve on each in LDS
+		int baseDataLocationForWave = WAVEFRONT_SIZE * wavefront * MAX_BATCHES_PER_WAVE;	
+
+		//for( int batch = 0; batch < batchesWithinWavefront; ++batch )
+		
+		// NOTE(review): unlike the commented-out for loop above, this do/while runs
+		// its body once even when batchesWithinWavefront is 0 -- confirm that every
+		// wavefront always has at least one batch
+		int batch = 0;
+		do
+		{
+			int baseDataLocation = baseDataLocationForWave + WAVEFRONT_SIZE * batch;
+			int locationOfValue = baseDataLocation + laneInWavefront;
+			
+			
+			// These loads should all be perfectly linear across the WF
+			int2 localVertexIndices = g_linksVertexIndices[locationOfValue];
+			float massLSC = g_linksMassLSC[locationOfValue];
+			float restLengthSquared = g_linksRestLengthSquared[locationOfValue];
+			
+
+			// LDS vertex addresses based on logical wavefront number in block and loaded index
+			int vertexAddress0 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.x;
+			int vertexAddress1 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.y;
+			
+			float3 position0 = vertexPositionSharedData[vertexAddress0].xyz;
+			float3 position1 = vertexPositionSharedData[vertexAddress1].xyz;
+
+			float inverseMass0 = vertexInverseMassSharedData[vertexAddress0];
+			float inverseMass1 = vertexInverseMassSharedData[vertexAddress1]; 
+
+			float3 del = position1 - position0;
+			float len = dot(del, del);
+			
+			// k stays 0 for links whose massLSC is not positive, which leaves both positions unchanged
+			float k = 0;
+			if( massLSC > 0.0f )
+			{		
+				k = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst;
+			}
+			
+			position0 = position0 - del*(k*inverseMass0);
+			position1 = position1 + del*(k*inverseMass1);
+			
+			// Ensure compiler does not re-order memory operations
+			AllMemoryBarrier();				
+
+			vertexPositionSharedData[vertexAddress0] = float4(position0, 0.f);
+			vertexPositionSharedData[vertexAddress1] = float4(position1, 0.f);
+			
+			// Ensure compiler does not re-order memory operations
+			AllMemoryBarrier();
+				
+			
+			++batch;
+		} while( batch < batchesWithinWavefront );
+		
+		// Update the global memory vertices for the wavefronts
+		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
+		{
+			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
+
+			g_vertexPositions[vertexAddress] = vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
+		}
+	}
+		
+		
+}
+
+
+
+
+);
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/UpdateConstants.hlsl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/UpdateConstants.hlsl
@ -0,0 +1,48 @@
+MSTRINGIFY(
+
+cbuffer UpdateConstantsCB : register( b0 )
+{
+	int numLinks;
+	int padding0;
+	int padding1;
+	int padding2;
+};
+
+// Node indices for each link
+StructuredBuffer<int2> g_linksVertexIndices : register( t0 );
+StructuredBuffer<float4> g_vertexPositions : register( t1 );
+StructuredBuffer<float> g_vertexInverseMasses : register( t2 );
+StructuredBuffer<float> g_linksMaterialLSC : register( t3 );
+
+RWStructuredBuffer<float> g_linksMassLSC : register( u0 );
+RWStructuredBuffer<float> g_linksRestLengthSquared : register( u1 );
+RWStructuredBuffer<float> g_linksRestLengths : register( u2 );
+
+// One thread per link. From the current positions and inverse masses of the
+// two link nodes it stores the rest length, the rest length squared and the
+// massLSC value, which is the summed inverse mass divided by the material
+// stiffness coefficient of the link.
+[numthreads(128, 1, 1)]
+void 
+UpdateConstantsKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	int link = DTid.x;
+	if( link >= numLinks )
+		return;
+
+	int2 endpoints = g_linksVertexIndices[link];
+	int first = endpoints.x;
+	int second = endpoints.y;
+	float stiffness = g_linksMaterialLSC[ link ];
+
+	float3 firstPosition = g_vertexPositions[first].xyz;
+	float3 secondPosition = g_vertexPositions[second].xyz;
+	float firstInverseMass = g_vertexInverseMasses[first];
+	float secondInverseMass = g_vertexInverseMasses[second];
+
+	float3 separation = firstPosition - secondPosition;
+	float separationSq = dot(separation, separation);
+	float restLength = sqrt(separationSq);
+
+	g_linksRestLengths[link] = restLength;
+	g_linksMassLSC[link] = (firstInverseMass + secondInverseMass)/stiffness;
+	// The squared value is rebuilt from the square root rather than taken from the dot product
+	g_linksRestLengthSquared[link] = restLength*restLength;
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/UpdateNodes.hlsl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/UpdateNodes.hlsl
@ -0,0 +1,49 @@
+MSTRINGIFY(
+
+// Constant buffer: the vertex count and the isolverdt factor that scales a
+// position difference into a velocity contribution.
+cbuffer UpdateVelocitiesFromPositionsWithVelocitiesCB : register( b0 )
+{
+	int numNodes;
+	float isolverdt;
+	int padding1;
+	int padding2;
+};
+
+
+// Read-only inputs: current and previous vertex positions, the cloth each
+// vertex belongs to, and two per-cloth coefficients.
+StructuredBuffer<float4> g_vertexPositions : register( t0 );
+StructuredBuffer<float4> g_vertexPreviousPositions : register( t1 );
+StructuredBuffer<int> g_vertexClothIndices : register( t2 );
+StructuredBuffer<float> g_clothVelocityCorrectionCoefficients : register( t3 );
+StructuredBuffer<float> g_clothDampingFactor : register( t4 );
+
+// Outputs: velocities are updated in place and forces are cleared.
+RWStructuredBuffer<float4> g_vertexVelocities : register( u0 );
+RWStructuredBuffer<float4> g_vertexForces : register( u1 );
+
+
+// One thread per vertex: adds the position change of the step (scaled by the
+// per-cloth correction coefficient and isolverdt) to the stored velocity,
+// damps the result and resets the force on the vertex.
+[numthreads(128, 1, 1)]
+void
+updateVelocitiesFromPositionsWithVelocitiesKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	int nodeID = DTid.x;
+
+	// Threads beyond the last vertex have nothing to do
+	if( nodeID >= numNodes )
+		return;
+
+	int cloth = g_vertexClothIndices[nodeID];
+	float correction = g_clothVelocityCorrectionCoefficients[cloth];
+	float keptFraction = (1.f - g_clothDampingFactor[cloth]);
+
+	float3 moved = g_vertexPositions[nodeID].xyz - g_vertexPreviousPositions[nodeID].xyz;
+
+	float3 updatedVelocity = g_vertexVelocities[nodeID].xyz;
+	updatedVelocity += moved*correction*isolverdt;
+
+	// Damp the velocity
+	updatedVelocity *= keptFraction;
+
+	g_vertexVelocities[nodeID] = float4(updatedVelocity, 0.f);
+	g_vertexForces[nodeID] = float4(0.f, 0.f, 0.f, 0.f);
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/UpdateNormals.hlsl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/UpdateNormals.hlsl
@ -0,0 +1,98 @@
+MSTRINGIFY(
+
+// Constant buffer shared by the three kernels in this file.
+// numNodes bounds the per-vertex kernels; startFace and numFaces select the
+// range of triangles handled by one UpdateSoftBodiesKernel dispatch.
+// epsilon is not read by any kernel in this file.
+cbuffer UpdateSoftBodiesCB : register( b0 )
+{
+	unsigned int numNodes;
+	unsigned int startFace;
+	unsigned int numFaces;
+	float epsilon;
+};
+
+
+// Vertex indices of each triangle (only x, y and z are read by the kernels),
+// the vertex positions, and the per-vertex triangle count used as the divisor
+// when the accumulated vertex area is averaged.
+StructuredBuffer<int4> g_triangleVertexIndexSet : register( t0 );
+StructuredBuffer<float4> g_vertexPositions : register( t1 );
+StructuredBuffer<int> g_vertexTriangleCount : register( t2 );
+
+// Per-vertex accumulators (normal and area) and per-triangle results.
+RWStructuredBuffer<float4> g_vertexNormals : register( u0 );
+RWStructuredBuffer<float> g_vertexArea : register( u1 );
+RWStructuredBuffer<float4> g_triangleNormals : register( u2 );
+RWStructuredBuffer<float> g_triangleArea : register( u3 );
+
+
+// One thread per vertex: clears the accumulated normal and area so that the
+// per-triangle pass can sum into them.
+[numthreads(128, 1, 1)]
+void
+ResetNormalsAndAreasKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	uint vertex = DTid.x;
+
+	// Threads beyond the last vertex have nothing to do
+	if( vertex >= numNodes )
+		return;
+
+	g_vertexArea[vertex] = 0.0f;
+	g_vertexNormals[vertex] = float4(0.0f, 0.0f, 0.0f, 0.0f);
+}
+
+
+// One thread per triangle in the range [startFace, startFace + numFaces):
+// computes the face normal and its length, stores them per triangle, and adds
+// them onto the accumulated normal and area of the three corner vertices.
+// NOTE(review): the vertex accumulators are plain read-modify-write, so this
+// presumably relies on the host dispatching triangles in batches that do not
+// share vertices - confirm against the caller.
+[numthreads(128, 1, 1)]
+void
+UpdateSoftBodiesKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	int faceID = DTid.x + startFace;
+
+	// Threads beyond the end of this batch have nothing to do
+	if( DTid.x >= numFaces )
+		return;
+
+	int4 corners = g_triangleVertexIndexSet[ faceID ];
+
+	// Read everything this triangle touches before anything is written back
+	float3 corner0 = g_vertexPositions[corners.x].xyz;
+	float3 corner1 = g_vertexPositions[corners.y].xyz;
+	float3 corner2 = g_vertexPositions[corners.z].xyz;
+	float3 summedNormal0 = g_vertexNormals[corners.x].xyz;
+	float3 summedNormal1 = g_vertexNormals[corners.y].xyz;
+	float3 summedNormal2 = g_vertexNormals[corners.z].xyz;
+	float summedArea0 = g_vertexArea[corners.x];
+	float summedArea1 = g_vertexArea[corners.y];
+	float summedArea2 = g_vertexArea[corners.z];
+
+	// Unnormalized face normal from two edges leaving corner 0
+	float3 faceNormal = cross(corner1 - corner0, corner2 - corner0);
+	float triangleArea = length(faceNormal);
+
+	g_triangleNormals[faceID] = float4(normalize(faceNormal), 0.f);
+	g_vertexNormals[corners.x] = float4(summedNormal0 + faceNormal, 0.f);
+	g_vertexNormals[corners.y] = float4(summedNormal1 + faceNormal, 0.f);
+	g_vertexNormals[corners.z] = float4(summedNormal2 + faceNormal, 0.f);
+	g_triangleArea[faceID] = triangleArea;
+	g_vertexArea[corners.x] = summedArea0 + triangleArea;
+	g_vertexArea[corners.y] = summedArea1 + triangleArea;
+	g_vertexArea[corners.z] = summedArea2 + triangleArea;
+}
+
+// One thread per vertex: turns the accumulated normal into a unit vector and
+// averages the accumulated area over the number of triangles at the vertex.
+// A vertex with a zero-length accumulated normal or a zero triangle count is
+// left untouched instead of being turned into NaN by normalize(0) or 0/0.
+[numthreads(128, 1, 1)]
+void
+NormalizeNormalsAndAreasKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	if( DTid.x < numNodes )
+	{
+		float4 normal = g_vertexNormals[DTid.x];
+		float area = g_vertexArea[DTid.x];
+		int numTriangles = g_vertexTriangleCount[DTid.x];
+
+		float vectorLength = length(normal);
+
+		// Only normalize when there is a direction to keep
+		if( vectorLength > 0.0f )
+			normal = normalize(normal);
+
+		// Only average when at least one triangle contributed
+		if( numTriangles > 0 )
+			area = area/float(numTriangles);
+
+		g_vertexNormals[DTid.x] = normal;
+		g_vertexArea[DTid.x] = area;
+	}
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/UpdatePositions.hlsl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/UpdatePositions.hlsl
@ -0,0 +1,44 @@
+MSTRINGIFY(
+
+// Constant buffer: the vertex count and the isolverdt factor that scales a
+// position difference into a velocity.
+cbuffer UpdateVelocitiesFromPositionsWithoutVelocitiesCB : register( b0 )
+{
+	int numNodes;
+	float isolverdt;
+	int padding1;
+	int padding2;
+};
+
+
+// Read-only inputs: current and previous vertex positions, the cloth each
+// vertex belongs to, and the per-cloth damping factor.
+StructuredBuffer<float4> g_vertexPositions : register( t0 );
+StructuredBuffer<float4> g_vertexPreviousPositions : register( t1 );
+StructuredBuffer<int> g_vertexClothIndices : register( t2 );
+StructuredBuffer<float> g_clothDampingFactor : register( t3 );
+
+// Outputs: velocities are overwritten and forces are cleared.
+RWStructuredBuffer<float4> g_vertexVelocities : register( u0 );
+RWStructuredBuffer<float4> g_vertexForces : register( u1 );
+
+
+// One thread per vertex: replaces the velocity with the damped position
+// change of the step scaled by isolverdt, and resets the force on the vertex.
+// The previously stored velocity does not contribute to the result.
+[numthreads(128, 1, 1)]
+void
+updateVelocitiesFromPositionsWithoutVelocitiesKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	int nodeID = DTid.x;
+
+	// Threads beyond the last vertex have nothing to do
+	if( nodeID >= numNodes )
+		return;
+
+	int cloth = g_vertexClothIndices[nodeID];
+	float keptFraction = (1.f - g_clothDampingFactor[cloth]);
+
+	float3 moved = g_vertexPositions[nodeID].xyz - g_vertexPreviousPositions[nodeID].xyz;
+	float3 newVelocity = moved*keptFraction*isolverdt;
+
+	g_vertexVelocities[nodeID] = float4(newVelocity, 0.f);
+	g_vertexForces[nodeID] = float4(0.f, 0.f, 0.f, 0.f);
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/UpdatePositionsFromVelocities.hlsl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/UpdatePositionsFromVelocities.hlsl
@ -0,0 +1,35 @@
+MSTRINGIFY(
+
+// Constant buffer: the vertex count and the solverSDT factor that scales a
+// velocity into a position change.
+cbuffer UpdatePositionsFromVelocitiesCB : register( b0 )
+{
+	int numNodes;
+	float solverSDT;
+	int padding1;
+	int padding2;
+};
+
+
+// Read-only input: per-vertex velocities.
+StructuredBuffer<float4> g_vertexVelocities : register( t0 );
+
+// Outputs: both position buffers receive the advanced position.
+RWStructuredBuffer<float4> g_vertexPreviousPositions : register( u0 );
+RWStructuredBuffer<float4> g_vertexCurrentPosition : register( u1 );
+
+
+// One thread per vertex: advances the stored previous position by
+// velocity*solverSDT and writes the result to the current and the previous
+// position buffers alike.
+[numthreads(128, 1, 1)]
+void
+UpdatePositionsFromVelocitiesKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	int vertexID = DTid.x;
+
+	// Threads beyond the last vertex have nothing to do
+	if( vertexID >= numNodes )
+		return;
+
+	float3 start = g_vertexPreviousPositions[vertexID].xyz;
+	float3 step = g_vertexVelocities[vertexID].xyz*solverSDT;
+	float4 advanced = float4(start + step, 0.f);
+
+	g_vertexCurrentPosition[vertexID] = advanced;
+	g_vertexPreviousPositions[vertexID] = advanced;
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/VSolveLinks.hlsl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/VSolveLinks.hlsl
@ -0,0 +1,55 @@
+MSTRINGIFY(
+
+// Constant buffer: the first link and link count of the batch handled by one
+// dispatch, plus the kst scale applied to the correction.
+cbuffer VSolveLinksCB : register( b0 )
+{
+	int startLink;
+	int numLinks;
+	float kst;
+	int padding;
+};
+
+// The two vertex indices joined by each link
+StructuredBuffer<int2> g_linksVertexIndices : register( t0 );
+
+// Per-link ratio and vector (only xyz of the vector is read), and the
+// per-vertex inverse masses.
+StructuredBuffer<float> g_linksLengthRatio : register( t1 );
+StructuredBuffer<float4> g_linksCurrentLength : register( t2 );
+StructuredBuffer<float> g_vertexInverseMass : register( t3 );
+
+// Vertex velocities, corrected in place.
+RWStructuredBuffer<float4> g_vertexVelocity : register( u0 );
+
+// One thread per link of the batch: projects the relative velocity of the two
+// end vertices onto the link vector and applies an opposing correction to
+// each vertex, weighted by that vertex inverse mass.
+// NOTE(review): both end velocities are plain read-modify-write, so this
+// presumably relies on no two links of a batch sharing a vertex - confirm
+// against the caller.
+[numthreads(128, 1, 1)]
+void
+VSolveLinksKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	int linkID = DTid.x + startLink;
+
+	// Threads beyond the end of this batch have nothing to do
+	if( DTid.x >= numLinks )
+		return;
+
+	int2 endpoints = g_linksVertexIndices[linkID];
+
+	float lengthRatio = g_linksLengthRatio[linkID];
+	float3 linkVector = g_linksCurrentLength[linkID].xyz;
+
+	// Read both ends before either is written back
+	float3 velocityA = g_vertexVelocity[endpoints.x].xyz;
+	float3 velocityB = g_vertexVelocity[endpoints.y].xyz;
+
+	float inverseMassA = g_vertexInverseMass[endpoints.x];
+	float inverseMassB = g_vertexInverseMass[endpoints.y];
+
+	float projected = dot(linkVector, velocityA - velocityB);
+	float j = -projected*lengthRatio*kst;
+
+	velocityA += linkVector*(j*inverseMassA);
+	velocityB -= linkVector*(j*inverseMassB);
+
+	g_vertexVelocity[endpoints.x] = float4(velocityA, 0.f);
+	g_vertexVelocity[endpoints.y] = float4(velocityB, 0.f);
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocities.hlsl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocities.hlsl
@ -0,0 +1,170 @@
+MSTRINGIFY(
+
+// Constant buffer: the vertex count and the isolverdt factor that scales a
+// position difference into a velocity.
+cbuffer SolvePositionsFromLinksKernelCB : register( b0 )
+{
+	unsigned int numNodes;
+	float isolverdt;
+	int padding0;
+	int padding1;
+};
+
+// Half-open range [firstObject, endObject) of entries in
+// g_collisionObjectDetails that belong to one cloth.
+struct CollisionObjectIndices
+{
+	int firstObject;
+	int endObject;
+};
+
+// One collision object as laid out in the structured buffer.
+// NOTE(review): the layout presumably has to match the host-side structure
+// that fills the buffer - confirm before changing any field.
+struct CollisionShapeDescription
+{
+	float4x4 shapeTransform;
+	float4 linearVelocity;
+	float4 angularVelocity;
+
+	int softBodyIdentifier;
+	int collisionShapeType;
+	
+
+	// Shape information
+	// Compressed from the union
+	float radius;
+	float halfHeight;
+		
+	float margin;
+	float friction;
+
+	int padding0;
+	int padding1;
+	
+};
+
+// From btBroadphaseProxy.h
+static const int CAPSULE_SHAPE_PROXYTYPE = 10;
+
+// Read-only inputs: the cloth each vertex belongs to, the previous vertex
+// positions, per-cloth friction and damping, the per-cloth range of collision
+// objects and the collision object descriptions themselves.
+StructuredBuffer<int> g_vertexClothIdentifier : register( t0 );
+StructuredBuffer<float4> g_vertexPreviousPositions : register( t1 );
+StructuredBuffer<float> g_perClothFriction : register( t2 );
+StructuredBuffer<float> g_clothDampingFactor : register( t3 );
+StructuredBuffer<CollisionObjectIndices> g_perClothCollisionObjectIndices : register( t4 );
+StructuredBuffer<CollisionShapeDescription> g_collisionObjectDetails : register( t5 );
+
+// Outputs: per-vertex force and velocity are overwritten, positions are
+// read and written back.
+RWStructuredBuffer<float4> g_vertexForces : register( u0 );
+RWStructuredBuffer<float4> g_vertexVelocities : register( u1 );
+RWStructuredBuffer<float4> g_vertexPositions : register( u2 );
+
+// One thread per vertex: tests the vertex against every collision object of
+// its cloth (only capsules are handled), pushes it back to the capsule
+// surface when it penetrates, derives the new velocity from the position
+// change of the step and accumulates a friction force.
+// Writes the velocity, force and position of the vertex.
+[numthreads(128, 1, 1)]
+void 
+SolveCollisionsAndUpdateVelocitiesKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	int nodeID = DTid.x;
+	float3 forceOnVertex = float3(0.f, 0.f, 0.f);
+	if( DTid.x < numNodes )
+	{	
+		int clothIdentifier = g_vertexClothIdentifier[nodeID];
+		float4 position = float4(g_vertexPositions[nodeID].xyz, 1.f);
+		float4 previousPosition = float4(g_vertexPreviousPositions[nodeID].xyz, 1.f);
+		float3 velocity;
+		float clothFriction = g_perClothFriction[clothIdentifier];
+		float dampingFactor = g_clothDampingFactor[clothIdentifier];
+		float velocityCoefficient = (1.f - dampingFactor);		
+		CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
+	
+		if( collisionObjectIndices.firstObject != collisionObjectIndices.endObject )
+		{
+			// Start from the collision-free velocity so that a cloth whose
+			// collision objects are all of an unhandled shape type still gets
+			// a meaningful value (this used to be a leftover debug constant)
+			velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
+
+			// We have some possible collisions to deal with
+			for( int collision = collisionObjectIndices.firstObject; collision < collisionObjectIndices.endObject; ++collision )
+			{
+				CollisionShapeDescription shapeDescription = g_collisionObjectDetails[collision];
+				float colliderFriction = shapeDescription.friction;
+		
+				if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
+				{
+					// Colliding with a capsule
+
+					float capsuleHalfHeight = shapeDescription.halfHeight;
+					float capsuleRadius = shapeDescription.radius;
+					float capsuleMargin = shapeDescription.margin;
+					float4x4 worldTransform = shapeDescription.shapeTransform;
+
+					// End points of the capsule axis in local then world space
+					float4 c1 = float4(0.f, -capsuleHalfHeight, 0.f, 1.f); 
+					float4 c2 = float4(0.f, +capsuleHalfHeight, 0.f, 1.f);
+					float4 worldC1 = mul(worldTransform, c1);
+					float4 worldC2 = mul(worldTransform, c2);
+					float3 segment = (worldC2 - worldC1).xyz;
+
+					// compute distance of tangent to vertex along line segment in capsule
+					float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) );
+
+					float4 closestPoint = (worldC1 + float4(segment * distanceAlongSegment, 0.f));
+					float distanceFromLine = length(position - closestPoint);
+					float distanceFromC1 = length(worldC1 - position);
+					float distanceFromC2 = length(worldC2 - position);
+					
+					// Final distance from collision, point to push from, direction to push in
+					// for impulse force
+					float dist;
+					float3 normalVector;
+					if( distanceAlongSegment < 0 )
+					{
+						dist = distanceFromC1;
+						normalVector = normalize(position - worldC1).xyz;
+					} else if( distanceAlongSegment > 1.f ) {
+						dist = distanceFromC2;
+						normalVector = normalize(position - worldC2).xyz;	
+					} else {
+						dist = distanceFromLine;
+						normalVector = normalize(position - closestPoint).xyz;
+					}
+						
+					float3 colliderLinearVelocity = shapeDescription.linearVelocity.xyz;
+					float3 colliderAngularVelocity = shapeDescription.angularVelocity.xyz;
+					float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz - worldTransform._m03_m13_m23);
+
+					float minDistance = capsuleRadius + capsuleMargin;
+					
+					// In case of no collision, this is the value of velocity
+					velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
+					
+					
+					// Check for a collision
+					if( dist < minDistance )
+					{
+						// Project back to surface along normal
+						position = position + float4((minDistance - dist)*normalVector*0.9, 0.f);
+						velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
+						float3 relativeVelocity = velocity - velocityOfSurfacePoint;
+
+						float3 p1 = normalize(cross(normalVector, segment));
+						float3 p2 = normalize(cross(p1, normalVector));
+						// Full friction is sum of velocities in each direction of plane
+						float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2);
+
+						// Real friction is peak friction corrected by friction coefficients
+						frictionVector = frictionVector * (colliderFriction*clothFriction);
+
+						float approachSpeed = dot(relativeVelocity, normalVector);
+
+						if( approachSpeed <= 0.0 )
+							forceOnVertex -= frictionVector;
+					}
+					
+				}
+			}
+		} else {
+			// Update velocity	
+			float3 difference = position.xyz - previousPosition.xyz;
+			velocity = difference*velocityCoefficient*isolverdt;			
+		}
+
+		g_vertexVelocities[nodeID] = float4(velocity, 0.f);	
+
+		// Update external force
+		g_vertexForces[nodeID] = float4(forceOnVertex, 0.f);
+
+		g_vertexPositions[nodeID] = float4(position.xyz, 0.f);
+	}
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocitiesSIMDBatched.hlsl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocitiesSIMDBatched.hlsl
@ -0,0 +1,191 @@
+MSTRINGIFY(
+
+// Constant buffer: the vertex count and the isolverdt factor that scales a
+// position difference into a velocity.
+cbuffer SolvePositionsFromLinksKernelCB : register( b0 )
+{
+	unsigned int numNodes;
+	float isolverdt;
+	int padding0;
+	int padding1;
+};
+
+// Half-open range [firstObject, endObject) of entries in
+// g_collisionObjectDetails that belong to one cloth.
+struct CollisionObjectIndices
+{
+	int firstObject;
+	int endObject;
+};
+
+// One collision object as laid out in the structured buffer.
+// NOTE(review): the layout presumably has to match the host-side structure
+// that fills the buffer - confirm before changing any field.
+struct CollisionShapeDescription
+{
+	float4x4 shapeTransform;
+	float4 linearVelocity;
+	float4 angularVelocity;
+
+	int softBodyIdentifier;
+	int collisionShapeType;
+	
+
+	// Shape information
+	// Compressed from the union
+	float radius;
+	float halfHeight;
+		
+	float margin;
+	float friction;
+
+	int padding0;
+	int padding1;
+	
+};
+
+// From btBroadphaseProxy.h
+static const int CAPSULE_SHAPE_PROXYTYPE = 10;
+
+// Read-only inputs: the cloth each vertex belongs to, the previous vertex
+// positions, per-cloth friction and damping, the per-cloth range of collision
+// objects and the collision object descriptions themselves.
+StructuredBuffer<int> g_vertexClothIdentifier : register( t0 );
+StructuredBuffer<float4> g_vertexPreviousPositions : register( t1 );
+StructuredBuffer<float> g_perClothFriction : register( t2 );
+StructuredBuffer<float> g_clothDampingFactor : register( t3 );
+StructuredBuffer<CollisionObjectIndices> g_perClothCollisionObjectIndices : register( t4 );
+StructuredBuffer<CollisionShapeDescription> g_collisionObjectDetails : register( t5 );
+
+// Outputs: per-vertex force and velocity are overwritten, positions are
+// read and written back.
+RWStructuredBuffer<float4> g_vertexForces : register( u0 );
+RWStructuredBuffer<float4> g_vertexVelocities : register( u1 );
+RWStructuredBuffer<float4> g_vertexPositions : register( u2 );
+
+// A buffer of local collision shapes, shared by the threads of one group.
+// It holds at most 16 entries.
+// TODO: Iterate to support more than 16
+groupshared CollisionShapeDescription localCollisionShapes[16];
+
+// One thread per vertex, with the collision objects of the cloth staged in
+// group-shared memory first: tests the vertex against every staged object
+// (only capsules are handled), pushes it back to the capsule surface when it
+// penetrates, derives the new velocity from the position change of the step
+// and accumulates a friction force.
+// At most 16 collision objects per cloth are considered, which is the
+// capacity of localCollisionShapes; further objects are ignored rather than
+// written past the end of the shared array.
+[numthreads(128, 1, 1)]
+void 
+SolveCollisionsAndUpdateVelocitiesKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	int nodeID = DTid.x;
+	float3 forceOnVertex = float3(0.f, 0.f, 0.f);
+
+	int clothIdentifier = g_vertexClothIdentifier[nodeID];
+	float4 position = float4(g_vertexPositions[nodeID].xyz, 1.f);
+	float4 previousPosition = float4(g_vertexPreviousPositions[nodeID].xyz, 1.f);
+	float3 velocity;
+	float clothFriction = g_perClothFriction[clothIdentifier];
+	float dampingFactor = g_clothDampingFactor[clothIdentifier];
+	float velocityCoefficient = (1.f - dampingFactor);		
+	CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
+	
+	// Never stage or visit more shapes than the shared array can hold
+	const int maxLocalShapes = 16;
+	int numObjects = min( collisionObjectIndices.endObject - collisionObjectIndices.firstObject, maxLocalShapes );
+	if( numObjects > 0 )
+	{
+		// We have some possible collisions to deal with
+		
+		// First load all of the collision objects into LDS
+		if( GTid.x < numObjects )
+		{
+			localCollisionShapes[GTid.x] = g_collisionObjectDetails[ collisionObjectIndices.firstObject + GTid.x ];
+		}
+	}
+
+	// Safe as the vertices are padded so that not more than one soft body is in a group
+	AllMemoryBarrierWithGroupSync();
+
+	// Annoyingly, even though I know the flow control is not varying, the compiler will not let me skip this
+	if( numObjects > 0 )
+	{
+		// Start from the collision-free velocity so that a cloth whose
+		// collision objects are all of an unhandled shape type keeps moving
+		// instead of having its velocity forced to zero
+		velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
+		
+		
+		// We have some possible collisions to deal with
+		for( int collision = 0; collision < numObjects; ++collision )
+		{
+			CollisionShapeDescription shapeDescription = localCollisionShapes[collision];
+			float colliderFriction = shapeDescription.friction;
+		
+			if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
+			{
+				// Colliding with a capsule
+
+				float capsuleHalfHeight = localCollisionShapes[collision].halfHeight;
+				float capsuleRadius = localCollisionShapes[collision].radius;
+				float capsuleMargin = localCollisionShapes[collision].margin;
+
+				float4x4 worldTransform = localCollisionShapes[collision].shapeTransform;
+
+				// End points of the capsule axis in local then world space
+				float4 c1 = float4(0.f, -capsuleHalfHeight, 0.f, 1.f); 
+				float4 c2 = float4(0.f, +capsuleHalfHeight, 0.f, 1.f);
+				float4 worldC1 = mul(worldTransform, c1);
+				float4 worldC2 = mul(worldTransform, c2);
+				float3 segment = (worldC2 - worldC1).xyz;
+
+				// compute distance of tangent to vertex along line segment in capsule
+				float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) );
+
+				float4 closestPoint = (worldC1 + float4(segment * distanceAlongSegment, 0.f));
+				float distanceFromLine = length(position - closestPoint);
+				float distanceFromC1 = length(worldC1 - position);
+				float distanceFromC2 = length(worldC2 - position);
+					
+				// Final distance from collision, point to push from, direction to push in
+				// for impulse force
+				float dist;
+				float3 normalVector;
+				if( distanceAlongSegment < 0 )
+				{
+					dist = distanceFromC1;
+					normalVector = normalize(position - worldC1).xyz;
+				} else if( distanceAlongSegment > 1.f ) {
+					dist = distanceFromC2;
+					normalVector = normalize(position - worldC2).xyz;	
+				} else {
+					dist = distanceFromLine;
+					normalVector = normalize(position - closestPoint).xyz;
+				}
+						
+				float3 colliderLinearVelocity = localCollisionShapes[collision].linearVelocity.xyz;
+				float3 colliderAngularVelocity = localCollisionShapes[collision].angularVelocity.xyz;
+				float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz - worldTransform._m03_m13_m23);
+
+				float minDistance = capsuleRadius + capsuleMargin;
+					
+				// In case of no collision, this is the value of velocity
+				velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
+					
+					
+				// Check for a collision
+				if( dist < minDistance )
+				{
+					// Project back to surface along normal
+					position = position + float4((minDistance - dist)*normalVector*0.9, 0.f);
+					velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
+					float3 relativeVelocity = velocity - velocityOfSurfacePoint;
+
+					float3 p1 = normalize(cross(normalVector, segment));
+					float3 p2 = normalize(cross(p1, normalVector));
+					// Full friction is sum of velocities in each direction of plane
+					float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2);
+
+					// Real friction is peak friction corrected by friction coefficients
+					frictionVector = frictionVector * (colliderFriction*clothFriction);
+
+					float approachSpeed = dot(relativeVelocity, normalVector);
+
+					if( approachSpeed <= 0.0 )
+						forceOnVertex -= frictionVector;
+				}
+					
+			}
+		}
+	} else {
+		// Update velocity	
+		float3 difference = position.xyz - previousPosition.xyz;
+		velocity = difference*velocityCoefficient*isolverdt;			
+	}
+
+	g_vertexVelocities[nodeID] = float4(velocity, 0.f);	
+
+	// Update external force
+	g_vertexForces[nodeID] = float4(forceOnVertex, 0.f);
+
+	g_vertexPositions[nodeID] = float4(position.xyz, 0.f);
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverBuffer_DX11.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverBuffer_DX11.h
@ -0,0 +1,323 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+#ifndef BT_SOFT_BODY_SOLVER_BUFFER_DX11_H
+#define BT_SOFT_BODY_SOLVER_BUFFER_DX11_H
+
+// DX11 support
+#include <windows.h>
+#include <crtdbg.h>
+#include <d3d11.h>
+#include <d3dx11.h>
+#include <d3dcompiler.h>
+
+#ifndef SAFE_RELEASE
+#define SAFE_RELEASE(p)      { if(p) { (p)->Release(); (p)=NULL; } }
+#endif
+
+/**
+ * DX11 Buffer that tracks a host buffer on use to ensure size-correctness.
+ */
+template <typename ElementType> class btDX11Buffer
+{
+protected:
+	ID3D11Device*				m_d3dDevice;
+	ID3D11DeviceContext*		m_d3dDeviceContext;
+
+	ID3D11Buffer*               m_Buffer;
+	ID3D11ShaderResourceView*   m_SRV;
+	ID3D11UnorderedAccessView*  m_UAV;
+	btAlignedObjectArray< ElementType >*	m_CPUBuffer;
+
+	// TODO: Separate this from the main class
+	// as read back buffers can be shared between buffers
+	ID3D11Buffer*               m_readBackBuffer;
+
+	int m_gpuSize;
+	bool m_onGPU;
+
+	bool m_readOnlyOnGPU;
+	
+	/**
+	 * Create the GPU-side resources for the tracked CPU array.
+	 *
+	 * Creates the structured buffer (unless preexistingBuffer is supplied, in
+	 * which case that buffer is adopted as-is) and its shader resource view.
+	 * For buffers that are not read-only on the GPU it also creates the
+	 * unordered access view and the CPU-readable staging buffer used for
+	 * read back. Every resource is sized for at least one element, so an
+	 * empty CPU array still yields valid resources.
+	 *
+	 * @param preexistingBuffer Optional buffer to use instead of creating one.
+	 * @return true on success, false as soon as any Direct3D call fails.
+	 *         Resources created before the failure stay in the members.
+	 */
+	bool createBuffer( ID3D11Buffer *preexistingBuffer = 0)
+	{
+		HRESULT hr = S_OK;
+
+		// At a minimum the resources must exist, so never size them for zero elements
+		UINT elementCount = static_cast<UINT>( m_CPUBuffer->size() );
+		if( elementCount == 0 )
+			elementCount = 1;
+		const UINT byteWidth = elementCount * static_cast<UINT>( sizeof(ElementType) );
+
+		// Create all CS buffers
+		if( preexistingBuffer )
+		{
+			m_Buffer = preexistingBuffer;
+		} else {
+			D3D11_BUFFER_DESC buffer_desc;
+			ZeroMemory(&buffer_desc, sizeof(buffer_desc));		
+			buffer_desc.Usage = D3D11_USAGE_DEFAULT;
+			if( m_readOnlyOnGPU )
+				buffer_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
+			else
+				buffer_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS;
+			buffer_desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
+			buffer_desc.ByteWidth = byteWidth;
+			buffer_desc.StructureByteStride = sizeof(ElementType);
+			hr = m_d3dDevice->CreateBuffer(&buffer_desc, NULL, &m_Buffer);
+			if( FAILED( hr ) )
+				return false;
+		} 
+
+		// The shader resource view is needed whether or not the GPU may write the buffer
+		D3D11_SHADER_RESOURCE_VIEW_DESC srvbuffer_desc;
+		ZeroMemory(&srvbuffer_desc, sizeof(srvbuffer_desc));
+		srvbuffer_desc.Format = DXGI_FORMAT_UNKNOWN;
+		srvbuffer_desc.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
+		srvbuffer_desc.Buffer.ElementWidth = elementCount;
+		hr = m_d3dDevice->CreateShaderResourceView(m_Buffer, &srvbuffer_desc, &m_SRV);
+		if( FAILED( hr ) )
+			return false;
+
+		if( !m_readOnlyOnGPU )
+		{
+			// Create UAV
+			D3D11_UNORDERED_ACCESS_VIEW_DESC uavbuffer_desc;
+			ZeroMemory(&uavbuffer_desc, sizeof(uavbuffer_desc));
+			uavbuffer_desc.Format = DXGI_FORMAT_UNKNOWN;
+			uavbuffer_desc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
+			uavbuffer_desc.Buffer.NumElements = elementCount;
+			hr = m_d3dDevice->CreateUnorderedAccessView(m_Buffer, &uavbuffer_desc, &m_UAV);
+			if( FAILED( hr ) )
+				return false;
+
+			// Create read back buffer, sized like the main buffer so that an
+			// empty CPU array does not request a zero-byte allocation
+			D3D11_BUFFER_DESC readback_buffer_desc;
+			ZeroMemory(&readback_buffer_desc, sizeof(readback_buffer_desc));
+
+			readback_buffer_desc.ByteWidth = byteWidth;
+			readback_buffer_desc.Usage = D3D11_USAGE_STAGING;
+			readback_buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+			readback_buffer_desc.StructureByteStride = sizeof(ElementType);
+			hr = m_d3dDevice->CreateBuffer(&readback_buffer_desc, NULL, &m_readBackBuffer);
+			if( FAILED( hr ) )
+				return false;
+		}
+
+		// Record the real element count (possibly zero), not the padded one
+		m_gpuSize = m_CPUBuffer->size();
+		return true;
+	}
+
+
+
+public:
+	/**
+	 * Bind the wrapper to a device, a device context and the CPU-side array it tracks.
+	 * No GPU resource is created here; all resource pointers start out null
+	 * and the data is marked as not being on the GPU.
+	 *
+	 * @param d3dDevice        Device stored for later resource creation.
+	 * @param d3dDeviceContext Context stored for later copies.
+	 * @param CPUBuffer        CPU-side array; only the pointer is stored.
+	 * @param readOnly         Stored as the read-only-on-GPU flag.
+	 */
+	btDX11Buffer( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext, btAlignedObjectArray< ElementType > *CPUBuffer, bool readOnly ) :
+		m_d3dDevice( d3dDevice ),
+		m_d3dDeviceContext( d3dDeviceContext ),
+		m_Buffer( 0 ),
+		m_SRV( 0 ),
+		m_UAV( 0 ),
+		m_CPUBuffer( CPUBuffer ),
+		m_readBackBuffer( 0 ),
+		m_gpuSize( 0 ),
+		m_onGPU( false ),
+		m_readOnlyOnGPU( readOnly )
+	{
+	}
+
+	/**
+	 * Release the buffer, both views and the read back buffer.
+	 * SAFE_RELEASE skips null pointers and nulls each pointer after releasing it.
+	 * The CPU-side array is not touched.
+	 */
+	virtual ~btDX11Buffer()
+	{
+		SAFE_RELEASE(m_Buffer);
+		SAFE_RELEASE(m_SRV);
+		SAFE_RELEASE(m_UAV);
+		SAFE_RELEASE(m_readBackBuffer);
+	}
+
+	/**
+	 * Access the shader resource view.
+	 * Returns a reference to the stored pointer, which is null until the
+	 * GPU resources have been created.
+	 */
+	ID3D11ShaderResourceView* &getSRV()
+	{
+		return m_SRV;
+	}
+
+	/**
+	 * Access the unordered access view.
+	 * Returns a reference to the stored pointer, which is null until the
+	 * GPU resources have been created and stays null for read-only buffers.
+	 */
+	ID3D11UnorderedAccessView* &getUAV()
+	{
+		return m_UAV;
+	}
+
+	/**
+	 * Access the underlying Direct3D buffer.
+	 * Returns a reference to the stored pointer, which is null until the
+	 * GPU resources have been created.
+	 */
+	ID3D11Buffer* &getBuffer()
+	{
+		return m_Buffer;
+	}
+
+	/**
+	 * Move the data to the GPU if it is not there already.
+	 *
+	 * The GPU resources are (re)created when they do not exist yet or when
+	 * the CPU array no longer has the size they were created for; the
+	 * contents of the CPU array are then uploaded. Nothing happens when the
+	 * data is already on the GPU or the CPU array is empty.
+	 *
+	 * @return false if the GPU resources could not be created, true otherwise.
+	 */
+	bool moveToGPU()
+	{
+		// Reallocate if GPU size is too small
+		if( (m_CPUBuffer->size() > m_gpuSize ) )
+			m_onGPU = false;
+		if( !m_onGPU && m_CPUBuffer->size() > 0 )
+		{
+			// If the buffer doesn't exist or the CPU-side buffer has changed size,
+			// drop the old resources and create new ones of the right size
+			if( !m_Buffer || (m_CPUBuffer->size() != m_gpuSize) )
+			{
+				SAFE_RELEASE(m_Buffer);
+				SAFE_RELEASE(m_SRV);
+				SAFE_RELEASE(m_UAV);
+				SAFE_RELEASE(m_readBackBuffer);
+				if( !createBuffer() )
+				{
+					// A bare string literal is always true, so the condition
+					// has to be forced false for the assert to fire
+					btAssert(0 && "Buffer creation failed.");
+					return false;
+				}
+			}
+
+			if( m_gpuSize > 0 )
+			{
+				// Upload the whole CPU array into the start of the buffer
+				D3D11_BOX destRegion;
+				destRegion.left = 0;
+				destRegion.front = 0;
+				destRegion.top = 0;
+				destRegion.bottom = 1;
+				destRegion.back = 1;
+				destRegion.right = (m_CPUBuffer->size())*sizeof(ElementType);
+				m_d3dDeviceContext->UpdateSubresource(m_Buffer, 0, &destRegion, &((*m_CPUBuffer)[0]), 0, 0);
+
+				m_onGPU = true;
+			}
+
+		}
+
+		return true;
+	}
+
+	/**
+	 * Move the data back from the GPU if it is on there and isn't read only.
+	 *
+	 * Copies the GPU buffer into the staging buffer, maps it and copies the
+	 * contents into the CPU array, then marks the data as CPU-side.
+	 * Nothing happens when the CPU array is empty, the data is not on the
+	 * GPU, or the buffer is read-only on the GPU.
+	 *
+	 * @return false if there is no staging buffer or it cannot be mapped (the
+	 *         CPU array and the on-GPU state are then left unchanged),
+	 *         true otherwise.
+	 */
+	bool moveFromGPU()
+	{
+		if( m_CPUBuffer->size() > 0 )
+		{
+			if( m_onGPU && !m_readOnlyOnGPU )
+			{
+				// Never read more elements than the GPU resources were created for
+				const int elementCount = ( m_CPUBuffer->size() < m_gpuSize ) ? m_CPUBuffer->size() : m_gpuSize;
+				if( !m_readBackBuffer || elementCount <= 0 )
+					return false;
+				const UINT byteCount = static_cast<UINT>( elementCount ) * static_cast<UINT>( sizeof(ElementType) );
+
+				// Region of the GPU buffer to copy into the staging buffer
+				D3D11_BOX sourceRegion;	
+				sourceRegion.left = 0;
+				sourceRegion.front = 0;
+				sourceRegion.top = 0;
+				sourceRegion.bottom = 1;
+				sourceRegion.back = 1;
+				sourceRegion.right = byteCount;
+
+				m_d3dDeviceContext->CopySubresourceRegion(
+					m_readBackBuffer,
+					0,
+					0,
+					0,
+					0 ,
+					m_Buffer,
+					0,
+					&sourceRegion
+					);
+
+				// A failed Map leaves pData unusable, so bail out before copying from it
+				D3D11_MAPPED_SUBRESOURCE MappedResource = {0}; 
+				HRESULT hr = m_d3dDeviceContext->Map(m_readBackBuffer, 0, D3D11_MAP_READ, 0, &MappedResource);   
+				if( FAILED( hr ) || !MappedResource.pData )
+					return false;
+
+				memcpy(&((*m_CPUBuffer)[0]), MappedResource.pData, byteCount);		
+				m_d3dDeviceContext->Unmap(m_readBackBuffer, 0);
+
+				m_onGPU = false;
+			}
+		}
+
+		return true;
+	}
+
+
+	/**
+	 * Copy the data back from the GPU without changing its state to be CPU-side.
+	 * Useful if we just want to view it on the host for visualization.
+	 *
+	 * Nothing happens when the CPU array is empty, the data is not on the
+	 * GPU, or the buffer is read-only on the GPU.
+	 *
+	 * @return false if there is no staging buffer or it cannot be mapped (the
+	 *         CPU array is then left unchanged), true otherwise.
+	 */
+	bool copyFromGPU()
+	{
+		if( m_CPUBuffer->size() > 0 )
+		{
+			if( m_onGPU && !m_readOnlyOnGPU )
+			{
+				// Never read more elements than the GPU resources were created for
+				const int elementCount = ( m_CPUBuffer->size() < m_gpuSize ) ? m_CPUBuffer->size() : m_gpuSize;
+				if( !m_readBackBuffer || elementCount <= 0 )
+					return false;
+				const UINT byteCount = static_cast<UINT>( elementCount ) * static_cast<UINT>( sizeof(ElementType) );
+
+				// Region of the GPU buffer to copy into the staging buffer
+				D3D11_BOX sourceRegion;	
+				sourceRegion.left = 0;
+				sourceRegion.front = 0;
+				sourceRegion.top = 0;
+				sourceRegion.bottom = 1;
+				sourceRegion.back = 1;
+				sourceRegion.right = byteCount;
+
+				m_d3dDeviceContext->CopySubresourceRegion(
+					m_readBackBuffer,
+					0,
+					0,
+					0,
+					0 ,
+					m_Buffer,
+					0,
+					&sourceRegion
+					);
+
+				// A failed Map leaves pData unusable, so bail out before copying from it
+				D3D11_MAPPED_SUBRESOURCE MappedResource = {0}; 
+				HRESULT hr = m_d3dDeviceContext->Map(m_readBackBuffer, 0, D3D11_MAP_READ, 0, &MappedResource);   
+				if( FAILED( hr ) || !MappedResource.pData )
+					return false;
+
+				memcpy(&((*m_CPUBuffer)[0]), MappedResource.pData, byteCount);		
+				m_d3dDeviceContext->Unmap(m_readBackBuffer, 0);
+			}
+		}
+
+		return true;
+	}
+
+	/**
+	 * Call if data has changed on the CPU.
+	 * Only clears the on-GPU flag; no data is transferred here. The next
+	 * moveToGPU() call then uploads the CPU array again.
+	 */
+	virtual void changedOnCPU()
+	{
+		m_onGPU = false;
+	}
+}; // class btDX11Buffer
+
+
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_BUFFER_DX11_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverLinkData_DX11.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverLinkData_DX11.h
@ -0,0 +1,103 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h"
+#include "btSoftBodySolverBuffer_DX11.h"
+
+
+#ifndef BT_SOFT_BODY_SOLVER_LINK_DATA_DX11_H
+#define BT_SOFT_BODY_SOLVER_LINK_DATA_DX11_H
+
+struct ID3D11Device;
+struct ID3D11DeviceContext;
+
+
+class btSoftBodyLinkDataDX11 : public btSoftBodyLinkData // DX11 flavour of the link data: adds btDX11Buffer members and batching tables
+{
+public:
+	bool				m_onGPU; // NOTE(review): presumably true while the link data lives on the GPU - confirm in the .cpp
+	ID3D11Device		*m_d3dDevice; // NOTE(review): presumably the device passed to the constructor - confirm
+	ID3D11DeviceContext *m_d3dDeviceContext; // NOTE(review): presumably the context passed to the constructor - confirm
+
+
+	btDX11Buffer<LinkNodePair>				m_dx11Links; // DX11 buffer of LinkNodePair elements
+	btDX11Buffer<float>											m_dx11LinkStrength; // the remaining buffers hold float (or Vector3) elements; presumably one entry per link
+	btDX11Buffer<float>											m_dx11LinksMassLSC;
+	btDX11Buffer<float>											m_dx11LinksRestLengthSquared;
+	btDX11Buffer<Vectormath::Aos::Vector3>						m_dx11LinksCLength;
+	btDX11Buffer<float>											m_dx11LinksLengthRatio;
+	btDX11Buffer<float>											m_dx11LinksRestLength;
+	btDX11Buffer<float>											m_dx11LinksMaterialLinearStiffnessCoefficient;
+
+	struct BatchPair // (start, length) pair; element type of m_batchStartLengths below
+	{
+		int start;
+		int length;
+
+		BatchPair() : // default: empty range starting at 0
+			start(0),
+			length(0)
+		{
+		}
+
+		BatchPair( int s, int l ) : 
+			start( s ),
+			length( l )
+		{
+		}
+	};
+
+	/**
+	 * Link addressing information for each cloth.
+	 * Allows link locations to be computed independently of data batching.
+	 */
+	btAlignedObjectArray< int >							m_linkAddresses;
+
+	/**
+	 * Start and length values for computation batches over link data.
+	 */
+	btAlignedObjectArray< BatchPair >		m_batchStartLengths;
+
+
+	//ID3D11Buffer*               readBackBuffer;
+	
+	btSoftBodyLinkDataDX11( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext ); // defined out of line
+
+	virtual ~btSoftBodyLinkDataDX11();
+
+	/** Allocate enough space in all link-related arrays to fit numLinks links */
+	virtual void createLinks( int numLinks );
+	
+	/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
+	virtual void setLinkAt( const LinkDescription &link, int linkIndex );
+
+	virtual bool onAccelerator(); // NOTE(review): defined out of line; presumably reports m_onGPU - confirm
+
+	virtual bool moveToAccelerator(); // NOTE(review): defined out of line; meaning of the bool result is not visible here
+
+	virtual bool moveFromAccelerator(); // NOTE(review): defined out of line; meaning of the bool result is not visible here
+
+	/**
+	 * Generate (and later update) the batching for the entire link set.
+	 * This redoes a lot of work because it batches the entire set when each cloth is inserted.
+	 * In theory we could delay it until just before we need the cloth.
+	 * It's a one-off overhead, though, so that is a later optimisation.
+	 */
+	void generateBatches();
+};
+
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_LINK_DATA_DX11_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverLinkData_DX11SIMDAware.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverLinkData_DX11SIMDAware.h
@ -0,0 +1,173 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h"
+#include "btSoftBodySolverBuffer_DX11.h"
+
+#ifndef BT_ACCELERATED_SOFT_BODY_LINK_DATA_DX11_SIMDAWARE_H
+#define BT_ACCELERATED_SOFT_BODY_LINK_DATA_DX11_SIMDAWARE_H
+
+struct ID3D11Device;
+struct ID3D11DeviceContext;
+
+
+class btSoftBodyLinkDataDX11SIMDAware : public btSoftBodyLinkData // DX11 link data organised per wavefront (see the per-wavefront arrays below)
+{
+public:
+	bool				m_onGPU; // NOTE(review): presumably true while the link data lives on the GPU - confirm in the .cpp
+	ID3D11Device		*m_d3dDevice; // NOTE(review): presumably the device passed to the constructor - confirm
+	ID3D11DeviceContext *m_d3dDeviceContext; // NOTE(review): presumably the context passed to the constructor - confirm
+
+	const int m_wavefrontSize; // const members: can only be set in the constructor's initializer list
+	const int m_linksPerWorkItem;
+	const int m_maxLinksPerWavefront;
+	int m_maxBatchesWithinWave; // returned by getMaxBatchesPerWavefront()
+	int m_maxVerticesWithinWave; // returned by getMaxVerticesPerWavefront()
+	int m_numWavefronts; // returned by getNumWavefronts()
+
+	int m_maxVertex;
+
+	struct NumBatchesVerticesPair // element type of m_numBatchesAndVerticesWithinWaves and of its DX11 buffer
+	{
+		int numBatches;
+		int numVertices;
+	};
+
+	// Array storing number of links in each wavefront
+	btAlignedObjectArray<int>									m_linksPerWavefront;
+	btAlignedObjectArray<NumBatchesVerticesPair>				m_numBatchesAndVerticesWithinWaves; // indexed by wavefront in getNumBatchesAndVerticesWithinWavefront()
+	btDX11Buffer< NumBatchesVerticesPair >						m_dx11NumBatchesAndVerticesWithinWaves;
+
+	// All arrays here will contain batches of m_maxLinksPerWavefront links
+	// ordered by wavefront.
+	// with either global vertex pairs or local vertex pairs
+	btAlignedObjectArray< int >									m_wavefrontVerticesGlobalAddresses; // List of global vertices per wavefront
+	btDX11Buffer<int>											m_dx11WavefrontVerticesGlobalAddresses;
+	btAlignedObjectArray< LinkNodePair >						m_linkVerticesLocalAddresses; // Vertex pair for the link
+	btDX11Buffer<LinkNodePair>									m_dx11LinkVerticesLocalAddresses;
+	btDX11Buffer<float>											m_dx11LinkStrength;
+	btDX11Buffer<float>											m_dx11LinksMassLSC;
+	btDX11Buffer<float>											m_dx11LinksRestLengthSquared;
+	btDX11Buffer<float>											m_dx11LinksRestLength;
+	btDX11Buffer<float>											m_dx11LinksMaterialLinearStiffnessCoefficient;
+
+	struct BatchPair // (start, length) pair; element type of m_wavefrontBatchStartLengths below
+	{
+		int start;
+		int length;
+
+		BatchPair() : // default: empty range starting at 0
+			start(0),
+			length(0)
+		{
+		}
+
+		BatchPair( int s, int l ) : 
+			start( s ),
+			length( l )
+		{
+		}
+	};
+
+	/**
+	 * Link addressing information for each cloth.
+	 * Allows link locations to be computed independently of data batching.
+	 */
+	btAlignedObjectArray< int >							m_linkAddresses;
+
+	/**
+	 * Start and length values for computation batches over link data.
+	 */
+	btAlignedObjectArray< BatchPair >		m_wavefrontBatchStartLengths;
+
+
+	//ID3D11Buffer*               readBackBuffer;
+	
+	btSoftBodyLinkDataDX11SIMDAware( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext ); // defined out of line; must initialise the const members above
+
+	virtual ~btSoftBodyLinkDataDX11SIMDAware();
+
+	/** Allocate enough space in all link-related arrays to fit numLinks links */
+	virtual void createLinks( int numLinks );
+	
+	/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
+	virtual void setLinkAt( const LinkDescription &link, int linkIndex );
+
+	virtual bool onAccelerator(); // NOTE(review): defined out of line; presumably reports m_onGPU - confirm
+
+	virtual bool moveToAccelerator(); // NOTE(review): defined out of line; meaning of the bool result is not visible here
+
+	virtual bool moveFromAccelerator(); // NOTE(review): defined out of line; meaning of the bool result is not visible here
+
+	/**
+	 * Generate (and later update) the batching for the entire link set.
+	 * This redoes a lot of work because it batches the entire set when each cloth is inserted.
+	 * In theory we could delay it until just before we need the cloth.
+	 * It's a one-off overhead, though, so that is a later optimisation.
+	 */
+	void generateBatches();
+
+	int getMaxVerticesPerWavefront() // plain accessor for m_maxVerticesWithinWave
+	{
+		return m_maxVerticesWithinWave;
+	}
+
+	int getWavefrontSize() // plain accessor for m_wavefrontSize
+	{
+		return m_wavefrontSize;
+	}
+
+	int getLinksPerWorkItem() // plain accessor for m_linksPerWorkItem
+	{
+		return m_linksPerWorkItem;
+	}
+
+	int getMaxLinksPerWavefront() // plain accessor for m_maxLinksPerWavefront
+	{
+		return m_maxLinksPerWavefront;
+	}
+
+	int getMaxBatchesPerWavefront() // plain accessor for m_maxBatchesWithinWave
+	{
+		return m_maxBatchesWithinWave;
+	}
+
+	int getNumWavefronts() // plain accessor for m_numWavefronts
+	{
+		return m_numWavefronts;
+	}
+
+	NumBatchesVerticesPair getNumBatchesAndVerticesWithinWavefront( int wavefront ) // returns a copy of the entry at index 'wavefront'; no range check is done here
+	{
+		return m_numBatchesAndVerticesWithinWaves[wavefront];
+	}
+
+	int getVertexGlobalAddresses( int vertexIndex ) // returns the entry at index 'vertexIndex'; no range check is done here
+	{
+		return m_wavefrontVerticesGlobalAddresses[vertexIndex];
+	}
+
+	/**
+	 * Get post-batching local addresses of the vertex pair for a link assuming all vertices used by a wavefront are loaded locally.
+	 */
+	LinkNodePair getVertexPairLocalAddresses( int linkIndex )
+	{
+		return m_linkVerticesLocalAddresses[linkIndex];
+	}
+
+};
+
+
+#endif // #ifndef BT_ACCELERATED_SOFT_BODY_LINK_DATA_DX11_SIMDAWARE_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverTriangleData_DX11.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverTriangleData_DX11.h
@ -0,0 +1,96 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h"
+#include "btSoftBodySolverBuffer_DX11.h"
+
+
+#ifndef BT_SOFT_BODY_SOLVER_TRIANGLE_DATA_DX11_H
+#define BT_SOFT_BODY_SOLVER_TRIANGLE_DATA_DX11_H
+
+struct ID3D11Device;
+struct ID3D11DeviceContext;
+
+class btSoftBodyTriangleDataDX11 : public btSoftBodyTriangleData // DX11 flavour of the triangle data: adds btDX11Buffer members and batching tables
+{
+public:
+	bool				m_onGPU; // NOTE(review): presumably true while the triangle data lives on the GPU - confirm in the .cpp
+	ID3D11Device		*m_d3dDevice; // NOTE(review): presumably the device passed to the constructor - confirm
+	ID3D11DeviceContext *m_d3dDeviceContext; // NOTE(review): presumably the context passed to the constructor - confirm
+
+	btDX11Buffer<btSoftBodyTriangleData::TriangleNodeSet>							m_dx11VertexIndices; // DX11 buffer of TriangleNodeSet elements
+	btDX11Buffer<float>									m_dx11Area; // DX11 buffer of float elements
+	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11Normal; // DX11 buffer of Vector3 elements
+
+	struct BatchPair // (start, length) pair; element type of m_batchStartLengths below
+	{
+		int start;
+		int length;
+
+		BatchPair() : // default: empty range starting at 0
+			start(0),
+			length(0)
+		{
+		}
+
+		BatchPair( int s, int l ) : 
+			start( s ),
+			length( l )
+		{
+		}
+	};
+
+
+	/**
+	 * Triangle addressing information for each cloth.
+	 * Allows triangle locations to be computed independently of data batching.
+	 */
+	btAlignedObjectArray< int >							m_triangleAddresses;
+
+	/**
+	 * Start and length values for computation batches over triangle data.
+	 */
+	btAlignedObjectArray< BatchPair >		m_batchStartLengths;
+
+	//ID3D11Buffer*               readBackBuffer;
+
+public:
+	btSoftBodyTriangleDataDX11( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext ); // defined out of line
+
+	virtual ~btSoftBodyTriangleDataDX11();
+
+
+	/** Allocate enough space in all triangle-related arrays to fit numTriangles triangles */
+	virtual void createTriangles( int numTriangles );
+	
+	/** Insert the triangle described into the correct data structures assuming space has already been allocated by a call to createTriangles */
+	virtual void setTriangleAt( const btSoftBodyTriangleData::TriangleDescription &triangle, int triangleIndex );
+
+	virtual bool onAccelerator(); // NOTE(review): defined out of line; presumably reports m_onGPU - confirm
+	virtual bool moveToAccelerator(); // NOTE(review): defined out of line; meaning of the bool result is not visible here
+
+	virtual bool moveFromAccelerator(); // NOTE(review): defined out of line; meaning of the bool result is not visible here
+	/**
+	 * Generate (and later update) the batching for the entire triangle set.
+	 * This redoes a lot of work because it batches the entire set when each cloth is inserted.
+	 * In theory we could delay it until just before we need the cloth.
+	 * It's a one-off overhead, though, so that is a later optimisation.
+	 */
+	void generateBatches();
+};
+
+
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_TRIANGLE_DATA_DX11_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverVertexBuffer_DX11.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverVertexBuffer_DX11.h
@ -0,0 +1,107 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_DX11_H
+#define BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_DX11_H 
+
+
+#include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
+
+#include <windows.h>
+#include <crtdbg.h>
+#include <d3d11.h>
+#include <d3dx11.h>
+#include <d3dcompiler.h>
+
+/**
+ * Vertex buffer descriptor for a DX11 buffer: bundles the device context, the
+ * buffer and its UAV together with the offset/stride fields of the base class.
+ * The pointers are stored as given; this class never releases them.
+ */
+class btDX11VertexBufferDescriptor : public btVertexBufferDescriptor
+{
+protected:
+	/** Device context associated with the vertex buffer. */
+	ID3D11DeviceContext* m_context;
+	/** The DX11 vertex buffer itself. */
+	ID3D11Buffer* m_vertexBuffer;
+	/** Unordered access view onto m_vertexBuffer. */
+	ID3D11UnorderedAccessView*  m_vertexBufferUAV;
+
+
+public:
+	/**
+	 * Positions-only descriptor.
+	 * buffer is the DX11 buffer that receives the vertex data.
+	 * UAV is the UAV representation of that buffer, laid out in floats.
+	 * vertexOffset is the offset, in floats, to the first vertex.
+	 * vertexStride is the stride, in floats, between vertices.
+	 */
+	btDX11VertexBufferDescriptor( ID3D11DeviceContext* context, ID3D11Buffer* buffer, ID3D11UnorderedAccessView *UAV, int vertexOffset, int vertexStride ) :
+		m_context( context ),
+		m_vertexBuffer( buffer ),
+		m_vertexBufferUAV( UAV )
+	{
+		// Inherited fields cannot appear in the initializer list.
+		m_hasVertexPositions = true;
+		m_vertexOffset = vertexOffset;
+		m_vertexStride = vertexStride;
+	}
+
+	/**
+	 * Positions-and-normals descriptor.
+	 * buffer is the DX11 buffer that receives the vertex data.
+	 * UAV is the UAV representation of that buffer, laid out in floats.
+	 * vertexOffset is the offset, in floats, to the first vertex.
+	 * vertexStride is the stride, in floats, between vertices.
+	 * normalOffset is the offset, in floats, to the first normal.
+	 * normalStride is the stride, in floats, between normals.
+	 */
+	btDX11VertexBufferDescriptor( ID3D11DeviceContext* context, ID3D11Buffer* buffer, ID3D11UnorderedAccessView *UAV, int vertexOffset, int vertexStride, int normalOffset, int normalStride ) :
+		m_context( context ),
+		m_vertexBuffer( buffer ),
+		m_vertexBufferUAV( UAV )
+	{
+		// Inherited fields cannot appear in the initializer list.
+		m_hasVertexPositions = true;
+		m_vertexOffset = vertexOffset;
+		m_vertexStride = vertexStride;
+
+		m_hasNormals = true;
+		m_normalOffset = normalOffset;
+		m_normalStride = normalStride;
+	}
+
+	/** Nothing to release: the descriptor only stores the pointers it was given. */
+	virtual ~btDX11VertexBufferDescriptor() {}
+
+	/** Identifies this descriptor as the DX11 kind. */
+	virtual BufferTypes getBufferType() const { return DX11_BUFFER; }
+
+	/** Device context supplied at construction. */
+	virtual ID3D11DeviceContext* getContext() const { return m_context; }
+
+	/** DX11 buffer supplied at construction. */
+	virtual ID3D11Buffer* getbtDX11Buffer() const { return m_vertexBuffer; }
+
+	/** UAV supplied at construction. */
+	virtual ID3D11UnorderedAccessView* getDX11UAV() const { return m_vertexBufferUAV; }
+};
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_DX11_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverVertexData_DX11.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverVertexData_DX11.h
@ -0,0 +1,63 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h"
+#include "btSoftBodySolverBuffer_DX11.h"
+
+
+#ifndef BT_SOFT_BHODY_SOLVER_VERTEX_DATA_DX11_H
+#define BT_SOFT_BHODY_SOLVER_VERTEX_DATA_DX11_H
+
+class btSoftBodyLinkData;
+// No declaration of btSoftBodyLinkData::LinkDescription here: a nested class cannot be
+// forward-declared through a qualified name (GCC and Clang reject it), and nothing in
+// this header refers to it.
+
+struct ID3D11Device;
+struct ID3D11DeviceContext;
+
+/**
+ * DX11 flavour of the vertex data: adds one btDX11Buffer per vertex attribute
+ * on top of btSoftBodyVertexData. All member functions are defined out of line.
+ */
+class btSoftBodyVertexDataDX11 : public btSoftBodyVertexData
+{
+protected:
+	bool				m_onGPU; // NOTE(review): presumably true while the vertex data lives on the GPU - confirm in the .cpp
+	ID3D11Device		*m_d3dDevice;
+	ID3D11DeviceContext *m_d3dDeviceContext;
+
+public:
+	// DX11 buffers, one per vertex attribute; the element type of each is given by its template argument.
+	btDX11Buffer<int>										m_dx11ClothIdentifier;
+	btDX11Buffer<Vectormath::Aos::Point3>					m_dx11VertexPosition;
+	btDX11Buffer<Vectormath::Aos::Point3>					m_dx11VertexPreviousPosition;
+	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11VertexVelocity;
+	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11VertexForceAccumulator;
+	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11VertexNormal;
+	btDX11Buffer<float>									m_dx11VertexInverseMass;
+	btDX11Buffer<float>									m_dx11VertexArea;
+	btDX11Buffer<int>										m_dx11VertexTriangleCount;
+
+
+	//ID3D11Buffer*               readBackBuffer;
+
+public:
+	btSoftBodyVertexDataDX11( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext );
+	virtual ~btSoftBodyVertexDataDX11();
+
+	virtual bool onAccelerator();
+	virtual bool moveToAccelerator();
+
+	// NOTE(review): the effect of bCopy / bCopyMinimum is decided in the .cpp; by default
+	// bCopy is false and bCopyMinimum is true.
+	virtual bool moveFromAccelerator(bool bCopy = false, bool bCopyMinimum = true);
+};
+
+
+#endif // #ifndef BT_SOFT_BHODY_SOLVER_VERTEX_DATA_DX11_H
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
@ -0,0 +1,691 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_ACCELERATED_SOFT_BODY_DX11_SOLVER_H
+#define BT_ACCELERATED_SOFT_BODY_DX11_SOLVER_H
+
+
+#include "vectormath/vmInclude.h"
+#include "BulletSoftBody/btSoftBodySolvers.h"
+#include "btSoftBodySolverVertexBuffer_DX11.h"
+#include "btSoftBodySolverLinkData_DX11.h"
+#include "btSoftBodySolverVertexData_DX11.h"
+#include "btSoftBodySolverTriangleData_DX11.h"
+
+
+
+/**
+ * Small bundle of the DX11 objects needed to build compute shaders:
+ * the device, its context and a shader-compile entry point.
+ */
+class DXFunctions
+{
+public:
+	
+	/** Shader-compile callback; the parameter list matches D3DX11CompileFromMemory. */
+	typedef HRESULT (WINAPI * CompileFromMemoryFunc)(LPCSTR,SIZE_T,LPCSTR,const D3D10_SHADER_MACRO*,LPD3D10INCLUDE,LPCSTR,LPCSTR,UINT,UINT,ID3DX11ThreadPump*,ID3D10Blob**,ID3D10Blob**,HRESULT*);
+
+	ID3D11Device *		 m_dx11Device;
+	ID3D11DeviceContext* m_dx11Context;
+	CompileFromMemoryFunc m_dx11CompileFromMemory;
+
+	/** Stores the three arguments as given; nothing else happens at construction. */
+	DXFunctions(ID3D11Device *dx11Device, ID3D11DeviceContext* dx11Context, CompileFromMemoryFunc dx11CompileFromMemory)
+		: m_dx11Device( dx11Device )
+		, m_dx11Context( dx11Context )
+		, m_dx11CompileFromMemory( dx11CompileFromMemory )
+	{
+	}
+
+	/**
+	 * A compute shader together with its constant buffer.
+	 * Both pointers start out null and are not released by the destructor.
+	 */
+	class KernelDesc
+	{
+	public:
+		ID3D11ComputeShader* kernel;
+		ID3D11Buffer* constBuffer;
+
+		KernelDesc() : kernel( 0 ), constBuffer( 0 )
+		{
+		}
+
+		virtual ~KernelDesc()
+		{
+			// TODO: this should probably destroy its kernel but we need to be careful
+			// in case KernelDescs are copied
+		}
+	}; 
+
+	/**
+	 * Compile a compute shader kernel from a string and return the appropriate KernelDesc object.
+	 * Defined out of line.
+	 */
+	KernelDesc compileComputeShaderFromString( const char* shaderString, const char* shaderName, int constBufferSize, D3D10_SHADER_MACRO *compileMacros = 0 );
+
+};
+
+class btDX11SoftBodySolver : public btSoftBodySolver
+{
+protected:
+	/**
+	 * Entry in the collision shape array.
+	 * Specifies the shape type, the transform matrix and the necessary details of the collisionShape.
+	 * Every scalar field starts at zero; the Vectormath members are left to
+	 * their own default constructors.
+	 */
+	struct CollisionShapeDescription
+	{
+		Vectormath::Aos::Transform3 shapeTransform;
+		Vectormath::Aos::Vector3 linearVelocity;
+		Vectormath::Aos::Vector3 angularVelocity;
+
+		int softBodyIdentifier;
+		int collisionShapeType;
+	
+		// Both needed for capsule
+		float radius;
+		float halfHeight;
+		
+		float margin;
+		float friction;
+
+		CollisionShapeDescription()
+		{
+			// Zero every scalar so that no indeterminate value is ever read
+			// or copied into a buffer built from these entries.
+			softBodyIdentifier = 0;
+			collisionShapeType = 0;
+			radius = 0;
+			halfHeight = 0;
+			margin = 0;
+			friction = 0;
+		}
+	};
+
+	/**
+	 * Three unsigned components plus one padding word (four unsigned ints in total).
+	 * The padding word is always written as zero.
+	 */
+	struct UIntVector3
+	{
+		/** All components zero. */
+		UIntVector3() :
+			x( 0 ),
+			y( 0 ),
+			z( 0 ),
+			_padding( 0 )
+		{
+		}
+		
+		/** Components taken from the arguments; padding zero. */
+		UIntVector3( unsigned int x_, unsigned int y_, unsigned int z_ ) :
+			x( x_ ),
+			y( y_ ),
+			z( z_ ),
+			_padding( 0 )
+		{
+		}
+			
+		unsigned int x;
+		unsigned int y;
+		unsigned int z;
+		unsigned int _padding;
+	};
+
+
+
+public:
+	/**
+	 * SoftBody class to maintain information about a soft body instance
+	 * within a solver.
+	 * This data addresses the main solver arrays.
+	 *
+	 * Apart from updateBounds (defined out of line) the class is pure
+	 * bookkeeping: every setter stores its argument unchecked and every
+	 * getter returns the stored value. All getters are const.
+	 */
+	class btAcceleratedSoftBodyInterface
+	{
+	protected:
+		/** Current number of vertices that are part of this cloth */
+		int m_numVertices;
+		/** Maximum number of vertices allocated to be part of this cloth */
+		int m_maxVertices;
+		/** Current number of triangles that are part of this cloth */
+		int m_numTriangles;
+		/** Maximum number of triangles allocated to be part of this cloth */
+		int m_maxTriangles;
+		/** Index of first vertex in the world allocated to this cloth */
+		int m_firstVertex;
+		/** Index of first triangle in the world allocated to this cloth */
+		int m_firstTriangle;
+		/** Index of first link in the world allocated to this cloth */
+		int m_firstLink;
+		/** Maximum number of links allocated to this cloth */
+		int m_maxLinks;
+		/** Current number of links allocated to this cloth */
+		int m_numLinks;
+
+		/** The actual soft body this data represents */
+		btSoftBody *m_softBody;
+
+
+	public:
+		/** Starts with every count and index at zero and remembers softBody (not owned). */
+		btAcceleratedSoftBodyInterface( btSoftBody *softBody ) :
+			m_numVertices( 0 ),
+			m_maxVertices( 0 ),
+			m_numTriangles( 0 ),
+			m_maxTriangles( 0 ),
+			m_firstVertex( 0 ),
+			m_firstTriangle( 0 ),
+			m_firstLink( 0 ),
+			m_maxLinks( 0 ),
+			m_numLinks( 0 ),
+			m_softBody( softBody )
+		{
+		}
+
+		int getNumVertices() const
+		{
+			return m_numVertices;
+		}
+
+		int getNumTriangles() const
+		{
+			return m_numTriangles;
+		}
+
+		int getMaxVertices() const
+		{
+			return m_maxVertices;
+		}
+
+		int getMaxTriangles() const
+		{
+			return m_maxTriangles;
+		}
+
+		int getFirstVertex() const
+		{
+			return m_firstVertex;
+		}
+
+		int getFirstTriangle() const
+		{
+			return m_firstTriangle;
+		}
+
+
+		/**
+		 * Update the bounds in the btSoftBody object
+		 */
+		void updateBounds( const btVector3 &lowerBound, const btVector3 &upperBound );
+
+		
+		// TODO: All of these set functions will have to do checks and
+		// update the world because restructuring of the arrays will be necessary
+		// Reasonable use of "friend"?
+		void setNumVertices( int numVertices )
+		{
+			m_numVertices = numVertices;
+		}	
+	
+		void setNumTriangles( int numTriangles )
+		{
+			m_numTriangles = numTriangles;
+		}
+
+		void setMaxVertices( int maxVertices )
+		{
+			m_maxVertices = maxVertices;
+		}
+
+		void setMaxTriangles( int maxTriangles )
+		{
+			m_maxTriangles = maxTriangles;
+		}
+
+		void setFirstVertex( int firstVertex )
+		{
+			m_firstVertex = firstVertex;
+		}
+
+		void setFirstTriangle( int firstTriangle )
+		{
+			m_firstTriangle = firstTriangle;
+		}
+
+		void setMaxLinks( int maxLinks )
+		{
+			m_maxLinks = maxLinks;
+		}
+
+		void setNumLinks( int numLinks )
+		{
+			m_numLinks = numLinks;
+		}
+
+		void setFirstLink( int firstLink )
+		{
+			m_firstLink = firstLink;
+		}
+
+		// The link getters are const like the vertex/triangle getters above,
+		// so they can be called through a const reference or pointer as well.
+		int getMaxLinks() const
+		{
+			return m_maxLinks;
+		}
+
+		int getNumLinks() const
+		{
+			return m_numLinks;
+		}
+
+		int getFirstLink() const
+		{
+			return m_firstLink;
+		}
+
+		/** The soft body given at construction; the pointee stays mutable. */
+		btSoftBody* getSoftBody() const
+		{
+			return m_softBody;
+		}
+
+	};
+
+	
+	/** Pair of object indices (first, end) with no default constructor. */
+	struct CollisionObjectIndices
+	{
+		/** f becomes firstObject, e becomes endObject. */
+		CollisionObjectIndices( int f, int e ) :
+			firstObject( f ),
+			endObject( e )
+		{
+		}
+
+		int firstObject;
+		int endObject;
+	};
+
+
+
+
+
+	struct PrepareLinksCB // four 4-byte fields; NOTE(review): the *CB structs presumably mirror the kernels' constant buffers - field order must then match the shader side
+	{		
+		int numLinks;
+		int padding0; // padding0..2: filler bringing the struct to four fields
+		int padding1;
+		int padding2;
+	};
+
+	struct SolvePositionsFromLinksKernelCB // four 4-byte fields, no padding needed
+	{		
+		int startLink;
+		int numLinks;
+		float kst;
+		float ti;
+	};
+
+	struct IntegrateCB // two values plus two filler ints
+	{
+		int numNodes;
+		float solverdt;
+		int padding1;
+		int padding2;
+	};
+
+	struct UpdatePositionsFromVelocitiesCB // two values plus two filler ints
+	{
+		int numNodes;
+		float solverSDT;
+		int padding1;
+		int padding2;
+	};
+
+	struct UpdateVelocitiesFromPositionsWithoutVelocitiesCB // two values plus two filler ints
+	{
+		int numNodes;
+		float isolverdt;
+		int padding1;
+		int padding2;
+	};
+
+	struct UpdateVelocitiesFromPositionsWithVelocitiesCB // same fields as the WithoutVelocities variant above
+	{
+		int numNodes;
+		float isolverdt;
+		int padding1;
+		int padding2;
+	};
+
+	struct UpdateSoftBodiesCB // four 4-byte fields, no padding needed
+	{
+		int numNodes;
+		int startFace;
+		int numFaces;
+		float epsilon;
+	};
+
+
+	struct ApplyForcesCB // three values plus one filler int; numNodes is unsigned here
+	{
+		unsigned int numNodes;
+		float solverdt;
+		float epsilon;
+		int padding3;
+	};
+
+	struct AddVelocityCB // eight 4-byte fields: five values plus three filler ints
+	{
+		int startNode;
+		int lastNode;
+		float velocityX;
+		float velocityY;
+		float velocityZ;
+		int padding1;
+		int padding2;
+		int padding3;
+	};
+
+	struct VSolveLinksCB // three values plus one filler int
+	{
+		int startLink;
+		int numLinks;
+		float kst;
+		int padding;
+	};
+
+	struct ComputeBoundsCB // two values plus two filler ints
+	{
+		int numNodes;
+		int numSoftBodies;
+		int padding1;
+		int padding2;
+	};
+
+	struct SolveCollisionsAndUpdateVelocitiesCB // two values plus two filler ints; numNodes is unsigned here
+	{
+		unsigned int numNodes;
+		float isolverdt;
+		int padding0;
+		int padding1;
+	};
+
+	
+
+
+protected:
+	ID3D11Device *		 m_dx11Device;
+	ID3D11DeviceContext* m_dx11Context;
+	
+	DXFunctions dxFunctions;
+public:
+	/** Link data for all cloths. Note that this will be sorted batch-wise for efficient computation and m_linkAddresses will maintain the addressing. */
+	btSoftBodyLinkDataDX11 m_linkData;
+	btSoftBodyVertexDataDX11 m_vertexData;
+	btSoftBodyTriangleDataDX11 m_triangleData;
+
+protected:
+
+	/** Variable to define whether we need to update solver constants on the next iteration */
+	bool m_updateSolverConstants;
+
+	bool m_shadersInitialized;
+
+	/** 
+	 * Cloths owned by this solver.
+	 * Only our cloths are in this array.
+	 */
+	btAlignedObjectArray< btAcceleratedSoftBodyInterface * > m_softBodySet;
+
+	/** Acceleration value to be applied to all non-static vertices in the solver. 
+	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
+	 */
+	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_perClothAcceleration;
+	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11PerClothAcceleration;
+
+	/** Wind velocity to be applied normal to all non-static vertices in the solver. 
+	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
+	 */
+	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_perClothWindVelocity;
+	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11PerClothWindVelocity;
+
+	/** Velocity damping factor */
+	btAlignedObjectArray< float >						m_perClothDampingFactor;
+	btDX11Buffer<float>									m_dx11PerClothDampingFactor;
+
+	/** Velocity correction coefficient */
+	btAlignedObjectArray< float >						m_perClothVelocityCorrectionCoefficient;
+	btDX11Buffer<float>									m_dx11PerClothVelocityCorrectionCoefficient;
+
+	/** Lift parameter for wind effect on cloth. */
+	btAlignedObjectArray< float >						m_perClothLiftFactor;
+	btDX11Buffer<float>									m_dx11PerClothLiftFactor;
+	
+	/** Drag parameter for wind effect on cloth. */
+	btAlignedObjectArray< float >						m_perClothDragFactor;
+	btDX11Buffer<float>									m_dx11PerClothDragFactor;
+
+	/** Density of the medium in which each cloth sits */
+	btAlignedObjectArray< float >						m_perClothMediumDensity;
+	btDX11Buffer<float>									m_dx11PerClothMediumDensity;
+
+	
+	/** 
+	 * Collision shape details: pair of index of first collision shape for the cloth and number of collision objects.
+	 */
+	btAlignedObjectArray< CollisionObjectIndices >		m_perClothCollisionObjects;
+	btDX11Buffer<CollisionObjectIndices>				m_dx11PerClothCollisionObjects;
+
+	/** 
+	 * Collision shapes being passed across to the cloths in this solver.
+	 */
+	btAlignedObjectArray< CollisionShapeDescription >	m_collisionObjectDetails;
+	btDX11Buffer< CollisionShapeDescription >			m_dx11CollisionObjectDetails;
+
+	/** 
+	 * Minimum bounds for each cloth.
+	 * Updated by GPU and returned for use by broad phase.
+	 * These are int vectors as a reminder that they store the int representation of a float, not a float.
+	 * Bit 31 is inverted - i.e. floats are stored with int-sortable values.
+	 */
+	btAlignedObjectArray< UIntVector3 >	m_perClothMinBounds;
+	btDX11Buffer< UIntVector3 >			m_dx11PerClothMinBounds;
+
+	/** 
+	 * Maximum bounds for each cloth.
+	 * Updated by GPU and returned for use by broad phase.
+	 * These are int vectors as a reminder that they store the int representation of a float, not a float.
+	 * Bit 31 is inverted - i.e. floats are stored with int-sortable values.
+	 */
+	btAlignedObjectArray< UIntVector3 >	m_perClothMaxBounds;
+	btDX11Buffer< UIntVector3 >			m_dx11PerClothMaxBounds;
+
+	
+	/** 
+	 * Friction coefficient for each cloth
+	 */
+	btAlignedObjectArray< float >	m_perClothFriction;
+	btDX11Buffer< float >			m_dx11PerClothFriction;
+
+	DXFunctions::KernelDesc		prepareLinksKernel;
+	DXFunctions::KernelDesc		solvePositionsFromLinksKernel;
+	DXFunctions::KernelDesc		vSolveLinksKernel;
+	DXFunctions::KernelDesc		integrateKernel;
+	DXFunctions::KernelDesc		addVelocityKernel;
+	DXFunctions::KernelDesc		updatePositionsFromVelocitiesKernel;
+	DXFunctions::KernelDesc		updateVelocitiesFromPositionsWithoutVelocitiesKernel;
+	DXFunctions::KernelDesc		updateVelocitiesFromPositionsWithVelocitiesKernel;
+	DXFunctions::KernelDesc		solveCollisionsAndUpdateVelocitiesKernel;
+	DXFunctions::KernelDesc		resetNormalsAndAreasKernel;
+	DXFunctions::KernelDesc		normalizeNormalsAndAreasKernel;
+	DXFunctions::KernelDesc		computeBoundsKernel;
+	DXFunctions::KernelDesc		updateSoftBodiesKernel;
+
+	DXFunctions::KernelDesc		applyForcesKernel;
+
+	bool	m_enableUpdateBounds;
+
+	/**
+	 * Integrate motion on the solver.
+	 */
+	virtual void integrate( float solverdt );
+	float computeTriangleArea( 
+		const Vectormath::Aos::Point3 &vertex0,
+		const Vectormath::Aos::Point3 &vertex1,
+		const Vectormath::Aos::Point3 &vertex2 );
+
+
+	virtual bool buildShaders();
+
+	void resetNormalsAndAreas( int numVertices );
+
+	void normalizeNormalsAndAreas( int numVertices );
+
+	void executeUpdateSoftBodies( int firstTriangle, int numTriangles );
+
+	void prepareCollisionConstraints();
+
+	Vectormath::Aos::Vector3 ProjectOnAxis( const Vectormath::Aos::Vector3 &v, const Vectormath::Aos::Vector3 &a );
+
+	void ApplyClampedForce( float solverdt, const Vectormath::Aos::Vector3 &force, const Vectormath::Aos::Vector3 &vertexVelocity, float inverseMass, Vectormath::Aos::Vector3 &vertexForce );
+
+	virtual void applyForces( float solverdt );
+	
+	virtual void updateConstants( float timeStep );
+	int findSoftBodyIndex( const btSoftBody* const softBody );
+
+	//////////////////////////////////////
+	// Kernel dispatches
+	virtual void prepareLinks();
+
+	void updatePositionsFromVelocities( float solverdt );
+	void solveLinksForPosition( int startLink, int numLinks, float kst, float ti );
+	void solveLinksForVelocity( int startLink, int numLinks, float kst );
+	
+	void updateVelocitiesFromPositionsWithVelocities( float isolverdt );
+	void updateVelocitiesFromPositionsWithoutVelocities( float isolverdt );
+	void computeBounds( );
+	void solveCollisionsAndUpdateVelocities( float isolverdt );
+
+	// End kernel dispatches
+	/////////////////////////////////////
+
+	void updateBounds();
+
+	
+	void releaseKernels();
+
+public:
+	btDX11SoftBodySolver(ID3D11Device * dx11Device, ID3D11DeviceContext* dx11Context, DXFunctions::CompileFromMemoryFunc dx11CompileFromMemory = &D3DX11CompileFromMemory);
+
+	virtual ~btDX11SoftBodySolver();
+	
+	
+	virtual SolverTypes getSolverType() const
+	{
+		return DX_SOLVER;
+	}
+
+	/**
+	 * Store the given flag in m_enableUpdateBounds.
+	 * NOTE(review): presumably this switches the per-cloth bounds update on or off - confirm against the solver implementation.
+	 */
+	void	setEnableUpdateBounds(bool enableBounds)
+	{
+		m_enableUpdateBounds = enableBounds;
+	}
+	/**
+	 * Return the current value of m_enableUpdateBounds.
+	 */
+	bool getEnableUpdateBounds() const
+	{
+		return  m_enableUpdateBounds;
+	}
+
+
+
+	virtual btSoftBodyLinkData &getLinkData();
+
+	virtual btSoftBodyVertexData &getVertexData();
+
+	virtual btSoftBodyTriangleData &getTriangleData();
+
+
+
+	
+
+	btAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );
+	const btAcceleratedSoftBodyInterface * const findSoftBodyInterface( const btSoftBody* const softBody ) const;
+
+	virtual bool checkInitialized();
+
+	virtual void updateSoftBodies( );
+
+	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false);
+
+	virtual void copyBackToSoftBodies(bool bMove = true);
+
+	virtual void solveConstraints( float solverdt );
+
+	virtual void predictMotion( float solverdt );
+
+	
+	virtual void processCollision( btSoftBody *, const btCollisionObjectWrapper* );
+
+	virtual void processCollision( btSoftBody*, btSoftBody* );
+
+};
+
+
+
+/** 
+ * Class to manage movement of data from a solver to a given target.
+ * This version is the DX to CPU version.
+ */
+class btSoftBodySolverOutputDXtoCPU : public btSoftBodySolverOutput
+{
+protected:
+	// No protected members: this output class declares no state of its own.
+
+public:
+	/** Default constructor; nothing to initialise. */
+	btSoftBodySolverOutputDXtoCPU()
+	{
+	}
+
+	/**
+	 * Output the current computed vertex data of the given soft body into the given vertex buffer.
+	 * @param softBody     Soft body whose vertex data is copied out.
+	 * @param vertexBuffer Descriptor of the destination vertex buffer.
+	 * NOTE(review): presumably the descriptor must describe a CPU-side buffer for this class - confirm in the implementation.
+	 */
+	virtual void copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer );
+};
+
+/** 
+ * Class to manage movement of data from a solver to a given target.
+ * This version is the DX to DX version and subclasses DX to CPU so that it works for that too.
+ */
+class btSoftBodySolverOutputDXtoDX : public btSoftBodySolverOutputDXtoCPU
+{
+protected:
+	/**
+	 * Parameter block for the output-to-vertex-array kernels.
+	 * NOTE(review): presumably uploaded as a constant buffer, with padding1/padding2
+	 * rounding the size up to a multiple of 16 bytes - confirm against the shader source.
+	 */
+	struct OutputToVertexArrayCB
+	{
+		int startNode;
+		int numNodes;
+		int positionOffset;
+		int positionStride;
+
+		int normalOffset;
+		int normalStride;
+		int padding1;
+		int padding2;
+	};
+
+	/** Helper object built from the device, the context and the shader-compile callback. */
+	DXFunctions dxFunctions;
+	/** Kernel descriptor for the variant that writes positions and normals. */
+	DXFunctions::KernelDesc outputToVertexArrayWithNormalsKernel;
+	/** Kernel descriptor for the variant that writes positions only. */
+	DXFunctions::KernelDesc outputToVertexArrayWithoutNormalsKernel;
+
+	/** Starts out false; see checkInitialized() / buildShaders(). */
+	bool m_shadersInitialized;
+
+	bool checkInitialized();
+	bool buildShaders();
+	void releaseKernels();
+
+public:
+	/**
+	 * @param dx11Device            Device handed to the DXFunctions helper.
+	 * @param dx11Context           Device context handed to the DXFunctions helper.
+	 * @param dx11CompileFromMemory Shader compile entry point; defaults to D3DX11CompileFromMemory.
+	 */
+	btSoftBodySolverOutputDXtoDX(ID3D11Device *dx11Device, ID3D11DeviceContext* dx11Context, DXFunctions::CompileFromMemoryFunc dx11CompileFromMemory = &D3DX11CompileFromMemory) :
+		dxFunctions( dx11Device, dx11Context, dx11CompileFromMemory ),
+		m_shadersInitialized( false )
+	{
+	}
+
+	/** Releases the kernels owned by this output object. */
+	~btSoftBodySolverOutputDXtoDX() { releaseKernels(); }
+
+	/** Output the current computed vertex data of the given soft body into the given vertex buffer. */
+	virtual void copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer );
+};
+
+#endif // #ifndef BT_ACCELERATED_SOFT_BODY_DX11_SOLVER_H
+
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.cpp
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.h
@ -0,0 +1,81 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "vectormath/vmInclude.h"
+#include "btSoftBodySolver_DX11.h"
+#include "btSoftBodySolverVertexBuffer_DX11.h"
+#include "btSoftBodySolverLinkData_DX11SIMDAware.h"
+#include "btSoftBodySolverVertexData_DX11.h"
+#include "btSoftBodySolverTriangleData_DX11.h"
+
+
+#ifndef BT_SOFT_BODY_DX11_SOLVER_SIMDAWARE_H
+#define BT_SOFT_BODY_DX11_SOLVER_SIMDAWARE_H
+
+/**
+ * DX11 soft body solver variant that derives from btDX11SoftBodySolver and
+ * keeps its own link data of type btSoftBodyLinkDataDX11SIMDAware.
+ * It reports its solver type as DX_SIMD_SOLVER.
+ */
+class btDX11SIMDAwareSoftBodySolver : public btDX11SoftBodySolver
+{
+protected:
+	/**
+	 * Parameters for the solve-positions-from-links kernel.
+	 * Work is addressed in waves (startWave/numWaves) rather than in individual links.
+	 * NOTE(review): presumably uploaded as a constant buffer; kst and ti look like the
+	 * stiffness and iteration factors passed to solveLinksForPosition - confirm in the .cpp.
+	 */
+	struct SolvePositionsFromLinksKernelCB
+	{		
+		int startWave;
+		int numWaves;
+		float kst;
+		float ti;
+	};
+
+
+	/** Link data for all cloths. Note that this will be sorted batch-wise for efficient computation and m_linkAddresses will maintain the addressing. */
+	btSoftBodyLinkDataDX11SIMDAware m_linkData;
+		
+	/** Variable to define whether we need to update solver constants on the next iteration */
+	bool m_updateSolverConstants;
+
+	
+	/** Build the shaders used by this solver; returns a bool status. */
+	virtual bool buildShaders();
+
+	/** Update the solver constants for the given time step. */
+	void updateConstants( float timeStep );
+
+
+	//////////////////////////////////////
+	// Kernel dispatches
+	
+
+	/**
+	 * Dispatch the position-based link solve.
+	 * NOTE(review): the base-class declaration of solveLinksForPosition in
+	 * btSoftBodySolver_DX11.h is not virtual, so this declaration hides it rather
+	 * than overriding it - confirm that all callers go through this class.
+	 */
+	void solveLinksForPosition( int startLink, int numLinks, float kst, float ti );
+
+	// End kernel dispatches
+	/////////////////////////////////////
+
+
+
+public:
+	/**
+	 * @param dx11Device            D3D11 device.
+	 * @param dx11Context           D3D11 device context.
+	 * @param dx11CompileFromMemory Shader compile entry point; defaults to D3DX11CompileFromMemory.
+	 */
+	btDX11SIMDAwareSoftBodySolver(ID3D11Device * dx11Device, ID3D11DeviceContext* dx11Context, DXFunctions::CompileFromMemoryFunc dx11CompileFromMemory = &D3DX11CompileFromMemory);
+
+	virtual ~btDX11SIMDAwareSoftBodySolver();
+
+	/** Access the link data held by this solver. */
+	virtual btSoftBodyLinkData &getLinkData();
+
+	/** Prepare the solver for the given soft bodies; forceUpdate defaults to false. */
+	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false);
+
+	/** Solve the constraints for a step of length solverdt. */
+	virtual void solveConstraints( float solverdt );
+	
+	/** Identify this solver as the SIMD-aware DX11 variant. */
+	virtual SolverTypes getSolverType() const
+	{
+		return DX_SIMD_SOLVER;
+	}
+	
+};
+
+#endif // #ifndef BT_SOFT_BODY_DX11_SOLVER_SIMDAWARE_H
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/premake4.lua
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/premake4.lua
@ -0,0 +1,23 @@
+	
+-- Premake4 script for the DX11 soft body solver library.
+-- findDirectX11() and initDirectX11() are helpers defined outside this script.
+hasDX11 = findDirectX11()
+	
+-- The project is only declared when findDirectX11() reports success.
+if (hasDX11) then
+	
+	project "BulletSoftBodyDX11Solvers"
+		
+  -- Called after 'project' so that whatever it configures applies to this project.
+  initDirectX11()
+	
+	kind "StaticLib"
+	
+	-- Output directory, relative to this script.
+	targetdir "../../../../lib"
+	
+	-- This directory plus the directory three levels up.
+	includedirs {
+		".",
+		"../../.."
+	}
+	-- Every .cpp and .h file below this directory ('**' matches recursively).
+	files {
+		"**.cpp",
+		"**.h"
+	}
+
+end
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/AMD/CMakeLists.txt
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/AMD/CMakeLists.txt
@ -0,0 +1,62 @@
+
+INCLUDE_DIRECTORIES(
+	${BULLET_PHYSICS_SOURCE_DIR}/src
+	${AMD_OPENCL_INCLUDES}
+)
+
+ADD_DEFINITIONS(-DUSE_AMD_OPENCL)
+ADD_DEFINITIONS(-DCL_PLATFORM_AMD)
+
+
+
+SET(BulletSoftBodyOpenCLSolvers_SRCS
+	../btSoftBodySolver_OpenCL.cpp
+	../btSoftBodySolver_OpenCLSIMDAware.cpp
+	../btSoftBodySolverOutputCLtoGL.cpp
+)
+
+SET(BulletSoftBodyOpenCLSolvers_HDRS
+	../btSoftBodySolver_OpenCL.h
+	../btSoftBodySolver_OpenCLSIMDAware.h
+	../../Shared/btSoftBodySolverData.h
+	../btSoftBodySolverVertexData_OpenCL.h
+	../btSoftBodySolverTriangleData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCLSIMDAware.h
+	../btSoftBodySolverBuffer_OpenCL.h
+	../btSoftBodySolverVertexBuffer_OpenGL.h
+	../btSoftBodySolverOutputCLtoGL.h
+)
+
+
+
+
+# Library target for the AMD OpenCL soft body solvers.
+# Headers are listed alongside the sources as part of the target.
+ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_AMD
+	${BulletSoftBodyOpenCLSolvers_SRCS} 
+	${BulletSoftBodyOpenCLSolvers_HDRS} 
+)
+
+# Both the library version and the SO version are taken from BULLET_VERSION.
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES VERSION ${BULLET_VERSION})
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES SOVERSION ${BULLET_VERSION})
+# BulletSoftBody is only linked explicitly for shared-library builds.
+IF (BUILD_SHARED_LIBS)
+	TARGET_LINK_LIBRARIES(BulletSoftBodySolvers_OpenCL_AMD BulletSoftBody)
+ENDIF (BUILD_SHARED_LIBS)
+
+
+# Install rules; skipped entirely when INSTALL_LIBS is off or when
+# INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES is set.
+IF (INSTALL_LIBS)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		# NOTE(review): GREATER compares "major.minor" as a real number, not as a version - confirm this is intended.
+		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+			# Apple shared framework builds install to the prefix root, everything else to lib${LIB_SUFFIX}.
+			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_AMD DESTINATION .)
+			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_AMD DESTINATION lib${LIB_SUFFIX})
+#headers are already installed by BulletMultiThreaded library
+			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+
+		# For Apple shared framework builds, mark the target as a framework and
+		# expose the header list as its public headers.
+		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES FRAMEWORK true)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
+		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+ENDIF (INSTALL_LIBS)
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/AMD/premake4.lua
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/AMD/premake4.lua
@ -0,0 +1,27 @@
+	
+-- Premake4 script for the AMD OpenCL soft body solver library.
+-- findOpenCL_AMD() and initOpenCL_AMD() are helpers defined outside this script.
+hasCL = findOpenCL_AMD()
+	
+-- The project is only declared when findOpenCL_AMD() reports success.
+if (hasCL) then
+	
+	project "BulletSoftBodySolvers_OpenCL_AMD"
+		
+ 	-- Preprocessor symbols defined for this project's sources.
+ 	defines { "USE_AMD_OPENCL","CL_PLATFORM_AMD"}
+
+	initOpenCL_AMD()
+	
+	kind "StaticLib"
+	
+	-- Output directory, relative to this script.
+	targetdir "../../../../../lib"
+	
+	includedirs {
+		".",
+		"../../../..",
+		"../../../../../Glut"
+	}
+	-- The solver sources live one directory up and are listed explicitly.
+	files {
+		"../btSoftBodySolver_OpenCL.cpp",
+		"../btSoftBodySolver_OpenCLSIMDAware.cpp",
+		"../btSoftBodySolverOutputCLtoGL.cpp"
+	}
+
+end
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/Apple/CMakeLists.txt
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/Apple/CMakeLists.txt
@ -0,0 +1,77 @@
+
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src
+)
+
+
+
+
+SET(BulletSoftBodyOpenCLSolvers_SRCS
+	../btSoftBodySolver_OpenCL.cpp
+	../btSoftBodySolver_OpenCLSIMDAware.cpp
+)
+
+# Headers that belong to the Apple OpenCL solver target. This list is added to
+# the library target and is also used as its PUBLIC_HEADER set for framework builds.
+# The SIMD-aware headers are listed because btSoftBodySolver_OpenCLSIMDAware.cpp
+# is part of this target's sources, matching the AMD and Intel header lists.
+SET(BulletSoftBodyOpenCLSolvers_HDRS
+	../btSoftBodySolver_OpenCL.h
+	../btSoftBodySolver_OpenCLSIMDAware.h
+	../../Shared/btSoftBodySolverData.h
+	../btSoftBodySolverVertexData_OpenCL.h
+	../btSoftBodySolverTriangleData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCLSIMDAware.h
+	../btSoftBodySolverBuffer_OpenCL.h
+)
+
+# OpenCL and HLSL Shaders.
+# Build rules generated to stringify these into headers
+# which are needed by some of the sources
+SET(BulletSoftBodyOpenCLSolvers_Shaders
+#	OutputToVertexArray
+	UpdateNormals
+	Integrate
+	UpdatePositions
+	UpdateNodes
+	SolvePositions
+	UpdatePositionsFromVelocities
+	ApplyForces
+	PrepareLinks
+	VSolveLinks
+)
+
+foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
+    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC10/${f}.cl")
+endforeach(f) 
+
+
+
+ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_Apple
+	${BulletSoftBodyOpenCLSolvers_SRCS} 
+	${BulletSoftBodyOpenCLSolvers_HDRS} 
+	${BulletSoftBodyOpenCLSolvers_OpenCLC}
+)
+
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Apple PROPERTIES VERSION ${BULLET_VERSION})
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Apple PROPERTIES SOVERSION ${BULLET_VERSION})
+# Shared-library builds link BulletSoftBody explicitly; on Apple they also
+# pass "-framework OpenCL" to the linker.
+IF (BUILD_SHARED_LIBS)
+	TARGET_LINK_LIBRARIES(BulletSoftBodySolvers_OpenCL_Apple BulletSoftBody)
+	# BUILD_SHARED_LIBS is already known to be true here, so only APPLE needs testing.
+	IF (APPLE)
+		SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Apple PROPERTIES LINK_FLAGS "-framework OpenCL")
+	ENDIF ()
+ENDIF ()
+
+
+IF (INSTALL_LIBS)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Apple  DESTINATION .)
+			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Apple  DESTINATION lib${LIB_SUFFIX})
+#headers are already installed by BulletMultiThreaded library
+			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+
+		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Apple PROPERTIES FRAMEWORK true)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Apple PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
+		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+ENDIF (INSTALL_LIBS)
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/CMakeLists.txt
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/CMakeLists.txt
@ -0,0 +1,17 @@
+# Select which OpenCL soft body solver variants are built.
+# ADD_SUBDIRECTORY is used instead of the deprecated SUBDIRS command.
+
+# The MiniCL variant is always built.
+ADD_SUBDIRECTORY(MiniCL)
+
+# Vendor variants are built only when the matching option is enabled.
+IF(BUILD_INTEL_OPENCL_DEMOS)
+	ADD_SUBDIRECTORY(Intel)
+ENDIF()
+
+IF(BUILD_AMD_OPENCL_DEMOS)
+	ADD_SUBDIRECTORY(AMD)
+ENDIF()
+
+IF(BUILD_NVIDIA_OPENCL_DEMOS)
+	ADD_SUBDIRECTORY(NVidia)
+ENDIF()
+
+# The Apple variant needs both an Apple platform and OPENCL_LIBRARY to be set.
+IF(APPLE AND OPENCL_LIBRARY)
+	ADD_SUBDIRECTORY(Apple)
+ENDIF()
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/Intel/CMakeLists.txt
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/Intel/CMakeLists.txt
@ -0,0 +1,82 @@
+
+INCLUDE_DIRECTORIES(
+	${BULLET_PHYSICS_SOURCE_DIR}/src
+	${INTEL_OPENCL_INCLUDES}
+)
+
+ADD_DEFINITIONS(-DUSE_INTEL_OPENCL)
+ADD_DEFINITIONS(-DCL_PLATFORM_INTEL)
+
+
+
+SET(BulletSoftBodyOpenCLSolvers_SRCS
+	../btSoftBodySolver_OpenCL.cpp
+	../btSoftBodySolver_OpenCLSIMDAware.cpp
+	../btSoftBodySolverOutputCLtoGL.cpp
+)
+
+SET(BulletSoftBodyOpenCLSolvers_HDRS
+	../btSoftBodySolver_OpenCL.h
+	../btSoftBodySolver_OpenCLSIMDAware.h
+	../../Shared/btSoftBodySolverData.h
+	../btSoftBodySolverVertexData_OpenCL.h
+	../btSoftBodySolverTriangleData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCLSIMDAware.h
+	../btSoftBodySolverBuffer_OpenCL.h
+	../btSoftBodySolverVertexBuffer_OpenGL.h
+	../btSoftBodySolverOutputCLtoGL.h
+)
+
+# OpenCL kernel sources, named without directory or extension.
+# NOTE(review): the previous comment mentioned generated build rules that stringify
+# these into headers; no such rule is defined in this file. The names are only
+# turned into ../OpenCLC10/<name>.cl paths below and added to the library target.
+SET(BulletSoftBodyOpenCLSolvers_Shaders
+#	OutputToVertexArray
+	UpdateNormals
+	Integrate
+	UpdatePositions
+	UpdateNodes
+	SolvePositions
+	UpdatePositionsFromVelocities
+	ApplyForces
+	PrepareLinks
+	VSolveLinks
+)
+
+# Map every kernel name to its .cl file and collect the paths in BulletSoftBodyOpenCLSolvers_OpenCLC.
+foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
+    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC10/${f}.cl")
+endforeach(f) 
+
+
+
+ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_Intel
+	${BulletSoftBodyOpenCLSolvers_SRCS} 
+	${BulletSoftBodyOpenCLSolvers_HDRS} 
+	${BulletSoftBodyOpenCLSolvers_OpenCLC}
+)
+
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Intel PROPERTIES VERSION ${BULLET_VERSION})
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Intel PROPERTIES SOVERSION ${BULLET_VERSION})
+IF (BUILD_SHARED_LIBS)
+	TARGET_LINK_LIBRARIES(BulletSoftBodySolvers_OpenCL_Intel BulletSoftBody)
+ENDIF (BUILD_SHARED_LIBS)
+
+
+IF (INSTALL_LIBS)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Intel DESTINATION .)
+			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Intel DESTINATION lib${LIB_SUFFIX})
+#headers are already installed by BulletMultiThreaded library
+			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+
+		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Intel PROPERTIES FRAMEWORK true)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Intel PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
+		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+ENDIF (INSTALL_LIBS)
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/Intel/premake4.lua
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/Intel/premake4.lua
@ -0,0 +1,27 @@
+	
+-- Premake4 script for the Intel OpenCL soft body solver library.
+-- findOpenCL_Intel() and initOpenCL_Intel() are helpers defined outside this script.
+hasCL = findOpenCL_Intel()
+	
+-- The project is only declared when findOpenCL_Intel() reports success.
+if (hasCL) then
+	
+	project "BulletSoftBodySolvers_OpenCL_Intel"
+		
+ 	-- Preprocessor symbols defined for this project's sources.
+ 	defines { "USE_INTEL_OPENCL","CL_PLATFORM_INTEL"}
+
+	initOpenCL_Intel()
+	
+	kind "StaticLib"
+	
+	-- Output directory, relative to this script.
+	targetdir "../../../../../lib"
+	
+	includedirs {
+		".",
+		"../../../..",
+		"../../../../../Glut"
+	}
+	-- The solver sources live one directory up and are listed explicitly.
+	files {
+		"../btSoftBodySolver_OpenCL.cpp",
+		"../btSoftBodySolver_OpenCLSIMDAware.cpp",
+		"../btSoftBodySolverOutputCLtoGL.cpp"
+	}
+
+end
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/CMakeLists.txt
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/CMakeLists.txt
@ -0,0 +1,75 @@
+
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src
+)
+
+ADD_DEFINITIONS(-DUSE_MINICL)
+
+
+
+
+SET(BulletSoftBodyOpenCLSolvers_SRCS
+	../btSoftBodySolver_OpenCL.cpp
+)
+
+SET(BulletSoftBodyOpenCLSolvers_HDRS
+	../btSoftBodySolver_OpenCL.h
+	../../Shared/btSoftBodySolverData.h
+	../btSoftBodySolverVertexData_OpenCL.h
+	../btSoftBodySolverTriangleData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCL.h
+	../btSoftBodySolverBuffer_OpenCL.h
+)
+
+# OpenCL and HLSL Shaders.
+# Build rules generated to stringify these into headers
+# which are needed by some of the sources
+SET(BulletSoftBodyOpenCLSolvers_Shaders
+#	OutputToVertexArray
+	UpdateNormals
+	Integrate
+	UpdatePositions
+	UpdateNodes
+	SolvePositions
+	UpdatePositionsFromVelocities
+	ApplyForces
+	PrepareLinks
+	VSolveLinks
+)
+
+foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
+    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC10/${f}.cl")
+endforeach(f) 
+
+
+
+ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_Mini
+	${BulletSoftBodyOpenCLSolvers_SRCS} 
+	${BulletSoftBodyOpenCLSolvers_HDRS} 
+	${BulletSoftBodyOpenCLSolvers_OpenCLC}
+)
+
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Mini PROPERTIES VERSION ${BULLET_VERSION})
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Mini PROPERTIES SOVERSION ${BULLET_VERSION})
+IF (BUILD_SHARED_LIBS)
+	TARGET_LINK_LIBRARIES(BulletSoftBodySolvers_OpenCL_Mini MiniCL BulletMultiThreaded BulletSoftBody)
+ENDIF (BUILD_SHARED_LIBS)
+
+
+IF (INSTALL_LIBS)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Mini DESTINATION .)
+			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Mini DESTINATION lib${LIB_SUFFIX})
+#headers are already installed by BulletMultiThreaded library
+			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+
+		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Mini PROPERTIES FRAMEWORK true)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Mini PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
+		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+ENDIF (INSTALL_LIBS)
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp
@ -0,0 +1,249 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include <MiniCL/cl_MiniCL_Defs.h>
+
+#define MSTRINGIFY(A) A
+#include "../OpenCLC10/ApplyForces.cl"
+#include "../OpenCLC10/Integrate.cl"
+#include "../OpenCLC10/PrepareLinks.cl"
+#include "../OpenCLC10/SolvePositions.cl"
+#include "../OpenCLC10/UpdateNodes.cl"
+#include "../OpenCLC10/UpdateNormals.cl"
+#include "../OpenCLC10/UpdatePositions.cl"
+#include "../OpenCLC10/UpdatePositionsFromVelocities.cl"
+#include "../OpenCLC10/VSolveLinks.cl"
+#include "../OpenCLC10/UpdateFixedVertexPositions.cl"
+//#include "../OpenCLC10/SolveCollisionsAndUpdateVelocities.cl"
+
+
+MINICL_REGISTER(PrepareLinksKernel)
+MINICL_REGISTER(VSolveLinksKernel)
+MINICL_REGISTER(UpdatePositionsFromVelocitiesKernel)
+MINICL_REGISTER(SolvePositionsFromLinksKernel)
+MINICL_REGISTER(updateVelocitiesFromPositionsWithVelocitiesKernel)
+MINICL_REGISTER(updateVelocitiesFromPositionsWithoutVelocitiesKernel)
+MINICL_REGISTER(IntegrateKernel)
+MINICL_REGISTER(ApplyForcesKernel)
+MINICL_REGISTER(ResetNormalsAndAreasKernel)
+MINICL_REGISTER(NormalizeNormalsAndAreasKernel)
+MINICL_REGISTER(UpdateSoftBodiesKernel)
+MINICL_REGISTER(UpdateFixedVertexPositions)
+
+// Three-component dot product of two float4 values: only x, y and z take part,
+// the w components are ignored.
+float mydot3a(float4 a, float4 b)
+{
+   return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+
+typedef struct 
+{
+	int firstObject;
+	int endObject;
+} CollisionObjectIndices;
+
+typedef struct 
+{
+	float4 shapeTransform[4]; // column major 4x4 matrix
+	float4 linearVelocity;
+	float4 angularVelocity;
+
+	int softBodyIdentifier;
+	int collisionShapeType;
+	
+
+	// Shape information
+	// Compressed from the union
+	float radius;
+	float halfHeight;
+	int upAxis;
+		
+	float margin;
+	float friction;
+
+	int padding0;
+	
+} CollisionShapeDescription;
+
+// From btBroadphaseProxy.h
+__constant int CAPSULE_SHAPE_PROXYTYPE = 10;
+
+// Transform a vector by a 4x4 matrix given as four column vectors:
+// component i of the result is the dot product of matrix row i with the vector.
+float4 matrixVectorMul( float4 matrix[4], float4 vector )
+{
+	float4 product;
+
+	// Each row is gathered from the matching component of the four columns.
+	float4 rowX = float4(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x);
+	product.x = dot(rowX, vector);
+
+	float4 rowY = float4(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y);
+	product.y = dot(rowY, vector);
+
+	float4 rowZ = float4(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z);
+	product.z = dot(rowZ, vector);
+
+	float4 rowW = float4(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w);
+	product.w = dot(rowW, vector);
+
+	return product;
+}
+
+// Collision response and velocity update for soft body vertices (MiniCL build).
+//
+// For every node with index < numNodes that belongs to a valid cloth (identifier >= 0):
+//  - the velocity is derived from (position - previousPosition), scaled by
+//    (1 - damping factor of the cloth) and by isolverdt;
+//  - every collision object in [firstObject, endObject) of the cloth that is a capsule
+//    is tested against the vertex; a vertex closer to the capsule axis segment than
+//    radius + margin is pushed out along the contact normal by 0.9 of the penetration,
+//    its velocity is recomputed and, when the vertex is not separating from the
+//    surface, a friction term is subtracted from the force on the vertex;
+//  - velocity, force and position are written back with w set to 0.
+// Collision objects of any other shape type are ignored.
+__kernel void 
+SolveCollisionsAndUpdateVelocitiesKernel( 
+	const int numNodes,
+	const float isolverdt,
+	__global int *g_vertexClothIdentifier,
+	__global float4 *g_vertexPreviousPositions,
+	__global float * g_perClothFriction,
+	__global float * g_clothDampingFactor,
+	__global CollisionObjectIndices * g_perClothCollisionObjectIndices,
+	__global CollisionShapeDescription * g_collisionObjectDetails,
+	__global float4 * g_vertexForces,
+	__global float4 *g_vertexVelocities,
+	__global float4 *g_vertexPositions GUID_ARG)
+{
+	int nodeID = get_global_id(0);
+	float4 forceOnVertex = (float4)(0.f, 0.f, 0.f, 0.f);
+	
+	if( get_global_id(0) < numNodes )
+	{	
+		int clothIdentifier = g_vertexClothIdentifier[nodeID];
+		
+		// Abort if this is not a valid cloth
+		if( clothIdentifier < 0 )
+			return;
+
+
+		float4 position (g_vertexPositions[nodeID].xyz, 1.f);
+		float4 previousPosition (g_vertexPreviousPositions[nodeID].xyz, 1.f);
+			
+		float clothFriction = g_perClothFriction[clothIdentifier];
+		float dampingFactor = g_clothDampingFactor[clothIdentifier];
+		float velocityCoefficient = (1.f - dampingFactor);		
+		float4 difference = position - previousPosition;
+		float4 velocity = difference*velocityCoefficient*isolverdt;
+		
+		CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
+	
+		int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
+		
+		if( numObjects > 0 )
+		{
+			// We have some possible collisions to deal with
+			for( int collision = collisionObjectIndices.firstObject; collision < collisionObjectIndices.endObject; ++collision )
+			{
+				CollisionShapeDescription shapeDescription = g_collisionObjectDetails[collision];
+				float colliderFriction = shapeDescription.friction;
+
+				if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
+				{
+					// Colliding with a capsule
+
+					float capsuleHalfHeight = shapeDescription.halfHeight;
+					float capsuleRadius = shapeDescription.radius;
+					float capsuleMargin = shapeDescription.margin;
+					int capsuleupAxis = shapeDescription.upAxis;
+
+					// Four columns of worldTransform matrix
+					float4 worldTransform[4];
+					worldTransform[0] = shapeDescription.shapeTransform[0];
+					worldTransform[1] = shapeDescription.shapeTransform[1];
+					worldTransform[2] = shapeDescription.shapeTransform[2];
+					worldTransform[3] = shapeDescription.shapeTransform[3];
+
+					// Correctly define capsule centerline vector 
+					float4 c1 (0.f, 0.f, 0.f, 1.f); 
+					float4 c2 (0.f, 0.f, 0.f, 1.f);
+					c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 );
+					c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 );
+					c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 );
+					c2.x = -c1.x;
+					c2.y = -c1.y;
+					c2.z = -c1.z;
+
+
+					float4 worldC1 = matrixVectorMul(worldTransform, c1);
+					float4 worldC2 = matrixVectorMul(worldTransform, c2);
+					float4 segment = (worldC2 - worldC1);
+
+					// compute distance of tangent to vertex along line segment in capsule
+					float distanceAlongSegment = -( mydot3a( (worldC1 - position), segment ) / mydot3a(segment, segment) );
+
+					float4 closestPoint = (worldC1 + (segment * distanceAlongSegment));
+					float distanceFromLine = length(position - closestPoint);
+					float distanceFromC1 = length(worldC1 - position);
+					float distanceFromC2 = length(worldC2 - position);
+					
+					// Final distance from collision, point to push from, direction to push in
+					// for impulse force
+					float dist;
+					float4 normalVector;
+					if( distanceAlongSegment < 0 )
+					{
+						dist = distanceFromC1;
+						normalVector = float4(normalize(position - worldC1).xyz, 0.f);
+					} else if( distanceAlongSegment > 1.f ) {
+						dist = distanceFromC2;
+						normalVector = float4(normalize(position - worldC2).xyz, 0.f);	
+					} else {
+						dist = distanceFromLine;
+						normalVector = float4(normalize(position - closestPoint).xyz, 0.f);
+					}
+						
+					float4 colliderLinearVelocity = shapeDescription.linearVelocity;
+					float4 colliderAngularVelocity = shapeDescription.angularVelocity;
+					// The transform is stored as four columns (see matrixVectorMul), so the
+					// collider origin is the translation column worldTransform[3], not the
+					// w components of the first three columns.
+					float4 colliderOrigin = float4(worldTransform[3].x, worldTransform[3].y, worldTransform[3].z, 0.f);
+					float4 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position - colliderOrigin);
+
+					float minDistance = capsuleRadius + capsuleMargin;
+					
+					// In case of no collision, this is the value of velocity
+					velocity = (position - previousPosition) * velocityCoefficient * isolverdt;
+					
+					
+					// Check for a collision
+					if( dist < minDistance )
+					{
+						// Project back to surface along normal
+						position = position + float4(normalVector*(minDistance - dist)*0.9f);
+						velocity = (position - previousPosition) * velocityCoefficient * isolverdt;
+						float4 relativeVelocity = velocity - velocityOfSurfacePoint;
+
+						// p1 and p2 span the plane perpendicular to the contact normal
+						float4 p1 = normalize(cross(normalVector, segment));
+						float4 p2 = normalize(cross(p1, normalVector));
+						// Full friction is sum of velocities in each direction of plane
+						float4 frictionVector = p1*mydot3a(relativeVelocity, p1) + p2*mydot3a(relativeVelocity, p2);
+
+						// Real friction is peak friction corrected by friction coefficients
+						frictionVector = frictionVector * (colliderFriction*clothFriction);
+
+						float approachSpeed = dot(relativeVelocity, normalVector);
+
+						// Friction is only applied while the vertex is not moving away from the surface
+						if( approachSpeed <= 0.0f )
+							forceOnVertex -= frictionVector;
+					}
+				}
+			}
+		}
+
+		g_vertexVelocities[nodeID] = float4(velocity.xyz, 0.f);	
+
+		// Update external force
+		g_vertexForces[nodeID] = float4(forceOnVertex.xyz, 0.f);
+
+		g_vertexPositions[nodeID] = float4(position.xyz, 0.f);
+	}
+}
+
+
+MINICL_REGISTER(SolveCollisionsAndUpdateVelocitiesKernel);
+
+
+
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia/CMakeLists.txt
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia/CMakeLists.txt
@ -0,0 +1,81 @@
+
+# Build script for the NVIDIA flavour of the OpenCL soft body solver library.
+
+# Platform selection symbols seen by every source compiled in this directory.
+ADD_DEFINITIONS(-DUSE_NVIDIA_OPENCL -DCL_PLATFORM_NVIDIA)
+
+INCLUDE_DIRECTORIES(
+	${BULLET_PHYSICS_SOURCE_DIR}/src
+	${NVIDIA_OPENCL_INCLUDES}
+)
+
+# The solver implementation files live one directory up.
+SET(BulletSoftBodyOpenCLSolvers_SRCS
+	../btSoftBodySolver_OpenCL.cpp
+	../btSoftBodySolver_OpenCLSIMDAware.cpp
+	../btSoftBodySolverOutputCLtoGL.cpp
+)
+
+# Headers listed on the target; they also become the PUBLIC_HEADER set of
+# the framework build configured at the end of this file.
+SET(BulletSoftBodyOpenCLSolvers_HDRS
+	../btSoftBodySolver_OpenCL.h
+	../../Shared/btSoftBodySolverData.h
+	../btSoftBodySolverVertexData_OpenCL.h
+	../btSoftBodySolverTriangleData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCLSIMDAware.h
+	../btSoftBodySolverBuffer_OpenCL.h
+	../btSoftBodySolverVertexBuffer_OpenGL.h
+	../btSoftBodySolverOutputCLtoGL.h
+)
+
+# OpenCL kernel sources, named without directory or extension.
+# No stringify rule is defined in this file; the .cl files are only listed
+# on the library target below.
+SET(BulletSoftBodyOpenCLSolvers_Shaders
+#	OutputToVertexArray
+	UpdateNormals
+	Integrate
+	UpdatePositions
+	UpdateNodes
+	SolvePositions
+	UpdatePositionsFromVelocities
+	ApplyForces
+	PrepareLinks
+	VSolveLinks
+)
+
+# Expand every kernel name to its path inside the OpenCLC10 directory.
+FOREACH(shader ${BulletSoftBodyOpenCLSolvers_Shaders})
+	LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC10/${shader}.cl")
+ENDFOREACH(shader)
+
+ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_NVidia
+	${BulletSoftBodyOpenCLSolvers_SRCS}
+	${BulletSoftBodyOpenCLSolvers_HDRS}
+	${BulletSoftBodyOpenCLSolvers_OpenCLC}
+)
+
+# Both the library version and the ABI version follow the Bullet version.
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_NVidia PROPERTIES
+	VERSION ${BULLET_VERSION}
+	SOVERSION ${BULLET_VERSION}
+)
+
+# Link dependencies are only declared for shared library builds.
+IF (BUILD_SHARED_LIBS)
+	TARGET_LINK_LIBRARIES(BulletSoftBodySolvers_OpenCL_NVidia BulletSoftBody BulletDynamics)
+ENDIF (BUILD_SHARED_LIBS)
+
+# Installation. The INSTALL commands are issued before the FRAMEWORK
+# property is set, exactly as before; do not reorder these two steps.
+IF (INSTALL_LIBS AND NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+	IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_NVidia DESTINATION .)
+		ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			# Headers are already installed by the BulletMultiThreaded library.
+			INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_NVidia DESTINATION lib${LIB_SUFFIX})
+		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+	ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+
+	IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_NVidia PROPERTIES
+			FRAMEWORK true
+			PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}"
+		)
+	ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+ENDIF (INSTALL_LIBS AND NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia/premake4.lua
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia/premake4.lua
@ -0,0 +1,27 @@
+	
+-- premake4 project for the NVIDIA OpenCL soft body solver library.
+-- The project is only defined when findOpenCL_NVIDIA() reports success.
+hasCL = findOpenCL_NVIDIA()
+
+if hasCL then
+
+	project "BulletSoftBodySolvers_OpenCL_NVIDIA"
+
+	-- Same platform selection symbols as the CMake build of this directory.
+	defines {
+		"USE_NVIDIA_OPENCL",
+		"CL_PLATFORM_NVIDIA"
+	}
+
+	-- Defined outside this script; presumably adds the SDK paths and libraries (confirm).
+	initOpenCL_NVIDIA()
+
+	kind "StaticLib"
+
+	targetdir "../../../../../lib"
+
+	includedirs {
+		".",
+		"../../../..",
+		"../../../../../Glut"
+	}
+
+	-- Solver sources located one directory above this script.
+	files {
+		"../btSoftBodySolver_OpenCL.cpp",
+		"../btSoftBodySolver_OpenCLSIMDAware.cpp",
+		"../btSoftBodySolverOutputCLtoGL.cpp"
+	}
+
+end
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ApplyForces.cl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ApplyForces.cl
@ -0,0 +1,102 @@
+MSTRINGIFY(
+
+
+// Three-component dot product: only the x, y and z lanes contribute and
+// the w lanes of both operands are ignored.
+float adot3(float4 a, float4 b)
+{
+	return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+// Length of the xyz part of a; the w lane is replaced by zero before
+// the four-component length is taken.
+float alength3(float4 a)
+{
+	return length((float4)(a.xyz, 0.f));
+}
+
+// Normalizes the xyz part of a; the w lane is replaced by zero first and
+// is therefore zero in the result.
+float4 anormalize3(float4 a)
+{
+	return normalize((float4)(a.xyz, 0.f));
+}
+
+// Scales the axis a by the three-component dot product of v and a.
+// This is the projection of v onto a when a has unit length.
+float4 projectOnAxis( float4 v, float4 a )
+{
+	float scale = adot3(v, a);
+	return a * scale;
+}
+
+// One work-item per node. Adds the per-cloth acceleration to the node
+// velocity and accumulates aerodynamic drag and lift into the node force
+// accumulator. Nodes whose inverse mass is not strictly positive are skipped.
+__kernel void 
+ApplyForcesKernel(
+	const uint numNodes,
+	const float solverdt,
+	const float epsilon,
+	__global int * g_vertexClothIdentifier,
+	__global float4 * g_vertexNormal,
+	__global float * g_vertexArea,
+	__global float * g_vertexInverseMass,
+	__global float * g_clothLiftFactor,
+	__global float * g_clothDragFactor,
+	__global float4 * g_clothWindVelocity,
+	__global float4 * g_clothAcceleration,
+	__global float * g_clothMediumDensity,
+	__global float4 * g_vertexForceAccumulator,
+	__global float4 * g_vertexVelocity GUID_ARG)
+{
+	const unsigned int nodeID = get_global_id(0);
+
+	// Work-items beyond the node count have nothing to process.
+	if( nodeID >= numNodes )
+		return;
+
+	const int clothId = g_vertexClothIdentifier[nodeID];
+
+	// Only nodes with a strictly positive inverse mass are updated.
+	if( !(g_vertexInverseMass[nodeID] > 0.0f) )
+		return;
+
+	// Per-node state
+	float4 nodeV        = g_vertexVelocity[nodeID];
+	const float4 normal = g_vertexNormal[nodeID];
+	const float area    = g_vertexArea[nodeID];
+	float4 nodeF        = g_vertexForceAccumulator[nodeID];
+
+	// Per-cloth parameters
+	const float4 clothAcceleration = g_clothAcceleration[clothId];
+	const float4 clothWindVelocity = g_clothWindVelocity[clothId];
+	const float liftFactor         = g_clothLiftFactor[clothId];
+	const float dragFactor         = g_clothDragFactor[clothId];
+	const float mediumDensity      = g_clothMediumDensity[clothId];
+
+	// The acceleration goes straight into the velocity instead of being
+	// routed through the force accumulator.
+	nodeV += (clothAcceleration*solverdt);
+	g_vertexVelocity[nodeID] = nodeV;
+
+	// Aerodynamics work on the node velocity relative to the wind.
+	const float4 rel_v    = nodeV - clothWindVelocity;
+	const float rel_v_len = alength3(rel_v);
+	const float rel_v2    = dot(rel_v, rel_v);
+
+	// No aerodynamic force unless the squared relative speed exceeds epsilon.
+	if( !(rel_v2 > epsilon) )
+		return;
+
+	const float4 rel_v_nrm = anormalize3(rel_v);
+
+	// Flip the normal when it points against the relative velocity.
+	const float facing  = (dot(normal, rel_v) < 0) ? -1.f : 1.f;
+	const float4 nrm    = normal * facing;
+	const float n_dot_v = dot(nrm, rel_v_nrm);
+
+	float4 fDrag = (float4)(0.f, 0.f, 0.f, 0.f);
+	float4 fLift = (float4)(0.f, 0.f, 0.f, 0.f);
+
+	// Drag acts against the direction of the relative velocity.
+	if( dragFactor > 0.f )
+	{
+		const float dragScale = 0.5f * dragFactor * mediumDensity * rel_v2 * area * n_dot_v * (-1.0f);
+		fDrag = dragScale * rel_v_nrm;
+	}
+
+	// Lift is applied only for angles of attack between the two limits;
+	// 0.98480 is cos(10 degrees).
+	if( 0 < n_dot_v && n_dot_v < 0.98480f )
+	{
+		const float liftScale = 0.5f * liftFactor * mediumDensity * rel_v_len * area * sqrt(1.0f-n_dot_v*n_dot_v);
+		fLift = liftScale * (cross(cross(nrm, rel_v_nrm), rel_v_nrm));
+	}
+
+	nodeF += fDrag + fLift;
+	g_vertexForceAccumulator[nodeID] = nodeF;
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ComputeBounds.cl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ComputeBounds.cl
@ -0,0 +1,82 @@
+MSTRINGIFY(
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n
+
+// Computes per-cloth bounds of the vertex positions. Positions are mapped to
+// unsigned keys that order like the original floats, reduced with atomics in
+// local memory first and then merged into the global arrays. Each cloth owns
+// four consecutive uints in every bounds array, of which three are used.
+__kernel void
+ComputeBoundsKernel( 
+	const int numNodes,
+	const int numSoftBodies,
+	__global int * g_vertexClothIdentifier,
+	__global float4 * g_vertexPositions,
+	/* Unfortunately, to get the atomics below to work these arrays cannot be */
+	/* uint4, though that is the layout of the data */
+	/* Therefore this is little-endian-only code */
+	volatile __global uint * g_clothMinBounds,
+	volatile __global uint * g_clothMaxBounds,
+	volatile __local uint * clothMinBounds,
+	volatile __local uint * clothMaxBounds)
+{
+	const size_t localID   = get_local_id(0);
+	/* Offset of the four uints handled by this work-item */
+	const size_t localBase = localID*4;
+
+	// Reset the local bounds; work-item i resets the entry of cloth i
+	if( localID < numSoftBodies )
+	{
+		clothMinBounds[localBase] = UINT_MAX;
+		clothMinBounds[localBase+1] = UINT_MAX;
+		clothMinBounds[localBase+2] = UINT_MAX;
+		clothMinBounds[localBase+3] = UINT_MAX;
+		clothMaxBounds[localBase] = 0;
+		clothMaxBounds[localBase+1] = 0;
+		clothMaxBounds[localBase+2] = 0;
+		clothMaxBounds[localBase+3] = 0;
+	}
+
+	// Every work-item of the group must reach this barrier
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	const int nodeID = get_global_id(0);
+	if( nodeID < numNodes )
+	{
+		const int clothIdentifier = g_vertexClothIdentifier[nodeID];
+		if( clothIdentifier >= 0 )
+		{
+			const float4 position = g_vertexPositions[nodeID];
+			const int clothBase = clothIdentifier*4;
+
+			/* Reinterpret the coordinates as uint */
+			uint keyX = as_uint(position.x);
+			uint keyY = as_uint(position.y);
+			uint keyZ = as_uint(position.z);
+
+			/* Invert sign bit of positives and whole of negatives to allow comparison as unsigned ints */
+			keyX ^= (1+~(keyX >> 31) | 0x80000000);
+			keyY ^= (1+~(keyY >> 31) | 0x80000000);
+			keyZ ^= (1+~(keyZ >> 31) | 0x80000000);
+
+			// Min/max against the values held in local memory
+			atom_min(&(clothMinBounds[clothBase]), keyX);
+			atom_min(&(clothMinBounds[clothBase+1]), keyY);
+			atom_min(&(clothMinBounds[clothBase+2]), keyZ);
+
+			atom_max(&(clothMaxBounds[clothBase]), keyX);
+			atom_max(&(clothMaxBounds[clothBase+1]), keyY);
+			atom_max(&(clothMaxBounds[clothBase+2]), keyZ);
+		}
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Use global atomics to merge the local results into the global bounds */
+	if( localID < numSoftBodies )
+	{
+		atom_min(&(g_clothMinBounds[localBase]), clothMinBounds[localBase]);
+		atom_min(&(g_clothMinBounds[localBase+1]), clothMinBounds[localBase+1]);
+		atom_min(&(g_clothMinBounds[localBase+2]), clothMinBounds[localBase+2]);
+
+		atom_max(&(g_clothMaxBounds[localBase]), clothMaxBounds[localBase]);
+		atom_max(&(g_clothMaxBounds[localBase+1]), clothMaxBounds[localBase+1]);
+		atom_max(&(g_clothMaxBounds[localBase+2]), clothMaxBounds[localBase+2]);
+	}
+}
+
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/Integrate.cl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/Integrate.cl
@ -0,0 +1,35 @@
+MSTRINGIFY(
+
+// Node indices for each link
+
+
+
+// One work-item per node. Stores the current position as the previous
+// position, applies the accumulated force to the velocity, advances the
+// position with the new velocity and clears the force accumulator.
+__kernel void
+IntegrateKernel( 
+	const int numNodes,
+	const float solverdt,
+	__global float * g_vertexInverseMasses,
+	__global float4 * g_vertexPositions,
+	__global float4 * g_vertexVelocity,
+	__global float4 * g_vertexPreviousPositions,
+	__global float4 * g_vertexForceAccumulator GUID_ARG)
+{
+	const int nodeID = get_global_id(0);
+
+	// Work-items beyond the node count have nothing to integrate.
+	if( nodeID >= numNodes )
+		return;
+
+	const float4 oldPosition = g_vertexPositions[nodeID];
+	const float4 oldVelocity = g_vertexVelocity[nodeID];
+	const float4 force       = g_vertexForceAccumulator[nodeID];
+	const float inverseMass  = g_vertexInverseMasses[nodeID];
+
+	// The position step uses the velocity that already includes the force.
+	const float4 newVelocity = oldVelocity + force * inverseMass * solverdt;
+	const float4 newPosition = oldPosition + newVelocity * solverdt;
+
+	g_vertexPreviousPositions[nodeID] = oldPosition;
+	g_vertexForceAccumulator[nodeID]  = (float4)(0.f, 0.f, 0.f, 0.0f);
+	g_vertexPositions[nodeID]         = newPosition;
+	g_vertexVelocity[nodeID]          = newVelocity;
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/OutputToVertexArray.cl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/OutputToVertexArray.cl
@ -0,0 +1,46 @@
+MSTRINGIFY(
+
+// Copies the xyz of position and normal of numNodes nodes, starting at
+// startNode, into a float vertex buffer. Offsets and strides are counted
+// in floats.
+__kernel void 
+OutputToVertexArrayWithNormalsKernel( 
+	const int startNode, const int numNodes, __global float *g_vertexBuffer,
+	const int positionOffset, const int positionStride, const __global float4* g_vertexPositions, 
+	const int normalOffset, const int normalStride, const __global float4* g_vertexNormals  )
+{
+	const int nodeID = get_global_id(0);
+	if( nodeID >= numNodes )
+		return;
+
+	const int sourceIndex = nodeID + startNode;
+	const float4 position = g_vertexPositions[sourceIndex];
+	const float4 normal   = g_vertexNormals[sourceIndex];
+
+	// Stride should account for the float->float4 conversion
+	__global float *positionOut = g_vertexBuffer + (nodeID * positionStride + positionOffset);
+	positionOut[0] = position.x;
+	positionOut[1] = position.y;
+	positionOut[2] = position.z;
+
+	__global float *normalOut = g_vertexBuffer + (nodeID * normalStride + normalOffset);
+	normalOut[0] = normal.x;
+	normalOut[1] = normal.y;
+	normalOut[2] = normal.z;
+}
+
+// Copies the xyz of the position of numNodes nodes, starting at startNode,
+// into a float vertex buffer. Offset and stride are counted in floats.
+__kernel void 
+OutputToVertexArrayWithoutNormalsKernel(
+	const int startNode, const int numNodes, __global float *g_vertexBuffer,
+	const int positionOffset, const int positionStride, const __global float4* g_vertexPositions )
+{
+	const int nodeID = get_global_id(0);
+	if( nodeID >= numNodes )
+		return;
+
+	const float4 position = g_vertexPositions[nodeID + startNode];
+
+	// Stride should account for the float->float4 conversion
+	__global float *positionOut = g_vertexBuffer + (nodeID * positionStride + positionOffset);
+	positionOut[0] = position.x;
+	positionOut[1] = position.y;
+	positionOut[2] = position.z;
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/PrepareLinks.cl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/PrepareLinks.cl
@ -0,0 +1,38 @@
+MSTRINGIFY(
+
+
+
+// One work-item per link. Stores the vector between the previous positions
+// of the two link nodes and the reciprocal of its squared length scaled by
+// the link massLSC value.
+__kernel void 
+PrepareLinksKernel( 
+	const int numLinks,
+	__global int2 * g_linksVertexIndices,
+	__global float * g_linksMassLSC,
+	__global float4 * g_nodesPreviousPosition,
+	__global float * g_linksLengthRatio,
+	__global float4 * g_linksCurrentLength GUID_ARG)
+{
+	const int linkID = get_global_id(0);
+	if( linkID >= numLinks )
+		return;
+
+	const int2 nodeIndices = g_linksVertexIndices[linkID];
+	const float4 previousPosition0 = g_nodesPreviousPosition[nodeIndices.x];
+	const float4 previousPosition1 = g_nodesPreviousPosition[nodeIndices.y];
+	const float massLSC = g_linksMassLSC[linkID];
+
+	// Only the xyz part takes part in the squared length below.
+	float4 linkCurrentLength = previousPosition1 - previousPosition0;
+	linkCurrentLength.w = 0.f;
+
+	const float scaledLengthSquared = dot(linkCurrentLength, linkCurrentLength)*massLSC;
+
+	g_linksCurrentLength[linkID] = linkCurrentLength;
+	g_linksLengthRatio[linkID]   = 1.0f/scaledLengthSquared;
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolveCollisionsAndUpdateVelocities.cl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolveCollisionsAndUpdateVelocities.cl
@ -0,0 +1,204 @@
+MSTRINGIFY(
+
+
+
+// Dot product restricted to the x, y and z lanes; the w lanes of both
+// arguments do not influence the result.
+float mydot3a(float4 a, float4 b)
+{
+	return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+
+typedef struct // Range of collision objects belonging to one cloth.
+{
+	int firstObject; // index of the first entry of g_collisionObjectDetails visited for the cloth
+	int endObject; // one past the last entry; the kernel loops while collision < endObject
+} CollisionObjectIndices;
+
+typedef struct // One collision object as read from g_collisionObjectDetails.
+{
+	float4 shapeTransform[4]; // column major 4x4 matrix
+	float4 linearVelocity; // combined with angularVelocity into the velocity of the contact point
+	float4 angularVelocity;
+
+	int softBodyIdentifier; // not read by the kernel in this file
+	int collisionShapeType; // compared against CAPSULE_SHAPE_PROXYTYPE; other values are skipped
+	
+
+	// Shape information
+	// Compressed from the union
+	float radius; // radius + margin is the minimum allowed vertex distance
+	float halfHeight; // half length of the capsule centre line along upAxis
+	int upAxis; // 0, 1 or 2 selects x, y or z as the local capsule axis
+		
+	float margin;
+	float friction; // multiplied with the per-cloth friction value
+
+	int padding0; // presumably pads the struct to match the host-side layout - TODO confirm
+	
+} CollisionShapeDescription;
+
+// From btBroadphaseProxy.h
+__constant int CAPSULE_SHAPE_PROXYTYPE = 10; // the only shape type handled by the kernel in this file
+
+// Multiplies a 4x4 matrix, stored as four float4 columns, with a vector.
+// Component i of the result is the dot product of row i with the vector;
+// row i is gathered from component i of every column.
+float4 matrixVectorMul( float4 matrix[4], float4 vector )
+{
+	return (float4)(
+		dot((float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x), vector),
+		dot((float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y), vector),
+		dot((float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z), vector),
+		dot((float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w), vector));
+}
+
+// One work-item per node. Derives the node velocity from the change in
+// position, pushes the node out of every capsule registered for its cloth,
+// accumulates a friction force for contacts that are not separating and
+// writes velocity, force and position back with w set to zero.
+//
+//  numNodes       number of nodes; work-items beyond it do nothing
+//  isolverdt      factor turning a position difference into a velocity
+//                 (presumably the inverse solver time step - confirm)
+//  g_perCloth...  per-cloth data indexed by the cloth identifier of the node
+//
+// Nodes with a negative cloth identifier are left untouched.
+// NOTE(review): the distance computations use the four-component length()
+// on vectors whose w may be non-zero depending on the transform layout;
+// the SIMD batched variant uses three-component helpers instead - confirm.
+__kernel void 
+SolveCollisionsAndUpdateVelocitiesKernel( 
+	const int numNodes,
+	const float isolverdt,
+	__global int *g_vertexClothIdentifier,
+	__global float4 *g_vertexPreviousPositions,
+	__global float * g_perClothFriction,
+	__global float * g_clothDampingFactor,
+	__global CollisionObjectIndices * g_perClothCollisionObjectIndices,
+	__global CollisionShapeDescription * g_collisionObjectDetails,
+	__global float4 * g_vertexForces,
+	__global float4 *g_vertexVelocities,
+	__global float4 *g_vertexPositions GUID_ARG)
+{
+	int nodeID = get_global_id(0);
+	float4 forceOnVertex = (float4)(0.f, 0.f, 0.f, 0.f);
+	
+	if( get_global_id(0) < numNodes )
+	{	
+		int clothIdentifier = g_vertexClothIdentifier[nodeID];
+		
+		// Abort if this is not a valid cloth
+		if( clothIdentifier < 0 )
+			return;
+
+		float4 position = (float4)(g_vertexPositions[nodeID].xyz, 1.f);
+		float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 1.f);
+			
+		float clothFriction = g_perClothFriction[clothIdentifier];
+		float dampingFactor = g_clothDampingFactor[clothIdentifier];
+		float velocityCoefficient = (1.f - dampingFactor);
+
+		// Velocity in case of no collision. It is recomputed below whenever
+		// a collision moves the position, so it always matches the position.
+		float4 velocity = (position - previousPosition) * velocityCoefficient * isolverdt;
+		
+		CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
+	
+		// The loop body never runs when the cloth has no collision objects
+		for( int collision = collisionObjectIndices.firstObject; collision < collisionObjectIndices.endObject; ++collision )
+		{
+			CollisionShapeDescription shapeDescription = g_collisionObjectDetails[collision];
+			float colliderFriction = shapeDescription.friction;
+
+			// Only capsules are handled
+			if( shapeDescription.collisionShapeType != CAPSULE_SHAPE_PROXYTYPE )
+				continue;
+
+			float capsuleHalfHeight = shapeDescription.halfHeight;
+			float capsuleRadius = shapeDescription.radius;
+			float capsuleMargin = shapeDescription.margin;
+			int capsuleupAxis = shapeDescription.upAxis;
+
+			// A zero length centre line would make the projection below divide
+			// by zero and the friction basis normalize a zero vector, so the
+			// capsule would never collide. Clamp it like the SIMD batched
+			// variant of this kernel does.
+			if( capsuleHalfHeight <= 0 )
+				capsuleHalfHeight = 0.0001f;
+
+			// Four columns of worldTransform matrix
+			float4 worldTransform[4];
+			worldTransform[0] = shapeDescription.shapeTransform[0];
+			worldTransform[1] = shapeDescription.shapeTransform[1];
+			worldTransform[2] = shapeDescription.shapeTransform[2];
+			worldTransform[3] = shapeDescription.shapeTransform[3];
+
+			// End points of the capsule centre line in local space: the half
+			// height is placed on the up axis and mirrored for the second point
+			float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f); 
+			float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f);
+			c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 );
+			c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 );
+			c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 );
+			c2.x = -c1.x;
+			c2.y = -c1.y;
+			c2.z = -c1.z;
+
+			float4 worldC1 = matrixVectorMul(worldTransform, c1);
+			float4 worldC2 = matrixVectorMul(worldTransform, c2);
+			float4 segment = (worldC2 - worldC1);
+
+			// Parameter of the point on the centre line closest to the vertex:
+			// 0 at worldC1 and 1 at worldC2
+			float distanceAlongSegment = -( mydot3a( (worldC1 - position), segment ) / mydot3a(segment, segment) );
+
+			float4 closestPoint = (worldC1 + (float4)(segment * distanceAlongSegment));
+			float distanceFromLine = length(position - closestPoint);
+			float distanceFromC1 = length(worldC1 - position);
+			float distanceFromC2 = length(worldC2 - position);
+					
+			// Final distance from collision and the point to push away from:
+			// an end point beyond either end of the segment and the closest
+			// point on the line otherwise
+			float dist;
+			float4 pushOrigin;
+			if( distanceAlongSegment < 0 )
+			{
+				dist = distanceFromC1;
+				pushOrigin = worldC1;
+			} else if( distanceAlongSegment > 1.f ) {
+				dist = distanceFromC2;
+				pushOrigin = worldC2;
+			} else {
+				dist = distanceFromLine;
+				pushOrigin = closestPoint;
+			}
+			float4 normalVector = (float4)(normalize(position - pushOrigin).xyz, 0.f);
+						
+			float4 colliderLinearVelocity = shapeDescription.linearVelocity;
+			float4 colliderAngularVelocity = shapeDescription.angularVelocity;
+			float4 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position - (float4)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w, 0.f));
+
+			float minDistance = capsuleRadius + capsuleMargin;
+					
+			// Check for a collision
+			if( dist < minDistance )
+			{
+				// Project back towards the surface along the normal; only
+				// 90 percent of the penetration is removed
+				position = position + (float4)((minDistance - dist)*normalVector*0.9f);
+				velocity = (position - previousPosition) * velocityCoefficient * isolverdt;
+				float4 relativeVelocity = velocity - velocityOfSurfacePoint;
+
+				// Two directions spanning the plane perpendicular to the normal
+				float4 p1 = normalize(cross(normalVector, segment));
+				float4 p2 = normalize(cross(p1, normalVector));
+				// Full friction is sum of velocities in each direction of plane
+				float4 frictionVector = p1*mydot3a(relativeVelocity, p1) + p2*mydot3a(relativeVelocity, p2);
+
+				// Real friction is peak friction corrected by friction coefficients
+				frictionVector = frictionVector * (colliderFriction*clothFriction);
+
+				float approachSpeed = dot(relativeVelocity, normalVector);
+
+				// Friction is applied only when the vertex is not separating
+				if( approachSpeed <= 0.0f )
+					forceOnVertex -= frictionVector;
+			}
+		}
+
+		g_vertexVelocities[nodeID] = (float4)(velocity.xyz, 0.f);	
+
+		// Update external force
+		g_vertexForces[nodeID] = (float4)(forceOnVertex.xyz, 0.f);
+
+		g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f);
+	}
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl
@ -0,0 +1,242 @@
+MSTRINGIFY(
+
+//#pragma OPENCL EXTENSION cl_amd_printf:enable\n
+
+// Sum of the products of the x, y and z lanes; w is left out on purpose so
+// that callers can pass points and directions alike.
+float mydot3a(float4 a, float4 b)
+{
+	return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+// Length of the xyz part of a; w is replaced by zero before length() runs.
+float mylength3(float4 a)
+{
+	return length((float4)(a.xyz, 0.f));
+}
+
+// Unit vector along the xyz part of a; w is replaced by zero first and is
+// therefore zero in the result.
+float4 mynormalize3(float4 a)
+{
+	return normalize((float4)(a.xyz, 0.f));
+}
+
+typedef struct // Range of collision objects belonging to one cloth.
+{
+	int firstObject; // offset of the first entry of the cloth in g_collisionObjectDetails
+	int endObject; // endObject - firstObject is used by the kernel as the object count
+} CollisionObjectIndices;
+
+typedef struct // One collision object; the kernel copies these into local memory.
+{
+	float4 shapeTransform[4]; // column major 4x4 matrix
+	float4 linearVelocity; // combined with angularVelocity into the velocity of the contact point
+	float4 angularVelocity;
+
+	int softBodyIdentifier; // not read by the kernel in this file
+	int collisionShapeType; // compared against CAPSULE_SHAPE_PROXYTYPE; other values are skipped
+	
+
+	// Shape information
+	// Compressed from the union
+	float radius; // radius + margin is the minimum allowed vertex distance
+	float halfHeight; // half length of the capsule centre line; values <= 0 are clamped by the kernel
+	int upAxis; // 0, 1 or 2 selects x, y or z as the local capsule axis
+		
+	float margin;
+	float friction; // multiplied with the per-cloth friction value
+
+	int padding0; // presumably pads the struct to match the host-side layout - TODO confirm
+	
+} CollisionShapeDescription;
+
+// From btBroadphaseProxy.h
+__constant int CAPSULE_SHAPE_PROXYTYPE = 10; // the only shape type handled by the kernel in this file
+
+// Product of a 4x4 matrix given as four float4 columns and a vector.
+// Row i of the matrix is assembled from component i of each column and
+// dotted with the vector to give component i of the result.
+float4 matrixVectorMul( float4 matrix[4], float4 vector )
+{
+	const float4 rowX = (float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x);
+	const float4 rowY = (float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y);
+	const float4 rowZ = (float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z);
+	const float4 rowW = (float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w);
+
+	return (float4)(dot(rowX, vector), dot(rowY, vector), dot(rowZ, vector), dot(rowW, vector));
+}
+
+// SIMD batched collision kernel. Every work-item handles one node: the
+// collision objects of the cloth are first staged in local memory by the
+// work-group, then the node is pushed out of every capsule and friction is
+// applied as a position correction. Velocity, force and position are
+// written back with w set to zero; the force written is always zero here.
+// numNodes is not read: there is no bounds check on the node index.
+__kernel void 
+SolveCollisionsAndUpdateVelocitiesKernel( 
+	const int numNodes,
+	const float isolverdt,
+	__global int *g_vertexClothIdentifier,
+	__global float4 *g_vertexPreviousPositions,
+	__global float * g_perClothFriction,
+	__global float * g_clothDampingFactor,
+	__global CollisionObjectIndices * g_perClothCollisionObjectIndices,
+	__global CollisionShapeDescription * g_collisionObjectDetails,
+	__global float4 * g_vertexForces,
+	__global float4 *g_vertexVelocities,
+	__global float4 *g_vertexPositions,
+	__local CollisionShapeDescription *localCollisionShapes,
+	__global float * g_vertexInverseMasses)
+{
+	const int nodeID = get_global_id(0);
+	const float4 forceOnVertex = (float4)(0.f, 0.f, 0.f, 0.f);
+
+	const int clothIdentifier = g_vertexClothIdentifier[nodeID];
+
+	// Abort if this is not a valid cloth. Note that this return sits in
+	// front of the barrier below.
+	if( clothIdentifier < 0 )
+		return;
+
+	float4 position = (float4)(g_vertexPositions[nodeID].xyz, 0.f);
+	const float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 0.f);
+
+	const float clothFriction = g_perClothFriction[clothIdentifier];
+	const float velocityCoefficient = (1.f - g_clothDampingFactor[clothIdentifier]);
+	float4 velocity = (position - previousPosition)*velocityCoefficient*isolverdt;
+	const float inverseMass = g_vertexInverseMasses[nodeID];
+
+	const CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
+	const int firstObject = collisionObjectIndices.firstObject;
+	const int numObjects = collisionObjectIndices.endObject - firstObject;
+
+	// Stage the collision objects of the cloth in local memory; work-item i
+	// of the group copies object i
+	if( numObjects > 0 && get_local_id(0) < numObjects )
+	{
+		localCollisionShapes[get_local_id(0)] = g_collisionObjectDetails[ firstObject + get_local_id(0) ];
+	}
+
+	// Safe as the vertices are padded so that not more than one soft body is in a group
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// The explicit test is kept although the loop bound would be enough;
+	// the original author reports that the compiler insists on it
+	if( numObjects > 0 )
+	{
+		for( int collision = 0; collision < numObjects; ++collision )
+		{
+			const CollisionShapeDescription shapeDescription = localCollisionShapes[collision];
+
+			// Only capsules are handled
+			if( shapeDescription.collisionShapeType != CAPSULE_SHAPE_PROXYTYPE )
+				continue;
+
+			const float colliderFriction = shapeDescription.friction;
+			const float capsuleRadius = shapeDescription.radius;
+			const float capsuleMargin = shapeDescription.margin;
+			const int capsuleupAxis = shapeDescription.upAxis;
+
+			// Keep the centre line from collapsing to a point
+			float capsuleHalfHeight = shapeDescription.halfHeight;
+			if( capsuleHalfHeight <= 0 )
+				capsuleHalfHeight = 0.0001f;
+
+			float4 worldTransform[4];
+			worldTransform[0] = shapeDescription.shapeTransform[0];
+			worldTransform[1] = shapeDescription.shapeTransform[1];
+			worldTransform[2] = shapeDescription.shapeTransform[2];
+			worldTransform[3] = shapeDescription.shapeTransform[3];
+
+			// End points of the centre line in local space: the half height
+			// goes on the up axis and is mirrored for the second point
+			float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f);
+			float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f);
+			c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 );
+			c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 );
+			c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 );
+			c2.x = -c1.x;
+			c2.y = -c1.y;
+			c2.z = -c1.z;
+
+			const float4 worldC1 = matrixVectorMul(worldTransform, c1);
+			const float4 worldC2 = matrixVectorMul(worldTransform, c2);
+			const float4 segment = (float4)((worldC2 - worldC1).xyz, 0.f);
+
+			// Signed distance of the vertex along the centre line from worldC1
+			const float4 segmentNormalized = mynormalize3(segment);
+			const float distanceAlongSegment = mydot3a( (position - worldC1), segmentNormalized );
+
+			const float4 closestPointOnSegment = (worldC1 + (float4)(segmentNormalized * distanceAlongSegment));
+			const float distanceFromLine = mylength3(position - closestPointOnSegment);
+			const float distanceFromC1 = mylength3(worldC1 - position);
+			const float distanceFromC2 = mylength3(worldC2 - position);
+
+			// Distance to the capsule axis and the point to push away from:
+			// an end point beyond either end of the segment and the closest
+			// point on the segment otherwise
+			float dist;
+			float4 pushOrigin;
+			if( distanceAlongSegment < 0 )
+			{
+				dist = distanceFromC1;
+				pushOrigin = worldC1;
+			} else if( distanceAlongSegment > length(segment) ) {
+				dist = distanceFromC2;
+				pushOrigin = worldC2;
+			} else {
+				dist = distanceFromLine;
+				pushOrigin = closestPointOnSegment;
+			}
+			const float4 normalVector = (float4)(normalize(position - pushOrigin).xyz, 0.f);
+
+			const float minDistance = capsuleRadius + capsuleMargin;
+			const float4 closestPointOnSurface = (float4)((position + (minDistance - dist) * normalVector).xyz, 0.f);
+
+			const float4 colliderLinearVelocity = shapeDescription.linearVelocity;
+			const float4 colliderAngularVelocity = shapeDescription.angularVelocity;
+			const float4 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, closestPointOnSurface - (float4)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w, 0.f));
+
+			// No collision with this capsule: leave position and velocity alone
+			if( !(dist < minDistance) )
+				continue;
+
+			// Project back to surface along normal
+			position = closestPointOnSurface;
+			velocity = (position - previousPosition) * velocityCoefficient * isolverdt;
+			const float4 relativeVelocity = velocity - velocityOfSurfacePoint;
+
+			// Two directions spanning the plane perpendicular to the normal
+			const float4 p1 = mynormalize3(cross(normalVector, segment));
+			const float4 p2 = mynormalize3(cross(p1, normalVector));
+
+			const float4 tangentialVel = p1*mydot3a(relativeVelocity, p1) + p2*mydot3a(relativeVelocity, p2);
+
+			// The combined friction coefficient is capped at one
+			float frictionCoef = (colliderFriction * clothFriction);
+			if( frictionCoef > 1.f )
+				frictionCoef = 1.f;
+
+			// Original comment: only apply friction if objects are not moving apart.
+			// NOTE(review): the test passes when the normal component is above
+			// -0.001, which includes separating motion - confirm the intent.
+			const float projVel = mydot3a(relativeVelocity, normalVector);
+			if( projVel >= -0.001f && inverseMass > 0 )
+			{
+				position += (-tangentialVel * frictionCoef) / (isolverdt);
+			}
+
+			// Bring the velocity in line with the possibly corrected position
+			velocity = (position - previousPosition) * velocityCoefficient * isolverdt;
+		}
+	}
+
+	g_vertexVelocities[nodeID] = (float4)(velocity.xyz, 0.f);
+
+	// Update external force
+	g_vertexForces[nodeID] = (float4)(forceOnVertex.xyz, 0.f);
+
+	g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f);
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositions.cl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositions.cl
@ -0,0 +1,57 @@
+
+
+
+MSTRINGIFY(
+
+
+// Dot product over the x, y and z lanes only; w does not contribute.
+float mydot3(float4 a, float4 b)
+{
+	return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+
+// One work-item per link of the current batch, which covers numLinks links
+// starting at startLink. Moves the two end nodes of the link along the
+// vector between them, weighted by their inverse masses, according to the
+// difference between the rest and the current squared length.
+// Links whose massLSC is not strictly positive are skipped. ti is not read.
+__kernel void 
+SolvePositionsFromLinksKernel( 
+	const int startLink,
+	const int numLinks,
+	const float kst,
+	const float ti,
+	__global int2 * g_linksVertexIndices,
+	__global float * g_linksMassLSC,
+	__global float * g_linksRestLengthSquared,
+	__global float * g_verticesInverseMass,
+	__global float4 * g_vertexPositions GUID_ARG)
+	
+{
+	const size_t linkInBatch = get_global_id(0);
+	if( linkInBatch >= numLinks )
+		return;
+
+	const int linkID = linkInBatch + startLink;
+
+	const float massLSC = g_linksMassLSC[linkID];
+	const float restLengthSquared = g_linksRestLengthSquared[linkID];
+
+	if( !(massLSC > 0.0f) )
+		return;
+
+	const int2 nodeIndices = g_linksVertexIndices[linkID];
+	const int node0 = nodeIndices.x;
+	const int node1 = nodeIndices.y;
+
+	const float4 position0 = g_vertexPositions[node0];
+	const float4 position1 = g_vertexPositions[node1];
+
+	const float inverseMass0 = g_verticesInverseMass[node0];
+	const float inverseMass1 = g_verticesInverseMass[node1];
+
+	// len holds the squared length of the link
+	const float4 del = position1 - position0;
+	const float len  = mydot3(del, del);
+	const float k    = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst;
+
+	// The nodes move in opposite directions along the link
+	g_vertexPositions[node0] = position0 - del*(k*inverseMass0);
+	g_vertexPositions[node1] = position1 + del*(k*inverseMass1);
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositionsSIMDBatched.cl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositionsSIMDBatched.cl
@ -0,0 +1,130 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+MSTRINGIFY(
+
+// Three-component dot product: multiplies the x, y and z lanes pairwise and sums them (w is ignored).
+float mydot3(float4 a, float4 b)
+{
+   float sum = a.x*b.x;
+   sum = sum + a.y*b.y;
+   sum = sum + a.z*b.z;
+   return sum;
+}
+
+// SIMD-aware, batched position solve for links.
+// Work-items are grouped into "wavefronts" of WAVEFRONT_SIZE lanes. Each wavefront copies the
+// vertices it uses into __local memory, relaxes its batches of links there, and finally writes
+// the vertex positions back to global memory.
+// WAVEFRONT_SIZE, WAVEFRONT_BLOCK_MULTIPLIER, MAX_NUM_VERTICES_PER_WAVE and MAX_BATCHES_PER_WAVE
+// are not defined in this file - presumably supplied as build-time defines; verify against the host code.
+// The ti argument and the __local wavefrontBatchCountsVertexCounts buffer are not referenced in the body.
+__kernel __attribute__((reqd_work_group_size(WAVEFRONT_BLOCK_MULTIPLIER*WAVEFRONT_SIZE, 1, 1)))
+void 
+SolvePositionsFromLinksKernel( 
+	const int startWaveInBatch,
+	const int numWaves,
+	const float kst,
+	const float ti,
+	__global int2 *g_wavefrontBatchCountsVertexCounts,
+	__global int *g_vertexAddressesPerWavefront,
+	__global int2 * g_linksVertexIndices,
+	__global float * g_linksMassLSC,
+	__global float * g_linksRestLengthSquared,
+	__global float * g_verticesInverseMass,
+	__global float4 * g_vertexPositions,
+	__local int2 *wavefrontBatchCountsVertexCounts,
+	__local float4 *vertexPositionSharedData,
+	__local float *vertexInverseMassSharedData)
+{
+	// Lane index via bit mask - assumes WAVEFRONT_SIZE is a power of two (TODO confirm).
+	const int laneInWavefront = (get_global_id(0) & (WAVEFRONT_SIZE-1));
+	// Absolute wavefront index, offset by the first wavefront of this batch.
+	const int wavefront = startWaveInBatch + (get_global_id(0) / WAVEFRONT_SIZE);
+	const int firstWavefrontInBlock = startWaveInBatch + get_group_id(0) * WAVEFRONT_BLOCK_MULTIPLIER;
+	// Wavefront index relative to this work-group; selects this wavefront's slice of the __local arrays.
+	const int localWavefront = wavefront - firstWavefrontInBlock;
+
+	// Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier	
+	if( wavefront < (startWaveInBatch + numWaves) )
+	{	
+		// Load the batch counts for the wavefronts
+		
+		int2 batchesAndVerticesWithinWavefront = g_wavefrontBatchCountsVertexCounts[wavefront];
+		int batchesWithinWavefront = batchesAndVerticesWithinWavefront.x;
+		int verticesUsedByWave = batchesAndVerticesWithinWavefront.y;
+
+		// Load the vertices for the wavefronts
+		// Each lane copies every WAVEFRONT_SIZE-th vertex of this wavefront's vertex list.
+		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
+		{
+			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
+
+			vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_vertexPositions[vertexAddress];
+			vertexInverseMassSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_verticesInverseMass[vertexAddress];
+		}
+		
+		// NOTE(review): this barrier and the two inside the loop below sit inside the
+		// "wavefront < ..." conditional and a loop whose trip count is read per wavefront.
+		// OpenCL expects every work-item of a work-group to reach a barrier - confirm that the
+		// host-side launch configuration / target hardware makes this safe.
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop through the batches performing the solve on each in LDS
+		int baseDataLocationForWave = WAVEFRONT_SIZE * wavefront * MAX_BATCHES_PER_WAVE;	
+
+		//for( int batch = 0; batch < batchesWithinWavefront; ++batch )
+		
+		// The do/while form executes the body once even when batchesWithinWavefront is 0.
+		int batch = 0;
+		do
+		{
+			int baseDataLocation = baseDataLocationForWave + WAVEFRONT_SIZE * batch;
+			int locationOfValue = baseDataLocation + laneInWavefront;
+			
+			
+			// These loads should all be perfectly linear across the WF
+			int2 localVertexIndices = g_linksVertexIndices[locationOfValue];
+			float massLSC = g_linksMassLSC[locationOfValue];
+			float restLengthSquared = g_linksRestLengthSquared[locationOfValue];
+			
+			// LDS vertex addresses based on logical wavefront number in block and loaded index
+			int vertexAddress0 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.x;
+			int vertexAddress1 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.y;
+			
+			float4 position0 = vertexPositionSharedData[vertexAddress0];
+			float4 position1 = vertexPositionSharedData[vertexAddress1];
+
+			float inverseMass0 = vertexInverseMassSharedData[vertexAddress0];
+			float inverseMass1 = vertexInverseMassSharedData[vertexAddress1]; 
+
+			float4 del = position1 - position0;
+			float len = mydot3(del, del);
+			
+			// k stays 0 when massLSC is not strictly positive, so such a link leaves both positions unchanged.
+			float k = 0;
+			if( massLSC > 0.0f )
+			{		
+				k = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst;
+			}
+			
+			position0 = position0 - del*(k*inverseMass0);
+			position1 = position1 + del*(k*inverseMass1);
+			
+			// Ensure compiler does not re-order memory operations
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			vertexPositionSharedData[vertexAddress0] = position0;
+			vertexPositionSharedData[vertexAddress1] = position1;
+			
+			// Ensure compiler does not re-order memory operations
+			barrier(CLK_LOCAL_MEM_FENCE);
+				
+			
+			++batch;
+		} while( batch < batchesWithinWavefront );
+
+		// Update the global memory vertices for the wavefronts
+		// The w lane of every written position is forced to 0.
+		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
+		{
+			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
+
+			g_vertexPositions[vertexAddress] = (float4)(vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex].xyz, 0.f);
+		}		
+		
+	}
+
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateConstants.cl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateConstants.cl
@ -0,0 +1,44 @@
+MSTRINGIFY(
+
+/*#define float3 float4
+
+float dot3(float3 a, float3 b)
+{
+   return a.x*b.x + a.y*b.y + a.z*b.z;
+}*/
+
+// Recomputes the per-link constants from the current vertex positions: one work-item per link.
+//   g_linksRestLengths       <- distance between the two link vertices
+//   g_linksRestLengthSquared <- that distance squared
+//   g_linksMassLSC           <- (inverseMass0 + inverseMass1) / material linear stiffness coefficient
+// Written with float4 only: this file lives in the OpenCL 1.0 kernel set and the float3 type
+// is not available before OpenCL 1.1.
+__kernel void 
+UpdateConstantsKernel( 
+	const int numLinks,
+	__global int2 * g_linksVertexIndices,
+	__global float4 * g_vertexPositions,
+	__global float * g_vertexInverseMasses,
+	__global float * g_linksMaterialLSC,
+	__global float * g_linksMassLSC,
+	__global float * g_linksRestLengthSquared,
+	__global float * g_linksRestLengths)
+{
+	int linkID = get_global_id(0);
+	if( linkID < numLinks )
+	{	
+		int2 nodeIndices = g_linksVertexIndices[linkID];
+		int node0 = nodeIndices.x;
+		int node1 = nodeIndices.y;
+		float linearStiffnessCoefficient = g_linksMaterialLSC[ linkID ];
+		
+		float4 position0   = g_vertexPositions[node0];
+		float4 position1   = g_vertexPositions[node1];
+		float inverseMass0 = g_vertexInverseMasses[node0];
+		float inverseMass1 = g_vertexInverseMasses[node1];
+
+		// Zero the w lane so the 4-wide dot product only measures x/y/z.
+		float4 difference = position0 - position1;
+		difference.w = 0.f;
+		float restLength = sqrt(dot(difference, difference));
+	
+		g_linksRestLengths[linkID] = restLength;
+		g_linksMassLSC[linkID] = (inverseMass0 + inverseMass1)/linearStiffnessCoefficient;
+		// Squared from the rounded length, as before, so the stored values stay consistent with each other.
+		g_linksRestLengthSquared[linkID] = restLength*restLength;		
+	}
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateFixedVertexPositions.cl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateFixedVertexPositions.cl
@ -0,0 +1,25 @@
+MSTRINGIFY(
+	
+// Copies the anchor position onto every vertex that has an anchor: one work-item per vertex.
+//   numNodes          : number of vertices to process
+//   g_anchorIndex     : per-vertex index into g_anchorPositions; negative entries are skipped
+//   g_vertexPositions : vertex positions, overwritten for anchored vertices
+//   g_anchorPositions : anchor positions
+__kernel void 
+UpdateFixedVertexPositions(
+	const uint numNodes,
+	__global int * g_anchorIndex,
+	__global float4 * g_vertexPositions,
+	__global float4 * g_anchorPositions GUID_ARG)
+{
+	unsigned int nodeID = get_global_id(0);
+
+	if( nodeID < numNodes )
+	{		
+		int anchorIndex  = g_anchorIndex[nodeID];
+
+		// The current position is never needed, so it is not fetched from global memory.
+		if ( anchorIndex >= 0 )
+		{
+			g_vertexPositions[nodeID] = g_anchorPositions[anchorIndex];
+		}
+	}		
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNodes.cl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNodes.cl
@ -0,0 +1,39 @@
+MSTRINGIFY(
+
+
+// Velocity update that keeps the existing velocity: one work-item per vertex.
+// Adds (position - previousPosition) * correctionCoefficient * isolverdt to the stored velocity,
+// scales the result by (1 - dampingFactor) of the owning cloth, and clears the vertex force.
+__kernel void 
+updateVelocitiesFromPositionsWithVelocitiesKernel( 
+	int numNodes,
+	float isolverdt,
+	__global float4 * g_vertexPositions,
+	__global float4 * g_vertexPreviousPositions,
+	__global int * g_vertexClothIndices,
+	__global float *g_clothVelocityCorrectionCoefficients,
+	__global float * g_clothDampingFactor,
+	__global float4 * g_vertexVelocities,
+	__global float4 * g_vertexForces GUID_ARG)
+{
+	const int nodeID = get_global_id(0);
+	if( nodeID >= numNodes )
+		return;
+
+	// Per-cloth parameters are looked up through the cloth index of the vertex.
+	const int cloth = g_vertexClothIndices[nodeID];
+	const float correction = g_clothVelocityCorrectionCoefficients[cloth];
+	const float retained = (1.f - g_clothDampingFactor[cloth]);
+
+	const float4 displacement = g_vertexPositions[nodeID] - g_vertexPreviousPositions[nodeID];
+
+	float4 updated = g_vertexVelocities[nodeID];
+	updated = updated + displacement*correction*isolverdt;
+
+	// Damping is applied after the correction term has been added.
+	updated = updated*retained;
+
+	g_vertexVelocities[nodeID] = updated;
+	g_vertexForces[nodeID] = (float4)(0.f, 0.f, 0.f, 0.f);
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNormals.cl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNormals.cl
@ -0,0 +1,102 @@
+MSTRINGIFY(
+
+// Euclidean length of the x/y/z part of a float4; the w lane is replaced by zero first.
+float length3(float4 a)
+{
+	const float4 xyzOnly = (float4)(a.x, a.y, a.z, 0.0f);
+	return length(xyzOnly);
+}
+
+// Normalizes the x/y/z part of a float4; the w lane is replaced by zero before normalizing.
+float4 normalize3(float4 a)
+{
+	const float4 xyzOnly = (float4)(a.x, a.y, a.z, 0.0f);
+	return normalize(xyzOnly);
+}
+
+// Clears the accumulated normal and area of every vertex: one work-item per vertex.
+__kernel void 
+ResetNormalsAndAreasKernel(
+	const unsigned int numNodes,
+	__global float4 * g_vertexNormals,
+	__global float * g_vertexArea GUID_ARG)
+{
+	const size_t node = get_global_id(0);
+
+	// Work-items past the last vertex do nothing.
+	if( node >= numNodes )
+		return;
+
+	g_vertexNormals[node] = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
+	g_vertexArea[node]    = 0.0f;
+}
+
+
+// Per-face normal/area pass: one work-item per triangle of the current batch.
+// Stores the normalized face normal and the length of the edge cross product for the face, and
+// adds the unnormalized cross product and that length to each of the three corner vertices.
+//   startFace / numFaces : offset and size of the batch inside the triangle arrays
+__kernel void 
+UpdateSoftBodiesKernel(
+	const unsigned int startFace,
+	const unsigned int numFaces,
+	__global int4 * g_triangleVertexIndexSet,
+	__global float4 * g_vertexPositions,
+	__global float4 * g_vertexNormals,
+	__global float * g_vertexArea,
+	__global float4 * g_triangleNormals,
+	__global float * g_triangleArea GUID_ARG)
+{
+	if( get_global_id(0) >= numFaces )
+		return;
+
+	const int faceID = get_global_id(0) + startFace;
+	const int4 corners = g_triangleVertexIndexSet[ faceID ];
+
+	// Every input is fetched up front; nothing is written until all loads are done.
+	const float4 cornerPosition0 = g_vertexPositions[corners.x];
+	const float4 cornerPosition1 = g_vertexPositions[corners.y];
+	const float4 cornerPosition2 = g_vertexPositions[corners.z];
+	const float4 accumulatedNormal0 = g_vertexNormals[corners.x];
+	const float4 accumulatedNormal1 = g_vertexNormals[corners.y];
+	const float4 accumulatedNormal2 = g_vertexNormals[corners.z];
+	const float accumulatedArea0 = g_vertexArea[corners.x];
+	const float accumulatedArea1 = g_vertexArea[corners.y];
+	const float accumulatedArea2 = g_vertexArea[corners.z];
+
+	// Cross product of the two edges leaving corner 0.
+	const float4 faceNormal = cross(cornerPosition1 - cornerPosition0, cornerPosition2 - cornerPosition0);
+	const float triangleArea = length(faceNormal);
+
+	g_triangleNormals[faceID] = normalize3(faceNormal);
+	g_vertexNormals[corners.x] = accumulatedNormal0 + faceNormal;
+	g_vertexNormals[corners.y] = accumulatedNormal1 + faceNormal;
+	g_vertexNormals[corners.z] = accumulatedNormal2 + faceNormal;
+	g_triangleArea[faceID] = triangleArea;
+	g_vertexArea[corners.x] = accumulatedArea0 + triangleArea;
+	g_vertexArea[corners.y] = accumulatedArea1 + triangleArea;
+	g_vertexArea[corners.z] = accumulatedArea2 + triangleArea;
+}
+
+// Final normal/area pass: one work-item per vertex.
+// Normalizes the accumulated vertex normal and replaces the accumulated area with its average
+// over the triangles counted for the vertex.
+//   numNodes              : number of vertices to process
+//   g_vertexTriangleCount : per-vertex triangle count used as the divisor
+//   g_vertexNormals       : accumulated normals in, unit normals out
+//   g_vertexArea          : accumulated areas in, averaged areas out
+__kernel void 
+NormalizeNormalsAndAreasKernel( 
+	const unsigned int numNodes,
+	__global int * g_vertexTriangleCount,
+	__global float4 * g_vertexNormals,
+	__global float * g_vertexArea GUID_ARG)
+{
+	if( get_global_id(0) < numNodes )
+	{
+		float4 normal = g_vertexNormals[get_global_id(0)];
+		float area = g_vertexArea[get_global_id(0)];
+		int numTriangles = g_vertexTriangleCount[get_global_id(0)];
+		
+		g_vertexNormals[get_global_id(0)] = normalize3(normal);
+
+		// A vertex with no counted triangle would otherwise divide by zero and store NaN/inf,
+		// which then spreads through every later use of the area. Store 0 for it instead.
+		g_vertexArea[get_global_id(0)] = (numTriangles > 0) ? area/(float)(numTriangles) : 0.0f;
+	}
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositions.cl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositions.cl
@ -0,0 +1,34 @@
+MSTRINGIFY(
+
+// Velocity update that discards the existing velocity: one work-item per vertex.
+// The new velocity is (position - previousPosition) * (1 - dampingFactor) * isolverdt, with the
+// damping factor taken from the cloth that owns the vertex. The vertex force is cleared.
+//   numNodes   : number of vertices to process
+//   isolverdt  : multiplier applied to the displacement
+__kernel void 
+updateVelocitiesFromPositionsWithoutVelocitiesKernel( 
+	const int numNodes,
+	const float isolverdt,
+	__global float4 * g_vertexPositions,
+	__global float4 * g_vertexPreviousPositions,
+	__global int * g_vertexClothIndices,
+	__global float * g_clothDampingFactor,
+	__global float4 * g_vertexVelocities,
+	__global float4 * g_vertexForces GUID_ARG)
+
+{
+	int nodeID = get_global_id(0);
+	if( nodeID < numNodes )
+	{	
+		float4 position = g_vertexPositions[nodeID];
+		float4 previousPosition = g_vertexPreviousPositions[nodeID];
+		int clothIndex = g_vertexClothIndices[nodeID];
+		float dampingFactor = g_clothDampingFactor[clothIndex];
+		float velocityCoefficient = (1.f - dampingFactor);
+		
+		float4 difference = position - previousPosition;
+
+		// The previous velocity is fully replaced, so it is not read from global memory.
+		g_vertexVelocities[nodeID] = difference*velocityCoefficient*isolverdt;
+		g_vertexForces[nodeID] = (float4)(0.f, 0.f, 0.f, 0.f);								
+	}
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositionsFromVelocities.cl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositionsFromVelocities.cl
@ -0,0 +1,28 @@
+
+MSTRINGIFY(
+
+
+
+
+// Explicit position step: one work-item per vertex.
+// Computes previousPosition + velocity * solverSDT and stores the result as both the current and
+// the previous position of the vertex.
+__kernel void 
+UpdatePositionsFromVelocitiesKernel( 
+	const int numNodes,
+	const float solverSDT,
+	__global float4 * g_vertexVelocities,
+	__global float4 * g_vertexPreviousPositions,
+	__global float4 * g_vertexCurrentPosition GUID_ARG)
+{
+	const int vertexID = get_global_id(0);
+	if( vertexID >= numNodes )
+		return;
+
+	const float4 startPosition = g_vertexPreviousPositions[vertexID];
+	const float4 step = g_vertexVelocities[vertexID]*solverSDT;
+	const float4 advanced = startPosition + step;
+
+	g_vertexCurrentPosition[vertexID]   = advanced;
+	g_vertexPreviousPositions[vertexID] = advanced;
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/VSolveLinks.cl
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/VSolveLinks.cl
@ -0,0 +1,45 @@
+MSTRINGIFY(
+
+// Velocity-level link solve: one work-item per link of the current batch.
+// Projects the relative velocity of the two link vertices onto the stored link vector and applies
+// an impulse along that vector, weighted by each vertex's inverse mass.
+//   startLink / numLinks : offset and size of the batch inside the link arrays
+//   kst                  : scale applied to the impulse
+// Written with float4 only: this file lives in the OpenCL 1.0 kernel set and the float3 type
+// is not available before OpenCL 1.1. The w lanes are zeroed so they never affect the result, and
+// the stored velocities keep w = 0 exactly as before.
+__kernel void 
+VSolveLinksKernel( 
+	int startLink,
+	int numLinks,
+	float kst,
+	__global int2 * g_linksVertexIndices,
+	__global float * g_linksLengthRatio,
+	__global float4 * g_linksCurrentLength,
+	__global float * g_vertexInverseMass,
+	__global float4 * g_vertexVelocity GUID_ARG)
+{
+	int linkID = get_global_id(0) + startLink;
+	if( get_global_id(0) < numLinks )
+	{		
+		int2 nodeIndices = g_linksVertexIndices[linkID];
+		int node0 = nodeIndices.x;
+		int node1 = nodeIndices.y;
+		
+		float linkLengthRatio = g_linksLengthRatio[linkID];
+		float4 linkCurrentLength = g_linksCurrentLength[linkID];
+		linkCurrentLength.w = 0.f;
+		
+		float4 vertexVelocity0 = g_vertexVelocity[node0];
+		float4 vertexVelocity1 = g_vertexVelocity[node1];
+		vertexVelocity0.w = 0.f;
+		vertexVelocity1.w = 0.f;
+
+		float vertexInverseMass0 = g_vertexInverseMass[node0];
+		float vertexInverseMass1 = g_vertexInverseMass[node1]; 
+
+		// With every w lane at zero the 4-wide dot product equals the 3-component one.
+		float4 nodeDifference = vertexVelocity0 - vertexVelocity1;
+		float dotResult = dot(linkCurrentLength, nodeDifference);
+		float j = -dotResult*linkLengthRatio*kst;
+		
+		float4 velocityChange0 = linkCurrentLength*(j*vertexInverseMass0);
+		float4 velocityChange1 = linkCurrentLength*(j*vertexInverseMass1);
+		
+		vertexVelocity0 += velocityChange0;
+		vertexVelocity1 -= velocityChange1;
+
+		g_vertexVelocity[node0] = vertexVelocity0;
+		g_vertexVelocity[node1] = vertexVelocity1;
+	}
+}
+
+);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h
@ -0,0 +1,209 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_SOFT_BODY_SOLVER_BUFFER_OPENCL_H
+#define BT_SOFT_BODY_SOLVER_BUFFER_OPENCL_H
+
+// OpenCL support
+
+#ifdef USE_MINICL
+	#include "MiniCL/cl.h"
+#else //USE_MINICL
+	#ifdef __APPLE__
+		#include <OpenCL/OpenCL.h>
+	#else
+		#include <CL/cl.h>
+	#endif //__APPLE__
+#endif//USE_MINICL
+
+#ifndef SAFE_RELEASE
+#define SAFE_RELEASE(p)      { if(p) { (p)->Release(); (p)=NULL; } }
+#endif
+
+/**
+ * Pairs a CPU-side btAlignedObjectArray with an OpenCL buffer object and tracks which side
+ * currently holds the up-to-date data.
+ *
+ * The wrapper does not own the CPU array. It does release the cl_mem it holds (whether created
+ * here or handed in through createBuffer) when it is destroyed or when the buffer is replaced.
+ * All transfer functions return false when the underlying OpenCL call fails.
+ */
+template <typename ElementType> class btOpenCLBuffer
+{
+public:
+
+	cl_command_queue	m_cqCommandQue;
+	cl_context			m_clContext;
+	cl_mem				m_buffer;
+
+
+
+	btAlignedObjectArray< ElementType > * m_CPUBuffer;
+	
+	int  m_gpuSize;			// element count of m_CPUBuffer at the time the device buffer was (re)created
+	bool m_onGPU;			// true after a successful upload, cleared by moveFromGPU/changedOnCPU or a size change
+	bool m_readOnlyOnGPU;	// selects CL_MEM_READ_ONLY and disables read-back
+	bool m_allocated;		// true once moveToGPU has created a device buffer
+
+
+	/**
+	 * Install a device buffer sized for the current CPU array.
+	 * @param preexistingBuffer optional existing cl_mem to adopt instead of allocating one.
+	 * @return true on success; false if clCreateBuffer failed, in which case the previously
+	 *         held buffer and m_gpuSize are left untouched.
+	 */
+	bool createBuffer( cl_mem* preexistingBuffer = 0)
+	{
+		cl_mem newBuffer = 0;
+
+		if( preexistingBuffer )
+		{
+			newBuffer = *preexistingBuffer;
+		} 
+		else {
+
+			cl_int err = CL_SUCCESS;
+			cl_mem_flags flags= m_readOnlyOnGPU ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE;
+
+			size_t size = m_CPUBuffer->size() * sizeof(ElementType);
+			// At a minimum the buffer must exist
+			if( size == 0 )
+				size = sizeof(ElementType);
+			newBuffer = clCreateBuffer(m_clContext, flags, size, 0, &err);
+			if( err != CL_SUCCESS )
+			{
+				// "0 &&" is required: a bare string literal is a non-null pointer and never asserts.
+				btAssert( 0 && "Buffer::Buffer(m_buffer)");
+				return false;
+			}
+		}
+
+		// Drop the buffer being replaced so a resize does not leak device memory.
+		if( m_buffer && m_buffer != newBuffer )
+		{
+			clReleaseMemObject(m_buffer);
+		}
+
+		m_buffer = newBuffer;
+		m_gpuSize = m_CPUBuffer->size();
+
+		return true;
+	}
+
+public:
+	btOpenCLBuffer( cl_command_queue	commandQue,cl_context ctx, btAlignedObjectArray< ElementType >* CPUBuffer, bool readOnly)
+		:m_cqCommandQue(commandQue),
+		m_clContext(ctx),
+		m_buffer(0),
+		m_CPUBuffer(CPUBuffer),
+		m_gpuSize(0),
+		m_onGPU(false),
+		m_readOnlyOnGPU(readOnly),
+		m_allocated(false)
+	{
+	}
+
+	~btOpenCLBuffer()
+	{
+		// Nothing to release if no device buffer was ever created or adopted.
+		if( m_buffer )
+		{
+			clReleaseMemObject(m_buffer);
+		}
+	}
+
+
+	/**
+	 * Make sure a device buffer exists and, if the CPU data is newer or has changed size,
+	 * enqueue a non-blocking upload of the whole CPU array.
+	 * @return false if buffer creation or the write enqueue failed.
+	 */
+	bool moveToGPU()
+	{
+		// A size change invalidates whatever is on the device.
+		if( (m_CPUBuffer->size() != m_gpuSize) )
+		{
+			m_onGPU = false;
+		}
+
+		if( !m_allocated && m_CPUBuffer->size() == 0  )
+		{
+			// If it isn't on the GPU and yet there is no data on the CPU side this may cause a problem with some kernels.
+			// We should create *something* on the device side
+			if (!createBuffer()) {
+				return false;
+			}
+			m_allocated = true;
+		}
+
+		if( !m_onGPU && m_CPUBuffer->size() > 0 )
+		{
+			if (!m_allocated || (m_CPUBuffer->size() != m_gpuSize)) {
+				if (!createBuffer()) {
+					return false;
+				}
+				m_allocated = true;
+			}
+			
+			size_t size = m_CPUBuffer->size() * sizeof(ElementType);
+			cl_int err = clEnqueueWriteBuffer(m_cqCommandQue,m_buffer,
+				CL_FALSE,
+				0,
+				size, 
+				&((*m_CPUBuffer)[0]),0,0,0);
+			if( err != CL_SUCCESS )
+			{
+				btAssert( 0 && "CommandQueue::enqueueWriteBuffer(m_buffer)" );
+				// Leave m_onGPU false so the next call retries the upload.
+				return false;
+			}
+
+			m_onGPU = true;
+		}
+
+		return true;
+
+	}
+
+	/**
+	 * Blocking read-back of the device data into the CPU array, after which the CPU copy is
+	 * treated as the current one. Does nothing for empty, read-only or not-uploaded buffers.
+	 * @return false if the read enqueue failed (the buffer is then still marked as on the GPU).
+	 */
+	bool moveFromGPU()
+	{
+		if (m_CPUBuffer->size() > 0) {
+			if (m_onGPU && !m_readOnlyOnGPU) {
+				size_t size = m_CPUBuffer->size() * sizeof(ElementType);
+				cl_int err = clEnqueueReadBuffer(m_cqCommandQue,
+					m_buffer,
+					CL_TRUE,
+					0,
+					size,
+					&((*m_CPUBuffer)[0]),0,0,0);
+
+				if( err != CL_SUCCESS )
+				{
+					btAssert( 0 && "CommandQueue::enqueueReadBuffer(m_buffer)" );
+					return false;
+				}
+
+				m_onGPU = false;
+			}
+		}
+
+		return true;
+	}
+
+	/**
+	 * Blocking read-back like moveFromGPU, but the device copy stays marked as current.
+	 * @return false if the read enqueue failed.
+	 */
+	bool copyFromGPU()
+	{
+		if (m_CPUBuffer->size() > 0) {
+			if (m_onGPU && !m_readOnlyOnGPU) {
+				size_t size = m_CPUBuffer->size() * sizeof(ElementType);
+				cl_int err = clEnqueueReadBuffer(m_cqCommandQue,
+					m_buffer,
+					CL_TRUE,
+					0,size, 
+					&((*m_CPUBuffer)[0]),0,0,0);
+
+				if( err != CL_SUCCESS )
+				{
+					btAssert( 0 && "CommandQueue::enqueueReadBuffer(m_buffer)");
+					return false;
+				}
+
+			}
+		}
+
+		return true;
+	}
+
+	/** Mark the CPU array as modified so the next moveToGPU uploads it again. */
+	virtual void changedOnCPU()
+	{
+		m_onGPU = false;
+	}
+}; // class btOpenCLBuffer
+
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_BUFFER_OPENCL_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCL.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCL.h
@ -0,0 +1,99 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"
+
+
+#ifndef BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_H
+#define BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_H
+
+
+/**
+ * OpenCL variant of btSoftBodyLinkData.
+ * Declares one btOpenCLBuffer per link attribute plus the tables used to batch links for
+ * computation. All non-inline members are defined outside this header.
+ */
+class btSoftBodyLinkDataOpenCL : public btSoftBodyLinkData
+{
+public:
+	// NOTE(review): presumably tracks whether the link data currently lives on the device;
+	// it is maintained by code outside this header - confirm there.
+	bool				m_onGPU;
+
+	cl_command_queue	m_cqCommandQue;
+
+
+	// Device buffers, one per link attribute.
+	// NOTE(review): presumably each wraps the matching CPU array of btSoftBodyLinkData; the
+	// constructor that binds them is not visible here.
+	btOpenCLBuffer<LinkNodePair> m_clLinks;
+	btOpenCLBuffer<float>							      m_clLinkStrength;
+	btOpenCLBuffer<float>								  m_clLinksMassLSC;
+	btOpenCLBuffer<float>								  m_clLinksRestLengthSquared;
+	btOpenCLBuffer<Vectormath::Aos::Vector3>			  m_clLinksCLength;
+	btOpenCLBuffer<float>								  m_clLinksLengthRatio;
+	btOpenCLBuffer<float>								  m_clLinksRestLength;
+	btOpenCLBuffer<float>								  m_clLinksMaterialLinearStiffnessCoefficient;
+
+	/** Start index and length of one computation batch; both default to 0. */
+	struct BatchPair
+	{
+		int start;
+		int length;
+
+		BatchPair() :
+			start(0),
+			length(0)
+		{
+		}
+
+		BatchPair( int s, int l ) : 
+			start( s ),
+			length( l )
+		{
+		}
+	};
+
+	/**
+	 * Link addressing information for each cloth.
+	 * Allows link locations to be computed independently of data batching.
+	 */
+	btAlignedObjectArray< int >							m_linkAddresses;
+
+	/**
+	 * Start and length values for computation batches over link data.
+	 */
+	btAlignedObjectArray< BatchPair >		m_batchStartLengths;
+
+	/** Construct with the command queue and context to use for the device buffers. */
+	btSoftBodyLinkDataOpenCL(cl_command_queue queue, cl_context ctx);
+
+	virtual ~btSoftBodyLinkDataOpenCL();
+
+	/** Allocate enough space in all link-related arrays to fit numLinks links */
+	virtual void createLinks( int numLinks );
+	
+	/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
+	virtual void setLinkAt( 
+		const LinkDescription &link, 
+		int linkIndex );
+
+	// Accelerator residency interface; the definitions are not part of this header.
+	virtual bool onAccelerator();
+
+	virtual bool moveToAccelerator();
+
+	virtual bool moveFromAccelerator();
+
+	/**
+	 * Generate (and later update) the batching for the entire link set.
+	 * This redoes a lot of work because it batches the entire set when each cloth is inserted.
+	 * In theory we could delay it until just before we need the cloth.
+	 * It's a one-off overhead, though, so that is a later optimisation.
+	 */
+	void generateBatches();
+};
+
+
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCLSIMDAware.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCLSIMDAware.h
@ -0,0 +1,169 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"
+
+
+#ifndef BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_SIMDAWARE_H
+#define BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_SIMDAWARE_H
+
+
+/**
+ * OpenCL variant of btSoftBodyLinkData that organises links per wavefront.
+ * Besides the device buffers it keeps the per-wavefront tables (batch and vertex counts, global
+ * vertex addresses, wavefront-local vertex pairs) and exposes them through inline accessors.
+ * All non-inline members are defined outside this header.
+ */
+class btSoftBodyLinkDataOpenCLSIMDAware : public btSoftBodyLinkData
+{
+public:
+	// NOTE(review): presumably tracks whether the link data currently lives on the device;
+	// it is maintained by code outside this header - confirm there.
+	bool				m_onGPU;
+
+	cl_command_queue	m_cqCommandQue;
+
+	// Wavefront geometry. The three const members must be set in the constructor's initializer
+	// list; their values are not visible from this header.
+	const int m_wavefrontSize;
+	const int m_linksPerWorkItem;
+	const int m_maxLinksPerWavefront;
+	int m_maxBatchesWithinWave;
+	int m_maxVerticesWithinWave;
+	int m_numWavefronts;
+
+	int m_maxVertex;
+
+	/** Batch count and vertex count of one wavefront. */
+	struct NumBatchesVerticesPair
+	{
+		int numBatches;
+		int numVertices;
+	};
+
+	btAlignedObjectArray<int>							  m_linksPerWavefront;
+	btAlignedObjectArray<NumBatchesVerticesPair>		  m_numBatchesAndVerticesWithinWaves;
+	btOpenCLBuffer< NumBatchesVerticesPair >			  m_clNumBatchesAndVerticesWithinWaves;
+
+	// All arrays here will contain batches of m_maxLinksPerWavefront links
+	// ordered by wavefront.
+	// with either global vertex pairs or local vertex pairs
+	btAlignedObjectArray< int >							  m_wavefrontVerticesGlobalAddresses; // List of global vertices per wavefront
+	btOpenCLBuffer<int>									  m_clWavefrontVerticesGlobalAddresses;
+	btAlignedObjectArray< LinkNodePair >				  m_linkVerticesLocalAddresses; // Vertex pair for the link
+	btOpenCLBuffer<LinkNodePair>						  m_clLinkVerticesLocalAddresses;
+	btOpenCLBuffer<float>							      m_clLinkStrength;
+	btOpenCLBuffer<float>								  m_clLinksMassLSC;
+	btOpenCLBuffer<float>								  m_clLinksRestLengthSquared;
+	btOpenCLBuffer<float>								  m_clLinksRestLength;
+	btOpenCLBuffer<float>								  m_clLinksMaterialLinearStiffnessCoefficient;
+
+	/** Start index and length of one computation batch; both default to 0. */
+	struct BatchPair
+	{
+		int start;
+		int length;
+
+		BatchPair() :
+			start(0),
+			length(0)
+		{
+		}
+
+		BatchPair( int s, int l ) : 
+			start( s ),
+			length( l )
+		{
+		}
+	};
+
+	/**
+	 * Link addressing information for each cloth.
+	 * Allows link locations to be computed independently of data batching.
+	 */
+	btAlignedObjectArray< int >							m_linkAddresses;
+	
+	/**
+	 * Start and length values for computation batches over link data.
+	 */
+	btAlignedObjectArray< BatchPair >		m_wavefrontBatchStartLengths;
+
+	/** Construct with the command queue and context to use for the device buffers. */
+	btSoftBodyLinkDataOpenCLSIMDAware(cl_command_queue queue, cl_context ctx);
+
+	virtual ~btSoftBodyLinkDataOpenCLSIMDAware();
+
+	/** Allocate enough space in all link-related arrays to fit numLinks links */
+	virtual void createLinks( int numLinks );
+	
+	/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
+	virtual void setLinkAt( 
+		const LinkDescription &link, 
+		int linkIndex );
+
+	// Accelerator residency interface; the definitions are not part of this header.
+	virtual bool onAccelerator();
+
+	virtual bool moveToAccelerator();
+
+	virtual bool moveFromAccelerator();
+
+	/**
+	 * Generate (and later update) the batching for the entire link set.
+	 * This redoes a lot of work because it batches the entire set when each cloth is inserted.
+	 * In theory we could delay it until just before we need the cloth.
+	 * It's a one-off overhead, though, so that is a later optimisation.
+	 */
+	void generateBatches();
+
+	/** Returns m_maxVerticesWithinWave. */
+	int getMaxVerticesPerWavefront()
+	{
+		return m_maxVerticesWithinWave;
+	}
+
+	/** Returns m_wavefrontSize. */
+	int getWavefrontSize()
+	{
+		return m_wavefrontSize;
+	}
+
+	/** Returns m_linksPerWorkItem. */
+	int getLinksPerWorkItem()
+	{
+		return m_linksPerWorkItem;
+	}
+
+	/** Returns m_maxLinksPerWavefront. */
+	int getMaxLinksPerWavefront()
+	{
+		return m_maxLinksPerWavefront;
+	}
+
+	/** Returns m_maxBatchesWithinWave. */
+	int getMaxBatchesPerWavefront()
+	{
+		return m_maxBatchesWithinWave;
+	}
+
+	/** Returns m_numWavefronts. */
+	int getNumWavefronts()
+	{
+		return m_numWavefronts;
+	}
+
+	/** Returns the batch/vertex count pair stored for the given wavefront index. */
+	NumBatchesVerticesPair getNumBatchesAndVerticesWithinWavefront( int wavefront )
+	{
+		return m_numBatchesAndVerticesWithinWaves[wavefront];
+	}
+
+	/** Returns the entry of m_wavefrontVerticesGlobalAddresses at vertexIndex. */
+	int getVertexGlobalAddresses( int vertexIndex )
+	{
+		return m_wavefrontVerticesGlobalAddresses[vertexIndex];
+	}
+
+	/**
+	 * Get post-batching local addresses of the vertex pair for a link assuming all vertices used by a wavefront are loaded locally.
+	 */
+	LinkNodePair getVertexPairLocalAddresses( int linkIndex )
+	{
+		return m_linkVerticesLocalAddresses[linkIndex];
+	}
+};
+
+
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_SIMDAWARE_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.cpp
@ -0,0 +1,126 @@
+#include "btSoftBodySolverOutputCLtoGL.h"
+#include <stdio.h> //@todo: remove the debugging printf at some stage
+#include "btSoftBodySolver_OpenCL.h"
+#include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
+#include "btSoftBodySolverVertexBuffer_OpenGL.h"
+#include "BulletSoftBody/btSoftBody.h"
+
+////OpenCL 1.0 kernels don't use float3
+// MSTRINGIFY turns its argument into a string literal.
+// NOTE(review): the included .cl file is presumably wrapped in MSTRINGIFY( ... ); so that its
+// text becomes the initializer of the pointer below - confirm in OutputToVertexArray.cl.
+#define MSTRINGIFY(A) #A
+// Pointer-to-const: the initializer is a string literal, which must never be written through,
+// and the only consumer (CLFunctions::compileCLKernelFromString) takes a const char*.
+static const char* OutputToVertexArrayCLString =
+#include "OpenCLC10/OutputToVertexArray.cl"
+
+	
+// Release a kernel handle if it is set and reset the handle to 0 so a second release is a no-op.
+#define RELEASE_CL_KERNEL(kernelName) {if( kernelName ){ clReleaseKernel( kernelName ); kernelName = 0; }}
+
+// Local work size used when launching the output kernels; the global size is rounded up to a multiple of it.
+static const size_t workGroupSize = 128;
+
+/**
+ * Copy the current vertex data of one soft body from the OpenCL solver into an
+ * OpenGL interop vertex buffer by running one of the two output kernels.
+ *
+ * softBody     - body whose vertices are written; its solver must report CL_SOLVER or CL_SIMD_SOLVER.
+ * vertexBuffer - destination descriptor; only OPENGL_BUFFER descriptors are handled.
+ *
+ * Failures are reported through btAssert. Work that cannot succeed (kernels not built,
+ * cloth not known to the solver, GL buffer not acquired, a kernel argument rejected) is
+ * skipped instead of being enqueued. The command queue is drained with clFinish before
+ * returning in every case.
+ */
+void btSoftBodySolverOutputCLtoGL::copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer )
+{
+
+	btSoftBodySolver *solver = softBody->getSoftBodySolver();
+	btAssert( solver->getSolverType() == btSoftBodySolver::CL_SOLVER || solver->getSolverType() == btSoftBodySolver::CL_SIMD_SOLVER );
+	btOpenCLSoftBodySolver *clSolver = static_cast< btOpenCLSoftBodySolver * >( solver );
+
+	// Kernels are compiled lazily; without them there is nothing valid to enqueue.
+	const bool kernelsReady = checkInitialized();
+	btAssert( kernelsReady && "checkInitialized(copySoftBodyToVertexBuffer)" );
+
+	btOpenCLAcceleratedSoftBodyInterface* currentCloth = clSolver->findSoftBodyInterface( softBody );
+	btAssert( currentCloth && "findSoftBodyInterface(copySoftBodyToVertexBuffer)" );
+	btSoftBodyVertexDataOpenCL &vertexData( clSolver->m_vertexData );	
+
+	if( vertexBuffer->getBufferType() != btVertexBufferDescriptor::OPENGL_BUFFER )
+	{
+		btAssert( 0 && "Undefined output for this solver output" );
+	}
+	else if( kernelsReady && currentCloth )
+	{
+		const btOpenGLInteropVertexBufferDescriptor *openGLVertexBuffer = static_cast< btOpenGLInteropVertexBufferDescriptor* >(vertexBuffer);
+		cl_mem clBuffer = openGLVertexBuffer->getBuffer();
+
+		// The descriptor decides which kernel variant runs.
+		const bool writeNormals = vertexBuffer->hasNormals();
+		cl_kernel outputKernel = writeNormals ? outputToVertexArrayWithNormalsKernel : outputToVertexArrayWithoutNormalsKernel;
+
+		// The GL buffer must be acquired by OpenCL before a kernel may write to it.
+		cl_int ciErrNum = clEnqueueAcquireGLObjects(m_cqCommandQue, 1, &clBuffer, 0, 0, NULL);
+		if( ciErrNum != CL_SUCCESS )
+		{
+			btAssert( 0 &&  "clEnqueueAcquireGLObjects(copySoftBodyToVertexBuffer)");
+		}
+		else
+		{
+			const int firstVertex = currentCloth->getFirstVertex();
+			int numVertices = currentCloth->getNumVertices();
+
+			// Every call is made; argsOk only records whether all of them succeeded.
+			bool argsOk = true;
+			argsOk = ( clSetKernelArg(outputKernel, 0, sizeof(int), &firstVertex ) == CL_SUCCESS ) && argsOk;
+			argsOk = ( clSetKernelArg(outputKernel, 1, sizeof(int), &numVertices ) == CL_SUCCESS ) && argsOk;
+			argsOk = ( clSetKernelArg(outputKernel, 2, sizeof(cl_mem), (void*)&clBuffer ) == CL_SUCCESS ) && argsOk;
+			if( vertexBuffer->hasVertexPositions() )
+			{
+				int vertexOffset = vertexBuffer->getVertexOffset();
+				int vertexStride = vertexBuffer->getVertexStride();
+				argsOk = ( clSetKernelArg(outputKernel, 3, sizeof(int), &vertexOffset ) == CL_SUCCESS ) && argsOk;
+				argsOk = ( clSetKernelArg(outputKernel, 4, sizeof(int), &vertexStride ) == CL_SUCCESS ) && argsOk;
+				argsOk = ( clSetKernelArg(outputKernel, 5, sizeof(cl_mem), (void*)&vertexData.m_clVertexPosition.m_buffer ) == CL_SUCCESS ) && argsOk;
+			}
+			if( writeNormals )
+			{
+				int normalOffset = vertexBuffer->getNormalOffset();
+				int normalStride = vertexBuffer->getNormalStride();
+				argsOk = ( clSetKernelArg(outputKernel, 6, sizeof(int), &normalOffset ) == CL_SUCCESS ) && argsOk;
+				argsOk = ( clSetKernelArg(outputKernel, 7, sizeof(int), &normalStride ) == CL_SUCCESS ) && argsOk;
+				argsOk = ( clSetKernelArg(outputKernel, 8, sizeof(cl_mem), (void*)&vertexData.m_clVertexNormal.m_buffer ) == CL_SUCCESS ) && argsOk;
+			}
+
+			if( !argsOk )
+			{
+				btAssert( 0 &&  "clSetKernelArg(copySoftBodyToVertexBuffer)");
+			}
+			else
+			{
+				// Global size is rounded up to a whole number of work groups.
+				// NOTE(review): it is sized by the solver-wide vertex count, not by numVertices of this cloth;
+				// presumably the kernel ignores work items beyond numVertices - confirm in OutputToVertexArray.cl.
+				size_t	numWorkItems = workGroupSize*((vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
+				ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, outputKernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
+				if( ciErrNum != CL_SUCCESS ) 
+				{
+					btAssert( 0 &&  "enqueueNDRangeKernel(copySoftBodyToVertexBuffer)");
+				}
+			}
+
+			// The buffer was acquired above, so hand it back to GL whether or not the kernel ran.
+			ciErrNum = clEnqueueReleaseGLObjects(m_cqCommandQue, 1, &clBuffer, 0, 0, 0);
+			if( ciErrNum != CL_SUCCESS )
+			{
+				btAssert( 0 &&  "clEnqueueReleaseGLObjects(copySoftBodyToVertexBuffer)");
+			}
+		}
+	}
+
+	// clFinish in here may not be the best thing. It's possible that we should have a waitForFrameComplete function.
+	clFinish(m_cqCommandQue);
+
+} // btSoftBodySolverOutputCLtoGL::copySoftBodyToVertexBuffer
+
+/**
+ * Compile the two output kernels from OutputToVertexArrayCLString.
+ *
+ * Returns true when the kernels are available (either already built or built by this call)
+ * and false when compilation failed. m_shadersInitialized is set only on success.
+ */
+bool btSoftBodySolverOutputCLtoGL::buildShaders()
+{
+	// Must be tested before releaseKernels(): that call clears m_shadersInitialized,
+	// so testing afterwards could never take the early exit.
+	if( m_shadersInitialized )
+		return true;
+
+	// Ensure current kernels are released first
+	releaseKernels();
+
+	// Start from a clean failure count so earlier attempts do not taint this one.
+	clFunctions.clearKernelCompilationFailures();
+
+	outputToVertexArrayWithNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithNormalsKernel" ,"","OpenCLC10/OutputToVertexArray.cl");
+	outputToVertexArrayWithoutNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithoutNormalsKernel" ,"","OpenCLC10/OutputToVertexArray.cl");
+
+	// A zero handle is what the rest of this class treats as "no kernel".
+	// NOTE(review): compileCLKernelFromString presumably returns 0 and/or bumps the failure count on error - confirm in its definition.
+	const bool returnVal =
+		outputToVertexArrayWithNormalsKernel != 0 &&
+		outputToVertexArrayWithoutNormalsKernel != 0 &&
+		clFunctions.getKernelCompilationFailures() == 0;
+
+	if( returnVal )
+		m_shadersInitialized = true;
+
+	return returnVal;
+} // btSoftBodySolverOutputCLtoGL::buildShaders
+
+/**
+ * Release both output kernels (when set), zero their handles and mark the
+ * shaders as not initialized.
+ */
+void btSoftBodySolverOutputCLtoGL::releaseKernels()
+{
+	if( outputToVertexArrayWithNormalsKernel )
+	{
+		clReleaseKernel( outputToVertexArrayWithNormalsKernel );
+		outputToVertexArrayWithNormalsKernel = 0;
+	}
+
+	if( outputToVertexArrayWithoutNormalsKernel )
+	{
+		clReleaseKernel( outputToVertexArrayWithoutNormalsKernel );
+		outputToVertexArrayWithoutNormalsKernel = 0;
+	}
+
+	m_shadersInitialized = false;
+} // btSoftBodySolverOutputCLtoGL::releaseKernels
+
+/**
+ * Make sure the output kernels exist, building them on first use.
+ * Returns the value of m_shadersInitialized after the attempt.
+ */
+bool btSoftBodySolverOutputCLtoGL::checkInitialized()
+{
+	if( m_shadersInitialized )
+	{
+		return true;
+	}
+
+	const bool built = buildShaders();
+	if( built )
+	{
+		m_shadersInitialized = true;
+	}
+
+	return m_shadersInitialized;
+}
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.h
@ -0,0 +1,62 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_SOFT_BODY_SOLVER_OUTPUT_CL_TO_GL_H
+#define BT_SOFT_BODY_SOLVER_OUTPUT_CL_TO_GL_H
+
+#include "btSoftBodySolver_OpenCL.h"
+
+/** 
+ * Class to manage movement of data from a solver to a given target.
+ * This version is the CL to GL interop version.
+ * It keeps two OpenCL kernels (with and without normals) and releases them in its destructor.
+ */
+class btSoftBodySolverOutputCLtoGL : public btSoftBodySolverOutput
+{
+protected:
+	/** Command queue passed to the constructor; stored, not released by this class. */
+	cl_command_queue	m_cqCommandQue;
+	/** OpenCL context passed to the constructor; stored, not released by this class. */
+	cl_context			m_cxMainContext;
+	/** Kernel compilation helper constructed from the same queue and context. */
+	CLFunctions			clFunctions;
+	
+	/** Kernel handles; 0 means "not built". */
+	cl_kernel		outputToVertexArrayWithNormalsKernel;
+	cl_kernel		outputToVertexArrayWithoutNormalsKernel;
+
+	/** True once the kernels have been built; cleared again by releaseKernels(). */
+	bool m_shadersInitialized;
+	
+	/** Build the kernels if needed; returns whether they are available. */
+	virtual bool checkInitialized();	
+	/** Compile the output kernels; returns whether compilation succeeded. */
+	virtual bool buildShaders();
+	/** Release any built kernels and reset m_shadersInitialized. */
+	void releaseKernels();
+public:
+	/** Store the queue and context and start with no kernels built. */
+	btSoftBodySolverOutputCLtoGL(cl_command_queue cqCommandQue, cl_context cxMainContext) :
+		m_cqCommandQue( cqCommandQue ),
+		m_cxMainContext( cxMainContext ),
+		clFunctions(cqCommandQue, cxMainContext),
+		outputToVertexArrayWithNormalsKernel( 0 ),
+		outputToVertexArrayWithoutNormalsKernel( 0 ),
+		m_shadersInitialized( false )
+	{
+	}
+
+	/** Releases the kernels; the queue and context are left alone. */
+	virtual ~btSoftBodySolverOutputCLtoGL()
+	{
+		releaseKernels();
+	}
+
+	/** Output the current computed vertex data of softBody to the given vertex buffer. */
+	virtual void copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer );
+};
+
+
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_OUTPUT_CL_TO_GL_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverTriangleData_OpenCL.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverTriangleData_OpenCL.h
@ -0,0 +1,84 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"
+
+
+#ifndef BT_SOFT_BODY_SOLVER_TRIANGLE_DATA_OPENCL_H
+#define BT_SOFT_BODY_SOLVER_TRIANGLE_DATA_OPENCL_H
+
+
+/**
+ * OpenCL-side storage for soft body triangle data: extends btSoftBodyTriangleData
+ * with device buffers for vertex indices, areas and normals.
+ */
+class btSoftBodyTriangleDataOpenCL : public btSoftBodyTriangleData
+{
+public:
+	/** NOTE(review): presumably true while the data resides on the device - set outside this header, confirm. */
+	bool				m_onGPU;
+	/** Command queue handed to the constructor. */
+	cl_command_queue    m_queue;
+
+	btOpenCLBuffer<btSoftBodyTriangleData::TriangleNodeSet>					m_clVertexIndices;
+	btOpenCLBuffer<float>								m_clArea;
+	btOpenCLBuffer<Vectormath::Aos::Vector3>			m_clNormal;
+
+	/**
+	 * Triangle addressing information for each cloth.
+	 * NOTE(review): presumably allows triangle locations to be computed independently of data batching,
+	 * mirroring the link data class this comment was copied from - confirm in the .cpp.
+	 */
+	btAlignedObjectArray< int >							m_triangleAddresses;
+
+	/**
+	 * Start and length values for computation batches over triangle data.
+	 */
+	struct btSomePair
+	{
+		btSomePair() {}
+		btSomePair(int f,int s)
+			:first(f),second(s)
+		{
+		}
+		int first;
+		int second;
+	};
+	btAlignedObjectArray< btSomePair >		m_batchStartLengths;
+
+public:
+	btSoftBodyTriangleDataOpenCL( cl_command_queue queue, cl_context ctx );
+
+	virtual ~btSoftBodyTriangleDataOpenCL();
+
+	/** Allocate enough space in all triangle-related arrays to fit numTriangles triangles */
+	virtual void createTriangles( int numTriangles );
+	
+	/** Insert the triangle described into the correct data structures assuming space has already been allocated by a call to createTriangles */
+	virtual void setTriangleAt( const btSoftBodyTriangleData::TriangleDescription &triangle, int triangleIndex );
+
+	virtual bool onAccelerator();
+
+	virtual bool moveToAccelerator();
+
+	virtual bool moveFromAccelerator();
+
+	/**
+	 * Generate (and later update) the batching for the entire triangle set.
+	 * This redoes a lot of work because it batches the entire set when each cloth is inserted.
+	 * In theory we could delay it until just before we need the cloth.
+	 * It's a one-off overhead, though, so that is a later optimisation.
+	 */
+	void generateBatches();
+}; // class btSoftBodyTriangleDataOpenCL
+
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_TRIANGLE_DATA_OPENCL_H
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexBuffer_OpenGL.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexBuffer_OpenGL.h
@ -0,0 +1,166 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_OPENGL_H
+#define BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_OPENGL_H 
+
+
+#include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
+#ifdef USE_MINICL
+	#include "MiniCL/cl.h"
+#else //USE_MINICL
+	#ifdef __APPLE__
+		#include <OpenCL/OpenCL.h>
+	#else
+		#include <CL/cl.h>
+		#include <CL/cl_gl.h>
+	#endif //__APPLE__
+#endif//USE_MINICL
+
+
+#ifdef _WIN32//for glut.h
+#include <windows.h>
+#endif
+
+//think different
+#if defined(__APPLE__) && !defined (VMDMESA)
+#include <OpenGL/OpenGL.h>
+#include <OpenGL/gl.h>
+#include <OpenGL/glu.h>
+#include <GLUT/glut.h>
+#else
+
+
+#ifdef _WINDOWS
+#include <windows.h>
+#include <GL/gl.h>
+#include <GL/glu.h>
+#else
+#include <GL/glut.h>
+#endif //_WINDOWS
+#endif //APPLE
+
+
+
+/**
+ * Vertex buffer descriptor backed by an OpenGL VBO that is shared with OpenCL.
+ * The constructors wrap the VBO in a cl_mem via clCreateFromGLBuffer; the destructor
+ * releases that cl_mem. The context, command queue and VBO themselves are only stored.
+ *
+ * NOTE(review): the class has no copy constructor or assignment operator, so a copy would
+ * share m_buffer and release it twice - do not copy instances.
+ */
+class btOpenGLInteropVertexBufferDescriptor : public btVertexBufferDescriptor
+{
+protected:
+	/** OpenCL context */
+	cl_context			m_context;
+
+	/** OpenCL command queue */
+	cl_command_queue	m_commandQueue;
+	
+	/** OpenCL interop buffer; 0 when no buffer could be created */
+	cl_mem m_buffer;
+
+	/** VBO in GL that is the basis of the interop buffer */
+	GLuint m_openGLVBO;
+
+
+public:
+	/**
+	 * cqCommandQue is the command queue that kernels and data movement will be enqueued into.
+	 * context is the OpenCL context this interop buffer will work in.
+	 * openGLVBO is the OpenGL vertex buffer data will be copied into.
+	 * vertexOffset is the offset in floats to the first vertex.
+	 * vertexStride is the stride in floats between vertices.
+	 */
+	btOpenGLInteropVertexBufferDescriptor( cl_command_queue cqCommandQue, cl_context context, GLuint openGLVBO, int vertexOffset, int vertexStride ) :
+		m_context( 0 ),
+		m_commandQueue( 0 ),
+		m_buffer( 0 ),
+		m_openGLVBO( 0 )
+	{
+		// The handles are zeroed above so that the destructor is safe even on the
+		// MiniCL path, where nothing below is compiled in.
+#ifndef USE_MINICL
+		cl_int ciErrNum = CL_SUCCESS;
+		m_context = context;
+		m_commandQueue = cqCommandQue;
+		
+		m_vertexOffset = vertexOffset;
+		m_vertexStride = vertexStride;
+
+		m_openGLVBO = openGLVBO;
+		
+		m_buffer = clCreateFromGLBuffer(m_context, CL_MEM_WRITE_ONLY, openGLVBO, &ciErrNum);
+		if( ciErrNum != CL_SUCCESS )
+		{
+			// Never keep a handle from a failed creation.
+			m_buffer = 0;
+			btAssert( 0 &&  "clCreateFromGLBuffer(btOpenGLInteropVertexBufferDescriptor)");
+		}
+
+		m_hasVertexPositions = true;
+#else
+		(void)cqCommandQue; (void)context; (void)openGLVBO; (void)vertexOffset; (void)vertexStride;
+		btAssert(0);//MiniCL shouldn't get here
+#endif
+	}
+
+	/**
+	 * cqCommandQue is the command queue that kernels and data movement will be enqueued into.
+	 * context is the OpenCL context this interop buffer will work in.
+	 * openGLVBO is the OpenGL vertex buffer data will be copied into.
+	 * vertexOffset is the offset in floats to the first vertex.
+	 * vertexStride is the stride in floats between vertices.
+	 * normalOffset is the offset in floats to the first normal.
+	 * normalStride is the stride in floats between normals.
+	 */
+	btOpenGLInteropVertexBufferDescriptor( cl_command_queue cqCommandQue, cl_context context, GLuint openGLVBO, int vertexOffset, int vertexStride, int normalOffset, int normalStride ) :
+		m_context( 0 ),
+		m_commandQueue( 0 ),
+		m_buffer( 0 ),
+		m_openGLVBO( 0 )
+	{
+		// The handles are zeroed above so that the destructor is safe even on the
+		// MiniCL path, where nothing below is compiled in.
+#ifndef USE_MINICL
+		cl_int ciErrNum = CL_SUCCESS;
+		m_context = context;
+		m_commandQueue = cqCommandQue;
+		
+		m_openGLVBO = openGLVBO;
+		
+		m_buffer = clCreateFromGLBuffer(m_context, CL_MEM_WRITE_ONLY, openGLVBO, &ciErrNum);
+		if( ciErrNum != CL_SUCCESS )
+		{
+			// Never keep a handle from a failed creation.
+			m_buffer = 0;
+			btAssert( 0 &&  "clCreateFromGLBuffer(btOpenGLInteropVertexBufferDescriptor)");
+		}
+
+		m_vertexOffset = vertexOffset;
+		m_vertexStride = vertexStride;
+		m_hasVertexPositions = true;
+
+		m_normalOffset = normalOffset;
+		m_normalStride = normalStride;
+		m_hasNormals = true;
+#else
+		(void)cqCommandQue; (void)context; (void)openGLVBO; (void)vertexOffset; (void)vertexStride; (void)normalOffset; (void)normalStride;
+		btAssert(0);
+#endif //USE_MINICL
+		
+	}
+
+	/** Release the interop buffer if one was created. */
+	virtual ~btOpenGLInteropVertexBufferDescriptor()
+	{
+		if( m_buffer )
+		{
+			clReleaseMemObject( m_buffer );
+			m_buffer = 0;
+		}
+	}
+
+	/**
+	 * Return the type of the vertex buffer descriptor.
+	 */
+	virtual BufferTypes getBufferType() const
+	{
+		return OPENGL_BUFFER;
+	}
+
+	/** Return the OpenCL context given to the constructor (0 if construction took the MiniCL path). */
+	virtual cl_context getContext() const
+	{
+		return m_context;
+	}
+
+	/** Return the OpenCL interop buffer, or 0 when none could be created. */
+	virtual cl_mem getBuffer() const
+	{
+		return m_buffer;
+	}	
+};
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_OPENGL_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h
@ -0,0 +1,52 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"
+
+#ifndef BT_SOFT_BODY_SOLVER_VERTEX_DATA_OPENCL_H
+#define BT_SOFT_BODY_SOLVER_VERTEX_DATA_OPENCL_H
+
+
+/**
+ * OpenCL-side storage for soft body vertex data: extends btSoftBodyVertexData
+ * with one device buffer per vertex attribute.
+ */
+class btSoftBodyVertexDataOpenCL : public btSoftBodyVertexData
+{
+protected:
+	/** NOTE(review): presumably true while the data resides on the device - set outside this header, confirm. */
+	bool		m_onGPU;
+	/** Command queue handed to the constructor. */
+	cl_command_queue	m_queue;
+
+public:
+	// Device buffers, one per vertex attribute. They are public; the solver output code reads m_buffer from them directly.
+	btOpenCLBuffer<int>									m_clClothIdentifier;
+	btOpenCLBuffer<Vectormath::Aos::Point3>				m_clVertexPosition;
+	btOpenCLBuffer<Vectormath::Aos::Point3>				m_clVertexPreviousPosition;
+	btOpenCLBuffer<Vectormath::Aos::Vector3>				m_clVertexVelocity;
+	btOpenCLBuffer<Vectormath::Aos::Vector3>				m_clVertexForceAccumulator;
+	btOpenCLBuffer<Vectormath::Aos::Vector3>				m_clVertexNormal;
+	btOpenCLBuffer<float>									m_clVertexInverseMass;
+	btOpenCLBuffer<float>									m_clVertexArea;
+	btOpenCLBuffer<int>									m_clVertexTriangleCount;
+public:
+	btSoftBodyVertexDataOpenCL( cl_command_queue queue,  cl_context ctx);
+
+	virtual ~btSoftBodyVertexDataOpenCL();
+
+	virtual bool onAccelerator();
+
+	virtual bool moveToAccelerator();
+
+	// NOTE(review): bCopy / bCopyMinimum presumably choose how much data is read back from the device;
+	// their exact meaning is defined in the .cpp - confirm there.
+	virtual bool moveFromAccelerator(bool bCopy = false, bool bCopyMinimum = true);
+};
+
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_VERTEX_DATA_OPENCL_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
@ -0,0 +1,527 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_SOFT_BODY_SOLVER_OPENCL_H
+#define BT_SOFT_BODY_SOLVER_OPENCL_H
+
+#include "stddef.h" //for size_t
+#include "vectormath/vmInclude.h"
+
+#include "BulletSoftBody/btSoftBodySolvers.h"
+#include "BulletSoftBody/btSoftBody.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"
+#include "btSoftBodySolverLinkData_OpenCL.h"
+#include "btSoftBodySolverVertexData_OpenCL.h"
+#include "btSoftBodySolverTriangleData_OpenCL.h"
+
+/**
+ * Helper used to compile OpenCL kernels for the soft body solvers.
+ * Stores the command queue and context it is given (it does not release them)
+ * and keeps a counter of kernel compilation failures.
+ */
+class CLFunctions
+{
+protected:
+	cl_command_queue	m_cqCommandQue;
+	cl_context			m_cxMainContext;
+
+	/** Failure counter read by getKernelCompilationFailures() and reset by clearKernelCompilationFailures(). */
+	int	m_kernelCompilationFailures;
+
+
+public:
+	/** Store the queue and context and start with a failure count of zero. */
+	CLFunctions(cl_command_queue cqCommandQue, cl_context cxMainContext) :
+		m_cqCommandQue( cqCommandQue ),
+		m_cxMainContext( cxMainContext ),
+		m_kernelCompilationFailures(0)
+	{
+	}
+
+	/**
+	 * Virtual destructor: the class has a virtual member and is handled through
+	 * CLFunctions pointers, so derived objects must be destructible through the base type.
+	 */
+	virtual ~CLFunctions()
+	{
+	}
+
+	/** Return the current value of the compilation failure counter. */
+	int getKernelCompilationFailures() const
+	{
+		return m_kernelCompilationFailures;
+	}
+
+	/**
+	 * Compile a compute shader kernel from a string and return the appropriate cl_kernel object.
+	 * NOTE(review): additionalMacros and srcFileNameForCaching are interpreted by the out-of-line definition - confirm there.
+	 */	
+	virtual cl_kernel compileCLKernelFromString( const char* kernelSource, const char* kernelName, const char* additionalMacros, const char* srcFileNameForCaching);
+
+	/** Reset the compilation failure counter to zero. */
+	void	clearKernelCompilationFailures()
+	{
+		m_kernelCompilationFailures=0;
+	}
+};
+
+/**
+ * Entry in the collision shape array.
+ * Specifies the shape type, the transform matrix and the necessary details of the collisionShape.
+ */
+struct CollisionShapeDescription
+{
+	Vectormath::Aos::Transform3 shapeTransform;
+	Vectormath::Aos::Vector3 linearVelocity;
+	Vectormath::Aos::Vector3 angularVelocity;
+
+	int softBodyIdentifier;
+	int collisionShapeType;
+
+	// Both needed for capsule
+	float radius;
+	float halfHeight;
+	int upAxis;
+	
+	float margin;
+	float friction;
+
+	/**
+	 * Zero every scalar field so that a freshly constructed description never carries
+	 * indeterminate values. The Vectormath members are left to their own default
+	 * constructors, as before.
+	 */
+	CollisionShapeDescription() :
+		softBodyIdentifier( 0 ),
+		collisionShapeType( 0 ),
+		radius( 0.f ),
+		halfHeight( 0.f ),
+		upAxis( 0 ),
+		margin( 0.f ),
+		friction( 0.f )
+	{
+	}
+};
+
+/**
+ * SoftBody class to maintain information about a soft body instance
+ * within a solver.
+ * This data addresses the main solver arrays.
+ * The class only records counts and offsets plus a pointer to the soft body;
+ * it never deletes the soft body. All getters are const.
+ */
+class btOpenCLAcceleratedSoftBodyInterface
+{
+protected:
+	/** Current number of vertices that are part of this cloth */
+	int m_numVertices;
+	/** Maximum number of vertices allocated to be part of this cloth */
+	int m_maxVertices;
+	/** Current number of triangles that are part of this cloth */
+	int m_numTriangles;
+	/** Maximum number of triangles allocated to be part of this cloth */
+	int m_maxTriangles;
+	/** Index of first vertex in the world allocated to this cloth */
+	int m_firstVertex;
+	/** Index of first triangle in the world allocated to this cloth */
+	int m_firstTriangle;
+	/** Index of first link in the world allocated to this cloth */
+	int m_firstLink;
+	/** Maximum number of links allocated to this cloth */
+	int m_maxLinks;
+	/** Current number of links allocated to this cloth */
+	int m_numLinks;
+
+	/** The actual soft body this data represents */
+	btSoftBody *m_softBody;
+
+
+public:
+	/** Wrap softBody with every count and offset set to zero. */
+	btOpenCLAcceleratedSoftBodyInterface( btSoftBody *softBody ) :
+		m_numVertices( 0 ),
+		m_maxVertices( 0 ),
+		m_numTriangles( 0 ),
+		m_maxTriangles( 0 ),
+		m_firstVertex( 0 ),
+		m_firstTriangle( 0 ),
+		m_firstLink( 0 ),
+		m_maxLinks( 0 ),
+		m_numLinks( 0 ),
+		m_softBody( softBody )
+	{
+	}
+
+	int getNumVertices() const
+	{
+		return m_numVertices;
+	}
+
+	int getNumTriangles() const
+	{
+		return m_numTriangles;
+	}
+
+	int getMaxVertices() const
+	{
+		return m_maxVertices;
+	}
+
+	int getMaxTriangles() const
+	{
+		return m_maxTriangles;
+	}
+
+	int getFirstVertex() const
+	{
+		return m_firstVertex;
+	}
+
+	int getFirstTriangle() const
+	{
+		return m_firstTriangle;
+	}
+	
+	/**
+	 * Update the bounds in the btSoftBody object
+	 */
+	void updateBounds( const btVector3 &lowerBound, const btVector3 &upperBound );
+
+	// TODO: All of these set functions will have to do checks and
+	// update the world because restructuring of the arrays will be necessary
+	// Reasonable use of "friend"?
+	// As written the setters only store the value; no validation is done.
+	void setNumVertices( int numVertices )
+	{
+		m_numVertices = numVertices;
+	}	
+
+	void setNumTriangles( int numTriangles )
+	{
+		m_numTriangles = numTriangles;
+	}
+
+	void setMaxVertices( int maxVertices )
+	{
+		m_maxVertices = maxVertices;
+	}
+
+	void setMaxTriangles( int maxTriangles )
+	{
+		m_maxTriangles = maxTriangles;
+	}
+
+	void setFirstVertex( int firstVertex )
+	{
+		m_firstVertex = firstVertex;
+	}
+
+	void setFirstTriangle( int firstTriangle )
+	{
+		m_firstTriangle = firstTriangle;
+	}
+
+	void setMaxLinks( int maxLinks )
+	{
+		m_maxLinks = maxLinks;
+	}
+
+	void setNumLinks( int numLinks )
+	{
+		m_numLinks = numLinks;
+	}
+
+	void setFirstLink( int firstLink )
+	{
+		m_firstLink = firstLink;
+	}
+
+	int getMaxLinks() const
+	{
+		return m_maxLinks;
+	}
+
+	int getNumLinks() const
+	{
+		return m_numLinks;
+	}
+
+	int getFirstLink() const
+	{
+		return m_firstLink;
+	}
+
+	/** Return the wrapped soft body pointer exactly as given to the constructor. */
+	btSoftBody* getSoftBody() const
+	{
+		return m_softBody;
+	}
+
+};
+
+
+
+/**
+ * Soft body solver that keeps its link, vertex and triangle data in OpenCL buffers
+ * and advances the simulation through the OpenCL kernels listed below.
+ * Reports its type as CL_SOLVER.
+ */
+class btOpenCLSoftBodySolver : public btSoftBodySolver
+{
+public:
+	
+
+	/** Three unsigned ints plus one zeroed padding word (four unsigned ints in total). */
+	struct UIntVector3
+	{
+		UIntVector3()
+		{
+			x = 0;
+			y = 0;
+			z = 0;
+			_padding = 0;
+		}
+		
+		UIntVector3( unsigned int x_, unsigned int y_, unsigned int z_ )
+		{
+			x = x_;
+			y = y_;
+			z = z_;
+			_padding = 0;
+		}
+			
+		unsigned int x;
+		unsigned int y;
+		unsigned int z;
+		unsigned int _padding;
+	};
+
+	/**
+	 * Range of collision objects belonging to one cloth.
+	 * NOTE(review): presumably a half-open range [firstObject, endObject) - confirm in the .cpp and kernels.
+	 */
+	struct CollisionObjectIndices
+	{
+		CollisionObjectIndices( int f, int e )
+		{
+			firstObject = f;
+			endObject = e;
+		}
+
+		int firstObject;
+		int endObject;
+	};
+
+	// Solver-wide data sets; public, and read directly by the solver output classes.
+	btSoftBodyLinkDataOpenCL m_linkData;
+	btSoftBodyVertexDataOpenCL m_vertexData;
+	btSoftBodyTriangleDataOpenCL m_triangleData;
+
+protected:
+
+	/** Built-in kernel compilation helper. */
+	CLFunctions m_defaultCLFunctions;
+	/** Helper currently in use; setCLFunctions points it at a user object or back at m_defaultCLFunctions. */
+	CLFunctions* m_currentCLFunctions;
+
+	/** Variable to define whether we need to update solver constants on the next iteration */
+	bool m_updateSolverConstants;
+
+	bool m_shadersInitialized;
+
+	/** 
+	 * Cloths owned by this solver.
+	 * Only our cloths are in this array.
+	 */
+	btAlignedObjectArray< btOpenCLAcceleratedSoftBodyInterface * > m_softBodySet;
+
+	/** Acceleration value to be applied to all non-static vertices in the solver. 
+	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
+	 */
+	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_perClothAcceleration;
+	btOpenCLBuffer<Vectormath::Aos::Vector3>			m_clPerClothAcceleration;
+
+	/** Wind velocity to be applied normal to all non-static vertices in the solver. 
+	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
+	 */
+	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_perClothWindVelocity;
+	btOpenCLBuffer<Vectormath::Aos::Vector3>			m_clPerClothWindVelocity;
+
+	/** Velocity damping factor */
+	btAlignedObjectArray< float >						m_perClothDampingFactor;
+	btOpenCLBuffer<float>								m_clPerClothDampingFactor;
+
+	/** Velocity correction coefficient */
+	btAlignedObjectArray< float >						m_perClothVelocityCorrectionCoefficient;
+	btOpenCLBuffer<float>								m_clPerClothVelocityCorrectionCoefficient;
+
+	/** Lift parameter for wind effect on cloth. */
+	btAlignedObjectArray< float >						m_perClothLiftFactor;
+	btOpenCLBuffer<float>								m_clPerClothLiftFactor;
+	
+	/** Drag parameter for wind effect on cloth. */
+	btAlignedObjectArray< float >						m_perClothDragFactor;
+	btOpenCLBuffer<float>								m_clPerClothDragFactor;
+
+	/** Density of the medium in which each cloth sits */
+	btAlignedObjectArray< float >						m_perClothMediumDensity;
+	btOpenCLBuffer<float>								m_clPerClothMediumDensity;
+
+	/** 
+	 * Collision shape details: pair of index of first collision shape for the cloth and number of collision objects.
+	 */
+	btAlignedObjectArray< CollisionObjectIndices >		m_perClothCollisionObjects;
+	btOpenCLBuffer<CollisionObjectIndices>				m_clPerClothCollisionObjects;
+
+	/** 
+	 * Collision shapes being passed across to the cloths in this solver.
+	 */
+	btAlignedObjectArray< CollisionShapeDescription >	m_collisionObjectDetails;
+	btOpenCLBuffer< CollisionShapeDescription >			m_clCollisionObjectDetails;
+
+
+	
+	/** 
+	 * Friction coefficient for each cloth
+	 */
+	btAlignedObjectArray< float >	m_perClothFriction;
+	btOpenCLBuffer< float >			m_clPerClothFriction;
+
+	// anchor node info
+	struct AnchorNodeInfoCL
+	{
+		int clVertexIndex;
+		btSoftBody::Node* pNode;
+	};
+
+	btAlignedObjectArray<AnchorNodeInfoCL> m_anchorNodeInfoArray;
+	btAlignedObjectArray<Vectormath::Aos::Point3> m_anchorPosition;
+	btOpenCLBuffer<Vectormath::Aos::Point3>		  m_clAnchorPosition;
+	btAlignedObjectArray<int> m_anchorIndex;
+	btOpenCLBuffer<int>		  m_clAnchorIndex;
+
+	// NOTE(review): presumably set from the constructor's bUpdateAchchoredNodePos argument - confirm in the .cpp.
+	bool m_bUpdateAnchoredNodePos;
+
+	// Kernel handles, one per compute kernel used by the solver.
+	cl_kernel		m_prepareLinksKernel;
+	cl_kernel		m_solvePositionsFromLinksKernel;
+	cl_kernel		m_updateConstantsKernel;
+	cl_kernel		m_integrateKernel;
+	cl_kernel		m_addVelocityKernel;
+	cl_kernel		m_updatePositionsFromVelocitiesKernel;
+	cl_kernel		m_updateVelocitiesFromPositionsWithoutVelocitiesKernel;
+	cl_kernel		m_updateVelocitiesFromPositionsWithVelocitiesKernel;
+	cl_kernel		m_vSolveLinksKernel;
+	cl_kernel		m_solveCollisionsAndUpdateVelocitiesKernel;
+	cl_kernel		m_resetNormalsAndAreasKernel;
+	cl_kernel		m_normalizeNormalsAndAreasKernel;
+	cl_kernel		m_updateSoftBodiesKernel;
+
+	cl_kernel		m_outputToVertexArrayKernel;
+	cl_kernel		m_applyForcesKernel;
+	cl_kernel       m_updateFixedVertexPositionsKernel;	
+
+	cl_command_queue	m_cqCommandQue;
+	cl_context			m_cxMainContext;
+	
+	/** Work group size returned by getDefaultWorkGroupSize and changed by setDefaultWorkgroupSize. */
+	size_t				m_defaultWorkGroupSize;
+
+
+	virtual bool buildShaders();
+
+	void resetNormalsAndAreas( int numVertices );
+
+	void normalizeNormalsAndAreas( int numVertices );
+
+	void executeUpdateSoftBodies( int firstTriangle, int numTriangles );
+
+	void prepareCollisionConstraints();
+	
+	Vectormath::Aos::Vector3 ProjectOnAxis( const Vectormath::Aos::Vector3 &v, const Vectormath::Aos::Vector3 &a );
+
+	void ApplyClampedForce( float solverdt, const Vectormath::Aos::Vector3 &force, const Vectormath::Aos::Vector3 &vertexVelocity, float inverseMass, Vectormath::Aos::Vector3 &vertexForce );
+	
+
+	int findSoftBodyIndex( const btSoftBody* const softBody );
+
+	virtual void applyForces( float solverdt );
+
+	void updateFixedVertexPositions();
+
+	/**
+	 * Integrate motion on the solver.
+	 */
+	virtual void integrate( float solverdt );
+
+	virtual void updateConstants( float timeStep );
+
+	float computeTriangleArea( 
+		const Vectormath::Aos::Point3 &vertex0,
+		const Vectormath::Aos::Point3 &vertex1,
+		const Vectormath::Aos::Point3 &vertex2 );
+
+
+	//////////////////////////////////////
+	// Kernel dispatches
+	void prepareLinks();
+
+	void solveLinksForVelocity( int startLink, int numLinks, float kst );
+
+	void updatePositionsFromVelocities( float solverdt );
+
+	virtual void solveLinksForPosition( int startLink, int numLinks, float kst, float ti );
+	
+	void updateVelocitiesFromPositionsWithVelocities( float isolverdt );
+
+	void updateVelocitiesFromPositionsWithoutVelocities( float isolverdt );
+	virtual void solveCollisionsAndUpdateVelocities( float isolverdt );
+
+	// End kernel dispatches
+	/////////////////////////////////////
+	
+	void updateBounds();
+
+	void releaseKernels();
+
+public:
+	// NOTE(review): "bUpdateAchchoredNodePos" looks like a typo for "bUpdateAnchoredNodePos"; it is only a parameter name.
+	btOpenCLSoftBodySolver(cl_command_queue queue,cl_context	ctx, bool bUpdateAchchoredNodePos = false);
+
+	virtual ~btOpenCLSoftBodySolver();
+
+
+	
+	/** NOTE(review): presumably returns the interface entry of m_softBodySet wrapping softBody; the not-found result is defined in the .cpp - confirm. */
+	btOpenCLAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );
+
+	virtual btSoftBodyLinkData &getLinkData();
+
+	virtual btSoftBodyVertexData &getVertexData();
+
+	virtual btSoftBodyTriangleData &getTriangleData();
+
+	/** Identifies this solver as the plain OpenCL solver. */
+	virtual SolverTypes getSolverType() const
+	{
+		return CL_SOLVER;
+	}
+
+
+	virtual bool checkInitialized();
+
+	virtual void updateSoftBodies( );
+
+	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false);
+
+	virtual void copyBackToSoftBodies(bool bMove = true);
+
+	virtual void solveConstraints( float solverdt );
+
+	virtual void predictMotion( float solverdt );
+
+	virtual void processCollision( btSoftBody *, const btCollisionObjectWrapper* );
+
+	virtual void processCollision( btSoftBody*, btSoftBody* );
+
+	/** Store the work group size used by default; no validation is done here. */
+	virtual void	setDefaultWorkgroupSize(size_t workGroupSize)
+	{
+		m_defaultWorkGroupSize = workGroupSize;
+	}
+	/** Return the stored default work group size. */
+	virtual size_t	getDefaultWorkGroupSize() const
+	{
+		return m_defaultWorkGroupSize;
+	}
+
+	/**
+	 * Select the kernel compilation helper. Passing 0 switches back to the built-in
+	 * m_defaultCLFunctions. The pointer is stored as-is, so a non-null funcs must
+	 * stay valid for as long as the solver uses it.
+	 */
+	void	setCLFunctions(CLFunctions* funcs)
+	{
+		if (funcs)
+			m_currentCLFunctions = funcs;
+		else
+			m_currentCLFunctions  = &m_defaultCLFunctions;
+	}
+
+}; // btOpenCLSoftBodySolver
+
+
+/**
+ * Solver output class that moves data from a solver to a given target.
+ * This is the CL to CPU variant; it adds no data members of its own and its
+ * copy routine is defined out of line.
+ */
+class btSoftBodySolverOutputCLtoCPU : public btSoftBodySolverOutput
+{
+public:
+	/** Nothing to set up. */
+	btSoftBodySolverOutputCLtoCPU() {}
+
+	/** Output the current computed vertex data of softBody to the given vertex buffer. */
+	virtual void copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer );
+};
+
+
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_OPENCL_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.h
@ -0,0 +1,81 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_SOFT_BODY_SOLVER_OPENCL_SIMDAWARE_H
+#define BT_SOFT_BODY_SOLVER_OPENCL_SIMDAWARE_H
+
+#include "stddef.h" //for size_t
+#include "vectormath/vmInclude.h"
+
+#include "btSoftBodySolver_OpenCL.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"
+#include "btSoftBodySolverLinkData_OpenCLSIMDAware.h"
+#include "btSoftBodySolverVertexData_OpenCL.h"
+#include "btSoftBodySolverTriangleData_OpenCL.h"
+
+
+
+
+
+/**
+ * Variant of btOpenCLSoftBodySolver that keeps its link data in a
+ * btSoftBodyLinkDataOpenCLSIMDAware container and identifies itself as
+ * CL_SIMD_SOLVER through getSolverType().
+ * Only declarations live in this header; the behaviour of each method is
+ * defined in the corresponding .cpp file.
+ */
+class btOpenCLSoftBodySolverSIMDAware : public btOpenCLSoftBodySolver
+{
+protected:
+	
+
+	// Link storage used by this solver; exposed to callers through getLinkData().
+	// NOTE(review): presumably returned by getLinkData() - confirm in the .cpp.
+	btSoftBodyLinkDataOpenCLSIMDAware m_linkData;
+
+
+
+
+	// Overrides the base-class virtual; returns a bool status.
+	virtual bool buildShaders();
+
+
+	void updateConstants( float timeStep );
+
+	// Takes the three corner points of a triangle and returns a float.
+	float computeTriangleArea( 
+		const Vectormath::Aos::Point3 &vertex0,
+		const Vectormath::Aos::Point3 &vertex1,
+		const Vectormath::Aos::Point3 &vertex2 );
+
+
+	//////////////////////////////////////
+	// Kernel dispatches
+	void solveLinksForPosition( int startLink, int numLinks, float kst, float ti );
+	
+	void solveCollisionsAndUpdateVelocities( float isolverdt );
+	// End kernel dispatches
+	/////////////////////////////////////
+
+public:
+	// queue / ctx: OpenCL command queue and context handed to the solver.
+	// bUpdateAchchoredNodePos: optional flag, off by default
+	// (NOTE(review): name looks like a misspelling of "anchored" - confirm against the base class).
+	btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue,cl_context	ctx, bool bUpdateAchchoredNodePos = false);
+
+	virtual ~btOpenCLSoftBodySolverSIMDAware();
+
+	// Identifies this solver as the SIMD-aware OpenCL implementation.
+	virtual SolverTypes getSolverType() const
+	{
+		return CL_SIMD_SOLVER;
+	}
+
+
+	virtual btSoftBodyLinkData &getLinkData();
+
+
+	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false);
+
+	virtual void solveConstraints( float solverdt );
+
+}; // btOpenCLSoftBodySolverSIMDAware
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_OPENCL_SIMDAWARE_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h
@ -0,0 +1,748 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_SOFT_BODY_SOLVER_DATA_H
+#define BT_SOFT_BODY_SOLVER_DATA_H
+
+#include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
+#include "vectormath/vmInclude.h"
+
+
+class btSoftBodyLinkData
+{
+public:
+	/**
+	 * Class representing a link as a pair of indices into the vertex array.
+	 */
+	class LinkNodePair
+	{
+	public:
+		int vertex0;
+		int vertex1;
+
+		LinkNodePair()
+		{
+			vertex0 = 0;
+			vertex1 = 0;
+		}
+
+		LinkNodePair( int v0, int v1 )
+		{
+			vertex0 = v0;
+			vertex1 = v1;
+		}
+	};
+
+	/**
+	 * Class describing a link for input into the system.
+	 */
+	class LinkDescription
+	{
+	protected:
+		int m_vertex0;
+		int m_vertex1;
+		float m_linkLinearStiffness;
+		float m_linkStrength;
+
+	public:
+
+		LinkDescription()
+		{
+			m_vertex0 = 0;
+			m_vertex1 = 0;
+			m_linkLinearStiffness = 1.0;
+			m_linkStrength = 1.0;
+		}
+
+		LinkDescription( int newVertex0, int newVertex1, float linkLinearStiffness )
+		{
+			m_vertex0 = newVertex0;
+			m_vertex1 = newVertex1;
+			m_linkLinearStiffness = linkLinearStiffness;
+			m_linkStrength = 1.0;
+		}
+
+		LinkNodePair getVertexPair() const
+		{
+			LinkNodePair nodes;
+			nodes.vertex0 = m_vertex0;
+			nodes.vertex1 = m_vertex1;
+			return nodes;
+		}
+
+		void setVertex0( int vertex )
+		{
+			m_vertex0 = vertex;
+		}
+
+		void setVertex1( int vertex )
+		{
+			m_vertex1 = vertex;
+		}
+
+		void setLinkLinearStiffness( float linearStiffness )
+		{
+			m_linkLinearStiffness = linearStiffness;
+		}
+
+		void setLinkStrength( float strength )
+		{
+			m_linkStrength = strength;
+		}
+
+		int getVertex0() const
+		{
+			return m_vertex0;
+		}
+
+		int getVertex1() const
+		{
+			return m_vertex1;
+		}
+
+		float getLinkStrength() const
+		{
+			return m_linkStrength;
+		}
+
+		float getLinkLinearStiffness() const
+		{
+			return m_linkLinearStiffness;
+		}
+	};
+
+
+protected:
+	// NOTE:
+	// Vertex reference data is stored relative to global array, not relative to individual cloth.
+	// Values must be correct if being passed into single-cloth VBOs or when migrating from one solver
+	// to another.
+
+	btAlignedObjectArray< LinkNodePair > m_links; // Vertex pair for the link
+	btAlignedObjectArray< float >								m_linkStrength; // Strength of each link
+	// (inverseMassA + inverseMassB)/ linear stiffness coefficient
+	btAlignedObjectArray< float >								m_linksMassLSC; 
+	btAlignedObjectArray< float >								m_linksRestLengthSquared; 
+	// Current vector length of link
+	btAlignedObjectArray< Vectormath::Aos::Vector3 >			m_linksCLength;
+	// 1/(current length * current length * massLSC)
+	btAlignedObjectArray< float >								m_linksLengthRatio; 
+	btAlignedObjectArray< float >								m_linksRestLength;
+	btAlignedObjectArray< float >								m_linksMaterialLinearStiffnessCoefficient;
+
+public:
+	btSoftBodyLinkData()
+	{
+	}
+
+	virtual ~btSoftBodyLinkData()
+	{
+	}
+
+	/**
+	 * Empty every per-link array so the container reports zero links.
+	 * All arrays that createLinks() grows together are resized to 0 here,
+	 * including m_linksCLength, so the parallel arrays stay the same length.
+	 */
+	virtual void clear()
+	{
+		m_links.resize(0);
+		m_linkStrength.resize(0);
+		m_linksMassLSC.resize(0);
+		m_linksRestLengthSquared.resize(0);
+		m_linksCLength.resize(0);
+		m_linksLengthRatio.resize(0);
+		m_linksRestLength.resize(0);
+		m_linksMaterialLinearStiffnessCoefficient.resize(0);
+	}
+
+	int getNumLinks()
+	{
+		return m_links.size();
+	}
+
+	/** Allocate enough space in all link-related arrays to fit numLinks links */
+	virtual void createLinks( int numLinks )
+	{
+		int previousSize = m_links.size();
+		int newSize = previousSize + numLinks;
+
+		// Resize all the arrays that store link data
+		m_links.resize( newSize );
+		m_linkStrength.resize( newSize );
+		m_linksMassLSC.resize( newSize );
+		m_linksRestLengthSquared.resize( newSize );
+		m_linksCLength.resize( newSize );
+		m_linksLengthRatio.resize( newSize );
+		m_linksRestLength.resize( newSize );
+		m_linksMaterialLinearStiffnessCoefficient.resize( newSize );
+	}
+	
+	/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
+	virtual void setLinkAt( const LinkDescription &link, int linkIndex )
+	{
+		m_links[linkIndex] = link.getVertexPair();
+		m_linkStrength[linkIndex] = link.getLinkStrength();
+		m_linksMassLSC[linkIndex] = 0.f;
+		m_linksRestLengthSquared[linkIndex] = 0.f;
+		m_linksCLength[linkIndex] = Vectormath::Aos::Vector3(0.f, 0.f, 0.f);
+		m_linksLengthRatio[linkIndex] = 0.f;
+		m_linksRestLength[linkIndex] = 0.f;
+		m_linksMaterialLinearStiffnessCoefficient[linkIndex] = link.getLinkLinearStiffness();
+	}
+
+
+	/**
+	 * Return true if data is on the accelerator.
+	 * The CPU version of this class will return true here because
+	 * the CPU is the same as the accelerator.
+	 */
+	virtual bool onAccelerator()
+	{
+		return true;
+	}
+	
+	/**
+	 * Move data from host memory to the accelerator.
+	 * The CPU version will always return that it has moved it.
+	 */
+	virtual bool moveToAccelerator()
+	{
+		return true;
+	}
+
+	/**
+	 * Move data to host memory from the accelerator.
+	 * The CPU version will always return that it has moved it.
+	 */
+	virtual bool moveFromAccelerator()
+	{
+		return true;
+	}
+
+
+
+	/**
+	 * Return reference to the vertex index pair for link linkIndex as stored on the host.
+	 */
+	LinkNodePair &getVertexPair( int linkIndex )
+	{
+		return m_links[linkIndex];
+	}
+
+	/** 
+	 * Return reference to strength of link linkIndex as stored on the host.
+	 */
+	float &getStrength( int linkIndex )
+	{
+		return m_linkStrength[linkIndex];
+	}
+
+	/**
+	 * Return a reference to the strength of the link corrected for link sorting.
+	 * This is important if we are using data on an accelerator which has the data sorted in some fashion.
+	 */
+	virtual float &getStrengthCorrected( int linkIndex )
+	{
+		return getStrength( linkIndex );
+	}
+
+	/**
+	 * Return reference to the rest length of link linkIndex as stored on the host.
+	 */
+	float &getRestLength( int linkIndex )
+	{
+		return m_linksRestLength[linkIndex];
+	}
+
+	/**
+	 * Return reference to linear stiffness coefficient for link linkIndex as stored on the host.
+	 */
+	float &getLinearStiffnessCoefficient( int linkIndex )
+	{
+		return m_linksMaterialLinearStiffnessCoefficient[linkIndex];
+	}
+
+	/**
+	 * Return reference to the MassLSC value for link linkIndex as stored on the host.
+	 */
+	float &getMassLSC( int linkIndex )
+	{
+		return m_linksMassLSC[linkIndex];
+	}
+
+	/**
+	 * Return reference to rest length squared for link linkIndex as stored on the host.
+	 */
+	float &getRestLengthSquared( int linkIndex )
+	{
+		return m_linksRestLengthSquared[linkIndex];
+	}
+
+	/**
+	 * Return reference to current length of link linkIndex as stored on the host.
+	 */
+	Vectormath::Aos::Vector3 &getCurrentLength( int linkIndex )
+	{
+		return m_linksCLength[linkIndex];
+	}
+
+	 /**
+	  * Return a reference to the link length ratio for link linkIndex as stored on the host.
+	  */
+	 float &getLinkLengthRatio( int linkIndex )
+	 {
+		 return m_linksLengthRatio[linkIndex];
+	 }
+};
+
+
+
+/**
+ * Wrapper for vertex data information.
+ * By wrapping it like this we stand a good chance of being able to optimise for storage format easily.
+ * It should also help us make sure all the data structures remain consistent.
+ */
+class btSoftBodyVertexData
+{
+public:
+	/**
+	 * Class describing a vertex for input into the system.
+	 */
+	class VertexDescription
+	{
+	private:
+		Vectormath::Aos::Point3 m_position;
+		/** Inverse mass. If this is 0f then the mass was 0 because that simplifies calculations. */
+		float m_inverseMass;
+
+	public:
+		VertexDescription()
+		{	
+			m_position = Vectormath::Aos::Point3( 0.f, 0.f, 0.f );
+			m_inverseMass = 0.f;
+		}
+
+		VertexDescription( const Vectormath::Aos::Point3 &position, float mass )
+		{
+			m_position = position;
+			if( mass > 0.f )
+				m_inverseMass = 1.0f/mass;
+			else
+				m_inverseMass = 0.f;
+		}
+
+		void setPosition( const Vectormath::Aos::Point3 &position )
+		{
+			m_position = position;
+		}
+
+		void setInverseMass( float inverseMass )
+		{
+			m_inverseMass = inverseMass;
+		}
+
+		void setMass( float mass )
+		{
+			if( mass > 0.f )
+				m_inverseMass = 1.0f/mass;
+			else
+				m_inverseMass = 0.f;
+		}
+
+		Vectormath::Aos::Point3 getPosition() const
+		{
+			return m_position;
+		}
+
+		float getInverseMass() const
+		{
+			return m_inverseMass;
+		}
+
+		float getMass() const
+		{
+			if( m_inverseMass == 0.f )
+				return 0.f;
+			else
+				return 1.0f/m_inverseMass;
+		}
+	};
+protected:
+
+	// identifier for the individual cloth
+	// For the CPU we don't really need this as we can grab the cloths and iterate over only their vertices
+	// For a parallel accelerator knowing on a per-vertex basis which cloth we're part of will help for obtaining
+	// per-cloth data
+	// For sorting etc it might also be helpful to be able to use in-array data such as this.
+	btAlignedObjectArray< int >							m_clothIdentifier;
+	btAlignedObjectArray< Vectormath::Aos::Point3 >		m_vertexPosition;			// vertex positions
+	btAlignedObjectArray< Vectormath::Aos::Point3 >		m_vertexPreviousPosition;	// previous vertex positions
+	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_vertexVelocity;			// Velocity
+	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_vertexForceAccumulator;	// Force accumulator
+	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_vertexNormal;				// Normals
+	btAlignedObjectArray< float >						m_vertexInverseMass;		// Inverse mass
+	btAlignedObjectArray< float >						m_vertexArea;				// Area controlled by the vertex
+	btAlignedObjectArray< int >							m_vertexTriangleCount;		// Number of triangles touching this vertex
+
+public:
+	btSoftBodyVertexData()
+	{
+	}
+
+	virtual ~btSoftBodyVertexData()
+	{
+	}
+
+	virtual void clear()
+	{
+		m_clothIdentifier.resize(0);
+		m_vertexPosition.resize(0);
+		m_vertexPreviousPosition.resize(0);
+		m_vertexVelocity.resize(0);
+		m_vertexForceAccumulator.resize(0);
+		m_vertexNormal.resize(0);
+		m_vertexInverseMass.resize(0);
+		m_vertexArea.resize(0);
+		m_vertexTriangleCount.resize(0);
+	}
+
+	int getNumVertices()
+	{
+		return m_vertexPosition.size();
+	}
+
+	int getClothIdentifier( int vertexIndex )
+	{
+		return m_clothIdentifier[vertexIndex];
+	}
+
+	void setVertexAt( const VertexDescription &vertex, int vertexIndex )
+	{
+		m_vertexPosition[vertexIndex] = vertex.getPosition();
+		m_vertexPreviousPosition[vertexIndex] = vertex.getPosition();
+		m_vertexVelocity[vertexIndex] = Vectormath::Aos::Vector3(0.f, 0.f, 0.f);
+		m_vertexForceAccumulator[vertexIndex] = Vectormath::Aos::Vector3(0.f, 0.f, 0.f);
+		m_vertexNormal[vertexIndex] = Vectormath::Aos::Vector3(0.f, 0.f, 0.f);
+		m_vertexInverseMass[vertexIndex] = vertex.getInverseMass();
+		m_vertexArea[vertexIndex] = 0.f;
+		m_vertexTriangleCount[vertexIndex] = 0;
+	}
+
+	/** 
+	 * Create numVertices new vertices for cloth clothIdentifier.
+	 * maxVertices allows a buffer zone of extra vertices for alignment or tearing reasons.
+	 * A maxVertices smaller than numVertices (including the default of 0) reserves
+	 * exactly numVertices slots.
+	 * The first numVertices new slots are tagged with clothIdentifier; any extra
+	 * buffer slots are tagged with -1.
+	 */
+	void createVertices( int numVertices, int clothIdentifier, int maxVertices = 0 )
+	{
+		int previousSize = m_vertexPosition.size();
+		// Never reserve fewer slots than were requested: a caller that then fills
+		// numVertices slots through setVertexAt() would index past the end of the arrays.
+		if( maxVertices < numVertices )
+			maxVertices = numVertices;
+		int newSize = previousSize + maxVertices;
+		// Index of the first buffer (unused) slot, if any.
+		int firstBufferIndex = previousSize + numVertices;
+
+		// Resize all the arrays that store vertex data
+		m_clothIdentifier.resize( newSize );
+		m_vertexPosition.resize( newSize );
+		m_vertexPreviousPosition.resize( newSize );
+		m_vertexVelocity.resize( newSize );
+		m_vertexForceAccumulator.resize( newSize );
+		m_vertexNormal.resize( newSize );
+		m_vertexInverseMass.resize( newSize );
+		m_vertexArea.resize( newSize );
+		m_vertexTriangleCount.resize( newSize );
+
+		// Real vertices belong to the cloth, buffer slots belong to no cloth.
+		for( int vertexIndex = previousSize; vertexIndex < firstBufferIndex; ++vertexIndex )
+			m_clothIdentifier[vertexIndex] = clothIdentifier;
+		for( int vertexIndex = firstBufferIndex; vertexIndex < newSize; ++vertexIndex )
+			m_clothIdentifier[vertexIndex] = -1;
+	}
+
+	// Get and set methods in header so they can be inlined
+
+	/**
+	 * Return a reference to the position of vertex vertexIndex as stored on the host.
+	 */
+	Vectormath::Aos::Point3 &getPosition( int vertexIndex )
+	{
+		return m_vertexPosition[vertexIndex];
+	}
+
+	Vectormath::Aos::Point3 getPosition( int vertexIndex ) const
+	{
+		return m_vertexPosition[vertexIndex];
+	}
+
+	/**
+	 * Return a reference to the previous position of vertex vertexIndex as stored on the host.
+	 */
+	Vectormath::Aos::Point3 &getPreviousPosition( int vertexIndex )
+	{
+		return m_vertexPreviousPosition[vertexIndex];
+	}
+
+	/**
+	 * Return a reference to the velocity of vertex vertexIndex as stored on the host.
+	 */
+	Vectormath::Aos::Vector3 &getVelocity( int vertexIndex )
+	{
+		return m_vertexVelocity[vertexIndex];
+	}
+
+	/**
+	 * Return a reference to the force accumulator of vertex vertexIndex as stored on the host.
+	 */
+	Vectormath::Aos::Vector3 &getForceAccumulator( int vertexIndex )
+	{
+		return m_vertexForceAccumulator[vertexIndex];
+	}
+
+	/**
+	 * Return a reference to the normal of vertex vertexIndex as stored on the host.
+	 */
+	Vectormath::Aos::Vector3 &getNormal( int vertexIndex )
+	{
+		return m_vertexNormal[vertexIndex];
+	}
+
+	Vectormath::Aos::Vector3 getNormal( int vertexIndex ) const
+	{
+		return m_vertexNormal[vertexIndex];
+	}
+
+	/**
+	 * Return a reference to the inverse mass of vertex vertexIndex as stored on the host.
+	 */
+	float &getInverseMass( int vertexIndex )
+	{
+		return m_vertexInverseMass[vertexIndex];
+	}
+
+	/**
+	 * Get access to the area controlled by this vertex.
+	 */
+	float &getArea( int vertexIndex )
+	{
+		return m_vertexArea[vertexIndex];
+	}
+
+	/**
+	 * Get access to the array of how many triangles touch each vertex.
+	 */
+	int &getTriangleCount( int vertexIndex )
+	{
+		return m_vertexTriangleCount[vertexIndex];
+	}
+
+
+
+	/**
+	 * Return true if data is on the accelerator.
+	 * The CPU version of this class will return true here because
+	 * the CPU is the same as the accelerator.
+	 */
+	virtual bool onAccelerator()
+	{
+		return true;
+	}
+	
+	/**
+	 * Move data from host memory to the accelerator.
+	 * The CPU version will always return that it has moved it.
+	 */
+	virtual bool moveToAccelerator()
+	{
+		return true;
+	}
+
+	/**
+	 * Move data to host memory from the accelerator if bCopy is false.
+	 * If bCopy is true, copy data to host memory from the accelerator so that data 
+	 * won't be moved to accelerator when moveToAccelerator() is called next time. 
+	 * If bCopyMinimum is true, only vertex position and normal are copied.
+	 * bCopyMinimum will be meaningful only if bCopy is true.
+	 * The CPU version will always return that it has moved it.
+	 */
+	virtual bool moveFromAccelerator(bool bCopy = false, bool bCopyMinimum = true)
+	{
+		return true;
+	}
+
+	btAlignedObjectArray< Vectormath::Aos::Point3 >	&getVertexPositions()
+	{
+		return m_vertexPosition;
+	}
+};
+
+
+class btSoftBodyTriangleData
+{
+public:
+	/**
+	 * Class representing a triangle as a set of three indices into the
+	 * vertex array. A fourth int, _padding, follows the indices and is
+	 * set to -1 by both constructors so it never holds an indeterminate value.
+	 */
+	class TriangleNodeSet
+	{
+	public:
+		int vertex0;
+		int vertex1;
+		int vertex2;
+		int _padding;
+
+		/** Build a node set with all three indices at 0. */
+		TriangleNodeSet( )
+		{
+			vertex0 = 0;
+			vertex1 = 0;
+			vertex2 = 0;
+			_padding = -1;
+		}
+
+		/** Build a node set from three vertex indices. */
+		TriangleNodeSet( int newVertex0, int newVertex1, int newVertex2 )
+		{
+			vertex0 = newVertex0;
+			vertex1 = newVertex1;
+			vertex2 = newVertex2;
+			// Match the default constructor: without this the member is left
+			// uninitialised and copying the object reads an indeterminate value.
+			_padding = -1;
+		}
+	};
+
+	class TriangleDescription
+	{
+	protected:
+		int m_vertex0;
+		int m_vertex1;
+		int m_vertex2;
+
+	public:
+		TriangleDescription()
+		{
+			m_vertex0 = 0;
+			m_vertex1 = 0;
+			m_vertex2 = 0;
+		}
+
+		TriangleDescription( int newVertex0, int newVertex1, int newVertex2 )
+		{
+			m_vertex0 = newVertex0;
+			m_vertex1 = newVertex1;
+			m_vertex2 = newVertex2;
+		}
+
+		TriangleNodeSet getVertexSet() const
+		{
+			btSoftBodyTriangleData::TriangleNodeSet nodes;
+			nodes.vertex0 = m_vertex0;
+			nodes.vertex1 = m_vertex1;
+			nodes.vertex2 = m_vertex2;
+			return nodes;
+		}
+	};
+
+protected:
+	// NOTE:
+	// Vertex reference data is stored relative to global array, not relative to individual cloth.
+	// Values must be correct if being passed into single-cloth VBOs or when migrating from one solver
+	// to another.
+	btAlignedObjectArray< TriangleNodeSet > m_vertexIndices;
+	btAlignedObjectArray< float > m_area;
+	btAlignedObjectArray< Vectormath::Aos::Vector3 > m_normal;
+
+public:
+	btSoftBodyTriangleData()
+	{
+	}
+
+	virtual ~btSoftBodyTriangleData()
+	{
+
+	}
+
+	virtual void clear()
+	{
+		m_vertexIndices.resize(0);
+		m_area.resize(0);
+		m_normal.resize(0);
+	}
+
+	int getNumTriangles()
+	{
+		return m_vertexIndices.size();
+	}
+
+	virtual void setTriangleAt( const TriangleDescription &triangle, int triangleIndex )
+	{
+		m_vertexIndices[triangleIndex] = triangle.getVertexSet();
+	}
+
+	virtual void createTriangles( int numTriangles )		
+	{
+		int previousSize = m_vertexIndices.size();
+		int newSize = previousSize + numTriangles;
+
+		// Resize all the arrays that store triangle data
+		m_vertexIndices.resize( newSize );
+		m_area.resize( newSize );
+		m_normal.resize( newSize );
+	}
+
+	/**
+	 * Return the vertex index set for triangle triangleIndex as stored on the host.
+	 */
+	const TriangleNodeSet &getVertexSet( int triangleIndex )
+	{
+		return m_vertexIndices[triangleIndex];
+	}
+
+	/**
+	 * Get access to the triangle area.
+	 */
+	float &getTriangleArea( int triangleIndex )
+	{
+		return m_area[triangleIndex];
+	}
+
+	/**
+	 * Get access to the normal vector for this triangle.
+	 */
+	Vectormath::Aos::Vector3 &getNormal( int triangleIndex )
+	{
+		return m_normal[triangleIndex];
+	}
+
+	/**
+	 * Return true if data is on the accelerator.
+	 * The CPU version of this class will return true here because
+	 * the CPU is the same as the accelerator.
+	 */
+	virtual bool onAccelerator()
+	{
+		return true;
+	}
+	
+	/**
+	 * Move data from host memory to the accelerator.
+	 * The CPU version will always return that it has moved it.
+	 */
+	virtual bool moveToAccelerator()
+	{
+		return true;
+	}
+
+	/**
+	 * Move data to host memory from the accelerator.
+	 * The CPU version will always return that it has moved it.
+	 */
+	virtual bool moveFromAccelerator()
+	{
+		return true;
+	}
+};
+
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_DATA_H
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/HeapManager.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/HeapManager.h
@ -0,0 +1,117 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef BT_HEAP_MANAGER_H__
+#define BT_HEAP_MANAGER_H__
+
+#ifdef __SPU__
+	#define HEAP_STACK_SIZE 32
+#else
+	#define HEAP_STACK_SIZE 64
+#endif
+
+#define MIN_ALLOC_SIZE 16
+
+
+class HeapManager
+{
+private:
+	ATTRIBUTE_ALIGNED16(unsigned char *mHeap);
+	ATTRIBUTE_ALIGNED16(unsigned int mHeapBytes);
+	ATTRIBUTE_ALIGNED16(unsigned char *mPoolStack[HEAP_STACK_SIZE]);
+	ATTRIBUTE_ALIGNED16(unsigned int mCurStack);
+	
+public:
+	enum {ALIGN16,ALIGN128};
+
+	HeapManager(unsigned char *buf,int bytes)
+	{
+		mHeap = buf;
+		mHeapBytes = bytes;
+		clear();
+	}
+	
+	~HeapManager()
+	{
+	}
+	
+	int getAllocated()
+	{
+		return (int)(mPoolStack[mCurStack]-mHeap);
+	}
+	
+	int getRest()
+	{
+		return mHeapBytes-getAllocated();
+	}
+
+	/**
+	 * Hand out a block starting at the current top of the pool stack and push
+	 * the end of that block as the new top.
+	 * @param bytes      requested size; 0 is replaced by MIN_ALLOC_SIZE, and the
+	 *                   size is rounded up to a multiple of 16 (or 128 for ALIGN128).
+	 * @param alignment  ALIGN16 (default) or ALIGN128.
+	 * @return the start address of the block.
+	 * Stack depth and remaining capacity are checked only through btAssert.
+	 */
+	void *allocate(size_t bytes,int alignment = ALIGN16)
+	{
+		// bytes is a size_t, so this only triggers for a zero-byte request.
+		if(bytes <= 0) bytes = MIN_ALLOC_SIZE;
+		// One more entry is pushed below, so the last slot must still be free.
+		btAssert(mCurStack < (HEAP_STACK_SIZE-1));
+
+		
+		// The two branches differ only in the integer type and mask width used
+		// to do arithmetic on the pointer value.
+#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64__)
+		unsigned long long p = (unsigned long long )mPoolStack[mCurStack];
+		if(alignment == ALIGN128) {
+			// Round both the start address and the size up to 128 bytes.
+			p = ((p+127) & 0xffffffffffffff80);
+			bytes = (bytes+127) & 0xffffffffffffff80;
+		}
+		else {
+			// Only the size is rounded here; the start address p is used as-is.
+			bytes = (bytes+15) & 0xfffffffffffffff0;
+		}
+
+		// The block must fit in what is left of the heap after p.
+		btAssert(bytes <=(mHeapBytes-(p-(unsigned long long )mHeap)) );
+		
+#else
+		unsigned long p = (unsigned long )mPoolStack[mCurStack];
+		if(alignment == ALIGN128) {
+			// Round both the start address and the size up to 128 bytes.
+			p = ((p+127) & 0xffffff80);
+			bytes = (bytes+127) & 0xffffff80;
+		}
+		else {
+			// Only the size is rounded here; the start address p is used as-is.
+			bytes = (bytes+15) & 0xfffffff0;
+		}
+		// The block must fit in what is left of the heap after p.
+		btAssert(bytes <=(mHeapBytes-(p-(unsigned long)mHeap)) );
+#endif
+		// Record the end of this block as the new stack top, then return its start.
+		unsigned char * bla = (unsigned char *)(p + bytes);
+		mPoolStack[++mCurStack] = bla;
+		return (void*)p;
+	}
+
+	/**
+	 * Release the most recent allocation by popping the top entry of the pool stack.
+	 * The pointer argument is not inspected; blocks are always released in
+	 * reverse order of allocation.
+	 */
+	void deallocate(void *p)
+	{
+		(void) p;
+		// mCurStack is unsigned: popping with nothing allocated would wrap it to a
+		// huge value and make every later mPoolStack access out of bounds.
+		btAssert(mCurStack > 0);
+		mCurStack--;
+	}
+	
+	void clear()
+	{
+		mPoolStack[0] = mHeap;
+		mCurStack = 0;
+	}
+
+//	void printStack()
+//	{
+//		for(unsigned int i=0;i<=mCurStack;i++) {
+//			PRINTF("memStack %2d 0x%x\n",i,(uint32_t)mPoolStack[i]);
+//		}
+//	}
+
+};
+
+#endif //BT_HEAP_MANAGER_H__
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/Jamfile
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/Jamfile
@ -1,14 +0,0 @@
-SubDir TOP src BulletMultiThreaded ;
-
-#IncludeDir src/BulletMultiThreaded ;
-
-Library bulletmultithreaded : [ Wildcard . : *.h *.cpp ] [ Wildcard MiniCLTask : *.h *.cpp ] [ Wildcard SpuNarrowPhaseCollisionTask : *.h *.cpp  ]  : noinstall ;
-CFlags bulletmultithreaded : [ FIncludes $(TOP)/src/BulletMultiThreaded ] [ FIncludes $(TOP)/src/BulletMultiThreaded/vectormath/scalar/cpp ] ;
-LibDepends bulletmultithreaded :  ;
-
-   MsvcIncDirs bulletmultithreaded : 
-	"../../src/BulletMultiThreaded"  
-	"../../src/BulletMultiThreaded/vectormath/scalar/cpp"
-	;
-
-InstallHeader [ Wildcard *.h ] : bulletmultithreaded ;
--- a/Engine/lib/bullet/src/BulletMultiThreaded/Makefile.original
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/Makefile.original
@ -1,187 +0,0 @@
-__ARCH_BITS__ := 32
-
-# define macros
-NARROWPHASEDIR=./SpuNarrowPhaseCollisionTask
-SPU_TASKFILE=$(NARROWPHASEDIR)/SpuGatheringCollisionTask
-
-IBM_CELLSDK_VERSION := $(shell if [ -d /opt/cell ]; then echo "3.0"; fi)
-
-ifeq ("$(IBM_CELLSDK_VERSION)","3.0")
-        CELL_TOP ?= /opt/cell/sdk
-	CELL_SYSROOT := /opt/cell/sysroot
-else
-        CELL_TOP ?= /opt/ibm/cell-sdk/prototype
-	CELL_SYSROOT := $(CELL_TOP)/sysroot
-endif
-
-
-USE_CCACHE=ccache
-RM=rm -f 
-OUTDIR=./out
-DEBUGFLAG=-DNDEBUG
-LIBOUTDIR=../../lib/ibmsdk
-COLLISIONDIR=../../src/BulletCollision
-MATHDIR=../../src/LinearMath
-ARCHITECTUREFLAG=-m$(__ARCH_BITS__)
-ifeq "$(__ARCH_BITS__)" "64"
-  SPU_DEFFLAGS= -DUSE_LIBSPE2 -D__SPU__ -DUSE_ADDR64
-else
-  SPU_DEFFLAGS= -DUSE_LIBSPE2 -D__SPU__
-endif
-
-SPU_DEFFLAGS+=-DUSE_PE_BOX_BOX
-
-SPU_GCC=$(USE_CCACHE) /usr/bin/spu-gcc
-SPU_INCLUDEDIR= -Ivectormath/scalar/cpp -I. -I$(CELL_SYSROOT)/usr/spu/include -I../../src -I$(NARROWPHASEDIR)
-#SPU_CFLAGS= $(DEBUGFLAG) -W -Wall -Winline -Os -c -include spu_intrinsics.h -include stdbool.h
-SPU_CFLAGS= $(DEBUGFLAG) -W -Wall -Winline -O3 -mbranch-hints -fomit-frame-pointer -ftree-vectorize -finline-functions -ftree-vect-loop-version -ftree-loop-optimize -ffast-math -fno-rtti -fno-exceptions -c -include spu_intrinsics.h -include stdbool.h
-
-SPU_LFLAGS= -Wl,-N
-SPU_LIBRARIES=-lstdc++
-SPU_EMBED=/usr/bin/ppu-embedspu
-SPU_AR=/usr/bin/ar
-SYMBOLNAME=spu_program
-
-ifeq "$(__ARCH_BITS__)" "64"
-  PPU_DEFFLAGS= -DUSE_LIBSPE2 -DUSE_ADDR64
-  PPU_GCC=$(USE_CCACHE) /usr/bin/ppu-gcc
-else
-  PPU_DEFFLAGS= -DUSE_LIBSPE2
-  PPU_GCC=$(USE_CCACHE) /usr/bin/ppu32-gcc
-endif
-
-PPU_CFLAGS= $(ARCHITECTUREFLAG) $(DEBUGFLAG) -W -Wall -Winline -O3 -c -mabi=altivec -maltivec -include altivec.h -include stdbool.h
-PPU_INCLUDEDIR= -I. -I$(CELL_SYSROOT)/usr/include -I../../src -I$(NARROWPHASEDIR)
-PPU_LFLAGS= $(ARCHITECTUREFLAG) -Wl,-m,elf$(__ARCH_BITS__)ppc
-PPU_LIBRARIES= -lstdc++ -lsupc++ -lgcc -lgcov -lspe2 -lpthread -L../../lib/ibmsdk -lbulletcollision -lbulletdynamics -lbulletmath -L$(CELL_SYSROOT)/usr/lib$(__ARCH_BITS__) -R$(CELL_SYSROOT)/usr/lib
-PPU_AR=/usr/bin/ar
-
-MakeOut :
-#	rm -f -R $(OUTDIR) ; mkdir $(OUTDIR)
-	@echo "usage: make spu, make ppu, make all, or make clean"
-# SPU
-SpuTaskFile : MakeOut
-	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/SpuTaskFile.o $(SPU_TASKFILE).cpp
-
-boxBoxDistance : MakeOut
-	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
-
-SpuFakeDma : MakeOut
-	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
-
-SpuContactManifoldCollisionAlgorithm_spu : MakeOut
-	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o SpuContactManifoldCollisionAlgorithm.cpp
-
-SpuCollisionShapes : MakeOut
-	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
-
-SpuContactResult : MakeOut
-	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
-
-#SpuGatheringCollisionTask : MakeOut
-#	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
-
-SpuGjkPairDetector: MakeOut
-	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
-
-SpuMinkowskiPenetrationDepthSolver : MakeOut
-	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
-
-SpuVoronoiSimplexSolver : MakeOut
-	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
-
-#SpuLibspe2Support_spu : MakeOut
-#	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o SpuLibspe2Support.cpp
-
-## SPU-Bullet
-btPersistentManifold : MakeOut
-	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/NarrowPhaseCollision/$@.cpp
-
-btOptimizedBvh : MakeOut
-	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/CollisionShapes/$@.cpp
-
-btCollisionObject : MakeOut
-	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/CollisionDispatch/$@.cpp
-
-btTriangleCallback : MakeOut
-	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/CollisionShapes/$@.cpp
-
-btTriangleIndexVertexArray : MakeOut
-	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/CollisionShapes/$@.cpp
-
-btStridingMeshInterface : MakeOut
-	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/CollisionShapes/$@.cpp
-
-btAlignedAllocator : MakeOut
-	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(MATHDIR)/$@.cpp
-
-
-# PPU
-SpuGatheringCollisionDispatcher : MakeOut
-	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
-
-SequentialThreadSupport: MakeOut
-	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
-
-SpuLibspe2Support: MakeOut
-	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
-
-btThreadSupportInterface: MakeOut
-	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
-
-SpuCollisionTaskProcess : MakeOut
-	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
-
-SpuContactManifoldCollisionAlgorithm : MakeOut
-	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
-	
-SpuSampleTaskProcess : MakeOut
-	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
-	
-
-
-spu : boxBoxDistance SpuFakeDma SpuContactManifoldCollisionAlgorithm_spu SpuContactResult SpuTaskFile \
-      SpuGjkPairDetector SpuMinkowskiPenetrationDepthSolver SpuVoronoiSimplexSolver SpuCollisionShapes \
-      btPersistentManifold btOptimizedBvh btCollisionObject btTriangleCallback btTriangleIndexVertexArray \
-      btStridingMeshInterface btAlignedAllocator
-	$(SPU_GCC) -o $(OUTDIR)/spuCollision.elf \
-                            $(OUTDIR)/SpuTaskFile.o \
-                            $(OUTDIR)/SpuFakeDma.o \
-			    $(OUTDIR)/boxBoxDistance.o \
-                            $(OUTDIR)/SpuContactManifoldCollisionAlgorithm_spu.o \
-                            $(OUTDIR)/SpuContactResult.o \
-                            $(OUTDIR)/SpuCollisionShapes.o \
-                            $(OUTDIR)/SpuGjkPairDetector.o \
-                            $(OUTDIR)/SpuMinkowskiPenetrationDepthSolver.o \
-                            $(OUTDIR)/SpuVoronoiSimplexSolver.o \
-                            $(OUTDIR)/btPersistentManifold.o \
-                            $(OUTDIR)/btTriangleCallback.o \
-                            $(OUTDIR)/btTriangleIndexVertexArray.o \
-                            $(OUTDIR)/btStridingMeshInterface.o \
-                            $(OUTDIR)/btAlignedAllocator.o \
-                            $(SPU_LFLAGS) $(SPU_LIBRARIES)
-
-spu-embed : spu
-	$(SPU_EMBED) $(ARCHITECTUREFLAG) $(SYMBOLNAME) $(OUTDIR)/spuCollision.elf $(OUTDIR)/$@.o
-	$(SPU_AR) -qcs $(LIBOUTDIR)/libspu.a $(OUTDIR)/$@.o
-
-
-
-ppu : SpuGatheringCollisionDispatcher SpuCollisionTaskProcess btThreadSupportInterface \
-      SpuLibspe2Support SpuContactManifoldCollisionAlgorithm SpuSampleTaskProcess
-	$(PPU_AR) -qcs $(LIBOUTDIR)/bulletmultithreaded.a \
-                                                          $(OUTDIR)/SpuCollisionTaskProcess.o \
-                                                          $(OUTDIR)/SpuSampleTaskProcess.o \
-                                                          $(OUTDIR)/SpuGatheringCollisionDispatcher.o \
-                                                          $(OUTDIR)/SpuLibspe2Support.o \
-                                                          $(OUTDIR)/btThreadSupportInterface.o \
-							  $(OUTDIR)/SpuContactManifoldCollisionAlgorithm.o
-
-all : spu-embed ppu 
-
-clean:
-	$(RM) $(OUTDIR)/* ; $(RM) $(LIBOUTDIR)/libspu.a ; $(RM) $(LIBOUTDIR)/bulletmultithreaded.a
-
-
-
-
--- a/Engine/lib/bullet/src/BulletMultiThreaded/MiniCLTask/MiniCLTask.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/MiniCLTask/MiniCLTask.cpp
@ -1,116 +0,0 @@
-/*
-Bullet Continuous Collision Detection and Physics Library, Copyright (c) 2007 Erwin Coumans
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-
-*/
-
-
-#include "MiniCLTask.h"
-#include "../PlatformDefinitions.h"
-#include "../SpuFakeDma.h"
-#include "LinearMath/btMinMax.h"
-#include "BulletMultiThreaded/MiniCLTask/MiniCLTask.h"
-
-#ifdef __SPU__
-#include <spu_printf.h>
-#else
-#include <stdio.h>
-#define spu_printf printf
-#endif
-
-#define __kernel
-#define __global
-#define get_global_id(a) guid
-
-struct MiniCLTask_LocalStoreMemory
-{
-	
-};
-
-
-///////////////////////////////////////////////////
-// OpenCL Kernel Function for element by element vector addition
-__kernel void VectorAdd(__global const float8* a, __global const float8* b, __global float8* c, int guid)
-{
-    // get oct-float index into global data array
-    int iGID = get_global_id(0);
-
-    // read inputs into registers
-    float8 f8InA = a[iGID];
-    float8 f8InB = b[iGID];
-    float8 f8Out = (float8)0.0f;
-    
-    // add the vector elements
-    f8Out.s0 = f8InA.s0 + f8InB.s0;
-    f8Out.s1 = f8InA.s1 + f8InB.s1;
-    f8Out.s2 = f8InA.s2 + f8InB.s2;
-    f8Out.s3 = f8InA.s3 + f8InB.s3;
-    f8Out.s4 = f8InA.s4 + f8InB.s4;
-    f8Out.s5 = f8InA.s5 + f8InB.s5;
-    f8Out.s6 = f8InA.s6 + f8InB.s6;
-    f8Out.s7 = f8InA.s7 + f8InB.s7;
-
-    // write back out to GMEM
-    c[get_global_id(0)] = f8Out;
-}
-///////////////////////////////////////////////////
-
-
-//-- MAIN METHOD
-void processMiniCLTask(void* userPtr, void* lsMemory)
-{
-	//	BT_PROFILE("processSampleTask");
-
-	MiniCLTask_LocalStoreMemory* localMemory = (MiniCLTask_LocalStoreMemory*)lsMemory;
-
-	MiniCLTaskDesc* taskDescPtr = (MiniCLTaskDesc*)userPtr;
-	MiniCLTaskDesc& taskDesc = *taskDescPtr;
-
-	printf("Compute Unit[%d] executed kernel %d work items [%d..%d)\n",taskDesc.m_taskId,taskDesc.m_kernelProgramId,taskDesc.m_firstWorkUnit,taskDesc.m_lastWorkUnit);
-	
-	
-	switch (taskDesc.m_kernelProgramId)
-	{
-	case CMD_MINICL_ADDVECTOR:
-		{
-			for (unsigned int i=taskDesc.m_firstWorkUnit;i<taskDesc.m_lastWorkUnit;i++)
-			{
-				VectorAdd(*(const float8**)&taskDesc.m_argData[0][0],*(const float8**)&taskDesc.m_argData[1][0],*(float8**)&taskDesc.m_argData[2][0],i);
-			}
-			break;
-		}
-
-	default:
-		{
-			printf("error in processMiniCLTask: unknown command id: %d\n",taskDesc.m_kernelProgramId);
-
-		}
-	};
-
-}
-
-
-#if defined(__CELLOS_LV2__) || defined (LIBSPE2)
-
-ATTRIBUTE_ALIGNED16(MiniCLTask_LocalStoreMemory	gLocalStoreMemory);
-
-void* createMiniCLLocalStoreMemory()
-{
-	return &gLocalStoreMemory;
-}
-#else
-void* createMiniCLLocalStoreMemory()
-{
-	return new MiniCLTask_LocalStoreMemory;
-};
-
-#endif
--- a/Engine/lib/bullet/src/BulletMultiThreaded/MiniCLTask/MiniCLTask.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/MiniCLTask/MiniCLTask.h
@ -1,81 +0,0 @@
-/*
-Bullet Continuous Collision Detection and Physics Library, Copyright (c) 2007 Erwin Coumans
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-
-*/
-
-#ifndef MINICL__TASK_H
-#define MINICL__TASK_H
-
-#include "../PlatformDefinitions.h"
-#include "LinearMath/btScalar.h"
-
-#include "LinearMath/btAlignedAllocator.h"
-
-
-enum
-{
-	CMD_MINICL_1= 1,
-	CMD_MINICL_ADDVECTOR
-};
-
-
-
-struct float8
-{
-	float s0;
-	float s1;
-	float s2;
-	float s3;
-	float s4;
-	float s5;
-	float s6;
-	float s7;
-
-	float8(float scalar)
-	{
-		s0=s1=s2=s3=s4=s5=s6=s7=scalar;
-	}
-};
-
-#define MINICL_MAX_ARGLENGTH 128
-#define MINI_CL_MAX_ARG 8
-
-ATTRIBUTE_ALIGNED16(struct) MiniCLTaskDesc
-{
-	BT_DECLARE_ALIGNED_ALLOCATOR();
-
-	MiniCLTaskDesc()
-	{
-		for (int i=0;i<MINI_CL_MAX_ARG;i++)
-		{
-			m_argSizes[i]=0;
-		}
-	}
-
-	uint32_t	m_taskId;
-
-	uint32_t	m_kernelProgramId;
-	uint32_t	m_firstWorkUnit;
-	uint32_t	m_lastWorkUnit;
-
-	char		m_argData[MINI_CL_MAX_ARG][MINICL_MAX_ARGLENGTH];
-	int			m_argSizes[MINI_CL_MAX_ARG];
-};
-
-
-void	processMiniCLTask(void* userPtr, void* lsMemory);
-void*	createMiniCLLocalStoreMemory();
-
-
-#endif //MINICL__TASK_H
-
--- a/Engine/lib/bullet/src/BulletMultiThreaded/MiniCLTaskScheduler.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/MiniCLTaskScheduler.cpp
@ -1,227 +0,0 @@
-/*
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-//#define __CELLOS_LV2__ 1
-
-#define USE_SAMPLE_PROCESS 1
-#ifdef USE_SAMPLE_PROCESS
-
-
-#include "MiniCLTaskScheduler.h"
-#include <stdio.h>
-
-#ifdef __SPU__
-
-
-
-void	SampleThreadFunc(void* userPtr,void* lsMemory)
-{
-	//do nothing
-	printf("hello world\n");
-}
-
-
-void*	SamplelsMemoryFunc()
-{
-	//don't create local store memory, just return 0
-	return 0;
-}
-
-
-#else
-
-
-#include "btThreadSupportInterface.h"
-
-//#	include "SPUAssert.h"
-#include <string.h>
-
-
-
-extern "C" {
-	extern char SPU_SAMPLE_ELF_SYMBOL[];
-}
-
-
-
-
-
-MiniCLTaskScheduler::MiniCLTaskScheduler(btThreadSupportInterface*	threadInterface,  int maxNumOutstandingTasks)
-:m_threadInterface(threadInterface),
-m_maxNumOutstandingTasks(maxNumOutstandingTasks)
-{
-
-	m_taskBusy.resize(m_maxNumOutstandingTasks);
-	m_spuSampleTaskDesc.resize(m_maxNumOutstandingTasks);
-
-	for (int i = 0; i < m_maxNumOutstandingTasks; i++)
-	{
-		m_taskBusy[i] = false;
-	}
-	m_numBusyTasks = 0;
-	m_currentTask = 0;
-
-	m_initialized = false;
-
-	m_threadInterface->startSPU();
-
-
-}
-
-MiniCLTaskScheduler::~MiniCLTaskScheduler()
-{
-	m_threadInterface->stopSPU();
-	
-}
-
-
-
-void	MiniCLTaskScheduler::initialize()
-{
-#ifdef DEBUG_SPU_TASK_SCHEDULING
-	printf("MiniCLTaskScheduler::initialize()\n");
-#endif //DEBUG_SPU_TASK_SCHEDULING
-	
-	for (int i = 0; i < m_maxNumOutstandingTasks; i++)
-	{
-		m_taskBusy[i] = false;
-	}
-	m_numBusyTasks = 0;
-	m_currentTask = 0;
-	m_initialized = true;
-
-}
-
-
-void MiniCLTaskScheduler::issueTask(int firstWorkUnit, int lastWorkUnit,int kernelProgramId,char* argData,int* argSizes)
-{
-
-#ifdef DEBUG_SPU_TASK_SCHEDULING
-	printf("MiniCLTaskScheduler::issueTask (m_currentTask= %d\)n", m_currentTask);
-#endif //DEBUG_SPU_TASK_SCHEDULING
-
-	m_taskBusy[m_currentTask] = true;
-	m_numBusyTasks++;
-
-	MiniCLTaskDesc& taskDesc = m_spuSampleTaskDesc[m_currentTask];
-	{
-		// send task description in event message
-		taskDesc.m_firstWorkUnit = firstWorkUnit;
-		taskDesc.m_lastWorkUnit = lastWorkUnit;
-		taskDesc.m_kernelProgramId = kernelProgramId;
-		//some bookkeeping to recognize finished tasks
-		taskDesc.m_taskId = m_currentTask;
-		
-		for (int i=0;i<MINI_CL_MAX_ARG;i++)
-		{
-			taskDesc.m_argSizes[i] = argSizes[i];
-			if (taskDesc.m_argSizes[i])
-			{
-				memcpy(&taskDesc.m_argData[i],&argData[MINICL_MAX_ARGLENGTH*i],taskDesc.m_argSizes[i]);
-			}
-		}
-	}
-
-
-	m_threadInterface->sendRequest(1, (ppu_address_t) &taskDesc, m_currentTask);
-
-	// if all tasks busy, wait for spu event to clear the task.
-	
-	if (m_numBusyTasks >= m_maxNumOutstandingTasks)
-	{
-		unsigned int taskId;
-		unsigned int outputSize;
-
-		for (int i=0;i<m_maxNumOutstandingTasks;i++)
-	  {
-		  if (m_taskBusy[i])
-		  {
-			  taskId = i;
-			  break;
-		  }
-	  }
-		m_threadInterface->waitForResponse(&taskId, &outputSize);
-
-		//printf("PPU: after issue, received event: %u %d\n", taskId, outputSize);
-
-		postProcess(taskId, outputSize);
-
-		m_taskBusy[taskId] = false;
-
-		m_numBusyTasks--;
-	}
-
-	// find new task buffer
-	for (int i = 0; i < m_maxNumOutstandingTasks; i++)
-	{
-		if (!m_taskBusy[i])
-		{
-			m_currentTask = i;
-			break;
-		}
-	}
-}
-
-
-///Optional PPU-size post processing for each task
-void MiniCLTaskScheduler::postProcess(int taskId, int outputSize)
-{
-
-}
-
-
-void MiniCLTaskScheduler::flush()
-{
-#ifdef DEBUG_SPU_TASK_SCHEDULING
-	printf("\nSpuCollisionTaskProcess::flush()\n");
-#endif //DEBUG_SPU_TASK_SCHEDULING
-	
-
-	// all tasks are issued, wait for all tasks to be complete
-	while(m_numBusyTasks > 0)
-	{
-// Consolidating SPU code
-	  unsigned int taskId;
-	  unsigned int outputSize;
-	  
-	  for (int i=0;i<m_maxNumOutstandingTasks;i++)
-	  {
-		  if (m_taskBusy[i])
-		  {
-			  taskId = i;
-			  break;
-		  }
-	  }
-	  {
-			
-		  m_threadInterface->waitForResponse(&taskId, &outputSize);
-	  }
-
-		//printf("PPU: flushing, received event: %u %d\n", taskId, outputSize);
-
-		postProcess(taskId, outputSize);
-
-		m_taskBusy[taskId] = false;
-
-		m_numBusyTasks--;
-	}
-
-
-}
-
-#endif
-
-
-#endif //USE_SAMPLE_PROCESS
--- a/Engine/lib/bullet/src/BulletMultiThreaded/MiniCLTaskScheduler.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/MiniCLTaskScheduler.h
@ -1,181 +0,0 @@
-/*
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-
-
-#ifndef MINICL_TASK_SCHEDULER_H
-#define MINICL_TASK_SCHEDULER_H
-
-#include <assert.h>
-
-
-#include "PlatformDefinitions.h"
-
-#include <stdlib.h>
-
-#include "LinearMath/btAlignedObjectArray.h"
-
-
-#include "MiniCLTask/MiniCLTask.h"
-
-
-//just add your commands here, try to keep them globally unique for debugging purposes
-#define CMD_SAMPLE_TASK_COMMAND 10
-
-
-
-/// MiniCLTaskScheduler handles SPU processing of collision pairs.
-/// When PPU issues a task, it will look for completed task buffers
-/// PPU will do postprocessing, dependent on workunit output (not likely)
-class MiniCLTaskScheduler
-{
-	// track task buffers that are being used, and total busy tasks
-	btAlignedObjectArray<bool>	m_taskBusy;
-	btAlignedObjectArray<MiniCLTaskDesc>	m_spuSampleTaskDesc;
-	
-	int   m_numBusyTasks;
-
-	// the current task and the current entry to insert a new work unit
-	int   m_currentTask;
-
-	bool m_initialized;
-
-	void postProcess(int taskId, int outputSize);
-	
-	class	btThreadSupportInterface*	m_threadInterface;
-
-	int	m_maxNumOutstandingTasks;
-
-
-
-public:
-	MiniCLTaskScheduler(btThreadSupportInterface*	threadInterface, int maxNumOutstandingTasks);
-	
-	~MiniCLTaskScheduler();
-	
-	///call initialize in the beginning of the frame, before addCollisionPairToTask
-	void initialize();
-
-	void issueTask(int firstWorkUnit, int lastWorkUnit,int kernelProgramId,char* argData,int* argSizes);
-
-	///call flush to submit potential outstanding work to SPUs and wait for all involved SPUs to be finished
-	void flush();
-
-	class	btThreadSupportInterface*	getThreadSupportInterface()
-	{
-		return m_threadInterface;
-	}
-
-	int	findProgramCommandIdByName(const char* programName) const
-	{
-		return CMD_MINICL_ADDVECTOR;//hardcoded temp value, todo: implement multi-program support
-	}
-
-	int getMaxNumOutstandingTasks() const
-	{
-		return m_maxNumOutstandingTasks;
-	}
-};
-
-
-struct	MiniCLKernel
-{
-	MiniCLTaskScheduler* m_scheduler;
-	
-	int	m_kernelProgramCommandId;
-
-	char	m_argData[MINI_CL_MAX_ARG][MINICL_MAX_ARGLENGTH];
-	int				m_argSizes[MINI_CL_MAX_ARG];
-};
-
-
-#if defined(USE_LIBSPE2) && defined(__SPU__)
-////////////////////MAIN/////////////////////////////
-#include "../SpuLibspe2Support.h"
-#include <spu_intrinsics.h>
-#include <spu_mfcio.h>
-#include <SpuFakeDma.h>
-
-void * SamplelsMemoryFunc();
-void SampleThreadFunc(void* userPtr,void* lsMemory);
-
-//#define DEBUG_LIBSPE2_MAINLOOP
-
-int main(unsigned long long speid, addr64 argp, addr64 envp)
-{
-	printf("SPU is up \n");
-	
-	ATTRIBUTE_ALIGNED128(btSpuStatus status);
-	ATTRIBUTE_ALIGNED16( SpuSampleTaskDesc taskDesc ) ;
-	unsigned int received_message = Spu_Mailbox_Event_Nothing;
-        bool shutdown = false;
-
-	cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
-	cellDmaWaitTagStatusAll(DMA_MASK(3));
-
-	status.m_status = Spu_Status_Free;
-	status.m_lsMemory.p = SamplelsMemoryFunc();
-
-	cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
-	cellDmaWaitTagStatusAll(DMA_MASK(3));
-	
-	
-	while (!shutdown)
-	{
-		received_message = spu_read_in_mbox();
-		
-
-		
-		switch(received_message)
-		{
-		case Spu_Mailbox_Event_Shutdown:
-			shutdown = true;
-			break; 
-		case Spu_Mailbox_Event_Task:
-			// refresh the status
-#ifdef DEBUG_LIBSPE2_MAINLOOP
-			printf("SPU recieved Task \n");
-#endif //DEBUG_LIBSPE2_MAINLOOP
-			cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
-			cellDmaWaitTagStatusAll(DMA_MASK(3));
-		
-			btAssert(status.m_status==Spu_Status_Occupied);
-			
-			cellDmaGet(&taskDesc, status.m_taskDesc.p, sizeof(SpuSampleTaskDesc), DMA_TAG(3), 0, 0);
-			cellDmaWaitTagStatusAll(DMA_MASK(3));
-			
-			SampleThreadFunc((void*)&taskDesc, reinterpret_cast<void*> (taskDesc.m_mainMemoryPtr) );
-			break;
-		case Spu_Mailbox_Event_Nothing:
-		default:
-			break;
-		}
-
-		// set to status free and wait for next task
-		status.m_status = Spu_Status_Free;
-		cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
-		cellDmaWaitTagStatusAll(DMA_MASK(3));		
-				
-		
-  	}
-  	return 0;
-}
-//////////////////////////////////////////////////////
-#endif
-
-
-
-#endif // MINICL_TASK_SCHEDULER_H
-
--- a/Engine/lib/bullet/src/BulletMultiThreaded/PlatformDefinitions.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/PlatformDefinitions.h
@ -1,9 +1,26 @@
-#ifndef TYPE_DEFINITIONS_H
-#define TYPE_DEFINITIONS_H
+#ifndef BT_TYPE_DEFINITIONS_H
+#define BT_TYPE_DEFINITIONS_H

 ///This file provides some platform/compiler checks for common definitions
+#include "LinearMath/btScalar.h"
+#include "LinearMath/btMinMax.h"

-#ifdef WIN32
+#ifdef PFX_USE_FREE_VECTORMATH
+#include "physics_effects/base_level/base/pfx_vectormath_include.win32.h"
+typedef Vectormath::Aos::Vector3    vmVector3;
+typedef Vectormath::Aos::Quat       vmQuat;
+typedef Vectormath::Aos::Matrix3    vmMatrix3;
+typedef Vectormath::Aos::Transform3 vmTransform3;
+typedef Vectormath::Aos::Point3     vmPoint3;
+#else
+#include "vectormath/vmInclude.h"
+#endif//PFX_USE_FREE_VECTORMATH
+
+
+
+
+
+#ifdef _WIN32

 typedef union
 {
@ -19,7 +36,11 @@ typedef union

 		typedef unsigned char     uint8_t;
 #ifndef __PHYSICS_COMMON_H__
+#ifndef PFX_USE_FREE_VECTORMATH
+#ifndef __BT_SKIP_UINT64_H
 		typedef unsigned long int uint64_t;
+#endif //__BT_SKIP_UINT64_H
+#endif //PFX_USE_FREE_VECTORMATH
 		typedef unsigned int      uint32_t;
 #endif //__PHYSICS_COMMON_H__
 		typedef unsigned short    uint16_t;
@ -52,31 +73,27 @@ typedef union
 #include <stdio.h>		
 #define spu_printf printf	
 #define DWORD unsigned int
-		
 			typedef union
 			{
 			  unsigned long long ull;
 			  unsigned int ui[2];
 			  void *p;
 			} addr64;
-		
-		
-#else
-
-#include <stdio.h>		
-#define spu_printf printf	
-
 #endif // USE_LIBSPE2
-	
+
 #endif	//__CELLOS_LV2__
 	
 #endif

+#ifdef __SPU__
+#include <stdio.h>		
+#define printf spu_printf
+#endif

 /* Included here because we need uint*_t typedefs */
 #include "PpuAddressSpace.h"

-#endif //TYPE_DEFINITIONS_H
+#endif //BT_TYPE_DEFINITIONS_H



--- a/Engine/lib/bullet/src/BulletMultiThreaded/PosixThreadSupport.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/PosixThreadSupport.cpp
@ -48,7 +48,7 @@ PosixThreadSupport::~PosixThreadSupport()
 #endif

 // this semaphore will signal, if and how many threads are finished with their work
-static sem_t* mainSemaphore;
+static sem_t* mainSemaphore=0;

 static sem_t* createSem(const char* baseName)
 {
@ -58,9 +58,10 @@ static sem_t* createSem(const char* baseName)
        char name[32];
        snprintf(name, 32, "/%s-%d-%4.4d", baseName, getpid(), semCount++); 
        sem_t* tempSem = sem_open(name, O_CREAT, 0600, 0);
+
        if (tempSem != reinterpret_cast<sem_t *>(SEM_FAILED))
        {
-        	//printf("Created \"%s\" Semaphore %x\n", name, tempSem);
+//        printf("Created \"%s\" Semaphore %p\n", name, tempSem);
        }
        else
 	{
@ -172,7 +173,7 @@ void PosixThreadSupport::waitForResponse(unsigned int *puiArgument0, unsigned in
 	// get at least one thread which has finished
        size_t last = -1;
        
-        for(size_t t=0; t < m_activeSpuStatus.size(); ++t) {
+        for(size_t t=0; t < size_t(m_activeSpuStatus.size()); ++t) {
            if(2 == m_activeSpuStatus[t].m_status) {
                last = t;
                break;
@ -199,7 +200,8 @@ void PosixThreadSupport::startThreads(ThreadConstructionInfo& threadConstruction
 	m_activeSpuStatus.resize(threadConstructionInfo.m_numThreads);
        
 	mainSemaphore = createSem("main");                
-        
+	//checkPThreadFunction(sem_wait(mainSemaphore));
+   
 	for (int i=0;i < threadConstructionInfo.m_numThreads;i++)
 	{
 		printf("starting thread %d\n",i);
@ -233,17 +235,175 @@ void PosixThreadSupport::startSPU()
 ///tell the task scheduler we are done with the SPU tasks
 void PosixThreadSupport::stopSPU()
 {
-	for(size_t t=0; t < m_activeSpuStatus.size(); ++t) {
+	// Shut the workers down one at a time: request exit, wait for a reply, then reap the thread.
+	for(size_t t=0; t < size_t(m_activeSpuStatus.size()); ++t)
+	{
             btSpuStatus&	spuStatus = m_activeSpuStatus[t];
-            printf("%s: Thread %i used: %ld\n", __FUNCTION__, t, spuStatus.threadUsed);
-        
-            destroySem(spuStatus.startSemaphore);
-            checkPThreadFunction(pthread_cancel(spuStatus.thread));
-        }
-        destroySem(mainSemaphore);
+		printf("%s: Thread %i used: %ld\n", __FUNCTION__, int(t), spuStatus.threadUsed);
 
+		// Clear the user pointer and signal the start semaphore.
+		// NOTE(review): presumably the worker loop treats a null m_userPtr as a quit
+		// request and posts mainSemaphore before returning - confirm against the thread function.
+		spuStatus.m_userPtr = 0;
+		checkPThreadFunction(sem_post(spuStatus.startSemaphore));
+		checkPThreadFunction(sem_wait(mainSemaphore));
+
+		printf("destroy semaphore\n");
+		destroySem(spuStatus.startSemaphore);
+		printf("semaphore destroyed\n");
+		// Join (instead of the former pthread_cancel) so the thread has really finished.
+		checkPThreadFunction(pthread_join(spuStatus.thread,0));
+
+	}
+	printf("destroy main semaphore\n");
+	destroySem(mainSemaphore);
+	printf("main semaphore destroyed\n");
 	m_activeSpuStatus.clear();
 }

+/// btCriticalSection implemented with a pthread mutex, plus a 32-entry buffer
+/// of unsigned parameters that can be read and written through the interface.
+class PosixCriticalSection : public btCriticalSection 
+{
+	pthread_mutex_t m_mutex;
+	
+public:
+	PosixCriticalSection() 
+	{
+		pthread_mutex_init(&m_mutex, NULL);
+		// Give every slot a defined value so that a getSharedParam() issued
+		// before the first setSharedParam() returns 0 instead of garbage.
+		for (int i=0;i<32;i++)
+		{
+			mCommonBuff[i] = 0;
+		}
+	}
+	virtual ~PosixCriticalSection() 
+	{
+		pthread_mutex_destroy(&m_mutex);
+	}
+	
+	ATTRIBUTE_ALIGNED16(unsigned int mCommonBuff[32]);
+	
+	/// Returns the value stored in slot i; i must lie in [0,32).
+	virtual unsigned int getSharedParam(int i)
+	{
+		btAssert(i>=0&&i<32);
+		return mCommonBuff[i];
+	}
+	/// Stores p in slot i; i must lie in [0,32).
+	virtual void setSharedParam(int i,unsigned int p)
+	{
+		btAssert(i>=0&&i<32);
+		mCommonBuff[i] = p;
+	}
+	
+	/// Acquires the mutex (blocks while another thread holds it).
+	virtual void lock()
+	{
+		pthread_mutex_lock(&m_mutex);
+	}
+	/// Releases the mutex.
+	virtual void unlock()
+	{
+		pthread_mutex_unlock(&m_mutex);
+	}
+};
+
+
+#if defined(_POSIX_BARRIERS) && (_POSIX_BARRIERS - 20012L) >= 0
+/* OK to use barriers on this platform */
+/// btBarrier implemented directly on top of pthread_barrier_t.
+class PosixBarrier : public btBarrier 
+{
+	pthread_barrier_t m_barr;
+	int m_numThreads;
+	bool m_initialized;	// true only while m_barr holds a successfully initialised barrier
+public:
+	PosixBarrier()
+	:m_numThreads(0),
+	m_initialized(false)
+	{
+	}
+	virtual ~PosixBarrier()
+	{
+		// Destroying a barrier that was never initialised is undefined, so only
+		// tear down what setMaxCount() actually created.
+		if (m_initialized)
+		{
+			pthread_barrier_destroy(&m_barr);
+		}
+	}
+	
+	/// Waits on the barrier; setMaxCount() must have been called first.
+	/// Prints a message and exits the process if the wait itself fails.
+	virtual void sync()
+	{
+		btAssert(m_initialized);
+		int rc = pthread_barrier_wait(&m_barr);
+		if(rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD)
+		{
+			printf("Could not wait on barrier\n");
+			exit(-1);
+		}
+	}
+	/// (Re)creates the barrier for numThreads participants.
+	virtual void setMaxCount(int numThreads)
+	{
+		// Re-initialising a live barrier is undefined; release the old one first.
+		if (m_initialized)
+		{
+			pthread_barrier_destroy(&m_barr);
+			m_initialized = false;
+		}
+		int result = pthread_barrier_init(&m_barr, NULL, numThreads);
+		m_numThreads = numThreads;
+		m_initialized = (result==0);
+		btAssert(result==0);
+	}
+	/// Returns the count last passed to setMaxCount() (0 before the first call).
+	virtual int  getMaxCount()
+	{
+		return m_numThreads;
+	}
+};
+#else
+/* Barriers are not available on this platform - emulate one with a mutex and a condition variable */
+/// btBarrier emulated with a mutex and a condition variable.
+class PosixBarrier : public btBarrier 
+{
+	pthread_mutex_t m_mutex;
+	pthread_cond_t m_cond;
+	
+	int m_numThreads;
+	int	m_called;				// threads that have arrived in the current round
+	unsigned int m_generation;	// bumped each time a round completes
+	
+public:
+	PosixBarrier()
+	:m_numThreads(0),
+	m_called(0),
+	m_generation(0)
+	{
+	}
+	virtual ~PosixBarrier() 
+	{
+		// The mutex and condition variable only exist after setMaxCount() ran with a positive count.
+		if (m_numThreads>0)
+		{
+			pthread_mutex_destroy(&m_mutex);
+			pthread_cond_destroy(&m_cond);
+		}
+	}
+	
+	/// Blocks until m_numThreads threads have called sync() for the current round.
+	virtual void sync()
+	{		
+		pthread_mutex_lock(&m_mutex);
+		const unsigned int generation = m_generation;
+		m_called++;
+		if (m_called == m_numThreads) {
+			// Last arrival: start a new round and release everybody else.
+			m_called = 0;
+			m_generation++;
+			pthread_cond_broadcast(&m_cond);
+		} else {
+			// pthread_cond_wait may wake spuriously; keep waiting until the
+			// round this thread joined has really been completed.
+			while (generation == m_generation)
+			{
+				pthread_cond_wait(&m_cond,&m_mutex);
+			}
+		}
+		pthread_mutex_unlock(&m_mutex);
+		
+	}
+	/// (Re)creates the synchronisation objects for numThreads participants.
+	virtual void setMaxCount(int numThreads)
+	{
+		if (m_numThreads>0)
+		{
+			pthread_mutex_destroy(&m_mutex);
+			pthread_cond_destroy(&m_cond);
+		}
+		m_called = 0;
+		pthread_mutex_init(&m_mutex,NULL);
+		pthread_cond_init(&m_cond,NULL);
+		m_numThreads = numThreads;
+	}
+	/// Returns the count last passed to setMaxCount() (0 before the first call).
+	virtual int  getMaxCount()
+	{
+		return m_numThreads;
+	}
+};
+
+#endif//_POSIX_BARRIERS
+
+
+
+/// Allocates a PosixBarrier whose participant count is getNumTasks().
+/// The object is created with new; deleteBarrier() deletes it.
+btBarrier* PosixThreadSupport::createBarrier()
+{
+	PosixBarrier* newBarrier = new PosixBarrier();
+	const int participants = getNumTasks();
+	newBarrier->setMaxCount(participants);
+	return newBarrier;
+}
+
+/// Allocates a PosixCriticalSection with new; deleteCriticalSection() deletes it.
+btCriticalSection* PosixThreadSupport::createCriticalSection()
+{
+	btCriticalSection* section = new PosixCriticalSection();
+	return section;
+}
+
+/// Deletes the given barrier with operator delete.
+void	PosixThreadSupport::deleteBarrier(btBarrier* barrier)
+{
+	delete barrier;
+}
+
+/// Deletes the given critical section with operator delete.
+void PosixThreadSupport::deleteCriticalSection(btCriticalSection* cs)
+{
+	delete cs;
+}
 #endif // USE_PTHREADS

--- a/Engine/lib/bullet/src/BulletMultiThreaded/PosixThreadSupport.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/PosixThreadSupport.h
@ -13,16 +13,22 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */

+#ifndef BT_POSIX_THREAD_SUPPORT_H
+#define BT_POSIX_THREAD_SUPPORT_H
+

 #include "LinearMath/btScalar.h"
 #include "PlatformDefinitions.h"

-#ifdef USE_PTHREADS  //platform specific defines are defined in PlatformDefinitions.h
+#ifdef USE_PTHREADS //platform specific defines are defined in PlatformDefinitions.h
+
+#ifndef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 600 //for definition of pthread_barrier_t, see http://pages.cs.wisc.edu/~travitch/pthreads_primer.html
+#endif //_XOPEN_SOURCE
 #include <pthread.h>
 #include <semaphore.h>

-#ifndef POSIX_THREAD_SUPPORT_H
-#define POSIX_THREAD_SUPPORT_H
+

 #include "LinearMath/btAlignedObjectArray.h"

@ -68,7 +74,7 @@ public:

 	struct	ThreadConstructionInfo
 	{
-		ThreadConstructionInfo(char* uniqueName,
+		ThreadConstructionInfo(const char* uniqueName,
 									PosixThreadFunc userThreadFunc,
 									PosixlsMemorySetupFunc	lsMemoryFunc,
 									int numThreads=1,
@ -83,7 +89,7 @@ public:

 		}

-		char*					m_uniqueName;
+		const char*					m_uniqueName;
 		PosixThreadFunc			m_userThreadFunc;
 		PosixlsMemorySetupFunc	m_lsMemoryFunc;
 		int						m_numThreads;
@ -117,8 +123,25 @@ public:
 	{
 		return m_activeSpuStatus.size();
 	}
+
+	virtual btBarrier* createBarrier();
+
+	virtual btCriticalSection* createCriticalSection();
+
+	virtual void deleteBarrier(btBarrier* barrier);
+
+	virtual void deleteCriticalSection(btCriticalSection* criticalSection);
+	
+	
+	/// Returns the m_lsMemory pointer of the status entry for taskId.
+	/// taskId is used directly as an index into m_activeSpuStatus.
+	virtual void*	getThreadLocalMemory(int taskId)
+	{
+		return m_activeSpuStatus[taskId].m_lsMemory;
+	}
+
 };

-#endif // POSIX_THREAD_SUPPORT_H
-
 #endif // USE_PTHREADS
+
+#endif // BT_POSIX_THREAD_SUPPORT_H
+
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/PpuAddressSpace.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/PpuAddressSpace.h
@ -1,18 +1,37 @@
-#ifndef __PPU_ADDRESS_SPACE_H
-#define __PPU_ADDRESS_SPACE_H
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2010 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/


-#ifdef WIN32
+#ifndef BT_PPU_ADDRESS_SPACE_H
+#define BT_PPU_ADDRESS_SPACE_H
+
+
+#ifdef _WIN32
 //stop those casting warnings until we have a better solution for ppu_address_t / void* / uint64 conversions
 #pragma warning (disable: 4311)
 #pragma warning (disable: 4312)
-#endif //WIN32
+#endif //_WIN32

-#ifdef USE_ADDR64
-typedef uint64_t ppu_address_t;
+
+// ppu_address_t is a 64-bit unsigned integer on _WIN64, __LP64__ and __x86_64__
+// builds and a 32-bit unsigned integer everywhere else.
+#if defined(_WIN64)
+	typedef unsigned __int64 ppu_address_t;
+#elif defined(__LP64__) || defined(__x86_64__)
+	typedef uint64_t ppu_address_t;
 #else
-typedef uint32_t ppu_address_t;
-#endif
+	typedef uint32_t ppu_address_t;
+#endif //defined(_WIN64)

-#endif
+#endif //BT_PPU_ADDRESS_SPACE_H

--- a/Engine/lib/bullet/src/BulletMultiThreaded/SequentialThreadSupport.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SequentialThreadSupport.cpp
@ -91,3 +91,91 @@ void SequentialThreadSupport::setNumTasks(int numTasks)
 {
 	printf("SequentialThreadSupport::setNumTasks(%d) is not implemented and has no effect\n",numTasks);
 }
+
+
+
+
+/// btBarrier whose operations all do nothing; this is the type returned by
+/// SequentialThreadSupport::createBarrier().
+class btDummyBarrier : public btBarrier
+{
+public:
+	btDummyBarrier() {}
+	virtual ~btDummyBarrier() {}
+
+	/// Returns immediately.
+	void sync() {}
+
+	/// The requested count is ignored.
+	virtual void setMaxCount(int /*n*/) {}
+	/// Always reports a single participant.
+	virtual int  getMaxCount() { return 1; }
+};
+
+/// btCriticalSection stand-in that performs no real locking.
+/// Slot 0 of mCommonBuff records the lock flag; shared parameter i lives in slot i+1.
+/// NOTE(review): mCommonBuff is not declared in this class - presumably inherited
+/// from btCriticalSection; confirm against the base class.
+class btDummyCriticalSection : public btCriticalSection
+{
+public:
+	btDummyCriticalSection() {}
+	virtual ~btDummyCriticalSection() {}
+
+	/// Returns shared parameter i; asserts that i lies in [0,31).
+	unsigned int getSharedParam(int i)
+	{
+		btAssert(i>=0&&i<31);
+		const int slot = i+1;
+		return mCommonBuff[slot];
+	}
+
+	/// Stores p as shared parameter i; asserts that i lies in [0,31).
+	void setSharedParam(int i,unsigned int p)
+	{
+		btAssert(i>=0&&i<31);
+		const int slot = i+1;
+		mCommonBuff[slot] = p;
+	}
+
+	/// Sets the flag in slot 0; does not block.
+	void lock() { mCommonBuff[0] = 1; }
+
+	/// Clears the flag in slot 0.
+	void unlock() { mCommonBuff[0] = 0; }
+};
+
+
+
+
+/// Allocates a btDummyBarrier with new; deleteBarrier() deletes it.
+btBarrier*	SequentialThreadSupport::createBarrier()
+{
+	btBarrier* dummy = new btDummyBarrier();
+	return dummy;
+}
+
+/// Allocates a btDummyCriticalSection with new; deleteCriticalSection() deletes it.
+btCriticalSection* SequentialThreadSupport::createCriticalSection()
+{
+	btCriticalSection* dummy = new btDummyCriticalSection();
+	return dummy;
+}
+
+/// Deletes the given barrier with operator delete.
+void SequentialThreadSupport::deleteBarrier(btBarrier* barrier)
+{
+    delete barrier;
+}
+
+/// Deletes the given critical section with operator delete.
+void SequentialThreadSupport::deleteCriticalSection(btCriticalSection* criticalSection)
+{
+    delete criticalSection;
+}
+
+
+
+
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SequentialThreadSupport.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SequentialThreadSupport.h
@ -17,8 +17,8 @@ subject to the following restrictions:
 #include "PlatformDefinitions.h"


-#ifndef SEQUENTIAL_THREAD_SUPPORT_H
-#define SEQUENTIAL_THREAD_SUPPORT_H
+#ifndef BT_SEQUENTIAL_THREAD_SUPPORT_H
+#define BT_SEQUENTIAL_THREAD_SUPPORT_H

 #include "LinearMath/btAlignedObjectArray.h"

@ -51,7 +51,7 @@ private:
 public:
 	struct	SequentialThreadConstructionInfo
 	{
-		SequentialThreadConstructionInfo (char* uniqueName,
+		SequentialThreadConstructionInfo (const char* uniqueName,
 									SequentialThreadFunc userThreadFunc,
 									SequentiallsMemorySetupFunc	lsMemoryFunc
 									)
@ -62,7 +62,7 @@ public:

 		}

-		char*						m_uniqueName;
+		const char*						m_uniqueName;
 		SequentialThreadFunc		m_userThreadFunc;
 		SequentiallsMemorySetupFunc	m_lsMemoryFunc;
 	};
@ -85,8 +85,16 @@ public:
 	{
 		return 1;
 	}
+	virtual btBarrier*	createBarrier();
+
+	virtual btCriticalSection* createCriticalSection();
+	
+    virtual void deleteBarrier(btBarrier* barrier);
+    
+    virtual void deleteCriticalSection(btCriticalSection* criticalSection);
+

 };

-#endif //SEQUENTIAL_THREAD_SUPPORT_H
+#endif //BT_SEQUENTIAL_THREAD_SUPPORT_H

--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuCollisionObjectWrapper.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuCollisionObjectWrapper.h
@ -13,8 +13,8 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */

-#ifndef SPU_COLLISION_OBJECT_WRAPPER_H
-#define SPU_COLLISION_OBJECT_WRAPPER_H
+#ifndef BT_SPU_COLLISION_OBJECT_WRAPPER_H
+#define BT_SPU_COLLISION_OBJECT_WRAPPER_H

 #include "PlatformDefinitions.h"
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
@ -37,4 +37,4 @@ public:
 };


-#endif //SPU_COLLISION_OBJECT_WRAPPER_H
+#endif //BT_SPU_COLLISION_OBJECT_WRAPPER_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuCollisionTaskProcess.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuCollisionTaskProcess.cpp
@ -26,7 +26,7 @@ subject to the following restrictions:

 void	SpuCollisionTaskProcess::setNumTasks(int maxNumTasks)
 {
-	if (m_maxNumOutstandingTasks != maxNumTasks)
+	if (int(m_maxNumOutstandingTasks) != maxNumTasks)
 	{
 		m_maxNumOutstandingTasks = maxNumTasks;
 		m_taskBusy.resize(m_maxNumOutstandingTasks);
@ -44,7 +44,6 @@ void	SpuCollisionTaskProcess::setNumTasks(int maxNumTasks)
 		}
 		
 		m_workUnitTaskBuffers = (unsigned char *)btAlignedAlloc(MIDPHASE_WORKUNIT_TASK_SIZE*m_maxNumOutstandingTasks, 128);
-					m_workUnitTaskBuffers = (unsigned char *)btAlignedAlloc(MIDPHASE_WORKUNIT_TASK_SIZE*6, 128);
 	}
 	
 }
@ -69,7 +68,7 @@ m_maxNumOutstandingTasks(0)
 	m_threadInterface->startSPU();

 	//printf("sizeof vec_float4: %d\n", sizeof(vec_float4));
-	printf("sizeof SpuGatherAndProcessWorkUnitInput: %d\n", sizeof(SpuGatherAndProcessWorkUnitInput));
+	printf("sizeof SpuGatherAndProcessWorkUnitInput: %d\n", int(sizeof(SpuGatherAndProcessWorkUnitInput)));

 }

--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuCollisionTaskProcess.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuCollisionTaskProcess.h
@ -13,12 +13,12 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */

-#ifndef SPU_COLLISION_TASK_PROCESS_H
-#define SPU_COLLISION_TASK_PROCESS_H
+#ifndef BT_SPU_COLLISION_TASK_PROCESS_H
+#define BT_SPU_COLLISION_TASK_PROCESS_H

 #include <assert.h>

-#include <LinearMath/btScalar.h>
+#include "LinearMath/btScalar.h"

 #include "PlatformDefinitions.h"
 #include "LinearMath/btAlignedObjectArray.h"
@ -35,7 +35,7 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionShapes/btCollisionShape.h"
 #include "BulletCollision/CollisionShapes/btConvexShape.h"

-#include <LinearMath/btAlignedAllocator.h>
+#include "LinearMath/btAlignedAllocator.h"

 #include <stdio.h>

@ -159,5 +159,5 @@ public:
 #define MIDPHASE_NUM_WORKUNITS_PER_TASK (MIDPHASE_NUM_WORKUNITS_PER_PAGE*MIDPHASE_NUM_WORKUNIT_PAGES)


-#endif // SPU_COLLISION_TASK_PROCESS_H
+#endif // BT_SPU_COLLISION_TASK_PROCESS_H

--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.cpp
@ -22,7 +22,7 @@ subject to the following restrictions:



-void SpuContactManifoldCollisionAlgorithm::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+void SpuContactManifoldCollisionAlgorithm::processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
 {
 	btAssert(0);
 }
@ -34,7 +34,7 @@ btScalar SpuContactManifoldCollisionAlgorithm::calculateTimeOfImpact(btCollision
 }

 #ifndef __SPU__
-SpuContactManifoldCollisionAlgorithm::SpuContactManifoldCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1)
+SpuContactManifoldCollisionAlgorithm::SpuContactManifoldCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObject* body0,const btCollisionObject* body1)
 :btCollisionAlgorithm(ci)
 #ifdef USE_SEPDISTANCE_UTIL
 ,m_sepDistance(body0->getCollisionShape()->getAngularMotionDisc(),body1->getCollisionShape()->getAngularMotionDisc())
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.h
@ -13,14 +13,15 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */

-#ifndef SPU_CONTACTMANIFOLD_COLLISION_ALGORITHM_H
-#define SPU_CONTACTMANIFOLD_COLLISION_ALGORITHM_H
+#ifndef BT_SPU_CONTACTMANIFOLD_COLLISION_ALGORITHM_H
+#define BT_SPU_CONTACTMANIFOLD_COLLISION_ALGORITHM_H

 #include "BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h"
 #include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h"
 #include "BulletCollision/CollisionDispatch/btCollisionCreateFunc.h"
 #include "BulletCollision/BroadphaseCollision/btDispatcher.h"
 #include "LinearMath/btTransformUtil.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"

 class btPersistentManifold;

@ -37,20 +38,20 @@ ATTRIBUTE_ALIGNED16(class) SpuContactManifoldCollisionAlgorithm : public btColli
 	float	m_collisionMargin0;
 	float	m_collisionMargin1;

-	btCollisionObject*	m_collisionObject0;
-	btCollisionObject*	m_collisionObject1;
+	const btCollisionObject*	m_collisionObject0;
+	const btCollisionObject*	m_collisionObject1;
 	
 	

 	
 public:
 	
-	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+	virtual void processCollision (const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);

 	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);

 	
-	SpuContactManifoldCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1);
+	SpuContactManifoldCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,const btCollisionObject* body0,const btCollisionObject* body1);
 #ifdef USE_SEPDISTANCE_UTIL
 	btConvexSeparatingDistanceUtil	m_sepDistance;
 #endif //USE_SEPDISTANCE_UTIL
@ -68,12 +69,12 @@ public:
 		return m_manifoldPtr;
 	}

-	btCollisionObject*	getCollisionObject0()
+	const btCollisionObject*	getCollisionObject0()
 	{
 		return m_collisionObject0;
 	}
 	
-	btCollisionObject*	getCollisionObject1()
+	const btCollisionObject*	getCollisionObject1()
 	{
 		return m_collisionObject1;
 	}
@ -108,13 +109,13 @@ public:

 	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
 	{
-		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
-		{
+        virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, const btCollisionObjectWrapper* body0Wrap,const btCollisionObjectWrapper* body1Wrap)
+ 		{
 			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(SpuContactManifoldCollisionAlgorithm));
-			return new(mem) SpuContactManifoldCollisionAlgorithm(ci,body0,body1);
+			return new(mem) SpuContactManifoldCollisionAlgorithm(ci,body0Wrap->getCollisionObject(),body1Wrap->getCollisionObject());
 		}
 	};

 };

-#endif //SPU_CONTACTMANIFOLD_COLLISION_ALGORITHM_H
+#endif //BT_SPU_CONTACTMANIFOLD_COLLISION_ALGORITHM_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuDoubleBuffer.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuDoubleBuffer.h
@ -1,8 +1,24 @@
-#ifndef DOUBLE_BUFFER_H
-#define DOUBLE_BUFFER_H
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef BT_DOUBLE_BUFFER_H
+#define BT_DOUBLE_BUFFER_H

 #include "SpuFakeDma.h"
-#include <LinearMath/btScalar.h>
+#include "LinearMath/btScalar.h"


 ///DoubleBuffer
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuFakeDma.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuFakeDma.cpp
@ -30,7 +30,7 @@ void*	cellDmaLargeGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag
 	cellDmaLargeGet(ls,ea,size,tag,tid,rid);
 	return ls;
 #else
-	return (void*)(uint32_t)ea;
+	return (void*)(ppu_address_t)ea;
 #endif
 }

@ -40,7 +40,7 @@ void*	cellDmaSmallGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag
 	mfc_get(ls,ea,size,tag,0,0);
 	return ls;
 #else
-	return (void*)(uint32_t)ea;
+	return (void*)(ppu_address_t)ea;
 #endif
 }

@ -53,7 +53,7 @@ void*	cellDmaGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uin
 	cellDmaGet(ls,ea,size,tag,tid,rid);
 	return ls;
 #else
-	return (void*)(uint32_t)ea;
+	return (void*)(ppu_address_t)ea;
 #endif
 }

@ -174,6 +174,9 @@ int	cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid,
 {
 	char* mainMem = (char*)ea;
 	char* localStore = (char*)ls;
+
+//	printf("mainMem=%x, localStore=%x",mainMem,localStore);
+
 #ifdef USE_MEMCPY
 	memcpy(localStore,mainMem,size);
 #else
@ -182,6 +185,7 @@ int	cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid,
 		localStore[i] = mainMem[i];
 	}	
 #endif //#ifdef USE_MEMCPY
+//	printf(" finished\n");
 	return 0;
 }

--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuFakeDma.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuFakeDma.h
@ -13,8 +13,8 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */

-#ifndef FAKE_DMA_H
-#define FAKE_DMA_H
+#ifndef BT_FAKE_DMA_H
+#define BT_FAKE_DMA_H


 #include "PlatformDefinitions.h"
@ -132,4 +132,4 @@ void*	cellDmaGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uin
 void*	cellDmaSmallGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);


-#endif //FAKE_DMA_H
+#endif //BT_FAKE_DMA_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuGatheringCollisionDispatcher.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuGatheringCollisionDispatcher.cpp
@ -23,6 +23,8 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "BulletCollision/CollisionShapes/btCollisionShape.h"
 #include "LinearMath/btQuickprof.h"
+#include "BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"



@ -48,6 +50,7 @@ bool	SpuGatheringCollisionDispatcher::supportsDispatchPairOnSpu(int proxyType0,i
 //		(proxyType0 == CONE_SHAPE_PROXYTYPE) ||
 		(proxyType0 == TRIANGLE_MESH_SHAPE_PROXYTYPE) ||
 		(proxyType0 == CONVEX_HULL_SHAPE_PROXYTYPE)||
+		(proxyType0 == STATIC_PLANE_PROXYTYPE)||
 		(proxyType0 == COMPOUND_SHAPE_PROXYTYPE)
 		);

@ -60,9 +63,11 @@ bool	SpuGatheringCollisionDispatcher::supportsDispatchPairOnSpu(int proxyType0,i
 //		(proxyType1 == CONE_SHAPE_PROXYTYPE) ||
 		(proxyType1 == TRIANGLE_MESH_SHAPE_PROXYTYPE) ||
 		(proxyType1 == CONVEX_HULL_SHAPE_PROXYTYPE) ||
+		(proxyType1 == STATIC_PLANE_PROXYTYPE) ||
 		(proxyType1 == COMPOUND_SHAPE_PROXYTYPE)
 		);

+	
 	return supported0 && supported1;
 }

@ -124,8 +129,33 @@ public:
 				{
 					int	proxyType0 = colObj0->getCollisionShape()->getShapeType();
 					int	proxyType1 = colObj1->getCollisionShape()->getShapeType();
-					if (m_dispatcher->supportsDispatchPairOnSpu(proxyType0,proxyType1))
+					bool supportsSpuDispatch = m_dispatcher->supportsDispatchPairOnSpu(proxyType0,proxyType1) 
+						&& ((colObj0->getCollisionFlags() & btCollisionObject::CF_DISABLE_SPU_COLLISION_PROCESSING) == 0)
+						&& ((colObj1->getCollisionFlags() & btCollisionObject::CF_DISABLE_SPU_COLLISION_PROCESSING) == 0);
+
+					if (proxyType0 == COMPOUND_SHAPE_PROXYTYPE)
 					{
+						btCompoundShape* compound = (btCompoundShape*)colObj0->getCollisionShape();
+						if (compound->getNumChildShapes()>MAX_SPU_COMPOUND_SUBSHAPES)
+						{
+							//printf("PPU fallback, compound->getNumChildShapes(%d)>%d\n",compound->getNumChildShapes(),MAX_SPU_COMPOUND_SUBSHAPES);
+							supportsSpuDispatch = false;
+						}
+					}
+
+					if (proxyType1 == COMPOUND_SHAPE_PROXYTYPE)
+					{
+						btCompoundShape* compound = (btCompoundShape*)colObj1->getCollisionShape();
+						if (compound->getNumChildShapes()>MAX_SPU_COMPOUND_SUBSHAPES)
+						{
+							//printf("PPU fallback, compound->getNumChildShapes(%d)>%d\n",compound->getNumChildShapes(),MAX_SPU_COMPOUND_SUBSHAPES);
+							supportsSpuDispatch = false;
+						}
+					}
+
+					if (supportsSpuDispatch)
+					{
+
 						int so = sizeof(SpuContactManifoldCollisionAlgorithm);
 #ifdef ALLOCATE_SEPARATELY
 						void* mem = btAlignedAlloc(so,16);//m_dispatcher->allocateCollisionAlgorithm(so);
@ -136,7 +166,10 @@ public:
 						collisionPair.m_internalTmpValue =  2;
 					} else
 					{
-						collisionPair.m_algorithm = m_dispatcher->findAlgorithm(colObj0,colObj1);
+						btCollisionObjectWrapper ob0(0,colObj0->getCollisionShape(),colObj0,colObj0->getWorldTransform());
+						btCollisionObjectWrapper ob1(0,colObj1->getCollisionShape(),colObj1,colObj1->getWorldTransform());
+
+						collisionPair.m_algorithm = m_dispatcher->findAlgorithm(&ob0,&ob1);
 						collisionPair.m_internalTmpValue = 3;
 					}
 				} 
@ -175,48 +208,60 @@ void	SpuGatheringCollisionDispatcher::dispatchAllCollisionPairs(btOverlappingPai

 		//send one big batch
 		int numTotalPairs = pairCache->getNumOverlappingPairs();
-		btBroadphasePair* pairPtr = pairCache->getOverlappingPairArrayPtr();
-		int i;
+		if (numTotalPairs)
 		{
-			BT_PROFILE("addWorkToTask");
-			for (i=0;i<numTotalPairs;)
+			btBroadphasePair* pairPtr = pairCache->getOverlappingPairArrayPtr();
+			int i;
 			{
-				//Performance Hint: tweak this number during benchmarking
-				static const int pairRange = SPU_BATCHSIZE_BROADPHASE_PAIRS;
-				int endIndex = (i+pairRange) < numTotalPairs ? i+pairRange : numTotalPairs;
-				m_spuCollisionTaskProcess->addWorkToTask(pairPtr,i,endIndex);
-				i = endIndex;
-			}
-		}
-
-		{
-			BT_PROFILE("PPU fallback");
-			//handle PPU fallback pairs
-			for (i=0;i<numTotalPairs;i++)
-			{
-				btBroadphasePair& collisionPair = pairPtr[i];
-				if (collisionPair.m_internalTmpValue == 3)
+				int pairRange =	SPU_BATCHSIZE_BROADPHASE_PAIRS;
+				if (numTotalPairs < (m_spuCollisionTaskProcess->getNumTasks()*SPU_BATCHSIZE_BROADPHASE_PAIRS))
 				{
-					if (collisionPair.m_algorithm)
+					pairRange = (numTotalPairs/m_spuCollisionTaskProcess->getNumTasks())+1;
+				}
+	
+				BT_PROFILE("addWorkToTask");
+				for (i=0;i<numTotalPairs;)
+				{
+					//Performance Hint: tweak this number during benchmarking
+					
+					int endIndex = (i+pairRange) < numTotalPairs ? i+pairRange : numTotalPairs;
+					m_spuCollisionTaskProcess->addWorkToTask(pairPtr,i,endIndex);
+					i = endIndex;
+				}
+			}
+			{
+				BT_PROFILE("PPU fallback");
+				//handle PPU fallback pairs
+				for (i=0;i<numTotalPairs;i++)
+				{
+					btBroadphasePair& collisionPair = pairPtr[i];
+					if (collisionPair.m_internalTmpValue == 3)
 					{
-						btCollisionObject* colObj0 = (btCollisionObject*)collisionPair.m_pProxy0->m_clientObject;
-						btCollisionObject* colObj1 = (btCollisionObject*)collisionPair.m_pProxy1->m_clientObject;
-
-						if (dispatcher->needsCollision(colObj0,colObj1))
+						if (collisionPair.m_algorithm)
 						{
-							btManifoldResult contactPointResult(colObj0,colObj1);
-							
-							if (dispatchInfo.m_dispatchFunc == 		btDispatcherInfo::DISPATCH_DISCRETE)
+							btCollisionObject* colObj0 = (btCollisionObject*)collisionPair.m_pProxy0->m_clientObject;
+							btCollisionObject* colObj1 = (btCollisionObject*)collisionPair.m_pProxy1->m_clientObject;
+	
+							if (dispatcher->needsCollision(colObj0,colObj1))
 							{
-								//discrete collision detection query
-								collisionPair.m_algorithm->processCollision(colObj0,colObj1,dispatchInfo,&contactPointResult);
-							} else
-							{
-								//continuous collision detection query, time of impact (toi)
-								btScalar toi = collisionPair.m_algorithm->calculateTimeOfImpact(colObj0,colObj1,dispatchInfo,&contactPointResult);
-								if (dispatchInfo.m_timeOfImpact > toi)
-									dispatchInfo.m_timeOfImpact = toi;
+							//discrete collision detection query
+								btCollisionObjectWrapper ob0(0,colObj0->getCollisionShape(),colObj0,colObj0->getWorldTransform());
+								btCollisionObjectWrapper ob1(0,colObj1->getCollisionShape(),colObj1,colObj1->getWorldTransform());

+								btManifoldResult contactPointResult(&ob0,&ob1);
+								
+								if (dispatchInfo.m_dispatchFunc == 		btDispatcherInfo::DISPATCH_DISCRETE)
+								{
+									
+									collisionPair.m_algorithm->processCollision(&ob0,&ob1,dispatchInfo,&contactPointResult);
+								} else
+								{
+									//continuous collision detection query, time of impact (toi)
+									btScalar toi = collisionPair.m_algorithm->calculateTimeOfImpact(colObj0,colObj1,dispatchInfo,&contactPointResult);
+									if (dispatchInfo.m_timeOfImpact > toi)
+										dispatchInfo.m_timeOfImpact = toi;
+	
+								}
 							}
 						}
 					}
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuGatheringCollisionDispatcher.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuGatheringCollisionDispatcher.h
@ -12,8 +12,8 @@ subject to the following restrictions:
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
-#ifndef SPU_GATHERING_COLLISION__DISPATCHER_H
-#define SPU_GATHERING_COLLISION__DISPATCHER_H
+#ifndef BT_SPU_GATHERING_COLLISION__DISPATCHER_H
+#define BT_SPU_GATHERING_COLLISION__DISPATCHER_H

 #include "BulletCollision/CollisionDispatch/btCollisionDispatcher.h"

@ -23,9 +23,10 @@ subject to the following restrictions:
 ///Too big value might render some SPUs are idle, while a few other SPUs are doing all work.
 //#define SPU_BATCHSIZE_BROADPHASE_PAIRS 8
 //#define SPU_BATCHSIZE_BROADPHASE_PAIRS 16
-#define SPU_BATCHSIZE_BROADPHASE_PAIRS 64
-//#define SPU_BATCHSIZE_BROADPHASE_PAIRS 128
+//#define SPU_BATCHSIZE_BROADPHASE_PAIRS 64
+#define SPU_BATCHSIZE_BROADPHASE_PAIRS 128
 //#define SPU_BATCHSIZE_BROADPHASE_PAIRS 256
+//#define SPU_BATCHSIZE_BROADPHASE_PAIRS 512
 //#define SPU_BATCHSIZE_BROADPHASE_PAIRS 1024


@ -66,4 +67,6 @@ public:



-#endif //SPU_GATHERING_COLLISION__DISPATCHER_H
+#endif //BT_SPU_GATHERING_COLLISION__DISPATCHER_H
+
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuLibspe2Support.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuLibspe2Support.h
@ -14,8 +14,8 @@ subject to the following restrictions:
 */


-#ifndef SPU_LIBSPE2_SUPPORT_H
-#define SPU_LIBSPE2_SUPPORT_H
+#ifndef BT_SPU_LIBSPE2_SUPPORT_H
+#define BT_SPU_LIBSPE2_SUPPORT_H

 #include <LinearMath/btScalar.h> //for uint32_t etc.

@ -173,7 +173,7 @@ private:

 #endif //USE_LIBSPE2

-#endif //SPU_LIBSPE2_SUPPORT_H
+#endif //BT_SPU_LIBSPE2_SUPPORT_H



--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h
@ -24,11 +24,11 @@ subject to the following restrictions:

 #include <math.h>

-//#include "BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h"
-#include <vectormath_aos.h>
+
+#include "../PlatformDefinitions.h"
+


-using namespace Vectormath::Aos;

 enum FeatureType { F, E, V };

@ -39,21 +39,21 @@ enum FeatureType { F, E, V };
 class Box
 {
 public:
-	Vector3 half;
+	vmVector3 mHalf;

 	inline Box()
 	{}
-	inline Box(PE_REF(Vector3) half_);
+	inline Box(PE_REF(vmVector3) half_);
 	inline Box(float hx, float hy, float hz);

-	inline void Set(PE_REF(Vector3) half_);
+	inline void Set(PE_REF(vmVector3) half_);
 	inline void Set(float hx, float hy, float hz);

-	inline Vector3 GetAABB(const Matrix3& rotation) const;
+	inline vmVector3 GetAABB(const vmMatrix3& rotation) const;
 };

 inline
-Box::Box(PE_REF(Vector3) half_)
+Box::Box(PE_REF(vmVector3) half_)
 {
 	Set(half_);
 }
@ -66,23 +66,23 @@ Box::Box(float hx, float hy, float hz)

 inline
 void
-Box::Set(PE_REF(Vector3) half_)
+Box::Set(PE_REF(vmVector3) half_)
 {
-	half = half_;
+	mHalf = half_;
 }

 inline
 void
 Box::Set(float hx, float hy, float hz)
 {
-	half = Vector3(hx, hy, hz);
+	mHalf = vmVector3(hx, hy, hz);
 }

 inline
-Vector3
-Box::GetAABB(const Matrix3& rotation) const
+vmVector3
+Box::GetAABB(const vmMatrix3& rotation) const
 {
-	return absPerElem(rotation) * half;
+	return absPerElem(rotation) * mHalf;
 }

 //-------------------------------------------------------------------------------------------------
@ -95,7 +95,7 @@ class BoxPoint
 public:
 	BoxPoint() : localPoint(0.0f) {}

-	Point3      localPoint;
+	vmPoint3      localPoint;
 	FeatureType featureType;
 	int         featureIdx;

--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
@ -44,7 +44,7 @@ void computeAabb (btVector3& aabbMin, btVector3& aabbMax, btConvexInternalShape*
 		const btTransform& t = xform;
 		btMatrix3x3 abs_b = t.getBasis().absolute();  
 		btVector3 center = t.getOrigin();
-		btVector3 extent = btVector3(abs_b[0].dot(halfExtents),abs_b[1].dot(halfExtents),abs_b[2].dot(halfExtents));
+        btVector3 extent = halfExtents.dot3( abs_b[0], abs_b[1], abs_b[2] );
 		
 		aabbMin = center - extent;
 		aabbMax = center + extent;
@ -67,7 +67,7 @@ void computeAabb (btVector3& aabbMin, btVector3& aabbMax, btConvexInternalShape*
 		const btTransform& t = xform;
 		btMatrix3x3 abs_b = t.getBasis().absolute();  
 		btVector3 center = t.getOrigin();
-		btVector3 extent = btVector3(abs_b[0].dot(halfExtents),abs_b[1].dot(halfExtents),abs_b[2].dot(halfExtents));
+        btVector3 extent = halfExtents.dot3( abs_b[0], abs_b[1], abs_b[2] );
 		
 		aabbMin = center - extent;
 		aabbMax = center + extent;
@ -198,6 +198,12 @@ int		getShapeTypeSize(int shapeType)
 			btAssert(shapeSize < MAX_SHAPE_SIZE);
 			return shapeSize;
 		}
+	case STATIC_PLANE_PROXYTYPE:
+		{
+			int shapeSize = sizeof(btStaticPlaneShape);
+			btAssert(shapeSize < MAX_SHAPE_SIZE);
+			return shapeSize;
+		}

 	default:
 		btAssert(0);
@ -225,6 +231,7 @@ void dmaCollisionShape (void* collisionShapeLocation, ppu_address_t collisionSha
 {
 	register int dmaSize = getShapeTypeSize(shapeType);
 	cellDmaGet(collisionShapeLocation, collisionShapePtr  , dmaSize, DMA_TAG(dmaTag), 0, 0);
+	//cellDmaGetReadOnly(collisionShapeLocation, collisionShapePtr  , dmaSize, DMA_TAG(dmaTag), 0, 0);
 	//cellDmaWaitTagStatusAll(DMA_MASK(dmaTag));
 }

--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h
@ -20,6 +20,7 @@ subject to the following restrictions:
 #include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h"
 #include "BulletCollision/CollisionShapes/btConvexInternalShape.h"
 #include "BulletCollision/CollisionShapes/btCylinderShape.h"
+#include "BulletCollision/CollisionShapes/btStaticPlaneShape.h"

 #include "BulletCollision/CollisionShapes/btOptimizedBvh.h"
 #include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
@ -32,7 +33,9 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionShapes/btConvexHullShape.h"
 #include "BulletCollision/CollisionShapes/btCompoundShape.h"

-#define MAX_NUM_SPU_CONVEX_POINTS 128
+#define MAX_NUM_SPU_CONVEX_POINTS 128 //@fallback to PPU if a btConvexHullShape has more than MAX_NUM_SPU_CONVEX_POINTS points
+#define MAX_SPU_COMPOUND_SUBSHAPES 16 //@fallback on PPU if compound has more than MAX_SPU_COMPOUND_SUBSHAPES child shapes
+#define MAX_SHAPE_SIZE 256 //@todo: assert on this

 ATTRIBUTE_ALIGNED16(struct)	SpuConvexPolyhedronVertexData
 {
@ -43,7 +46,7 @@ ATTRIBUTE_ALIGNED16(struct)	SpuConvexPolyhedronVertexData
 	ATTRIBUTE_ALIGNED16(btVector3 g_convexPointBuffer[MAX_NUM_SPU_CONVEX_POINTS]);
 };

-#define MAX_SHAPE_SIZE 256
+

 ATTRIBUTE_ALIGNED16(struct) CollisionShape_LocalStoreMemory
 {
@ -53,7 +56,7 @@ ATTRIBUTE_ALIGNED16(struct) CollisionShape_LocalStoreMemory
 ATTRIBUTE_ALIGNED16(struct) CompoundShape_LocalStoreMemory
 {
 	// Compound data
-#define MAX_SPU_COMPOUND_SUBSHAPES 16
+
 	ATTRIBUTE_ALIGNED16(btCompoundShapeChild gSubshapes[MAX_SPU_COMPOUND_SUBSHAPES]);
 	ATTRIBUTE_ALIGNED16(char gSubshapeShape[MAX_SPU_COMPOUND_SUBSHAPES][MAX_SHAPE_SIZE]);
 };
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
@ -17,6 +17,12 @@ subject to the following restrictions:

 //#define DEBUG_SPU_COLLISION_DETECTION 1

+#ifdef DEBUG_SPU_COLLISION_DETECTION // debug tracing only; the #define just above is commented out, so this is inactive unless it is enabled
+#ifndef __SPU__ // when __SPU__ is not defined, provide spu_printf by mapping it onto stdio's printf
+#include <stdio.h>
+#define spu_printf printf
+#endif // !__SPU__
+#endif //DEBUG_SPU_COLLISION_DETECTION

 SpuContactResult::SpuContactResult()
 {
@ -99,50 +105,50 @@ bool ManifoldResultAddContactPoint(const btVector3& normalOnBInWorld,
 	if (depth > manifoldPtr->getContactBreakingThreshold())
 		return false;

-	//provide inverses or just calculate?
-	btTransform transAInv = transA.inverse();//m_body0->m_cachedInvertedWorldTransform;
-	btTransform transBInv= transB.inverse();//m_body1->m_cachedInvertedWorldTransform;
+	//if (depth > manifoldPtr->getContactProcessingThreshold())
+	//	return false;
+
+

 	btVector3 pointA;
 	btVector3 localA;
 	btVector3 localB;
 	btVector3 normal;

+
 	if (isSwapped)
 	{
 		normal = normalOnBInWorld * -1;
 		pointA = pointInWorld + normal * depth;
-		localA = transAInv(pointA );
-		localB = transBInv(pointInWorld);
-		/*localA = transBInv(pointA );
-		localB = transAInv(pointInWorld);*/
+		localA = transA.invXform(pointA );
+		localB = transB.invXform(pointInWorld);
 	}
 	else
 	{
 		normal = normalOnBInWorld;
 		pointA = pointInWorld + normal * depth;
-		localA = transAInv(pointA );
-		localB = transBInv(pointInWorld);
+		localA = transA.invXform(pointA );
+		localB = transB.invXform(pointInWorld);
 	}

 	btManifoldPoint newPt(localA,localB,normal,depth);
+	newPt.m_positionWorldOnA = pointA;
+	newPt.m_positionWorldOnB = pointInWorld;
+
+	newPt.m_combinedFriction = combinedFriction;
+	newPt.m_combinedRestitution = combinedRestitution;
+

 	int insertIndex = manifoldPtr->getCacheEntry(newPt);
 	if (insertIndex >= 0)
 	{
-//		manifoldPtr->replaceContactPoint(newPt,insertIndex);
-//		return true;
-
-#ifdef DEBUG_SPU_COLLISION_DETECTION
-		spu_printf("SPU: same contact detected, nothing done\n");
-#endif //DEBUG_SPU_COLLISION_DETECTION
-		// This is not needed, just use the old info! saves a DMA transfer as well
+		// we need to replace the current contact point, otherwise small errors will accumulate (spheres start rolling etc)
+		manifoldPtr->replaceContactPoint(newPt,insertIndex);
+		return true;
+		
 	} else
 	{

-		newPt.m_combinedFriction = combinedFriction;
-		newPt.m_combinedRestitution = combinedRestitution;
-
 		/*
 		///@todo: SPU callbacks, either immediate (local on the SPU), or deferred
 		//User can override friction and/or restitution
@ -155,6 +161,7 @@ bool ManifoldResultAddContactPoint(const btVector3& normalOnBInWorld,
 			(*gContactAddedCallback)(newPt,m_body0,m_partId0,m_index0,m_body1,m_partId1,m_index1);
 		}
 		*/
+
 		manifoldPtr->addManifoldPoint(newPt);
 		return true;

@ -181,7 +188,12 @@ void SpuContactResult::writeDoubleBufferedManifold(btPersistentManifold* lsManif

 void SpuContactResult::addContactPoint(const btVector3& normalOnBInWorld,const btVector3& pointInWorld,btScalar depth)
 {
-	//spu_printf("*** SpuContactResult::addContactPoint: depth = %f\n",depth);
+#ifdef DEBUG_SPU_COLLISION_DETECTION
+	spu_printf("*** SpuContactResult::addContactPoint: depth = %f\n",depth);
+	spu_printf("*** normal = %f,%f,%f\n",normalOnBInWorld.getX(),normalOnBInWorld.getY(),normalOnBInWorld.getZ());
+	spu_printf("*** position = %f,%f,%f\n",pointInWorld.getX(),pointInWorld.getY(),pointInWorld.getZ());
+#endif //DEBUG_SPU_COLLISION_DETECTION
+	

 #ifdef DEBUG_SPU_COLLISION_DETECTION
 //   int sman = sizeof(rage::phManifold);
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.h
@ -17,7 +17,7 @@ subject to the following restrictions:
 #define SPU_CONTACT_RESULT2_H


-#ifndef WIN32
+#ifndef _WIN32
 #include <stdint.h>
 #endif

--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h
@ -24,7 +24,7 @@ class btStackAlloc;
 class btIDebugDraw;
 #include "BulletCollision/NarrowphaseCollision/btConvexPenetrationDepthSolver.h"

-#include <LinearMath/btTransform.h>
+#include "LinearMath/btTransform.h"


 ///ConvexPenetrationDepthSolver provides an interface for penetration depth calculation.
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
@ -52,7 +52,7 @@ subject to the following restrictions:
 #ifdef __SPU__
 ///Software caching from the IBM Cell SDK, it reduces 25% SPU time for our test cases
 #ifndef USE_LIBSPE2
-#define USE_SOFTWARE_CACHE 1
+//#define USE_SOFTWARE_CACHE 1
 #endif
 #endif //__SPU__

@ -190,10 +190,27 @@ void* createCollisionLocalStoreMemory()
 {
 	return &gLocalStoreMemory;
 }
+void deleteCollisionLocalStoreMemory() // intentionally empty: in this variant createCollisionLocalStoreMemory() returns &gLocalStoreMemory, so nothing was allocated with new
+{
+}
 #else
+
+btAlignedObjectArray<CollisionTask_LocalStoreMemory*> sLocalStorePointers; // registry of every local store created below, kept so deleteCollisionLocalStoreMemory() can free them
+// Allocates one CollisionTask_LocalStoreMemory with new, records the pointer in sLocalStorePointers, and returns it as void*.
 void* createCollisionLocalStoreMemory()
 {
-        return new CollisionTask_LocalStoreMemory;
+    CollisionTask_LocalStoreMemory* localStore = new CollisionTask_LocalStoreMemory;
+    sLocalStorePointers.push_back(localStore); // remember the allocation; the previous version returned it without tracking it
+    return localStore;
+}
+
+void deleteCollisionLocalStoreMemory() // deletes every local store recorded in sLocalStorePointers, then empties that registry
+{
+    for (int i=0;i<sLocalStorePointers.size();i++)
+    {
+        delete sLocalStorePointers[i]; // pairs with the new in createCollisionLocalStoreMemory()
+    }
+    sLocalStorePointers.clear(); // forget the freed pointers (assuming clear() empties the array, a repeated call then deletes nothing)
 }

 #endif
@ -372,6 +389,151 @@ public:
 };


+/// Tests the shape in wuInput->m_spuCollisionShapes[0] (used as a convex shape) against the one in slot 1 (used as a static plane) and adds at most one contact point to spuContacts.
+void btConvexPlaneCollideSingleContact (SpuCollisionPairInput* wuInput,CollisionTask_LocalStoreMemory* lsMemPtr,SpuContactResult&  spuContacts)
+{
+	// the casts below are unchecked: the caller is responsible for slot 0 being convex and slot 1 being the plane
+	btConvexShape* convexShape = (btConvexShape*) wuInput->m_spuCollisionShapes[0];
+	btStaticPlaneShape* planeShape = (btStaticPlaneShape*) wuInput->m_spuCollisionShapes[1];
+
+    bool hasCollision = false;
+	const btVector3& planeNormal = planeShape->getPlaneNormal();
+	const btScalar& planeConstant = planeShape->getPlaneConstant();
+	
+	// convexInPlaneTrans maps convex-local coordinates into the plane's local frame; planeInConvex maps the other way
+	btTransform convexWorldTransform = wuInput->m_worldTransform0;
+	btTransform convexInPlaneTrans;
+	convexInPlaneTrans= wuInput->m_worldTransform1.inverse() * convexWorldTransform;
+	btTransform planeInConvex;
+	planeInConvex= convexWorldTransform.inverse() * wuInput->m_worldTransform1;
+	// query the convex shape along the negated plane normal, rotated into convex-local space; presumably this returns the support point including margin (cf. the commented-out WithoutMargin variant) - TODO confirm
+	//btVector3 vtx = convexShape->localGetSupportVertexWithoutMarginNonVirtual(planeInConvex.getBasis()*-planeNormal);
+	btVector3 vtx = convexShape->localGetSupportVertexNonVirtual(planeInConvex.getBasis()*-planeNormal);
+	// offset of that vertex from the plane, evaluated in plane-local coordinates: dot(normal, point) - planeConstant
+	btVector3 vtxInPlane = convexInPlaneTrans(vtx);
+	btScalar distance = (planeNormal.dot(vtxInPlane) - planeConstant);
+	// slide the vertex along the normal by that offset, then take the result to world space with the plane's transform
+	btVector3 vtxInPlaneProjected = vtxInPlane - distance*planeNormal;
+	btVector3 vtxInPlaneWorld = wuInput->m_worldTransform1 * vtxInPlaneProjected;
+	// a contact is reported only when the offset is below the contact breaking threshold of the manifold held in lsMemPtr
+	hasCollision = distance < lsMemPtr->getContactManifoldPtr()->getContactBreakingThreshold();
+	//resultOut->setPersistentManifold(m_manifoldPtr);
+	if (hasCollision)
+	{
+		/// report a contact. internally this will be kept persistent, and contact reduction is done
+		btVector3 normalOnSurfaceB =wuInput->m_worldTransform1.getBasis() * planeNormal; // plane normal rotated by the plane's world basis
+		btVector3 pOnB = vtxInPlaneWorld;
+		spuContacts.addContactPoint(normalOnSurfaceB,pOnB,distance);
+	}
+}
+/// Convex-vs-static-plane pair: stages convex hull data for any side typed as a convex hull, binds spuContacts to the manifold and both collision objects, then runs btConvexPlaneCollideSingleContact.
+void	ProcessConvexPlaneSpuCollision(SpuCollisionPairInput* wuInput, CollisionTask_LocalStoreMemory* lsMemPtr, SpuContactResult& spuContacts)
+{
+		// scratch values reused for each cellDmaGet issued below
+		register	int dmaSize = 0;
+		register ppu_address_t	dmaPpuAddress2;
+		btPersistentManifold* manifold = (btPersistentManifold*)wuInput->m_persistentManifoldPtr;
+
+		///DMA in the vertices for convex shapes
+		ATTRIBUTE_ALIGNED16(char convexHullShape0[sizeof(btConvexHullShape)]);
+		ATTRIBUTE_ALIGNED16(char convexHullShape1[sizeof(btConvexHullShape)]);
+		// step 1: for each side typed as a convex hull, start fetching the btConvexHullShape object itself on DMA tag 1 (no wait yet)
+		if ( btLikely( wuInput->m_shapeType0== CONVEX_HULL_SHAPE_PROXYTYPE ) )
+		{
+			//	spu_printf("SPU: DMA btConvexHullShape\n");
+			
+			dmaSize = sizeof(btConvexHullShape);
+			dmaPpuAddress2 = wuInput->m_collisionShapes[0];
+
+			cellDmaGet(&convexHullShape0, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
+			//cellDmaWaitTagStatusAll(DMA_MASK(1));
+		}
+
+		if ( btLikely( wuInput->m_shapeType1 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
+		{
+			//	spu_printf("SPU: DMA btConvexHullShape\n");
+			dmaSize = sizeof(btConvexHullShape);
+			dmaPpuAddress2 = wuInput->m_collisionShapes[1];
+			cellDmaGet(&convexHullShape1, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
+			//cellDmaWaitTagStatusAll(DMA_MASK(1));
+		}
+		// step 2: wait for tag 1, then pass the fetched hull to dmaConvexVertexData together with the matching convexVertexData slot
+		if ( btLikely( wuInput->m_shapeType0 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
+		{		
+			cellDmaWaitTagStatusAll(DMA_MASK(1));
+			dmaConvexVertexData (&lsMemPtr->convexVertexData[0], (btConvexHullShape*)&convexHullShape0);
+			lsMemPtr->convexVertexData[0].gSpuConvexShapePtr = wuInput->m_spuCollisionShapes[0];
+		}
+
+			
+		if ( btLikely( wuInput->m_shapeType1 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
+		{
+			cellDmaWaitTagStatusAll(DMA_MASK(1));
+			dmaConvexVertexData (&lsMemPtr->convexVertexData[1], (btConvexHullShape*)&convexHullShape1);
+			lsMemPtr->convexVertexData[1].gSpuConvexShapePtr = wuInput->m_spuCollisionShapes[1];
+		}
+
+		// step 3: point-cloud stand-ins for the hull sides. NOTE(review): they are locals, yet m_spuCollisionShapes[] is left pointing at them on return - confirm no caller reads those slots afterwards
+		btConvexPointCloudShape cpc0,cpc1;
+
+		if ( btLikely( wuInput->m_shapeType0 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
+		{
+			cellDmaWaitTagStatusAll(DMA_MASK(2)); // no cellDmaGet in this function uses tag 2; presumably the transfer is started inside dmaConvexVertexData - TODO confirm
+			lsMemPtr->convexVertexData[0].gConvexPoints = &lsMemPtr->convexVertexData[0].g_convexPointBuffer[0];
+			btConvexHullShape* ch = (btConvexHullShape*)wuInput->m_spuCollisionShapes[0];
+			const btVector3& localScaling = ch->getLocalScalingNV();
+			cpc0.setPoints(lsMemPtr->convexVertexData[0].gConvexPoints,lsMemPtr->convexVertexData[0].gNumConvexPoints,false,localScaling);
+			wuInput->m_spuCollisionShapes[0] = &cpc0; // from here on slot 0 refers to the point cloud instead of the hull
+		}
+
+		if ( btLikely( wuInput->m_shapeType1 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
+		{
+			cellDmaWaitTagStatusAll(DMA_MASK(2));		
+			lsMemPtr->convexVertexData[1].gConvexPoints = &lsMemPtr->convexVertexData[1].g_convexPointBuffer[0];
+			btConvexHullShape* ch = (btConvexHullShape*)wuInput->m_spuCollisionShapes[1];
+			const btVector3& localScaling = ch->getLocalScalingNV();
+			cpc1.setPoints(lsMemPtr->convexVertexData[1].gConvexPoints,lsMemPtr->convexVertexData[1].gNumConvexPoints,false,localScaling);
+			wuInput->m_spuCollisionShapes[1] = &cpc1;
+
+		}
+
+
+//		const btConvexShape* shape0Ptr = (const btConvexShape*)wuInput->m_spuCollisionShapes[0];
+//		const btConvexShape* shape1Ptr = (const btConvexShape*)wuInput->m_spuCollisionShapes[1];
+//		int shapeType0 = wuInput->m_shapeType0;
+//		int shapeType1 = wuInput->m_shapeType1;
+		float marginA = wuInput->m_collisionMargin0;
+		float marginB = wuInput->m_collisionMargin1;
+		// NOTE(review): cpInput (and the margins feeding it) is filled in below but never read again in this function - confirm whether it is still needed
+		SpuClosestPointInput	cpInput;
+		cpInput.m_convexVertexData[0] = &lsMemPtr->convexVertexData[0];
+		cpInput.m_convexVertexData[1] = &lsMemPtr->convexVertexData[1];
+		cpInput.m_transformA = wuInput->m_worldTransform0;
+		cpInput.m_transformB = wuInput->m_worldTransform1;
+		float sumMargin = (marginA+marginB+lsMemPtr->getContactManifoldPtr()->getContactBreakingThreshold());
+		cpInput.m_maximumDistanceSquared = sumMargin * sumMargin;
+
+		ppu_address_t manifoldAddress = (ppu_address_t)manifold;
+		// bind the contact collector to the manifold held in lsMemPtr, the manifold's address, and both objects' transform, restitution and friction
+		btPersistentManifold* spuManifold=lsMemPtr->getContactManifoldPtr();
+		//spuContacts.setContactInfo(spuManifold,manifoldAddress,wuInput->m_worldTransform0,wuInput->m_worldTransform1,wuInput->m_isSwapped);
+		spuContacts.setContactInfo(spuManifold,manifoldAddress,lsMemPtr->getColObj0()->getWorldTransform(),
+			lsMemPtr->getColObj1()->getWorldTransform(),
+			lsMemPtr->getColObj0()->getRestitution(),lsMemPtr->getColObj1()->getRestitution(),
+			lsMemPtr->getColObj0()->getFriction(),lsMemPtr->getColObj1()->getFriction(),
+			wuInput->m_isSwapped);
+
+		// the actual convex/plane test; it may add one contact point to spuContacts
+		btConvexPlaneCollideSingleContact(wuInput,lsMemPtr,spuContacts);
+
+
+		
+	
+}
+
+
+
+
 ////////////////////////
 /// Convex versus Concave triangle mesh collision detection (handles concave triangle mesh versus sphere, box, cylinder, triangle, cone, convex polyhedron etc)
 ///////////////////
@ -476,8 +638,9 @@ void	ProcessConvexConcaveSpuCollision(SpuCollisionPairInput* wuInput, CollisionT
 }


-int stats[11]={0,0,0,0,0,0,0,0,0,0,0};
-int degenerateStats[11]={0,0,0,0,0,0,0,0,0,0,0};
+#define MAX_DEGENERATE_STATS 15 // element count of stats[] and degenerateStats[] below
+int stats[MAX_DEGENERATE_STATS]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; // incremented at index gjk.m_lastUsedMethod in ProcessSpuConvexConvexCollision
+int degenerateStats[MAX_DEGENERATE_STATS]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; // incremented at index gjk.m_degenerateSimplex in ProcessSpuConvexConvexCollision


 ////////////////////////
@ -613,8 +776,10 @@ void	ProcessSpuConvexConvexCollision(SpuCollisionPairInput* wuInput, CollisionTa
 		{
 			btGjkPairDetector gjk(shape0Ptr,shape1Ptr,shapeType0,shapeType1,marginA,marginB,&simplexSolver,penetrationSolver);//&vsSolver,penetrationSolver);
 			gjk.getClosestPoints(cpInput,spuContacts,0);//,debugDraw);
-			
+
+			btAssert(gjk.m_lastUsedMethod <MAX_DEGENERATE_STATS);
 			stats[gjk.m_lastUsedMethod]++;
+			btAssert(gjk.m_degenerateSimplex <MAX_DEGENERATE_STATS);
 			degenerateStats[gjk.m_degenerateSimplex]++;

 #ifdef USE_SEPDISTANCE_UTIL			
@ -719,16 +884,21 @@ void	handleCollisionPair(SpuCollisionPairInput& collisionPairInput, CollisionTas
 		cellDmaWaitTagStatusAll(DMA_MASK(1));

 		int childShapeCount0 = spuCompoundShape0->getNumChildShapes();
+		btAssert(childShapeCount0< MAX_SPU_COMPOUND_SUBSHAPES);
 		int childShapeCount1 = spuCompoundShape1->getNumChildShapes();
+		btAssert(childShapeCount1< MAX_SPU_COMPOUND_SUBSHAPES);

 		// Start the N^2
 		for (int i = 0; i < childShapeCount0; ++i)
 		{
 			btCompoundShapeChild& childShape0 = lsMem.compoundShapeData[0].gSubshapes[i];
+			btAssert(!btBroadphaseProxy::isCompound(childShape0.m_childShapeType));

 			for (int j = 0; j < childShapeCount1; ++j)
 			{
 				btCompoundShapeChild& childShape1 = lsMem.compoundShapeData[1].gSubshapes[j];
+				btAssert(!btBroadphaseProxy::isCompound(childShape1.m_childShapeType));
+

 				/* Create a new collision pair input struct using the two child shapes */
 				SpuCollisionPairInput cinput (collisionPairInput);
@ -741,9 +911,10 @@ void	handleCollisionPair(SpuCollisionPairInput& collisionPairInput, CollisionTas
 				cinput.m_shapeType1 = childShape1.m_childShapeType;
 				cinput.m_collisionMargin1 = childShape1.m_childMargin;
 				/* Recursively call handleCollisionPair () with new collision pair input */
+				
 				handleCollisionPair(cinput, lsMem, spuContacts,			
 					(ppu_address_t)childShape0.m_childShape, lsMem.compoundShapeData[0].gSubshapeShape[i], 
-					(ppu_address_t)childShape1.m_childShape, lsMem.compoundShapeData[1].gSubshapeShape[j], false); // bug fix: changed index to j.
+					(ppu_address_t)childShape1.m_childShape, lsMem.compoundShapeData[1].gSubshapeShape[j], false);
 			}
 		}
 	}
@ -761,11 +932,12 @@ void	handleCollisionPair(SpuCollisionPairInput& collisionPairInput, CollisionTas
 		cellDmaWaitTagStatusAll(DMA_MASK(1));

 		int childShapeCount = spuCompoundShape->getNumChildShapes();
+		btAssert(childShapeCount< MAX_SPU_COMPOUND_SUBSHAPES);

 		for (int i = 0; i < childShapeCount; ++i)
 		{
 			btCompoundShapeChild& childShape = lsMem.compoundShapeData[0].gSubshapes[i];
-
+			btAssert(!btBroadphaseProxy::isCompound(childShape.m_childShapeType));
 			// Dma the child shape
 			dmaCollisionShape (&lsMem.compoundShapeData[0].gSubshapeShape[i], (ppu_address_t)childShape.m_childShape, 1, childShape.m_childShapeType);
 			cellDmaWaitTagStatusAll(DMA_MASK(1));
@ -793,10 +965,13 @@ void	handleCollisionPair(SpuCollisionPairInput& collisionPairInput, CollisionTas
 		cellDmaWaitTagStatusAll(DMA_MASK(1));
 		
 		int childShapeCount = spuCompoundShape->getNumChildShapes();
+		btAssert(childShapeCount< MAX_SPU_COMPOUND_SUBSHAPES);
+

 		for (int i = 0; i < childShapeCount; ++i)
 		{
 			btCompoundShapeChild& childShape = lsMem.compoundShapeData[0].gSubshapes[i];
+			btAssert(!btBroadphaseProxy::isCompound(childShape.m_childShapeType));
 			// Dma the child shape
 			dmaCollisionShape (&lsMem.compoundShapeData[0].gSubshapeShape[i], (ppu_address_t)childShape.m_childShape, 1, childShape.m_childShapeType);
 			cellDmaWaitTagStatusAll(DMA_MASK(1));
@ -845,17 +1020,33 @@ void	handleCollisionPair(SpuCollisionPairInput& collisionPairInput, CollisionTas
 				cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
 			}
 			
-			btConvexInternalShape* spuConvexShape0 = (btConvexInternalShape*)collisionShape0Loc;
-			btBvhTriangleMeshShape* trimeshShape = (btBvhTriangleMeshShape*)collisionShape1Loc;
+			if (collisionPairInput.m_shapeType1 == STATIC_PLANE_PROXYTYPE)
+			{
+				btConvexInternalShape* spuConvexShape0 = (btConvexInternalShape*)collisionShape0Loc;
+				btStaticPlaneShape* planeShape= (btStaticPlaneShape*)collisionShape1Loc;

-			btVector3 dim0 = spuConvexShape0->getImplicitShapeDimensions();
-			collisionPairInput.m_primitiveDimensions0 = dim0;
-			collisionPairInput.m_collisionShapes[0] = collisionShape0Ptr;
-			collisionPairInput.m_collisionShapes[1] = collisionShape1Ptr;
-			collisionPairInput.m_spuCollisionShapes[0] = spuConvexShape0;
-			collisionPairInput.m_spuCollisionShapes[1] = trimeshShape;
+				btVector3 dim0 = spuConvexShape0->getImplicitShapeDimensions();
+				collisionPairInput.m_primitiveDimensions0 = dim0;
+				collisionPairInput.m_collisionShapes[0] = collisionShape0Ptr;
+				collisionPairInput.m_collisionShapes[1] = collisionShape1Ptr;
+				collisionPairInput.m_spuCollisionShapes[0] = spuConvexShape0;
+				collisionPairInput.m_spuCollisionShapes[1] = planeShape;

-			ProcessConvexConcaveSpuCollision(&collisionPairInput,&lsMem,spuContacts);
+				ProcessConvexPlaneSpuCollision(&collisionPairInput,&lsMem,spuContacts);
+			} else
+			{
+				btConvexInternalShape* spuConvexShape0 = (btConvexInternalShape*)collisionShape0Loc;
+				btBvhTriangleMeshShape* trimeshShape = (btBvhTriangleMeshShape*)collisionShape1Loc;
+
+				btVector3 dim0 = spuConvexShape0->getImplicitShapeDimensions();
+				collisionPairInput.m_primitiveDimensions0 = dim0;
+				collisionPairInput.m_collisionShapes[0] = collisionShape0Ptr;
+				collisionPairInput.m_collisionShapes[1] = collisionShape1Ptr;
+				collisionPairInput.m_spuCollisionShapes[0] = spuConvexShape0;
+				collisionPairInput.m_spuCollisionShapes[1] = trimeshShape;
+
+				ProcessConvexConcaveSpuCollision(&collisionPairInput,&lsMem,spuContacts);
+			}
 		}

 	}
@ -1033,7 +1224,7 @@ void	processCollisionTask(void* userPtr, void* lsMemPtr)
 											collisionPairInput.m_isSwapped);

 						
-									float distance=0.f;
+									//float distance=0.f;
 									btVector3 normalInB;


@ -1054,38 +1245,64 @@ void	processCollisionTask(void* userPtr, void* lsMemPtr)
 												btScalar margin1 = lsMem.getlocalCollisionAlgorithm()->getCollisionMargin1();
 												btVector3 shapeDim0 = lsMem.getlocalCollisionAlgorithm()->getShapeDimensions0()+btVector3(margin0,margin0,margin0);
 												btVector3 shapeDim1 = lsMem.getlocalCollisionAlgorithm()->getShapeDimensions1()+btVector3(margin1,margin1,margin1);
+/*
+												//Box boxA(shapeDim0.getX(),shapeDim0.getY(),shapeDim0.getZ());
+												vmVector3 vmPos0 = getVmVector3(collisionPairInput.m_worldTransform0.getOrigin());
+												vmVector3 vmPos1 = getVmVector3(collisionPairInput.m_worldTransform1.getOrigin());
+												vmMatrix3 vmMatrix0 = getVmMatrix3(collisionPairInput.m_worldTransform0.getBasis());
+												vmMatrix3 vmMatrix1 = getVmMatrix3(collisionPairInput.m_worldTransform1.getBasis());

-												Box boxA(shapeDim0.getX(),shapeDim0.getY(),shapeDim0.getZ());
-												Vector3 vmPos0 = getVmVector3(collisionPairInput.m_worldTransform0.getOrigin());
-												Vector3 vmPos1 = getVmVector3(collisionPairInput.m_worldTransform1.getOrigin());
-												Matrix3 vmMatrix0 = getVmMatrix3(collisionPairInput.m_worldTransform0.getBasis());
-												Matrix3 vmMatrix1 = getVmMatrix3(collisionPairInput.m_worldTransform1.getBasis());
-
-												Transform3 transformA(vmMatrix0,vmPos0);
+												vmTransform3 transformA(vmMatrix0,vmPos0);
 												Box boxB(shapeDim1.getX(),shapeDim1.getY(),shapeDim1.getZ());
-												Transform3 transformB(vmMatrix1,vmPos1);
+												vmTransform3 transformB(vmMatrix1,vmPos1);
 												BoxPoint resultClosestBoxPointA;
 												BoxPoint resultClosestBoxPointB;
-												Vector3 resultNormal;
+												vmVector3 resultNormal;
+												*/
+
 #ifdef USE_SEPDISTANCE_UTIL
 												float distanceThreshold = FLT_MAX
 #else
-												float distanceThreshold = 0.f;
+												//float distanceThreshold = 0.f;
 #endif


-												distance = boxBoxDistance(resultNormal,resultClosestBoxPointA,resultClosestBoxPointB,  boxA, transformA, boxB,transformB,distanceThreshold);
+												vmVector3 n;
+												Box boxA;
+												vmVector3 hA(shapeDim0.getX(),shapeDim0.getY(),shapeDim0.getZ());
+												vmVector3 hB(shapeDim1.getX(),shapeDim1.getY(),shapeDim1.getZ());
+												boxA.mHalf= hA;
+												vmTransform3 trA;
+												trA.setTranslation(getVmVector3(collisionPairInput.m_worldTransform0.getOrigin()));
+												trA.setUpper3x3(getVmMatrix3(collisionPairInput.m_worldTransform0.getBasis()));
+												Box boxB;
+												boxB.mHalf = hB;
+												vmTransform3 trB;
+												trB.setTranslation(getVmVector3(collisionPairInput.m_worldTransform1.getOrigin()));
+												trB.setUpper3x3(getVmMatrix3(collisionPairInput.m_worldTransform1.getBasis()));
 												
-												normalInB = -getBtVector3(resultNormal);
+												float distanceThreshold = spuManifold->getContactBreakingThreshold();//0.001f;

-												if(distance < spuManifold->getContactBreakingThreshold())
+
+												BoxPoint ptA,ptB;
+												float dist = boxBoxDistance(n, ptA, ptB,
+														   boxA, trA, boxB,	   trB,
+															distanceThreshold );
+
+
+//												float distance = boxBoxDistance(resultNormal,resultClosestBoxPointA,resultClosestBoxPointB,  boxA, transformA, boxB,transformB,distanceThreshold);
+												
+												normalInB = -getBtVector3(n);//resultNormal);
+
+												//if(dist < distanceThreshold)//spuManifold->getContactBreakingThreshold())
+												if(dist < spuManifold->getContactBreakingThreshold())
 												{
-													btVector3 pointOnB = collisionPairInput.m_worldTransform1(getBtVector3(resultClosestBoxPointB.localPoint));
+													btVector3 pointOnB = collisionPairInput.m_worldTransform1(getBtVector3(ptB.localPoint));

 													spuContacts.addContactPoint(
 														normalInB,
 														pointOnB,
-														distance);
+														dist);
 												}
 											} 
 #else									
@ -1163,7 +1380,9 @@ void	processCollisionTask(void* userPtr, void* lsMemPtr)
 #endif //USE_SEPDISTANCE_UTIL
 											)
 										{
-											handleCollisionPair(collisionPairInput, lsMem, spuContacts,				(ppu_address_t)lsMem.getColObj0()->getCollisionShape(), &lsMem.gCollisionShapes[0].collisionShape,	(ppu_address_t)lsMem.getColObj1()->getCollisionShape(), &lsMem.gCollisionShapes[1].collisionShape);
+											handleCollisionPair(collisionPairInput, lsMem, spuContacts,
+												(ppu_address_t)lsMem.getColObj0()->getCollisionShape(), &lsMem.gCollisionShapes[0].collisionShape,
+												(ppu_address_t)lsMem.getColObj1()->getCollisionShape(), &lsMem.gCollisionShapes[1].collisionShape);
 										} else
 										{
 												//spu_printf("boxbox dist = %f\n",distance);
@ -1209,3 +1428,5 @@ void	processCollisionTask(void* userPtr, void* lsMemPtr)

 	return;
 }
+
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h
@ -47,7 +47,7 @@ __attribute__ ((aligned (128)))
 void	processCollisionTask(void* userPtr, void* lsMemory);

 void*	createCollisionLocalStoreMemory();
-
+void deleteCollisionLocalStoreMemory(); // cleanup counterpart of createCollisionLocalStoreMemory(): deletes the heap blocks it handed out; empty in the build variant where create returns &gLocalStoreMemory

 #if defined(USE_LIBSPE2) && defined(__SPU__)
 #include "../SpuLibspe2Support.h"
--- a/Show more
+++ b/Show more