Engine directory for ticket #1

2026-04-25 14:25:38 +00:00 · 2012-09-19 11:15:01 -04:00 · 2012-09-19 11:15:01 -04:00 · 7dbfe6994d
commit 7dbfe6994d
parent 352279af7a
3795 changed files with 1363358 additions and 0 deletions
--- a/Engine/lib/bullet/src/BulletMultiThreaded/CMakeLists.txt
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/CMakeLists.txt
@ -0,0 +1,71 @@
+# Header search paths for this directory: the Bullet src root and the
+# vectormath/scalar/cpp directory under BulletMultiThreaded.
+include_directories(
+	${BULLET_PHYSICS_SOURCE_DIR}/src
+	${BULLET_PHYSICS_SOURCE_DIR}/src/BulletMultiThreaded/vectormath/scalar/cpp
+)
+
+# Sources and headers that make up the BulletMultiThreaded library target.
+add_library(BulletMultiThreaded
+	# Platform definitions, fake DMA and SPU helpers
+	PlatformDefinitions.h
+	SpuFakeDma.cpp
+	SpuFakeDma.h
+	SpuSync.h
+	SpuDoubleBuffer.h
+	SpuLibspe2Support.cpp
+	SpuLibspe2Support.h
+	btThreadSupportInterface.cpp
+	btThreadSupportInterface.h
+
+	# Thread support back ends and the sample task process
+	Win32ThreadSupport.cpp
+	Win32ThreadSupport.h
+	PosixThreadSupport.cpp
+	PosixThreadSupport.h
+	SequentialThreadSupport.cpp
+	SequentialThreadSupport.h
+	SpuSampleTaskProcess.h
+	SpuSampleTaskProcess.cpp
+
+	# Collision dispatch and the narrow phase collision task
+	SpuCollisionObjectWrapper.cpp
+	SpuCollisionObjectWrapper.h
+	SpuCollisionTaskProcess.h
+	SpuCollisionTaskProcess.cpp
+	SpuGatheringCollisionDispatcher.h
+	SpuGatheringCollisionDispatcher.cpp
+	SpuContactManifoldCollisionAlgorithm.cpp
+	SpuContactManifoldCollisionAlgorithm.h
+	SpuNarrowPhaseCollisionTask/Box.h
+	SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp
+	SpuNarrowPhaseCollisionTask/boxBoxDistance.h
+	SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
+	SpuNarrowPhaseCollisionTask/SpuContactResult.h
+	SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp
+	SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.h
+	SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h
+	SpuNarrowPhaseCollisionTask/SpuPreferredPenetrationDirections.h
+	SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
+	SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h
+	SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
+	SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h
+
+	# Some GPU related stuff, mainly CUDA and perhaps OpenCL
+	btGpu3DGridBroadphase.cpp
+	btGpu3DGridBroadphase.h
+	btGpu3DGridBroadphaseSharedCode.h
+	btGpu3DGridBroadphaseSharedDefs.h
+	btGpu3DGridBroadphaseSharedTypes.h
+	btGpuDefines.h
+	btGpuUtilsSharedCode.h
+	btGpuUtilsSharedDefs.h
+
+	# MiniCL provides a small subset of OpenCL
+	MiniCLTaskScheduler.cpp
+	MiniCLTaskScheduler.h
+	MiniCLTask/MiniCLTask.cpp
+	MiniCLTask/MiniCLTask.h
+	../MiniCL/cl.h
+	../MiniCL/cl_gl.h
+	../MiniCL/cl_platform.h
+)
+
+# Shared-library builds link BulletMultiThreaded against BulletCollision.
+# The keyword-less link signature is kept as in the original.
+if(BUILD_SHARED_LIBS)
+	target_link_libraries(BulletMultiThreaded BulletCollision)
+endif()
--- a/Engine/lib/bullet/src/BulletMultiThreaded/Jamfile
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/Jamfile
@ -0,0 +1,14 @@
+# Jam build description for the bulletmultithreaded library.
+# NOTE(review): Library, CFlags, LibDepends, MsvcIncDirs and InstallHeader
+# are rules defined elsewhere in the Jam setup; the comments below describe
+# only the arguments passed here.
+SubDir TOP src BulletMultiThreaded ;
+
+#IncludeDir src/BulletMultiThreaded ;
+
+# Sources are globbed from this directory and from the MiniCLTask and
+# SpuNarrowPhaseCollisionTask subdirectories.
+Library bulletmultithreaded : [ Wildcard . : *.h *.cpp ] [ Wildcard MiniCLTask : *.h *.cpp ] [ Wildcard SpuNarrowPhaseCollisionTask : *.h *.cpp  ]  : noinstall ;
+# Include paths: this directory and vectormath/scalar/cpp below it.
+CFlags bulletmultithreaded : [ FIncludes $(TOP)/src/BulletMultiThreaded ] [ FIncludes $(TOP)/src/BulletMultiThreaded/vectormath/scalar/cpp ] ;
+# No library dependencies are listed.
+LibDepends bulletmultithreaded :  ;
+
+# The same two include directories, passed to MsvcIncDirs as relative paths.
+   MsvcIncDirs bulletmultithreaded : 
+	"../../src/BulletMultiThreaded"  
+	"../../src/BulletMultiThreaded/vectormath/scalar/cpp"
+	;
+
+InstallHeader [ Wildcard *.h ] : bulletmultithreaded ;
--- a/Engine/lib/bullet/src/BulletMultiThreaded/Makefile.original
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/Makefile.original
@ -0,0 +1,187 @@
+# Target word size; selects -m32/-m64, the PPU compiler and the USE_ADDR64 define below.
+__ARCH_BITS__ := 32
+
+# define macros
+# Directory of the narrow phase sources and the base name (no extension) of the SPU task file.
+NARROWPHASEDIR=./SpuNarrowPhaseCollisionTask
+SPU_TASKFILE=$(NARROWPHASEDIR)/SpuGatheringCollisionTask
+
+# Set to "3.0" when /opt/cell exists, otherwise left empty.
+IBM_CELLSDK_VERSION := $(shell if [ -d /opt/cell ]; then echo "3.0"; fi)
+
+# Pick the SDK root and sysroot from the detection above.
+# CELL_TOP uses ?= so a value already set by the caller is kept.
+ifeq ("$(IBM_CELLSDK_VERSION)","3.0")
+        CELL_TOP ?= /opt/cell/sdk
+	CELL_SYSROOT := /opt/cell/sysroot
+else
+        CELL_TOP ?= /opt/ibm/cell-sdk/prototype
+	CELL_SYSROOT := $(CELL_TOP)/sysroot
+endif
+
+
+# Tool wrappers, output locations and source directories used by the rules below.
+USE_CCACHE=ccache
+RM=rm -f 
+OUTDIR=./out
+DEBUGFLAG=-DNDEBUG
+LIBOUTDIR=../../lib/ibmsdk
+COLLISIONDIR=../../src/BulletCollision
+MATHDIR=../../src/LinearMath
+ARCHITECTUREFLAG=-m$(__ARCH_BITS__)
+# SPU preprocessor defines; 64-bit builds add USE_ADDR64.
+ifeq "$(__ARCH_BITS__)" "64"
+  SPU_DEFFLAGS= -DUSE_LIBSPE2 -D__SPU__ -DUSE_ADDR64
+else
+  SPU_DEFFLAGS= -DUSE_LIBSPE2 -D__SPU__
+endif
+
+SPU_DEFFLAGS+=-DUSE_PE_BOX_BOX
+
+# SPU compiler (run through ccache), include paths and flags.
+SPU_GCC=$(USE_CCACHE) /usr/bin/spu-gcc
+SPU_INCLUDEDIR= -Ivectormath/scalar/cpp -I. -I$(CELL_SYSROOT)/usr/spu/include -I../../src -I$(NARROWPHASEDIR)
+#SPU_CFLAGS= $(DEBUGFLAG) -W -Wall -Winline -Os -c -include spu_intrinsics.h -include stdbool.h
+SPU_CFLAGS= $(DEBUGFLAG) -W -Wall -Winline -O3 -mbranch-hints -fomit-frame-pointer -ftree-vectorize -finline-functions -ftree-vect-loop-version -ftree-loop-optimize -ffast-math -fno-rtti -fno-exceptions -c -include spu_intrinsics.h -include stdbool.h
+
+# SPU link flags, the embed tool and the symbol name passed to it by the spu-embed rule.
+SPU_LFLAGS= -Wl,-N
+SPU_LIBRARIES=-lstdc++
+SPU_EMBED=/usr/bin/ppu-embedspu
+SPU_AR=/usr/bin/ar
+SYMBOLNAME=spu_program
+
+# PPU compiler and defines: ppu-gcc with USE_ADDR64 for 64-bit, ppu32-gcc otherwise.
+ifeq "$(__ARCH_BITS__)" "64"
+  PPU_DEFFLAGS= -DUSE_LIBSPE2 -DUSE_ADDR64
+  PPU_GCC=$(USE_CCACHE) /usr/bin/ppu-gcc
+else
+  PPU_DEFFLAGS= -DUSE_LIBSPE2
+  PPU_GCC=$(USE_CCACHE) /usr/bin/ppu32-gcc
+endif
+
+# NOTE(review): PPU_LFLAGS and PPU_LIBRARIES are not referenced by any rule in this file.
+PPU_CFLAGS= $(ARCHITECTUREFLAG) $(DEBUGFLAG) -W -Wall -Winline -O3 -c -mabi=altivec -maltivec -include altivec.h -include stdbool.h
+PPU_INCLUDEDIR= -I. -I$(CELL_SYSROOT)/usr/include -I../../src -I$(NARROWPHASEDIR)
+PPU_LFLAGS= $(ARCHITECTUREFLAG) -Wl,-m,elf$(__ARCH_BITS__)ppc
+PPU_LIBRARIES= -lstdc++ -lsupc++ -lgcc -lgcov -lspe2 -lpthread -L../../lib/ibmsdk -lbulletcollision -lbulletdynamics -lbulletmath -L$(CELL_SYSROOT)/usr/lib$(__ARCH_BITS__) -R$(CELL_SYSROOT)/usr/lib
+PPU_AR=/usr/bin/ar
+
+# Prerequisite of every compile rule. Creates the object output directory if
+# it is missing (each rule writes to $(OUTDIR)/<name>.o and nothing else
+# creates it), then prints the usage hint.
+MakeOut :
+	@mkdir -p $(OUTDIR)
+	@echo "usage: make spu, make ppu, make all, or make clean"
+# SPU
+# Each rule below compiles one source file with $(SPU_GCC) into
+# $(OUTDIR)/<target>.o. $@ is the rule's target name, so the target name
+# doubles as the source base name unless a rule spells the source out.
+
+# Output is SpuTaskFile.o; the source is $(SPU_TASKFILE).cpp.
+SpuTaskFile : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/SpuTaskFile.o $(SPU_TASKFILE).cpp
+
+boxBoxDistance : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
+
+SpuFakeDma : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
+
+# SPU build of SpuContactManifoldCollisionAlgorithm.cpp; the _spu suffix keeps
+# its object file apart from the PPU build of the same source.
+SpuContactManifoldCollisionAlgorithm_spu : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o SpuContactManifoldCollisionAlgorithm.cpp
+
+SpuCollisionShapes : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
+
+SpuContactResult : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
+
+#SpuGatheringCollisionTask : MakeOut
+#	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
+
+SpuGjkPairDetector: MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
+
+SpuMinkowskiPenetrationDepthSolver : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
+
+SpuVoronoiSimplexSolver : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(NARROWPHASEDIR)/$@.cpp
+
+#SpuLibspe2Support_spu : MakeOut
+#	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o SpuLibspe2Support.cpp
+
+## SPU-Bullet
+# Sources from $(COLLISIONDIR) and $(MATHDIR), compiled with the SPU compiler.
+btPersistentManifold : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/NarrowPhaseCollision/$@.cpp
+
+btOptimizedBvh : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/CollisionShapes/$@.cpp
+
+btCollisionObject : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/CollisionDispatch/$@.cpp
+
+btTriangleCallback : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/CollisionShapes/$@.cpp
+
+btTriangleIndexVertexArray : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/CollisionShapes/$@.cpp
+
+btStridingMeshInterface : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(COLLISIONDIR)/CollisionShapes/$@.cpp
+
+btAlignedAllocator : MakeOut
+	$(SPU_GCC) $(SPU_DEFFLAGS) $(SPU_CFLAGS) $(SPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $(MATHDIR)/$@.cpp
+
+
+# PPU
+# Each rule compiles <target>.cpp from this directory with $(PPU_GCC) into
+# $(OUTDIR)/<target>.o.
+# NOTE(review): SequentialThreadSupport has a rule here but is not a
+# prerequisite of the ppu target and is not archived by it.
+SpuGatheringCollisionDispatcher : MakeOut
+	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
+
+SequentialThreadSupport: MakeOut
+	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
+
+SpuLibspe2Support: MakeOut
+	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
+
+btThreadSupportInterface: MakeOut
+	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
+
+SpuCollisionTaskProcess : MakeOut
+	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
+
+SpuContactManifoldCollisionAlgorithm : MakeOut
+	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
+	
+SpuSampleTaskProcess : MakeOut
+	$(PPU_GCC) $(PPU_DEFFLAGS) $(PPU_CFLAGS) $(PPU_INCLUDEDIR) -o $(OUTDIR)/$@.o $@.cpp
+	
+
+
+# Links the SPU objects into $(OUTDIR)/spuCollision.elf.
+# NOTE(review): btOptimizedBvh and btCollisionObject are prerequisites (so
+# they get compiled) but their .o files are not on the link line below -
+# confirm this is intentional.
+spu : boxBoxDistance SpuFakeDma SpuContactManifoldCollisionAlgorithm_spu SpuContactResult SpuTaskFile \
+      SpuGjkPairDetector SpuMinkowskiPenetrationDepthSolver SpuVoronoiSimplexSolver SpuCollisionShapes \
+      btPersistentManifold btOptimizedBvh btCollisionObject btTriangleCallback btTriangleIndexVertexArray \
+      btStridingMeshInterface btAlignedAllocator
+	$(SPU_GCC) -o $(OUTDIR)/spuCollision.elf \
+                            $(OUTDIR)/SpuTaskFile.o \
+                            $(OUTDIR)/SpuFakeDma.o \
+			    $(OUTDIR)/boxBoxDistance.o \
+                            $(OUTDIR)/SpuContactManifoldCollisionAlgorithm_spu.o \
+                            $(OUTDIR)/SpuContactResult.o \
+                            $(OUTDIR)/SpuCollisionShapes.o \
+                            $(OUTDIR)/SpuGjkPairDetector.o \
+                            $(OUTDIR)/SpuMinkowskiPenetrationDepthSolver.o \
+                            $(OUTDIR)/SpuVoronoiSimplexSolver.o \
+                            $(OUTDIR)/btPersistentManifold.o \
+                            $(OUTDIR)/btTriangleCallback.o \
+                            $(OUTDIR)/btTriangleIndexVertexArray.o \
+                            $(OUTDIR)/btStridingMeshInterface.o \
+                            $(OUTDIR)/btAlignedAllocator.o \
+                            $(SPU_LFLAGS) $(SPU_LIBRARIES)
+
+# Runs $(SPU_EMBED) on spuCollision.elf with symbol name $(SYMBOLNAME) to
+# produce $(OUTDIR)/spu-embed.o, then adds that object to
+# $(LIBOUTDIR)/libspu.a.
+spu-embed : spu
+	$(SPU_EMBED) $(ARCHITECTUREFLAG) $(SYMBOLNAME) $(OUTDIR)/spuCollision.elf $(OUTDIR)/$@.o
+	$(SPU_AR) -qcs $(LIBOUTDIR)/libspu.a $(OUTDIR)/$@.o
+
+
+
+# Archives the six PPU objects into $(LIBOUTDIR)/bulletmultithreaded.a.
+ppu : SpuGatheringCollisionDispatcher SpuCollisionTaskProcess btThreadSupportInterface \
+      SpuLibspe2Support SpuContactManifoldCollisionAlgorithm SpuSampleTaskProcess
+	$(PPU_AR) -qcs $(LIBOUTDIR)/bulletmultithreaded.a \
+                                                          $(OUTDIR)/SpuCollisionTaskProcess.o \
+                                                          $(OUTDIR)/SpuSampleTaskProcess.o \
+                                                          $(OUTDIR)/SpuGatheringCollisionDispatcher.o \
+                                                          $(OUTDIR)/SpuLibspe2Support.o \
+                                                          $(OUTDIR)/btThreadSupportInterface.o \
+							  $(OUTDIR)/SpuContactManifoldCollisionAlgorithm.o
+
+# Builds the embedded SPU library and the PPU archive.
+all : spu-embed ppu 
+
+# Removes everything inside $(OUTDIR) (the directory itself is kept) and both archives.
+clean:
+	$(RM) $(OUTDIR)/* ; $(RM) $(LIBOUTDIR)/libspu.a ; $(RM) $(LIBOUTDIR)/bulletmultithreaded.a
+
+
+
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/MiniCLTask/MiniCLTask.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/MiniCLTask/MiniCLTask.cpp
@ -0,0 +1,116 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, Copyright (c) 2007 Erwin Coumans
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+
+#include "MiniCLTask.h"
+#include "../PlatformDefinitions.h"
+#include "../SpuFakeDma.h"
+#include "LinearMath/btMinMax.h"
+#include "BulletMultiThreaded/MiniCLTask/MiniCLTask.h"
+
+#ifdef __SPU__
+#include <spu_printf.h>
+#else
+#include <stdio.h>
+#define spu_printf printf
+#endif
+
+// Let kernel-style source compile as plain C++: the __kernel and __global
+// qualifiers expand to nothing, and get_global_id() expands to the name
+// 'guid', which the kernel function takes as an extra parameter.
+#define __kernel
+#define __global
+#define get_global_id(a) guid
+
+// Type of the block returned by createMiniCLLocalStoreMemory(); it currently
+// has no members.
+struct MiniCLTask_LocalStoreMemory
+{
+	
+};
+
+
+///////////////////////////////////////////////////
+// Kernel-style element-wise addition: c[guid] = a[guid] + b[guid], one
+// float8 (eight floats) per work item. In this file get_global_id() expands
+// to the guid parameter.
+__kernel void VectorAdd(__global const float8* a, __global const float8* b, __global float8* c, int guid)
+{
+	// index of the float8 handled by this work item
+	const int slot = get_global_id(0);
+
+	// operands are read before the output element is written
+	const float8 lhs = a[slot];
+	const float8 rhs = b[slot];
+
+	float8 sum(0.0f);
+	sum.s0 = lhs.s0 + rhs.s0;
+	sum.s1 = lhs.s1 + rhs.s1;
+	sum.s2 = lhs.s2 + rhs.s2;
+	sum.s3 = lhs.s3 + rhs.s3;
+	sum.s4 = lhs.s4 + rhs.s4;
+	sum.s5 = lhs.s5 + rhs.s5;
+	sum.s6 = lhs.s6 + rhs.s6;
+	sum.s7 = lhs.s7 + rhs.s7;
+
+	// store the result at the same index
+	c[slot] = sum;
+}
+///////////////////////////////////////////////////
+
+
+//-- MAIN METHOD
+/// Executes one MiniCL task.
+/// userPtr  : points to the MiniCLTaskDesc holding the kernel program id, the
+///            half-open work item range [m_firstWorkUnit, m_lastWorkUnit) and
+///            the packed kernel arguments.
+/// lsMemory : block returned by createMiniCLLocalStoreMemory(); not used
+///            here because MiniCLTask_LocalStoreMemory has no members.
+void processMiniCLTask(void* userPtr, void* lsMemory)
+{
+	//	BT_PROFILE("processSampleTask");
+
+	(void)lsMemory;
+
+	MiniCLTaskDesc& taskDesc = *(MiniCLTaskDesc*)userPtr;
+
+	// The descriptor fields are uint32_t, so they are printed as unsigned.
+	printf("Compute Unit[%u] executed kernel %u work items [%u..%u)\n",(unsigned int)taskDesc.m_taskId,(unsigned int)taskDesc.m_kernelProgramId,(unsigned int)taskDesc.m_firstWorkUnit,(unsigned int)taskDesc.m_lastWorkUnit);
+	
+	
+	switch (taskDesc.m_kernelProgramId)
+	{
+	case CMD_MINICL_ADDVECTOR:
+		{
+			// Argument slots 0..2 each hold one pointer value stored as raw
+			// bytes. Copy them out with memcpy instead of casting the char
+			// buffer to a pointer type, and do it once, outside the loop.
+			const float8* inputA = 0;
+			const float8* inputB = 0;
+			float8* output = 0;
+			memcpy(&inputA, &taskDesc.m_argData[0][0], sizeof(inputA));
+			memcpy(&inputB, &taskDesc.m_argData[1][0], sizeof(inputB));
+			memcpy(&output, &taskDesc.m_argData[2][0], sizeof(output));
+
+			for (unsigned int i=taskDesc.m_firstWorkUnit;i<taskDesc.m_lastWorkUnit;i++)
+			{
+				VectorAdd(inputA,inputB,output,i);
+			}
+			break;
+		}
+
+	default:
+		{
+			printf("error in processMiniCLTask: unknown command id: %u\n",(unsigned int)taskDesc.m_kernelProgramId);
+
+		}
+	}
+
+}
+
+
+#if defined(__CELLOS_LV2__) || defined (LIBSPE2)
+// Statically allocated, 16-byte aligned local store for these platforms.
+ATTRIBUTE_ALIGNED16(MiniCLTask_LocalStoreMemory	gLocalStoreMemory);
+#endif
+
+/// Returns the memory block that is handed to processMiniCLTask as lsMemory:
+/// the address of the static instance when __CELLOS_LV2__ or LIBSPE2 is
+/// defined, a newly heap-allocated instance otherwise.
+void* createMiniCLLocalStoreMemory()
+{
+#if defined(__CELLOS_LV2__) || defined (LIBSPE2)
+	return &gLocalStoreMemory;
+#else
+	return new MiniCLTask_LocalStoreMemory;
+#endif
+}
--- a/Engine/lib/bullet/src/BulletMultiThreaded/MiniCLTask/MiniCLTask.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/MiniCLTask/MiniCLTask.h
@ -0,0 +1,81 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, Copyright (c) 2007 Erwin Coumans
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef MINICL__TASK_H
+#define MINICL__TASK_H
+
+#include "../PlatformDefinitions.h"
+#include "LinearMath/btScalar.h"
+
+#include "LinearMath/btAlignedAllocator.h"
+
+
+// Kernel program ids carried in MiniCLTaskDesc::m_kernelProgramId.
+// Numbering starts at 1, so CMD_MINICL_ADDVECTOR is 2.
+enum
+{
+	CMD_MINICL_1= 1,
+	CMD_MINICL_ADDVECTOR
+};
+
+
+
+/// Eight floats addressed as s0..s7.
+struct float8
+{
+	float s0;
+	float s1;
+	float s2;
+	float s3;
+	float s4;
+	float s5;
+	float s6;
+	float s7;
+
+	/// Sets all eight components to the same scalar.
+	float8(float scalar)
+		: s0(scalar), s1(scalar), s2(scalar), s3(scalar),
+		  s4(scalar), s5(scalar), s6(scalar), s7(scalar)
+	{
+	}
+};
+
+// Size in bytes of one kernel argument slot, and the number of slots.
+#define MINICL_MAX_ARGLENGTH 128
+#define MINI_CL_MAX_ARG 8
+
+/// Description of one MiniCL task: which kernel to run, over which work
+/// items, and with which arguments. 16-byte aligned.
+ATTRIBUTE_ALIGNED16(struct) MiniCLTaskDesc
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	/// Zeroes the ids, the work range and the argument sizes so that a
+	/// default-constructed descriptor can be copied without reading
+	/// indeterminate values. m_argData is left untouched; a slot is only
+	/// meaningful when its m_argSizes entry is non-zero.
+	MiniCLTaskDesc()
+		: m_taskId(0),
+		  m_kernelProgramId(0),
+		  m_firstWorkUnit(0),
+		  m_lastWorkUnit(0)
+	{
+		for (int i=0;i<MINI_CL_MAX_ARG;i++)
+		{
+			m_argSizes[i]=0;
+		}
+	}
+
+	// index of the task buffer this descriptor lives in (set by the scheduler)
+	uint32_t	m_taskId;
+
+	// kernel id and half-open work item range [m_firstWorkUnit, m_lastWorkUnit)
+	uint32_t	m_kernelProgramId;
+	uint32_t	m_firstWorkUnit;
+	uint32_t	m_lastWorkUnit;
+
+	// raw argument bytes, one slot per argument, and the bytes used in each slot
+	char		m_argData[MINI_CL_MAX_ARG][MINICL_MAX_ARGLENGTH];
+	int			m_argSizes[MINI_CL_MAX_ARG];
+};
+
+
+void	processMiniCLTask(void* userPtr, void* lsMemory);
+void*	createMiniCLLocalStoreMemory();
+
+
+#endif //MINICL__TASK_H
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/MiniCLTaskScheduler.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/MiniCLTaskScheduler.cpp
@ -0,0 +1,227 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//#define __CELLOS_LV2__ 1
+
+#define USE_SAMPLE_PROCESS 1
+#ifdef USE_SAMPLE_PROCESS
+
+
+#include "MiniCLTaskScheduler.h"
+#include <stdio.h>
+
+#ifdef __SPU__
+
+
+
+/// SPU-side task entry point: ignores both arguments and prints a greeting.
+void	SampleThreadFunc(void* userPtr,void* lsMemory)
+{
+	(void)userPtr;
+	(void)lsMemory;
+
+	// no real work is done here
+	printf("hello world\n");
+}
+
+
+/// SPU-side local store factory: allocates nothing and returns a null pointer.
+void*	SamplelsMemoryFunc()
+{
+	void* noLocalStore = 0;
+	return noLocalStore;
+}
+
+
+#else
+
+
+#include "btThreadSupportInterface.h"
+
+//#	include "SPUAssert.h"
+#include <string.h>
+
+
+
+extern "C" {
+	extern char SPU_SAMPLE_ELF_SYMBOL[];
+}
+
+
+
+
+
+/// Creates maxNumOutstandingTasks task buffers, marks them all idle and
+/// calls startSPU() on threadInterface. m_initialized stays false until
+/// initialize() is called.
+MiniCLTaskScheduler::MiniCLTaskScheduler(btThreadSupportInterface*	threadInterface,  int maxNumOutstandingTasks)
+	: m_numBusyTasks(0),
+	  m_currentTask(0),
+	  m_initialized(false),
+	  m_threadInterface(threadInterface),
+	  m_maxNumOutstandingTasks(maxNumOutstandingTasks)
+{
+	m_taskBusy.resize(m_maxNumOutstandingTasks);
+	m_spuSampleTaskDesc.resize(m_maxNumOutstandingTasks);
+
+	// every buffer starts out free
+	for (int slot = 0; slot < m_maxNumOutstandingTasks; ++slot)
+	{
+		m_taskBusy[slot] = false;
+	}
+
+	m_threadInterface->startSPU();
+}
+
+/// Calls stopSPU() on the thread interface that the constructor started.
+/// The interface object itself is not deleted here.
+MiniCLTaskScheduler::~MiniCLTaskScheduler()
+{
+	m_threadInterface->stopSPU();
+}
+
+
+
+/// Resets the bookkeeping: all task buffers idle, no busy tasks, current
+/// task 0, and sets m_initialized to true.
+void	MiniCLTaskScheduler::initialize()
+{
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("MiniCLTaskScheduler::initialize()\n");
+#endif //DEBUG_SPU_TASK_SCHEDULING
+
+	int slot = 0;
+	while (slot < m_maxNumOutstandingTasks)
+	{
+		m_taskBusy[slot] = false;
+		++slot;
+	}
+
+	m_currentTask = 0;
+	m_numBusyTasks = 0;
+	m_initialized = true;
+}
+
+
+/// Fills the current task buffer and sends it to the thread interface.
+/// firstWorkUnit, lastWorkUnit : work item range stored in the descriptor.
+/// kernelProgramId             : kernel id stored in the descriptor.
+/// argData  : MINI_CL_MAX_ARG slots of MINICL_MAX_ARGLENGTH bytes each, laid
+///            out back to back.
+/// argSizes : MINI_CL_MAX_ARG entries, the number of bytes used in each slot
+///            (0 means the slot is unused).
+/// If every task buffer is busy after the request has been sent, this waits
+/// for one response before picking the next free buffer.
+void MiniCLTaskScheduler::issueTask(int firstWorkUnit, int lastWorkUnit,int kernelProgramId,char* argData,int* argSizes)
+{
+
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("MiniCLTaskScheduler::issueTask (m_currentTask= %d)\n", m_currentTask);
+#endif //DEBUG_SPU_TASK_SCHEDULING
+
+	m_taskBusy[m_currentTask] = true;
+	m_numBusyTasks++;
+
+	MiniCLTaskDesc& taskDesc = m_spuSampleTaskDesc[m_currentTask];
+	{
+		// send task description in event message
+		taskDesc.m_firstWorkUnit = firstWorkUnit;
+		taskDesc.m_lastWorkUnit = lastWorkUnit;
+		taskDesc.m_kernelProgramId = kernelProgramId;
+		//some bookkeeping to recognize finished tasks
+		taskDesc.m_taskId = m_currentTask;
+		
+		for (int i=0;i<MINI_CL_MAX_ARG;i++)
+		{
+			int argSize = argSizes[i];
+
+			// A slot holds at most MINICL_MAX_ARGLENGTH bytes. A larger size
+			// would overrun m_argData[i], and a negative one would become a
+			// huge memcpy length, so assert in debug builds and clamp otherwise.
+			assert(argSize >= 0 && argSize <= MINICL_MAX_ARGLENGTH);
+			if (argSize < 0)
+			{
+				argSize = 0;
+			}
+			else if (argSize > MINICL_MAX_ARGLENGTH)
+			{
+				argSize = MINICL_MAX_ARGLENGTH;
+			}
+
+			taskDesc.m_argSizes[i] = argSize;
+			if (argSize)
+			{
+				memcpy(&taskDesc.m_argData[i],&argData[MINICL_MAX_ARGLENGTH*i],argSize);
+			}
+		}
+	}
+
+
+	m_threadInterface->sendRequest(1, (ppu_address_t) &taskDesc, m_currentTask);
+
+	// if all tasks busy, wait for spu event to clear the task.
+	
+	if (m_numBusyTasks >= m_maxNumOutstandingTasks)
+	{
+		unsigned int taskId = 0;
+		unsigned int outputSize = 0;
+
+		// seed taskId with the first busy buffer before waiting
+		for (int i=0;i<m_maxNumOutstandingTasks;i++)
+		{
+			if (m_taskBusy[i])
+			{
+				taskId = i;
+				break;
+			}
+		}
+		m_threadInterface->waitForResponse(&taskId, &outputSize);
+
+		//printf("PPU: after issue, received event: %u %d\n", taskId, outputSize);
+
+		postProcess(taskId, outputSize);
+
+		m_taskBusy[taskId] = false;
+
+		m_numBusyTasks--;
+	}
+
+	// find new task buffer
+	for (int i = 0; i < m_maxNumOutstandingTasks; i++)
+	{
+		if (!m_taskBusy[i])
+		{
+			m_currentTask = i;
+			break;
+		}
+	}
+}
+
+
+///Optional PPU-side post processing for each finished task; currently a no-op.
+void MiniCLTaskScheduler::postProcess(int taskId, int outputSize)
+{
+	(void)taskId;
+	(void)outputSize;
+}
+
+
+/// Waits until no task is busy any more: for each response received from the
+/// thread interface, runs postProcess and marks that task buffer free.
+void MiniCLTaskScheduler::flush()
+{
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("\nMiniCLTaskScheduler::flush()\n");
+#endif //DEBUG_SPU_TASK_SCHEDULING
+	
+
+	// all tasks are issued, wait for all tasks to be complete
+	while(m_numBusyTasks > 0)
+	{
+		unsigned int taskId = 0;
+		unsigned int outputSize = 0;
+
+		// seed taskId with the first busy buffer before waiting
+		for (int i=0;i<m_maxNumOutstandingTasks;i++)
+		{
+			if (m_taskBusy[i])
+			{
+				taskId = i;
+				break;
+			}
+		}
+
+		m_threadInterface->waitForResponse(&taskId, &outputSize);
+
+		//printf("PPU: flushing, received event: %u %d\n", taskId, outputSize);
+
+		postProcess(taskId, outputSize);
+
+		m_taskBusy[taskId] = false;
+
+		m_numBusyTasks--;
+	}
+
+
+}
+
+#endif
+
+
+#endif //USE_SAMPLE_PROCESS
--- a/Engine/lib/bullet/src/BulletMultiThreaded/MiniCLTaskScheduler.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/MiniCLTaskScheduler.h
@ -0,0 +1,181 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+#ifndef MINICL_TASK_SCHEDULER_H
+#define MINICL_TASK_SCHEDULER_H
+
+#include <assert.h>
+
+
+#include "PlatformDefinitions.h"
+
+#include <stdlib.h>
+
+#include "LinearMath/btAlignedObjectArray.h"
+
+
+#include "MiniCLTask/MiniCLTask.h"
+
+
+//just add your commands here, try to keep them globally unique for debugging purposes
+#define CMD_SAMPLE_TASK_COMMAND 10
+
+
+
+/// MiniCLTaskScheduler hands MiniCL tasks to a btThreadSupportInterface.
+/// It owns a fixed number of task buffers (MiniCLTaskDesc) and tracks which
+/// of them are busy. issueTask fills the current buffer and sends it;
+/// flush waits for all outstanding tasks.
+class MiniCLTaskScheduler
+{
+	// per-buffer busy flags and the task descriptors themselves
+	btAlignedObjectArray<bool>	m_taskBusy;
+	btAlignedObjectArray<MiniCLTaskDesc>	m_spuSampleTaskDesc;
+	
+	// number of buffers currently marked busy
+	int   m_numBusyTasks;
+
+	// index of the buffer the next issueTask call will fill
+	int   m_currentTask;
+
+	// false after construction, true once initialize() has been called
+	bool m_initialized;
+
+	// hook called for each received response; the implementation is empty
+	void postProcess(int taskId, int outputSize);
+	
+	// thread back end; started in the constructor, stopped in the destructor
+	class	btThreadSupportInterface*	m_threadInterface;
+
+	// number of task buffers
+	int	m_maxNumOutstandingTasks;
+
+
+
+public:
+	MiniCLTaskScheduler(btThreadSupportInterface*	threadInterface, int maxNumOutstandingTasks);
+	
+	~MiniCLTaskScheduler();
+	
+	///resets the busy bookkeeping; call before issuing tasks
+	void initialize();
+
+	///queues one kernel invocation for the work items firstWorkUnit..lastWorkUnit;
+	///argData holds MINI_CL_MAX_ARG slots of MINICL_MAX_ARGLENGTH bytes, argSizes the bytes used per slot
+	void issueTask(int firstWorkUnit, int lastWorkUnit,int kernelProgramId,char* argData,int* argSizes);
+
+	///waits until all outstanding tasks have reported back
+	void flush();
+
+	class	btThreadSupportInterface*	getThreadSupportInterface()
+	{
+		return m_threadInterface;
+	}
+
+	///programName is ignored for now: CMD_MINICL_ADDVECTOR is always returned
+	int	findProgramCommandIdByName(const char* programName) const
+	{
+		return CMD_MINICL_ADDVECTOR;//hardcoded temp value, todo: implement multi-program support
+	}
+
+	int getMaxNumOutstandingTasks() const
+	{
+		return m_maxNumOutstandingTasks;
+	}
+};
+
+
+/// A kernel together with its arguments: the scheduler pointer, the kernel's
+/// command id, and argument storage with the same slot layout as
+/// MiniCLTaskDesc (MINI_CL_MAX_ARG slots of MINICL_MAX_ARGLENGTH bytes).
+struct	MiniCLKernel
+{
+	MiniCLTaskScheduler* m_scheduler;
+	
+	int	m_kernelProgramCommandId;
+
+	// raw argument bytes per slot, and the number of bytes used in each slot
+	char	m_argData[MINI_CL_MAX_ARG][MINICL_MAX_ARGLENGTH];
+	int				m_argSizes[MINI_CL_MAX_ARG];
+};
+
+
+#if defined(USE_LIBSPE2) && defined(__SPU__)
+////////////////////MAIN/////////////////////////////
+#include "../SpuLibspe2Support.h"
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+#include <SpuFakeDma.h>
+
+void * SamplelsMemoryFunc();
+void SampleThreadFunc(void* userPtr,void* lsMemory);
+
+//#define DEBUG_LIBSPE2_MAINLOOP
+
+// SPU program entry point, compiled only when USE_LIBSPE2 and __SPU__ are defined.
+// It publishes its status block at argp, then loops on the inbound mailbox:
+// a Task message fetches the task descriptor by DMA and runs SampleThreadFunc,
+// a Shutdown message ends the loop. After every message the status is written
+// back as free.
+// NOTE(review): taskDesc is declared as SpuSampleTaskDesc and its
+// m_mainMemoryPtr member is read below, but this header only includes
+// MiniCLTask.h, whose MiniCLTaskDesc has no such member - confirm this
+// section still builds and uses the intended descriptor type.
+int main(unsigned long long speid, addr64 argp, addr64 envp)
+{
+	printf("SPU is up \n");
+	
+	ATTRIBUTE_ALIGNED128(btSpuStatus status);
+	ATTRIBUTE_ALIGNED16( SpuSampleTaskDesc taskDesc ) ;
+	unsigned int received_message = Spu_Mailbox_Event_Nothing;
+        bool shutdown = false;
+
+	// fetch the status block from the address passed in argp
+	cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+	cellDmaWaitTagStatusAll(DMA_MASK(3));
+
+	// mark this SPU free, record its local store pointer, and write the status back
+	status.m_status = Spu_Status_Free;
+	status.m_lsMemory.p = SamplelsMemoryFunc();
+
+	cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+	cellDmaWaitTagStatusAll(DMA_MASK(3));
+	
+	
+	while (!shutdown)
+	{
+		// read the next message from the inbound mailbox
+		received_message = spu_read_in_mbox();
+		
+
+		
+		switch(received_message)
+		{
+		case Spu_Mailbox_Event_Shutdown:
+			shutdown = true;
+			break; 
+		case Spu_Mailbox_Event_Task:
+			// refresh the status
+#ifdef DEBUG_LIBSPE2_MAINLOOP
+			printf("SPU recieved Task \n");
+#endif //DEBUG_LIBSPE2_MAINLOOP
+			cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+			cellDmaWaitTagStatusAll(DMA_MASK(3));
+		
+			btAssert(status.m_status==Spu_Status_Occupied);
+			
+			// fetch the task descriptor the status block points to, then run it
+			cellDmaGet(&taskDesc, status.m_taskDesc.p, sizeof(SpuSampleTaskDesc), DMA_TAG(3), 0, 0);
+			cellDmaWaitTagStatusAll(DMA_MASK(3));
+			
+			SampleThreadFunc((void*)&taskDesc, reinterpret_cast<void*> (taskDesc.m_mainMemoryPtr) );
+			break;
+		case Spu_Mailbox_Event_Nothing:
+		default:
+			break;
+		}
+
+		// set to status free and wait for next task
+		// (this also runs after a shutdown message, before the loop exits)
+		status.m_status = Spu_Status_Free;
+		cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+		cellDmaWaitTagStatusAll(DMA_MASK(3));		
+				
+		
+  	}
+  	return 0;
+}
+//////////////////////////////////////////////////////
+#endif
+
+
+
+#endif // MINICL_TASK_SCHEDULER_H
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/PlatformDefinitions.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/PlatformDefinitions.h
@ -0,0 +1,82 @@
+#ifndef TYPE_DEFINITIONS_H
+#define TYPE_DEFINITIONS_H
+
+///This file provides some platform/compiler checks for common definitions
+
+#ifdef WIN32
+
+// WIN32 variant of addr64: a pointer overlaid with an unsigned int.
+// NOTE(review): unlike the libspe2 variant further down there is no 'ull'
+// member, and 'u' is an unsigned int, so it may be narrower than 'p' on
+// 64-bit Windows - confirm before relying on 'u' there.
+typedef union
+{
+  unsigned int u;
+  void *p;
+} addr64;
+
+#define USE_WIN32_THREADING 1
+
+		#if defined(__MINGW32__) || defined(__CYGWIN__) || (defined (_MSC_VER) && _MSC_VER < 1300)
+		#else
+		#endif //__MINGW32__
+
+		// Fixed-width integer typedefs for the WIN32 branch, which does not include <stdint.h>.
+		typedef unsigned char     uint8_t;
+#ifndef __PHYSICS_COMMON_H__
+		// 'unsigned long' is 32 bits wide on Windows, so the 64-bit type has
+		// to be spelled explicitly. __int64 is used for Microsoft compilers
+		// so that old versions without 'long long' keep working.
+	#if defined(_MSC_VER)
+		typedef unsigned __int64  uint64_t;
+	#else
+		typedef unsigned long long uint64_t;
+	#endif
+		typedef unsigned int      uint32_t;
+#endif //__PHYSICS_COMMON_H__
+		typedef unsigned short    uint16_t;
+
+		#include <malloc.h>
+		// memalign substitute for this branch: plain malloc, the alignment
+		// argument is ignored. The macro body has no trailing semicolon so
+		// that it can be used inside expressions.
+		#define memalign(alignment, size) malloc(size)
+			
+#include <string.h> //memcpy
+
+		
+
+		#include <stdio.h>		
+		#define spu_printf printf
+		
+#else
+		#include <stdint.h>
+		#include <stdlib.h>
+		#include <string.h> //for memcpy
+
+#if defined	(__CELLOS_LV2__)
+	// Playstation 3 Cell SDK
+#include <spu_printf.h>
+		
+#else
+	// posix system
+
+#define USE_PTHREADS    (1)
+
+#ifdef USE_LIBSPE2
+#include <stdio.h>		
+#define spu_printf printf	
+#define DWORD unsigned int
+		
+			// libspe2 variant of addr64: the same storage viewed as one
+			// 64-bit integer, as two unsigned ints, or as a pointer.
+			typedef union
+			{
+			  unsigned long long ull;
+			  unsigned int ui[2];
+			  void *p;
+			} addr64;
+		
+		
+#else
+
+#include <stdio.h>		
+#define spu_printf printf	
+
+#endif // USE_LIBSPE2
+	
+#endif	//__CELLOS_LV2__
+	
+#endif
+
+
+/* Included here because we need uint*_t typedefs */
+#include "PpuAddressSpace.h"
+
+#endif //TYPE_DEFINITIONS_H
+
+
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/PosixThreadSupport.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/PosixThreadSupport.cpp
@ -0,0 +1,249 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include <stdio.h>
+#include "PosixThreadSupport.h"
+#ifdef USE_PTHREADS
+#include <errno.h>
+#include <unistd.h>
+
+#include "SpuCollisionTaskProcess.h"
+#include "SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h"
+
+// Logs a diagnostic (call site, result and errno) when a pthread/semaphore call returns non-zero.
+// The argument is evaluated exactly once: it is normally a call with side effects
+// (sem_wait, sem_post, pthread_create, ...), which must not be repeated while reporting a failure.
+// The do/while(0) wrapper makes the macro behave as one statement, e.g. in an unbraced if/else.
+#define checkPThreadFunction(returnValue) \
+    do { \
+        const int pthreadResult_ = (returnValue); \
+        if(0 != pthreadResult_) { \
+            printf("PThread problem at line %i in file %s: %i %d\n", __LINE__, __FILE__, pthreadResult_, errno); \
+        } \
+    } while(0)
+
+// The number of threads should be equal to the number of available cores
+// Todo: each worker should be linked to a single core. (The original note names SetThreadIdealProcessor,
+// which is a Win32 call; a pthread-side equivalent would be needed in this file.)
+
+// PosixThreadSupport runs the user task function on pthread worker threads.
+// The constructor forwards to startThreads(), which creates the semaphores and the workers.
+PosixThreadSupport::PosixThreadSupport(ThreadConstructionInfo& threadConstructionInfo)
+{
+	startThreads(threadConstructionInfo);
+}
+
+// Cleanup: forwards to stopSPU(), which cancels the workers and releases the semaphores.
+PosixThreadSupport::~PosixThreadSupport()
+{
+	stopSPU();
+}
+
+// On Apple platforms createSem()/destroySem() use named semaphores (sem_open/sem_close)
+// instead of sem_init/sem_destroy.
+#if (defined (__APPLE__))
+#define NAMED_SEMAPHORES
+#endif
+
+// this semaphore will signal, if and how many threads are finished with their work
+// NOTE(review): file-scope static, so it is shared by every PosixThreadSupport instance in the
+// process - confirm that only one instance is expected to exist at a time.
+static sem_t* mainSemaphore;
+
+// Creates a semaphore with an initial count of 0 and returns it; release it with destroySem().
+// With NAMED_SEMAPHORES the semaphore is opened under a per-process unique name built from
+// 'baseName', the process id and a running counter; the process exits if it cannot be opened.
+// Otherwise a heap-allocated sem_t is initialised with sem_init() and 'baseName' is unused.
+static sem_t* createSem(const char* baseName)
+{
+	static int semCount = 0;
+#ifdef NAMED_SEMAPHORES
+	/// Named semaphore begin
+	char name[32];
+	snprintf(name, 32, "/%s-%d-%4.4d", baseName, getpid(), semCount++);
+	sem_t* tempSem = sem_open(name, O_CREAT, 0600, 0);
+	if (tempSem == reinterpret_cast<sem_t *>(SEM_FAILED))
+	{
+		// report why we are bailing out instead of exiting silently
+		printf("Error creating Semaphore %d\n", errno);
+		exit(-1);
+	}
+	// Remove the name right away. The handle stays valid until sem_close(), but the
+	// system-wide entry no longer outlives the process (it used to leak on every run).
+	checkPThreadFunction(sem_unlink(name));
+	/// Named semaphore end
+#else
+	sem_t* tempSem = new sem_t;
+	checkPThreadFunction(sem_init(tempSem, 0, 0));
+#endif
+	return tempSem;
+}
+
+// Releases a semaphore obtained from createSem(): sem_close() for named semaphores,
+// otherwise sem_destroy() followed by deleting the heap-allocated sem_t.
+static void destroySem(sem_t* semaphore)
+{
+#ifdef NAMED_SEMAPHORES
+	checkPThreadFunction(sem_close(semaphore));
+#else
+	checkPThreadFunction(sem_destroy(semaphore));
+	delete semaphore;
+#endif	
+}
+
+// Worker thread entry point. 'argument' is the btSpuStatus slot that belongs to this worker.
+// The worker sleeps on its start semaphore; each wake-up either runs the user function on
+// m_userPtr and reports completion (m_status = 2), or - when m_userPtr is null - reports
+// m_status = 3 and leaves the loop. Completion is signalled through mainSemaphore.
+static void *threadFunction(void *argument)
+{
+	PosixThreadSupport::btSpuStatus* status = (PosixThreadSupport::btSpuStatus*)argument;
+
+	while (1)
+	{
+		checkPThreadFunction(sem_wait(status->startSemaphore));
+
+		void* userPtr = status->m_userPtr;
+
+		if (userPtr)
+		{
+			btAssert(status->m_status);
+			status->m_userThreadFunc(userPtr,status->m_lsMemory);
+			// Finish all bookkeeping BEFORE signalling: once mainSemaphore is posted the
+			// owner may reuse or tear down this slot, so it must not be touched afterwards.
+			status->threadUsed++;
+			status->m_status = 2;
+			checkPThreadFunction(sem_post(mainSemaphore));
+		} else {
+			//exit Thread
+			// same rule as above: read the slot for the log line before posting
+			printf("Thread with taskId %i exiting\n",status->m_taskId);
+			status->m_status = 3;
+			checkPThreadFunction(sem_post(mainSemaphore));
+			break;
+		}
+	}
+
+	printf("Thread TERMINATED\n");
+	return 0;
+}
+
+///Hand a task to the worker with index 'taskId'.
+///Only CMD_GATHER_AND_PROCESS_PAIRLIST is implemented: the slot is marked busy (m_status = 1),
+///'uiArgument0' is stored as the user pointer and the worker's start semaphore is posted.
+///Any other command triggers an assert and is otherwise ignored.
+void PosixThreadSupport::sendRequest(uint32_t uiCommand, ppu_address_t uiArgument0, uint32_t taskId)
+{
+	switch (uiCommand)
+	{
+	case 	CMD_GATHER_AND_PROCESS_PAIRLIST:
+		{
+			// validate the index before it is used (taskId is unsigned, so only the upper bound matters)
+			btAssert(taskId < (uint32_t)m_activeSpuStatus.size());
+			btSpuStatus&	spuStatus = m_activeSpuStatus[taskId];
+
+			spuStatus.m_commandId = uiCommand;
+			spuStatus.m_status = 1;
+			spuStatus.m_userPtr = (void*)uiArgument0;
+
+			// fire event to start new task
+			checkPThreadFunction(sem_post(spuStatus.startSemaphore));
+			break;
+		}
+	default:
+		{
+			///not implemented
+			btAssert(0);
+			break;
+		}
+
+	};
+}
+
+
+///Block until a worker signals completion, then report one finished task.
+///On return *puiArgument0 holds the task id of a slot whose status was 2 (finished) and
+///*puiArgument1 holds that slot's status after it has been reset, i.e. 0.
+///If no slot is marked finished, the outputs are left untouched (asserts in debug builds).
+void PosixThreadSupport::waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1)
+{
+	btAssert(m_activeSpuStatus.size());
+
+	// wait for any of the threads to finish
+	checkPThreadFunction(sem_wait(mainSemaphore));
+
+	// get at least one thread which has finished
+	int finished = -1;
+	for (int t = 0; t < m_activeSpuStatus.size(); ++t)
+	{
+		if (2 == m_activeSpuStatus[t].m_status)
+		{
+			finished = t;
+			break;
+		}
+	}
+
+	// need to find an active spu; check BEFORE indexing (a signed index makes the check meaningful)
+	btAssert(finished >= 0);
+	if (finished < 0)
+	{
+		// e.g. the post came from a worker that is exiting (status 3): never index out of range
+		return;
+	}
+
+	btSpuStatus& spuStatus = m_activeSpuStatus[finished];
+
+	btAssert(spuStatus.m_status > 1);
+	spuStatus.m_status = 0;
+
+	*puiArgument0 = spuStatus.m_taskId;
+	*puiArgument1 = spuStatus.m_status;
+}
+
+
+
+// Creates the completion semaphore and one worker thread per requested thread.
+// Every worker gets its own status slot, its own start semaphore and its own local-store
+// memory obtained from threadConstructionInfo.m_lsMemoryFunc().
+void PosixThreadSupport::startThreads(ThreadConstructionInfo& threadConstructionInfo)
+{
+	printf("%s creating %i threads.\n", __FUNCTION__, threadConstructionInfo.m_numThreads);
+	m_activeSpuStatus.resize(threadConstructionInfo.m_numThreads);
+
+	mainSemaphore = createSem("main");
+
+	for (int i=0;i < threadConstructionInfo.m_numThreads;i++)
+	{
+		printf("starting thread %d\n",i);
+
+		btSpuStatus&	spuStatus = m_activeSpuStatus[i];
+
+		// Fill in the whole slot first: the worker receives a pointer to it and must never
+		// be able to observe a half-initialised status.
+		spuStatus.m_userPtr=0;
+		spuStatus.m_taskId = i;
+		spuStatus.m_commandId = 0;
+		spuStatus.m_status = 0;
+		spuStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc();
+		spuStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
+		spuStatus.threadUsed = 0;
+		spuStatus.startSemaphore = createSem("threadLocal");
+
+		checkPThreadFunction(pthread_create(&spuStatus.thread, NULL, &threadFunction, (void*)&spuStatus));
+
+		printf("started thread %d \n",i);
+	}
+}
+
+// Intentionally empty: the worker threads are already created by startThreads() from the constructor.
+void PosixThreadSupport::startSPU()
+{
+}
+
+
+///tell the task scheduler we are done with the SPU tasks
+///Cancels and joins every worker, releases the per-worker and the completion semaphores and
+///empties the status array. Safe to call more than once (it is invoked by the destructor and
+///also by SpuCollisionTaskProcess's destructor).
+void PosixThreadSupport::stopSPU()
+{
+	for (int t = 0; t < m_activeSpuStatus.size(); ++t)
+	{
+		btSpuStatus&	spuStatus = m_activeSpuStatus[t];
+		printf("%s: Thread %i used: %lu\n", __FUNCTION__, t, spuStatus.threadUsed);
+
+		// Stop the worker first and wait until it is really gone. Only then is it safe to
+		// destroy the semaphore it may be blocked on and, below, the slot it points to.
+		checkPThreadFunction(pthread_cancel(spuStatus.thread));
+		checkPThreadFunction(pthread_join(spuStatus.thread, NULL));
+		destroySem(spuStatus.startSemaphore);
+	}
+
+	// guard against a second call destroying the same semaphore again
+	if (mainSemaphore)
+	{
+		destroySem(mainSemaphore);
+		mainSemaphore = 0;
+	}
+
+	m_activeSpuStatus.clear();
+}
+
+#endif // USE_PTHREADS
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/PosixThreadSupport.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/PosixThreadSupport.h
@ -0,0 +1,124 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "LinearMath/btScalar.h"
+#include "PlatformDefinitions.h"
+
+#ifdef USE_PTHREADS  //platform specific defines are defined in PlatformDefinitions.h
+#include <pthread.h>
+#include <semaphore.h>
+
+#ifndef POSIX_THREAD_SUPPORT_H
+#define POSIX_THREAD_SUPPORT_H
+
+#include "LinearMath/btAlignedObjectArray.h"
+
+#include "btThreadSupportInterface.h"
+
+
+typedef void (*PosixThreadFunc)(void* userPtr,void* lsMemory);
+typedef void* (*PosixlsMemorySetupFunc)();
+
+// PosixThreadSupport implements btThreadSupportInterface on top of pthreads and POSIX semaphores.
+// (The SPU/libspe2 wording in the method comments is inherited from the interface naming.)
+class PosixThreadSupport : public btThreadSupportInterface 
+{
+public:
+    // NOTE(review): this enum is not referenced by PosixThreadSupport.cpp, which stores the
+    // literal values 0..3 in btSpuStatus::m_status - confirm whether it is still needed.
+    typedef enum sStatus {
+        STATUS_BUSY,
+        STATUS_READY,
+        STATUS_FINISHED
+    } Status;
+
+	// placeholder, until libspe2 support is there
+	// Per-worker bookkeeping; one entry per thread, shared between the owner and the worker.
+	struct	btSpuStatus
+	{
+		uint32_t	m_taskId;		// index of this worker, assigned in startThreads()
+		uint32_t	m_commandId;	// last command passed to sendRequest()
+		uint32_t	m_status;		// as used by the .cpp: 0 idle, 1 task issued, 2 task finished, 3 worker exiting
+
+		PosixThreadFunc	m_userThreadFunc;	// function the worker runs for each task
+		void*	m_userPtr; //for taskDesc etc
+		void*	m_lsMemory; //initialized using PosixLocalStoreMemorySetupFunc
+
+                pthread_t thread;			// the worker thread itself
+                sem_t* startSemaphore;		// posted by sendRequest() to wake the worker
+
+        unsigned long threadUsed;		// number of tasks this worker has completed
+	};
+private:
+
+	btAlignedObjectArray<btSpuStatus>	m_activeSpuStatus;
+public:
+	///Setup and initialize SPU/CELL/Libspe2
+
+	
+
+	// Parameters for creating the worker threads.
+	struct	ThreadConstructionInfo
+	{
+		ThreadConstructionInfo(char* uniqueName,
+									PosixThreadFunc userThreadFunc,
+									PosixlsMemorySetupFunc	lsMemoryFunc,
+									int numThreads=1,
+									int threadStackSize=65535
+									)
+									:m_uniqueName(uniqueName),
+									m_userThreadFunc(userThreadFunc),
+									m_lsMemoryFunc(lsMemoryFunc),
+									m_numThreads(numThreads),
+									m_threadStackSize(threadStackSize)
+		{
+
+		}
+
+		char*					m_uniqueName;		// stored only; not read by PosixThreadSupport.cpp
+		PosixThreadFunc			m_userThreadFunc;	// task function run by every worker
+		PosixlsMemorySetupFunc	m_lsMemoryFunc;		// called once per worker to obtain its m_lsMemory
+		int						m_numThreads;		// number of workers to create
+		int						m_threadStackSize;	// stored only; not read by PosixThreadSupport.cpp
+
+	};
+
+	// Creates the workers by calling startThreads().
+	PosixThreadSupport(ThreadConstructionInfo& threadConstructionInfo);
+
+///cleanup: calls stopSPU()
+	virtual	~PosixThreadSupport();
+
+	// Creates the semaphores and one worker thread per requested thread.
+	void	startThreads(ThreadConstructionInfo&	threadInfo);
+
+
+///send messages to SPUs
+///(uiArgument1 is the index of the worker that should run the task)
+	virtual	void sendRequest(uint32_t uiCommand, ppu_address_t uiArgument0, uint32_t uiArgument1);
+
+///check for messages from SPUs
+///(blocks until a worker has finished; puiArgument0 receives its task id)
+	virtual	void waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1);
+
+///start the spus (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
+	virtual	void startSPU();
+
+///tell the task scheduler we are done with the SPU tasks
+	virtual	void stopSPU();
+
+	// No-op: the number of workers is fixed when the threads are created.
+	virtual void setNumTasks(int numTasks) {}
+
+	// Number of worker slots currently allocated.
+	virtual int getNumTasks() const
+	{
+		return m_activeSpuStatus.size();
+	}
+};
+
+#endif // POSIX_THREAD_SUPPORT_H
+
+#endif // USE_PTHREADS
--- a/Engine/lib/bullet/src/BulletMultiThreaded/PpuAddressSpace.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/PpuAddressSpace.h
@ -0,0 +1,18 @@
+// Defines ppu_address_t, the integer type used to carry addresses in task descriptions.
+// This header relies on uint32_t/uint64_t being declared by the including file
+// (PlatformDefinitions.h includes it after providing them).
+#ifndef __PPU_ADDRESS_SPACE_H
+#define __PPU_ADDRESS_SPACE_H
+
+
+#ifdef WIN32
+//stop those casting warnings until we have a better solution for ppu_address_t / void* / uint64 conversions
+#pragma warning (disable: 4311)
+#pragma warning (disable: 4312)
+#endif //WIN32
+
+// 64-bit addresses when USE_ADDR64 is defined, 32-bit otherwise.
+#ifdef USE_ADDR64
+typedef uint64_t ppu_address_t;
+#else
+typedef uint32_t ppu_address_t;
+#endif
+
+#endif
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SequentialThreadSupport.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SequentialThreadSupport.cpp
@ -0,0 +1,93 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SequentialThreadSupport.h"
+
+
+#include "SpuCollisionTaskProcess.h"
+#include "SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h"
+
+///Sets up the single task slot by calling startThreads(); no thread is created.
+SequentialThreadSupport::SequentialThreadSupport(SequentialThreadConstructionInfo& threadConstructionInfo)
+{
+	startThreads(threadConstructionInfo);
+}
+
+///cleanup: calls stopSPU(), which clears the task slot array
+SequentialThreadSupport::~SequentialThreadSupport()
+{
+	stopSPU();
+}
+
+#include <stdio.h>
+
+///Run a task immediately on the calling thread.
+///For CMD_GATHER_AND_PROCESS_PAIRLIST the user function of slot 0 is invoked with
+///'uiArgument0' as its user pointer; every other command only trips an assert.
+///The task id argument is not used because there is exactly one slot.
+void SequentialThreadSupport::sendRequest(uint32_t uiCommand, ppu_address_t uiArgument0, uint32_t taskId)
+{
+	if (uiCommand == CMD_GATHER_AND_PROCESS_PAIRLIST)
+	{
+		btSpuStatus& onlySlot = m_activeSpuStatus[0];
+		onlySlot.m_userPtr = (void*)uiArgument0;
+		// executes synchronously: the work is done by the time sendRequest returns
+		onlySlot.m_userThreadFunc(onlySlot.m_userPtr, onlySlot.m_lsMemory);
+		return;
+	}
+
+	///not implemented
+	btAssert(0 && "Not implemented");
+}
+
+///Report the task id and status of slot 0.
+///Never blocks: sendRequest() has already run the task before returning.
+void SequentialThreadSupport::waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1)
+{
+	btAssert(m_activeSpuStatus.size());
+
+	const btSpuStatus& onlySlot = m_activeSpuStatus[0];
+	const unsigned int reportedTask = onlySlot.m_taskId;
+	const unsigned int reportedStatus = onlySlot.m_status;
+
+	*puiArgument0 = reportedTask;
+	*puiArgument1 = reportedStatus;
+}
+
+///Prepare the single task slot: zero its bookkeeping fields, fetch the local-store memory
+///from m_lsMemoryFunc() and remember the user task function. No thread is started.
+void SequentialThreadSupport::startThreads(SequentialThreadConstructionInfo& threadConstructionInfo)
+{
+	m_activeSpuStatus.resize(1);
+	printf("STS: Not starting any threads\n");
+
+	btSpuStatus* slot = &m_activeSpuStatus[0];
+	slot->m_taskId = 0;
+	slot->m_commandId = 0;
+	slot->m_status = 0;
+	slot->m_userPtr = 0;
+
+	void* localStore = threadConstructionInfo.m_lsMemoryFunc();
+	slot->m_lsMemory = localStore;
+	slot->m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
+
+	printf("STS: Created local store at %p for task %s\n", slot->m_lsMemory, threadConstructionInfo.m_uniqueName);
+}
+
+///Intentionally empty: there is nothing to start in the sequential implementation.
+void SequentialThreadSupport::startSPU()
+{
+}
+
+///Drops the task slot array; sendRequest()/waitForResponse() must not be used afterwards.
+void SequentialThreadSupport::stopSPU()
+{
+	m_activeSpuStatus.clear();
+}
+
+///Not supported: only logs a message, the number of tasks stays at one.
+void SequentialThreadSupport::setNumTasks(int numTasks)
+{
+	printf("SequentialThreadSupport::setNumTasks(%d) is not implemented and has no effect\n",numTasks);
+}
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SequentialThreadSupport.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SequentialThreadSupport.h
@ -0,0 +1,92 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "LinearMath/btScalar.h"
+#include "PlatformDefinitions.h"
+
+
+#ifndef SEQUENTIAL_THREAD_SUPPORT_H
+#define SEQUENTIAL_THREAD_SUPPORT_H
+
+#include "LinearMath/btAlignedObjectArray.h"
+
+#include "btThreadSupportInterface.h"
+
+typedef void (*SequentialThreadFunc)(void* userPtr,void* lsMemory);
+typedef void* (*SequentiallsMemorySetupFunc)();
+
+
+
+///The SequentialThreadSupport is a portable non-parallel implementation of the btThreadSupportInterface
+///This is useful for debugging and porting SPU Tasks to other platforms.
+///Tasks are executed on the calling thread inside sendRequest().
+class SequentialThreadSupport : public btThreadSupportInterface 
+{
+public:
+	///Bookkeeping for the single task slot.
+	struct	btSpuStatus
+	{
+		uint32_t	m_taskId;		// always 0 (set in startThreads)
+		uint32_t	m_commandId;	// initialised to 0; not updated by SequentialThreadSupport.cpp
+		uint32_t	m_status;		// initialised to 0; reported by waitForResponse
+
+		SequentialThreadFunc	m_userThreadFunc;	// task function run by sendRequest
+
+		void*	m_userPtr; //for taskDesc etc
+		void*	m_lsMemory; //initialized using SequentiallsMemorySetupFunc
+	};
+private:
+	btAlignedObjectArray<btSpuStatus>	m_activeSpuStatus;
+	// NOTE(review): not referenced by SequentialThreadSupport.cpp - confirm whether still needed.
+	btAlignedObjectArray<void*>			m_completeHandles;	
+public:
+	///Parameters for setting up the task slot.
+	struct	SequentialThreadConstructionInfo
+	{
+		SequentialThreadConstructionInfo (char* uniqueName,
+									SequentialThreadFunc userThreadFunc,
+									SequentiallsMemorySetupFunc	lsMemoryFunc
+									)
+									:m_uniqueName(uniqueName),
+									m_userThreadFunc(userThreadFunc),
+									m_lsMemoryFunc(lsMemoryFunc)
+		{
+
+		}
+
+		char*						m_uniqueName;		// only used in a log message
+		SequentialThreadFunc		m_userThreadFunc;	// task function
+		SequentiallsMemorySetupFunc	m_lsMemoryFunc;		// called once to obtain m_lsMemory
+	};
+
+	///Sets up the task slot via startThreads().
+	SequentialThreadSupport(SequentialThreadConstructionInfo& threadConstructionInfo);
+	virtual	~SequentialThreadSupport();
+	///Initialises the single task slot; despite the name no thread is created.
+	void	startThreads(SequentialThreadConstructionInfo&	threadInfo);
+///send messages to SPUs
+///(runs the task synchronously on the calling thread)
+	virtual	void sendRequest(uint32_t uiCommand, ppu_address_t uiArgument0, uint32_t uiArgument1);
+///check for messages from SPUs
+///(returns immediately with the id/status of the single slot)
+	virtual	void waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1);
+///start the spus (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
+	virtual	void startSPU();
+///tell the task scheduler we are done with the SPU tasks
+	virtual	void stopSPU();
+
+	///Not supported; only prints a message.
+	virtual void setNumTasks(int numTasks);
+
+	///Always one task, since work is executed sequentially.
+	virtual int getNumTasks() const
+	{
+		return 1;
+	}
+
+};
+
+#endif //SEQUENTIAL_THREAD_SUPPORT_H
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuCollisionObjectWrapper.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuCollisionObjectWrapper.cpp
@ -0,0 +1,48 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuCollisionObjectWrapper.h"
+#include "BulletCollision/CollisionShapes/btCollisionShape.h"
+
+// Default constructor: deliberately does no work, so m_shapeType, m_margin and
+// m_collisionObjectPtr are left uninitialised until assigned.
+SpuCollisionObjectWrapper::SpuCollisionObjectWrapper ()
+{
+}
+
+#ifndef __SPU__
+// Captures the shape type, the collision margin and the address of 'collisionObject'.
+// Not compiled when __SPU__ is defined.
+SpuCollisionObjectWrapper::SpuCollisionObjectWrapper (const btCollisionObject* collisionObject)
+{
+	// look the shape up once and read both properties from it
+	const btCollisionShape* shape = collisionObject->getCollisionShape();
+
+	m_collisionObjectPtr = (ppu_address_t)collisionObject;
+	m_shapeType = shape->getShapeType ();
+	m_margin = shape->getMargin ();
+}
+#endif
+
+// Shape type captured from the wrapped object's collision shape.
+int
+SpuCollisionObjectWrapper::getShapeType () const
+{
+	return m_shapeType;
+}
+
+// Collision margin captured from the wrapped object's collision shape.
+float
+SpuCollisionObjectWrapper::getCollisionMargin () const
+{
+	return m_margin;
+}
+
+// Address of the wrapped btCollisionObject, stored as a ppu_address_t.
+ppu_address_t
+SpuCollisionObjectWrapper::getCollisionObjectPtr () const
+{
+	return m_collisionObjectPtr;
+}
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuCollisionObjectWrapper.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuCollisionObjectWrapper.h
@ -0,0 +1,40 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef SPU_COLLISION_OBJECT_WRAPPER_H
+#define SPU_COLLISION_OBJECT_WRAPPER_H
+
+#include "PlatformDefinitions.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+
+// Compact, 16-byte aligned snapshot of a btCollisionObject: its shape type, collision margin
+// and address (stored as an integer of type ppu_address_t).
+ATTRIBUTE_ALIGNED16(class) SpuCollisionObjectWrapper
+{
+protected:
+	int m_shapeType;						// value of getShapeType() of the object's collision shape
+	float m_margin;							// value of getMargin() of the object's collision shape
+	ppu_address_t m_collisionObjectPtr;		// address of the wrapped object
+
+public:
+	// Leaves all members uninitialised.
+	SpuCollisionObjectWrapper ();
+
+	// Captures shape type, margin and address of 'collisionObject' (defined only when __SPU__ is not set).
+	SpuCollisionObjectWrapper (const btCollisionObject* collisionObject);
+
+	int           getShapeType () const;
+	float         getCollisionMargin () const;
+	ppu_address_t getCollisionObjectPtr () const;
+};
+
+
+#endif //SPU_COLLISION_OBJECT_WRAPPER_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuCollisionTaskProcess.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuCollisionTaskProcess.cpp
@ -0,0 +1,318 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+//#define DEBUG_SPU_TASK_SCHEDULING 1
+
+
+//class OptimizedBvhNode;
+
+#include "SpuCollisionTaskProcess.h"
+
+
+
+
+///Resize the per-task bookkeeping and re-allocate the work-unit buffers for 'maxNumTasks'
+///outstanding tasks. Does nothing when the task count is unchanged. All tasks are marked idle.
+void	SpuCollisionTaskProcess::setNumTasks(int maxNumTasks)
+{
+	if (m_maxNumOutstandingTasks != maxNumTasks)
+	{
+		m_maxNumOutstandingTasks = maxNumTasks;
+		m_taskBusy.resize(m_maxNumOutstandingTasks);
+		m_spuGatherTaskDesc.resize(m_maxNumOutstandingTasks);
+
+		for (int i = 0; i < m_taskBusy.size(); i++)
+		{
+			m_taskBusy[i] = false;
+		}
+
+		///re-allocate task memory buffers
+		if (m_workUnitTaskBuffers != 0)
+		{
+			btAlignedFree(m_workUnitTaskBuffers);
+		}
+
+		// Allocate exactly once. The previous code allocated twice in a row, which leaked the
+		// first buffer and left room for only 6 tasks regardless of the requested count.
+		// The historical minimum of 6 task buffers is kept so the allocation never gets
+		// smaller than before.
+		const int minTaskBuffers = 6;
+		const int requestedTaskBuffers = int(m_maxNumOutstandingTasks);
+		const int numTaskBuffers = (requestedTaskBuffers > minTaskBuffers) ? requestedTaskBuffers : minTaskBuffers;
+		m_workUnitTaskBuffers = (unsigned char *)btAlignedAlloc(MIDPHASE_WORKUNIT_TASK_SIZE*numTaskBuffers, 128);
+	}
+}
+
+
+
+///Sets up the task buffers for 'maxNumOutstandingTasks' tasks, resets the scheduling state
+///and calls startSPU() on the thread interface. 'threadInterface' is stored, not owned.
+SpuCollisionTaskProcess::SpuCollisionTaskProcess(class	btThreadSupportInterface*	threadInterface, unsigned int	maxNumOutstandingTasks)
+:m_threadInterface(threadInterface),
+m_maxNumOutstandingTasks(0)
+{
+	// must be null before setNumTasks(), which frees any previous buffer
+	m_workUnitTaskBuffers = (unsigned char *)0;
+	setNumTasks(maxNumOutstandingTasks);
+	m_numBusyTasks = 0;
+	m_currentTask = 0;
+	m_currentPage = 0;
+	m_currentPageEntry = 0;
+	// defined value until initialize2() supplies the real setting
+	m_useEpa = false;
+
+#ifdef DEBUG_SpuCollisionTaskProcess
+	m_initialized = false;
+#endif
+
+	m_threadInterface->startSPU();
+
+	// sizeof yields a size_t; convert explicitly so the argument matches the %d conversion
+	printf("sizeof SpuGatherAndProcessWorkUnitInput: %d\n", int(sizeof(SpuGatherAndProcessWorkUnitInput)));
+
+}
+
+///Frees the work-unit buffers (if any) and then calls stopSPU() on the thread interface.
+SpuCollisionTaskProcess::~SpuCollisionTaskProcess()
+{
+	if (m_workUnitTaskBuffers)
+	{
+		btAlignedFree(m_workUnitTaskBuffers);
+		m_workUnitTaskBuffers = 0;
+	}
+
+	m_threadInterface->stopSPU();
+}
+
+
+
+///Reset the scheduling state before a new batch of work: every task is marked idle, the
+///write position returns to task 0 / page 0 / entry 0 and the EPA setting is stored.
+void SpuCollisionTaskProcess::initialize2(bool useEpa)
+{
+
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("SpuCollisionTaskProcess::initialize()\n");
+#endif //DEBUG_SPU_TASK_SCHEDULING
+
+	m_useEpa = useEpa;
+
+	// rewind the write position
+	m_currentTask = 0;
+	m_currentPage = 0;
+	m_currentPageEntry = 0;
+
+	// nothing is in flight any more
+	m_numBusyTasks = 0;
+	const int taskCount = int (m_maxNumOutstandingTasks);
+	for (int task = 0; task < taskCount; ++task)
+	{
+		m_taskBusy[task] = false;
+	}
+
+#ifdef DEBUG_SpuCollisionTaskProcess
+	m_initialized = true;
+	btAssert(MIDPHASE_NUM_WORKUNITS_PER_TASK*sizeof(SpuGatherAndProcessWorkUnitInput) <= MIDPHASE_WORKUNIT_TASK_SIZE);
+#endif
+}
+
+
+///Submit the current task buffer to the thread interface.
+///The task is marked busy and its description (buffer address, task id, page counts, EPA flag)
+///is filled in and sent. If that makes every task busy, the call blocks until the thread
+///interface reports one task as finished and marks that task idle again.
+void SpuCollisionTaskProcess::issueTask2()
+{
+
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("SpuCollisionTaskProcess::issueTask (m_currentTask= %d\n)", m_currentTask);
+#endif //DEBUG_SPU_TASK_SCHEDULING
+
+	m_taskBusy[m_currentTask] = true;
+	m_numBusyTasks++;
+
+
+	SpuGatherAndProcessPairsTaskDesc& taskDesc = m_spuGatherTaskDesc[m_currentTask];
+	taskDesc.m_useEpa = m_useEpa;
+
+	{
+		// send task description in event message
+		// no error checking here...
+		// but, currently, event queue can be no larger than NUM_WORKUNIT_TASKS.
+	
+		taskDesc.m_inPairPtr = reinterpret_cast<uint64_t>(MIDPHASE_TASK_PTR(m_currentTask));
+	
+		taskDesc.taskId = m_currentTask;
+		taskDesc.numPages = m_currentPage+1;
+		taskDesc.numOnLastPage = m_currentPageEntry;
+	}
+
+
+
+	m_threadInterface->sendRequest(CMD_GATHER_AND_PROCESS_PAIRLIST, (ppu_address_t) &taskDesc,m_currentTask);
+
+	// if all tasks busy, wait for spu event to clear the task.
+	
+
+	if (m_numBusyTasks >= m_maxNumOutstandingTasks)
+	{
+		// both outputs start with defined values; they used to be read uninitialised
+		// whenever the thread interface did not overwrite them
+		unsigned int taskId = 0;
+		unsigned int outputSize = 0;
+		bool foundBusyTask = false;
+
+		// default to the first busy task
+		for (int i=0;i<int (m_maxNumOutstandingTasks);i++)
+		{
+			if (m_taskBusy[i])
+			{
+				taskId = i;
+				foundBusyTask = true;
+				break;
+			}
+		}
+
+		// the old check 'taskId>=0' could never fail for an unsigned value
+		btAssert(foundBusyTask);
+		(void)foundBusyTask; // btAssert may compile to nothing
+
+		m_threadInterface->waitForResponse(&taskId, &outputSize);
+
+		// the reported id is used as an index right below
+		btAssert(taskId < (unsigned int)m_maxNumOutstandingTasks);
+
+		m_taskBusy[taskId] = false;
+
+		m_numBusyTasks--;
+	}
+	
+}
+
+///Append one work unit (a range [startIndex, endIndex] of the pair array at 'pairArrayPtr';
+///the exact range convention is defined by the consumer) to the current task buffer.
+///Work units are written entry by entry into pages; when the last page of the current task
+///is full the task is issued and writing continues in the first idle task buffer.
+void SpuCollisionTaskProcess::addWorkToTask(void* pairArrayPtr,int startIndex,int endIndex)
+{
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("#");
+#endif //DEBUG_SPU_TASK_SCHEDULING
+	
+#ifdef DEBUG_SpuCollisionTaskProcess
+	btAssert(m_initialized);
+	btAssert(m_workUnitTaskBuffers);
+
+#endif
+
+	// always true here; the '!batch' path below (one task per work unit) is therefore never taken
+	bool batch = true;
+
+	if (batch)
+	{
+		// current page is full: advance to the next page, or to a new task if this was the last page
+		if (m_currentPageEntry == MIDPHASE_NUM_WORKUNITS_PER_PAGE)
+		{
+			if (m_currentPage == MIDPHASE_NUM_WORKUNIT_PAGES-1)
+			{
+				// task buffer is full, issue current task.
+				// if all task buffers busy, this waits until SPU is done.
+				issueTask2();
+
+				// find new task buffer
+				// (if no task is idle, m_currentTask keeps its previous value)
+				for (unsigned int i = 0; i < m_maxNumOutstandingTasks; i++)
+				{
+					if (!m_taskBusy[i])
+					{
+						m_currentTask = i;
+						//init the task data
+
+						break;
+					}
+				}
+
+				m_currentPage = 0;
+			}
+			else
+			{
+				m_currentPage++;
+			}
+
+			m_currentPageEntry = 0;
+		}
+	}
+
+	{
+
+
+
+		// slot for this work unit inside the current task / page
+		SpuGatherAndProcessWorkUnitInput &wuInput = 
+			*(reinterpret_cast<SpuGatherAndProcessWorkUnitInput*>
+			(MIDPHASE_ENTRY_PTR(m_currentTask, m_currentPage, m_currentPageEntry)));
+		
+		wuInput.m_pairArrayPtr = reinterpret_cast<uint64_t>(pairArrayPtr);
+		wuInput.m_startIndex = startIndex;
+		wuInput.m_endIndex = endIndex;
+
+		
+	
+		m_currentPageEntry++;
+
+		if (!batch)
+		{
+			issueTask2();
+
+			// find new task buffer
+			for (unsigned int i = 0; i < m_maxNumOutstandingTasks; i++)
+			{
+				if (!m_taskBusy[i])
+				{
+					m_currentTask = i;
+					//init the task data
+
+					break;
+				}
+			}
+
+			m_currentPage = 0;
+			m_currentPageEntry =0;
+		}
+	}
+}
+
+
+///Issue the partially filled task buffer (if it holds any work) and then block until every
+///outstanding task has been reported as finished by the thread interface.
+void 
+SpuCollisionTaskProcess::flush2()
+{
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("\nSpuCollisionTaskProcess::flush()\n");
+#endif //DEBUG_SPU_TASK_SCHEDULING
+	
+	// if there's a partially filled task buffer, submit that task
+	if (m_currentPage > 0 || m_currentPageEntry > 0)
+	{
+		issueTask2();
+	}
+
+
+	// all tasks are issued, wait for all tasks to be complete
+	while(m_numBusyTasks > 0)
+	{
+		// Consolidating SPU code
+		unsigned int taskId = 0;
+		unsigned int outputSize = 0;
+		bool foundBusyTask = false;
+
+		// default to the first busy task
+		for (int i=0;i<int (m_maxNumOutstandingTasks);i++)
+		{
+			if (m_taskBusy[i])
+			{
+				taskId = i;
+				foundBusyTask = true;
+				break;
+			}
+		}
+
+		// the old check 'taskId>=0' could never fail for an unsigned value
+		btAssert(foundBusyTask);
+		(void)foundBusyTask; // btAssert may compile to nothing
+
+		// SPURS support.
+		m_threadInterface->waitForResponse(&taskId, &outputSize);
+
+		// the reported id is used as an index right below
+		btAssert(taskId < (unsigned int)m_maxNumOutstandingTasks);
+
+		m_taskBusy[taskId] = false;
+
+		m_numBusyTasks--;
+	}
+
+
+}
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuCollisionTaskProcess.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuCollisionTaskProcess.h
@ -0,0 +1,163 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef SPU_COLLISION_TASK_PROCESS_H
+#define SPU_COLLISION_TASK_PROCESS_H
+
+#include <assert.h>
+
+#include <LinearMath/btScalar.h>
+
+#include "PlatformDefinitions.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h" // for definitions processCollisionTask and createCollisionLocalStoreMemory
+
+#include "btThreadSupportInterface.h"
+
+
+//#include "SPUAssert.h"
+#include <string.h>
+
+
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "BulletCollision/CollisionShapes/btCollisionShape.h"
+#include "BulletCollision/CollisionShapes/btConvexShape.h"
+
+#include <LinearMath/btAlignedAllocator.h>
+
+#include <stdio.h>
+
+
+#define DEBUG_SpuCollisionTaskProcess 1
+
+
+#define CMD_GATHER_AND_PROCESS_PAIRLIST	1
+
+class btCollisionObject;
+class btPersistentManifold;
+class btDispatcher;
+
+
+/////Task Description for SPU collision detection
+//struct SpuGatherAndProcessPairsTaskDesc
+//{
+//	uint64_t	inPtr;//m_pairArrayPtr;
+//	//mutex variable
+//	uint32_t	m_someMutexVariableInMainMemory;
+//
+//	uint64_t	m_dispatcher;
+//
+//	uint32_t	numOnLastPage;
+//
+//	uint16_t numPages;
+//	uint16_t taskId;
+//
+//	struct	CollisionTask_LocalStoreMemory*	m_lsMemory; 
+//}
+//
+//#if  defined(__CELLOS_LV2__) || defined(USE_LIBSPE2)
+//__attribute__ ((aligned (16)))
+//#endif
+//;
+
+
+///SpuGatherAndProcessWorkUnitInput describes one work unit: a pair array address plus an index range into it.
+///The struct is declared with 16-byte alignment; work units are written into the task buffers through MIDPHASE_ENTRY_PTR.
+ATTRIBUTE_ALIGNED16(struct) SpuGatherAndProcessWorkUnitInput
+{
+	/// address of the pair array, stored as a 64-bit integer
+	uint64_t m_pairArrayPtr;
+	/// index range within the pair array. NOTE(review): whether m_endIndex is inclusive is not visible here — confirm against the consumer
+	int		m_startIndex;
+	int		m_endIndex;
+};
+
+
+
+
+/// SpuCollisionTaskProcess handles SPU processing of collision pairs.
+/// Maintains a set of task buffers.
+/// When the task is full, the task is issued for SPUs to process.  Contact output goes into btPersistentManifold
+/// associated with each task.
+/// When PPU issues a task, it will look for completed task buffers
+/// PPU will do postprocessing, dependent on workunit output (not likely)
+class SpuCollisionTaskProcess
+{
+
+	// raw storage for the work units of all tasks; addressed through the MIDPHASE_*_PTR macros
+  unsigned char  *m_workUnitTaskBuffers;
+
+
+	// track task buffers that are being used, and total busy tasks
+	btAlignedObjectArray<bool>	m_taskBusy;
+	btAlignedObjectArray<SpuGatherAndProcessPairsTaskDesc>	m_spuGatherTaskDesc;
+
+	// interface used to wait for task completion (see flush2)
+	class	btThreadSupportInterface*	m_threadInterface;
+
+	// number of task slots; upper bound for indices into m_taskBusy
+	unsigned int	m_maxNumOutstandingTasks;
+
+	// number of tasks that have been issued and not yet reported complete
+	unsigned int   m_numBusyTasks;
+
+	// the current task and the current entry to insert a new work unit
+	unsigned int   m_currentTask;
+	unsigned int   m_currentPage;
+	unsigned int   m_currentPageEntry;
+
+	// value passed to initialize2(); NOTE(review): how it is consumed is not visible in this header
+	bool m_useEpa;
+
+#ifdef DEBUG_SpuCollisionTaskProcess
+	bool m_initialized;
+#endif
+	// submits the current task buffer for processing
+	void issueTask2();
+	//void postProcess(unsigned int taskId, int outputSize);
+
+public:
+	SpuCollisionTaskProcess(btThreadSupportInterface*	threadInterface, unsigned int maxNumOutstandingTasks);
+	
+	~SpuCollisionTaskProcess();
+	
+	///call initialize2 in the beginning of the frame, before addWorkToTask
+	void initialize2(bool useEpa = false);
+
+	///batch up additional work to a current task for SPU processing. When batch is full, it issues the task.
+	void addWorkToTask(void* pairArrayPtr,int startIndex,int endIndex);
+
+	///call flush2 to submit potential outstanding work to SPUs and wait for all involved SPUs to be finished
+	void flush2();
+
+	/// set the maximum number of SPU tasks allocated
+	void	setNumTasks(int maxNumTasks);
+
+	/// returns the maximum number of outstanding tasks (the unsigned member converted to int)
+	int		getNumTasks() const
+	{
+		return m_maxNumOutstandingTasks;
+	}
+};
+
+
+
+// Address helpers for the per-task work-unit buffers. They expand inside code that has the
+// referenced members (e.g. m_workUnitTaskBuffers) in scope.
+// Every macro argument is parenthesized so that expression arguments such as `i + 1`
+// are scaled as a whole instead of being split by operator precedence.
+#define MIDPHASE_TASK_PTR(task) (&m_workUnitTaskBuffers[0] + MIDPHASE_WORKUNIT_TASK_SIZE*(task))
+#define MIDPHASE_ENTRY_PTR(task,page,entry) (MIDPHASE_TASK_PTR(task) + MIDPHASE_WORKUNIT_PAGE_SIZE*(page) + sizeof(SpuGatherAndProcessWorkUnitInput)*(entry))
+#define MIDPHASE_OUTPUT_PTR(task) (&m_contactOutputBuffers[0] + MIDPHASE_MAX_CONTACT_BUFFER_SIZE*(task))
+#define MIDPHASE_TREENODES_PTR(task) (&m_complexShapeBuffers[0] + MIDPHASE_COMPLEX_SHAPE_BUFFER_SIZE*(task))
+
+
+// Size in bytes of one work-unit page.
+#define MIDPHASE_WORKUNIT_PAGE_SIZE (16)
+//#define MIDPHASE_WORKUNIT_PAGE_SIZE (128)
+
+// Derived sizes: pages per task, bytes per task, and work units per page / per task.
+#define MIDPHASE_NUM_WORKUNIT_PAGES 1
+#define MIDPHASE_WORKUNIT_TASK_SIZE (MIDPHASE_WORKUNIT_PAGE_SIZE*MIDPHASE_NUM_WORKUNIT_PAGES)
+#define MIDPHASE_NUM_WORKUNITS_PER_PAGE (MIDPHASE_WORKUNIT_PAGE_SIZE / sizeof(SpuGatherAndProcessWorkUnitInput))
+#define MIDPHASE_NUM_WORKUNITS_PER_TASK (MIDPHASE_NUM_WORKUNITS_PER_PAGE*MIDPHASE_NUM_WORKUNIT_PAGES)
+
+
+#endif // SPU_COLLISION_TASK_PROCESS_H
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.cpp
@ -0,0 +1,69 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuContactManifoldCollisionAlgorithm.h"
+#include "BulletCollision/CollisionDispatch/btCollisionDispatcher.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "BulletCollision/CollisionShapes/btCollisionShape.h"
+#include "BulletCollision/CollisionShapes/btPolyhedralConvexShape.h"
+
+
+
+
+/// This implementation performs no collision work: it only trips an unconditional assert.
+void SpuContactManifoldCollisionAlgorithm::processCollision(
+	btCollisionObject* body0,
+	btCollisionObject* body1,
+	const btDispatcherInfo& dispatchInfo,
+	btManifoldResult* resultOut)
+{
+	// none of the arguments are used; reaching this call is treated as an error
+	btAssert(0);
+}
+
+/// Trips an unconditional assert; when asserts are compiled out it returns 1.
+btScalar SpuContactManifoldCollisionAlgorithm::calculateTimeOfImpact(
+	btCollisionObject* body0,
+	btCollisionObject* body1,
+	const btDispatcherInfo& dispatchInfo,
+	btManifoldResult* resultOut)
+{
+	// none of the arguments are used
+	btAssert(0);
+
+	const btScalar result = 1.f;
+	return result;
+}
+
+#ifndef __SPU__
+/// Requests a manifold for the body pair from the dispatcher and caches, per body,
+/// the shape type, collision margin and object pointer. For polyhedral shapes the
+/// implicit shape dimensions are cached as well; otherwise those members are left untouched.
+SpuContactManifoldCollisionAlgorithm::SpuContactManifoldCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1)
+:btCollisionAlgorithm(ci)
+#ifdef USE_SEPDISTANCE_UTIL
+,m_sepDistance(body0->getCollisionShape()->getAngularMotionDisc(),body1->getCollisionShape()->getAngularMotionDisc())
+#endif //USE_SEPDISTANCE_UTIL
+{
+	m_manifoldPtr = m_dispatcher->getNewManifold(body0,body1);
+
+	const btCollisionShape* shape0 = body0->getCollisionShape();
+	const btCollisionShape* shape1 = body1->getCollisionShape();
+
+	// first body
+	m_collisionObject0 = body0;
+	m_shapeType0 = shape0->getShapeType();
+	m_collisionMargin0 = shape0->getMargin();
+
+	// second body
+	m_collisionObject1 = body1;
+	m_shapeType1 = shape1->getShapeType();
+	m_collisionMargin1 = shape1->getMargin();
+
+	// implicit dimensions are only read from shapes that report themselves as polyhedral
+	if (shape0->isPolyhedral())
+	{
+		btPolyhedralConvexShape* polyhedral0 = (btPolyhedralConvexShape*)shape0;
+		m_shapeDimensions0 = polyhedral0->getImplicitShapeDimensions();
+	}
+	if (shape1->isPolyhedral())
+	{
+		btPolyhedralConvexShape* polyhedral1 = (btPolyhedralConvexShape*)shape1;
+		m_shapeDimensions1 = polyhedral1->getImplicitShapeDimensions();
+	}
+}
+#endif //__SPU__
+
+
+/// Hands the manifold back to the dispatcher when one is held.
+SpuContactManifoldCollisionAlgorithm::~SpuContactManifoldCollisionAlgorithm()
+{
+	if (!m_manifoldPtr)
+	{
+		return;
+	}
+
+	m_dispatcher->releaseManifold(m_manifoldPtr);
+}
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuContactManifoldCollisionAlgorithm.h
@ -0,0 +1,120 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef SPU_CONTACTMANIFOLD_COLLISION_ALGORITHM_H
+#define SPU_CONTACTMANIFOLD_COLLISION_ALGORITHM_H
+
+#include "BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h"
+#include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h"
+#include "BulletCollision/CollisionDispatch/btCollisionCreateFunc.h"
+#include "BulletCollision/BroadphaseCollision/btDispatcher.h"
+#include "LinearMath/btTransformUtil.h"
+
+class btPersistentManifold;
+
+//#define USE_SEPDISTANCE_UTIL 1
+
+/// SpuContactManifoldCollisionAlgorithm  provides contact manifold and should be processed on SPU.
+/// It caches per-body shape data (type, margin, implicit dimensions) next to the manifold pointer
+/// and exposes them through the accessors below. The class is declared with 16-byte alignment.
+ATTRIBUTE_ALIGNED16(class) SpuContactManifoldCollisionAlgorithm : public btCollisionAlgorithm
+{
+	// implicit shape dimensions; only written by the constructor for polyhedral shapes
+	btVector3	m_shapeDimensions0;
+	btVector3	m_shapeDimensions1;
+	// manifold obtained from the dispatcher in the constructor and released in the destructor
+	btPersistentManifold*	m_manifoldPtr;
+	int		m_shapeType0;
+	int		m_shapeType1;
+	float	m_collisionMargin0;
+	float	m_collisionMargin1;
+
+	btCollisionObject*	m_collisionObject0;
+	btCollisionObject*	m_collisionObject1;
+	
+	
+
+	
+public:
+	
+	/// the out-of-line definition only asserts
+	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+
+	/// the out-of-line definition asserts and returns 1
+	virtual btScalar calculateTimeOfImpact(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+
+	
+	SpuContactManifoldCollisionAlgorithm(const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1);
+#ifdef USE_SEPDISTANCE_UTIL
+	btConvexSeparatingDistanceUtil	m_sepDistance;
+#endif //USE_SEPDISTANCE_UTIL
+
+	virtual ~SpuContactManifoldCollisionAlgorithm();
+
+	/// appends the manifold to manifoldArray when one is held
+	virtual	void	getAllContactManifolds(btManifoldArray&	manifoldArray)
+	{
+		if (m_manifoldPtr)
+			manifoldArray.push_back(m_manifoldPtr);
+	}
+
+	btPersistentManifold*	getContactManifoldPtr()
+	{
+		return m_manifoldPtr;
+	}
+
+	btCollisionObject*	getCollisionObject0()
+	{
+		return m_collisionObject0;
+	}
+	
+	btCollisionObject*	getCollisionObject1()
+	{
+		return m_collisionObject1;
+	}
+
+	int		getShapeType0() const
+	{
+		return m_shapeType0;
+	}
+
+	int		getShapeType1() const
+	{
+		return m_shapeType1;
+	}
+	float	getCollisionMargin0() const
+	{
+		return m_collisionMargin0;
+	}
+	float	getCollisionMargin1() const
+	{
+		return m_collisionMargin1;
+	}
+
+	const btVector3&	getShapeDimensions0() const
+	{
+		return m_shapeDimensions0;
+	}
+
+	const btVector3&	getShapeDimensions1() const
+	{
+		return m_shapeDimensions1;
+	}
+
+	/// Factory that placement-constructs the algorithm in memory supplied by the dispatcher.
+	struct CreateFunc :public 	btCollisionAlgorithmCreateFunc
+	{
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		{
+			// storage comes from the dispatcher's allocator, hence placement new below
+			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(SpuContactManifoldCollisionAlgorithm));
+			return new(mem) SpuContactManifoldCollisionAlgorithm(ci,body0,body1);
+		}
+	};
+
+};
+
+#endif //SPU_CONTACTMANIFOLD_COLLISION_ALGORITHM_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuDoubleBuffer.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuDoubleBuffer.h
@ -0,0 +1,110 @@
+#ifndef DOUBLE_BUFFER_H
+#define DOUBLE_BUFFER_H
+
+#include "SpuFakeDma.h"
+#include <LinearMath/btScalar.h>
+
+
+///DoubleBuffer holds two fixed-size arrays of T and two pointers (front/back) that are exchanged by swapBuffers().
+///DMA transfers are started on the back buffer; swapBuffers() waits for a pending transfer before exchanging.
+template<class T, int size>
+class DoubleBuffer
+{
+#if defined(__SPU__) || defined(USE_LIBSPE2)
+	// on SPU / libspe2 builds the storage is declared with 128-byte alignment
+	ATTRIBUTE_ALIGNED128( T m_buffer0[size] ) ;
+	ATTRIBUTE_ALIGNED128( T m_buffer1[size] ) ;
+#else
+	T m_buffer0[size];
+	T m_buffer1[size];
+#endif
+	
+	// NOTE: backBufferDmaGet may repoint m_backBuffer at whatever cellDmaLargeGetReadOnly returns,
+	// so these pointers do not necessarily refer to m_buffer0/m_buffer1
+	T *m_frontBuffer;
+	T *m_backBuffer;
+
+	// tag of the most recently started transfer; not set by init()
+	unsigned int m_dmaTag;
+	// true between starting a transfer and the next swapBuffers()
+	bool m_dmaPending;
+public:
+	bool	isPending() const { return m_dmaPending;}
+	DoubleBuffer();
+
+	void init ();
+
+	// dma get and put commands
+	void backBufferDmaGet(uint64_t ea, unsigned int numBytes, unsigned int tag);
+	void backBufferDmaPut(uint64_t ea, unsigned int numBytes, unsigned int tag);
+
+	// gets pointer to a buffer
+	T *getFront();
+	T *getBack();
+
+	// if back buffer dma was started, wait for it to complete
+	// then move back to front and vice versa
+	T *swapBuffers();
+};
+
+/// Construction simply delegates to init().
+template<class T, int size>
+DoubleBuffer<T,size>::DoubleBuffer()
+{
+	this->init();
+}
+
+/// Points front/back at the two internal arrays and clears the pending flag.
+/// The DMA tag is left as it is.
+template<class T, int size>
+void DoubleBuffer<T,size>::init()
+{
+	m_frontBuffer = m_buffer0;
+	m_backBuffer = m_buffer1;
+	m_dmaPending = false;
+}
+
+/// Records the tag, marks a transfer as pending and, for a non-zero byte count,
+/// starts a read-only get; the back pointer is replaced by whatever the get returns.
+template<class T, int size>
+void DoubleBuffer<T,size>::backBufferDmaGet(uint64_t ea, unsigned int numBytes, unsigned int tag)
+{
+	m_dmaTag = tag;
+	m_dmaPending = true;
+
+	// a zero-byte request still counts as pending but transfers nothing
+	if (numBytes == 0)
+	{
+		return;
+	}
+
+	void* fetched = cellDmaLargeGetReadOnly(m_backBuffer, ea, numBytes, tag, 0, 0);
+	m_backBuffer = (T*)fetched;
+}
+
+/// Records the tag, marks a transfer as pending and writes the back buffer out to `ea`.
+template<class T, int size>
+void DoubleBuffer<T,size>::backBufferDmaPut(uint64_t ea, unsigned int numBytes, unsigned int tag)
+{
+	m_dmaTag = tag;
+	m_dmaPending = true;
+
+	cellDmaLargePut(m_backBuffer, ea, numBytes, tag, 0, 0);
+}
+
+/// Returns the current front pointer.
+template<class T, int size>
+T* DoubleBuffer<T,size>::getFront()
+{
+	T* const front = m_frontBuffer;
+	return front;
+}
+
+/// Returns the current back pointer.
+template<class T, int size>
+T* DoubleBuffer<T,size>::getBack()
+{
+	T* const back = m_backBuffer;
+	return back;
+}
+
+/// Waits for a pending transfer (if any), exchanges the front and back pointers
+/// and returns the new front.
+template<class T, int size>
+T* DoubleBuffer<T,size>::swapBuffers()
+{
+	if (m_dmaPending)
+	{
+		// kept inside its own braces: on some builds the wait is a multi-statement macro
+		cellDmaWaitTagStatusAll(1<<m_dmaTag);
+		m_dmaPending = false;
+	}
+
+	T* const previousFront = m_frontBuffer;
+	m_frontBuffer = m_backBuffer;
+	m_backBuffer = previousFront;
+
+	return m_frontBuffer;
+}
+
+#endif
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuFakeDma.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuFakeDma.cpp
@ -0,0 +1,211 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuFakeDma.h"
+#include <LinearMath/btScalar.h> //for btAssert
+//Disabling memcpy sometimes helps debugging DMA
+
+#define USE_MEMCPY 1
+#ifdef USE_MEMCPY
+
+#endif
+
+
+/// Read-only "large get". On SPU / libspe2 builds the data is transferred into `ls` and `ls` is returned.
+/// On every other platform no copy is made: the effective address itself is returned as a pointer.
+void*	cellDmaLargeGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
+{
+
+#if defined (__SPU__) || defined (USE_LIBSPE2)
+	cellDmaLargeGet(ls,ea,size,tag,tid,rid);
+	return ls;
+#else
+	// Convert the full 64-bit value: going through uint32_t first would drop the upper
+	// half of the address on 64-bit hosts. The other host helpers in this file cast `ea` directly too.
+	return (void*)ea;
+#endif
+}
+
+/// Read-only "small get". On SPU / libspe2 builds the data is fetched into `ls` with mfc_get
+/// (tid and rid are passed as 0) and `ls` is returned.
+/// On every other platform no copy is made: the effective address itself is returned as a pointer.
+void*	cellDmaSmallGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
+{
+#if defined (__SPU__) || defined (USE_LIBSPE2)
+	mfc_get(ls,ea,size,tag,0,0);
+	return ls;
+#else
+	// Convert the full 64-bit value: going through uint32_t first would drop the upper
+	// half of the address on 64-bit hosts.
+	return (void*)ea;
+#endif
+}
+
+
+
+
+/// Read-only get. On SPU / libspe2 builds the data is transferred into `ls` and `ls` is returned.
+/// On every other platform no copy is made: the effective address itself is returned as a pointer.
+void*	cellDmaGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
+{
+#if defined (__SPU__) || defined (USE_LIBSPE2)
+	cellDmaGet(ls,ea,size,tag,tid,rid);
+	return ls;
+#else
+	// Convert the full 64-bit value: going through uint32_t first would drop the upper
+	// half of the address on 64-bit hosts.
+	return (void*)ea;
+#endif
+}
+
+
+///this unalignedDma should not be frequently used, only for small data. It handles alignment and performs check on size (<32 bytes)
+///The data is first fetched into a local staging buffer at the same 16-byte offset as `ea`,
+///the transfer is waited for, and the bytes are then copied into `ls`. Always returns 0.
+int stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size)
+{
+	
+	btAssert(size<32);
+	
+	// The staging area must hold the largest accepted size (31 bytes) starting at the
+	// largest possible offset (15), i.e. 46 bytes; a 32-byte buffer would be overrun.
+	ATTRIBUTE_ALIGNED16(char	tmpBuffer[32 + 16]);
+
+
+	char* localStore = (char*)ls;
+	uint32_t i;
+	
+
+	///make sure last 4 bits are the same, for cellDmaSmallGet
+	uint32_t last4BitsOffset = ea & 0x0f;
+	char* tmpTarget = tmpBuffer + last4BitsOffset;
+	
+#if defined (__SPU__) || defined (USE_LIBSPE2)
+	
+	int remainingSize = size;
+
+//#define FORCE_cellDmaUnalignedGet 1
+#ifdef FORCE_cellDmaUnalignedGet
+	cellDmaUnalignedGet(tmpTarget,ea,size,DMA_TAG(1),0,0);
+#else
+	char* remainingTmpTarget = tmpTarget;
+	uint64_t remainingEa = ea;
+
+	// split the request into transfers of 16, 8, 4, 2 or 1 bytes
+	while (remainingSize)
+	{
+		switch (remainingSize)
+		{
+		case 1:
+		case 2:
+		case 4:
+		case 8:
+		case 16:
+			{
+				// the remainder is itself one of the listed sizes: fetch it in one go
+				mfc_get(remainingTmpTarget,remainingEa,remainingSize,DMA_TAG(1),0,0);
+				remainingSize=0;
+				break;
+			}
+		default:
+			{
+				//spu_printf("unaligned DMA with non-natural size:%d\n",remainingSize);
+				// take the largest listed size strictly below the remainder
+				int actualSize = 0;
+
+				if (remainingSize > 16)
+					actualSize = 16;
+				else
+					if (remainingSize >8)
+						actualSize=8;
+					else
+						if (remainingSize >4)
+							actualSize=4;
+						else
+							if (remainingSize >2)
+								actualSize=2;
+				mfc_get(remainingTmpTarget,remainingEa,actualSize,DMA_TAG(1),0,0);
+				remainingSize-=actualSize;
+				remainingTmpTarget+=actualSize;
+				remainingEa += actualSize;
+			}
+		}
+	}
+#endif//FORCE_cellDmaUnalignedGet
+
+#else
+	// host build: main memory is directly addressable
+	char* mainMem = (char*)ea;
+	//copy into final destination
+#ifdef USE_MEMCPY
+		
+		memcpy(tmpTarget,mainMem,size);
+#else
+		for ( i=0;i<size;i++)
+		{
+			tmpTarget[i] = mainMem[i];
+		}
+#endif //USE_MEMCPY
+
+#endif
+
+	cellDmaWaitTagStatusAll(DMA_MASK(1));
+
+	//this is slowish, perhaps memcpy on SPU is smarter?
+	for (i=0; btLikely( i<size );i++)
+	{
+		localStore[i] = tmpTarget[i];
+	}
+
+	return 0;
+}
+
+#if defined (__SPU__) || defined (USE_LIBSPE2)
+#else
+
+/// Host replacement for the large get: copies `size` bytes from address `ea` into `ls`.
+/// tag, tid and rid are not used. Always returns 0.
+int	cellDmaLargeGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
+{
+	char* dst = (char*)ls;
+	char* src = (char*)ea;
+
+#ifdef USE_MEMCPY
+	memcpy(dst,src,size);
+#else
+	// byte-wise fallback when memcpy is disabled
+	uint32_t byteIndex = 0;
+	while (byteIndex < size)
+	{
+		dst[byteIndex] = src[byteIndex];
+		++byteIndex;
+	}
+#endif
+	return 0;
+}
+
+/// Host replacement for the get: copies `size` bytes from address `ea` into `ls`.
+/// tag, tid and rid are not used. Always returns 0.
+int	cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
+{
+	char* dst = (char*)ls;
+	char* src = (char*)ea;
+#ifdef USE_MEMCPY
+	memcpy(dst,src,size);
+#else
+	// byte-wise fallback when memcpy is disabled
+	uint32_t byteIndex = 0;
+	while (byteIndex < size)
+	{
+		dst[byteIndex] = src[byteIndex];
+		++byteIndex;
+	}
+#endif //#ifdef USE_MEMCPY
+	return 0;
+}
+
+/// Host replacement for the large put: copies `size` bytes from `ls` to address `ea`.
+/// tag, tid and rid are not used. Always returns 0.
+int cellDmaLargePut(const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
+{
+	const char* src = (const char*)ls;
+	char* dst = (char*)ea;
+#ifdef USE_MEMCPY
+	memcpy(dst,src,size);
+#else
+	// byte-wise fallback when memcpy is disabled
+	uint32_t byteIndex = 0;
+	while (byteIndex < size)
+	{
+		dst[byteIndex] = src[byteIndex];
+		++byteIndex;
+	}
+#endif //#ifdef USE_MEMCPY
+
+	return 0;
+}
+
+
+
+/// Host replacement for the tag wait: does nothing, because the host copies above complete before returning.
+void	cellDmaWaitTagStatusAll(int ignore)
+{
+	// the mask is intentionally unused
+	(void)ignore;
+}
+
+#endif
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuFakeDma.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuFakeDma.h
@ -0,0 +1,135 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef FAKE_DMA_H
+#define FAKE_DMA_H
+
+
+#include "PlatformDefinitions.h"
+#include "LinearMath/btScalar.h"
+
+
+#ifdef __SPU__
+
+#ifndef USE_LIBSPE2
+
+#include <cell/dma.h>
+#include <stdint.h>
+
+// SPU build without libspe2: tag is the transfer index plus one, mask is the matching single bit.
+#define DMA_TAG(xfer) (xfer + 1)
+#define DMA_MASK(xfer) (1 << DMA_TAG(xfer))
+
+#else // USE_LIBSPE2 is defined
+
+// libspe2 build: same tag/mask scheme as above.
+#define DMA_TAG(xfer) (xfer + 1)
+#define DMA_MASK(xfer) (1 << DMA_TAG(xfer))
+		
+#include <spu_mfcio.h>		
+		
+// DEBUG_DMA selects the checking variants of the cellDma* macros defined below.
+#define DEBUG_DMA		
+#ifdef DEBUG_DMA
+// dUASSERT prints message b when condition a is false; it does not abort.
+#define dUASSERT(a,b) if (!(a)) { printf(b);}
+// integer type used by the checks below to do arithmetic on addresses and sizes
+#define uintsize ppu_address_t
+		
+// Debug variant of cellDmaLargeGet: when the alignment, size or null checks in the condition trip,
+// it prints a message per violated rule plus a GET trace line; the mfc_get is issued in every case.
+// NOTE(review): expands to an if-block followed by a separate call, so it is only safe as a full statement inside braces.
+#define cellDmaLargeGet(ls, ea, size, tag, tid, rid) if (  (((uintsize)ls%16) != ((uintsize)ea%16)) || ((((uintsize)ea%16) || ((uintsize)ls%16)) && (( ((uintsize)ls%16) != ((uintsize)size%16) ) || ( ((uintsize)ea%16) != ((uintsize)size%16) ) ) ) || ( ((uintsize)size%16) && ((uintsize)size!=1) && ((uintsize)size!=2) && ((uintsize)size!=4) && ((uintsize)size!=8) ) || (size >= 16384) || !(uintsize)ls || !(uintsize)ea) { \
+															dUASSERT( (((uintsize)ea % 16) == 0) || (size < 16), "XDR Address not aligned: "); \
+															dUASSERT( (((uintsize)ls % 16) == 0) || (size < 16), "LS Address not aligned: "); \
+															dUASSERT( ((((uintsize)ls % size) == 0) && (((uintsize)ea % size) == 0))  || (size > 16), "Not naturally aligned: "); \
+															dUASSERT((size == 1) || (size == 2) || (size == 4) || (size == 8) || ((size % 16) == 0), "size not a multiple of 16byte: "); \
+															dUASSERT(size < 16384, "size too big: "); \
+															dUASSERT( ((uintsize)ea%16)==((uintsize)ls%16), "wrong Quadword alignment of LS and EA: "); \
+	    													dUASSERT(ea != 0, "Nullpointer EA: "); dUASSERT(ls != 0, "Nullpointer LS: ");\
+															printf("GET %s:%d from: 0x%x, to: 0x%x - %d bytes\n", __FILE__, __LINE__, (unsigned int)ea,(unsigned int)ls,(unsigned int)size);\
+															} \
+															mfc_get(ls, ea, size, tag, tid, rid)
+// Debug variant of cellDmaGet: same checks and diagnostics as the debug cellDmaLargeGet; the mfc_get is issued in every case.
+// NOTE(review): expands to an if-block followed by a separate call, so it is only safe as a full statement inside braces.
+#define cellDmaGet(ls, ea, size, tag, tid, rid) if (  (((uintsize)ls%16) != ((uintsize)ea%16)) || ((((uintsize)ea%16) || ((uintsize)ls%16)) && (( ((uintsize)ls%16) != ((uintsize)size%16) ) || ( ((uintsize)ea%16) != ((uintsize)size%16) ) ) ) || ( ((uintsize)size%16) && ((uintsize)size!=1) && ((uintsize)size!=2) && ((uintsize)size!=4) && ((uintsize)size!=8) ) || (size >= 16384) || !(uintsize)ls || !(uintsize)ea) { \
+														dUASSERT( (((uintsize)ea % 16) == 0) || (size < 16), "XDR Address not aligned: "); \
+														dUASSERT( (((uintsize)ls % 16) == 0) || (size < 16), "LS Address not aligned: "); \
+														dUASSERT( ((((uintsize)ls % size) == 0) && (((uintsize)ea % size) == 0))  || (size > 16), "Not naturally aligned: "); \
+														dUASSERT((size == 1) || (size == 2) || (size == 4) || (size == 8) || ((size % 16) == 0), "size not a multiple of 16byte: "); \
+    													dUASSERT(size < 16384, "size too big: "); \
+														dUASSERT( ((uintsize)ea%16)==((uintsize)ls%16), "wrong Quadword alignment of LS and EA: "); \
+    													dUASSERT(ea != 0, "Nullpointer EA: "); dUASSERT(ls != 0, "Nullpointer LS: ");\
+    													printf("GET %s:%d from: 0x%x, to: 0x%x - %d bytes\n", __FILE__, __LINE__, (unsigned int)ea,(unsigned int)ls,(unsigned int)size);\
+														} \
+														mfc_get(ls, ea, size, tag, tid, rid)
+// Debug variant of cellDmaLargePut: same checks as the debug gets, with a PUT trace line; the mfc_put is issued in every case.
+// NOTE(review): expands to an if-block followed by a separate call, so it is only safe as a full statement inside braces.
+#define cellDmaLargePut(ls, ea, size, tag, tid, rid) if (  (((uintsize)ls%16) != ((uintsize)ea%16)) || ((((uintsize)ea%16) || ((uintsize)ls%16)) && (( ((uintsize)ls%16) != ((uintsize)size%16) ) || ( ((uintsize)ea%16) != ((uintsize)size%16) ) ) ) || ( ((uintsize)size%16) && ((uintsize)size!=1) && ((uintsize)size!=2) && ((uintsize)size!=4) && ((uintsize)size!=8) ) || (size >= 16384) || !(uintsize)ls || !(uintsize)ea) { \
+															dUASSERT( (((uintsize)ea % 16) == 0) || (size < 16), "XDR Address not aligned: "); \
+															dUASSERT( (((uintsize)ls % 16) == 0) || (size < 16), "LS Address not aligned: "); \
+															dUASSERT( ((((uintsize)ls % size) == 0) && (((uintsize)ea % size) == 0))  || (size > 16), "Not naturally aligned: "); \
+															dUASSERT((size == 1) || (size == 2) || (size == 4) || (size == 8) || ((size % 16) == 0), "size not a multiple of 16byte: "); \
+        													dUASSERT(size < 16384, "size too big: "); \
+															dUASSERT( ((uintsize)ea%16)==((uintsize)ls%16), "wrong Quadword alignment of LS and EA: "); \
+        													dUASSERT(ea != 0, "Nullpointer EA: "); dUASSERT(ls != 0, "Nullpointer LS: ");\
+    														printf("PUT %s:%d from: 0x%x, to: 0x%x - %d bytes\n", __FILE__, __LINE__, (unsigned int)ls,(unsigned int)ea,(unsigned int)size); \
+															} \
+															mfc_put(ls, ea, size, tag, tid, rid)
+// Debug variant of cellDmaSmallGet: same checks and diagnostics as the debug cellDmaLargeGet; the mfc_get is issued in every case.
+// NOTE(review): expands to an if-block followed by a separate call, so it is only safe as a full statement inside braces.
+#define cellDmaSmallGet(ls, ea, size, tag, tid, rid) if (  (((uintsize)ls%16) != ((uintsize)ea%16)) || ((((uintsize)ea%16) || ((uintsize)ls%16)) && (( ((uintsize)ls%16) != ((uintsize)size%16) ) || ( ((uintsize)ea%16) != ((uintsize)size%16) ) ) ) || ( ((uintsize)size%16) && ((uintsize)size!=1) && ((uintsize)size!=2) && ((uintsize)size!=4) && ((uintsize)size!=8) ) || (size >= 16384) || !(uintsize)ls || !(uintsize)ea) { \
+																dUASSERT( (((uintsize)ea % 16) == 0) || (size < 16), "XDR Address not aligned: "); \
+																dUASSERT( (((uintsize)ls % 16) == 0) || (size < 16), "LS Address not aligned: "); \
+																dUASSERT( ((((uintsize)ls % size) == 0) && (((uintsize)ea % size) == 0))  || (size > 16), "Not naturally aligned: "); \
+    															dUASSERT((size == 1) || (size == 2) || (size == 4) || (size == 8) || ((size % 16) == 0), "size not a multiple of 16byte: "); \
+    															dUASSERT(size < 16384, "size too big: "); \
+    															dUASSERT( ((uintsize)ea%16)==((uintsize)ls%16), "wrong Quadword alignment of LS and EA: "); \
+    	    													dUASSERT(ea != 0, "Nullpointer EA: "); dUASSERT(ls != 0, "Nullpointer LS: ");\
+    															printf("GET %s:%d from: 0x%x, to: 0x%x - %d bytes\n", __FILE__, __LINE__, (unsigned int)ea,(unsigned int)ls,(unsigned int)size);\
+																} \
+																mfc_get(ls, ea, size, tag, tid, rid)
+// Writes the tag mask, then reads the tag status. NOTE(review): expands to two statements — use only as a full statement inside braces.
+#define cellDmaWaitTagStatusAll(ignore) mfc_write_tag_mask(ignore) ; mfc_read_tag_status_all()
+
+#else
+// Non-debug variants: each transfer macro maps straight onto mfc_get / mfc_put without any checks.
+#define cellDmaLargeGet(ls, ea, size, tag, tid, rid) mfc_get(ls, ea, size, tag, tid, rid)
+#define cellDmaGet(ls, ea, size, tag, tid, rid) mfc_get(ls, ea, size, tag, tid, rid)
+#define cellDmaLargePut(ls, ea, size, tag, tid, rid) mfc_put(ls, ea, size, tag, tid, rid)
+#define cellDmaSmallGet(ls, ea, size, tag, tid, rid) mfc_get(ls, ea, size, tag, tid, rid)
+// Writes the tag mask, then reads the tag status. NOTE(review): expands to two statements — use only as a full statement inside braces.
+#define cellDmaWaitTagStatusAll(ignore) mfc_write_tag_mask(ignore) ; mfc_read_tag_status_all()
+#endif // DEBUG_DMA
+
+		
+		
+		
+		
+		
+		
+		
+#endif // USE_LIBSPE2
+#else // !__SPU__
+//Simulate DMA using memcpy or direct access on non-CELL platforms that don't have DMAs and SPUs (Win32, Mac, Linux etc)
+//Potential to add networked simulation using this interface
+
+// On non-SPU builds the tag and mask macros pass their argument through unchanged.
+#define DMA_TAG(a) (a)
+#define DMA_MASK(a) (a)
+
+		/// cellDmaLargeGet Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
+		int	cellDmaLargeGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
+		/// cellDmaGet replacement with the same parameters as cellDmaLargeGet
+		int	cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
+		/// cellDmaLargePut Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
+		int cellDmaLargePut(const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
+		/// cellDmaWaitTagStatusAll Win32 replacements for Cell DMA to allow simulating most of the SPU code (just memcpy)
+		void	cellDmaWaitTagStatusAll(int ignore);
+
+
+#endif //__SPU__
+
+///stallingUnalignedDmaSmallGet internally uses DMA_TAG(1)
+///It waits for the transfer before returning and copies `size` bytes from `ea` into `ls`.
+int	stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size);
+
+
+///Read-only gets: the returned pointer is where the data can be read. On SPU / libspe2 builds that is `ls`;
+///on other builds it is the address `ea` itself and nothing is copied.
+void*	cellDmaLargeGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
+void*	cellDmaGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
+void*	cellDmaSmallGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid);
+
+
+#endif //FAKE_DMA_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuGatheringCollisionDispatcher.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuGatheringCollisionDispatcher.cpp
@ -0,0 +1,238 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuGatheringCollisionDispatcher.h"
+#include "SpuCollisionTaskProcess.h"
+
+
+#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
+#include "BulletCollision/CollisionDispatch/btEmptyCollisionAlgorithm.h"
+#include "SpuContactManifoldCollisionAlgorithm.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "BulletCollision/CollisionShapes/btCollisionShape.h"
+#include "LinearMath/btQuickprof.h"
+
+
+
+
+///Stores the thread interface and the task limit; forwards the collision configuration to btCollisionDispatcher.
+///@param threadInterface          interface used to run the SPU tasks
+///@param maxNumOutstandingTasks   initial task limit (replaced by threadInterface->getNumTasks() on every SPU dispatch)
+///@param collisionConfiguration   passed through to the base class
+SpuGatheringCollisionDispatcher::SpuGatheringCollisionDispatcher(
+	class	btThreadSupportInterface*	threadInterface,
+	unsigned int	maxNumOutstandingTasks,
+	btCollisionConfiguration* collisionConfiguration)
+	: btCollisionDispatcher(collisionConfiguration)
+	, m_spuCollisionTaskProcess(0)
+	, m_threadInterface(threadInterface)
+	, m_maxNumOutstandingTasks(maxNumOutstandingTasks)
+{
+	//the SpuCollisionTaskProcess itself is created on first use in dispatchAllCollisionPairs
+}
+
+
+///Returns true when a single shape type is in the list of types the SPU path accepts.
+static bool	spuAcceptsShapeType(int proxyType)
+{
+	switch (proxyType)
+	{
+	case BOX_SHAPE_PROXYTYPE:
+	case TRIANGLE_SHAPE_PROXYTYPE:
+	case SPHERE_SHAPE_PROXYTYPE:
+	case CAPSULE_SHAPE_PROXYTYPE:
+	case CYLINDER_SHAPE_PROXYTYPE:
+	//CONE_SHAPE_PROXYTYPE is deliberately absent: it was commented out of the original list
+	case TRIANGLE_MESH_SHAPE_PROXYTYPE:
+	case CONVEX_HULL_SHAPE_PROXYTYPE:
+	case COMPOUND_SHAPE_PROXYTYPE:
+		return true;
+	default:
+		return false;
+	}
+}
+
+///A pair can be dispatched to the SPU only when both of its shape types are accepted.
+///@param proxyType0 shape type of the first object
+///@param proxyType1 shape type of the second object
+bool	SpuGatheringCollisionDispatcher::supportsDispatchPairOnSpu(int proxyType0,int proxyType1)
+{
+	return spuAcceptsShapeType(proxyType0) && spuAcceptsShapeType(proxyType1);
+}
+
+
+
+///Releases the lazily created task process (if one was ever created).
+SpuGatheringCollisionDispatcher::~SpuGatheringCollisionDispatcher()
+{
+	//deleting a null pointer is a no-op, so no explicit test is needed
+	delete m_spuCollisionTaskProcess;
+}
+
+#include "stdio.h"
+
+
+
+///Overlap callback that SpuGatheringCollisionDispatcher::dispatchAllCollisionPairs hands to the pair cache.
+///For every pair that has no collision algorithm yet it creates one and records, in m_internalTmpValue,
+///who has to process the pair:
+///  1 : set when the value was still 0 on entry (kept when no algorithm is created here)
+///  2 : a SpuContactManifoldCollisionAlgorithm was created (pair type supported on SPU)
+///  3 : the algorithm came from findAlgorithm (processed by the PPU fallback loop of the dispatcher)
+class btSpuCollisionPairCallback : public btOverlapCallback
+{
+	const btDispatcherInfo& m_dispatchInfo;
+	SpuGatheringCollisionDispatcher*	m_dispatcher;
+
+public:
+
+	btSpuCollisionPairCallback(const btDispatcherInfo& dispatchInfo, SpuGatheringCollisionDispatcher*	dispatcher)
+		: m_dispatchInfo(dispatchInfo)
+		, m_dispatcher(dispatcher)
+	{
+	}
+
+	///Always returns false.
+	virtual bool	processOverlap(btBroadphasePair& collisionPair)
+	{
+		//PPU version
+		//(*m_dispatcher->getNearCallback())(collisionPair,*m_dispatcher,m_dispatchInfo);
+
+		//only support discrete collision detection for now, we could fallback on PPU/unoptimized version for TOI/CCD
+		btAssert(m_dispatchInfo.m_dispatchFunc == btDispatcherInfo::DISPATCH_DISCRETE);
+
+		///userInfo is used to determine if the SPU has to handle this case or not (skip PPU tasks)
+		if (collisionPair.m_internalTmpValue == 0)
+			collisionPair.m_internalTmpValue = 1;
+
+		//pairs that already own an algorithm are left untouched
+		if (collisionPair.m_algorithm)
+			return false;
+
+		btCollisionObject* objA = (btCollisionObject*)collisionPair.m_pProxy0->m_clientObject;
+		btCollisionObject* objB = (btCollisionObject*)collisionPair.m_pProxy1->m_clientObject;
+
+		btCollisionAlgorithmConstructionInfo constructionInfo;
+		constructionInfo.m_dispatcher1 = m_dispatcher;
+		constructionInfo.m_manifold = 0;
+
+		if (!m_dispatcher->needsCollision(objA,objB))
+			return false;
+
+		const int	shapeTypeA = objA->getCollisionShape()->getShapeType();
+		const int	shapeTypeB = objB->getCollisionShape()->getShapeType();
+
+		if (!m_dispatcher->supportsDispatchPairOnSpu(shapeTypeA,shapeTypeB))
+		{
+			//not supported on SPU: use the regular algorithm and mark the pair for the PPU fallback
+			collisionPair.m_algorithm = m_dispatcher->findAlgorithm(objA,objB);
+			collisionPair.m_internalTmpValue = 3;
+			return false;
+		}
+
+		//supported on SPU: placement-construct the SPU algorithm in freshly allocated memory
+		int algorithmSize = sizeof(SpuContactManifoldCollisionAlgorithm);
+#ifdef ALLOCATE_SEPARATELY
+		void* storage = btAlignedAlloc(algorithmSize,16);//m_dispatcher->allocateCollisionAlgorithm(so);
+#else
+		void* storage = m_dispatcher->allocateCollisionAlgorithm(algorithmSize);
+#endif
+		collisionPair.m_algorithm = new(storage) SpuContactManifoldCollisionAlgorithm(constructionInfo,objA,objB);
+		collisionPair.m_internalTmpValue =  2;
+		return false;
+	}
+};
+
+///Processes every overlapping pair of pairCache.
+///With dispatchInfo.m_enableSPU set, the work is done in four steps:
+///  1. (re)configure the task process and let btSpuCollisionPairCallback assign an algorithm/tag to each pair,
+///  2. hand the whole pair array to the task process in batches of SPU_BATCHSIZE_BROADPHASE_PAIRS,
+///  3. process, on the calling thread, the pairs tagged 3 (not supported on SPU),
+///  4. flush the task process so that all SPU work has finished before returning.
+///Without m_enableSPU the call is forwarded unchanged to btCollisionDispatcher.
+///@param pairCache     source of the overlapping pairs
+///@param dispatchInfo  dispatch settings (m_enableSPU, m_useEpa, m_dispatchFunc, m_timeOfImpact are used here)
+///@param dispatcher    dispatcher passed on to the pair cache and used for needsCollision in the fallback loop
+void	SpuGatheringCollisionDispatcher::dispatchAllCollisionPairs(btOverlappingPairCache* pairCache,const btDispatcherInfo& dispatchInfo, btDispatcher* dispatcher) 
+{
+
+	if (dispatchInfo.m_enableSPU)
+	{
+		//the thread interface dictates the task limit; this replaces the value given to the constructor
+		m_maxNumOutstandingTasks = m_threadInterface->getNumTasks();
+
+		{
+			BT_PROFILE("processAllOverlappingPairs");
+
+			//lazily create the task process the first time the SPU path is taken
+			if (!m_spuCollisionTaskProcess)
+				m_spuCollisionTaskProcess = new SpuCollisionTaskProcess(m_threadInterface,m_maxNumOutstandingTasks);
+		
+			m_spuCollisionTaskProcess->setNumTasks(m_maxNumOutstandingTasks);
+	//		printf("m_maxNumOutstandingTasks =%d\n",m_maxNumOutstandingTasks);
+
+			m_spuCollisionTaskProcess->initialize2(dispatchInfo.m_useEpa);
+			
+		
+			///modified version of btCollisionDispatcher::dispatchAllCollisionPairs:
+			{
+				//assigns m_algorithm and m_internalTmpValue for each pair (see btSpuCollisionPairCallback)
+				btSpuCollisionPairCallback	collisionCallback(dispatchInfo,this);
+
+				pairCache->processAllOverlappingPairs(&collisionCallback,dispatcher);
+			}
+		}
+
+		//send one big batch
+		int numTotalPairs = pairCache->getNumOverlappingPairs();
+		btBroadphasePair* pairPtr = pairCache->getOverlappingPairArrayPtr();
+		int i;
+		{
+			BT_PROFILE("addWorkToTask");
+			//i is advanced inside the body, one batch [i,endIndex) at a time
+			for (i=0;i<numTotalPairs;)
+			{
+				//Performance Hint: tweak this number during benchmarking
+				static const int pairRange = SPU_BATCHSIZE_BROADPHASE_PAIRS;
+				//clamp the last batch to the end of the array
+				int endIndex = (i+pairRange) < numTotalPairs ? i+pairRange : numTotalPairs;
+				m_spuCollisionTaskProcess->addWorkToTask(pairPtr,i,endIndex);
+				i = endIndex;
+			}
+		}
+
+		{
+			BT_PROFILE("PPU fallback");
+			//handle PPU fallback pairs
+			//this loop runs after the batches were submitted and before flush2() is called
+			for (i=0;i<numTotalPairs;i++)
+			{
+				btBroadphasePair& collisionPair = pairPtr[i];
+				//3 == algorithm came from findAlgorithm because the pair type is not supported on SPU
+				if (collisionPair.m_internalTmpValue == 3)
+				{
+					if (collisionPair.m_algorithm)
+					{
+						btCollisionObject* colObj0 = (btCollisionObject*)collisionPair.m_pProxy0->m_clientObject;
+						btCollisionObject* colObj1 = (btCollisionObject*)collisionPair.m_pProxy1->m_clientObject;
+
+						if (dispatcher->needsCollision(colObj0,colObj1))
+						{
+							btManifoldResult contactPointResult(colObj0,colObj1);
+							
+							if (dispatchInfo.m_dispatchFunc == 		btDispatcherInfo::DISPATCH_DISCRETE)
+							{
+								//discrete collision detection query
+								collisionPair.m_algorithm->processCollision(colObj0,colObj1,dispatchInfo,&contactPointResult);
+							} else
+							{
+								//continuous collision detection query, time of impact (toi)
+								btScalar toi = collisionPair.m_algorithm->calculateTimeOfImpact(colObj0,colObj1,dispatchInfo,&contactPointResult);
+								//keep the smallest time of impact seen so far
+								//NOTE(review): this writes through a const reference; presumably m_timeOfImpact is declared mutable — confirm
+								if (dispatchInfo.m_timeOfImpact > toi)
+									dispatchInfo.m_timeOfImpact = toi;
+
+							}
+						}
+					}
+				}
+			}
+		}
+		{
+			BT_PROFILE("flush2");
+			//make sure all SPU work is done
+			m_spuCollisionTaskProcess->flush2();
+		}
+
+	} else
+	{
+		///PPU fallback
+		///!Need to make sure to clear all 'algorithms' when switching between SPU and PPU
+		btCollisionDispatcher::dispatchAllCollisionPairs(pairCache,dispatchInfo,dispatcher);
+	}
+}
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuGatheringCollisionDispatcher.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuGatheringCollisionDispatcher.h
@ -0,0 +1,69 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+#ifndef SPU_GATHERING_COLLISION__DISPATCHER_H
+#define SPU_GATHERING_COLLISION__DISPATCHER_H
+
+#include "BulletCollision/CollisionDispatch/btCollisionDispatcher.h"
+
+
+///Tuning value to optimize SPU utilization.
+///Too small a value means the task overhead is large compared to the computation (too fine a granularity).
+///Too big a value might leave some SPUs idle while a few other SPUs do all the work.
+//#define SPU_BATCHSIZE_BROADPHASE_PAIRS 8
+//#define SPU_BATCHSIZE_BROADPHASE_PAIRS 16
+#define SPU_BATCHSIZE_BROADPHASE_PAIRS 64
+//#define SPU_BATCHSIZE_BROADPHASE_PAIRS 128
+//#define SPU_BATCHSIZE_BROADPHASE_PAIRS 256
+//#define SPU_BATCHSIZE_BROADPHASE_PAIRS 1024
+
+
+
+class SpuCollisionTaskProcess;
+
+///SpuGatheringCollisionDispatcher can use SPU to gather and calculate collision detection
+///Time of Impact, Closest Points and Penetration Depth.
+///It extends btCollisionDispatcher and overrides dispatchAllCollisionPairs; the SPU work itself is
+///delegated to a SpuCollisionTaskProcess that runs on the supplied btThreadSupportInterface.
+class SpuGatheringCollisionDispatcher : public btCollisionDispatcher
+{
+	
+	//created on first use by dispatchAllCollisionPairs and deleted in the destructor; 0 until then
+	SpuCollisionTaskProcess*	m_spuCollisionTaskProcess;
+	
+protected:
+
+	//thread interface given to the constructor; not deleted by this class
+	class	btThreadSupportInterface*	m_threadInterface;
+
+	//task limit; refreshed from m_threadInterface->getNumTasks() on every SPU dispatch
+	unsigned int	m_maxNumOutstandingTasks;
+	
+
+public:
+
+	//can be used by SPU collision algorithms	
+	//returns 0 as long as no SPU dispatch has taken place
+	SpuCollisionTaskProcess*	getSpuCollisionTaskProcess()
+	{
+			return m_spuCollisionTaskProcess;
+	}
+	
+	//threadInterface and maxNumOutstandingTasks are stored; collisionConfiguration goes to btCollisionDispatcher
+	SpuGatheringCollisionDispatcher (class	btThreadSupportInterface*	threadInterface, unsigned int	maxNumOutstandingTasks,btCollisionConfiguration* collisionConfiguration);
+	
+	virtual ~SpuGatheringCollisionDispatcher();
+
+	//true when both shape types are in the list of types accepted for SPU processing
+	bool	supportsDispatchPairOnSpu(int proxyType0,int proxyType1);
+
+	//SPU path when dispatchInfo.m_enableSPU is set, otherwise the btCollisionDispatcher implementation
+	virtual void	dispatchAllCollisionPairs(btOverlappingPairCache* pairCache,const btDispatcherInfo& dispatchInfo,btDispatcher* dispatcher) ;
+
+};
+
+
+
+#endif //SPU_GATHERING_COLLISION__DISPATCHER_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuLibspe2Support.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuLibspe2Support.cpp
@ -0,0 +1,257 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifdef USE_LIBSPE2
+
+#include "SpuLibspe2Support.h"
+
+
+
+
+//SpuLibspe2Support helps to initialize/shutdown libspe2, start/stop SPU tasks and communication
+///Setup and initialize SPU/CELL/Libspe2
+///@param speprog     SPE program handle that internal_startSPU() loads into every SPE context
+///@param numThreads  requested number of SPE threads; clamped to the range [0, min(physical SPEs, MAX_SPUS)]
+SpuLibspe2Support::SpuLibspe2Support(spe_program_handle_t *speprog, int numThreads)
+{
+	this->program = speprog;
+
+	//upper bound 1: the number of physical SPEs reported by libspe2 (queried once)
+	int maxThreads = spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES, -1);
+	//a negative result signals that the query failed; run with no SPE threads in that case
+	if (maxThreads < 0)
+		maxThreads = 0;
+	//upper bound 2: data[] has exactly MAX_SPUS slots and is indexed by thread number,
+	//so more threads than that would read and write past the end of the array
+	if (maxThreads > MAX_SPUS)
+		maxThreads = MAX_SPUS;
+
+	if (numThreads < 0)
+		numThreads = 0;
+	this->numThreads = (numThreads <= maxThreads) ? numThreads : maxThreads;
+
+	//internal_startSPU() and stopSPU() compare data[i].context with NULL to find out whether a
+	//slot is in use, so every slot has to start out as NULL
+	for (int i = 0; i < MAX_SPUS; i++)
+	{
+		data[i].context = NULL;
+	}
+}
+
+///cleanup/shutdown Libspe2
+///All shutdown work is delegated to stopSPU().
+SpuLibspe2Support::~SpuLibspe2Support()
+{
+	this->stopSPU();
+}
+
+
+
+///send messages to SPUs
+///Marks the SPU slot named in the task description as occupied, stores the task description in that
+///slot, and posts Spu_Mailbox_Event_Task to the mailbox of the slot's SPE context.
+///@param uiCommand    CMD_SAMPLE_TASK_COMMAND or CMD_GATHER_AND_PROCESS_PAIRLIST; any other value is rejected
+///@param uiArgument0  address of the task description (SpuSampleTaskDesc or SpuGatherAndProcessPairsTaskDesc, depending on uiCommand)
+///@param uiArgument1  not used by this implementation
+void SpuLibspe2Support::sendRequest(uint32_t uiCommand, uint32_t uiArgument0, uint32_t uiArgument1)
+{
+	//assigned by each supported command below; initialized so it is never read indeterminate
+	spe_context_ptr_t context = NULL;
+	
+	switch (uiCommand)
+	{
+	case CMD_SAMPLE_TASK_COMMAND:
+	{
+		//get taskdescription
+		SpuSampleTaskDesc* taskDesc = (SpuSampleTaskDesc*) uiArgument0;
+
+		btAssert(taskDesc->m_taskId<m_activeSpuStatus.size());
+
+		//get status of SPU on which task should run
+		btSpuStatus&	spuStatus = m_activeSpuStatus[taskDesc->m_taskId];
+
+		//set data for spuStatus
+		spuStatus.m_commandId = uiCommand;
+		spuStatus.m_status = Spu_Status_Occupied; //set SPU as "occupied"
+		spuStatus.m_taskDesc.p = taskDesc; 
+		
+		//get context
+		context = data[taskDesc->m_taskId].context;
+		
+		//tell the task where the memory block of this slot is
+		taskDesc->m_mainMemoryPtr = reinterpret_cast<uint64_t> (spuStatus.m_lsMemory.p);
+
+		break;
+	}
+	case CMD_GATHER_AND_PROCESS_PAIRLIST:
+		{
+			//get taskdescription
+			SpuGatherAndProcessPairsTaskDesc* taskDesc = (SpuGatherAndProcessPairsTaskDesc*) uiArgument0;
+
+			btAssert(taskDesc->taskId<m_activeSpuStatus.size());
+
+			//get status of SPU on which task should run
+			btSpuStatus&	spuStatus = m_activeSpuStatus[taskDesc->taskId];
+
+			//set data for spuStatus
+			spuStatus.m_commandId = uiCommand;
+			spuStatus.m_status = Spu_Status_Occupied; //set SPU as "occupied"
+			spuStatus.m_taskDesc.p = taskDesc; 
+			
+			//get context
+			context = data[taskDesc->taskId].context;
+			
+			//tell the task where the memory block of this slot is
+			taskDesc->m_lsMemory = (CollisionTask_LocalStoreMemory*)spuStatus.m_lsMemory.p;
+			
+			break;
+		}
+	default:
+		{
+			///not implemented
+			btAssert(0);
+			//in builds where btAssert does nothing, execution must not continue to the mailbox
+			//write below: no context was selected for an unknown command
+			return;
+		}
+
+	}
+
+	
+	//write taskdescription in mailbox
+	unsigned int event = Spu_Mailbox_Event_Task;
+	spe_in_mbox_write(context, &event, 1, SPE_MBOX_ANY_NONBLOCKING);
+
+}
+
+///check for messages from SPUs
+///Scans m_activeSpuStatus repeatedly, yielding the CPU between scans, until an entry whose status is
+///Spu_Status_Free is found, and reports that entry.
+///A possible response can be 'yes, SPU handled it', or 'no, please do a PPU fallback'
+///@param puiArgument0  receives m_taskId of the entry that was found
+///@param puiArgument1  receives m_status of the entry that was found
+void SpuLibspe2Support::waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1)
+{
+	btAssert(m_activeSpuStatus.size());
+
+	//index of the first free entry, -1 while none has been seen
+	int freeSlot = -1;
+
+	for (;;)
+	{
+		const int numSlots = m_activeSpuStatus.size();
+		for (int slot = 0; slot < numSlots; ++slot)
+		{
+			if (m_activeSpuStatus[slot].m_status == Spu_Status_Free)
+			{
+				freeSlot = slot;
+				break;
+			}
+		}
+
+		if (freeSlot >= 0)
+			break;
+
+		//nothing free yet: give other threads a chance to run before scanning again
+		sched_yield();
+	}
+
+	///need to find an active spu
+	btAssert(freeSlot>=0);
+
+	btSpuStatus& found = m_activeSpuStatus[freeSlot];
+
+	*puiArgument0 = found.m_taskId;
+	*puiArgument1 = found.m_status;
+}
+
+
+///Public entry point for starting the SPUs; the work is done by internal_startSPU().
+void SpuLibspe2Support::startSPU()
+{
+	internal_startSPU();
+}
+
+
+
+///start the spus group (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
+///For every slot whose context is still NULL: creates an SPE context, loads the program into it, resets
+///the slot's status record and starts a pthread that runs the context (ppu_pthread_function).
+///Afterwards waits, for every slot that has a context, until its status is no longer Spu_Status_Startup.
+///Any failure to create a context, load the program or create a thread ends the process with exit(1).
+void SpuLibspe2Support::internal_startSPU()
+{
+	//one status record per thread
+	m_activeSpuStatus.resize(numThreads);
+	
+	
+	for (int i=0; i < numThreads; i++)
+	{
+		
+		//slots that already have a context are left alone
+		if(data[i].context == NULL) 
+		{
+					
+			 /* Create context */
+			if ((data[i].context = spe_context_create(0, NULL)) == NULL)
+			{
+			      perror ("Failed creating context");
+		          exit(1);
+			}
+	
+			/* Load program into context */
+			if(spe_program_load(data[i].context, this->program))
+			{
+			      perror ("Failed loading program");
+		          exit(1);
+			}
+			
+			//the status record is filled in before the thread is created; its address is what the
+			//SPE program receives as argp
+			m_activeSpuStatus[i].m_status = Spu_Status_Startup; 
+			m_activeSpuStatus[i].m_taskId = i; 
+			m_activeSpuStatus[i].m_commandId = 0; 
+			m_activeSpuStatus[i].m_lsMemory.p = NULL; 
+			
+			
+			//arguments that ppu_pthread_function passes to spe_context_run
+			data[i].entry = SPE_DEFAULT_ENTRY;
+			data[i].flags = 0;
+			data[i].argp.p = &m_activeSpuStatus[i];
+			data[i].envp.p = NULL;
+			
+		    /* Create thread for each SPE context */
+			if (pthread_create(&data[i].pthread, NULL, &ppu_pthread_function, &(data[i]) ))
+			{
+			      perror ("Failed creating thread");
+		          exit(1);
+			}
+			/*
+			else
+			{
+				printf("started thread %d\n",i);
+			}*/
+		}		
+	}
+	
+	
+	for (int i=0; i < numThreads; i++)
+	{
+		if(data[i].context != NULL) 
+		{
+			//NOTE(review): nothing in this file changes m_status away from Spu_Status_Startup;
+			//presumably the SPE program does so through the argp pointer — confirm
+			while( m_activeSpuStatus[i].m_status == Spu_Status_Startup)
+			{
+				// wait for spu to set up
+				sched_yield();
+			}
+			printf("Spu %d is ready\n", i);
+		}
+	}
+}
+
+///tell the task scheduler we are done with the SPU tasks
+///Posts Spu_Mailbox_Event_Shutdown to every started SPE, joins its thread, closes the SPE program
+///image, destroys the contexts and clears the status array.
+///Safe to call more than once (for instance explicitly and then again from the destructor):
+///slots without a context are skipped, and released handles are reset to NULL.
+void SpuLibspe2Support::stopSPU()
+{
+	// wait for all threads to finish 
+	int i;
+	for ( i = 0; i < this->numThreads; i++ ) 
+	{ 
+		spe_context_ptr_t context = data[i].context;
+
+		//a slot without a context has no mailbox to write to and no thread to join
+		if (context == NULL)
+			continue;
+
+		unsigned int event = Spu_Mailbox_Event_Shutdown;
+		spe_in_mbox_write(context, &event, 1, SPE_MBOX_ALL_BLOCKING);
+		pthread_join (data[i].pthread, NULL); 
+	} 
+
+	// close SPE program 
+	if (program != NULL)
+	{
+		spe_image_close(program); 
+		//forget the handle so that a second stopSPU() does not close the image again
+		program = NULL;
+	}
+
+	// destroy SPE contexts 
+	for ( i = 0; i < this->numThreads; i++ ) 
+	{ 
+		if(data[i].context != NULL)
+		{
+			spe_context_destroy (data[i].context);
+			//mark the slot as unused; internal_startSPU() and this function rely on NULL for that
+			data[i].context = NULL;
+		}
+	} 
+	
+	m_activeSpuStatus.clear();
+	
+}
+
+
+
+#endif //USE_LIBSPE2
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuLibspe2Support.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuLibspe2Support.h
@ -0,0 +1,180 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef SPU_LIBSPE2_SUPPORT_H
+#define SPU_LIBSPE2_SUPPORT_H
+
+#include <LinearMath/btScalar.h> //for uint32_t etc.
+
+#ifdef USE_LIBSPE2
+
+#include <stdlib.h>
+#include <stdio.h>
+//#include "SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h"
+#include "PlatformDefinitions.h"
+
+
+//extern struct SpuGatherAndProcessPairsTaskDesc;
+
+///Event codes written to an SPE's inbound mailbox by SpuLibspe2Support.
+enum
+{
+	Spu_Mailbox_Event_Nothing = 0,
+	//posted by sendRequest after a task description has been stored for the SPU
+	Spu_Mailbox_Event_Task = 1,
+	//posted by stopSPU before the SPU's thread is joined
+	Spu_Mailbox_Event_Shutdown = 2,
+	
+	//NOTE(review): presumably present only to force a 32-bit representation of the enum — confirm
+	Spu_Mailbox_Event_ForceDword = 0xFFFFFFFF
+	
+};
+
+///Values stored in btSpuStatus::m_status.
+enum
+{
+	//the state waitForResponse looks for
+	Spu_Status_Free = 0,
+	//set by sendRequest when a task is handed to the SPU
+	Spu_Status_Occupied = 1,
+	//set by internal_startSPU before the SPU's thread is created
+	Spu_Status_Startup = 2,
+	
+	//NOTE(review): presumably present only to force a 32-bit representation of the enum — confirm
+	Spu_Status_ForceDword = 0xFFFFFFFF
+	
+};
+
+
+///Per-SPU status record. One record exists per thread (m_activeSpuStatus); its address is passed
+///to the SPE program as argp when the SPU is started.
+struct btSpuStatus
+{
+	//index of the SPU/thread this record belongs to (set to the slot index at start-up)
+	uint32_t	m_taskId;
+	//command of the most recent sendRequest (0 after start-up)
+	uint32_t	m_commandId;
+	//one of the Spu_Status_* values
+	uint32_t	m_status;
+
+	//task description of the most recent sendRequest
+	addr64 m_taskDesc;
+	//memory block handed to the task by sendRequest; NULL after start-up
+	addr64 m_lsMemory;
+	
+}
+//the whole record is aligned to 128 bytes
+__attribute__ ((aligned (128)))
+;
+
+
+
+#ifndef __SPU__
+
+#include "LinearMath/btAlignedObjectArray.h"
+#include "SpuCollisionTaskProcess.h"
+#include "SpuSampleTaskProcess.h"
+#include "btThreadSupportInterface.h"
+#include <libspe2.h>
+#include <pthread.h>
+#include <sched.h>
+
+#define MAX_SPUS 4 
+
+///Everything one PPU thread needs to run one SPE context; one entry per SPU slot.
+typedef struct ppu_pthread_data 
+{
+	//SPE context of this slot; compared with NULL to tell whether the slot is in use
+	spe_context_ptr_t context;
+	//thread that runs the context (created in internal_startSPU, joined in stopSPU)
+	pthread_t pthread;
+	//entry point passed (by address) to spe_context_run; set to SPE_DEFAULT_ENTRY at start-up
+	unsigned int entry;
+	//run flags passed to spe_context_run; set to 0 at start-up
+	unsigned int flags;
+	//argument pointer passed to spe_context_run; points at the slot's btSpuStatus
+	addr64 argp;
+	//environment pointer passed to spe_context_run; NULL at start-up
+	addr64 envp;
+	//filled in by spe_context_run; inspected by ppu_pthread_function
+	spe_stop_info_t stopinfo;
+} ppu_pthread_data_t;
+
+
+///Thread entry point used with pthread_create: runs one SPE context on the calling PPU thread.
+///The whole process is ended with exit(1) when the context cannot be run, when the SPE stops for
+///any reason other than a regular exit, or when it exits with a non-zero code.
+///@param arg  pointer to the ppu_pthread_data_t of the slot to run
+///@return does not return normally; the thread ends through pthread_exit(NULL)
+static void *ppu_pthread_function(void *arg)
+{
+    ppu_pthread_data_t * datap = (ppu_pthread_data_t *)arg;
+
+    //a negative return value means the call itself failed (errno describes why); the stop
+    //information is only inspected when that is not the case
+    int rc = spe_context_run(datap->context, &datap->entry, datap->flags, datap->argp.p, datap->envp.p, &datap->stopinfo);
+    if (rc < 0)
+    {
+        perror("FAILED: spe_context_run");
+        exit(1);
+    }
+
+    //the two checks below are not errno based, so they report through stderr directly
+    if (datap->stopinfo.stop_reason != SPE_EXIT)
+    {
+        fprintf(stderr, "FAILED: SPE abnormally terminated\n");
+        exit(1);
+    }
+
+    if (datap->stopinfo.result.spe_exit_code != 0)
+    {
+        fprintf(stderr, "FAILED: SPE returned a non-zero exit status: %d\n", (int)datap->stopinfo.result.spe_exit_code);
+        exit(1);
+    }
+
+    pthread_exit(NULL);
+    return NULL; //not reached; keeps every path of a non-void function returning a value
+}
+
+
+
+
+
+
+///SpuLibspe2Support helps to initialize/shutdown libspe2, start/stop SPU tasks and communication
+///It implements btThreadSupportInterface with one PPU thread per SPE context.
+///NOTE(review): SpuGatheringCollisionDispatcher calls getNumTasks() on its thread interface; this class
+///does not declare that function — confirm that the base class provides an implementation.
+class SpuLibspe2Support : public btThreadSupportInterface
+{
+
+	//one status record per thread; sized in internal_startSPU, cleared in stopSPU
+	btAlignedObjectArray<btSpuStatus>	m_activeSpuStatus;
+	
+public:
+	//Setup and initialize SPU/CELL/Libspe2
+	SpuLibspe2Support(spe_program_handle_t *speprog,int numThreads);
+	
+	// SPE program handle ptr.
+	spe_program_handle_t *program;
+	
+	// SPE program data
+	// fixed-size array with MAX_SPUS slots, indexed by thread number
+	ppu_pthread_data_t data[MAX_SPUS];
+	
+	//cleanup/shutdown Libspe2
+	~SpuLibspe2Support();
+
+	///send messages to SPUs
+	void sendRequest(uint32_t uiCommand, uint32_t uiArgument0, uint32_t uiArgument1=0);
+
+	//check for messages from SPUs
+	void waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1);
+
+	//start the spus (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
+	virtual void startSPU();
+
+	//tell the task scheduler we are done with the SPU tasks
+	virtual void stopSPU();
+
+	//intentionally empty: numTasks is ignored
+	virtual void setNumTasks(int numTasks)
+	{
+		//changing the number of tasks after initialization is not implemented (yet)
+	}
+
+private:
+	
+	///start the spus (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
+	void internal_startSPU();
+
+
+	
+	
+	//number of SPE threads in use; derived from the constructor argument
+	int numThreads;
+
+};
+
+#endif // NOT __SPU__
+
+#endif //USE_LIBSPE2
+
+#endif //SPU_LIBSPE2_SUPPORT_H
+
+
+
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h
@ -0,0 +1,167 @@
+/*
+   Copyright (C) 2006, 2008 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef __BOX_H__
+#define __BOX_H__
+
+
+#ifndef PE_REF
+#define PE_REF(a) a&
+#endif
+
+#include <math.h>
+
+//#include "BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h"
+#include <vectormath_aos.h>
+
+
+using namespace Vectormath::Aos;
+
+///Kind of box feature stored in a BoxPoint: F = face, E = edge, V = vertex
+///(set by BoxPoint::setFaceFeature, setEdgeFeature and setVertexFeature respectively).
+enum FeatureType { F, E, V };
+
+//----------------------------------------------------------------------------
+// Box
+//----------------------------------------------------------------------------
+///The Box is an internal class used by the boxBoxDistance calculation.
+class Box
+{
+public:
+	//per-axis half size of the box (as the name suggests); left default-constructed by Box()
+	Vector3 half;
+
+	inline Box()
+	{}
+	//both constructors store the given half size, exactly like the matching Set overload
+	inline Box(PE_REF(Vector3) half_);
+	inline Box(float hx, float hy, float hz);
+
+	//replace the stored half size
+	inline void Set(PE_REF(Vector3) half_);
+	inline void Set(float hx, float hy, float hz);
+
+	//element-wise absolute value of the rotation matrix, multiplied by the half size
+	inline Vector3 GetAABB(const Matrix3& rotation) const;
+};
+
+inline Box::Box(PE_REF(Vector3) half_)
+{
+	half = half_;
+}
+
+inline Box::Box(float hx, float hy, float hz)
+{
+	half = Vector3(hx, hy, hz);
+}
+
+inline void Box::Set(PE_REF(Vector3) half_)
+{
+	half = half_;
+}
+
+inline void Box::Set(float hx, float hy, float hz)
+{
+	const Vector3 extents(hx, hy, hz);
+	half = extents;
+}
+
+inline Vector3 Box::GetAABB(const Matrix3& rotation) const
+{
+	const Matrix3 absRotation = absPerElem(rotation);
+	return absRotation * half;
+}
+
+//-------------------------------------------------------------------------------------------------
+// BoxPoint
+//-------------------------------------------------------------------------------------------------
+
+///The BoxPoint class is an internally used class to contain feature information for boxBoxDistance calculation.
+///A feature (vertex, edge or face of a box) is stored as a FeatureType plus a small bit-packed index;
+///each set*Feature function has a get*Feature counterpart that unpacks what it stored.
+class BoxPoint
+{
+public:
+	//every member gets a defined value: previously featureType and featureIdx were left
+	//uninitialized, so reading them before a set*Feature call was undefined
+	BoxPoint() : localPoint(0.0f), featureType(F), featureIdx(0) {}
+
+	Point3      localPoint;
+	FeatureType featureType;   // which kind of feature featureIdx describes
+	int         featureIdx;    // bit-packed feature description, layout depends on featureType
+
+	inline void setVertexFeature(int plusX, int plusY, int plusZ);
+	inline void setEdgeFeature(int dim0, int plus0, int dim1, int plus1);
+	inline void setFaceFeature(int dim, int plus);
+
+	inline void getVertexFeature(int & plusX, int & plusY, int & plusZ) const;
+	inline void getEdgeFeature(int & dim0, int & plus0, int & dim1, int & plus1) const;
+	inline void getFaceFeature(int & dim, int & plus) const;
+};
+
+///Vertex layout: bit 2 = plusX, bit 1 = plusY, bit 0 = plusZ.
+inline
+void
+BoxPoint::setVertexFeature(int plusX, int plusY, int plusZ)
+{
+	featureType = V;
+	featureIdx = (plusX << 2) | (plusY << 1) | plusZ;
+}
+
+///Edge layout: bit 5 = plus, bits 3-4 = dim of the first pair; bit 2 = plus, bits 0-1 = dim of the second pair.
+///The pair with the smaller dim is always stored first, so (dim0,dim1) and (dim1,dim0) pack identically.
+inline
+void
+BoxPoint::setEdgeFeature(int dim0, int plus0, int dim1, int plus1)
+{
+	featureType = E;
+
+	if (dim0 > dim1) {
+		featureIdx = (plus1 << 5) | (dim1 << 3) | (plus0 << 2) | dim0;
+	} else {
+		featureIdx = (plus0 << 5) | (dim0 << 3) | (plus1 << 2) | dim1;
+	}
+}
+
+///Face layout: bit 2 = plus, bits 0-1 = dim.
+inline
+void
+BoxPoint::setFaceFeature(int dim, int plus)
+{
+	featureType = F;
+	featureIdx = (plus << 2) | dim;
+}
+
+///Unpacks an index written by setVertexFeature.
+inline
+void
+BoxPoint::getVertexFeature(int & plusX, int & plusY, int & plusZ) const
+{
+	plusX = featureIdx >> 2;
+	plusY = (featureIdx >> 1) & 1;
+	plusZ = featureIdx & 1;
+}
+
+///Unpacks an index written by setEdgeFeature (dim0 is the smaller of the two dims that were stored).
+inline
+void
+BoxPoint::getEdgeFeature(int & dim0, int & plus0, int & dim1, int & plus1) const
+{
+	plus0 = featureIdx >> 5;
+	dim0 = (featureIdx >> 3) & 3;
+	plus1 = (featureIdx >> 2) & 1;
+	dim1 = featureIdx & 3;
+}
+
+///Unpacks an index written by setFaceFeature.
+inline
+void
+BoxPoint::getFaceFeature(int & dim, int & plus) const
+{
+	plus = featureIdx >> 2;
+	dim = featureIdx & 3;
+}
+
+#endif /* __BOX_H__ */
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
@ -0,0 +1,295 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "SpuCollisionShapes.h"
+
+///not supported on IBM SDK, until we fix the alignment of btVector3
+#if defined (__CELLOS_LV2__) && defined (__SPU__)
+#include <spu_intrinsics.h>
+static inline vec_float4 vec_dot3( vec_float4 vec0, vec_float4 vec1 )
+{
+    // Multiply element-wise, then accumulate the products of both operands rotated
+    // left by 4 and by 8 bytes. Presumably element 0 of the result then holds the
+    // 3-component dot product - TODO confirm against the SPU intrinsics reference.
+    const vec_float4 products = spu_mul( vec0, vec1 );
+    const vec_float4 firstTwo = spu_madd( spu_rlqwbyte( vec0, 4 ), spu_rlqwbyte( vec1, 4 ), products );
+
+    return spu_madd( spu_rlqwbyte( vec0, 8 ), spu_rlqwbyte( vec1, 8 ), firstTwo );
+}
+#endif //__SPU__
+
+
+///Helper for computeAabb: writes the world-space AABB of a box with the given
+///(margin-inclusive) half extents, oriented and positioned by xform.
+static inline void spuOrientedExtentsToAabb (const btVector3& halfExtents, const btTransform& xform, btVector3& aabbMin, btVector3& aabbMax)
+{
+	const btMatrix3x3 absBasis = xform.getBasis().absolute();
+	const btVector3 center = xform.getOrigin();
+	const btVector3 extent(absBasis[0].dot(halfExtents),absBasis[1].dot(halfExtents),absBasis[2].dot(halfExtents));
+
+	aabbMin = center - extent;
+	aabbMax = center + extent;
+}
+
+///Computes the world-space AABB of a convex shape for the shape types handled below.
+///@param aabbMin,aabbMax  outputs; left untouched for unsupported shape types
+///@param convexShape      shape data used to read the margin and implicit dimensions
+///@param convexShapePtr   main-memory address of the shape; only used for convex hulls,
+///                        which are fetched again through cellDmaGet
+///@param shapeType        one of the *_SHAPE_PROXYTYPE values
+///@param xform            world transform of the shape
+void computeAabb (btVector3& aabbMin, btVector3& aabbMax, btConvexInternalShape* convexShape, ppu_address_t convexShapePtr, int shapeType, const btTransform& xform)
+{
+	switch (shapeType)
+	{
+	case CYLINDER_SHAPE_PROXYTYPE:
+		// cylinders are bounded exactly like boxes
+		/* fall through */
+	case BOX_SHAPE_PROXYTYPE:
+	{
+		const btScalar margin = convexShape->getMarginNV();
+		btVector3 halfExtents = convexShape->getImplicitShapeDimensions();
+		halfExtents += btVector3(margin,margin,margin);
+
+		spuOrientedExtentsToAabb (halfExtents, xform, aabbMin, aabbMax);
+		break;
+	}
+	case CAPSULE_SHAPE_PROXYTYPE:
+	{
+		const btScalar margin = convexShape->getMarginNV();
+		btVector3 halfExtents = convexShape->getImplicitShapeDimensions();
+		//add the radius to y-axis to get full height
+		btScalar radius = halfExtents[0];
+		halfExtents[1] += radius;
+		halfExtents += btVector3(margin,margin,margin);
+#if 0
+		int capsuleUpAxis = convexShape->getUpAxis();
+		btScalar halfHeight = convexShape->getHalfHeight();
+		btScalar radius = convexShape->getRadius();
+		halfExtents[capsuleUpAxis] = radius + halfHeight;
+#endif
+		spuOrientedExtentsToAabb (halfExtents, xform, aabbMin, aabbMax);
+		break;
+	}
+	case SPHERE_SHAPE_PROXYTYPE:
+	{
+		// radius is the x component of the implicit dimensions; local scaling is not applied
+		const btScalar sphereRadius = convexShape->getImplicitShapeDimensions().getX();
+		const btScalar paddedRadius = sphereRadius + convexShape->getMarginNV();
+		const btVector3& origin = xform.getOrigin();
+		const btVector3 extent(paddedRadius,paddedRadius,paddedRadius);
+
+		aabbMin = origin - extent;
+		aabbMax = origin + extent;
+		break;
+	}
+	case CONVEX_HULL_SHAPE_PROXYTYPE:
+	{
+		// fetch the hull into an aligned local buffer, then let it compute its own AABB
+		ATTRIBUTE_ALIGNED16(char hullStorage[sizeof(btConvexHullShape)]);
+		cellDmaGet(&hullStorage, convexShapePtr  , sizeof(btConvexHullShape), DMA_TAG(1), 0, 0);
+		cellDmaWaitTagStatusAll(DMA_MASK(1));
+
+		btConvexHullShape* localHull = (btConvexHullShape*)&hullStorage;
+		const btScalar hullMargin = convexShape->getMarginNV();
+		localHull->getNonvirtualAabb(xform,aabbMin,aabbMax,hullMargin);
+		break;
+	}
+	default:
+		{
+			// unsupported shape type: aabbMin/aabbMax are not written
+		}
+	}
+}
+
+///Fetches the mesh interface and the optimized BVH of a triangle mesh shape into
+///bvhMeshShape and waits until both transfers have completed.
+///@param bvhMeshShape  destination storage; gTriangleMeshInterfacePtr is set as well
+///@param triMeshShape  shape whose getMeshInterface()/getOptimizedBvh() addresses are fetched
+void dmaBvhShapeData (bvhMeshShape_LocalStoreMemory* bvhMeshShape, btBvhTriangleMeshShape* triMeshShape)
+{
+	// 'register' was dropped from these locals: it is deprecated since C++11 and
+	// ill-formed in C++17, and has no effect on the generated code.
+	int dmaSize;
+	ppu_address_t	dmaPpuAddress2;
+
+	// mesh interface: transfer tagged 1
+	dmaSize = sizeof(btTriangleIndexVertexArray);
+	dmaPpuAddress2 = reinterpret_cast<ppu_address_t>(triMeshShape->getMeshInterface());
+#ifdef __SPU__
+	cellDmaGet(&bvhMeshShape->gTriangleMeshInterfaceStorage, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
+	bvhMeshShape->gTriangleMeshInterfacePtr = &bvhMeshShape->gTriangleMeshInterfaceStorage;
+#else
+	// off the SPU the pointer returned by the read-only get is used directly
+	bvhMeshShape->gTriangleMeshInterfacePtr = (btTriangleIndexVertexArray*)cellDmaGetReadOnly(&bvhMeshShape->gTriangleMeshInterfaceStorage, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
+#endif
+
+	///now DMA over the BVH (transfer tagged 2)
+	dmaSize = sizeof(btOptimizedBvh);
+	dmaPpuAddress2 = reinterpret_cast<ppu_address_t>(triMeshShape->getOptimizedBvh());
+	cellDmaGet(&bvhMeshShape->gOptimizedBvh, dmaPpuAddress2  , dmaSize, DMA_TAG(2), 0, 0);
+
+	// a single wait covers both transfers
+	cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
+}
+
+///Starts the transfer of indexArray[index] into IndexMesh using the given tag.
+///No wait is issued here.
+void dmaBvhIndexedMesh (btIndexedMesh* IndexMesh, IndexedMeshArray& indexArray, int index, uint32_t dmaTag)
+{
+	const ppu_address_t meshPartAddress = (ppu_address_t)&indexArray[index];
+
+	cellDmaGet(IndexMesh, meshPartAddress, sizeof(btIndexedMesh), DMA_TAG(dmaTag), 0, 0);
+}
+
+///Starts the transfer of batchSize consecutive btBvhSubtreeInfo records from
+///subTreePtr into subTreeHeaders using the given tag. No wait is issued here.
+void dmaBvhSubTreeHeaders (btBvhSubtreeInfo* subTreeHeaders, ppu_address_t subTreePtr, int batchSize, uint32_t dmaTag)
+{
+	cellDmaGet(
+		subTreeHeaders,
+		subTreePtr,
+		batchSize * sizeof(btBvhSubtreeInfo),
+		DMA_TAG(dmaTag),
+		0,
+		0);
+}
+
+///Starts the transfer of all quantized nodes of one subtree into nodes.
+///@param nodes      destination buffer for subtree.m_subtreeSize nodes
+///@param subtree    header giving the root node index and the node count
+///@param nodeArray  main-memory node array the subtree indexes into
+///@param dmaTag     tag for the transfer. It used to be ignored in favour of a
+///                  hard-coded tag 2; it is now honoured like in the sibling dma*
+///                  helpers, so callers that pass 2 see no change.
+///No wait is issued here.
+void dmaBvhSubTreeNodes (btQuantizedBvhNode* nodes, const btBvhSubtreeInfo& subtree, QuantizedNodeArray&	nodeArray, int dmaTag)
+{
+	cellDmaGet(nodes, reinterpret_cast<ppu_address_t>(&nodeArray[subtree.m_rootNodeIndex]) , subtree.m_subtreeSize* sizeof(btQuantizedBvhNode), DMA_TAG(dmaTag), 0, 0);
+}
+
+///Returns sizeof() of the concrete shape class that belongs to shapeType.
+///Supported types are asserted to be smaller than MAX_SHAPE_SIZE; unsupported
+///types assert and return 0.
+///getShapeTypeSize could easily be optimized, but it is not likely a bottleneck
+int		getShapeTypeSize(int shapeType)
+{
+	int shapeSize = 0;
+
+	switch (shapeType)
+	{
+	case CYLINDER_SHAPE_PROXYTYPE:
+		shapeSize = sizeof(btCylinderShape);
+		break;
+	case BOX_SHAPE_PROXYTYPE:
+		shapeSize = sizeof(btBoxShape);
+		break;
+	case SPHERE_SHAPE_PROXYTYPE:
+		shapeSize = sizeof(btSphereShape);
+		break;
+	case TRIANGLE_MESH_SHAPE_PROXYTYPE:
+		shapeSize = sizeof(btBvhTriangleMeshShape);
+		break;
+	case CAPSULE_SHAPE_PROXYTYPE:
+		shapeSize = sizeof(btCapsuleShape);
+		break;
+	case CONVEX_HULL_SHAPE_PROXYTYPE:
+		shapeSize = sizeof(btConvexHullShape);
+		break;
+	case COMPOUND_SHAPE_PROXYTYPE:
+		shapeSize = sizeof(btCompoundShape);
+		break;
+	default:
+		//unsupported shapetype, please add here
+		btAssert(0);
+		return 0;
+	}
+
+	// one shared check instead of one per case
+	btAssert(shapeSize < MAX_SHAPE_SIZE);
+	return shapeSize;
+}
+
+///Records the point count of a convex hull and starts the transfer of its unscaled
+///points into convexVertexData->g_convexPointBuffer (transfer tagged 2).
+///No wait is issued here.
+///NOTE(review): when the hull has more than MAX_NUM_SPU_CONVEX_POINTS points the
+///function asserts and returns without transferring anything, but gNumConvexPoints
+///keeps the oversized count - callers should confirm they handle that in builds
+///where btAssert is compiled out.
+void dmaConvexVertexData (SpuConvexPolyhedronVertexData* convexVertexData, btConvexHullShape* convexShapeSPU)
+{
+	convexVertexData->gNumConvexPoints = convexShapeSPU->getNumPoints();
+	if (convexVertexData->gNumConvexPoints>MAX_NUM_SPU_CONVEX_POINTS)
+	{
+		btAssert(0);
+	//	spu_printf("SPU: Error: MAX_NUM_SPU_CONVEX_POINTS(%d) exceeded: %d\n",MAX_NUM_SPU_CONVEX_POINTS,convexVertexData->gNumConvexPoints);
+		return;
+	}
+
+	// 'register' was dropped: deprecated since C++11 and ill-formed in C++17.
+	int dmaSize = convexVertexData->gNumConvexPoints*sizeof(btVector3);
+	ppu_address_t pointsPPU = (ppu_address_t) convexShapeSPU->getUnscaledPoints();
+	cellDmaGet(&convexVertexData->g_convexPointBuffer[0], pointsPPU  , dmaSize, DMA_TAG(2), 0, 0);
+}
+
+///Starts the transfer of one collision shape from collisionShapePtr into
+///collisionShapeLocation. The byte count comes from getShapeTypeSize(shapeType).
+///No wait is issued here; the caller waits on dmaTag.
+void dmaCollisionShape (void* collisionShapeLocation, ppu_address_t collisionShapePtr, uint32_t dmaTag, int shapeType)
+{
+	// 'register' was dropped: deprecated since C++11 and ill-formed in C++17.
+	int dmaSize = getShapeTypeSize(shapeType);
+	cellDmaGet(collisionShapeLocation, collisionShapePtr  , dmaSize, DMA_TAG(dmaTag), 0, 0);
+}
+
+///Starts the transfer of a compound shape's child list into
+///compoundShapeLocation->gSubshapes using the given tag. No wait is issued here.
+void dmaCompoundShapeInfo (CompoundShape_LocalStoreMemory* compoundShapeLocation, btCompoundShape* spuCompoundShape, uint32_t dmaTag)
+{
+	// 'register' was dropped from these locals: deprecated since C++11 and ill-formed in C++17.
+	int dmaSize;
+	ppu_address_t	dmaPpuAddress2;
+	int childShapeCount = spuCompoundShape->getNumChildShapes();
+
+	// gSubshapes holds MAX_SPU_COMPOUND_SUBSHAPES entries; a larger child list would
+	// overrun it, so catch that in debug builds.
+	btAssert(childShapeCount <= MAX_SPU_COMPOUND_SUBSHAPES);
+
+	dmaSize = childShapeCount * sizeof(btCompoundShapeChild);
+	dmaPpuAddress2 = (ppu_address_t)spuCompoundShape->getChildList();
+	cellDmaGet(&compoundShapeLocation->gSubshapes[0], dmaPpuAddress2, dmaSize, DMA_TAG(dmaTag), 0, 0);
+}
+
+///Starts one transfer per child shape of a compound shape. Child i is read from the
+///address stored in gSubshapes[i] and written to gSubshapeShape[i]; every transfer
+///uses the same tag. gSubshapes must already hold the child list.
+void dmaCompoundSubShapes (CompoundShape_LocalStoreMemory* compoundShapeLocation, btCompoundShape* spuCompoundShape, uint32_t dmaTag)
+{
+	const int numChildren = spuCompoundShape->getNumChildShapes();
+
+	for (int childIndex = 0; childIndex < numChildren; ++childIndex)
+	{
+		const btCompoundShapeChild& child = compoundShapeLocation->gSubshapes[childIndex];
+		const ppu_address_t childAddress = (ppu_address_t)child.m_childShape;
+
+		dmaCollisionShape (&compoundShapeLocation->gSubshapeShape[childIndex], childAddress, dmaTag, child.m_childShapeType);
+	}
+}
+
+
+///Walks the nodes [startNodeIndex, endNodeIndex) of a quantized tree without a stack.
+///rootNode points at the node for startNodeIndex. Leaf nodes whose AABB overlaps the
+///quantized query AABB are reported through nodeCallback->processNode(0, triangleIndex).
+///An internal node that does not overlap is skipped by its escape index.
+void	spuWalkStacklessQuantizedTree(btNodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,const btQuantizedBvhNode* rootNode,int startNodeIndex,int endNodeIndex)
+{
+#ifdef BT_DEBUG
+	const int subTreeSize = endNodeIndex - startNodeIndex;
+#endif
+	int walkIterations = 0;
+
+	const btQuantizedBvhNode* node = rootNode;
+
+	for (int curIndex = startNodeIndex; curIndex < endNodeIndex; )
+	{
+		//catch bugs in tree data
+		btAssert (walkIterations < subTreeSize);
+		walkIterations++;
+
+		const unsigned int overlaps = spuTestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,node->m_quantizedAabbMin,node->m_quantizedAabbMax);
+		const unsigned int isLeaf = node->isLeafNode();
+
+		if (!overlaps && !isLeaf)
+		{
+			// non-overlapping internal node: jump over its whole subtree
+			const int escapeIndex = node->getEscapeIndex();
+			node += escapeIndex;
+			curIndex += escapeIndex;
+			continue;
+		}
+
+		if (overlaps && isLeaf)
+		{
+			nodeCallback->processNode(0,node->getTriangleIndex());
+		}
+
+		// leaf, or overlapping internal node: continue with the next node in memory
+		node++;
+		curIndex++;
+	}
+}
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h
@ -0,0 +1,125 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+#ifndef __SPU_COLLISION_SHAPES_H
+#define __SPU_COLLISION_SHAPES_H
+
+#include "../SpuDoubleBuffer.h"
+
+#include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h"
+#include "BulletCollision/CollisionShapes/btConvexInternalShape.h"
+#include "BulletCollision/CollisionShapes/btCylinderShape.h"
+
+#include "BulletCollision/CollisionShapes/btOptimizedBvh.h"
+#include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
+#include "BulletCollision/CollisionShapes/btSphereShape.h"
+
+#include "BulletCollision/CollisionShapes/btCapsuleShape.h"
+
+#include "BulletCollision/CollisionShapes/btConvexShape.h"
+#include "BulletCollision/CollisionShapes/btBvhTriangleMeshShape.h"
+#include "BulletCollision/CollisionShapes/btConvexHullShape.h"
+#include "BulletCollision/CollisionShapes/btCompoundShape.h"
+
+#define MAX_NUM_SPU_CONVEX_POINTS 128
+
+///16-byte aligned storage for the points of one convex hull.
+///dmaConvexVertexData() fills gNumConvexPoints and transfers the hull's unscaled
+///points into g_convexPointBuffer.
+ATTRIBUTE_ALIGNED16(struct)	SpuConvexPolyhedronVertexData
+{
+	void*	gSpuConvexShapePtr;   // not written in SpuCollisionShapes.cpp; presumably set by the caller - TODO confirm
+	btVector3* gConvexPoints;     // not written in SpuCollisionShapes.cpp; presumably points at g_convexPointBuffer - TODO confirm
+	int gNumConvexPoints;         // point count as reported by btConvexHullShape::getNumPoints()
+	int unused;
+	// capacity is MAX_NUM_SPU_CONVEX_POINTS points
+	ATTRIBUTE_ALIGNED16(btVector3 g_convexPointBuffer[MAX_NUM_SPU_CONVEX_POINTS]);
+};
+
+#define MAX_SHAPE_SIZE 256
+
+///16-byte aligned raw buffer of MAX_SHAPE_SIZE bytes for one collision shape.
+///getShapeTypeSize() asserts that every supported shape class is smaller than this.
+ATTRIBUTE_ALIGNED16(struct) CollisionShape_LocalStoreMemory
+{
+	ATTRIBUTE_ALIGNED16(char collisionShape[MAX_SHAPE_SIZE]);
+};
+
+///Storage for a compound shape: its child list plus one raw shape buffer per child.
+///Filled by dmaCompoundShapeInfo() (gSubshapes) and dmaCompoundSubShapes() (gSubshapeShape).
+ATTRIBUTE_ALIGNED16(struct) CompoundShape_LocalStoreMemory
+{
+	// Compound data
+	// at most MAX_SPU_COMPOUND_SUBSHAPES children fit into the arrays below
+#define MAX_SPU_COMPOUND_SUBSHAPES 16
+	ATTRIBUTE_ALIGNED16(btCompoundShapeChild gSubshapes[MAX_SPU_COMPOUND_SUBSHAPES]);
+	// gSubshapeShape[i] holds the shape data of child gSubshapes[i]
+	ATTRIBUTE_ALIGNED16(char gSubshapeShape[MAX_SPU_COMPOUND_SUBSHAPES][MAX_SHAPE_SIZE]);
+};
+
+///Storage for a BVH triangle mesh shape: the optimized BVH, the mesh interface,
+///one indexed mesh part, and buffers for subtree headers and subtree nodes.
+///dmaBvhShapeData() fills gOptimizedBvh and the mesh interface members.
+ATTRIBUTE_ALIGNED16(struct) bvhMeshShape_LocalStoreMemory
+{
+	//ATTRIBUTE_ALIGNED16(btOptimizedBvh	gOptimizedBvh);
+	// raw bytes instead of a btOptimizedBvh object; 16 bytes larger than the class
+	ATTRIBUTE_ALIGNED16(char gOptimizedBvh[sizeof(btOptimizedBvh)+16]);
+	// typed view of the raw gOptimizedBvh bytes
+	btOptimizedBvh*	getOptimizedBvh()
+	{
+		return (btOptimizedBvh*) gOptimizedBvh;
+	}
+
+	ATTRIBUTE_ALIGNED16(btTriangleIndexVertexArray	gTriangleMeshInterfaceStorage);
+	// on SPU this points at gTriangleMeshInterfaceStorage; otherwise at whatever
+	// cellDmaGetReadOnly() returned (see dmaBvhShapeData)
+	btTriangleIndexVertexArray*	gTriangleMeshInterfacePtr;
+	///only a single mesh part for now, we can add support for multiple parts, but quantized trees don't support this at the moment 
+	ATTRIBUTE_ALIGNED16(btIndexedMesh	gIndexMesh);
+	#define MAX_SPU_SUBTREE_HEADERS 32
+	//1024
+	ATTRIBUTE_ALIGNED16(btBvhSubtreeInfo	gSubtreeHeaders[MAX_SPU_SUBTREE_HEADERS]);
+	// sized in nodes from the byte budget MAX_SUBTREE_SIZE_IN_BYTES
+	ATTRIBUTE_ALIGNED16(btQuantizedBvhNode	gSubtreeNodes[MAX_SUBTREE_SIZE_IN_BYTES/sizeof(btQuantizedBvhNode)]);
+};
+
+
+void computeAabb (btVector3& aabbMin, btVector3& aabbMax, btConvexInternalShape* convexShape, ppu_address_t convexShapePtr, int shapeType, const btTransform& xform);
+void dmaBvhShapeData (bvhMeshShape_LocalStoreMemory* bvhMeshShape, btBvhTriangleMeshShape* triMeshShape);
+void dmaBvhIndexedMesh (btIndexedMesh* IndexMesh, IndexedMeshArray& indexArray, int index, uint32_t dmaTag);
+void dmaBvhSubTreeHeaders (btBvhSubtreeInfo* subTreeHeaders, ppu_address_t subTreePtr, int batchSize, uint32_t dmaTag);
+void dmaBvhSubTreeNodes (btQuantizedBvhNode* nodes, const btBvhSubtreeInfo& subtree, QuantizedNodeArray&	nodeArray, int dmaTag);
+
+int  getShapeTypeSize(int shapeType);
+void dmaConvexVertexData (SpuConvexPolyhedronVertexData* convexVertexData, btConvexHullShape* convexShapeSPU);
+void dmaCollisionShape (void* collisionShapeLocation, ppu_address_t collisionShapePtr, uint32_t dmaTag, int shapeType);
+void dmaCompoundShapeInfo (CompoundShape_LocalStoreMemory* compoundShapeLocation, btCompoundShape* spuCompoundShape, uint32_t dmaTag);
+void dmaCompoundSubShapes (CompoundShape_LocalStoreMemory* compoundShapeLocation, btCompoundShape* spuCompoundShape, uint32_t dmaTag);
+
+
+#define USE_BRANCHFREE_TEST 1
+#ifdef USE_BRANCHFREE_TEST
+///Branch-free overlap test of two quantized AABBs.
+///@param aabbMin1,aabbMax1  corners of the first box (3 components each)
+///@param aabbMin2,aabbMax2  corners of the second box (3 components each)
+///@return 1 if the boxes overlap on all three axes (touching counts), 0 otherwise
+///All four pointers are read-only; the first two are now const as well, matching
+///the branching variant, so callers holding non-const pointers are unaffected.
+SIMD_FORCE_INLINE unsigned int spuTestQuantizedAabbAgainstQuantizedAabb(const unsigned short int* aabbMin1,const unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int* aabbMax2)
+{		
+#if defined(__CELLOS_LV2__) && defined (__SPU__)
+	// one vector compare covers all six min > max checks; any set lane means "separated"
+	vec_ushort8 vecMin = {aabbMin1[0],aabbMin2[0],aabbMin1[2],aabbMin2[2],aabbMin1[1],aabbMin2[1],0,0};
+	vec_ushort8 vecMax = {aabbMax2[0],aabbMax1[0],aabbMax2[2],aabbMax1[2],aabbMax2[1],aabbMax1[1],0,0};
+	vec_ushort8 isGt = spu_cmpgt(vecMin,vecMax);
+	return spu_extract(spu_gather(isGt),0)==0;
+
+#else
+	// bitwise & instead of && keeps the expression free of branches
+	return btSelect((unsigned)((aabbMin1[0] <= aabbMax2[0]) & (aabbMax1[0] >= aabbMin2[0])
+		& (aabbMin1[2] <= aabbMax2[2]) & (aabbMax1[2] >= aabbMin2[2])
+		& (aabbMin1[1] <= aabbMax2[1]) & (aabbMax1[1] >= aabbMin2[1])),
+		1, 0);
+#endif
+}
+#else
+
+///Branching overlap test of two quantized AABBs (used when USE_BRANCHFREE_TEST is not defined).
+///@return 1 if the boxes overlap on all three axes (touching counts), 0 otherwise
+SIMD_FORCE_INLINE unsigned int spuTestQuantizedAabbAgainstQuantizedAabb(const unsigned short int* aabbMin1,const unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int*  aabbMax2)
+{
+	// the boxes are disjoint as soon as they are separated along any single axis
+	const bool separatedOn0 = (aabbMin1[0] > aabbMax2[0]) || (aabbMax1[0] < aabbMin2[0]);
+	const bool separatedOn2 = (aabbMin1[2] > aabbMax2[2]) || (aabbMax1[2] < aabbMin2[2]);
+	const bool separatedOn1 = (aabbMin1[1] > aabbMax2[1]) || (aabbMax1[1] < aabbMin2[1]);
+
+	if (separatedOn0 || separatedOn2 || separatedOn1)
+	{
+		return 0;
+	}
+	return 1;
+}
+#endif
+
+void	spuWalkStacklessQuantizedTree(btNodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,const btQuantizedBvhNode* rootNode,int startNodeIndex,int endNodeIndex);
+
+#endif
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
@ -0,0 +1,236 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuContactResult.h"
+
+//#define DEBUG_SPU_COLLISION_DETECTION 1
+
+
+///Creates a result object that is not yet bound to a manifold; setContactInfo()
+///binds it. All scalar members now start from a defined value - previously the
+///combined friction/restitution and the swap flag were left uninitialized.
+SpuContactResult::SpuContactResult()
+	: m_manifoldAddress(0),
+	m_spuManifold(NULL),
+	m_RequiresWriteBack(false),
+	m_combinedFriction(btScalar(0.)),
+	m_combinedRestitution(btScalar(0.)),
+	m_isSwapped(false)
+{
+}
+
+ ///Swaps the export double buffer one last time. Per the note in
+ ///writeDoubleBufferedManifold(), swapBuffers() performs the wait for a pending
+ ///put - presumably this keeps the buffer alive until that transfer is done (TODO confirm in SpuDoubleBuffer.h).
+ SpuContactResult::~SpuContactResult()
+{
+	g_manifoldDmaExport.swapBuffers();
+}
+
+ 	///User can override this material combiner by implementing gContactAddedCallback and setting body0->m_collisionFlags |= btCollisionObject::customMaterialCallback;
+inline btScalar	calculateCombinedFriction(btScalar friction0,btScalar friction1)
+{
+	// Combined friction is the product of both coefficients, clamped to
+	// [-MAX_FRICTION, MAX_FRICTION].
+	const btScalar MAX_FRICTION  = btScalar(10.);
+	const btScalar product = friction0*friction1;
+
+	if (product < -MAX_FRICTION)
+	{
+		return -MAX_FRICTION;
+	}
+	if (product > MAX_FRICTION)
+	{
+		return MAX_FRICTION;
+	}
+	return product;
+}
+
+inline btScalar	calculateCombinedRestitution(btScalar restitution0,btScalar restitution1)
+{
+	// Combined restitution is the plain product of both coefficients (no clamping).
+	const btScalar combined = restitution0*restitution1;
+	return combined;
+}
+
+
+
+ void	SpuContactResult::setContactInfo(btPersistentManifold* spuManifold, ppu_address_t	manifoldAddress,const btTransform& worldTrans0,const btTransform& worldTrans1, btScalar restitution0,btScalar restitution1, btScalar friction0,btScalar friction1, bool isSwapped)
+ {
+	// Binds this result to one manifold and stores the per-pair data that
+	// addContactPoint() and flush() use later.
+
+	// manifold: local copy to work on, and the address flush() writes it back to
+	m_spuManifold = spuManifold;
+	m_manifoldAddress = manifoldAddress;
+
+	// world transforms of both objects
+	m_rootWorldTransform0 = worldTrans0;
+	m_rootWorldTransform1 = worldTrans1;
+
+	// material values shared by every contact point of this pair
+	m_combinedRestitution = calculateCombinedRestitution(restitution0,restitution1);
+	m_combinedFriction = calculateCombinedFriction(friction0,friction1);
+
+	m_isSwapped = isSwapped;
+ }
+
+ ///No-op: this implementation ignores the part id and index of shape A.
+ void SpuContactResult::setShapeIdentifiersA(int partId0,int index0)
+ {
+	
+ }
+
+ ///No-op: this implementation ignores the part id and index of shape B.
+ void SpuContactResult::setShapeIdentifiersB(int partId1,int index1)
+ {
+	
+ }
+
+
+
+ ///Adds a contact point to manifoldPtr unless it is too far away or already cached.
+ ///@param normalOnBInWorld     contact normal on B in world space (negated when isSwapped)
+ ///@param pointInWorld         contact point in world space
+ ///@param depth                contact distance; points beyond the manifold's contact
+ ///                            breaking threshold are rejected
+ ///@param manifoldPtr          manifold to add the point to
+ ///@param transA,transB        world transforms used to express the point in both local frames
+ ///@param combinedFriction     stored on a newly added point
+ ///@param combinedRestitution  stored on a newly added point
+ ///@param isSwapped            when true the normal is flipped before use
+ ///@return true if a new point was added, i.e. a dma transfer back is required;
+ ///        false if the point was rejected or an existing cache entry was kept as-is
+bool ManifoldResultAddContactPoint(const btVector3& normalOnBInWorld,
+								   const btVector3& pointInWorld,
+								   float depth,
+								   btPersistentManifold* manifoldPtr,
+								   btTransform& transA,
+								   btTransform& transB,
+									btScalar	combinedFriction,
+									btScalar	combinedRestitution,
+								   bool isSwapped)
+{
+	// The threshold is read once into a local so the debug print below compiles;
+	// it used to reference a variable whose declaration had been commented out.
+	const btScalar contactThreshold = manifoldPtr->getContactBreakingThreshold();
+
+#ifdef DEBUG_SPU_COLLISION_DETECTION
+	spu_printf("SPU: contactTreshold %f\n",contactThreshold);
+#endif //DEBUG_SPU_COLLISION_DETECTION
+	if (depth > contactThreshold)
+		return false;
+
+	//provide inverses or just calculate?
+	btTransform transAInv = transA.inverse();//m_body0->m_cachedInvertedWorldTransform;
+	btTransform transBInv= transB.inverse();//m_body1->m_cachedInvertedWorldTransform;
+
+	// Only the sign of the normal depends on isSwapped; the local points are
+	// computed the same way in both cases (A from pointA, B from pointInWorld).
+	btVector3 normal = normalOnBInWorld;
+	if (isSwapped)
+	{
+		normal = normalOnBInWorld * -1;
+	}
+
+	btVector3 pointA = pointInWorld + normal * depth;
+	btVector3 localA = transAInv(pointA );
+	btVector3 localB = transBInv(pointInWorld);
+
+	btManifoldPoint newPt(localA,localB,normal,depth);
+
+	int insertIndex = manifoldPtr->getCacheEntry(newPt);
+	if (insertIndex >= 0)
+	{
+#ifdef DEBUG_SPU_COLLISION_DETECTION
+		spu_printf("SPU: same contact detected, nothing done\n");
+#endif //DEBUG_SPU_COLLISION_DETECTION
+		// This is not needed, just use the old info! saves a DMA transfer as well
+		return false;
+	}
+
+	newPt.m_combinedFriction = combinedFriction;
+	newPt.m_combinedRestitution = combinedRestitution;
+
+	///@todo: SPU callbacks, either immediate (local on the SPU), or deferred,
+	///so the user can override friction and/or restitution per contact
+	manifoldPtr->addManifoldPoint(newPt);
+	return true;
+}
+
+
+///Writes a manifold back to main memory through the export double buffer.
+///@param lsManifold  manifold to copy from
+///@param mmManifold  destination of the write-back (used as a main-memory address)
+///Compiles to an empty function unless __SPU__ or USE_LIBSPE2 is defined.
+void SpuContactResult::writeDoubleBufferedManifold(btPersistentManifold* lsManifold, btPersistentManifold* mmManifold)
+{
+	///only write back the contact information on SPU. Other platforms avoid copying, and use the data in-place
+	///see SpuFakeDma.cpp 'cellDmaLargeGetReadOnly'
+#if defined (__SPU__) || defined (USE_LIBSPE2)
+    // 1) copy the manifold into the front buffer
+    memcpy(g_manifoldDmaExport.getFront(),lsManifold,sizeof(btPersistentManifold));
+
+    // 2) swap, so the copy just made becomes the back buffer
+    g_manifoldDmaExport.swapBuffers();
+    ppu_address_t mmAddr = (ppu_address_t)mmManifold;
+    // 3) start the put of the back buffer to main memory (transfer tagged 9)
+    g_manifoldDmaExport.backBufferDmaPut(mmAddr, sizeof(btPersistentManifold), DMA_TAG(9));
+	// Should there be any kind of wait here?  What if somebody tries to use this tag again?  What if we call this function again really soon?
+	//no, the swapBuffers does the wait
+#endif
+}
+
+///Forwards one contact point to ManifoldResultAddContactPoint() for the manifold set
+///by setContactInfo(), and remembers whether a write-back is needed at flush() time.
+void SpuContactResult::addContactPoint(const btVector3& normalOnBInWorld,const btVector3& pointInWorld,btScalar depth)
+{
+	// rebuild both vectors from their x/y/z components only
+	const btVector3 worldNormal(normalOnBInWorld.getX(),normalOnBInWorld.getY(),normalOnBInWorld.getZ());
+	const btVector3 worldPoint(pointInWorld.getX(),pointInWorld.getY(),pointInWorld.getZ());
+
+	const bool pointAdded = ManifoldResultAddContactPoint(
+		worldNormal,
+		worldPoint,
+		depth,
+		m_spuManifold,
+		m_rootWorldTransform0,
+		m_rootWorldTransform1,
+		m_combinedFriction,
+		m_combinedRestitution,
+		m_isSwapped);
+
+	// the flag is sticky: once set it stays set until flush() clears it
+	if (pointAdded)
+	{
+		m_RequiresWriteBack = true;
+	}
+}
+
+///Finishes work on the current manifold: refreshes its contact points if it has any,
+///writes it back when required, and unbinds it from this result object.
+void SpuContactResult::flush()
+{
+	bool writeBack = m_RequiresWriteBack;
+
+	// a manifold that holds contacts is always refreshed and written back
+	const bool hasContacts = m_spuManifold && m_spuManifold->getNumContacts();
+	if (hasContacts)
+	{
+		m_spuManifold->refreshContactPoints(m_rootWorldTransform0,m_rootWorldTransform1);
+		writeBack = true;
+	}
+
+	if (writeBack)
+	{
+#ifdef DEBUG_SPU_COLLISION_DETECTION
+		spu_printf("SPU: Start SpuContactResult::flush (Put) DMA\n");
+		spu_printf("Num contacts:%d\n", m_spuManifold->getNumContacts());
+		spu_printf("Manifold address: %llu\n", m_manifoldAddress);
+#endif //DEBUG_SPU_COLLISION_DETECTION
+		writeDoubleBufferedManifold(m_spuManifold, (btPersistentManifold*)m_manifoldAddress);
+#ifdef DEBUG_SPU_COLLISION_DETECTION
+		spu_printf("SPU: Finished (Put) DMA\n");
+#endif //DEBUG_SPU_COLLISION_DETECTION
+	}
+
+	// reset for the next pair
+	m_RequiresWriteBack = false;
+	m_spuManifold = NULL;
+}
+
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.h
@ -0,0 +1,106 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef SPU_CONTACT_RESULT2_H
+#define SPU_CONTACT_RESULT2_H
+
+
+#ifndef WIN32
+#include <stdint.h>
+#endif
+
+
+
+#include "../SpuDoubleBuffer.h"
+
+
+#include "LinearMath/btTransform.h"
+
+
+#include "BulletCollision/NarrowPhaseCollision/btPersistentManifold.h"
+#include "BulletCollision/NarrowPhaseCollision/btDiscreteCollisionDetectorInterface.h"
+
+class btCollisionShape;
+
+
+///Input data describing one collision pair. Members suffixed 0/1 (or indexed [0]/[1])
+///belong to the first/second object of the pair.
+struct SpuCollisionPairInput
+{
+	ppu_address_t m_collisionShapes[2];        // presumably main-memory addresses of both shapes - TODO confirm
+	btCollisionShape*	m_spuCollisionShapes[2];   // presumably the local copies of both shapes - TODO confirm
+
+	ppu_address_t m_persistentManifoldPtr;
+	btVector3	m_primitiveDimensions0;
+	btVector3	m_primitiveDimensions1;
+	int		m_shapeType0;
+	int		m_shapeType1;	
+	float	m_collisionMargin0;
+	float	m_collisionMargin1;
+
+	btTransform	m_worldTransform0;
+	btTransform m_worldTransform1;
+	
+	bool	m_isSwapped;
+	bool    m_useEpa;
+};
+
+
+///ClosestPointInput extended with a pointer to the convex vertex data of each of the two shapes.
+struct SpuClosestPointInput : public btDiscreteCollisionDetectorInterface::ClosestPointInput
+{
+	struct SpuConvexPolyhedronVertexData* m_convexVertexData[2];
+};
+
+///SpuContactResult exports the contact points using double-buffered DMA transfers, only when needed
+///So when an existing contact point is duplicated, no transfer/refresh is performed.
+///Usage, as implemented in SpuContactResult.cpp: setContactInfo() binds a manifold,
+///addContactPoint() adds points to it, and flush() writes it back if required and
+///unbinds it again.
+class SpuContactResult : public btDiscreteCollisionDetectorInterface::Result
+{
+	// world transforms of both objects, set by setContactInfo()
+	btTransform		m_rootWorldTransform0;
+	btTransform		m_rootWorldTransform1;
+	// address the manifold is written back to by flush()
+	ppu_address_t	m_manifoldAddress;
+
+	// manifold currently being worked on; NULL while unbound
+	btPersistentManifold* m_spuManifold;
+	// set when a point was added, cleared by flush()
+	bool m_RequiresWriteBack;
+	btScalar	m_combinedFriction;
+	btScalar	m_combinedRestitution;
+
+	bool m_isSwapped;
+
+	// double buffer used for the write-back in writeDoubleBufferedManifold()
+	DoubleBuffer<btPersistentManifold, 1> g_manifoldDmaExport;
+
+	public:
+		SpuContactResult();
+		virtual ~SpuContactResult();
+
+		btPersistentManifold*	GetSpuManifold() const
+		{
+			return m_spuManifold;
+		}
+
+		// both are no-ops in this implementation
+		virtual void setShapeIdentifiersA(int partId0,int index0);
+		virtual void setShapeIdentifiersB(int partId1,int index1);
+
+		// last scalar parameter renamed from 'friction01' to 'friction1' to match the definition
+		void	setContactInfo(btPersistentManifold* spuManifold, ppu_address_t	manifoldAddress,const btTransform& worldTrans0,const btTransform& worldTrans1, btScalar restitution0,btScalar restitution1, btScalar friction0,btScalar friction1, bool isSwapped);
+
+
+		void writeDoubleBufferedManifold(btPersistentManifold* lsManifold, btPersistentManifold* mmManifold);
+
+		virtual void addContactPoint(const btVector3& normalOnBInWorld,const btVector3& pointInWorld,btScalar depth);
+
+		void flush();
+};
+
+
+
+#endif //SPU_CONTACT_RESULT2_H
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h
@ -0,0 +1,51 @@
+
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef SPU_CONVEX_PENETRATION_DEPTH_H
+#define SPU_CONVEX_PENETRATION_DEPTH_H
+
+
+
+class btStackAlloc;
+class btIDebugDraw;
+// Directory name spelled 'NarrowPhaseCollision' as in the other Bullet includes;
+// the previous 'NarrowphaseCollision' spelling fails on case-sensitive filesystems.
+#include "BulletCollision/NarrowPhaseCollision/btConvexPenetrationDepthSolver.h"
+
+#include <LinearMath/btTransform.h>
+
+
+///ConvexPenetrationDepthSolver provides an interface for penetration depth calculation.
+///This SPU variant takes the shapes as untyped pointers plus shape types and margins,
+///and additionally receives the convex vertex data of both shapes.
+///NOTE(review): SpuVoronoiSimplexSolver is not declared by this header; it must be
+///declared before this header is included - confirm against the including files.
+class SpuConvexPenetrationDepthSolver : public btConvexPenetrationDepthSolver
+{
+public:	
+	
+	virtual ~SpuConvexPenetrationDepthSolver() {}
+
+	///Computes the penetration depth between convexA and convexB.
+	///The results are returned through v, pa and pb; their exact meaning is defined
+	///by the implementing class.
+	virtual bool calcPenDepth( SpuVoronoiSimplexSolver& simplexSolver,
+	        void* convexA,void* convexB,int shapeTypeA, int shapeTypeB, float marginA, float marginB,
+            btTransform& transA,const btTransform& transB,
+			btVector3& v, btVector3& pa, btVector3& pb,
+			class btIDebugDraw* debugDraw,btStackAlloc* stackAlloc,
+			struct SpuConvexPolyhedronVertexData* convexVertexDataA,
+			struct SpuConvexPolyhedronVertexData* convexVertexDataB
+			) const = 0;
+
+
+};
+
+
+
+#endif //SPU_CONVEX_PENETRATION_DEPTH_H
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h
@ -0,0 +1,140 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef SPU_GATHERING_COLLISION_TASK_H
+#define SPU_GATHERING_COLLISION_TASK_H
+
+#include "../PlatformDefinitions.h"
+//#define DEBUG_SPU_COLLISION_DETECTION 1
+
+
+///Task Description for SPU collision detection
+///The libspe2 SPU main loop in this header copies one of these from main memory into
+///local store with a single cellDmaGet of sizeof(SpuGatherAndProcessPairsTaskDesc) and
+///then hands it to processCollisionTask, so the member order and sizes must stay in
+///sync between the side that fills it in and the side that reads it.
+struct SpuGatherAndProcessPairsTaskDesc 
+{
+	//main-memory address of the input pairs (presumably the broadphase pair array - confirm)
+	ppu_address_t	m_inPairPtr;//m_pairArrayPtr;
+	//mutex variable
+	uint32_t	m_someMutexVariableInMainMemory;
+
+	//main-memory address of the dispatcher
+	ppu_address_t	m_dispatcher;
+
+	//NOTE(review): looks like the pair list is split into pages and this is the
+	//entry count of the final page - confirm against the code that fills it in
+	uint32_t	numOnLastPage;
+
+	uint16_t numPages;
+	uint16_t taskId;
+	//presumably selects EPA for penetration depth - confirm in processCollisionTask
+	bool m_useEpa;
+
+	//local store scratch memory; passed as the lsMemory argument of processCollisionTask
+	struct	CollisionTask_LocalStoreMemory*	m_lsMemory; 
+}
+
+//the descriptor is aligned to 128 bytes on Cell (CELLOS_LV2) and libspe2 builds only
+#if  defined(__CELLOS_LV2__) || defined(USE_LIBSPE2)
+__attribute__ ((aligned (128)))
+#endif
+;
+
+
+///Runs one collision task. The libspe2 SPU main loop below calls it with userPtr pointing
+///at a local SpuGatherAndProcessPairsTaskDesc and lsMemory set to that descriptor's m_lsMemory.
+///The definition is not part of this header.
+void	processCollisionTask(void* userPtr, void* lsMemory);
+
+///Returns the local store memory block used by collision tasks; the libspe2 SPU main loop
+///stores the returned pointer in btSpuStatus::m_lsMemory. The definition is not part of this header.
+void*	createCollisionLocalStoreMemory();
+
+
+#if defined(USE_LIBSPE2) && defined(__SPU__)
+#include "../SpuLibspe2Support.h"
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+#include <SpuFakeDma.h>
+
+//#define DEBUG_LIBSPE2_SPU_TASK
+
+
+
+///Entry point of the SPU program (only compiled when USE_LIBSPE2 and __SPU__ are defined).
+///
+///argp.ull is the main-memory address of this SPU's btSpuStatus record; speid and envp
+///are not used. The function:
+///  1. fetches the status record, marks it Spu_Status_Free, publishes the local store
+///     memory pointer from createCollisionLocalStoreMemory() and writes the record back;
+///  2. blocks on the inbound mailbox and, for every Spu_Mailbox_Event_Task, re-reads the
+///     status record, fetches the task descriptor it points to and runs processCollisionTask;
+///  3. after every message (including shutdown) writes Spu_Status_Free back to main memory;
+///  4. leaves the loop on Spu_Mailbox_Event_Shutdown and returns 0.
+///Any other mailbox value is ignored apart from the status write-back.
+int main(unsigned long long speid, addr64 argp, addr64 envp)
+{
+	printf("SPU: hello \n");
+	
+	ATTRIBUTE_ALIGNED128(btSpuStatus status);
+	ATTRIBUTE_ALIGNED16( SpuGatherAndProcessPairsTaskDesc taskDesc ) ;
+	unsigned int received_message = Spu_Mailbox_Event_Nothing;
+	bool shutdown = false;
+
+	// initial handshake: announce that this SPU is free and where its local store memory lives
+	cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+	cellDmaWaitTagStatusAll(DMA_MASK(3));
+
+	status.m_status = Spu_Status_Free;
+	status.m_lsMemory.p = createCollisionLocalStoreMemory();
+
+	cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+	cellDmaWaitTagStatusAll(DMA_MASK(3));
+	
+	
+	while ( btLikely( !shutdown ) )
+	{
+		
+		received_message = spu_read_in_mbox();
+		
+		if( btLikely( received_message == Spu_Mailbox_Event_Task ))
+		{
+#ifdef DEBUG_LIBSPE2_SPU_TASK
+			printf("SPU: received Spu_Mailbox_Event_Task\n");
+#endif //DEBUG_LIBSPE2_SPU_TASK
+
+			// refresh the status
+			cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+			cellDmaWaitTagStatusAll(DMA_MASK(3));
+		
+			btAssert(status.m_status==Spu_Status_Occupied);
+			
+			// pull the task descriptor referenced by the refreshed status into local store
+			cellDmaGet(&taskDesc, status.m_taskDesc.p, sizeof(SpuGatherAndProcessPairsTaskDesc), DMA_TAG(3), 0, 0);
+			cellDmaWaitTagStatusAll(DMA_MASK(3));
+#ifdef DEBUG_LIBSPE2_SPU_TASK		
+			printf("SPU:processCollisionTask\n");	
+#endif //DEBUG_LIBSPE2_SPU_TASK
+			processCollisionTask((void*)&taskDesc, taskDesc.m_lsMemory);
+			
+#ifdef DEBUG_LIBSPE2_SPU_TASK
+			printf("SPU:finished processCollisionTask\n");
+#endif //DEBUG_LIBSPE2_SPU_TASK
+		}
+		else if( btLikely( received_message == Spu_Mailbox_Event_Shutdown ) )
+		{
+			// only report a shutdown when the message really is the shutdown event
+#ifdef DEBUG_LIBSPE2_SPU_TASK
+			printf("SPU: received ShutDown\n");
+#endif //DEBUG_LIBSPE2_SPU_TASK
+			shutdown = true;
+		}
+		else
+		{
+			// unknown mailbox value: nothing to do besides the status write-back below
+#ifdef DEBUG_LIBSPE2_SPU_TASK
+			printf("SPU: received unknown mailbox message %u\n", received_message);
+#endif //DEBUG_LIBSPE2_SPU_TASK
+		}
+
+		// set to status free and wait for next task
+		status.m_status = Spu_Status_Free;
+		cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+		cellDmaWaitTagStatusAll(DMA_MASK(3));		
+	}
+
+	printf("SPU: shutdown\n");
+	return 0;
+}
+#endif // USE_LIBSPE2
+
+
+#endif //SPU_GATHERING_COLLISION_TASK_H
+
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuLocalSupport.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuLocalSupport.h
@ -0,0 +1,19 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp
@ -0,0 +1,348 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuMinkowskiPenetrationDepthSolver.h"
+#include "SpuContactResult.h"
+#include "SpuPreferredPenetrationDirections.h"
+#include "BulletCollision/NarrowPhaseCollision/btVoronoiSimplexSolver.h"
+#include "BulletCollision/NarrowPhaseCollision/btGjkPairDetector.h"
+#include "SpuCollisionShapes.h"
+
+///Number of fixed sample directions stored at the start of sPenetrationDirections.
+#define NUM_UNITSPHERE_POINTS 42
+///Direction table for penetration depth sampling. The first NUM_UNITSPHERE_POINTS entries
+///are initialised below; the remaining MAX_PREFERRED_PENETRATION_DIRECTIONS*2 slots are
+///left for the shape specific preferred directions that the (currently disabled) code in
+///calcPenDepth appends at run time. The table is a mutable static, so those writes would
+///be shared by every caller in this translation unit.
+static btVector3	sPenetrationDirections[NUM_UNITSPHERE_POINTS+MAX_PREFERRED_PENETRATION_DIRECTIONS*2] = 
+{
+btVector3(btScalar(0.000000) , btScalar(-0.000000),btScalar(-1.000000)),
+btVector3(btScalar(0.723608) , btScalar(-0.525725),btScalar(-0.447219)),
+btVector3(btScalar(-0.276388) , btScalar(-0.850649),btScalar(-0.447219)),
+btVector3(btScalar(-0.894426) , btScalar(-0.000000),btScalar(-0.447216)),
+btVector3(btScalar(-0.276388) , btScalar(0.850649),btScalar(-0.447220)),
+btVector3(btScalar(0.723608) , btScalar(0.525725),btScalar(-0.447219)),
+btVector3(btScalar(0.276388) , btScalar(-0.850649),btScalar(0.447220)),
+btVector3(btScalar(-0.723608) , btScalar(-0.525725),btScalar(0.447219)),
+btVector3(btScalar(-0.723608) , btScalar(0.525725),btScalar(0.447219)),
+btVector3(btScalar(0.276388) , btScalar(0.850649),btScalar(0.447219)),
+btVector3(btScalar(0.894426) , btScalar(0.000000),btScalar(0.447216)),
+btVector3(btScalar(-0.000000) , btScalar(0.000000),btScalar(1.000000)),
+btVector3(btScalar(0.425323) , btScalar(-0.309011),btScalar(-0.850654)),
+btVector3(btScalar(-0.162456) , btScalar(-0.499995),btScalar(-0.850654)),
+btVector3(btScalar(0.262869) , btScalar(-0.809012),btScalar(-0.525738)),
+btVector3(btScalar(0.425323) , btScalar(0.309011),btScalar(-0.850654)),
+btVector3(btScalar(0.850648) , btScalar(-0.000000),btScalar(-0.525736)),
+btVector3(btScalar(-0.525730) , btScalar(-0.000000),btScalar(-0.850652)),
+btVector3(btScalar(-0.688190) , btScalar(-0.499997),btScalar(-0.525736)),
+btVector3(btScalar(-0.162456) , btScalar(0.499995),btScalar(-0.850654)),
+btVector3(btScalar(-0.688190) , btScalar(0.499997),btScalar(-0.525736)),
+btVector3(btScalar(0.262869) , btScalar(0.809012),btScalar(-0.525738)),
+btVector3(btScalar(0.951058) , btScalar(0.309013),btScalar(0.000000)),
+btVector3(btScalar(0.951058) , btScalar(-0.309013),btScalar(0.000000)),
+btVector3(btScalar(0.587786) , btScalar(-0.809017),btScalar(0.000000)),
+btVector3(btScalar(0.000000) , btScalar(-1.000000),btScalar(0.000000)),
+btVector3(btScalar(-0.587786) , btScalar(-0.809017),btScalar(0.000000)),
+btVector3(btScalar(-0.951058) , btScalar(-0.309013),btScalar(-0.000000)),
+btVector3(btScalar(-0.951058) , btScalar(0.309013),btScalar(-0.000000)),
+btVector3(btScalar(-0.587786) , btScalar(0.809017),btScalar(-0.000000)),
+btVector3(btScalar(-0.000000) , btScalar(1.000000),btScalar(-0.000000)),
+btVector3(btScalar(0.587786) , btScalar(0.809017),btScalar(-0.000000)),
+btVector3(btScalar(0.688190) , btScalar(-0.499997),btScalar(0.525736)),
+btVector3(btScalar(-0.262869) , btScalar(-0.809012),btScalar(0.525738)),
+btVector3(btScalar(-0.850648) , btScalar(0.000000),btScalar(0.525736)),
+btVector3(btScalar(-0.262869) , btScalar(0.809012),btScalar(0.525738)),
+btVector3(btScalar(0.688190) , btScalar(0.499997),btScalar(0.525736)),
+btVector3(btScalar(0.525730) , btScalar(0.000000),btScalar(0.850652)),
+btVector3(btScalar(0.162456) , btScalar(-0.499995),btScalar(0.850654)),
+btVector3(btScalar(-0.425323) , btScalar(-0.309011),btScalar(0.850654)),
+btVector3(btScalar(-0.425323) , btScalar(0.309011),btScalar(0.850654)),
+btVector3(btScalar(0.162456) , btScalar(0.499995),btScalar(0.850654))
+};
+
+
+///Penetration depth query of SpuMinkowskiPenetrationDepthSolver.
+///
+///As compiled today this is a stub: the complete implementation below sits inside
+///"#if 0 ... #endif", so every argument is ignored and the function always returns false.
+///
+///The disabled implementation samples the directions in sPenetrationDirections (plus the
+///preferred directions of both shapes), keeps the direction with the smallest projected
+///distance, displaces shape A along it and runs a GJK pair detector to obtain pa / pb.
+///NOTE(review): the disabled code refers to shapeTypeA, shapeTypeB, marginA, marginB,
+///convexVertexDataA and convexVertexDataB, none of which are parameters of this
+///signature, so it cannot simply be switched back on.
+bool SpuMinkowskiPenetrationDepthSolver::calcPenDepth( btSimplexSolverInterface& simplexSolver,
+		const btConvexShape* convexA,const btConvexShape* convexB,
+					const btTransform& transA,const btTransform& transB,
+				btVector3& v, btVector3& pa, btVector3& pb,
+				class btIDebugDraw* debugDraw,btStackAlloc* stackAlloc)
+{
+//everything up to the matching #endif right before the final return is compiled out
+#if 0
+	(void)stackAlloc;
+	(void)v;
+	
+
+	struct btIntermediateResult : public SpuContactResult
+	{
+
+		btIntermediateResult():m_hasResult(false)
+		{
+		}
+		
+		btVector3 m_normalOnBInWorld;
+		btVector3 m_pointInWorld;
+		btScalar m_depth;
+		bool	m_hasResult;
+
+		virtual void setShapeIdentifiersA(int partId0,int index0)
+		{
+			(void)partId0;
+			(void)index0;
+		}
+
+		virtual void setShapeIdentifiersB(int partId1,int index1)
+		{
+			(void)partId1;
+			(void)index1;
+		}
+		void addContactPoint(const btVector3& normalOnBInWorld,const btVector3& pointInWorld,btScalar depth)
+		{
+			m_normalOnBInWorld = normalOnBInWorld;
+			m_pointInWorld = pointInWorld;
+			m_depth = depth;
+			m_hasResult = true;
+		}
+	};
+
+	//just take fixed number of orientation, and sample the penetration depth in that direction
+	btScalar minProj = btScalar(BT_LARGE_FLOAT);
+	btVector3 minNorm(0.f,0.f,0.f);
+	btVector3 minVertex;
+	btVector3 minA,minB;
+	btVector3 seperatingAxisInA,seperatingAxisInB;
+	btVector3 pInA,qInB,pWorld,qWorld,w;
+
+//#define USE_BATCHED_SUPPORT 1
+#ifdef USE_BATCHED_SUPPORT
+
+	btVector3	supportVerticesABatch[NUM_UNITSPHERE_POINTS+MAX_PREFERRED_PENETRATION_DIRECTIONS*2];
+	btVector3	supportVerticesBBatch[NUM_UNITSPHERE_POINTS+MAX_PREFERRED_PENETRATION_DIRECTIONS*2];
+	btVector3	seperatingAxisInABatch[NUM_UNITSPHERE_POINTS+MAX_PREFERRED_PENETRATION_DIRECTIONS*2];
+	btVector3	seperatingAxisInBBatch[NUM_UNITSPHERE_POINTS+MAX_PREFERRED_PENETRATION_DIRECTIONS*2];
+	int i;
+
+	int numSampleDirections = NUM_UNITSPHERE_POINTS;
+
+	for (i=0;i<numSampleDirections;i++)
+	{
+		const btVector3& norm = sPenetrationDirections[i];
+		seperatingAxisInABatch[i] =  (-norm) * transA.getBasis() ;
+		seperatingAxisInBBatch[i] =  norm   * transB.getBasis() ;
+	}
+
+	{
+		int numPDA = convexA->getNumPreferredPenetrationDirections();
+		if (numPDA)
+		{
+			for (int i=0;i<numPDA;i++)
+			{
+				btVector3 norm;
+				convexA->getPreferredPenetrationDirection(i,norm);
+				norm  = transA.getBasis() * norm;
+				sPenetrationDirections[numSampleDirections] = norm;
+				seperatingAxisInABatch[numSampleDirections] = (-norm) * transA.getBasis();
+				seperatingAxisInBBatch[numSampleDirections] = norm * transB.getBasis();
+				numSampleDirections++;
+			}
+		}
+	}
+
+	{
+		int numPDB = convexB->getNumPreferredPenetrationDirections();
+		if (numPDB)
+		{
+			for (int i=0;i<numPDB;i++)
+			{
+				btVector3 norm;
+				convexB->getPreferredPenetrationDirection(i,norm);
+				norm  = transB.getBasis() * norm;
+				sPenetrationDirections[numSampleDirections] = norm;
+				seperatingAxisInABatch[numSampleDirections] = (-norm) * transA.getBasis();
+				seperatingAxisInBBatch[numSampleDirections] = norm * transB.getBasis();
+				numSampleDirections++;
+			}
+		}
+	}
+
+
+
+	convexA->batchedUnitVectorGetSupportingVertexWithoutMargin(seperatingAxisInABatch,supportVerticesABatch,numSampleDirections);
+	convexB->batchedUnitVectorGetSupportingVertexWithoutMargin(seperatingAxisInBBatch,supportVerticesBBatch,numSampleDirections);
+
+	for (i=0;i<numSampleDirections;i++)
+	{
+		const btVector3& norm = sPenetrationDirections[i];
+		seperatingAxisInA = seperatingAxisInABatch[i];
+		seperatingAxisInB = seperatingAxisInBBatch[i];
+
+		pInA = supportVerticesABatch[i];
+		qInB = supportVerticesBBatch[i];
+
+		pWorld = transA(pInA);	
+		qWorld = transB(qInB);
+		w	= qWorld - pWorld;
+		btScalar delta = norm.dot(w);
+		//find smallest delta
+		if (delta < minProj)
+		{
+			minProj = delta;
+			minNorm = norm;
+			minA = pWorld;
+			minB = qWorld;
+		}
+	}	
+#else
+
+	int numSampleDirections = NUM_UNITSPHERE_POINTS;
+
+///this is necessary, otherwise the normal is not correct, and sphere will rotate forever on a sloped triangle mesh
+#define DO_PREFERRED_DIRECTIONS 1
+#ifdef DO_PREFERRED_DIRECTIONS
+	{
+		int numPDA = spuGetNumPreferredPenetrationDirections(shapeTypeA,convexA);
+		if (numPDA)
+		{
+			for (int i=0;i<numPDA;i++)
+			{
+				btVector3 norm;
+				spuGetPreferredPenetrationDirection(shapeTypeA,convexA,i,norm);
+				norm  = transA.getBasis() * norm;
+				sPenetrationDirections[numSampleDirections] = norm;
+				numSampleDirections++;
+			}
+		}
+	}
+
+	{
+		int numPDB = spuGetNumPreferredPenetrationDirections(shapeTypeB,convexB);
+		if (numPDB)
+		{
+			for (int i=0;i<numPDB;i++)
+			{
+				btVector3 norm;
+				spuGetPreferredPenetrationDirection(shapeTypeB,convexB,i,norm);
+				norm  = transB.getBasis() * norm;
+				sPenetrationDirections[numSampleDirections] = norm;
+				numSampleDirections++;
+			}
+		}
+	}
+#endif //DO_PREFERRED_DIRECTIONS
+
+	for (int i=0;i<numSampleDirections;i++)
+	{
+		const btVector3& norm = sPenetrationDirections[i];
+		seperatingAxisInA = (-norm)* transA.getBasis();
+		seperatingAxisInB = norm* transB.getBasis();
+
+		pInA = convexA->localGetSupportVertexWithoutMarginNonVirtual( seperatingAxisInA);//, NULL);
+		qInB = convexB->localGetSupportVertexWithoutMarginNonVirtual(seperatingAxisInB);//, NULL);
+
+	//	pInA = convexA->localGetSupportingVertexWithoutMargin(seperatingAxisInA);
+	//	qInB = convexB->localGetSupportingVertexWithoutMargin(seperatingAxisInB);
+
+		pWorld = transA(pInA);	
+		qWorld = transB(qInB);
+		w	= qWorld - pWorld;
+		btScalar delta = norm.dot(w);
+		//find smallest delta
+		if (delta < minProj)
+		{
+			minProj = delta;
+			minNorm = norm;
+			minA = pWorld;
+			minB = qWorld;
+		}
+	}
+#endif //USE_BATCHED_SUPPORT
+
+	//add the margins
+
+	minA += minNorm*marginA;
+	minB -= minNorm*marginB;
+	//no penetration
+	if (minProj < btScalar(0.))
+		return false;
+
+	minProj += (marginA + marginB) + btScalar(1.00);
+
+
+
+
+
+//#define DEBUG_DRAW 1
+#ifdef DEBUG_DRAW
+	if (debugDraw)
+	{
+		btVector3 color(0,1,0);
+		debugDraw->drawLine(minA,minB,color);
+		color = btVector3 (1,1,1);
+		btVector3 vec = minB-minA;
+		btScalar prj2 = minNorm.dot(vec);
+		debugDraw->drawLine(minA,minA+(minNorm*minProj),color);
+
+	}
+#endif //DEBUG_DRAW
+
+	
+	btGjkPairDetector gjkdet(convexA,convexB,&simplexSolver,0);
+
+	btScalar offsetDist = minProj;
+	btVector3 offset = minNorm * offsetDist;
+	
+
+	SpuClosestPointInput input;
+	input.m_convexVertexData[0] = convexVertexDataA;
+	input.m_convexVertexData[1] = convexVertexDataB;
+	btVector3 newOrg = transA.getOrigin() + offset;
+
+	btTransform displacedTrans = transA;
+	displacedTrans.setOrigin(newOrg);
+
+	input.m_transformA = displacedTrans;
+	input.m_transformB = transB;
+	input.m_maximumDistanceSquared = btScalar(BT_LARGE_FLOAT);//minProj;
+	
+	btIntermediateResult res;
+	gjkdet.getClosestPoints(input,res,0);
+
+	btScalar correctedMinNorm = minProj - res.m_depth;
+
+
+	//the penetration depth is over-estimated, relax it
+	btScalar penetration_relaxation= btScalar(1.);
+	minNorm*=penetration_relaxation;
+
+	if (res.m_hasResult)
+	{
+
+		pa = res.m_pointInWorld - minNorm * correctedMinNorm;
+		pb = res.m_pointInWorld;
+		
+#ifdef DEBUG_DRAW
+		if (debugDraw)
+		{
+			btVector3 color(1,0,0);
+			debugDraw->drawLine(pa,pb,color);
+		}
+#endif//DEBUG_DRAW
+
+
+	} else {
+		// could not seperate shapes
+		//btAssert (false);
+	}
+	return res.m_hasResult;
+#endif
+	//the only statement that is actually compiled
+	return false;
+}
+
+
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.h
@ -0,0 +1,48 @@
+
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef MINKOWSKI_PENETRATION_DEPTH_SOLVER_H
+#define MINKOWSKI_PENETRATION_DEPTH_SOLVER_H
+
+
+#include "BulletCollision/NarrowPhaseCollision/btConvexPenetrationDepthSolver.h"
+
+class btStackAlloc;
+class btIDebugDraw;
+class btVoronoiSimplexSolver;
+class btConvexShape;
+
+///SpuMinkowskiPenetrationDepthSolver is the brute-force penetration depth estimator.
+///The intended algorithm samples the depth along a set of directions using support
+///mapping and then uses a GJK step to obtain the witness points.
+class SpuMinkowskiPenetrationDepthSolver : public btConvexPenetrationDepthSolver
+{
+public:
+	SpuMinkowskiPenetrationDepthSolver() {}
+	virtual ~SpuMinkowskiPenetrationDepthSolver() {}
+
+	///Penetration depth query for the shape pair (convexA, convexB) under the
+	///transforms transA / transB; v, pa and pb are output vectors.
+	virtual bool calcPenDepth(
+		btSimplexSolverInterface& simplexSolver,
+		const btConvexShape* convexA,
+		const btConvexShape* convexB,
+		const btTransform& transA,
+		const btTransform& transB,
+		btVector3& v,
+		btVector3& pa,
+		btVector3& pb,
+		class btIDebugDraw* debugDraw,
+		btStackAlloc* stackAlloc
+		);
+};
+
+
+#endif //MINKOWSKI_PENETRATION_DEPTH_SOLVER_H
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuPreferredPenetrationDirections.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuPreferredPenetrationDirections.h
@ -0,0 +1,70 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef _SPU_PREFERRED_PENETRATION_DIRECTIONS_H
+#define _SPU_PREFERRED_PENETRATION_DIRECTIONS_H
+
+
+#include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h"
+
+///Returns how many preferred penetration directions a shape of the given type offers:
+///2 for TRIANGLE_SHAPE_PROXYTYPE, 0 for every other type. The shape pointer is not read.
+///When __ASSERT is enabled, unsupported types are additionally reported via spu_printf.
+int		spuGetNumPreferredPenetrationDirections(int shapeType, void* shape)
+{
+	const bool isTriangle = (shapeType == TRIANGLE_SHAPE_PROXYTYPE);
+	if (isTriangle)
+	{
+		//a triangle offers its face normal and the opposite direction
+		return 2;
+	}
+
+#if __ASSERT
+	spu_printf("spuGetNumPreferredPenetrationDirections() - Unsupported bound type: %d.\n", shapeType);
+#endif // __ASSERT
+
+	return 0;
+}
+
+///Writes the preferred penetration direction number 'index' of a shape into penetrationVector.
+///
+///Only TRIANGLE_SHAPE_PROXYTYPE is handled: 'shape' is read as an array of three btVector3
+///vertices, index 0 yields the normalized face normal (v1-v0) x (v2-v0) and any non-zero
+///index yields the opposite direction. For every other shape type penetrationVector is
+///left untouched (and, when __ASSERT is enabled, the unsupported type is reported).
+///NOTE(review): this is a non-inline definition in a header, so including the header from
+///more than one translation unit of the same binary gives duplicate definitions - confirm
+///how it is included before adding further users.
+void	spuGetPreferredPenetrationDirection(int shapeType, void* shape, int index, btVector3& penetrationVector)
+{
+	switch (shapeType)
+	{
+		case TRIANGLE_SHAPE_PROXYTYPE:
+		{
+			btVector3* vertices = (btVector3*)shape;
+			///calcNormal
+			penetrationVector = (vertices[1]-vertices[0]).cross(vertices[2]-vertices[0]);
+			penetrationVector.normalize();
+			//the second preferred direction is the flipped face normal
+			if (index)
+				penetrationVector *= btScalar(-1.);
+			break;
+		}
+		default:
+		{
+#if __ASSERT
+			//report this function's own name (the message used to name spuGetNumPreferredPenetrationDirections)
+			spu_printf("spuGetPreferredPenetrationDirection() - Unsupported bound type: %d.\n", shapeType);
+#endif // __ASSERT
+		}
+	}
+}
+
+#endif //_SPU_PREFERRED_PENETRATION_DIRECTIONS_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.h
@ -0,0 +1,66 @@
+/*
+   Copyright (C) 2006, 2008 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+
+#ifndef __BOXBOXDISTANCE_H__
+#define __BOXBOXDISTANCE_H__
+
+
+#include "Box.h"
+
+using namespace Vectormath::Aos;
+
+//---------------------------------------------------------------------------
+// boxBoxDistance:
+//
+// description:
+//    this computes info that can be used for the collision response of two boxes.  when the boxes
+//    do not overlap, the points are set to the closest points of the boxes, and a positive
+//    distance between them is returned.  if the boxes do overlap, a negative distance is returned
+//    and the points are set to two points that would touch after the boxes are translated apart.
+//    the contact normal gives the direction to repel or separate the boxes when they touch or
+//    overlap (it's being approximated here as one of the 15 "separating axis" directions).
+//
+// returns:
+//    positive or negative distance between two boxes.
+//
+// args:
+//    Vector3& normal: set to a unit contact normal pointing from box A to box B.
+//
+//    BoxPoint& boxPointA, BoxPoint& boxPointB:
+//       set to a closest point or point of penetration on each box.
+//
+//    Box boxA, Box boxB:
+//       boxes, represented as 3 half-widths
+//
+//    const Transform3& transformA, const Transform3& transformB:
+//       box transformations, in world coordinates
+//
+//    float distanceThreshold:
+//       the algorithm will exit early if it finds that the boxes are more distant than this
+//       threshold, and not compute a contact normal or points.  if the distance returned
+//       exceeds the threshold, all the other output data may not have been computed.  by
+//       default, this is set to FLT_MAX so it will have no effect.
+//
+//---------------------------------------------------------------------------
+
+float
+boxBoxDistance(Vector3& normal, BoxPoint& boxPointA, BoxPoint& boxPointB,
+			   PE_REF(Box) boxA, const Transform3 & transformA, PE_REF(Box) boxB,
+			   const Transform3 & transformB,
+			   float distanceThreshold = FLT_MAX );
+
+#endif /* __BOXBOXDISTANCE_H__ */
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/readme.txt
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/readme.txt
@ -0,0 +1 @@
+Empty placeholder for future Libspe2 SPU task
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuSampleTask/SpuSampleTask.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuSampleTask/SpuSampleTask.cpp
@ -0,0 +1,214 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, Copyright (c) 2007 Erwin Coumans
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+
+#include "SpuSampleTask.h"
+#include "BulletDynamics/Dynamics/btRigidBody.h"
+#include "../PlatformDefinitions.h"
+#include "../SpuFakeDma.h"
+#include "LinearMath/btMinMax.h"
+
+#ifdef __SPU__
+#include <spu_printf.h>
+#else
+#include <stdio.h>
+#define spu_printf printf
+#endif
+
+#define MAX_NUM_BODIES 8192
+
+///Scratch memory used by processSampleTask; obtained from createSampleLocalStoreMemory().
+struct SampleTask_LocalStoreMemory
+{
+	//raw, 16-byte aligned buffer that receives one btRigidBody at a time via DMA
+	//(sizeof(btRigidBody) plus 16 spare bytes)
+	ATTRIBUTE_ALIGNED16(char gLocalRigidBody [sizeof(btRigidBody)+16]);
+	//local copy of the main-memory array of body addresses; at most MAX_NUM_BODIES entries
+	ATTRIBUTE_ALIGNED16(void* gPointerArray[MAX_NUM_BODIES]);
+
+};
+
+
+
+
+//-- HELPERS (private to this file)
+
+/// Checks that a batch of batchSize bodies fits into gPointerArray; reports and rejects it otherwise.
+static bool isValidSampleBatchSize(int batchSize)
+{
+	if (batchSize>MAX_NUM_BODIES)
+	{
+		spu_printf("SPU Error: exceed number of bodies, see MAX_NUM_BODIES in SpuSampleTask.cpp\n");
+		return false;
+	}
+	if (batchSize<0)
+	{
+		// m_sampleValue is a signed int; a negative count would become a negative DMA size
+		spu_printf("SPU Error: negative number of bodies (%d)\n", batchSize);
+		return false;
+	}
+	return true;
+}
+
+/// Copies batchSize body addresses from main memory (taskDesc.m_mainMemoryPtr) into localMemory->gPointerArray.
+static void fetchSampleBodyPointers(SampleTask_LocalStoreMemory* localMemory, const SpuSampleTaskDesc& taskDesc, int batchSize)
+{
+	btCollisionObject** eaPtr = (btCollisionObject**)taskDesc.m_mainMemoryPtr;
+	uint64_t ppuArrayAddress = reinterpret_cast<uint64_t>(eaPtr);
+	int dmaArraySize = batchSize*sizeof(void*);
+
+	//			spu_printf("array location is at %llx, batchSize = %d, DMA size = %d\n",ppuArrayAddress,batchSize,dmaArraySize);
+
+	if (dmaArraySize>=16)
+	{
+		cellDmaLargeGet((void*)&localMemory->gPointerArray[0], ppuArrayAddress  , dmaArraySize, DMA_TAG(1), 0, 0);
+		cellDmaWaitTagStatusAll(DMA_MASK(1));
+	}
+	else
+	{
+		// transfers of fewer than 16 bytes go through the small/unaligned helper instead
+		stallingUnalignedDmaSmallGet((void*)&localMemory->gPointerArray[0], ppuArrayAddress  , dmaArraySize);
+	}
+}
+
+/// DMAs the rigid body at ppuRigidBodyAddress into localMemory->gLocalRigidBody and returns it as a btRigidBody.
+static btRigidBody* fetchSampleRigidBody(SampleTask_LocalStoreMemory* localMemory, uint64_t ppuRigidBodyAddress)
+{
+	void* localPtr = &localMemory->gLocalRigidBody[0];
+	int dmaBodySize = sizeof(btRigidBody);
+
+	//	spu_printf("cellDmaGet from %llx to %llx\n",ppuRigidBodyAddress,localPtr);
+
+	cellDmaGet((void*)localPtr, ppuRigidBodyAddress  , dmaBodySize, DMA_TAG(1), 0, 0);
+	cellDmaWaitTagStatusAll(DMA_MASK(1));
+
+	return (btRigidBody*) localPtr;//btRigidBody::upcast(colObj);
+}
+
+/// DMAs the locally modified rigid body back to its main-memory address.
+static void storeSampleRigidBody(btRigidBody* body, uint64_t ppuRigidBodyAddress)
+{
+	void* ptr = (void*)body;
+	int dmaBodySize = sizeof(btRigidBody);
+
+	//	spu_printf("cellDmaLargePut from LS %llx to %llx\n",ptr,ppuRigidBodyAddress);
+
+	cellDmaLargePut(ptr, ppuRigidBodyAddress  , dmaBodySize, DMA_TAG(1), 0, 0);
+	cellDmaWaitTagStatusAll(DMA_MASK(1));
+}
+
+//-- MAIN METHOD
+
+/// Executes one sample task.
+///
+/// userPtr  points at a SpuSampleTaskDesc; m_sampleCommand selects the operation,
+///          m_mainMemoryPtr is the main-memory address of an array of body addresses and
+///          m_sampleValue is the number of entries in that array.
+/// lsMemory points at the SampleTask_LocalStoreMemory scratch block.
+///
+/// CMD_SAMPLE_INTEGRATE_BODIES: for every active, non static/kinematic body, predicts the
+///   integrated transform, applies it and writes the body back.
+/// CMD_SAMPLE_PREDICT_MOTION_BODIES: for every active, non static/kinematic body, integrates
+///   velocities, applies damping, predicts the interpolation transform and writes the body back.
+/// Any other command is ignored. Batches larger than MAX_NUM_BODIES or with a negative
+/// count are reported through spu_printf and skipped. The time step is fixed at 1/60 s.
+void processSampleTask(void* userPtr, void* lsMemory)
+{
+	//	BT_PROFILE("processSampleTask");
+
+	SampleTask_LocalStoreMemory* localMemory = (SampleTask_LocalStoreMemory*)lsMemory;
+	SpuSampleTaskDesc& taskDesc = *(SpuSampleTaskDesc*)userPtr;
+
+	const float timeStep = 1.f/60.f;
+
+	switch (taskDesc.m_sampleCommand)
+	{
+	case CMD_SAMPLE_INTEGRATE_BODIES:
+		{
+			const int batchSize = taskDesc.m_sampleValue;
+			if (!isValidSampleBatchSize(batchSize))
+			{
+				break;
+			}
+			fetchSampleBodyPointers(localMemory, taskDesc, batchSize);
+
+			btTransform predictedTrans;
+			for ( int i=0;i<batchSize;i++)
+			{
+				uint64_t ppuRigidBodyAddress = reinterpret_cast<uint64_t>(localMemory->gPointerArray[i]);
+				btRigidBody* body = fetchSampleRigidBody(localMemory, ppuRigidBodyAddress);
+
+				if (body->isActive() && (!body->isStaticOrKinematicObject()))
+				{
+					body->predictIntegratedTransform(timeStep, predictedTrans);
+					body->proceedToTransform( predictedTrans);
+					storeSampleRigidBody(body, ppuRigidBodyAddress);
+				}
+			}
+			break;
+		}
+
+	case CMD_SAMPLE_PREDICT_MOTION_BODIES:
+		{
+			const int batchSize = taskDesc.m_sampleValue;
+			if (!isValidSampleBatchSize(batchSize))
+			{
+				break;
+			}
+			fetchSampleBodyPointers(localMemory, taskDesc, batchSize);
+
+			for ( int i=0;i<batchSize;i++)
+			{
+				uint64_t ppuRigidBodyAddress = reinterpret_cast<uint64_t>(localMemory->gPointerArray[i]);
+				btRigidBody* body = fetchSampleRigidBody(localMemory, ppuRigidBodyAddress);
+
+				if ((!body->isStaticOrKinematicObject()) && body->isActive())
+				{
+					body->integrateVelocities( timeStep);
+					//damping
+					body->applyDamping(timeStep);
+
+					body->predictIntegratedTransform(timeStep,body->getInterpolationWorldTransform());
+
+					storeSampleRigidBody(body, ppuRigidBodyAddress);
+				}
+			}
+			break;
+		}
+
+	default:
+		{
+			// unknown command: nothing to do
+		}
+	}
+}
+
+
+///createSampleLocalStoreMemory hands out the SampleTask_LocalStoreMemory block that is
+///passed to processSampleTask as lsMemory.
+///NOTE(review): the macro tested here is LIBSPE2, while SpuGatheringCollisionTask.h tests
+///USE_LIBSPE2 - confirm which one the SPU build actually defines.
+#if !defined(__CELLOS_LV2__) && !defined (LIBSPE2)
+
+///Generic build: every call allocates a fresh block on the heap; nothing in this file frees it.
+void* createSampleLocalStoreMemory()
+{
+	SampleTask_LocalStoreMemory* heapBlock = new SampleTask_LocalStoreMemory;
+	return heapBlock;
+}
+
+#else
+
+///Cell build: one statically allocated, 16-byte aligned block is shared by every call.
+ATTRIBUTE_ALIGNED16(SampleTask_LocalStoreMemory	gLocalStoreMemory);
+
+void* createSampleLocalStoreMemory()
+{
+	return &gLocalStoreMemory;
+}
+
+#endif
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuSampleTask/SpuSampleTask.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuSampleTask/SpuSampleTask.h
@ -0,0 +1,54 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, Copyright (c) 2007 Erwin Coumans
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef SPU_SAMPLE_TASK_H
+#define SPU_SAMPLE_TASK_H
+
+#include "../PlatformDefinitions.h"
+#include "LinearMath/btScalar.h"
+#include "LinearMath/btVector3.h"
+#include "LinearMath/btMatrix3x3.h"
+
+#include "LinearMath/btAlignedAllocator.h"
+
+
+///Command identifiers for the sample task.
+///NOTE(review): presumably these are the values carried in SpuSampleTaskDesc::m_sampleCommand
+///and dispatched on by processSampleTask -- confirm against SpuSampleTask.cpp.
+enum
+{
+	CMD_SAMPLE_INTEGRATE_BODIES = 1,
+	CMD_SAMPLE_PREDICT_MOTION_BODIES	//implicitly 2
+};
+
+
+
+///16-byte aligned description of one sample task.
+///SpuSampleTaskProcess::issueTask fills one of these per task slot and passes its address to the thread interface.
+ATTRIBUTE_ALIGNED16(struct) SpuSampleTaskDesc
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	///command to execute (issueTask copies its sampleCommand argument here)
+	uint32_t						m_sampleCommand;
+	///index of the task slot this descriptor lives in, used to recognize finished tasks
+	uint32_t						m_taskId;
+
+	///main-memory address of the task's data, stored as a 64-bit integer
+	uint64_t 	m_mainMemoryPtr;
+	///free-form integer argument (issueTask copies its sampleValue argument here)
+	int			m_sampleValue;
+	
+
+};
+
+
+///Entry point that executes one sample task.
+///NOTE(review): userPtr is presumably a SpuSampleTaskDesc* and lsMemory the block returned by
+///createSampleLocalStoreMemory() -- confirm against the definition in SpuSampleTask.cpp.
+void	processSampleTask(void* userPtr, void* lsMemory);
+///Returns the SampleTask_LocalStoreMemory work area: a static instance on __CELLOS_LV2__ / LIBSPE2 builds, a heap allocation otherwise.
+void*	createSampleLocalStoreMemory();
+
+
+#endif //SPU_SAMPLE_TASK_H
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuSampleTask/readme.txt
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuSampleTask/readme.txt
@ -0,0 +1 @@
+Empty placeholder for future Libspe2 SPU task
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuSampleTaskProcess.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuSampleTaskProcess.cpp
@ -0,0 +1,222 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//#define __CELLOS_LV2__ 1
+
+#define USE_SAMPLE_PROCESS 1
+#ifdef USE_SAMPLE_PROCESS
+
+
+#include "SpuSampleTaskProcess.h"
+#include <stdio.h>
+
+#ifdef __SPU__
+
+
+
+///Placeholder task body for __SPU__ builds: ignores both arguments and only prints a greeting.
+void	SampleThreadFunc(void* userPtr,void* lsMemory)
+{
+	//no real work yet, just emit a message
+	printf("hello world\n");
+}
+
+
+///Local-store setup hook for __SPU__ builds; allocates nothing and always returns a null pointer.
+void*	SamplelsMemoryFunc()
+{
+	//don't create local store memory, just return 0
+	return 0;
+}
+
+
+#else
+
+
+#include "btThreadSupportInterface.h"
+
+//#	include "SPUAssert.h"
+#include <string.h>
+
+
+
+///Character array with C linkage, declared here but not referenced anywhere else in this file.
+///NOTE(review): presumably SPU_SAMPLE_ELF_SYMBOL is a macro naming the embedded SPU program image -- confirm where it is defined.
+extern "C" {
+	extern char SPU_SAMPLE_ELF_SYMBOL[];
+}
+
+
+
+
+
+///Builds the task process on top of the given thread interface.
+///Sizes the per-slot bookkeeping to maxNumOutstandingTasks, marks every slot free and
+///asks the thread interface to start its workers. m_initialized stays false until initialize() is called.
+SpuSampleTaskProcess::SpuSampleTaskProcess(btThreadSupportInterface*	threadInterface,  int maxNumOutstandingTasks)
+:m_threadInterface(threadInterface),
+m_maxNumOutstandingTasks(maxNumOutstandingTasks)
+{
+	m_initialized = false;
+	m_currentTask = 0;
+	m_numBusyTasks = 0;
+
+	//one task descriptor and one busy flag per outstanding task slot
+	m_spuSampleTaskDesc.resize(m_maxNumOutstandingTasks);
+	m_taskBusy.resize(m_maxNumOutstandingTasks);
+
+	//every slot starts out free
+	int slot = m_maxNumOutstandingTasks;
+	while (slot-- > 0)
+	{
+		m_taskBusy[slot] = false;
+	}
+
+	//workers are started last, once all bookkeeping is in place
+	m_threadInterface->startSPU();
+}
+
+///Asks the thread interface to stop its workers. The interface object itself is not deleted here.
+SpuSampleTaskProcess::~SpuSampleTaskProcess()
+{
+	m_threadInterface->stopSPU();
+	
+}
+
+
+
+///Resets the scheduling state: all task slots become free, the busy count and the
+///current slot index go back to zero, and the process is flagged as initialized.
+void	SpuSampleTaskProcess::initialize()
+{
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("SpuSampleTaskProcess::initialize()\n");
+#endif //DEBUG_SPU_TASK_SCHEDULING
+
+	m_currentTask = 0;
+	m_numBusyTasks = 0;
+
+	//release every slot, walking from the last one down to the first
+	for (int slot = m_maxNumOutstandingTasks - 1; slot >= 0; --slot)
+	{
+		m_taskBusy[slot] = false;
+	}
+
+	m_initialized = true;
+}
+
+
+///Queues one sample task on the thread interface.
+///@param sampleMainMemPtr main-memory address, stored in the descriptor's m_mainMemoryPtr
+///@param sampleValue integer argument, stored in m_sampleValue
+///@param sampleCommand command id, stored in m_sampleCommand
+///The descriptor in slot m_currentTask is filled in and sent. If that makes every slot busy,
+///the call blocks until one response arrives and frees that slot. Finally m_currentTask is
+///moved to the first free slot.
+void SpuSampleTaskProcess::issueTask(void* sampleMainMemPtr,int sampleValue,int sampleCommand)
+{
+
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("SpuSampleTaskProcess::issueTask (m_currentTask= %d)\n", m_currentTask);
+#endif //DEBUG_SPU_TASK_SCHEDULING
+
+	m_taskBusy[m_currentTask] = true;
+	m_numBusyTasks++;
+
+	SpuSampleTaskDesc& taskDesc = m_spuSampleTaskDesc[m_currentTask];
+	{
+		// send task description in event message
+		// no error checking here...
+		// but, currently, event queue can be no larger than NUM_WORKUNIT_TASKS.
+	
+		taskDesc.m_mainMemoryPtr = reinterpret_cast<uint64_t>(sampleMainMemPtr);
+		taskDesc.m_sampleValue = sampleValue;
+		taskDesc.m_sampleCommand = sampleCommand;
+
+		//some bookkeeping to recognize finished tasks
+		taskDesc.m_taskId = m_currentTask;
+	}
+
+	//NOTE(review): the literal 1 is the command id handed to the thread interface; presumably it
+	//has to match a command that interface implements -- confirm.
+	m_threadInterface->sendRequest(1, (ppu_address_t) &taskDesc, m_currentTask);
+
+	// if all tasks busy, wait for spu event to clear the task.
+	
+	if (m_numBusyTasks >= m_maxNumOutstandingTasks)
+	{
+		//zero-initialised so nothing indeterminate is read if the thread interface leaves an output untouched
+		unsigned int taskId = 0;
+		unsigned int outputSize = 0;
+
+		//pre-load taskId with the first busy slot before waiting
+		for (int i=0;i<m_maxNumOutstandingTasks;i++)
+		{
+			if (m_taskBusy[i])
+			{
+				taskId = i;
+				break;
+			}
+		}
+		m_threadInterface->waitForResponse(&taskId, &outputSize);
+
+		//printf("PPU: after issue, received event: %u %d\n", taskId, outputSize);
+
+		postProcess(taskId, outputSize);
+
+		m_taskBusy[taskId] = false;
+
+		m_numBusyTasks--;
+	}
+
+	// find new task buffer
+	for (int i = 0; i < m_maxNumOutstandingTasks; i++)
+	{
+		if (!m_taskBusy[i])
+		{
+			m_currentTask = i;
+			break;
+		}
+	}
+}
+
+
+///Optional PPU-side post processing for each finished task.
+///Called by issueTask() and flush() with the task id and output size reported by the thread interface; currently does nothing.
+void SpuSampleTaskProcess::postProcess(int taskId, int outputSize)
+{
+
+}
+
+
+///Blocks until every outstanding task has reported back.
+///For each response postProcess() is run, the reported slot is marked free and the busy count is decremented.
+void SpuSampleTaskProcess::flush()
+{
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("\nSpuSampleTaskProcess::flush()\n");
+#endif //DEBUG_SPU_TASK_SCHEDULING
+
+	// all tasks are issued, wait for all tasks to be complete
+	while(m_numBusyTasks > 0)
+	{
+		//zero-initialised so nothing indeterminate is read if the thread interface leaves an output untouched
+		unsigned int taskId = 0;
+		unsigned int outputSize = 0;
+
+		//pre-load taskId with the first busy slot before waiting
+		for (int i=0;i<m_maxNumOutstandingTasks;i++)
+		{
+			if (m_taskBusy[i])
+			{
+				taskId = i;
+				break;
+			}
+		}
+
+		m_threadInterface->waitForResponse(&taskId, &outputSize);
+
+		//printf("PPU: flushing, received event: %u %d\n", taskId, outputSize);
+
+		postProcess(taskId, outputSize);
+
+		m_taskBusy[taskId] = false;
+
+		m_numBusyTasks--;
+	}
+}
+
+#endif
+
+
+#endif //USE_SAMPLE_PROCESS
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuSampleTaskProcess.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuSampleTaskProcess.h
@ -0,0 +1,153 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef SPU_SAMPLE_TASK_PROCESS_H
+#define SPU_SAMPLE_TASK_PROCESS_H
+
+#include <assert.h>
+
+
+#include "PlatformDefinitions.h"
+
+#include <stdlib.h>
+
+#include "LinearMath/btAlignedObjectArray.h"
+
+
+#include "SpuSampleTask/SpuSampleTask.h"
+
+
+//just add your commands here, try to keep them globally unique for debugging purposes
+#define CMD_SAMPLE_TASK_COMMAND 10
+
+
+
+/// SpuSampleTaskProcess schedules sample tasks (SpuSampleTaskDesc) on a btThreadSupportInterface.
+/// It keeps a fixed number of task slots; issuing a task fills the current slot and, once all
+/// slots are busy, waits for a response before continuing.
+/// postProcess() is the hook for PPU-side work per finished task (currently empty).
+class SpuSampleTaskProcess
+{
+	// per-slot busy flag and per-slot task descriptor, both sized to m_maxNumOutstandingTasks
+	btAlignedObjectArray<bool>	m_taskBusy;
+	btAlignedObjectArray<SpuSampleTaskDesc>m_spuSampleTaskDesc;
+	
+	// number of slots currently marked busy
+	int   m_numBusyTasks;
+
+	// index of the slot the next issued task will use
+	int   m_currentTask;
+
+	// false after construction, set to true by initialize()
+	bool m_initialized;
+
+	// hook invoked for every response received from the thread interface
+	void postProcess(int taskId, int outputSize);
+	
+	// thread interface the tasks are sent to; started in the constructor and stopped in the destructor
+	class	btThreadSupportInterface*	m_threadInterface;
+
+	// number of task slots
+	int	m_maxNumOutstandingTasks;
+
+
+
+public:
+	///sizes the slot bookkeeping and calls startSPU() on the thread interface
+	SpuSampleTaskProcess(btThreadSupportInterface*	threadInterface, int maxNumOutstandingTasks);
+	
+	///calls stopSPU() on the thread interface
+	~SpuSampleTaskProcess();
+	
+	///resets all slots to free; call initialize at the beginning of the frame, before issueTask
+	void initialize();
+
+	///fills the current slot with the given arguments and sends it to the thread interface; may block when all slots are busy
+	void issueTask(void* sampleMainMemPtr,int sampleValue,int sampleCommand);
+
+	///call flush to wait until all outstanding tasks have reported back
+	void flush();
+};
+
+
+#if defined(USE_LIBSPE2) && defined(__SPU__)
+////////////////////MAIN/////////////////////////////
+#include "../SpuLibspe2Support.h"
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+#include <SpuFakeDma.h>
+
+///Hooks called by the SPU main loop below: the first provides the value stored in status.m_lsMemory.p,
+///the second runs one task and receives the fetched task descriptor as userPtr.
+void * SamplelsMemoryFunc();
+void SampleThreadFunc(void* userPtr,void* lsMemory);
+
+//#define DEBUG_LIBSPE2_MAINLOOP
+
+///SPU program entry point (only compiled when USE_LIBSPE2 and __SPU__ are both defined).
+///argp.ull is used as the main-memory address of this SPU's btSpuStatus record. The function publishes
+///itself as free, then loops on mailbox messages, running SampleThreadFunc for each task message,
+///until a shutdown message is read. Always returns 0.
+int main(unsigned long long speid, addr64 argp, addr64 envp)
+{
+	printf("SPU is up \n");
+	
+	ATTRIBUTE_ALIGNED128(btSpuStatus status);
+	ATTRIBUTE_ALIGNED16( SpuSampleTaskDesc taskDesc ) ;
+	unsigned int received_message = Spu_Mailbox_Event_Nothing;
+        bool shutdown = false;
+
+	// fetch the status record, mark this SPU free, store the local-store pointer and write the record back
+	cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+	cellDmaWaitTagStatusAll(DMA_MASK(3));
+
+	status.m_status = Spu_Status_Free;
+	status.m_lsMemory.p = SamplelsMemoryFunc();
+
+	cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+	cellDmaWaitTagStatusAll(DMA_MASK(3));
+	
+	
+	while (!shutdown)
+	{
+		// read the next message from the inbound mailbox
+		received_message = spu_read_in_mbox();
+		
+
+		
+		switch(received_message)
+		{
+		case Spu_Mailbox_Event_Shutdown:
+			shutdown = true;
+			break; 
+		case Spu_Mailbox_Event_Task:
+			// refresh the status
+#ifdef DEBUG_LIBSPE2_MAINLOOP
+			printf("SPU recieved Task \n");
+#endif //DEBUG_LIBSPE2_MAINLOOP
+			cellDmaGet(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+			cellDmaWaitTagStatusAll(DMA_MASK(3));
+		
+			btAssert(status.m_status==Spu_Status_Occupied);
+			
+			// fetch the task descriptor the status record points at
+			cellDmaGet(&taskDesc, status.m_taskDesc.p, sizeof(SpuSampleTaskDesc), DMA_TAG(3), 0, 0);
+			cellDmaWaitTagStatusAll(DMA_MASK(3));
+			
+			// NOTE(review): the second argument (named lsMemory in the callee) receives the task's
+			// main-memory pointer here, not status.m_lsMemory -- confirm this is intended.
+			SampleThreadFunc((void*)&taskDesc, reinterpret_cast<void*> (taskDesc.m_mainMemoryPtr) );
+			break;
+		case Spu_Mailbox_Event_Nothing:
+		default:
+			break;
+		}
+
+		// set to status free and wait for next task
+		// (this write-back runs after every message, including the shutdown message)
+		status.m_status = Spu_Status_Free;
+		cellDmaLargePut(&status, argp.ull, sizeof(btSpuStatus), DMA_TAG(3), 0, 0);
+		cellDmaWaitTagStatusAll(DMA_MASK(3));		
+				
+		
+  	}
+  	return 0;
+}
+//////////////////////////////////////////////////////
+#endif
+
+
+
+#endif // SPU_SAMPLE_TASK_PROCESS_H
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/SpuSync.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/SpuSync.h
@ -0,0 +1,148 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2007 Starbreeze Studios
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+Written by: Marten Svanfeldt
+*/
+
+#ifndef SPU_SYNC_H
+#define	SPU_SYNC_H
+
+
+#include "PlatformDefinitions.h"
+
+
+#if defined(WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#ifdef _XBOX
+#include <Xtl.h>
+#else
+#include <Windows.h>
+#endif
+
+///The btSpinlock is a structure to allow multi-platform synchronization. This allows to port the SPU tasks to other platforms.
+///WIN32 variant: a thin wrapper around a caller-provided CRITICAL_SECTION. The wrapper only stores
+///the pointer; it does not own the critical section.
+///NOTE(review): no DeleteCriticalSection call exists in this class; cleanup is presumably the caller's job -- confirm.
+class btSpinlock
+{
+public:
+	//typedef volatile LONG SpinVariable;
+	typedef CRITICAL_SECTION SpinVariable;
+
+	///stores the pointer to the lock variable; does not initialize it
+	btSpinlock (SpinVariable* var)
+		: spinVariable (var)
+	{}
+
+	///initializes the underlying critical section
+	void Init ()
+	{
+		//*spinVariable = 0;
+		InitializeCriticalSection(spinVariable);
+	}
+
+	///enters the critical section
+	void Lock ()
+	{
+		EnterCriticalSection(spinVariable);
+	}
+
+	///leaves the critical section
+	void Unlock ()
+	{
+		LeaveCriticalSection(spinVariable);
+	}
+
+private:
+	SpinVariable* spinVariable;
+};
+
+
+#elif defined (__CELLOS_LV2__)
+
+//#include <cell/atomic.h>
+#include <cell/sync/mutex.h>
+
+///The btSpinlock is a structure to allow multi-platform synchronization. This allows to port the SPU tasks to other platforms.
+///__CELLOS_LV2__ variant: wraps a caller-provided CellSyncMutex. Init() only does work in non-SPU
+///builds, while Lock()/Unlock() only do work in __SPU__ builds; in the other build each is an empty function.
+class btSpinlock
+{
+public:
+	typedef CellSyncMutex SpinVariable;
+
+	///stores the pointer to the mutex; does not initialize it
+	btSpinlock (SpinVariable* var)
+		: spinVariable (var)
+	{}
+
+	///initializes the mutex (non-SPU builds only)
+	void Init ()
+	{
+#ifndef __SPU__
+		//*spinVariable = 1;
+		cellSyncMutexInitialize(spinVariable);
+#endif
+	}
+
+
+
+	///locks the mutex, passing its address as a 64-bit integer (SPU builds only)
+	void Lock ()
+	{
+#ifdef __SPU__
+		// lock semaphore
+		/*while (cellAtomicTestAndDecr32(atomic_buf, (uint64_t)spinVariable) == 0) 
+		{
+
+		};*/
+		cellSyncMutexLock((uint64_t)spinVariable);
+#endif
+	}
+
+	///unlocks the mutex (SPU builds only)
+	void Unlock ()
+	{
+#ifdef __SPU__
+		//cellAtomicIncr32(atomic_buf, (uint64_t)spinVariable);
+		cellSyncMutexUnlock((uint64_t)spinVariable);
+#endif 
+	}
+
+
+private:
+	SpinVariable*	spinVariable;
+	///128-byte aligned scratch buffer; only referenced by the commented-out atomic code above
+	ATTRIBUTE_ALIGNED128(uint32_t		atomic_buf[32]);
+};
+
+#else
+//create a dummy implementation (without any locking) useful for serial processing
+//Fallback used when neither WIN32 nor __CELLOS_LV2__ is defined: Init/Lock/Unlock are empty,
+//so this class provides no mutual exclusion at all.
+class btSpinlock
+{
+public:
+	typedef int  SpinVariable;
+
+	//stores the pointer; it is never read or written by this class
+	btSpinlock (SpinVariable* var)
+		: spinVariable (var)
+	{}
+
+	//no-op
+	void Init ()
+	{
+	}
+
+	//no-op
+	void Lock ()
+	{
+	}
+
+	//no-op
+	void Unlock ()
+	{
+	}
+
+private:
+	SpinVariable* spinVariable;
+};
+
+
+#endif
+
+
+#endif
--- a/Engine/lib/bullet/src/BulletMultiThreaded/Win32ThreadSupport.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/Win32ThreadSupport.cpp
@ -0,0 +1,262 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "Win32ThreadSupport.h"
+
+#ifdef USE_WIN32_THREADING
+
+#include <windows.h>
+
+#include "SpuCollisionTaskProcess.h"
+
+#include "SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h"
+
+
+
+///The number of threads should be equal to the number of available cores
+///@todo: each worker should be linked to a single core, using SetThreadIdealProcessor.
+
+///Win32ThreadSupport runs tasks on Win32 worker threads behind the btThreadSupportInterface API.
+///The constructor records the requested thread count and immediately creates the worker threads
+///(startThreads sets m_maxNumTasks again to the same value).
+Win32ThreadSupport::Win32ThreadSupport(const Win32ThreadConstructionInfo & threadConstructionInfo)
+{
+	m_maxNumTasks = threadConstructionInfo.m_numThreads;
+	startThreads(threadConstructionInfo);
+}
+
+///Shuts the worker threads down and releases their handles by calling stopSPU().
+Win32ThreadSupport::~Win32ThreadSupport()
+{
+	stopSPU();
+}
+
+
+
+
+#include <stdio.h>
+
+///Worker thread entry point. lpParam points at this worker's btSpuStatus record.
+///The thread sleeps on the start event; each wake-up either runs the user function on m_userPtr
+///(then sets status 2 and signals completion) or, when m_userPtr is null, sets status 3,
+///signals completion and leaves the loop. Returns 0.
+DWORD WINAPI Thread_no_1( LPVOID lpParam ) 
+{
+	Win32ThreadSupport::btSpuStatus* workerStatus = static_cast<Win32ThreadSupport::btSpuStatus*>(lpParam);
+
+	for (;;)
+	{
+		WaitForSingleObject(workerStatus->m_eventStartHandle,INFINITE);
+
+		void* taskData = workerStatus->m_userPtr;
+
+		if (!taskData)
+		{
+			//a null task pointer is the request to exit the thread
+			workerStatus->m_status = 3;
+			SetEvent(workerStatus->m_eventCompletetHandle);
+			printf("Thread with taskId %i with handle %p exiting\n",workerStatus->m_taskId, workerStatus->m_threadHandle);
+			break;
+		}
+
+		//regular task: run it, flag it finished and wake whoever waits for completion
+		btAssert(workerStatus->m_status);
+		workerStatus->m_userThreadFunc(taskData,workerStatus->m_lsMemory);
+		workerStatus->m_status = 2;
+		SetEvent(workerStatus->m_eventCompletetHandle);
+	}
+
+	printf("Thread TERMINATED\n");
+	return 0;
+}
+
+///send messages to SPUs
+///Hands one task to worker thread 'taskId': stores the command and the task pointer (uiArgument0)
+///in that worker's status record, marks it busy (status 1) and signals its start event.
+///Only CMD_GATHER_AND_PROCESS_PAIRLIST is implemented; any other command asserts.
+///@param uiCommand command id
+///@param uiArgument0 address of the task description, passed to the user thread function as userPtr
+///@param taskId index of the worker / status slot to use
+void Win32ThreadSupport::sendRequest(uint32_t uiCommand, ppu_address_t uiArgument0, uint32_t taskId)
+{
+	switch (uiCommand)
+	{
+	case 	CMD_GATHER_AND_PROCESS_PAIRLIST:
+		{
+
+//#define SINGLE_THREADED 1
+#ifdef SINGLE_THREADED
+
+			//run the task directly on the calling thread, always using slot 0
+			btSpuStatus&	spuStatus = m_activeSpuStatus[0];
+			spuStatus.m_userPtr=(void*)uiArgument0;
+			spuStatus.m_userThreadFunc(spuStatus.m_userPtr,spuStatus.m_lsMemory);
+#else
+
+			//validate the slot before indexing; taskId is unsigned, so only the upper bound can fail
+			btAssert(int(taskId)<m_activeSpuStatus.size());
+			btSpuStatus&	spuStatus = m_activeSpuStatus[taskId];
+
+			spuStatus.m_commandId = uiCommand;
+			spuStatus.m_status = 1;
+			spuStatus.m_userPtr = (void*)uiArgument0;
+
+			///fire event to start new task
+			SetEvent(spuStatus.m_eventStartHandle);
+
+#endif //SINGLE_THREADED
+
+			break;
+		}
+	default:
+		{
+			///not implemented
+			btAssert(0);
+		}
+
+	}
+}
+
+
+///check for messages from SPUs
+///Blocks until any worker signals its completion event, resets that worker's status to 0 and reports it.
+///@param puiArgument0 receives the m_taskId of the worker that finished
+///@param puiArgument1 receives that worker's m_status as read after the reset
+void Win32ThreadSupport::waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1)
+{
+	///We should wait for (one of) the first tasks to finish (or other SPU messages), and report its response
+	
+	///A possible response can be 'yes, SPU handled it', or 'no, please do a PPU fallback'
+
+
+	btAssert(m_activeSpuStatus.size());
+
+	int last = -1;
+#ifndef SINGLE_THREADED
+	//wait for any one completion event; the return value encodes which handle was signalled
+	DWORD res = WaitForMultipleObjects(m_completeHandles.size(), &m_completeHandles[0], FALSE, INFINITE);
+	//NOTE(review): WAIT_FAILED is only caught by this assert; with asserts disabled the index below would be out of range
+	btAssert(res != WAIT_FAILED);
+	last = res - WAIT_OBJECT_0;
+
+	btSpuStatus& spuStatus = m_activeSpuStatus[last];
+	btAssert(spuStatus.m_threadHandle);
+	btAssert(spuStatus.m_eventCompletetHandle);
+
+	//WaitForSingleObject(spuStatus.m_eventCompletetHandle, INFINITE);
+	//the worker thread sets status 2 (task done) or 3 (thread exiting) before signalling
+	btAssert(spuStatus.m_status > 1);
+	spuStatus.m_status = 0;
+
+	///need to find an active spu
+	//NOTE(review): this check runs after 'last' has already been used as an index
+	btAssert(last>=0);
+
+#else
+	last=0;
+	btSpuStatus& spuStatus = m_activeSpuStatus[last];
+#endif //SINGLE_THREADED
+
+	
+
+	*puiArgument0 = spuStatus.m_taskId;
+	//NOTE(review): in the multi-threaded path m_status was reset to 0 above, so this always reports 0 -- confirm that is intended
+	*puiArgument1 = spuStatus.m_status;
+
+
+}
+
+
+
+///Creates m_numThreads worker threads running Thread_no_1.
+///For each worker a pair of named auto-reset events (start / complete) is created, the completion
+///handle is recorded in m_completeHandles, and the worker's status record is filled in.
+void Win32ThreadSupport::startThreads(const Win32ThreadConstructionInfo& threadConstructionInfo)
+{
+
+	m_activeSpuStatus.resize(threadConstructionInfo.m_numThreads);
+	m_completeHandles.resize(threadConstructionInfo.m_numThreads);
+
+	m_maxNumTasks = threadConstructionInfo.m_numThreads;
+
+	for (int i=0;i<threadConstructionInfo.m_numThreads;i++)
+	{
+		printf("starting thread %d\n",i);
+
+		btSpuStatus&	spuStatus = m_activeSpuStatus[i];
+
+		LPSECURITY_ATTRIBUTES lpThreadAttributes=NULL;
+		SIZE_T dwStackSize=threadConstructionInfo.m_threadStackSize;
+		LPTHREAD_START_ROUTINE lpStartAddress=&Thread_no_1;
+		LPVOID lpParameter=&spuStatus;
+		DWORD dwCreationFlags=0;
+		LPDWORD lpThreadId=0;
+
+		//a null user pointer is what tells the worker to exit, so start from a known value
+		spuStatus.m_userPtr=0;
+
+		//NOTE(review): the name buffers hold 32 chars and sprintf is unbounded; a long m_uniqueName overflows them
+		sprintf(spuStatus.m_eventStartHandleName,"eventStart%s%d",threadConstructionInfo.m_uniqueName,i);
+		spuStatus.m_eventStartHandle = CreateEventA(0,false,false,spuStatus.m_eventStartHandleName);
+
+		sprintf(spuStatus.m_eventCompletetHandleName,"eventComplete%s%d",threadConstructionInfo.m_uniqueName,i);
+		spuStatus.m_eventCompletetHandle = CreateEventA(0,false,false,spuStatus.m_eventCompletetHandleName);
+
+		m_completeHandles[i] = spuStatus.m_eventCompletetHandle;
+
+		//NOTE(review): the returned handle is not checked for failure before it is used below
+		HANDLE handle = CreateThread(lpThreadAttributes,dwStackSize,lpStartAddress,lpParameter,	dwCreationFlags,lpThreadId);
+		SetThreadPriority(handle,THREAD_PRIORITY_HIGHEST);
+		//SetThreadPriority(handle,THREAD_PRIORITY_TIME_CRITICAL);
+
+		//restrict worker i to processor bit i
+		//NOTE(review): 1<<i is an int shift, so large thread counts would exceed its width -- confirm the expected range
+		SetThreadAffinityMask(handle, 1<<i);
+
+		//the remaining fields are written after the thread exists; the worker only reads them once its start event is signalled
+		spuStatus.m_taskId = i;
+		spuStatus.m_commandId = 0;
+		spuStatus.m_status = 0;
+		spuStatus.m_threadHandle = handle;
+		spuStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc();
+		spuStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
+
+		printf("started thread %d with threadHandle %p\n",i,handle);
+		
+	}
+
+}
+
+///Intentionally empty: the worker threads are already created by the constructor via startThreads().
+void Win32ThreadSupport::startSPU()
+{
+}
+
+
+///Shuts down every worker thread and releases its Win32 handles.
+///A worker whose status is non-zero is first waited on; then the worker is woken with a null
+///task pointer (its exit request), its exit acknowledgement is awaited and its event and
+///thread handles are closed. Both bookkeeping arrays are emptied at the end.
+void Win32ThreadSupport::stopSPU()
+{
+	const int numWorkers = m_activeSpuStatus.size();
+
+	for (int worker = 0; worker < numWorkers; ++worker)
+	{
+		btSpuStatus& workerStatus = m_activeSpuStatus[worker];
+
+		//non-zero status: wait on the completion event before asking for exit
+		if (workerStatus.m_status > 0)
+			WaitForSingleObject(workerStatus.m_eventCompletetHandle, INFINITE);
+
+		//null user pointer + start event = exit request; then wait for the acknowledgement
+		workerStatus.m_userPtr = 0;
+		SetEvent(workerStatus.m_eventStartHandle);
+		WaitForSingleObject(workerStatus.m_eventCompletetHandle, INFINITE);
+
+		CloseHandle(workerStatus.m_eventCompletetHandle);
+		CloseHandle(workerStatus.m_eventStartHandle);
+		CloseHandle(workerStatus.m_threadHandle);
+	}
+
+	m_completeHandles.clear();
+	m_activeSpuStatus.clear();
+}
+
+#endif //USE_WIN32_THREADING
--- a/Engine/lib/bullet/src/BulletMultiThreaded/Win32ThreadSupport.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/Win32ThreadSupport.h
@ -0,0 +1,132 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "LinearMath/btScalar.h"
+#include "PlatformDefinitions.h"
+
+#ifdef USE_WIN32_THREADING  //platform specific defines are defined in PlatformDefinitions.h
+
+#ifndef WIN32_THREAD_SUPPORT_H
+#define WIN32_THREAD_SUPPORT_H
+
+#include "LinearMath/btAlignedObjectArray.h"
+
+#include "btThreadSupportInterface.h"
+
+
+///Task function run on a worker thread: receives the task pointer and the worker's local-store memory.
+typedef void (*Win32ThreadFunc)(void* userPtr,void* lsMemory);
+///Factory called once per worker thread to obtain that worker's local-store memory.
+typedef void* (*Win32lsMemorySetupFunc)();
+
+
+
+
+
+
+///Win32ThreadSupport implements btThreadSupportInterface on top of Win32 threads and events.
+///Each worker thread has a btSpuStatus record; tasks are handed over with sendRequest() and
+///collected with waitForResponse().
+class Win32ThreadSupport : public btThreadSupportInterface 
+{
+public:
+	///per-worker bookkeeping record (the name is inherited from the SPU code path)
+	struct	btSpuStatus
+	{
+		///index of this worker, assigned in startThreads
+		uint32_t	m_taskId;
+		///last command passed to sendRequest
+		uint32_t	m_commandId;
+		///0 = idle, 1 = task issued, 2 = task finished, 3 = thread exiting
+		uint32_t	m_status;
+
+		Win32ThreadFunc	m_userThreadFunc;
+		void*	m_userPtr; //for taskDesc etc
+		void*	m_lsMemory; //initialized using Win32LocalStoreMemorySetupFunc
+
+		void*	m_threadHandle; //this one is calling 'Win32ThreadFunc'
+
+		///event signalled to wake the worker, and the name it was created with
+		void*	m_eventStartHandle;
+		char	m_eventStartHandleName[32];
+
+		///event signalled by the worker when it finished a task or is exiting, and its name
+		void*	m_eventCompletetHandle;
+		char	m_eventCompletetHandleName[32];
+		
+
+	};
+private:
+
+	///one status record per worker thread
+	btAlignedObjectArray<btSpuStatus>	m_activeSpuStatus;
+	///completion event handles, in the same order as m_activeSpuStatus
+	btAlignedObjectArray<void*>			m_completeHandles;
+	
+	int m_maxNumTasks;
+public:
+	///Parameters for creating the worker threads
+
+	struct	Win32ThreadConstructionInfo
+	{
+		///@param uniqueName text embedded in the names of the per-thread events
+		///@param userThreadFunc function each worker runs per task
+		///@param lsMemoryFunc called once per worker to create its local-store memory
+		///@param numThreads number of worker threads (default 1)
+		///@param threadStackSize stack size passed to CreateThread (default 65535)
+		Win32ThreadConstructionInfo(char* uniqueName,
+									Win32ThreadFunc userThreadFunc,
+									Win32lsMemorySetupFunc	lsMemoryFunc,
+									int numThreads=1,
+									int threadStackSize=65535
+									)
+									:m_uniqueName(uniqueName),
+									m_userThreadFunc(userThreadFunc),
+									m_lsMemoryFunc(lsMemoryFunc),
+									m_numThreads(numThreads),
+									m_threadStackSize(threadStackSize)
+		{
+
+		}
+
+		char*					m_uniqueName;
+		Win32ThreadFunc			m_userThreadFunc;
+		Win32lsMemorySetupFunc	m_lsMemoryFunc;
+		int						m_numThreads;
+		int						m_threadStackSize;
+
+	};
+
+
+
+	///creates the worker threads described by threadConstructionInfo
+	Win32ThreadSupport(const Win32ThreadConstructionInfo& threadConstructionInfo);
+
+///stops the worker threads and closes their handles
+	virtual	~Win32ThreadSupport();
+
+	///creates the events and threads; called by the constructor
+	void	startThreads(const Win32ThreadConstructionInfo&	threadInfo);
+
+
+///send messages to SPUs
+///(uiArgument1 is used as the index of the worker that should run the task)
+	virtual	void sendRequest(uint32_t uiCommand, ppu_address_t uiArgument0, uint32_t uiArgument1);
+
+///check for messages from SPUs
+	virtual	void waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1);
+
+///start the spus (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
+	virtual	void startSPU();
+
+///tell the task scheduler we are done with the SPU tasks
+	virtual	void stopSPU();
+
+	///only updates the stored count; it does not create or destroy threads
+	virtual	void	setNumTasks(int numTasks)
+	{
+		m_maxNumTasks = numTasks;
+	}
+
+	virtual int getNumTasks() const
+	{
+		return m_maxNumTasks;
+	}
+
+};
+
+#endif //WIN32_THREAD_SUPPORT_H
+
+#endif //USE_WIN32_THREADING
--- a/Engine/lib/bullet/src/BulletMultiThreaded/btGpu3DGridBroadphase.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/btGpu3DGridBroadphase.cpp
@ -0,0 +1,590 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+///The 3 following lines include the CPU implementation of the kernels, keep them in this order.
+#include "BulletMultiThreaded/btGpuDefines.h"
+#include "BulletMultiThreaded/btGpuUtilsSharedDefs.h"
+#include "BulletMultiThreaded/btGpuUtilsSharedCode.h"
+
+
+
+#include "LinearMath/btAlignedAllocator.h"
+#include "LinearMath/btQuickprof.h"
+#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
+
+
+
+#include "btGpuDefines.h"
+#include "btGpuUtilsSharedDefs.h"
+
+#include "btGpu3DGridBroadphaseSharedDefs.h"
+
+#include "btGpu3DGridBroadphase.h"
+#include <string.h> //for memset
+
+
+#include <stdio.h>
+
+
+
+static bt3DGridBroadphaseParams s3DGridBroadphaseParams;
+
+
+
+///Builds the broadphase with its own pair cache: a btHashedOverlappingPairCache is placement-constructed
+///in 16-byte aligned memory from btAlignedAlloc and handed to the btSimpleBroadphase base.
+///All grid and capacity arguments are forwarded unchanged to _initialize().
+btGpu3DGridBroadphase::btGpu3DGridBroadphase(	const btVector3& worldAabbMin,const btVector3& worldAabbMax, 
+										int gridSizeX, int gridSizeY, int gridSizeZ, 
+										int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+										int maxBodiesPerCell,
+										btScalar cellFactorAABB)
+	: btSimpleBroadphase(maxSmallProxies,
+						 new (btAlignedAlloc(sizeof(btHashedOverlappingPairCache),16)) btHashedOverlappingPairCache)
+	, m_bInitialized(false)
+	, m_numBodies(0)
+{
+	_initialize(worldAabbMin, worldAabbMax,
+				gridSizeX, gridSizeY, gridSizeZ,
+				maxSmallProxies, maxLargeProxies, maxPairsPerBody,
+				maxBodiesPerCell, cellFactorAABB);
+}
+
+
+
+///Builds the broadphase on top of a caller-supplied pair cache, which is passed straight to the
+///btSimpleBroadphase base. All grid and capacity arguments are forwarded unchanged to _initialize().
+btGpu3DGridBroadphase::btGpu3DGridBroadphase(	btOverlappingPairCache* overlappingPairCache,
+										const btVector3& worldAabbMin,const btVector3& worldAabbMax, 
+										int gridSizeX, int gridSizeY, int gridSizeZ, 
+										int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+										int maxBodiesPerCell,
+										btScalar cellFactorAABB)
+	: btSimpleBroadphase(maxSmallProxies, overlappingPairCache)
+	, m_bInitialized(false)
+	, m_numBodies(0)
+{
+	_initialize(worldAabbMin, worldAabbMax,
+				gridSizeX, gridSizeY, gridSizeZ,
+				maxSmallProxies, maxLargeProxies, maxPairsPerBody,
+				maxBodiesPerCell, cellFactorAABB);
+}
+
+
+
+///Releases the host buffers through _finalize(); the object must have been initialized.
+btGpu3DGridBroadphase::~btGpu3DGridBroadphase()
+{
+	// The pair cache is not released here. _initialize() sets m_ownsPairCache, so the
+	// btSimpleBroadphase base is expected to free it (presumably in its destructor - confirm).
+	assert(m_bInitialized);
+	_finalize();
+}
+
+
+
+///Stores the grid parameters and allocates every host-side buffer used by the broadphase.
+///Called once from each constructor; _finalize() releases what is allocated here.
+///  worldAabbMin/worldAabbMax : world bounds; the grid origin is worldAabbMin and the cell size is
+///                              (worldAabbMax - worldAabbMin) / gridSize per axis
+///  gridSizeX/Y/Z             : number of cells per axis
+///  maxSmallProxies           : not read here (buffers are sized from the inherited m_maxHandles)
+///  maxLargeProxies           : capacity of the separate large-proxy pool (0 is allowed)
+///  maxPairsPerBody           : number of pair-buffer slots reserved for each small proxy
+///  maxBodiesPerCell          : copied into m_params.m_maxBodiesPerCell
+///  cellFactorAABB            : user factor applied to the bounding radius in isLargeProxy()
+void btGpu3DGridBroadphase::_initialize(	const btVector3& worldAabbMin,const btVector3& worldAabbMax, 
+										int gridSizeX, int gridSizeY, int gridSizeZ, 
+										int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+										int maxBodiesPerCell,
+										btScalar cellFactorAABB)
+{
+	// must not be called twice on the same object
+	assert(!m_bInitialized);
+
+	// set various parameters
+	// NOTE(review): ownership is claimed unconditionally, i.e. also for a pair cache that was
+	// supplied by the caller through the second constructor - confirm that this is intended.
+	m_ownsPairCache = true;
+	m_params.m_gridSizeX = gridSizeX;
+	m_params.m_gridSizeY = gridSizeY;
+	m_params.m_gridSizeZ = gridSizeZ;
+	m_params.m_numCells = m_params.m_gridSizeX * m_params.m_gridSizeY * m_params.m_gridSizeZ;
+	btVector3 w_org = worldAabbMin;
+	m_params.m_worldOriginX = w_org.getX();
+	m_params.m_worldOriginY = w_org.getY();
+	m_params.m_worldOriginZ = w_org.getZ();
+	btVector3 w_size = worldAabbMax - worldAabbMin;
+	m_params.m_cellSizeX = w_size.getX() / m_params.m_gridSizeX;
+	m_params.m_cellSizeY = w_size.getY() / m_params.m_gridSizeY;
+	m_params.m_cellSizeZ = w_size.getZ() / m_params.m_gridSizeZ;
+	// a proxy counts as "small" while its (scaled) bounding radius is at most half the smallest cell edge
+	m_maxRadius = btMin(btMin(m_params.m_cellSizeX, m_params.m_cellSizeY), m_params.m_cellSizeZ);
+	m_maxRadius *= btScalar(0.5f);
+	m_params.m_numBodies = m_numBodies;
+	m_params.m_maxBodiesPerCell = maxBodiesPerCell;
+
+	m_numLargeHandles = 0;
+	m_maxLargeHandles = maxLargeProxies;
+
+	m_maxPairsPerBody = maxPairsPerBody;
+
+	m_cellFactorAABB = cellFactorAABB;
+
+	m_LastLargeHandleIndex = -1;
+
+	// allocate host storage
+	// two unsigned ints per small proxy: sortHash() treats them as (hash, index) records
+	m_hBodiesHash = new unsigned int[m_maxHandles * 2];
+	memset(m_hBodiesHash, 0x00, m_maxHandles*2*sizeof(unsigned int));
+
+	m_hCellStart = new unsigned int[m_params.m_numCells];
+	memset(m_hCellStart, 0x00, m_params.m_numCells * sizeof(unsigned int));
+
+	// (start, current count) per small proxy plus one terminating entry
+	m_hPairBuffStartCurr = new unsigned int[m_maxHandles * 2 + 2];
+	// --------------- for now, init with m_maxPairsPerBody for each body
+	m_hPairBuffStartCurr[0] = 0;
+	m_hPairBuffStartCurr[1] = 0;
+	for(int i = 1; i <= m_maxHandles; i++) 
+	{
+		m_hPairBuffStartCurr[i * 2] = m_hPairBuffStartCurr[(i-1) * 2] + m_maxPairsPerBody;
+		m_hPairBuffStartCurr[i * 2 + 1] = 0;
+	}
+	//----------------
+	unsigned int numAABB = m_maxHandles + m_maxLargeHandles;
+	m_hAABB = new bt3DGrid3F1U[numAABB * 2]; // AABB Min & Max
+
+	m_hPairBuff = new unsigned int[m_maxHandles * m_maxPairsPerBody];
+	memset(m_hPairBuff, 0x00, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int)); // needed?
+
+	m_hPairScan = new unsigned int[m_maxHandles + 1];
+
+	m_hPairOut = new unsigned int[m_maxHandles * m_maxPairsPerBody];
+
+// large proxies
+
+	// allocate handles buffer and put all handles on free list
+	m_pLargeHandlesRawPtr = btAlignedAlloc(sizeof(btSimpleBroadphaseProxy) * m_maxLargeHandles, 16);
+	m_pLargeHandles = new(m_pLargeHandlesRawPtr) btSimpleBroadphaseProxy[m_maxLargeHandles];
+	m_firstFreeLargeHandle = 0;
+	{
+		for (int i = m_firstFreeLargeHandle; i < m_maxLargeHandles; i++)
+		{
+			m_pLargeHandles[i].SetNextFree(i + 1);
+			// large proxies get unique ids starting at m_maxHandles+2; isLargeProxy(proxy) relies on this
+			m_pLargeHandles[i].m_uniqueId = m_maxHandles+2+i;
+		}
+		// terminate the free list; skipped for an empty pool, where index -1 would be out of bounds
+		if (m_maxLargeHandles > 0)
+		{
+			m_pLargeHandles[m_maxLargeHandles - 1].SetNextFree(0);
+		}
+	}
+
+// debug data
+	m_numPairsAdded = 0;
+	m_numPairsRemoved = 0;
+	m_numOverflows = 0;
+
+	m_bInitialized = true;
+}
+
+
+
+///Frees every buffer allocated by _initialize() and marks the object as uninitialized.
+void btGpu3DGridBroadphase::_finalize()
+{
+	assert(m_bInitialized);
+
+	// arrays obtained with new[]
+	delete [] m_hBodiesHash;
+	delete [] m_hCellStart;
+	delete [] m_hPairBuffStartCurr;
+	delete [] m_hAABB;
+	delete [] m_hPairBuff;
+	delete [] m_hPairScan;
+	delete [] m_hPairOut;
+
+	// raw storage of the large-proxy pool, obtained with btAlignedAlloc
+	btAlignedFree(m_pLargeHandlesRawPtr);
+
+	m_bInitialized = false;
+}
+
+
+
+///Runs one full broadphase update and pushes the resulting pair changes into the pair cache.
+///With no small proxies only the large/large test runs; otherwise the grid pipeline
+///(prepare, hash, sort, cell start, pair search, change detection, scan, squeeze) runs first.
+void btGpu3DGridBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher)
+{
+	if(m_numHandles <= 0)
+	{
+		// nothing for the grid to do
+		BT_PROFILE("addLarge2LargePairsToCache");
+		addLarge2LargePairsToCache(dispatcher);
+		return;
+	}
+
+	setParameters(&m_params);		// update constants
+	prepareAABB();					// prepare AABB array
+	calcHashAABB();					// calculate hash
+	sortHash();						// sort bodies based on hash
+	findCellStart();				// find start of each cell
+	findOverlappingPairs();			// small/small
+	findPairsLarge();				// small/large
+
+	// add pairs to CPU cache
+	computePairCacheChanges();
+	scanOverlappingPairBuff();
+	squeezeOverlappingPairBuff();
+	addPairsToCache(dispatcher);
+
+	// find and add large/large pairs to CPU cache
+	addLarge2LargePairsToCache(dispatcher);
+}
+
+
+
+///Applies the compacted pair changes to the pair cache.
+///For body i the entries m_hPairOut[m_hPairScan[i] .. m_hPairScan[i+1]) are processed: an entry
+///flagged BT_3DGRID_PAIR_NEW_FLG is added to the cache, any other entry is removed from it.
+///Also refreshes the m_numPairsAdded / m_numPairsRemoved debug counters.
+void btGpu3DGridBroadphase::addPairsToCache(btDispatcher* dispatcher)
+{
+	m_numPairsAdded = 0;
+	m_numPairsRemoved = 0;
+	for(int body = 0; body < m_numHandles; body++) 
+	{
+		const unsigned int first = m_hPairScan[body];
+		const unsigned int count = m_hPairScan[body+1] - first;
+		if(count == 0)
+		{
+			continue;
+		}
+		const unsigned int* changes = m_hPairOut + first;
+		// the .uw of the min entry holds the handle index of this body (written by prepareAABB)
+		const unsigned int handle0 = m_hAABB[body * 2].uw;
+		btSimpleBroadphaseProxy* proxy0 = &m_pHandles[handle0];
+		for(unsigned int n = 0; n < count; n++)
+		{
+			const unsigned int flagged = changes[n];
+			unsigned int handle1 = flagged & (~BT_3DGRID_PAIR_ANY_FLG);
+			btSimpleBroadphaseProxy* proxy1;
+			if(handle1 >= (unsigned int)m_maxHandles)
+			{
+				// indices at or above m_maxHandles address the large-proxy pool
+				handle1 -= m_maxHandles;
+				btAssert(handle1 < (unsigned int)m_maxLargeHandles);
+				proxy1 = &m_pLargeHandles[handle1];
+			}
+			else
+			{
+				proxy1 = &m_pHandles[handle1];
+			}
+			if(flagged & BT_3DGRID_PAIR_NEW_FLG)
+			{
+				m_pairCache->addOverlappingPair(proxy0,proxy1);
+				m_numPairsAdded++;
+			}
+			else
+			{
+				m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher);
+				m_numPairsRemoved++;
+			}
+		}
+	}
+}
+
+
+
+///Creates a proxy for the given AABB.
+///Proxies classified as small by isLargeProxy() go through btSimpleBroadphase::createProxy();
+///large ones are placement-constructed in the separate large-proxy pool.
+///Returns 0 (after btAssert) when the large-proxy pool is exhausted.
+btBroadphaseProxy* btGpu3DGridBroadphase::createProxy(  const btVector3& aabbMin,  const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy)
+{
+	if(!isLargeProxy(aabbMin, aabbMax))
+	{
+		return btSimpleBroadphase::createProxy(aabbMin, aabbMax, shapeType, userPtr, collisionFilterGroup, collisionFilterMask, dispatcher, multiSapProxy);
+	}
+
+	if (m_numLargeHandles >= m_maxLargeHandles)
+	{
+		///you have to increase the cell size, so 'large' proxies become 'small' proxies (fitting a cell)
+		btAssert(0);
+		return 0; //should never happen, but don't let the game crash ;-)
+	}
+	btAssert((aabbMin[0]<= aabbMax[0]) && (aabbMin[1]<= aabbMax[1]) && (aabbMin[2]<= aabbMax[2]));
+
+	const int slot = allocLargeHandle();
+	return new (&m_pLargeHandles[slot])btSimpleBroadphaseProxy(aabbMin,aabbMax,shapeType,userPtr,collisionFilterGroup,collisionFilterMask,multiSapProxy);
+}
+
+
+
+///Destroys a proxy created by createProxy().
+///Small proxies are handed to btSimpleBroadphase::destroyProxy(); a large proxy is returned to the
+///large-handle free list and every cached pair that contains it is removed.
+void btGpu3DGridBroadphase::destroyProxy(btBroadphaseProxy* proxy, btDispatcher* dispatcher)
+{
+	if(!isLargeProxy(proxy))
+	{
+		btSimpleBroadphase::destroyProxy(proxy, dispatcher);
+		return;
+	}
+
+	freeLargeHandle(static_cast<btSimpleBroadphaseProxy*>(proxy));
+	m_pairCache->removeOverlappingPairsContainingProxy(proxy,dispatcher);
+}
+
+
+
+///Resets the per-body pair buffer bookkeeping to the state set up by _initialize():
+///entry i starts at i * m_maxPairsPerBody and holds zero pairs. The dispatcher is not used.
+void btGpu3DGridBroadphase::resetPool(btDispatcher* dispatcher)
+{
+	unsigned int slotStart = 0;
+	m_hPairBuffStartCurr[0] = slotStart;
+	m_hPairBuffStartCurr[1] = 0;
+	for(int body = 1; body <= m_maxHandles; body++) 
+	{
+		slotStart += m_maxPairsPerBody;
+		m_hPairBuffStartCurr[body * 2] = slotStart;		// start of this body's slot range
+		m_hPairBuffStartCurr[body * 2 + 1] = 0;			// no pairs stored yet
+	}
+}
+
+
+
+///Classifies an AABB as large when half of its diagonal length, scaled by m_cellFactorAABB,
+///exceeds m_maxRadius (half of the smallest cell edge, see _initialize()).
+bool btGpu3DGridBroadphase::isLargeProxy(const btVector3& aabbMin,  const btVector3& aabbMax)
+{
+	const btVector3 extent = aabbMax - aabbMin;
+
+	///use the bounding sphere radius of this bounding box, to include rotation
+	const btScalar boundingRadius = extent.length() * btScalar(0.5f);
+	const btScalar scaledRadius = boundingRadius * m_cellFactorAABB; // user-defined factor
+
+	return scaledRadius > m_maxRadius;
+}
+
+
+
+///Tells whether an existing proxy lives in the large-proxy pool, based on its unique id:
+///_initialize() numbers the large handles from m_maxHandles+2 upwards.
+bool btGpu3DGridBroadphase::isLargeProxy(btBroadphaseProxy* proxy)
+{
+	const int firstLargeUid = m_maxHandles+2;
+	return (proxy->getUid() >= firstLargeUid);
+}
+
+
+
+///Brute-force test of every pair of live large proxies (those with a client object).
+///Overlapping pairs missing from the cache are added, cached pairs that no longer overlap are removed.
+///m_LastLargeHandleIndex is shrunk to the highest live slot that was seen.
+void btGpu3DGridBroadphase::addLarge2LargePairsToCache(btDispatcher* dispatcher)
+{
+	if (m_numLargeHandles <= 0)
+	{
+		return;
+	}
+	int highestLive = -1;
+	for(int a = 0; a <= m_LastLargeHandleIndex; a++)
+	{
+		btSimpleBroadphaseProxy* proxyA = &m_pLargeHandles[a];
+		if(!proxyA->m_clientObject)
+		{
+			continue;	// free slot
+		}
+		highestLive = a;
+		for(int b = a + 1; b <= m_LastLargeHandleIndex; b++)
+		{
+			btSimpleBroadphaseProxy* proxyB = &m_pLargeHandles[b];
+			if(!proxyB->m_clientObject)
+			{
+				continue;	// free slot
+			}
+			btAssert(proxyA != proxyB);
+			btSimpleBroadphaseProxy* simpleA = getSimpleProxyFromProxy(proxyA);
+			btSimpleBroadphaseProxy* simpleB = getSimpleProxyFromProxy(proxyB);
+			const bool overlapping = aabbOverlap(simpleA,simpleB);
+			const bool cached = (m_pairCache->findPair(proxyA,proxyB) != 0);
+			if(overlapping && !cached)
+			{
+				m_pairCache->addOverlappingPair(proxyA,proxyB);
+			}
+			else if(!overlapping && cached)
+			{
+				m_pairCache->removeOverlappingPair(proxyA,proxyB,dispatcher);
+			}
+		}
+	}
+	m_LastLargeHandleIndex = highestLive;
+}
+
+
+
+///Ray query: runs btSimpleBroadphase::rayTest() first, then reports every live large proxy to the
+///callback. No ray/AABB test is done for the large proxies in this function.
+void btGpu3DGridBroadphase::rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback)
+{
+	btSimpleBroadphase::rayTest(rayFrom, rayTo, rayCallback);
+	for (int slot = 0; slot <= m_LastLargeHandleIndex; slot++)
+	{
+		btSimpleBroadphaseProxy* largeProxy = &m_pLargeHandles[slot];
+		if(largeProxy->m_clientObject)
+		{
+			rayCallback.process(largeProxy);
+		}
+	}
+}
+
+
+
+//
+// overrides for CPU version
+//
+
+
+
+///Packs the AABBs of all live proxies into m_hAABB as consecutive (min, max) entries:
+///first the small proxies, then the large ones.
+///  min.uw = handle index (large proxies: pool index + m_maxHandles)
+///  max.uw = running count within its group (large proxies: count + m_maxHandles)
+///Also shrinks m_LastHandleIndex / m_LastLargeHandleIndex to the highest live slot that was seen.
+void btGpu3DGridBroadphase::prepareAABB()
+{
+	BT_PROFILE("prepareAABB");
+	bt3DGrid3F1U* out = m_hAABB;
+	int slot;
+
+	// small proxies
+	int highestLive = -1;
+	unsigned int smallCount = 0;
+	for(slot = 0; slot <= m_LastHandleIndex; slot++) 
+	{
+		btSimpleBroadphaseProxy* proxy = &m_pHandles[slot];
+		if(!proxy->m_clientObject)
+		{
+			continue;	// free slot
+		}
+		highestLive = slot;
+		out[0].fx = proxy->m_aabbMin.getX();
+		out[0].fy = proxy->m_aabbMin.getY();
+		out[0].fz = proxy->m_aabbMin.getZ();
+		out[0].uw = slot;
+		out[1].fx = proxy->m_aabbMax.getX();
+		out[1].fy = proxy->m_aabbMax.getY();
+		out[1].fz = proxy->m_aabbMax.getZ();
+		out[1].uw = smallCount;
+		out += 2;
+		smallCount++;
+	}
+	m_LastHandleIndex = highestLive;
+
+	// large proxies, stored right behind the small ones
+	highestLive = -1;
+	unsigned int largeCount = 0;
+	for(slot = 0; slot <= m_LastLargeHandleIndex; slot++) 
+	{
+		btSimpleBroadphaseProxy* proxy = &m_pLargeHandles[slot];
+		if(!proxy->m_clientObject)
+		{
+			continue;	// free slot
+		}
+		highestLive = slot;
+		out[0].fx = proxy->m_aabbMin.getX();
+		out[0].fy = proxy->m_aabbMin.getY();
+		out[0].fz = proxy->m_aabbMin.getZ();
+		out[0].uw = slot + m_maxHandles;
+		out[1].fx = proxy->m_aabbMax.getX();
+		out[1].fy = proxy->m_aabbMax.getY();
+		out[1].fz = proxy->m_aabbMax.getZ();
+		out[1].uw = largeCount + m_maxHandles;
+		out += 2;
+		largeCount++;
+	}
+	m_LastLargeHandleIndex = highestLive;
+
+	// paranoid checks
+	btAssert(smallCount == m_numHandles);
+	btAssert(largeCount == m_numLargeHandles);
+}
+
+
+
+///Copies the given parameter block into the file-static s3DGridBroadphaseParams.
+void btGpu3DGridBroadphase::setParameters(bt3DGridBroadphaseParams* hostParams)
+{
+	s3DGridBroadphaseParams = *hostParams;
+}
+
+
+
+///Profiled wrapper: forwards the AABB array, the hash buffer and the small-proxy count
+///to btGpu_calcHashAABB().
+void btGpu3DGridBroadphase::calcHashAABB()
+{
+	BT_PROFILE("bt3DGrid_calcHashAABB");
+	btGpu_calcHashAABB(m_hAABB, m_hBodiesHash, m_numHandles);
+}
+
+
+
+///Sorts the first m_numHandles (hash, index) records of m_hBodiesHash in place with a recursive
+///quicksort. The comparisons used order the records by DESCENDING hash value.
+void btGpu3DGridBroadphase::sortHash()
+{
+	// View of two consecutive unsigned ints of m_hBodiesHash as one record.
+	class bt3DGridHashKey
+	{
+	public:
+		unsigned int hash;
+		unsigned int index;
+		// Sorts pData[lo..hi] (both inclusive); the pivot is the middle element.
+		void quickSort(bt3DGridHashKey* pData, int lo, int hi)
+		{
+			const bt3DGridHashKey pivot = pData[(lo+hi)/2];
+			int left = lo;
+			int right = hi;
+			do
+			{
+				while(pData[left].hash > pivot.hash) left++;
+				while(pivot.hash > pData[right].hash) right--;
+				if(left <= right)
+				{
+					const bt3DGridHashKey swapped = pData[left];
+					pData[left] = pData[right];
+					pData[right] = swapped;
+					left++;
+					right--;
+				}
+			} while(left <= right);
+			if(lo < right) quickSort(pData, lo, right);
+			if(left < hi) quickSort(pData, left, hi);
+		}
+	};
+	BT_PROFILE("bt3DGrid_sortHash");
+	bt3DGridHashKey* records = (bt3DGridHashKey*)m_hBodiesHash;
+	records->quickSort(records, 0, m_numHandles - 1);
+}
+
+
+
+///Profiled wrapper: forwards the hash buffer, the cell-start buffer, the small-proxy count
+///and the cell count to btGpu_findCellStart().
+void btGpu3DGridBroadphase::findCellStart()
+{
+	BT_PROFILE("bt3DGrid_findCellStart");
+	btGpu_findCellStart(m_hBodiesHash, m_hCellStart, m_numHandles, m_params.m_numCells);
+}
+
+
+
+///Profiled wrapper: forwards the AABB, hash, cell-start and pair buffers plus the small-proxy
+///count to btGpu_findOverlappingPairs().
+void btGpu3DGridBroadphase::findOverlappingPairs()
+{
+	BT_PROFILE("bt3DGrid_findOverlappingPairs");
+	btGpu_findOverlappingPairs(m_hAABB, m_hBodiesHash, m_hCellStart,
+							   m_hPairBuff, m_hPairBuffStartCurr, m_numHandles);
+}
+
+
+
+///Profiled wrapper: forwards the AABB, hash, cell-start and pair buffers plus the small and
+///large proxy counts to btGpu_findPairsLarge().
+void btGpu3DGridBroadphase::findPairsLarge()
+{
+	BT_PROFILE("bt3DGrid_findPairsLarge");
+	btGpu_findPairsLarge(m_hAABB, m_hBodiesHash, m_hCellStart,
+						 m_hPairBuff, m_hPairBuffStartCurr,
+						 m_numHandles, m_numLargeHandles);
+}
+
+
+
+///Profiled wrapper: forwards the pair buffers, the scan buffer, the AABB array and the
+///small-proxy count to btGpu_computePairCacheChanges().
+void btGpu3DGridBroadphase::computePairCacheChanges()
+{
+	BT_PROFILE("bt3DGrid_computePairCacheChanges");
+	btGpu_computePairCacheChanges(m_hPairBuff, m_hPairBuffStartCurr,
+								  m_hPairScan, m_hAABB, m_numHandles);
+}
+
+
+
+///Turns m_hPairScan into a running sum in place: element 0 is forced to 0 and each element
+///1..m_numHandles becomes its own value plus the (already summed) previous element.
+void btGpu3DGridBroadphase::scanOverlappingPairBuff()
+{
+	BT_PROFILE("bt3DGrid_scanOverlappingPairBuff");
+	m_hPairScan[0] = 0;
+	for(int body = 1; body <= m_numHandles; body++) 
+	{
+		m_hPairScan[body] += m_hPairScan[body-1];
+	}
+}
+
+
+
+///Profiled wrapper: forwards the pair buffers, the scan buffer, the output buffer, the AABB array
+///and the small-proxy count to btGpu_squeezeOverlappingPairBuff().
+void btGpu3DGridBroadphase::squeezeOverlappingPairBuff()
+{
+	BT_PROFILE("bt3DGrid_squeezeOverlappingPairBuff");
+	btGpu_squeezeOverlappingPairBuff(m_hPairBuff, m_hPairBuffStartCurr,
+									 m_hPairScan, m_hPairOut, m_hAABB, m_numHandles);
+}
+
+
+
+#include "btGpu3DGridBroadphaseSharedCode.h"
+
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/btGpu3DGridBroadphase.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/btGpu3DGridBroadphase.h
@ -0,0 +1,138 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+#ifndef BTGPU3DGRIDBROADPHASE_H
+#define BTGPU3DGRIDBROADPHASE_H
+
+//----------------------------------------------------------------------------------------
+
+#include "BulletCollision/BroadphaseCollision/btSimpleBroadphase.h"
+
+#include "btGpu3DGridBroadphaseSharedTypes.h"
+
+//----------------------------------------------------------------------------------------
+
+///The btGpu3DGridBroadphase uses GPU-style code compiled for CPU to compute overlapping pairs
+
+///Broadphase built on btSimpleBroadphase that bins "small" proxies into a uniform 3D grid and keeps
+///"large" proxies (those that do not fit the size limit, see isLargeProxy) in a separate handle pool.
+///The per-step pipeline is exposed as virtual methods so that a derived class can override each stage.
+class btGpu3DGridBroadphase : public btSimpleBroadphase
+{
+protected:
+	bool			m_bInitialized;			// set by _initialize(), cleared by _finalize()
+	unsigned int	m_numBodies;
+	unsigned int	m_numCells;
+	unsigned int	m_maxPairsPerBody;		// pair-buffer slots reserved per small proxy
+	btScalar		m_cellFactorAABB;		// user factor applied to the bounding radius in isLargeProxy()
+	unsigned int	m_maxBodiesPerCell;
+	bt3DGridBroadphaseParams m_params;		// grid description passed to setParameters()
+	btScalar		m_maxRadius;			// size threshold between small and large proxies
+	// CPU data
+	unsigned int*	m_hBodiesHash;			// (hash, index) per small proxy
+	unsigned int*	m_hCellStart;			// one entry per grid cell
+	unsigned int*	m_hPairBuffStartCurr;	// (start, current count) per small proxy, plus one end entry
+	bt3DGrid3F1U*		m_hAABB;			// (min, max) per proxy: small proxies first, then large
+	unsigned int*	m_hPairBuff;			// m_maxPairsPerBody slots per small proxy
+	unsigned int*	m_hPairScan;			// per-body counts, turned into a running sum
+	unsigned int*	m_hPairOut;				// compacted pair changes consumed by addPairsToCache()
+// large proxies
+	int		m_numLargeHandles;				// number of large handles currently in use
+	int		m_maxLargeHandles;				// capacity of the large-handle pool
+	int		m_LastLargeHandleIndex;			// upper bound of the large-handle indices in use (-1 if none)
+	btSimpleBroadphaseProxy* m_pLargeHandles;	// large-handle pool, constructed inside m_pLargeHandlesRawPtr
+	void* m_pLargeHandlesRawPtr;			// raw aligned storage behind m_pLargeHandles
+	int		m_firstFreeLargeHandle;			// head of the free list threaded through the pool
+
+	///Pops the head of the large-handle free list and returns its pool index.
+	///The caller must make sure that the pool is not exhausted.
+	int allocLargeHandle()
+	{
+		btAssert(m_numLargeHandles < m_maxLargeHandles);
+		int freeLargeHandle = m_firstFreeLargeHandle;
+		m_firstFreeLargeHandle = m_pLargeHandles[freeLargeHandle].GetNextFree();
+		m_numLargeHandles++;
+		if(freeLargeHandle > m_LastLargeHandleIndex)
+		{
+			m_LastLargeHandleIndex = freeLargeHandle;
+		}
+		return freeLargeHandle;
+	}
+
+	///Pushes a large proxy back onto the free list and clears its client object,
+	///which is how the scanning loops recognise a free slot.
+	void freeLargeHandle(btSimpleBroadphaseProxy* proxy)
+	{
+		int handle = int(proxy - m_pLargeHandles);
+		// the index addresses the large-handle pool, so it is bounded by that pool's capacity
+		btAssert((handle >= 0) && (handle < m_maxLargeHandles));
+		if(handle == m_LastLargeHandleIndex)
+		{
+			m_LastLargeHandleIndex--;
+		}
+		proxy->SetNextFree(m_firstFreeLargeHandle);
+		m_firstFreeLargeHandle = handle;
+		proxy->m_clientObject = 0;
+		m_numLargeHandles--;
+	}
+	///Size-based classification used when a proxy is created.
+	bool isLargeProxy(const btVector3& aabbMin,  const btVector3& aabbMax);
+	///Id-based classification of an existing proxy.
+	bool isLargeProxy(btBroadphaseProxy* proxy);
+// debug
+	unsigned int	m_numPairsAdded;
+	unsigned int	m_numPairsRemoved;
+	unsigned int	m_numOverflows;
+// 
+public:
+	///Creates the broadphase with an internally allocated pair cache.
+	btGpu3DGridBroadphase(const btVector3& worldAabbMin,const btVector3& worldAabbMax, 
+					   int gridSizeX, int gridSizeY, int gridSizeZ, 
+					   int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+					   int maxBodiesPerCell = 8,
+					   btScalar cellFactorAABB = btScalar(1.0f));
+	///Creates the broadphase on top of a caller-supplied pair cache.
+	btGpu3DGridBroadphase(	btOverlappingPairCache* overlappingPairCache,
+						const btVector3& worldAabbMin,const btVector3& worldAabbMax, 
+						int gridSizeX, int gridSizeY, int gridSizeZ, 
+						int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+						int maxBodiesPerCell = 8,
+						btScalar cellFactorAABB = btScalar(1.0f));
+	virtual ~btGpu3DGridBroadphase();
+	///Runs the whole pair-finding pipeline and updates the pair cache.
+	virtual void	calculateOverlappingPairs(btDispatcher* dispatcher);
+
+	virtual btBroadphaseProxy*	createProxy(const btVector3& aabbMin,  const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy);
+	virtual void	destroyProxy(btBroadphaseProxy* proxy,btDispatcher* dispatcher);
+	virtual void	rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback);
+	virtual void	resetPool(btDispatcher* dispatcher);
+
+protected:
+	///Stores the parameters and allocates all host buffers; called by both constructors.
+	void _initialize(	const btVector3& worldAabbMin,const btVector3& worldAabbMax, 
+						int gridSizeX, int gridSizeY, int gridSizeZ, 
+						int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+						int maxBodiesPerCell = 8,
+						btScalar cellFactorAABB = btScalar(1.0f));
+	///Frees everything allocated by _initialize().
+	void _finalize();
+	void addPairsToCache(btDispatcher* dispatcher);
+	void addLarge2LargePairsToCache(btDispatcher* dispatcher);
+
+// overrides for CPU version
+	virtual void setParameters(bt3DGridBroadphaseParams* hostParams);
+	virtual void prepareAABB();
+	virtual void calcHashAABB();
+	virtual void sortHash();	
+	virtual void findCellStart();
+	virtual void findOverlappingPairs();
+	virtual void findPairsLarge();
+	virtual void computePairCacheChanges();
+	virtual void scanOverlappingPairBuff();
+	virtual void squeezeOverlappingPairBuff();
+};
+
+//----------------------------------------------------------------------------------------
+
+#endif //BTGPU3DGRIDBROADPHASE_H
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
--- a/Engine/lib/bullet/src/BulletMultiThreaded/btGpu3DGridBroadphaseSharedCode.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/btGpu3DGridBroadphaseSharedCode.h
@ -0,0 +1,430 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//               K E R N E L    F U N C T I O N S 
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+
+// calculate position in uniform grid
+// Maps a world-space position to integer cell coordinates: per axis,
+// floor((p - world origin) / cell size). The result is NOT clamped to the grid.
+BT_GPU___device__ int3 bt3DGrid_calcGridPos(float4 p)
+{
+	int3 cell;
+	cell.x = (int)floor((p.x - BT_GPU_params.m_worldOriginX) / BT_GPU_params.m_cellSizeX);
+	cell.y = (int)floor((p.y - BT_GPU_params.m_worldOriginY) / BT_GPU_params.m_cellSizeY);
+	cell.z = (int)floor((p.z - BT_GPU_params.m_worldOriginZ) / BT_GPU_params.m_cellSizeZ);
+	return cell;
+} // bt3DGrid_calcGridPos()
+
+//----------------------------------------------------------------------------------------
+
+// calculate address in grid from position (clamping to edges)
+// Converts cell coordinates to a linear cell index.
+// Each coordinate is first clamped to [0, gridSize-1]; the index is (z * sizeY * sizeX) + (y * sizeX) + x.
+BT_GPU___device__ uint bt3DGrid_calcGridHash(int3 gridPos)
+{
+	const int lastX = (int)BT_GPU_params.m_gridSizeX - 1;
+	const int lastY = (int)BT_GPU_params.m_gridSizeY - 1;
+	const int lastZ = (int)BT_GPU_params.m_gridSizeZ - 1;
+	gridPos.x = BT_GPU_max(0, BT_GPU_min(gridPos.x, lastX));
+	gridPos.y = BT_GPU_max(0, BT_GPU_min(gridPos.y, lastY));
+	gridPos.z = BT_GPU_max(0, BT_GPU_min(gridPos.z, lastZ));
+	return BT_GPU___mul24(BT_GPU___mul24(gridPos.z, BT_GPU_params.m_gridSizeY), BT_GPU_params.m_gridSizeX)
+		 + BT_GPU___mul24(gridPos.y, BT_GPU_params.m_gridSizeX)
+		 + gridPos.x;
+} // bt3DGrid_calcGridHash()
+
+//----------------------------------------------------------------------------------------
+
+// calculate grid hash value for each body using its AABB
+// One thread per body: hashes the centre of the body's AABB to a grid cell and stores
+// (cell hash, body index) in pHash[index]. Threads with index >= numBodies do nothing.
+BT_GPU___global__ void calcHashAABBD(bt3DGrid3F1U* pAABB, uint2* pHash, uint numBodies)
+{
+	int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+	if(index >= (int)numBodies)
+	{
+		return;
+	}
+	// entries 2*index and 2*index+1 hold the min and max corner of this body
+	bt3DGrid3F1U lower = pAABB[index*2];
+	bt3DGrid3F1U upper = pAABB[index*2 + 1];
+	float4 centre;
+	centre.x = (lower.fx + upper.fx) * 0.5f;
+	centre.y = (lower.fy + upper.fy) * 0.5f;
+	centre.z = (lower.fz + upper.fz) * 0.5f;
+	// get address in grid
+	int3 cell = bt3DGrid_calcGridPos(centre);
+	uint cellHash = bt3DGrid_calcGridHash(cell);
+	// store grid hash and body index
+	pHash[index] = BT_GPU_make_uint2(cellHash, index);
+} // calcHashAABBD()
+
+//----------------------------------------------------------------------------------------
+
+// One thread per sorted hash entry: records in cellStart[hash] the index of the first entry
+// whose hash differs from that of its predecessor (entry 0 always counts as a start).
+// Cells without any body are not written by this kernel.
+BT_GPU___global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies)
+{
+	int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+	if(index >= (int)numBodies)
+	{
+		return;
+	}
+	uint2 entry = pHash[index];
+	// Load hash data into shared memory so that we can look 
+	// at neighboring body's hash value without loading
+	// two hash values per thread
+	// (slot t+1 belongs to thread t; slot 0 holds the hash preceding the block)
+	BT_GPU___shared__ uint sharedHash[257];
+	sharedHash[BT_GPU_threadIdx.x+1] = entry.x;
+	if((index > 0) && (BT_GPU_threadIdx.x == 0))
+	{
+		// first thread in block must load neighbor body hash
+		volatile uint2 previous = pHash[index-1];
+		sharedHash[0] = previous.x;
+	}
+	BT_GPU___syncthreads();
+	if((index == 0) || (entry.x != sharedHash[BT_GPU_threadIdx.x]))
+	{
+		cellStart[entry.x] = index;
+	}
+} // findCellStartD()
+
+//----------------------------------------------------------------------------------------
+
+// Returns non-zero when the two boxes overlap (touching counts) on all three axes.
+BT_GPU___device__ uint cudaTestAABBOverlap(bt3DGrid3F1U min0, bt3DGrid3F1U max0, bt3DGrid3F1U min1, bt3DGrid3F1U max1)
+{
+	uint overlapX = (min0.fx <= max1.fx) && (min1.fx <= max0.fx);
+	uint overlapY = (min0.fy <= max1.fy) && (min1.fy <= max0.fy);
+	uint overlapZ = (min0.fz <= max1.fz) && (min1.fz <= max0.fz);
+	return overlapX && overlapY && overlapZ;
+} // cudaTestAABBOverlap()
+ 
+//----------------------------------------------------------------------------------------
+
+// Tests the body at sorted position 'index' against the bodies stored in grid cell 'gridPos'
+// and records overlaps in that body's slice of pPairBuff.
+//  - cells outside the grid and cells whose start entry is 0xffffffff are skipped
+//  - at most m_maxBodiesPerCell entries of the cell are visited
+//  - only partners with a smaller unsorted index are tested, so each pair is handled once
+//  - a pair already in the slice is marked BT_3DGRID_PAIR_FOUND_FLG, a new one is appended
+//    with BT_3DGRID_PAIR_NEW_FLG; when the slice is full the scan of this cell stops
+// The updated (start, count) of the body is written back to pPairBuffStartCurr on exit.
+BT_GPU___device__ void findPairsInCell(	int3	gridPos,
+										uint    index,
+										uint2*  pHash,
+										uint*   pCellStart,
+										bt3DGrid3F1U* pAABB, 
+										uint*   pPairBuff,
+										uint2*	pPairBuffStartCurr,
+										uint	numBodies)
+{
+	if((gridPos.x < 0) || (gridPos.x > (int)BT_GPU_params.m_gridSizeX - 1))
+	{
+		return;
+	}
+	if((gridPos.y < 0) || (gridPos.y > (int)BT_GPU_params.m_gridSizeY - 1))
+	{
+		return;
+	}
+	if((gridPos.z < 0) || (gridPos.z > (int)BT_GPU_params.m_gridSizeZ - 1))
+	{
+		return;
+	}
+	uint cellHash = bt3DGrid_calcGridHash(gridPos);
+	// get start of bucket for this cell
+	uint firstInCell = pCellStart[cellHash];
+	if (firstInCell == 0xffffffff)
+	{
+		return;   // cell empty
+	}
+	// body owned by this thread
+	uint2 self = pHash[index];
+	uint selfUnsorted = self.y;
+	bt3DGrid3F1U selfMin = BT_GPU_FETCH(pAABB, selfUnsorted*2); 
+	bt3DGrid3F1U selfMax = BT_GPU_FETCH(pAABB, selfUnsorted*2 + 1);
+	uint selfHandle =  selfMin.uw;
+	// slice of the pair buffer reserved for this body
+	uint2 slice = pPairBuffStartCurr[selfHandle];
+	uint sliceStart = slice.x;
+	uint pairCount = slice.y;
+	uint2 nextSlice = pPairBuffStartCurr[selfHandle+1];
+	uint maxPairs = nextSlice.x - sliceStart - 1;
+	// iterate over bodies in this cell
+	uint lastInCell = firstInCell + BT_GPU_params.m_maxBodiesPerCell;
+	if(lastInCell > numBodies)
+	{
+		lastInCell = numBodies;
+	}
+	for(uint other = firstInCell; other < lastInCell; other++) 
+	{
+		uint2 otherEntry = pHash[other];
+		if (otherEntry.x != cellHash)
+		{
+			break;   // no longer in same bucket
+		}
+		uint otherUnsorted = otherEntry.y;
+		if (otherUnsorted >= selfUnsorted) // check not colliding with self
+		{
+			continue;
+		}
+		bt3DGrid3F1U otherMin = BT_GPU_FETCH(pAABB, otherUnsorted*2);
+		bt3DGrid3F1U otherMax = BT_GPU_FETCH(pAABB, otherUnsorted*2 + 1);
+		if(!cudaTestAABBOverlap(selfMin, selfMax, otherMin, otherMax))
+		{
+			continue;
+		}
+		uint otherHandle = otherMin.uw;
+		// look for the partner among the pairs already stored
+		uint slot;
+		for(slot = 0; slot < pairCount; slot++)
+		{
+			uint stored = pPairBuff[sliceStart+slot] & (~BT_3DGRID_PAIR_ANY_FLG);
+			if(stored == otherHandle)
+			{
+				pPairBuff[sliceStart+slot] |= BT_3DGRID_PAIR_FOUND_FLG;
+				break;
+			}
+		}
+		if(slot != pairCount)
+		{
+			continue;	// known pair, flag already set
+		}
+		if(pairCount >= maxPairs) 
+		{ // not a good solution, but let's avoid crash
+			break;
+		}
+		pPairBuff[sliceStart+pairCount] = otherHandle | BT_3DGRID_PAIR_NEW_FLG;
+		pairCount++;
+	}
+	pPairBuffStartCurr[selfHandle] = BT_GPU_make_uint2(sliceStart, pairCount);
+} // findPairsInCell()
+
+//----------------------------------------------------------------------------------------
+
+// One thread per sorted body: locates the cell that contains the centre of the body's AABB and
+// runs findPairsInCell() on that cell and its 26 neighbours (z outermost, x innermost).
+BT_GPU___global__ void findOverlappingPairsD(	bt3DGrid3F1U*	pAABB, uint2* pHash, uint* pCellStart, 
+												uint* pPairBuff, uint2* pPairBuffStartCurr, uint numBodies)
+{
+	int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+	if(index >= (int)numBodies)
+	{
+		return;
+	}
+	uint2 self = pHash[index];
+	uint selfUnsorted = self.y;
+	bt3DGrid3F1U lower = BT_GPU_FETCH(pAABB, selfUnsorted*2);
+	bt3DGrid3F1U upper = BT_GPU_FETCH(pAABB, selfUnsorted*2 + 1);
+	float4 centre;
+	centre.x = (lower.fx + upper.fx) * 0.5f;
+	centre.y = (lower.fy + upper.fy) * 0.5f;
+	centre.z = (lower.fz + upper.fz) * 0.5f;
+	// get address in grid
+	int3 homeCell = bt3DGrid_calcGridPos(centre);
+	// examine only neighbouring cells
+	for(int dz = -1; dz <= 1; dz++)
+	{
+		for(int dy = -1; dy <= 1; dy++)
+		{
+			for(int dx = -1; dx <= 1; dx++)
+			{
+				findPairsInCell(homeCell + BT_GPU_make_int3(dx, dy, dz), index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies);
+			}
+		}
+	}
+} // findOverlappingPairsD()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel: one thread per entry of pHash. Tests the thread's body against each of the
+// numLarge AABBs stored behind the first numBodies bodies in pAABB and records every
+// overlap in the body's slice of pPairBuff.
+//   pAABB              - AABB array; entries 2*i and 2*i+1 are the min and max of body i,
+//                        the large bodies occupy body slots numBodies .. numBodies+numLarge-1
+//   pHash              - per-thread uint2 entries; .y is used as the unsorted body index
+//   pCellStart         - not used by this kernel
+//   pPairBuff          - pair storage; each entry is a handle index plus BT_3DGRID_PAIR_* flags
+//   pPairBuffStartCurr - per-handle (start, current count) into pPairBuff; entry
+//                        handleIndex+1 is read to find where this handle's slice ends
+//   numBodies          - number of valid threads; threads with index >= numBodies exit
+//   numLarge           - number of large AABBs to test against
+// An overlap that is already stored gets BT_3DGRID_PAIR_FOUND_FLG set; an overlap that is
+// not stored yet is appended with BT_3DGRID_PAIR_NEW_FLG. The updated count is written
+// back to pPairBuffStartCurr[handleIndex].
+BT_GPU___global__ void findPairsLargeD(	bt3DGrid3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff, 
+										uint2* pPairBuffStartCurr, uint numBodies, uint numLarge)
+{
+    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+    uint2 sortedData = pHash[index];
+	uint unsorted_indx = sortedData.y;
+	bt3DGrid3F1U min0 = BT_GPU_FETCH(pAABB, unsorted_indx*2);
+	bt3DGrid3F1U max0 = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
+	uint handleIndex =  min0.uw;
+	uint2 start_curr = pPairBuffStartCurr[handleIndex];
+	uint start = start_curr.x;
+	uint curr = start_curr.y;
+	// capacity of this handle's slice: distance to the next handle's start, minus one
+	uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
+	uint curr_max = start_curr_next.x - start - 1;
+    for(uint i = 0; i < numLarge; i++)
+    {
+		uint indx2 = numBodies + i;
+		bt3DGrid3F1U min1 = BT_GPU_FETCH(pAABB, indx2*2);
+		bt3DGrid3F1U max1 = BT_GPU_FETCH(pAABB, indx2*2 + 1);
+		if(cudaTestAABBOverlap(min0, max0, min1, max1))
+		{
+			uint k;
+			uint handleIndex2 =  min1.uw;
+			// look for the pair among the entries already stored for this handle
+			for(k = 0; k < curr; k++)
+			{
+				uint old_pair = pPairBuff[start+k] & (~BT_3DGRID_PAIR_ANY_FLG);
+				if(old_pair == handleIndex2)
+				{
+					pPairBuff[start+k] |= BT_3DGRID_PAIR_FOUND_FLG;
+					break;
+				}
+			}
+			if(k == curr)
+			{
+				// check the capacity BEFORE writing (same order as findPairsInCell), so a
+				// full slice is left untouched instead of receiving an uncounted entry
+				if(curr >= curr_max) 
+				{ // not a good solution, but let's avoid crash
+					break;
+				}
+				pPairBuff[start+curr] = handleIndex2 | BT_3DGRID_PAIR_NEW_FLG;
+				curr++;
+			}
+		}
+    }
+	pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, curr);
+    return;
+} // findPairsLargeD()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel: one thread per body. Counts the entries in the body's slice of pPairBuff that
+// do not carry BT_3DGRID_PAIR_FOUND_FLG and stores that count in pPairScan[index+1].
+//   pPairBuff          - pair storage (handle index plus BT_3DGRID_PAIR_* flags per entry)
+//   pPairBuffStartCurr - per-handle (start, current count) into pPairBuff
+//   pPairScan          - output counts; slot index+1 is written, slot 0 is left alone
+//   pAABB              - AABB array; the .uw of entry 2*index supplies the handle index
+//   numBodies          - number of valid threads; threads with index >= numBodies do nothing
+BT_GPU___global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, 
+												uint* pPairScan, bt3DGrid3F1U* pAABB, uint numBodies)
+{
+	int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+	if(index < (int)numBodies)
+	{
+		uint handleIndex = pAABB[index * 2].uw;
+		uint2 range = pPairBuffStartCurr[handleIndex];
+		const uint* pairs = pPairBuff + range.x;
+		uint changed = 0;
+		for(uint k = 0; k < range.y; k++)
+		{
+			if((pairs[k] & BT_3DGRID_PAIR_FOUND_FLG) == 0)
+			{
+				changed++;
+			}
+		}
+		pPairScan[index+1] = changed;
+	}
+} // computePairCacheChangesD()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel: one thread per body. Walks the body's slice of pPairBuff once and does two
+// things per entry:
+//   - an entry without BT_3DGRID_PAIR_FOUND_FLG is copied, flags included, to the output
+//     list that starts at pPairOut + pPairScan[index];
+//   - an entry with any BT_3DGRID_PAIR_* flag is kept: it is written back to the front of
+//     the slice with the flags cleared. Entries with no flag are dropped.
+// The number of kept entries becomes the new count in pPairBuffStartCurr[handleIndex].
+//   pAABB     - AABB array; the .uw of entry 2*index supplies the handle index
+//   numBodies - number of valid threads; threads with index >= numBodies do nothing
+BT_GPU___global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan,
+												   uint* pPairOut, bt3DGrid3F1U* pAABB, uint numBodies)
+{
+	int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+	if(index < (int)numBodies)
+	{
+		uint handleIndex = pAABB[index * 2].uw;
+		uint2 range = pPairBuffStartCurr[handleIndex];
+		uint* pairs = pPairBuff + range.x;
+		uint* changed = pPairOut + pPairScan[index];
+		uint kept = 0;
+		for(uint k = 0; k < range.y; k++)
+		{
+			// read the entry before anything is written: the compaction below may
+			// overwrite this very slot (kept never runs ahead of k)
+			uint entry = pairs[k];
+			if((entry & BT_3DGRID_PAIR_FOUND_FLG) == 0)
+			{
+				*changed = entry;
+				changed++;
+			}
+			if((entry & BT_3DGRID_PAIR_ANY_FLG) != 0)
+			{
+				pairs[kept] = entry & (~BT_3DGRID_PAIR_ANY_FLG);
+				kept++;
+			}
+		}
+		pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(range.x, kept);
+	}
+} // squeezeOverlappingPairBuffD()
+
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//               E N D   O F    K E R N E L    F U N C T I O N S 
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+
+extern "C"
+{
+
+//----------------------------------------------------------------------------------------
+
+// Host-side launcher for the calcHashAABBD kernel: sizes the launch for numBodies
+// elements with blocks of at most 256 threads, runs the kernel with `hash` viewed as an
+// array of uint2, then checks for a launch error.
+void BT_GPU_PREF(calcHashAABB)(bt3DGrid3F1U* pAABB, unsigned int* hash,	unsigned int numBodies)
+{
+	int numBlocks;
+	int numThreads;
+	BT_GPU_PREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
+	uint2* pHashPairs = (uint2*)hash;
+	BT_GPU_EXECKERNEL(numBlocks, numThreads, calcHashAABBD, (pAABB, pHashPairs, numBodies));
+	BT_GPU_CHECK_ERROR("calcHashAABBD kernel execution failed");
+} // calcHashAABB()
+
+//----------------------------------------------------------------------------------------
+
+// Host-side launcher for the findCellStartD kernel.
+//   hash      - passed to the kernel viewed as an array of uint2
+//   cellStart - table of numCells entries; every byte is set to 0xff before the kernel
+//               runs (with the CPU mapping of BT_GPU_Memset to memset, each entry then
+//               reads 0xffffffff)
+//   numBodies - element count used to size the launch (blocks of at most 256 threads)
+//   numCells  - number of entries in cellStart
+// Only the function name goes through BT_GPU_PREF, matching the declaration in
+// btGpu3DGridBroadphaseSharedDefs.h.
+void BT_GPU_PREF(findCellStart)(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells)
+{
+    int numThreads, numBlocks;
+    BT_GPU_PREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
+	BT_GPU_SAFE_CALL(BT_GPU_Memset(cellStart, 0xffffffff, numCells*sizeof(uint)));
+	BT_GPU_EXECKERNEL(numBlocks, numThreads, findCellStartD, ((uint2*)hash, (uint*)cellStart, numBodies));
+    BT_GPU_CHECK_ERROR("Kernel execution failed: findCellStartD");
+} // findCellStart()
+
+//----------------------------------------------------------------------------------------
+
+// Host-side launcher for the findOverlappingPairsD kernel. The launch is sized for
+// numBodies elements with blocks of at most 64 threads. When B_CUDA_USE_TEX is set,
+// the first numBodies*2 AABB entries are bound to the pAABBTex texture for the duration
+// of the kernel. pHash and pPairBuffStartCurr are handed to the kernel as uint2 arrays.
+// Only the function name goes through BT_GPU_PREF, matching the declaration in
+// btGpu3DGridBroadphaseSharedDefs.h.
+void BT_GPU_PREF(findOverlappingPairs)(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies)
+{
+#if B_CUDA_USE_TEX
+    BT_GPU_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(bt3DGrid3F1U)));
+#endif
+    int numThreads, numBlocks;
+    BT_GPU_PREF(computeGridSize)(numBodies, 64, numBlocks, numThreads);
+    BT_GPU_EXECKERNEL(numBlocks, numThreads, findOverlappingPairsD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies));
+    BT_GPU_CHECK_ERROR("Kernel execution failed: bt_CudaFindOverlappingPairsD");
+#if B_CUDA_USE_TEX
+    BT_GPU_SAFE_CALL(cudaUnbindTexture(pAABBTex));
+#endif
+} // findOverlappingPairs()
+
+//----------------------------------------------------------------------------------------
+
+// Host-side launcher for the findPairsLargeD kernel. The launch is sized for numBodies
+// elements with blocks of at most 64 threads. When B_CUDA_USE_TEX is set, the AABB
+// entries of all numBodies+numLarge bodies are bound to the pAABBTex texture for the
+// duration of the kernel. pHash and pPairBuffStartCurr are handed to the kernel as
+// uint2 arrays.
+// Only the function name goes through BT_GPU_PREF, matching the declaration in
+// btGpu3DGridBroadphaseSharedDefs.h.
+void BT_GPU_PREF(findPairsLarge)(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge)
+{
+#if B_CUDA_USE_TEX
+    BT_GPU_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(bt3DGrid3F1U)));
+#endif
+    int numThreads, numBlocks;
+    BT_GPU_PREF(computeGridSize)(numBodies, 64, numBlocks, numThreads);
+    BT_GPU_EXECKERNEL(numBlocks, numThreads, findPairsLargeD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies,numLarge));
+    BT_GPU_CHECK_ERROR("Kernel execution failed: btCuda_findPairsLargeD");
+#if B_CUDA_USE_TEX
+    BT_GPU_SAFE_CALL(cudaUnbindTexture(pAABBTex));
+#endif
+} // findPairsLarge()
+
+//----------------------------------------------------------------------------------------
+
+// Host-side launcher for the computePairCacheChangesD kernel. The launch is sized for
+// numBodies elements with blocks of at most 256 threads; pPairBuffStartCurr is handed
+// to the kernel as a uint2 array.
+// Only the function name goes through BT_GPU_PREF, matching the declaration in
+// btGpu3DGridBroadphaseSharedDefs.h.
+void BT_GPU_PREF(computePairCacheChanges)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, bt3DGrid3F1U* pAABB, unsigned int numBodies)
+{
+    int numThreads, numBlocks;
+    BT_GPU_PREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
+    BT_GPU_EXECKERNEL(numBlocks, numThreads, computePairCacheChangesD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,pAABB,numBodies));
+    BT_GPU_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD");
+} // computePairCacheChanges()
+
+//----------------------------------------------------------------------------------------
+
+// Host-side launcher for the squeezeOverlappingPairBuffD kernel. The launch is sized
+// for numBodies elements with blocks of at most 256 threads; pPairBuffStartCurr is
+// handed to the kernel as a uint2 array.
+// Only the function name goes through BT_GPU_PREF, matching the declaration in
+// btGpu3DGridBroadphaseSharedDefs.h.
+void BT_GPU_PREF(squeezeOverlappingPairBuff)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, unsigned int* pPairOut, bt3DGrid3F1U* pAABB, unsigned int numBodies)
+{
+    int numThreads, numBlocks;
+    BT_GPU_PREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
+    BT_GPU_EXECKERNEL(numBlocks, numThreads, squeezeOverlappingPairBuffD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,(uint*)pPairOut,pAABB,numBodies));
+    BT_GPU_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD");
+} // squeezeOverlappingPairBuff()
+
+//------------------------------------------------------------------------------------------------
+
+} // extern "C"
+
+//------------------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------------
--- a/Engine/lib/bullet/src/BulletMultiThreaded/btGpu3DGridBroadphaseSharedDefs.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/btGpu3DGridBroadphaseSharedDefs.h
@ -0,0 +1,60 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+// Shared definitions for GPU-based 3D Grid collision detection broadphase
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//  Keep this file free from Bullet headers
+//  it is included into both CUDA and CPU code
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+//----------------------------------------------------------------------------------------
+
+#ifndef BTGPU3DGRIDBROADPHASESHAREDDEFS_H
+#define BTGPU3DGRIDBROADPHASESHAREDDEFS_H
+
+//----------------------------------------------------------------------------------------
+
+#include "btGpu3DGridBroadphaseSharedTypes.h"
+
+//----------------------------------------------------------------------------------------
+
+// Host-side entry points of the 3D grid broadphase. Each name is passed through
+// BT_GPU_PREF, so the actual symbol depends on how the including file defines that macro.
+// Each of these functions launches the kernel of the same name with a "D" suffix.
+extern "C"
+{
+
+//----------------------------------------------------------------------------------------
+
+// Launches calcHashAABBD over numBodies elements; `hash` is used as a uint2 array.
+void BT_GPU_PREF(calcHashAABB)(bt3DGrid3F1U* pAABB, unsigned int* hash,	unsigned int numBodies);
+
+// Fills cellStart (numCells entries) with 0xff bytes, then launches findCellStartD.
+void BT_GPU_PREF(findCellStart)(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells);
+
+// Launches findOverlappingPairsD: each body is tested against the bodies in its own
+// grid cell and the 26 neighbouring cells.
+void BT_GPU_PREF(findOverlappingPairs)(bt3DGrid3F1U* pAABB, unsigned int* pHash,	unsigned int* pCellStart, unsigned int*	pPairBuff, unsigned int*	pPairBuffStartCurr, unsigned int	numBodies);
+
+// Launches findPairsLargeD: each body is tested against the numLarge AABBs stored
+// behind the first numBodies bodies in pAABB.
+void BT_GPU_PREF(findPairsLarge)(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge);
+
+// Launches computePairCacheChangesD: per body, counts the pair entries without
+// BT_3DGRID_PAIR_FOUND_FLG into pPairScan[body + 1].
+void BT_GPU_PREF(computePairCacheChanges)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, bt3DGrid3F1U* pAABB, unsigned int numBodies);
+
+// Launches squeezeOverlappingPairBuffD: copies the entries without
+// BT_3DGRID_PAIR_FOUND_FLG to pPairOut and compacts each body's slice of pPairBuff.
+void BT_GPU_PREF(squeezeOverlappingPairBuff)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, unsigned int* pPairOut, bt3DGrid3F1U* pAABB, unsigned int numBodies);
+
+
+//----------------------------------------------------------------------------------------
+
+} // extern "C"
+
+//----------------------------------------------------------------------------------------
+
+#endif // BTGPU3DGRIDBROADPHASESHAREDDEFS_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/btGpu3DGridBroadphaseSharedTypes.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/btGpu3DGridBroadphaseSharedTypes.h
@ -0,0 +1,66 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+// Shared types and flag constants for GPU-based 3D Grid collision detection broadphase
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//  Keep this file free from Bullet headers
+//  it is included into both CUDA and CPU code
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+//----------------------------------------------------------------------------------------
+
+#ifndef BTGPU3DGRIDBROADPHASESHAREDTYPES_H
+#define BTGPU3DGRIDBROADPHASESHAREDTYPES_H
+
+//----------------------------------------------------------------------------------------
+
+// Flag bits stored in the upper part of a pair-buffer entry; the remaining bits hold
+// the handle index of the other body in the pair.
+//   FOUND - OR-ed into an already stored entry when its overlap is detected again
+//   NEW   - set on an entry that is appended for a newly detected overlap
+//   ANY   - mask of both flags; entry & ~ANY yields the bare handle index
+#define BT_3DGRID_PAIR_FOUND_FLG (0x40000000)
+#define BT_3DGRID_PAIR_NEW_FLG   (0x20000000)
+#define BT_3DGRID_PAIR_ANY_FLG   (BT_3DGRID_PAIR_FOUND_FLG | BT_3DGRID_PAIR_NEW_FLG)
+
+//----------------------------------------------------------------------------------------
+
+// Parameter block of the 3D grid broadphase.
+// NOTE(review): the code that fills and reads this struct is outside this file; the
+// member notes below are inferred from the member names only - confirm before relying
+// on them. Keep the member order and types unchanged: the struct is declared in a
+// header included into both CUDA and CPU code.
+struct bt3DGridBroadphaseParams 
+{
+	// presumably the number of grid cells along each axis
+	unsigned int	m_gridSizeX;
+	unsigned int	m_gridSizeY;
+	unsigned int	m_gridSizeZ;
+	// presumably the total number of grid cells
+	unsigned int	m_numCells;
+	// presumably the world-space position of the grid origin
+	float			m_worldOriginX;
+	float			m_worldOriginY;
+	float			m_worldOriginZ;
+	// presumably the world-space size of one grid cell along each axis
+	float			m_cellSizeX;
+	float			m_cellSizeY;
+	float			m_cellSizeZ;
+	// presumably the number of bodies handled by the grid
+	unsigned int	m_numBodies;
+	// presumably an upper bound on the bodies stored per cell
+	unsigned int	m_maxBodiesPerCell;
+};
+
+//----------------------------------------------------------------------------------------
+
+// Three floats plus one unsigned int. The broadphase kernels store each AABB as two
+// consecutive values of this type (min corner, then max corner): fx/fy/fz are the
+// corner coordinates, and the kernels read the body's handle index from the uw of the
+// min entry.
+struct bt3DGrid3F1U
+{
+	float			fx;
+	float			fy;
+	float			fz;
+	unsigned int	uw;
+};
+
+//----------------------------------------------------------------------------------------
+
+#endif // BTGPU3DGRIDBROADPHASESHAREDTYPES_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/btGpuDefines.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/btGpuDefines.h
@ -0,0 +1,211 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+// definitions for "GPU on CPU" code
+
+
+#ifndef BT_GPU_DEFINES_H
+#define BT_GPU_DEFINES_H
+
+// Plain-struct versions of the short vector types used by the shared kernel code, for
+// the "GPU on CPU" build. They have no constructors; use the bt3dGrid_make_* helpers
+// (or the BT_GPU_make_* macros) to build values.
+typedef unsigned int uint;
+
+// two signed ints
+struct int2
+{
+	int x, y;
+};
+
+// two unsigned ints
+struct uint2
+{
+	unsigned int x, y;
+};
+
+// three signed ints
+struct int3
+{
+	int x, y, z;
+};
+
+// three unsigned ints
+struct uint3
+{
+	unsigned int x, y, z;
+};
+
+// four floats
+struct float4
+{
+	float x, y, z, w;
+};
+
+// three floats
+struct float3
+{
+	float x, y, z;
+};
+
+
+// CPU definitions of the qualifier and intrinsic macros used by the shared kernel code.
+// Device and kernel functions become ordinary inline functions.
+#define BT_GPU___device__ inline
+// data qualifiers that expand to nothing on the CPU
+#define BT_GPU___devdata__
+#define BT_GPU___constant__
+// NOTE: the arguments of max/min are evaluated twice - do not pass expressions with side effects
+#define BT_GPU_max(a, b) ((a) > (b) ? (a) : (b))
+#define BT_GPU_min(a, b) ((a) < (b) ? (a) : (b))
+// name of the broadphase parameter object referenced by the shared code
+#define BT_GPU_params s3DGridBroadphaseParams
+// plain multiplication on the CPU
+#define BT_GPU___mul24(a, b) ((a)*(b))
+#define BT_GPU___global__ inline
+// "shared" variables become function-local statics
+#define BT_GPU___shared__ static
+// expands to nothing on the CPU
+#define BT_GPU___syncthreads()
+#define CUDART_PI_F SIMD_PI
+
+// Value builders for the plain vector structs. Each BT_GPU_make_* macro forwards to the
+// bt3dGrid_make_* function of the same suffix.
+
+// uint2 from two components
+static inline uint2 bt3dGrid_make_uint2(unsigned int x, unsigned int y)
+{
+	uint2 result = { x, y };
+	return result;
+}
+#define BT_GPU_make_uint2(x, y) bt3dGrid_make_uint2(x, y)
+
+// int3 from three components
+static inline int3 bt3dGrid_make_int3(int x, int y, int z)
+{
+	int3 result = { x, y, z };
+	return result;
+}
+#define BT_GPU_make_int3(x, y, z) bt3dGrid_make_int3(x, y, z)
+
+// float3 from three components
+static inline float3 bt3dGrid_make_float3(float x, float y, float z)
+{
+	float3 result = { x, y, z };
+	return result;
+}
+#define BT_GPU_make_float3(x, y, z) bt3dGrid_make_float3(x, y, z)
+
+// float3 from the x, y, z of a float4 (w is dropped)
+static inline float3 bt3dGrid_make_float34(float4 f)
+{
+	float3 result = { f.x, f.y, f.z };
+	return result;
+}
+#define BT_GPU_make_float34(f) bt3dGrid_make_float34(f)
+
+// float3 with all three components set to f
+static inline float3 bt3dGrid_make_float31(float f)
+{
+	float3 result = { f, f, f };
+	return result;
+}
+#define BT_GPU_make_float31(x) bt3dGrid_make_float31(x)
+
+// float4 from a float3 (x, y, z) and a separate w
+static inline float4 bt3dGrid_make_float42(float3 v, float f)
+{
+	float4 result = { v.x, v.y, v.z, f };
+	return result;
+}
+#define BT_GPU_make_float42(a, b) bt3dGrid_make_float42(a, b) 
+
+// float4 from four components
+static inline float4 bt3dGrid_make_float44(float a, float b, float c, float d)
+{
+	float4 result = { a, b, c, d };
+	return result;
+}
+#define BT_GPU_make_float44(a, b, c, d) bt3dGrid_make_float44(a, b, c, d) 
+
+// Component-wise sum of two int3 values.
+inline int3 operator+(int3 a, int3 b)
+{
+	int3 sum;
+	sum.x = a.x + b.x;
+	sum.y = a.y + b.y;
+	sum.z = a.z + b.z;
+	return sum;
+}
+
+// Component-wise sum of two float4 values.
+inline float4 operator+(const float4& a, const float4& b)
+{
+	float4 r; r.x = a.x+b.x; r.y = a.y+b.y; r.z = a.z+b.z; r.w = a.w+b.w; return r;
+}
+// float4 scaled by a scalar (vector on the left).
+inline float4 operator*(const float4& a, float fact)
+{
+	float4 r; r.x = a.x*fact; r.y = a.y*fact; r.z = a.z*fact; r.w = a.w*fact; return r;
+}
+// float4 scaled by a scalar (scalar on the left). The vector is taken by const
+// reference so that const objects and temporaries are accepted as well.
+inline float4 operator*(float fact, const float4& a)
+{
+	return (a * fact);
+}
+// In-place scaling; returns a reference to the modified operand.
+inline float4& operator*=(float4& a, float fact)
+{
+	a = a * fact;
+	return a;
+}
+// In-place component-wise addition; returns a reference to the modified operand.
+inline float4& operator+=(float4& a, const float4& b)
+{
+	a = a + b;
+	return a;
+}
+
+// Component-wise sum of two float3 values.
+inline float3 operator+(const float3& a, const float3& b)
+{
+	float3 sum = { a.x+b.x, a.y+b.y, a.z+b.z };
+	return sum;
+}
+// Component-wise difference of two float3 values.
+inline float3 operator-(const float3& a, const float3& b)
+{
+	float3 diff = { a.x-b.x, a.y-b.y, a.z-b.z };
+	return diff;
+}
+// Dot product of two float3 values. The operands are only read, so they are taken by
+// const reference (as bt3dGrid_cross does); const objects and temporaries are accepted.
+static inline float bt3dGrid_dot(const float3& a, const float3& b)
+{
+	return a.x*b.x+a.y*b.y+a.z*b.z;
+}
+#define BT_GPU_dot(a,b) bt3dGrid_dot(a,b)
+
+// Dot product of two float4 values over all four components; operands are const
+// references for the same reason as in bt3dGrid_dot.
+static inline float bt3dGrid_dot4(const float4& a, const float4& b)
+{
+	return a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
+}
+#define BT_GPU_dot4(a,b) bt3dGrid_dot4(a,b)
+
+// Cross product a x b of two float3 values.
+static inline float3 bt3dGrid_cross(const float3& a, const float3& b)
+{
+	float3 c;
+	c.x = a.y*b.z-a.z*b.y;
+	c.y = -a.x*b.z+a.z*b.x;
+	c.z = a.x*b.y-a.y*b.x;
+	return c;
+}
+#define BT_GPU_cross(a,b) bt3dGrid_cross(a,b)
+
+
+// float3 scaled by a scalar.
+inline float3 operator*(const float3& a, float fact)
+{
+	float3 scaled = { a.x*fact, a.y*fact, a.z*fact };
+	return scaled;
+}
+
+
+// In-place component-wise addition; returns a reference to the modified operand.
+inline float3& operator+=(float3& a, const float3& b)
+{
+	float3 sum = a + b;
+	a = sum;
+	return a;
+}
+// In-place component-wise subtraction; returns a reference to the modified operand.
+inline float3& operator-=(float3& a, const float3& b)
+{
+	float3 diff = a - b;
+	a = diff;
+	return a;
+}
+// In-place scaling; returns a reference to the modified operand.
+inline float3& operator*=(float3& a, float fact)
+{
+	float3 scaled = a * fact;
+	a = scaled;
+	return a;
+}
+// Negation of every component.
+inline float3 operator-(const float3& v)
+{
+	float3 neg = { -v.x, -v.y, -v.z };
+	return neg;
+}
+
+
+// CPU definitions of the memory-access and launch macros used by the shared code.
+
+// fetches are plain array indexing
+#define BT_GPU_FETCH(a, b) a[b]
+#define BT_GPU_FETCH4(a, b) a[b]
+// prefix pasted onto the names of the host-side entry points
+#define BT_GPU_PREF(func) btGpu_##func
+// the wrapped call is executed as is; its result is not checked
+#define BT_GPU_SAFE_CALL(func) func
+#define BT_GPU_Memset memset
+#define BT_GPU_MemcpyToSymbol(a, b, c) memcpy(a, b, c)
+// texture binding expands to nothing on the CPU
+#define BT_GPU_BindTexture(a, b, c, d)
+#define BT_GPU_UnbindTexture(a)
+
+// Emulated launch state read by the kernels through the macros below. The variables
+// are static, so every translation unit including this header gets its own copy.
+static uint2 s_blockIdx, s_blockDim, s_threadIdx;
+#define BT_GPU_blockIdx s_blockIdx
+#define BT_GPU_blockDim s_blockDim
+#define BT_GPU_threadIdx s_threadIdx
+// Emulates a kernel launch with two nested loops: for every block index nb in
+// [0, numb) and thread index nt in [0, numt) the .x members of the launch state are
+// set and kfunc is called with args (args must be a parenthesised argument list).
+// Only the .x members are written; the .y members are never touched.
+#define BT_GPU_EXECKERNEL(numb, numt, kfunc, args) {s_blockDim.x=numt;for(int nb=0;nb<numb;nb++){s_blockIdx.x=nb;for(int nt=0;nt<numt;nt++){s_threadIdx.x=nt;kfunc args;}}}
+
+// error checking expands to nothing on the CPU
+#define BT_GPU_CHECK_ERROR(s)
+
+
+#endif //BT_GPU_DEFINES_H
--- a/Engine/lib/bullet/src/BulletMultiThreaded/btGpuUtilsSharedCode.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/btGpuUtilsSharedCode.h
@ -0,0 +1,54 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+// Shared code for GPU-based utilities
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//  Keep this file free from Bullet headers
+//  will be compiled by both CPU and CUDA compilers
+//	file with definitions of BT_GPU_xxx should be included first
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+//----------------------------------------------------------------------------------------
+
+#include "btGpuUtilsSharedDefs.h"
+
+//----------------------------------------------------------------------------------------
+
+extern "C"
+{
+
+//----------------------------------------------------------------------------------------
+
+// Integer division of a by b, bumped up by one whenever the division leaves a
+// remainder. b must not be zero.
+int BT_GPU_PREF(iDivUp)(int a, int b)
+{
+	int quotient = a / b;
+	if(a % b != 0)
+	{
+		quotient++;
+	}
+	return quotient;
+} // iDivUp()
+
+//----------------------------------------------------------------------------------------
+
+// Compute grid and thread block size for a given number of elements.
+//   n          - number of elements to process
+//   blockSize  - maximum number of threads per block
+//   numBlocks  - out: number of blocks needed to cover n elements
+//   numThreads - out: threads per block, min(blockSize, n)
+// When there is nothing to launch (n or blockSize is zero or negative) both outputs
+// are set to 0 instead of dividing by zero in iDivUp.
+void BT_GPU_PREF(computeGridSize)(int n, int blockSize, int &numBlocks, int &numThreads)
+{
+    numThreads = BT_GPU_min(blockSize, n);
+    if(numThreads <= 0)
+    {
+        numThreads = 0;
+        numBlocks = 0;
+        return;
+    }
+    numBlocks = BT_GPU_PREF(iDivUp)(n, numThreads);
+} // computeGridSize()
+
+//----------------------------------------------------------------------------------------
+
+} // extern "C"
--- a/Engine/lib/bullet/src/BulletMultiThreaded/btGpuUtilsSharedDefs.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/btGpuUtilsSharedDefs.h
@ -0,0 +1,59 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+// Shared definitions for GPU-based utilities
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//  Keep this file free from Bullet headers
+//  it is included into both CUDA and CPU code
+//	file with definitions of BT_GPU_xxx should be included first
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+
+
+#ifndef BTGPUUTILSDHAREDDEFS_H
+#define BTGPUUTILSDHAREDDEFS_H
+
+
+
+// Utility entry points shared by the GPU and CPU builds. Each name is passed through
+// BT_GPU_PREF, which must be defined by a header included before this one.
+extern "C"
+{
+
+
+
+//Round a / b to nearest higher integer value
+int BT_GPU_PREF(iDivUp)(int a, int b);
+
+// compute grid and thread block size for a given number of elements
+void BT_GPU_PREF(computeGridSize)(int n, int blockSize, int &numBlocks, int &numThreads);
+
+// NOTE(review): the four functions below are only declared here; going by their names
+// they allocate, free and copy device-side arrays - confirm units of `size` (presumably
+// bytes) against the implementation.
+void BT_GPU_PREF(allocateArray)(void** devPtr, unsigned int size);
+void BT_GPU_PREF(freeArray)(void* devPtr);
+void BT_GPU_PREF(copyArrayFromDevice)(void* host, const void* device, unsigned int size);
+void BT_GPU_PREF(copyArrayToDevice)(void* device, const void* host, unsigned int size);
+
+
+
+
+
+
+} // extern "C"
+
+
+
+#endif // BTGPUUTILSDHAREDDEFS_H
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/btThreadSupportInterface.cpp
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/btThreadSupportInterface.cpp
@ -0,0 +1,22 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "btThreadSupportInterface.h"
+
+// Out-of-line definition of the interface's virtual destructor; there is nothing to release.
+btThreadSupportInterface::~btThreadSupportInterface() {}
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/btThreadSupportInterface.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/btThreadSupportInterface.h
@ -0,0 +1,50 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef THREAD_SUPPORT_INTERFACE_H
+#define THREAD_SUPPORT_INTERFACE_H
+
+
+//#include <LinearMath/btScalar.h> //for uint32_t etc.
+#include "PlatformDefinitions.h"
+#include "PpuAddressSpace.h"
+
+///Abstract interface to a pool of worker tasks ("SPUs" in the comments below).
+///All operations are pure virtual; the destructor is virtual and defined in
+///btThreadSupportInterface.cpp.
+class btThreadSupportInterface
+{
+public:
+
+	virtual ~btThreadSupportInterface();
+
+///send messages to SPUs: a command code plus two arguments
+	virtual void sendRequest(uint32_t uiCommand, ppu_address_t uiArgument0, uint32_t uiArgument1) =0;
+
+///check for messages from SPUs; results are returned through the two pointer arguments
+	virtual	void waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1) =0;
+
+///start the spus (can be called at the beginning of each frame, to make sure that the right SPU program is loaded)
+	virtual	void startSPU() =0;
+
+///tell the task scheduler we are done with the SPU tasks
+	virtual	void stopSPU()=0;
+
+	///tell the task scheduler to use no more than numTasks tasks
+	virtual void	setNumTasks(int numTasks)=0;
+
+	///number of tasks; presumably the limit set through setNumTasks - confirm in the implementations
+	virtual int		getNumTasks() const = 0;
+
+};
+
+#endif //THREAD_SUPPORT_INTERFACE_H
+
--- a/Engine/lib/bullet/src/BulletMultiThreaded/vectormath/scalar/cpp/mat_aos.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/vectormath/scalar/cpp/mat_aos.h
--- a/Engine/lib/bullet/src/BulletMultiThreaded/vectormath/scalar/cpp/quat_aos.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/vectormath/scalar/cpp/quat_aos.h
@ -0,0 +1,432 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _VECTORMATH_QUAT_AOS_CPP_H
+#define _VECTORMATH_QUAT_AOS_CPP_H
+//-----------------------------------------------------------------------------
+// Definitions
+
+#ifndef _VECTORMATH_INTERNAL_FUNCTIONS
+#define _VECTORMATH_INTERNAL_FUNCTIONS
+
+#endif
+
+namespace Vectormath {
+namespace Aos {
+
+// Copy constructor: duplicates all four components of quat.
+inline Quat::Quat( const Quat & quat )
+    : mX( quat.mX ),
+      mY( quat.mY ),
+      mZ( quat.mZ ),
+      mW( quat.mW )
+{
+}
+
+// Component-wise constructor: stores _x, _y, _z, _w as given.
+inline Quat::Quat( float _x, float _y, float _z, float _w )
+    : mX( _x ),
+      mY( _y ),
+      mZ( _z ),
+      mW( _w )
+{
+}
+
+// Builds a quaternion from a 3-D vector (x, y, z) and a separate w value.
+inline Quat::Quat( const Vector3 & xyz, float _w )
+    : mX( xyz.getX() ),
+      mY( xyz.getY() ),
+      mZ( xyz.getZ() ),
+      mW( _w )
+{
+}
+
+// Builds a quaternion by copying the x, y, z, w elements of a 4-D vector.
+inline Quat::Quat( const Vector4 & vec )
+    : mX( vec.getX() ),
+      mY( vec.getY() ),
+      mZ( vec.getZ() ),
+      mW( vec.getW() )
+{
+}
+
+// Broadcast constructor: every component is set to the same scalar.
+inline Quat::Quat( float scalar )
+    : mX( scalar ),
+      mY( scalar ),
+      mZ( scalar ),
+      mW( scalar )
+{
+}
+
+// Returns the identity quaternion ( 0, 0, 0, 1 ).
+inline const Quat Quat::identity( )
+{
+    const Quat unit( 0.0f, 0.0f, 0.0f, 1.0f );
+    return unit;
+}
+
+// Linear interpolation: quat0 + ( quat1 - quat0 ) * t.
+// The result is not renormalized.
+inline const Quat lerp( float t, const Quat & quat0, const Quat & quat1 )
+{
+    const Quat delta = ( quat1 - quat0 );
+    return ( quat0 + ( delta * t ) );
+}
+
+// Spherical linear interpolation between two quaternions.
+// When the 4-D dot product is negative, unitQuat0 is negated first so that
+// the interpolation is done with a non-negative cosine. When that cosine is
+// not below _VECTORMATH_SLERP_TOL, plain linear weights ( 1 - t, t ) are used
+// instead of the sine-based weights.
+inline const Quat slerp( float t, const Quat & unitQuat0, const Quat & unitQuat1 )
+{
+    float cosAngle = dot( unitQuat0, unitQuat1 );
+    const bool negateStart = ( cosAngle < 0.0f );
+    if ( negateStart ) {
+        cosAngle = -cosAngle;
+    }
+    const Quat start = negateStart ? ( -unitQuat0 ) : unitQuat0;
+
+    // Default to the linear weights; replaced below when the sine form applies.
+    float scale0 = ( 1.0f - t );
+    float scale1 = t;
+    if ( cosAngle < _VECTORMATH_SLERP_TOL ) {
+        const float angle = acosf( cosAngle );
+        const float recipSinAngle = ( 1.0f / sinf( angle ) );
+        scale0 = ( sinf( ( ( 1.0f - t ) * angle ) ) * recipSinAngle );
+        scale1 = ( sinf( ( t * angle ) ) * recipSinAngle );
+    }
+    return ( ( start * scale0 ) + ( unitQuat1 * scale1 ) );
+}
+
+// Spherical quadrangle interpolation: slerps the outer pair ( 0, 3 ) and the
+// inner pair ( 1, 2 ) at t, then slerps between those two results with
+// weight 2 * t * ( 1 - t ).
+inline const Quat squad( float t, const Quat & unitQuat0, const Quat & unitQuat1, const Quat & unitQuat2, const Quat & unitQuat3 )
+{
+    const Quat outer = slerp( t, unitQuat0, unitQuat3 );
+    const Quat inner = slerp( t, unitQuat1, unitQuat2 );
+    const float blend = ( ( 2.0f * t ) * ( 1.0f - t ) );
+    return slerp( blend, outer, inner );
+}
+
+// Assignment: copies all four components from quat and returns *this.
+inline Quat & Quat::operator =( const Quat & quat )
+{
+    this->mX = quat.mX;
+    this->mY = quat.mY;
+    this->mZ = quat.mZ;
+    this->mW = quat.mW;
+    return *this;
+}
+
+// Overwrites x, y and z from vec; w is left untouched. Returns *this.
+inline Quat & Quat::setXYZ( const Vector3 & vec )
+{
+    this->mX = vec.getX();
+    this->mY = vec.getY();
+    this->mZ = vec.getZ();
+    return *this;
+}
+
+// Returns the x, y, z components as a 3-D vector.
+inline const Vector3 Quat::getXYZ( ) const
+{
+    const Vector3 xyz( mX, mY, mZ );
+    return xyz;
+}
+
+// Sets the x component; returns *this.
+inline Quat & Quat::setX( float _x )
+{
+    this->mX = _x;
+    return *this;
+}
+
+// Returns the x component.
+inline float Quat::getX( ) const
+{
+    return this->mX;
+}
+
+// Sets the y component; returns *this.
+inline Quat & Quat::setY( float _y )
+{
+    this->mY = _y;
+    return *this;
+}
+
+// Returns the y component.
+inline float Quat::getY( ) const
+{
+    return this->mY;
+}
+
+// Sets the z component; returns *this.
+inline Quat & Quat::setZ( float _z )
+{
+    this->mZ = _z;
+    return *this;
+}
+
+// Returns the z component.
+inline float Quat::getZ( ) const
+{
+    return this->mZ;
+}
+
+// Sets the w component; returns *this.
+inline Quat & Quat::setW( float _w )
+{
+    this->mW = _w;
+    return *this;
+}
+
+// Returns the w component.
+inline float Quat::getW( ) const
+{
+    return this->mW;
+}
+
+// Writes value to the element idx floats past mX; returns *this.
+// No range check is performed on idx.
+// NOTE(review): presumably relies on mX, mY, mZ, mW being laid out contiguously - confirm against the class declaration.
+inline Quat & Quat::setElem( int idx, float value )
+{
+    ( &mX )[ idx ] = value;
+    return *this;
+}
+
+// Reads the element idx floats past mX. No range check is performed on idx.
+inline float Quat::getElem( int idx ) const
+{
+    return ( &mX )[ idx ];
+}
+
+// Mutable subscript: reference to the element idx floats past mX.
+// No range check is performed on idx.
+inline float & Quat::operator []( int idx )
+{
+    return ( &mX )[ idx ];
+}
+
+// Read-only subscript: value of the element idx floats past mX.
+// No range check is performed on idx.
+inline float Quat::operator []( int idx ) const
+{
+    return ( &mX )[ idx ];
+}
+
+// Component-wise sum of *this and quat.
+inline const Quat Quat::operator +( const Quat & quat ) const
+{
+    const float sumX = ( mX + quat.mX );
+    const float sumY = ( mY + quat.mY );
+    const float sumZ = ( mZ + quat.mZ );
+    const float sumW = ( mW + quat.mW );
+    return Quat( sumX, sumY, sumZ, sumW );
+}
+
+// Component-wise difference ( *this - quat ).
+inline const Quat Quat::operator -( const Quat & quat ) const
+{
+    const float diffX = ( mX - quat.mX );
+    const float diffY = ( mY - quat.mY );
+    const float diffZ = ( mZ - quat.mZ );
+    const float diffW = ( mW - quat.mW );
+    return Quat( diffX, diffY, diffZ, diffW );
+}
+
+// Multiplies every component by scalar.
+inline const Quat Quat::operator *( float scalar ) const
+{
+    const float scaledX = ( mX * scalar );
+    const float scaledY = ( mY * scalar );
+    const float scaledZ = ( mZ * scalar );
+    const float scaledW = ( mW * scalar );
+    return Quat( scaledX, scaledY, scaledZ, scaledW );
+}
+
+// In-place component-wise addition; returns *this.
+inline Quat & Quat::operator +=( const Quat & quat )
+{
+    mX += quat.mX;
+    mY += quat.mY;
+    mZ += quat.mZ;
+    mW += quat.mW;
+    return *this;
+}
+
+// In-place component-wise subtraction; returns *this.
+inline Quat & Quat::operator -=( const Quat & quat )
+{
+    mX -= quat.mX;
+    mY -= quat.mY;
+    mZ -= quat.mZ;
+    mW -= quat.mW;
+    return *this;
+}
+
+// In-place multiplication of every component by scalar; returns *this.
+inline Quat & Quat::operator *=( float scalar )
+{
+    mX *= scalar;
+    mY *= scalar;
+    mZ *= scalar;
+    mW *= scalar;
+    return *this;
+}
+
+// Divides every component by scalar. No check is made for a zero divisor.
+inline const Quat Quat::operator /( float scalar ) const
+{
+    const float quotX = ( mX / scalar );
+    const float quotY = ( mY / scalar );
+    const float quotZ = ( mZ / scalar );
+    const float quotW = ( mW / scalar );
+    return Quat( quotX, quotY, quotZ, quotW );
+}
+
+// In-place division of every component by scalar; returns *this.
+// No check is made for a zero divisor.
+inline Quat & Quat::operator /=( float scalar )
+{
+    mX /= scalar;
+    mY /= scalar;
+    mZ /= scalar;
+    mW /= scalar;
+    return *this;
+}
+
+// Unary minus: negates all four components.
+inline const Quat Quat::operator -( ) const
+{
+    return Quat( -mX, -mY, -mZ, -mW );
+}
+
+// Scalar-on-the-left multiplication; forwards to Quat::operator *( float ).
+inline const Quat operator *( float scalar, const Quat & quat )
+{
+    return quat.operator *( scalar );
+}
+
+// 4-D dot product of two quaternions, accumulated in x, y, z, w order.
+inline float dot( const Quat & quat0, const Quat & quat1 )
+{
+    float sum = ( quat0.getX() * quat1.getX() );
+    sum = ( sum + ( quat0.getY() * quat1.getY() ) );
+    sum = ( sum + ( quat0.getZ() * quat1.getZ() ) );
+    sum = ( sum + ( quat0.getW() * quat1.getW() ) );
+    return sum;
+}
+
+// Sum of the squared components (the squared length) of quat,
+// accumulated in x, y, z, w order.
+inline float norm( const Quat & quat )
+{
+    float sumSqr = ( quat.getX() * quat.getX() );
+    sumSqr = ( sumSqr + ( quat.getY() * quat.getY() ) );
+    sumSqr = ( sumSqr + ( quat.getZ() * quat.getZ() ) );
+    sumSqr = ( sumSqr + ( quat.getW() * quat.getW() ) );
+    return sumSqr;
+}
+
+// Length of quat: square root of norm( quat ).
+inline float length( const Quat & quat )
+{
+    const float lenSqr = norm( quat );
+    return sqrtf( lenSqr );
+}
+
+// Scales quat by the reciprocal of its length.
+// No guard is applied for a zero-length input.
+inline const Quat normalize( const Quat & quat )
+{
+    const float lenInv = ( 1.0f / sqrtf( norm( quat ) ) );
+    const float unitX = ( quat.getX() * lenInv );
+    const float unitY = ( quat.getY() * lenInv );
+    const float unitZ = ( quat.getZ() * lenInv );
+    const float unitW = ( quat.getW() * lenInv );
+    return Quat( unitX, unitY, unitZ, unitW );
+}
+
+// Builds the quaternion that rotates unitVec0 onto unitVec1.
+//
+// General case: with c2 = 2 * cos( halfAngle ) = sqrt( 2 * ( 1 + dot ) ), the
+// result is ( cross( unitVec0, unitVec1 ) / c2, c2 / 2 ).
+//
+// Degenerate case: when the inputs are (nearly) opposite, 2 * ( 1 + dot ) is
+// zero or, through rounding, slightly negative. The general formula would
+// then divide by zero or take the square root of a negative number and yield
+// NaN/inf components. In that case a half-turn about an axis perpendicular
+// to unitVec0 is returned instead; the choice of axis is arbitrary, since
+// any perpendicular axis maps unitVec0 onto -unitVec0.
+inline const Quat Quat::rotation( const Vector3 & unitVec0, const Vector3 & unitVec1 )
+{
+    const float cosHalfAngleX2Sqr = ( 2.0f * ( 1.0f + dot( unitVec0, unitVec1 ) ) );
+    if ( cosHalfAngleX2Sqr < 1.0e-6f ) {
+        // Cross unitVec0 with the coordinate axis it is least aligned with,
+        // so that the cross product is well away from zero length.
+        const float xx = ( unitVec0.getX() * unitVec0.getX() );
+        const float yy = ( unitVec0.getY() * unitVec0.getY() );
+        const float zz = ( unitVec0.getZ() * unitVec0.getZ() );
+        const bool useX = ( ( xx <= yy ) && ( xx <= zz ) );
+        const bool useY = ( !useX && ( yy <= zz ) );
+        const Vector3 axisHint(
+            useX ? 1.0f : 0.0f,
+            useY ? 1.0f : 0.0f,
+            ( useX || useY ) ? 0.0f : 1.0f
+        );
+        const Vector3 perp = cross( unitVec0, axisHint );
+        const float recipLen = ( 1.0f / sqrtf( dot( perp, perp ) ) );
+        // Half-turn: sin( pi / 2 ) = 1 scales the unit axis, cos( pi / 2 ) = 0 is w.
+        return Quat( ( perp * recipLen ), 0.0f );
+    }
+    const float cosHalfAngleX2 = sqrtf( cosHalfAngleX2Sqr );
+    const float recipCosHalfAngleX2 = ( 1.0f / cosHalfAngleX2 );
+    return Quat( ( cross( unitVec0, unitVec1 ) * recipCosHalfAngleX2 ), ( cosHalfAngleX2 * 0.5f ) );
+}
+
+// Axis-angle constructor: ( unitVec * sin( radians / 2 ), cos( radians / 2 ) ).
+inline const Quat Quat::rotation( float radians, const Vector3 & unitVec )
+{
+    const float halfAngle = ( radians * 0.5f );
+    const float sinHalf = sinf( halfAngle );
+    const float cosHalf = cosf( halfAngle );
+    return Quat( ( unitVec * sinHalf ), cosHalf );
+}
+
+// Rotation about the x axis: ( sin( radians / 2 ), 0, 0, cos( radians / 2 ) ).
+inline const Quat Quat::rotationX( float radians )
+{
+    const float halfAngle = ( radians * 0.5f );
+    const float sinHalf = sinf( halfAngle );
+    const float cosHalf = cosf( halfAngle );
+    return Quat( sinHalf, 0.0f, 0.0f, cosHalf );
+}
+
+// Rotation about the y axis: ( 0, sin( radians / 2 ), 0, cos( radians / 2 ) ).
+inline const Quat Quat::rotationY( float radians )
+{
+    const float halfAngle = ( radians * 0.5f );
+    const float sinHalf = sinf( halfAngle );
+    const float cosHalf = cosf( halfAngle );
+    return Quat( 0.0f, sinHalf, 0.0f, cosHalf );
+}
+
+// Rotation about the z axis: ( 0, 0, sin( radians / 2 ), cos( radians / 2 ) ).
+inline const Quat Quat::rotationZ( float radians )
+{
+    const float halfAngle = ( radians * 0.5f );
+    const float sinHalf = sinf( halfAngle );
+    const float cosHalf = cosf( halfAngle );
+    return Quat( 0.0f, 0.0f, sinHalf, cosHalf );
+}
+
+// Quaternion product ( *this ) * quat. Term order inside each component is
+// kept fixed so that floating-point results do not change.
+inline const Quat Quat::operator *( const Quat & quat ) const
+{
+    const float prodX = ( ( ( ( mW * quat.mX ) + ( mX * quat.mW ) ) + ( mY * quat.mZ ) ) - ( mZ * quat.mY ) );
+    const float prodY = ( ( ( ( mW * quat.mY ) + ( mY * quat.mW ) ) + ( mZ * quat.mX ) ) - ( mX * quat.mZ ) );
+    const float prodZ = ( ( ( ( mW * quat.mZ ) + ( mZ * quat.mW ) ) + ( mX * quat.mY ) ) - ( mY * quat.mX ) );
+    const float prodW = ( ( ( ( mW * quat.mW ) - ( mX * quat.mX ) ) - ( mY * quat.mY ) ) - ( mZ * quat.mZ ) );
+    return Quat( prodX, prodY, prodZ, prodW );
+}
+
+// In-place quaternion product: *this becomes ( *this ) * quat.
+// The product is computed into a temporary first because every output
+// component depends on all of the input components.
+inline Quat & Quat::operator *=( const Quat & quat )
+{
+    const Quat product = ( ( *this ) * quat );
+    return ( *this = product );
+}
+
+// Rotates vec by quat in two stages: the first computes the four
+// intermediate terms tX, tY, tZ, tW from quat and vec, the second combines
+// them with the components of quat again to form the rotated vector.
+// Operation order inside each expression is kept fixed so that
+// floating-point results do not change.
+inline const Vector3 rotate( const Quat & quat, const Vector3 & vec )
+{
+    const float qX = quat.getX();
+    const float qY = quat.getY();
+    const float qZ = quat.getZ();
+    const float qW = quat.getW();
+    const float vX = vec.getX();
+    const float vY = vec.getY();
+    const float vZ = vec.getZ();
+
+    const float tX = ( ( ( qW * vX ) + ( qY * vZ ) ) - ( qZ * vY ) );
+    const float tY = ( ( ( qW * vY ) + ( qZ * vX ) ) - ( qX * vZ ) );
+    const float tZ = ( ( ( qW * vZ ) + ( qX * vY ) ) - ( qY * vX ) );
+    const float tW = ( ( ( qX * vX ) + ( qY * vY ) ) + ( qZ * vZ ) );
+
+    return Vector3(
+        ( ( ( ( tW * qX ) + ( tX * qW ) ) - ( tY * qZ ) ) + ( tZ * qY ) ),
+        ( ( ( ( tW * qY ) + ( tY * qW ) ) - ( tZ * qX ) ) + ( tX * qZ ) ),
+        ( ( ( ( tW * qZ ) + ( tZ * qW ) ) - ( tX * qY ) ) + ( tY * qX ) )
+    );
+}
+
+// Conjugate: negates x, y and z and keeps w.
+inline const Quat conj( const Quat & quat )
+{
+    const float negX = -quat.getX();
+    const float negY = -quat.getY();
+    const float negZ = -quat.getZ();
+    return Quat( negX, negY, negZ, quat.getW() );
+}
+
+// Returns a copy of quat1 when select1 is true, otherwise a copy of quat0.
+inline const Quat select( const Quat & quat0, const Quat & quat1, bool select1 )
+{
+    if ( select1 ) {
+        return Quat( quat1.getX(), quat1.getY(), quat1.getZ(), quat1.getW() );
+    }
+    return Quat( quat0.getX(), quat0.getY(), quat0.getZ(), quat0.getW() );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+// Debug helper: prints the four components of quat on one line via printf.
+inline void print( const Quat & quat )
+{
+    const float x = quat.getX();
+    const float y = quat.getY();
+    const float z = quat.getZ();
+    const float w = quat.getW();
+    printf( "( %f %f %f %f )\n", x, y, z, w );
+}
+
+// Debug helper: prints name followed by the four components of quat via printf.
+inline void print( const Quat & quat, const char * name )
+{
+    const float x = quat.getX();
+    const float y = quat.getY();
+    const float z = quat.getZ();
+    const float w = quat.getW();
+    printf( "%s: ( %f %f %f %f )\n", name, x, y, z, w );
+}
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
--- a/Engine/lib/bullet/src/BulletMultiThreaded/vectormath/scalar/cpp/vec_aos.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/vectormath/scalar/cpp/vec_aos.h
--- a/Engine/lib/bullet/src/BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h
--- a/Engine/lib/bullet/src/BulletMultiThreaded/vectormath2bullet.h
+++ b/Engine/lib/bullet/src/BulletMultiThreaded/vectormath2bullet.h
@ -0,0 +1,75 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef AOS_VECTORMATH_BULLET_CONVERT_H
+#define AOS_VECTORMATH_BULLET_CONVERT_H
+
+#include <vectormath_aos.h>
+//#include "BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h"
+
+#include "LinearMath/btVector3.h"
+#include "LinearMath/btQuaternion.h"
+#include "LinearMath/btMatrix3x3.h"
+
+/// Converts a Bullet btVector3 into a Vectormath Vector3 by copying x, y and z.
+inline Vectormath::Aos::Vector3	getVmVector3(const btVector3& bulletVec)
+{
+	Vectormath::Aos::Vector3 vmVec(
+		bulletVec.getX(),
+		bulletVec.getY(),
+		bulletVec.getZ());
+	return vmVec;
+}
+
+/// Converts a Vectormath Vector3 into a Bullet btVector3 by copying x, y and z.
+inline btVector3 getBtVector3(const Vectormath::Aos::Vector3& vmVec)
+{
+	btVector3 bulletVec(
+		vmVec.getX(),
+		vmVec.getY(),
+		vmVec.getZ());
+	return bulletVec;
+}
+/// Converts a Vectormath Point3 into a Bullet btVector3 by copying x, y and z.
+inline btVector3 getBtVector3(const Vectormath::Aos::Point3& vmVec)
+{
+	btVector3 bulletVec(
+		vmVec.getX(),
+		vmVec.getY(),
+		vmVec.getZ());
+	return bulletVec;
+}
+
+/// Converts a Bullet btQuaternion into a Vectormath Quat by copying x, y, z and w.
+inline Vectormath::Aos::Quat	getVmQuat(const btQuaternion& bulletQuat)
+{
+	return Vectormath::Aos::Quat(
+		bulletQuat.getX(),
+		bulletQuat.getY(),
+		bulletQuat.getZ(),
+		bulletQuat.getW());
+}
+
+/// Converts a Vectormath Quat into a Bullet btQuaternion by copying x, y, z and w.
+inline btQuaternion	getBtQuat(const Vectormath::Aos::Quat& vmQuat)
+{
+	btQuaternion bulletQuat(
+		vmQuat.getX(),
+		vmQuat.getY(),
+		vmQuat.getZ(),
+		vmQuat.getW());
+	return bulletQuat;
+}
+
+/// Converts a Bullet btMatrix3x3 into a Vectormath Matrix3, passing the three
+/// Bullet columns (0, 1, 2) to the Matrix3 constructor in that order.
+inline Vectormath::Aos::Matrix3	getVmMatrix3(const btMatrix3x3& btMat)
+{
+	const Vectormath::Aos::Vector3 col0 = getVmVector3(btMat.getColumn(0));
+	const Vectormath::Aos::Vector3 col1 = getVmVector3(btMat.getColumn(1));
+	const Vectormath::Aos::Vector3 col2 = getVmVector3(btMat.getColumn(2));
+	return Vectormath::Aos::Matrix3(col0, col1, col2);
+}
+
+
+#endif //AOS_VECTORMATH_BULLET_CONVERT_H
				`@ -0,0 +1 @@`
				`Empty placeholder for future Libspe2 SPU task`