Bullet 2.82 update

This commit is contained in:
rextimmy 2014-06-10 22:40:30 +10:00
parent d0a64026b0
commit 416c50690e
146 changed files with 12202 additions and 1422 deletions

View file

@ -54,7 +54,10 @@ IF (INSTALL_LIBS)
IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
INSTALL(TARGETS LinearMath DESTINATION .)
ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
INSTALL(TARGETS LinearMath DESTINATION lib${LIB_SUFFIX})
INSTALL(TARGETS LinearMath
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib${LIB_SUFFIX}
ARCHIVE DESTINATION lib${LIB_SUFFIX})
INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DESTINATION ${INCLUDE_INSTALL_DIR} FILES_MATCHING PATTERN "*.h" PATTERN
".svn" EXCLUDE PATTERN "CMakeFiles" EXCLUDE)

View file

@ -1931,11 +1931,15 @@ void btConvexHullInternal::merge(IntermediateHull& h0, IntermediateHull& h1)
}
}
static bool pointCmp(const btConvexHullInternal::Point32& p, const btConvexHullInternal::Point32& q)
class pointCmp
{
return (p.y < q.y) || ((p.y == q.y) && ((p.x < q.x) || ((p.x == q.x) && (p.z < q.z))));
}
public:
bool operator() ( const btConvexHullInternal::Point32& p, const btConvexHullInternal::Point32& q ) const
{
return (p.y < q.y) || ((p.y == q.y) && ((p.x < q.x) || ((p.x == q.x) && (p.z < q.z))));
}
};
void btConvexHullInternal::compute(const void* coords, bool doubleCoords, int stride, int count)
{
@ -2026,7 +2030,7 @@ void btConvexHullInternal::compute(const void* coords, bool doubleCoords, int st
points[i].index = i;
}
}
points.quickSort(pointCmp);
points.quickSort(pointCmp());
vertexPool.reset();
vertexPool.setArraySize(count);

View file

@ -62,29 +62,17 @@ class btIDebugDraw
virtual void drawSphere(btScalar radius, const btTransform& transform, const btVector3& color)
{
btVector3 start = transform.getOrigin();
const btVector3 xoffs = transform.getBasis() * btVector3(radius,0,0);
const btVector3 yoffs = transform.getBasis() * btVector3(0,radius,0);
const btVector3 zoffs = transform.getBasis() * btVector3(0,0,radius);
// XY
drawLine(start-xoffs, start+yoffs, color);
drawLine(start+yoffs, start+xoffs, color);
drawLine(start+xoffs, start-yoffs, color);
drawLine(start-yoffs, start-xoffs, color);
// XZ
drawLine(start-xoffs, start+zoffs, color);
drawLine(start+zoffs, start+xoffs, color);
drawLine(start+xoffs, start-zoffs, color);
drawLine(start-zoffs, start-xoffs, color);
// YZ
drawLine(start-yoffs, start+zoffs, color);
drawLine(start+zoffs, start+yoffs, color);
drawLine(start+yoffs, start-zoffs, color);
drawLine(start-zoffs, start-yoffs, color);
btVector3 center = transform.getOrigin();
btVector3 up = transform.getBasis().getColumn(1);
btVector3 axis = transform.getBasis().getColumn(0);
btScalar minTh = -SIMD_HALF_PI;
btScalar maxTh = SIMD_HALF_PI;
btScalar minPs = -SIMD_HALF_PI;
btScalar maxPs = SIMD_HALF_PI;
btScalar stepDegrees = 30.f;
drawSpherePatch(center, up, axis, radius,minTh, maxTh, minPs, maxPs, color, stepDegrees ,false);
drawSpherePatch(center, up, -axis, radius,minTh, maxTh, minPs, maxPs, color, stepDegrees,false );
}
virtual void drawSphere (const btVector3& p, btScalar radius, const btVector3& color)
@ -179,7 +167,7 @@ class btIDebugDraw
}
}
virtual void drawSpherePatch(const btVector3& center, const btVector3& up, const btVector3& axis, btScalar radius,
btScalar minTh, btScalar maxTh, btScalar minPs, btScalar maxPs, const btVector3& color, btScalar stepDegrees = btScalar(10.f))
btScalar minTh, btScalar maxTh, btScalar minPs, btScalar maxPs, const btVector3& color, btScalar stepDegrees = btScalar(10.f),bool drawCenter = true)
{
btVector3 vA[74];
btVector3 vB[74];
@ -261,18 +249,22 @@ class btIDebugDraw
{
drawLine(npole, pvB[j], color);
}
if(isClosed)
if (drawCenter)
{
if(j == (n_vert-1))
if(isClosed)
{
drawLine(arcStart, pvB[j], color);
if(j == (n_vert-1))
{
drawLine(arcStart, pvB[j], color);
}
}
}
else
{
if(((!i) || (i == (n_hor-1))) && ((!j) || (j == (n_vert-1))))
else
{
drawLine(center, pvB[j], color);
if(((!i) || (i == (n_hor-1))) && ((!j) || (j == (n_vert-1))))
{
drawLine(center, pvB[j], color);
}
}
}
}
@ -314,6 +306,8 @@ class btIDebugDraw
virtual void drawCapsule(btScalar radius, btScalar halfHeight, int upAxis, const btTransform& transform, const btVector3& color)
{
int stepDegrees = 30;
btVector3 capStart(0.f,0.f,0.f);
capStart[upAxis] = -halfHeight;
@ -325,34 +319,47 @@ class btIDebugDraw
btTransform childTransform = transform;
childTransform.getOrigin() = transform * capStart;
drawSphere(radius, childTransform, color);
{
btVector3 center = childTransform.getOrigin();
btVector3 up = childTransform.getBasis().getColumn((upAxis+1)%3);
btVector3 axis = -childTransform.getBasis().getColumn(upAxis);
btScalar minTh = -SIMD_HALF_PI;
btScalar maxTh = SIMD_HALF_PI;
btScalar minPs = -SIMD_HALF_PI;
btScalar maxPs = SIMD_HALF_PI;
drawSpherePatch(center, up, axis, radius,minTh, maxTh, minPs, maxPs, color, btScalar(stepDegrees) ,false);
}
}
{
btTransform childTransform = transform;
childTransform.getOrigin() = transform * capEnd;
drawSphere(radius, childTransform, color);
{
btVector3 center = childTransform.getOrigin();
btVector3 up = childTransform.getBasis().getColumn((upAxis+1)%3);
btVector3 axis = childTransform.getBasis().getColumn(upAxis);
btScalar minTh = -SIMD_HALF_PI;
btScalar maxTh = SIMD_HALF_PI;
btScalar minPs = -SIMD_HALF_PI;
btScalar maxPs = SIMD_HALF_PI;
drawSpherePatch(center, up, axis, radius,minTh, maxTh, minPs, maxPs, color, btScalar(stepDegrees) ,false);
}
}
// Draw some additional lines
btVector3 start = transform.getOrigin();
capStart[(upAxis+1)%3] = radius;
capEnd[(upAxis+1)%3] = radius;
drawLine(start+transform.getBasis() * capStart,start+transform.getBasis() * capEnd, color);
capStart[(upAxis+1)%3] = -radius;
capEnd[(upAxis+1)%3] = -radius;
drawLine(start+transform.getBasis() * capStart,start+transform.getBasis() * capEnd, color);
capStart[(upAxis+1)%3] = 0.f;
capEnd[(upAxis+1)%3] = 0.f;
capStart[(upAxis+2)%3] = radius;
capEnd[(upAxis+2)%3] = radius;
drawLine(start+transform.getBasis() * capStart,start+transform.getBasis() * capEnd, color);
capStart[(upAxis+2)%3] = -radius;
capEnd[(upAxis+2)%3] = -radius;
drawLine(start+transform.getBasis() * capStart,start+transform.getBasis() * capEnd, color);
for (int i=0;i<360;i+=stepDegrees)
{
capEnd[(upAxis+1)%3] = capStart[(upAxis+1)%3] = btSin(btScalar(i)*SIMD_RADS_PER_DEG)*radius;
capEnd[(upAxis+2)%3] = capStart[(upAxis+2)%3] = btCos(btScalar(i)*SIMD_RADS_PER_DEG)*radius;
drawLine(start+transform.getBasis() * capStart,start+transform.getBasis() * capEnd, color);
}
}
virtual void drawCylinder(btScalar radius, btScalar halfHeight, int upAxis, const btTransform& transform, const btVector3& color)
@ -360,11 +367,18 @@ class btIDebugDraw
btVector3 start = transform.getOrigin();
btVector3 offsetHeight(0,0,0);
offsetHeight[upAxis] = halfHeight;
btVector3 offsetRadius(0,0,0);
offsetRadius[(upAxis+1)%3] = radius;
drawLine(start+transform.getBasis() * (offsetHeight+offsetRadius),start+transform.getBasis() * (-offsetHeight+offsetRadius),color);
drawLine(start+transform.getBasis() * (offsetHeight-offsetRadius),start+transform.getBasis() * (-offsetHeight-offsetRadius),color);
int stepDegrees=30;
btVector3 capStart(0.f,0.f,0.f);
capStart[upAxis] = -halfHeight;
btVector3 capEnd(0.f,0.f,0.f);
capEnd[upAxis] = halfHeight;
for (int i=0;i<360;i+=stepDegrees)
{
capEnd[(upAxis+1)%3] = capStart[(upAxis+1)%3] = btSin(btScalar(i)*SIMD_RADS_PER_DEG)*radius;
capEnd[(upAxis+2)%3] = capStart[(upAxis+2)%3] = btCos(btScalar(i)*SIMD_RADS_PER_DEG)*radius;
drawLine(start+transform.getBasis() * capStart,start+transform.getBasis() * capEnd, color);
}
// Drawing top and bottom caps of the cylinder
btVector3 yaxis(0,0,0);
yaxis[upAxis] = btScalar(1.0);
@ -376,16 +390,28 @@ class btIDebugDraw
virtual void drawCone(btScalar radius, btScalar height, int upAxis, const btTransform& transform, const btVector3& color)
{
int stepDegrees = 30;
btVector3 start = transform.getOrigin();
btVector3 offsetHeight(0,0,0);
offsetHeight[upAxis] = height * btScalar(0.5);
btScalar halfHeight = height * btScalar(0.5);
offsetHeight[upAxis] = halfHeight;
btVector3 offsetRadius(0,0,0);
offsetRadius[(upAxis+1)%3] = radius;
btVector3 offset2Radius(0,0,0);
offset2Radius[(upAxis+2)%3] = radius;
btVector3 capEnd(0.f,0.f,0.f);
capEnd[upAxis] = -halfHeight;
for (int i=0;i<360;i+=stepDegrees)
{
capEnd[(upAxis+1)%3] = btSin(btScalar(i)*SIMD_RADS_PER_DEG)*radius;
capEnd[(upAxis+2)%3] = btCos(btScalar(i)*SIMD_RADS_PER_DEG)*radius;
drawLine(start+transform.getBasis() * (offsetHeight),start+transform.getBasis() * capEnd, color);
}
drawLine(start+transform.getBasis() * (offsetHeight),start+transform.getBasis() * (-offsetHeight+offsetRadius),color);
drawLine(start+transform.getBasis() * (offsetHeight),start+transform.getBasis() * (-offsetHeight-offsetRadius),color);
drawLine(start+transform.getBasis() * (offsetHeight),start+transform.getBasis() * (-offsetHeight+offset2Radius),color);

View file

@ -22,10 +22,15 @@ subject to the following restrictions:
#ifdef BT_USE_SSE
//const __m128 ATTRIBUTE_ALIGNED16(v2220) = {2.0f, 2.0f, 2.0f, 0.0f};
const __m128 ATTRIBUTE_ALIGNED16(vMPPP) = {-0.0f, +0.0f, +0.0f, +0.0f};
//const __m128 ATTRIBUTE_ALIGNED16(vMPPP) = {-0.0f, +0.0f, +0.0f, +0.0f};
#define vMPPP (_mm_set_ps (+0.0f, +0.0f, +0.0f, -0.0f))
#endif
#if defined(BT_USE_SSE) || defined(BT_USE_NEON)
#if defined(BT_USE_SSE)
#define v1000 (_mm_set_ps(0.0f,0.0f,0.0f,1.0f))
#define v0100 (_mm_set_ps(0.0f,0.0f,1.0f,0.0f))
#define v0010 (_mm_set_ps(0.0f,1.0f,0.0f,0.0f))
#elif defined(BT_USE_NEON)
const btSimdFloat4 ATTRIBUTE_ALIGNED16(v1000) = {1.0f, 0.0f, 0.0f, 0.0f};
const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0100) = {0.0f, 1.0f, 0.0f, 0.0f};
const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0010) = {0.0f, 0.0f, 1.0f, 0.0f};
@ -207,7 +212,7 @@ public:
btFullAssert(d != btScalar(0.0));
btScalar s = btScalar(2.0) / d;
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
__m128 vs, Q = q.get128();
__m128i Qi = btCastfTo128i(Q);
__m128 Y, Z;
@ -341,7 +346,7 @@ public:
* @param m The array to be filled */
void getOpenGLSubMatrix(btScalar *m) const
{
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
__m128 v0 = m_el[0].mVec128;
__m128 v1 = m_el[1].mVec128;
__m128 v2 = m_el[2].mVec128; // x2 y2 z2 w2
@ -362,7 +367,7 @@ public:
vm[2] = v2;
#elif defined(BT_USE_NEON)
// note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
static const uint32x2_t zMask = (const uint32x2_t) {-1, 0 };
static const uint32x2_t zMask = (const uint32x2_t) {static_cast<uint32_t>(-1), 0 };
float32x4_t *vm = (float32x4_t *)m;
float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 ); // {x0 x1 z0 z1}, {y0 y1 w0 w1}
float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) ); // {x2 0 }, {y2 0}
@ -740,7 +745,7 @@ public:
SIMD_FORCE_INLINE btMatrix3x3&
btMatrix3x3::operator*=(const btMatrix3x3& m)
{
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
__m128 rv00, rv01, rv02;
__m128 rv10, rv11, rv12;
__m128 rv20, rv21, rv22;
@ -953,7 +958,7 @@ btMatrix3x3::determinant() const
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::absolute() const
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
return btMatrix3x3(
_mm_and_ps(m_el[0].mVec128, btvAbsfMask),
_mm_and_ps(m_el[1].mVec128, btvAbsfMask),
@ -974,7 +979,7 @@ btMatrix3x3::absolute() const
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::transpose() const
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
__m128 v0 = m_el[0].mVec128;
__m128 v1 = m_el[1].mVec128;
__m128 v2 = m_el[2].mVec128; // x2 y2 z2 w2
@ -993,7 +998,7 @@ btMatrix3x3::transpose() const
return btMatrix3x3( v0, v1, v2 );
#elif defined(BT_USE_NEON)
// note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
static const uint32x2_t zMask = (const uint32x2_t) {-1, 0 };
static const uint32x2_t zMask = (const uint32x2_t) {static_cast<uint32_t>(-1), 0 };
float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 ); // {x0 x1 z0 z1}, {y0 y1 w0 w1}
float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) ); // {x2 0 }, {y2 0}
float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
@ -1031,7 +1036,7 @@ btMatrix3x3::inverse() const
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::transposeTimes(const btMatrix3x3& m) const
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
// zeros w
// static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL };
__m128 row = m_el[0].mVec128;
@ -1053,7 +1058,7 @@ btMatrix3x3::transposeTimes(const btMatrix3x3& m) const
#elif defined BT_USE_NEON
// zeros w
static const uint32x4_t xyzMask = (const uint32x4_t){ -1, -1, -1, 0 };
static const uint32x4_t xyzMask = (const uint32x4_t){ static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), 0 };
float32x4_t m0 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(0).mVec128, xyzMask );
float32x4_t m1 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(1).mVec128, xyzMask );
float32x4_t m2 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(2).mVec128, xyzMask );
@ -1151,7 +1156,7 @@ operator*(const btMatrix3x3& m, const btVector3& v)
SIMD_FORCE_INLINE btVector3
operator*(const btVector3& v, const btMatrix3x3& m)
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
const __m128 vv = v.mVec128;
@ -1191,7 +1196,7 @@ operator*(const btVector3& v, const btMatrix3x3& m)
SIMD_FORCE_INLINE btMatrix3x3
operator*(const btMatrix3x3& m1, const btMatrix3x3& m2)
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
__m128 m10 = m1[0].mVec128;
__m128 m11 = m1[1].mVec128;

View file

@ -0,0 +1,504 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2013 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
///original version written by Erwin Coumans, October 2013
#ifndef BT_MATRIX_X_H
#define BT_MATRIX_X_H
#include "LinearMath/btQuickprof.h"
#include "LinearMath/btAlignedObjectArray.h"
/// Comparison functor for sorting arrays of int in ascending order;
/// it is passed to btAlignedObjectArray<int>::quickSort by btMatrixX.
class btIntSortPredicate
{
public:
	/// @return true when a must be ordered strictly before b.
	bool operator()(const int& a, const int& b) const
	{
		return b > a;
	}
};
template <typename T>
struct btMatrixX
{
int m_rows;
int m_cols;
int m_operations;
int m_resizeOperations;
int m_setElemOperations;
btAlignedObjectArray<T> m_storage;
btAlignedObjectArray< btAlignedObjectArray<int> > m_rowNonZeroElements1;
btAlignedObjectArray< btAlignedObjectArray<int> > m_colNonZeroElements;
T* getBufferPointerWritable()
{
return m_storage.size() ? &m_storage[0] : 0;
}
const T* getBufferPointer() const
{
return m_storage.size() ? &m_storage[0] : 0;
}
btMatrixX()
:m_rows(0),
m_cols(0),
m_operations(0),
m_resizeOperations(0),
m_setElemOperations(0)
{
}
btMatrixX(int rows,int cols)
:m_rows(rows),
m_cols(cols),
m_operations(0),
m_resizeOperations(0),
m_setElemOperations(0)
{
resize(rows,cols);
}
void resize(int rows, int cols)
{
m_resizeOperations++;
m_rows = rows;
m_cols = cols;
{
BT_PROFILE("m_storage.resize");
m_storage.resize(rows*cols);
}
clearSparseInfo();
}
int cols() const
{
return m_cols;
}
int rows() const
{
return m_rows;
}
///we don't want this read/write operator(), because we cannot keep track of non-zero elements, use setElem instead
/*T& operator() (int row,int col)
{
return m_storage[col*m_rows+row];
}
*/
void addElem(int row,int col, T val)
{
if (val)
{
if (m_storage[col+row*m_cols]==0.f)
{
setElem(row,col,val);
} else
{
m_storage[row*m_cols+col] += val;
}
}
}
void copyLowerToUpperTriangle()
{
int count=0;
for (int row=0;row<m_rowNonZeroElements1.size();row++)
{
for (int j=0;j<m_rowNonZeroElements1[row].size();j++)
{
int col = m_rowNonZeroElements1[row][j];
setElem(col,row, (*this)(row,col));
count++;
}
}
//printf("copyLowerToUpperTriangle copied %d elements out of %dx%d=%d\n", count,rows(),cols(),cols()*rows());
}
void setElem(int row,int col, T val)
{
m_setElemOperations++;
if (val)
{
if (m_storage[col+row*m_cols]==0.f)
{
m_rowNonZeroElements1[row].push_back(col);
m_colNonZeroElements[col].push_back(row);
}
m_storage[row*m_cols+col] = val;
}
}
const T& operator() (int row,int col) const
{
return m_storage[col+row*m_cols];
}
void clearSparseInfo()
{
BT_PROFILE("clearSparseInfo=0");
m_rowNonZeroElements1.resize(m_rows);
m_colNonZeroElements.resize(m_cols);
for (int i=0;i<m_rows;i++)
m_rowNonZeroElements1[i].resize(0);
for (int j=0;j<m_cols;j++)
m_colNonZeroElements[j].resize(0);
}
void setZero()
{
{
BT_PROFILE("storage=0");
btSetZero(&m_storage[0],m_storage.size());
//memset(&m_storage[0],0,sizeof(T)*m_storage.size());
//for (int i=0;i<m_storage.size();i++)
// m_storage[i]=0;
}
{
BT_PROFILE("clearSparseInfo=0");
clearSparseInfo();
}
}
void printMatrix(const char* msg)
{
printf("%s ---------------------\n",msg);
for (int i=0;i<rows();i++)
{
printf("\n");
for (int j=0;j<cols();j++)
{
printf("%2.1f\t",(*this)(i,j));
}
}
printf("\n---------------------\n");
}
void printNumZeros(const char* msg)
{
printf("%s: ",msg);
int numZeros = 0;
for (int i=0;i<m_storage.size();i++)
if (m_storage[i]==0)
numZeros++;
int total = m_cols*m_rows;
int computedNonZero = total-numZeros;
int nonZero = 0;
for (int i=0;i<m_colNonZeroElements.size();i++)
nonZero += m_colNonZeroElements[i].size();
btAssert(computedNonZero==nonZero);
if(computedNonZero!=nonZero)
{
printf("Error: computedNonZero=%d, but nonZero=%d\n",computedNonZero,nonZero);
}
//printf("%d numZeros out of %d (%f)\n",numZeros,m_cols*m_rows,numZeros/(m_cols*m_rows));
printf("total %d, %d rows, %d cols, %d non-zeros (%f %)\n", total, rows(),cols(), nonZero,100.f*(T)nonZero/T(total));
}
/*
void rowComputeNonZeroElements()
{
m_rowNonZeroElements1.resize(rows());
for (int i=0;i<rows();i++)
{
m_rowNonZeroElements1[i].resize(0);
for (int j=0;j<cols();j++)
{
if ((*this)(i,j)!=0.f)
{
m_rowNonZeroElements1[i].push_back(j);
}
}
}
}
*/
btMatrixX transpose() const
{
//transpose is optimized for sparse matrices
btMatrixX tr(m_cols,m_rows);
tr.setZero();
#if 0
for (int i=0;i<m_cols;i++)
for (int j=0;j<m_rows;j++)
{
T v = (*this)(j,i);
if (v)
{
tr.setElem(i,j,v);
}
}
#else
for (int i=0;i<m_colNonZeroElements.size();i++)
for (int h=0;h<m_colNonZeroElements[i].size();h++)
{
int j = m_colNonZeroElements[i][h];
T v = (*this)(j,i);
tr.setElem(i,j,v);
}
#endif
return tr;
}
void sortRowIndexArrays()
{
for (int i=0;i<m_rowNonZeroElements1[i].size();i++)
{
m_rowNonZeroElements1[i].quickSort(btIntSortPredicate());
}
}
void sortColIndexArrays()
{
for (int i=0;i<m_colNonZeroElements[i].size();i++)
{
m_colNonZeroElements[i].quickSort(btIntSortPredicate());
}
}
btMatrixX operator*(const btMatrixX& other)
{
//btMatrixX*btMatrixX implementation, optimized for sparse matrices
btAssert(cols() == other.rows());
btMatrixX res(rows(),other.cols());
res.setZero();
// BT_PROFILE("btMatrixX mul");
for (int j=0; j < res.cols(); ++j)
{
//int numZero=other.m_colNonZeroElements[j].size();
//if (numZero)
{
for (int i=0; i < res.rows(); ++i)
//for (int g = 0;g<m_colNonZeroElements[j].size();g++)
{
T dotProd=0;
T dotProd2=0;
int waste=0,waste2=0;
bool doubleWalk = false;
if (doubleWalk)
{
int numRows = m_rowNonZeroElements1[i].size();
int numOtherCols = other.m_colNonZeroElements[j].size();
for (int ii=0;ii<numRows;ii++)
{
int vThis=m_rowNonZeroElements1[i][ii];
}
for (int ii=0;ii<numOtherCols;ii++)
{
int vOther = other.m_colNonZeroElements[j][ii];
}
int indexRow = 0;
int indexOtherCol = 0;
while (indexRow < numRows && indexOtherCol < numOtherCols)
{
int vThis=m_rowNonZeroElements1[i][indexRow];
int vOther = other.m_colNonZeroElements[j][indexOtherCol];
if (vOther==vThis)
{
dotProd += (*this)(i,vThis) * other(vThis,j);
}
if (vThis<vOther)
{
indexRow++;
} else
{
indexOtherCol++;
}
}
} else
{
bool useOtherCol = true;
if (other.m_colNonZeroElements[j].size() <m_rowNonZeroElements1[i].size())
{
useOtherCol=true;
}
if (!useOtherCol )
{
for (int q=0;q<other.m_colNonZeroElements[j].size();q++)
{
int v = other.m_colNonZeroElements[j][q];
T w = (*this)(i,v);
if (w!=0.f)
{
dotProd+=w*other(v,j);
}
}
}
else
{
for (int q=0;q<m_rowNonZeroElements1[i].size();q++)
{
int v=m_rowNonZeroElements1[i][q];
T w = (*this)(i,v);
if (other(v,j)!=0.f)
{
dotProd+=w*other(v,j);
}
}
}
}
if (dotProd)
res.setElem(i,j,dotProd);
}
}
}
return res;
}
// this assumes the 4th and 8th rows of B and C are zero.
void multiplyAdd2_p8r (const btScalar *B, const btScalar *C, int numRows, int numRowsOther ,int row, int col)
{
const btScalar *bb = B;
for ( int i = 0;i<numRows;i++)
{
const btScalar *cc = C;
for ( int j = 0;j<numRowsOther;j++)
{
btScalar sum;
sum = bb[0]*cc[0];
sum += bb[1]*cc[1];
sum += bb[2]*cc[2];
sum += bb[4]*cc[4];
sum += bb[5]*cc[5];
sum += bb[6]*cc[6];
addElem(row+i,col+j,sum);
cc += 8;
}
bb += 8;
}
}
void multiply2_p8r (const btScalar *B, const btScalar *C, int numRows, int numRowsOther, int row, int col)
{
btAssert (numRows>0 && numRowsOther>0 && B && C);
const btScalar *bb = B;
for ( int i = 0;i<numRows;i++)
{
const btScalar *cc = C;
for ( int j = 0;j<numRowsOther;j++)
{
btScalar sum;
sum = bb[0]*cc[0];
sum += bb[1]*cc[1];
sum += bb[2]*cc[2];
sum += bb[4]*cc[4];
sum += bb[5]*cc[5];
sum += bb[6]*cc[6];
setElem(row+i,col+j,sum);
cc += 8;
}
bb += 8;
}
}
};
/// Resizable column vector of T backed by a btAlignedObjectArray.
template <typename T>
struct btVectorX
{
	btAlignedObjectArray<T> m_storage;

	/// Creates an empty vector.
	btVectorX()
	{
	}

	/// Creates a vector with numRows elements.
	btVectorX(int numRows)
	{
		m_storage.resize(numRows);
	}

	/// Changes the number of elements to rows.
	void resize(int rows)
	{
		m_storage.resize(rows);
	}

	/// A vector always reports a single column.
	int cols() const
	{
		return 1;
	}

	int rows() const
	{
		return m_storage.size();
	}

	/// Same as rows().
	int size() const
	{
		return rows();
	}

	/// Sets every element to zero.
	void setZero()
	{
		// getBufferPointerWritable() yields 0 for an empty vector, so
		// m_storage[0] is never touched when there is nothing to clear.
		btSetZero(getBufferPointerWritable(),m_storage.size());
	}

	const T& operator[] (int index) const
	{
		return m_storage[index];
	}

	T& operator[] (int index)
	{
		return m_storage[index];
	}

	/// @return pointer to the first element, or 0 when the vector is empty
	T* getBufferPointerWritable()
	{
		return m_storage.size() ? &m_storage[0] : 0;
	}

	/// @return read-only pointer to the first element, or 0 when the vector is empty
	const T* getBufferPointer() const
	{
		return m_storage.size() ? &m_storage[0] : 0;
	}
};
/*
template <typename T>
void setElem(btMatrixX<T>& mat, int row, int col, T val)
{
mat.setElem(row,col,val);
}
*/
// Single-precision (f) and double-precision (d) instantiations of the matrix and vector templates.
typedef btMatrixX<float> btMatrixXf;
typedef btVectorX<float> btVectorXf;
typedef btMatrixX<double> btMatrixXd;
typedef btVectorX<double> btVectorXd;
/// Free-function form of btMatrixXd::setElem; forwards (row,col,val) to mat unchanged.
inline void setElem(btMatrixXd& mat, int row, int col, double val)
{
mat.setElem(row,col,val);
}
/// Free-function form of btMatrixXf::setElem; forwards (row,col,val) to mat unchanged.
inline void setElem(btMatrixXf& mat, int row, int col, float val)
{
mat.setElem(row,col,val);
}
// btVectorXu / btMatrixXu name the double instantiations when
// BT_USE_DOUBLE_PRECISION is defined and the float instantiations otherwise.
#ifdef BT_USE_DOUBLE_PRECISION
#define btVectorXu btVectorXd
#define btMatrixXu btMatrixXd
#else
#define btVectorXu btVectorXf
#define btMatrixXu btMatrixXf
#endif //BT_USE_DOUBLE_PRECISION
#endif//BT_MATRIX_X_H

View file

@ -60,10 +60,10 @@ unsigned int btPolarDecomposition::decompose(const btMatrix3x3& a, btMatrix3x3&
break;
const btScalar gamma = btPow(h_norm / u_norm, 0.25f);
const btScalar inv_gamma = 1.0 / gamma;
const btScalar inv_gamma = btScalar(1.0) / gamma;
// Determine the delta to 'u'
const btMatrix3x3 delta = (u * (gamma - 2.0) + h.transpose() * inv_gamma) * 0.5;
const btMatrix3x3 delta = (u * (gamma - btScalar(2.0)) + h.transpose() * inv_gamma) * btScalar(0.5);
// Update the matrices
u += delta;

View file

@ -27,11 +27,17 @@ subject to the following restrictions:
#ifdef BT_USE_SSE
const __m128 ATTRIBUTE_ALIGNED16(vOnes) = {1.0f, 1.0f, 1.0f, 1.0f};
//const __m128 ATTRIBUTE_ALIGNED16(vOnes) = {1.0f, 1.0f, 1.0f, 1.0f};
#define vOnes (_mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f))
#endif
#if defined(BT_USE_SSE) || defined(BT_USE_NEON)
#if defined(BT_USE_SSE)
#define vQInv (_mm_set_ps(+0.0f, -0.0f, -0.0f, -0.0f))
#define vPPPM (_mm_set_ps(-0.0f, +0.0f, +0.0f, +0.0f))
#elif defined(BT_USE_NEON)
const btSimdFloat4 ATTRIBUTE_ALIGNED16(vQInv) = {-0.0f, -0.0f, -0.0f, +0.0f};
const btSimdFloat4 ATTRIBUTE_ALIGNED16(vPPPM) = {+0.0f, +0.0f, +0.0f, -0.0f};
@ -285,7 +291,7 @@ public:
* @param q The other quaternion */
btScalar dot(const btQuaternion& q) const
{
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
__m128 vd;
vd = _mm_mul_ps(mVec128, q.mVec128);
@ -384,7 +390,7 @@ public:
{
return *this / length();
}
/**@brief Return the angle between this quaternion and the other
/**@brief Return the ***half*** angle between this quaternion and the other
* @param q The other quaternion */
btScalar angle(const btQuaternion& q) const
{
@ -392,6 +398,19 @@ public:
btAssert(s != btScalar(0.0));
return btAcos(dot(q) / s);
}
/**@brief Return the angle between this quaternion and the other along the shortest path
 * @param q The other quaternion */
btScalar angleShortestPath(const btQuaternion& q) const
{
	const btScalar norm = btSqrt(length2() * q.length2());
	btAssert(norm != btScalar(0.0));
	// Take care of long angle case see http://en.wikipedia.org/wiki/Slerp :
	// when the dot product is negative, the dot product with -q is used instead.
	const btScalar alignedDot = (dot(q) < 0) ? dot(-q) : dot(q);
	return btAcos(alignedDot / norm) * btScalar(2.0);
}
/**@brief Return the angle of rotation represented by this quaternion */
btScalar getAngle() const
{
@ -399,6 +418,19 @@ public:
return s;
}
/**@brief Return the angle of rotation represented by this quaternion along the shortest path*/
btScalar getAngleShortestPath() const
{
	// The branch must look at the sign of w. dot(*this) is the squared length,
	// which is never negative, so testing it always selected acos(-w) and
	// returned 2*pi for the identity rotation.
	// Using |w| selects whichever of q / -q has a non-negative w component.
	btScalar s;
	if (m_floats[3] >= 0)
		s = btScalar(2.) * btAcos(m_floats[3]);
	else
		s = btScalar(2.) * btAcos(-m_floats[3]);
	return s;
}
/**@brief Return the axis of the rotation represented by this quaternion */
btVector3 getAxis() const
{
@ -498,7 +530,7 @@ public:
btAssert(magnitude > btScalar(0));
btScalar product = dot(q) / magnitude;
if (btFabs(product) != btScalar(1))
if (btFabs(product) < btScalar(1))
{
// Take care of long angle case see http://en.wikipedia.org/wiki/Slerp
const btScalar sign = (product < 0) ? btScalar(-1) : btScalar(1);
@ -835,7 +867,7 @@ quatRotate(const btQuaternion& rotation, const btVector3& v)
{
btQuaternion q = rotation * v;
q *= rotation.inverse();
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
return btVector3(_mm_and_ps(q.get128(), btvFFF0fMask));
#elif defined(BT_USE_NEON)
return btVector3((float32x4_t)vandq_s32((int32x4_t)q.get128(), btvFFF0Mask));

View file

@ -28,7 +28,7 @@ subject to the following restrictions:
#include <float.h>
/* SVN $Revision$ on $Date$ from http://bullet.googlecode.com*/
#define BT_BULLET_VERSION 281
#define BT_BULLET_VERSION 282
inline int btGetVersion()
{
@ -68,6 +68,10 @@ inline int btGetVersion()
#else
#if (defined (_WIN32) && (_MSC_VER) && _MSC_VER >= 1400) && (!defined (BT_USE_DOUBLE_PRECISION))
#if _MSC_VER>1400
#define BT_USE_SIMD_VECTOR3
#endif
#define BT_USE_SSE
#ifdef BT_USE_SSE
//BT_USE_SSE_IN_API is disabled under Windows by default, because
@ -159,7 +163,8 @@ inline int btGetVersion()
#if (defined (__APPLE__) && (!defined (BT_USE_DOUBLE_PRECISION)))
#if defined (__i386__) || defined (__x86_64__)
#define BT_USE_SSE
#define BT_USE_SIMD_VECTOR3
#define BT_USE_SSE
//BT_USE_SSE_IN_API is enabled on Mac OSX by default, because memory is automatically aligned on 16-byte boundaries
//if apps run into issues, we will disable the next line
#define BT_USE_SSE_IN_API
@ -175,10 +180,11 @@ inline int btGetVersion()
#include <emmintrin.h>
#endif
#endif //BT_USE_SSE
#elif defined( __armv7__ )
#elif defined( __ARM_NEON__ )
#ifdef __clang__
#define BT_USE_NEON 1
#define BT_USE_SIMD_VECTOR3
#if defined BT_USE_NEON && defined (__clang__)
#include <arm_neon.h>
#endif//BT_USE_NEON
@ -207,8 +213,7 @@ inline int btGetVersion()
}
#else//defined (__i386__) || defined (__x86_64__)
#define btAssert assert
#end//defined (__i386__) || defined (__x86_64__)
#endif
#endif//defined (__i386__) || defined (__x86_64__)
#else//defined(DEBUG) || defined (_DEBUG)
#define btAssert(x)
#endif//defined(DEBUG) || defined (_DEBUG)
@ -252,10 +257,12 @@ inline int btGetVersion()
///The btScalar type abstracts floating point numbers, to easily switch between double and single floating point precision.
#if defined(BT_USE_DOUBLE_PRECISION)
typedef double btScalar;
//this number could be bigger in double precision
#define BT_LARGE_FLOAT 1e30
#else
typedef float btScalar;
//keep BT_LARGE_FLOAT*BT_LARGE_FLOAT < FLT_MAX
#define BT_LARGE_FLOAT 1e18f
@ -265,7 +272,8 @@ typedef float btScalar;
typedef __m128 btSimdFloat4;
#endif//BT_USE_SSE
#if defined BT_USE_SSE_IN_API && defined (BT_USE_SSE)
#if defined (BT_USE_SSE)
//#if defined BT_USE_SSE_IN_API && defined (BT_USE_SSE)
#ifdef _WIN32
#ifndef BT_NAN
@ -278,6 +286,8 @@ static int btInfinityMask = 0x7F800000;
#define BT_INFINITY (*(float*)&btInfinityMask)
#endif
//use this, in case there are clashes (such as xnamath.h)
#ifndef BT_NO_SIMD_OPERATOR_OVERLOADS
inline __m128 operator + (const __m128 A, const __m128 B)
{
return _mm_add_ps(A, B);
@ -292,6 +302,7 @@ inline __m128 operator * (const __m128 A, const __m128 B)
{
return _mm_mul_ps(A, B);
}
#endif //BT_NO_SIMD_OPERATOR_OVERLOADS
#define btCastfTo128i(a) (_mm_castps_si128(a))
#define btCastfTo128d(a) (_mm_castps_pd(a))
@ -311,7 +322,24 @@ inline __m128 operator * (const __m128 A, const __m128 B)
#define BT_INFINITY INFINITY
#define BT_NAN NAN
#endif//_WIN32
#endif //BT_USE_SSE_IN_API
#else
#ifdef BT_USE_NEON
#include <arm_neon.h>
typedef float32x4_t btSimdFloat4;
#define BT_INFINITY INFINITY
#define BT_NAN NAN
#define btAssign128(r0,r1,r2,r3) (float32x4_t){r0,r1,r2,r3}
#else//BT_USE_NEON
#ifndef BT_INFINITY
static int btInfinityMask = 0x7F800000;
#define BT_INFINITY (*(float*)&btInfinityMask)
#endif
#endif//BT_USE_NEON
#endif //BT_USE_SSE
#ifdef BT_USE_NEON
#include <arm_neon.h>
@ -403,15 +431,15 @@ SIMD_FORCE_INLINE btScalar btFmod(btScalar x,btScalar y) { return fmodf(x,y); }
#endif
#define SIMD_2_PI btScalar(6.283185307179586232)
#define SIMD_PI (SIMD_2_PI * btScalar(0.5))
#define SIMD_HALF_PI (SIMD_2_PI * btScalar(0.25))
#define SIMD_PI btScalar(3.1415926535897932384626433832795029)
// Parenthesized so the macro stays a single operand inside larger expressions;
// without the parentheses "x / SIMD_2_PI" expands to "x / 2 * PI".
#define SIMD_2_PI (btScalar(2.0) * SIMD_PI)
#define SIMD_HALF_PI (SIMD_PI * btScalar(0.5))
#define SIMD_RADS_PER_DEG (SIMD_2_PI / btScalar(360.0))
#define SIMD_DEGS_PER_RAD (btScalar(360.0) / SIMD_2_PI)
#define SIMDSQRT12 btScalar(0.7071067811865475244008443621048490)
#define btRecipSqrt(x) ((btScalar)(btScalar(1.0)/btSqrt(btScalar(x)))) /* reciprocal square root */
#define btRecip(x) (btScalar(1.0)/btScalar(x))
#ifdef BT_USE_DOUBLE_PRECISION
#define SIMD_EPSILON DBL_EPSILON
@ -602,6 +630,46 @@ SIMD_FORCE_INLINE double btUnswapEndianDouble(const unsigned char *src)
return d;
}
/// Assigns 0 to each of the first n elements of the array a.
/// @param a  pointer to the first element to clear
/// @param n  number of elements to clear; zero or a negative value clears nothing
template<typename T>
SIMD_FORCE_INLINE void btSetZero(T* a, int n)
{
	// Loop on the signed count directly: converting a negative n to an unsigned
	// counter would wrap around to a huge value and write far past the array.
	for (int i = 0; i < n; ++i)
	{
		a[i] = 0;
	}
}
/// Dot product of the first n entries of a and b.
/// Products are added to the running sum in index order; entries are consumed
/// two per pass, and a final single entry is handled separately when n is odd.
/// A non-positive n yields 0.
SIMD_FORCE_INLINE btScalar btLargeDot(const btScalar *a, const btScalar *b, int n)
{
	btScalar total = 0;
	int i = 0;

	// paired part: stop while at least two entries remain
	const int pairLimit = n - 1;
	for (; i < pairLimit; i += 2)
	{
		const btScalar first = a[i] * b[i];
		const btScalar second = a[i + 1] * b[i + 1];
		total += first;
		total += second;
	}

	// leftover entry, if any
	for (; i < n; ++i)
	{
		total += a[i] * b[i];
	}

	return total;
}
// returns normalized value in range [-SIMD_PI, SIMD_PI]
SIMD_FORCE_INLINE btScalar btNormalizeAngle(btScalar angleInRadians)
{
@ -620,6 +688,8 @@ SIMD_FORCE_INLINE btScalar btNormalizeAngle(btScalar angleInRadians)
}
}
///rudimentary class to provide type info
struct btTypedObject
{

File diff suppressed because it is too large Load diff

View file

@ -17,7 +17,6 @@ subject to the following restrictions:
#define BT_SERIALIZER_H
#include "btScalar.h" // has definitions like SIMD_FORCE_INLINE
#include "btStackAlloc.h"
#include "btHashMap.h"
#if !defined( __CELLOS_LV2__) && !defined(__MWERKS__)
@ -439,7 +438,7 @@ public:
buffer[9] = '2';
buffer[10] = '8';
buffer[11] = '1';
buffer[11] = '2';
}

View file

@ -19,9 +19,17 @@
#define BT_USE_SSE_IN_API
#endif
#include "btVector3.h"
#if defined (BT_USE_SSE) || defined (BT_USE_NEON)
#if defined BT_USE_SIMD_VECTOR3
#if DEBUG
#include <string.h>//for memset
#endif
#ifdef __APPLE__
#include <stdint.h>
@ -43,7 +51,7 @@ long _maxdot_large( const float *vv, const float *vec, unsigned long count, floa
long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
const float4 *vertices = (const float4*) vv;
static const unsigned char indexTable[16] = {-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
float4 dotMax = btAssign128( -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY );
float4 vvec = _mm_loadu_ps( vec );
float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa )); /// zzzz
@ -428,7 +436,7 @@ long _mindot_large( const float *vv, const float *vec, unsigned long count, floa
long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
const float4 *vertices = (const float4*) vv;
static const unsigned char indexTable[16] = {-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
float4 dotmin = btAssign128( BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY );
float4 vvec = _mm_loadu_ps( vec );
float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa )); /// zzzz
@ -815,7 +823,8 @@ long _mindot_large( const float *vv, const float *vec, unsigned long count, floa
#elif defined BT_USE_NEON
#define ARM_NEON_GCC_COMPATIBILITY 1
#include <arm_neon.h>
#include <sys/types.h>
#include <sys/sysctl.h> //for sysctlbyname
static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
@ -827,11 +836,34 @@ static long _mindot_large_sel( const float *vv, const float *vec, unsigned long
long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;
extern "C" {int _get_cpu_capabilities( void );}
// Returns a capability bit mask, probing the system only on the first call.
// The mask has bit 0x2000 set when sysctlbyname succeeds for "hw.optional.neon_hpfp"
// and reports a non-zero value; otherwise the mask is 0. The result is cached in
// function-local statics and returned unchanged on later calls.
static inline uint32_t btGetCpuCapabilities( void )
{
	static uint32_t cachedFlags = 0;
	static bool alreadyProbed = false;

	if( alreadyProbed )
		return cachedFlags;

	uint32_t featurePresent = 0;
	size_t valueSize = sizeof( featurePresent );
	const int status = sysctlbyname( "hw.optional.neon_hpfp", &featurePresent, &valueSize, NULL, 0 );
	if( status == 0 && featurePresent != 0 )
		cachedFlags |= 0x2000;

	alreadyProbed = true;
	return cachedFlags;
}
static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
if( _get_cpu_capabilities() & 0x2000 )
if( btGetCpuCapabilities() & 0x2000 )
_maxdot_large = _maxdot_large_v1;
else
_maxdot_large = _maxdot_large_v0;
@ -841,7 +873,8 @@ static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long
static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
if( _get_cpu_capabilities() & 0x2000 )
if( btGetCpuCapabilities() & 0x2000 )
_mindot_large = _mindot_large_v1;
else
_mindot_large = _mindot_large_v0;
@ -864,8 +897,8 @@ long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, f
float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
uint32x2_t indexLo = (uint32x2_t) {0, 1};
uint32x2_t indexHi = (uint32x2_t) {2, 3};
uint32x2_t iLo = (uint32x2_t) {-1, -1};
uint32x2_t iHi = (uint32x2_t) {-1, -1};
uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
const uint32x2_t four = (uint32x2_t) {4,4};
for( ; i+8 <= count; i+= 8 )
@ -1051,7 +1084,7 @@ long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, f
float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY };
unsigned long i = 0;
@ -1249,8 +1282,8 @@ long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, f
float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY };
uint32x2_t indexLo = (uint32x2_t) {0, 1};
uint32x2_t indexHi = (uint32x2_t) {2, 3};
uint32x2_t iLo = (uint32x2_t) {-1, -1};
uint32x2_t iHi = (uint32x2_t) {-1, -1};
uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
const uint32x2_t four = (uint32x2_t) {4,4};
for( ; i+8 <= count; i+= 8 )
@ -1434,7 +1467,7 @@ long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, f
float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY };
unsigned long i = 0;

View file

@ -53,19 +53,24 @@ subject to the following restrictions:
#define btvxyzMaskf btvFFF0fMask
#define btvAbsfMask btCastiTo128f(btvAbsMask)
//there is an issue with XCode 3.2 (LCx errors)
#define btvMzeroMask (_mm_set_ps(-0.0f, -0.0f, -0.0f, -0.0f))
#define v1110 (_mm_set_ps(0.0f, 1.0f, 1.0f, 1.0f))
#define vHalf (_mm_set_ps(0.5f, 0.5f, 0.5f, 0.5f))
#define v1_5 (_mm_set_ps(1.5f, 1.5f, 1.5f, 1.5f))
const __m128 ATTRIBUTE_ALIGNED16(btvMzeroMask) = {-0.0f, -0.0f, -0.0f, -0.0f};
const __m128 ATTRIBUTE_ALIGNED16(v1110) = {1.0f, 1.0f, 1.0f, 0.0f};
const __m128 ATTRIBUTE_ALIGNED16(vHalf) = {0.5f, 0.5f, 0.5f, 0.5f};
const __m128 ATTRIBUTE_ALIGNED16(v1_5) = {1.5f, 1.5f, 1.5f, 1.5f};
//const __m128 ATTRIBUTE_ALIGNED16(btvMzeroMask) = {-0.0f, -0.0f, -0.0f, -0.0f};
//const __m128 ATTRIBUTE_ALIGNED16(v1110) = {1.0f, 1.0f, 1.0f, 0.0f};
//const __m128 ATTRIBUTE_ALIGNED16(vHalf) = {0.5f, 0.5f, 0.5f, 0.5f};
//const __m128 ATTRIBUTE_ALIGNED16(v1_5) = {1.5f, 1.5f, 1.5f, 1.5f};
#endif
#ifdef BT_USE_NEON
const float32x4_t ATTRIBUTE_ALIGNED16(btvMzeroMask) = (float32x4_t){-0.0f, -0.0f, -0.0f, -0.0f};
const int32x4_t ATTRIBUTE_ALIGNED16(btvFFF0Mask) = (int32x4_t){0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0};
const int32x4_t ATTRIBUTE_ALIGNED16(btvFFF0Mask) = (int32x4_t){static_cast<int32_t>(0xFFFFFFFF),
static_cast<int32_t>(0xFFFFFFFF), static_cast<int32_t>(0xFFFFFFFF), 0x0};
const int32x4_t ATTRIBUTE_ALIGNED16(btvAbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
const int32x4_t ATTRIBUTE_ALIGNED16(btv3AbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x0};
@ -229,7 +234,7 @@ public:
* @param v The other vector in the dot product */
SIMD_FORCE_INLINE btScalar dot(const btVector3& v) const
{
#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
__m128 vd = _mm_mul_ps(mVec128, v.mVec128);
__m128 z = _mm_movehl_ps(vd, vd);
__m128 y = _mm_shuffle_ps(vd, vd, 0x55);
@ -260,6 +265,12 @@ public:
return btSqrt(length2());
}
/**@brief Return the norm of the vector; this simply forwards to length() */
SIMD_FORCE_INLINE btScalar norm() const
{
	const btScalar len = length();
	return len;
}
/**@brief Return the distance squared between the ends of this and another vector
* This is semantically treating the vector like a point */
SIMD_FORCE_INLINE btScalar distance2(const btVector3& v) const;
@ -285,6 +296,9 @@ public:
* x^2 + y^2 + z^2 = 1 */
SIMD_FORCE_INLINE btVector3& normalize()
{
btAssert(length() != btScalar(0));
#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
// dot product first
__m128 vd = _mm_mul_ps(mVec128, mVec128);
@ -345,7 +359,8 @@ public:
/**@brief Return a vector with the absolute values of each element */
SIMD_FORCE_INLINE btVector3 absolute() const
{
#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
return btVector3(_mm_and_ps(mVec128, btv3AbsfMask));
#elif defined(BT_USE_NEON)
return btVector3(vabsq_f32(mVec128));
@ -400,7 +415,7 @@ public:
SIMD_FORCE_INLINE btScalar triple(const btVector3& v1, const btVector3& v2) const
{
#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
// cross:
__m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, BT_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0)
__m128 V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, BT_SHUFFLE(1, 2, 0, 3)); // (Y Z X 0)
@ -632,7 +647,7 @@ public:
void getSkewSymmetricMatrix(btVector3* v0,btVector3* v1,btVector3* v2) const
{
#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
__m128 V = _mm_and_ps(mVec128, btvFFF0fMask);
__m128 V0 = _mm_xor_ps(btvMzeroMask, V);
@ -702,7 +717,7 @@ public:
/* create a vector as btVector3( this->dot( btVector3 v0 ), this->dot( btVector3 v1), this->dot( btVector3 v2 )) */
SIMD_FORCE_INLINE btVector3 dot3( const btVector3 &v0, const btVector3 &v1, const btVector3 &v2 ) const
{
#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
__m128 a0 = _mm_mul_ps( v0.mVec128, this->mVec128 );
__m128 a1 = _mm_mul_ps( v1.mVec128, this->mVec128 );
@ -717,7 +732,7 @@ public:
return btVector3(r);
#elif defined(BT_USE_NEON)
static const uint32x4_t xyzMask = (const uint32x4_t){ -1, -1, -1, 0 };
static const uint32x4_t xyzMask = (const uint32x4_t){ static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), 0 };
float32x4_t a0 = vmulq_f32( v0.mVec128, this->mVec128);
float32x4_t a1 = vmulq_f32( v1.mVec128, this->mVec128);
float32x4_t a2 = vmulq_f32( v2.mVec128, this->mVec128);
@ -768,7 +783,7 @@ operator*(const btVector3& v1, const btVector3& v2)
SIMD_FORCE_INLINE btVector3
operator-(const btVector3& v1, const btVector3& v2)
{
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
// without _mm_and_ps this code causes slowdown in Concave moving
__m128 r = _mm_sub_ps(v1.mVec128, v2.mVec128);
@ -788,7 +803,7 @@ operator-(const btVector3& v1, const btVector3& v2)
SIMD_FORCE_INLINE btVector3
operator-(const btVector3& v)
{
#if (defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
__m128 r = _mm_xor_ps(v.mVec128, btvMzeroMask);
return btVector3(_mm_and_ps(r, btvFFF0fMask));
#elif defined(BT_USE_NEON)
@ -842,7 +857,7 @@ operator/(const btVector3& v, const btScalar& s)
SIMD_FORCE_INLINE btVector3
operator/(const btVector3& v1, const btVector3& v2)
{
#if (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE))
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE))
__m128 vec = _mm_div_ps(v1.mVec128, v2.mVec128);
vec = _mm_and_ps(vec, btvFFF0fMask);
return btVector3(vec);
@ -935,20 +950,16 @@ SIMD_FORCE_INLINE btScalar btVector3::distance(const btVector3& v) const
// Returns a normalized copy of this vector; the vector itself is not modified.
// With the SSE API enabled the copy is normalized through normalize();
// otherwise the vector is divided by its length().
SIMD_FORCE_INLINE btVector3 btVector3::normalized() const
{
#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
	// local is not called "norm" so it does not hide the norm() member
	btVector3 unit(*this);
	unit.normalize();
	return unit;
#else
	const btScalar len = length();
	return *this / len;
#endif
}
SIMD_FORCE_INLINE btVector3 btVector3::rotate( const btVector3& wAxis, const btScalar _angle ) const
{
// wAxis must be a unit length vector
#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
__m128 O = _mm_mul_ps(wAxis.mVec128, mVec128);
btScalar ssin = btSin( _angle );
@ -988,7 +999,7 @@ SIMD_FORCE_INLINE btVector3 btVector3::rotate( const btVector3& wAxis, const btS
SIMD_FORCE_INLINE long btVector3::maxDot( const btVector3 *array, long array_count, btScalar &dotOut ) const
{
#if defined (BT_USE_SSE) || defined (BT_USE_NEON)
#if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
#if defined _WIN32 || defined (BT_USE_SSE)
const long scalar_cutoff = 10;
long _maxdot_large( const float *array, const float *vec, unsigned long array_count, float *dotOut );
@ -996,10 +1007,8 @@ SIMD_FORCE_INLINE long btVector3::maxDot( const btVector3 *array, long arra
const long scalar_cutoff = 4;
extern long (*_maxdot_large)( const float *array, const float *vec, unsigned long array_count, float *dotOut );
#endif
if( array_count < scalar_cutoff )
#else
#endif//BT_USE_SSE || BT_USE_NEON
if( array_count < scalar_cutoff )
#endif
{
btScalar maxDot = -SIMD_INFINITY;
int i = 0;
@ -1018,14 +1027,14 @@ SIMD_FORCE_INLINE long btVector3::maxDot( const btVector3 *array, long arra
dotOut = maxDot;
return ptIndex;
}
#if defined (BT_USE_SSE) || defined (BT_USE_NEON)
#if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
return _maxdot_large( (float*) array, (float*) &m_floats[0], array_count, &dotOut );
#endif
}
SIMD_FORCE_INLINE long btVector3::minDot( const btVector3 *array, long array_count, btScalar &dotOut ) const
{
#if defined (BT_USE_SSE) || defined (BT_USE_NEON)
#if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
#if defined BT_USE_SSE
const long scalar_cutoff = 10;
long _mindot_large( const float *array, const float *vec, unsigned long array_count, float *dotOut );
@ -1037,7 +1046,7 @@ SIMD_FORCE_INLINE long btVector3::minDot( const btVector3 *array, long arra
#endif
if( array_count < scalar_cutoff )
#endif//BT_USE_SSE || BT_USE_NEON
#endif
{
btScalar minDot = SIMD_INFINITY;
int i = 0;
@ -1058,9 +1067,9 @@ SIMD_FORCE_INLINE long btVector3::minDot( const btVector3 *array, long arra
return ptIndex;
}
#if defined (BT_USE_SSE) || defined (BT_USE_NEON)
#if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
return _mindot_large( (float*) array, (float*) &m_floats[0], array_count, &dotOut );
#endif
#endif//BT_USE_SIMD_VECTOR3
}
@ -1098,7 +1107,7 @@ public:
SIMD_FORCE_INLINE btVector4 absolute4() const
{
#if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
return btVector4(_mm_and_ps(mVec128, btvAbsfMask));
#elif defined(BT_USE_NEON)
return btVector4(vabsq_f32(mVec128));