//========= Copyright Valve Corporation, All rights reserved. ============// // // Purpose: - defines SIMD "structure of arrays" classes and functions. // //===========================================================================// #ifndef SSEQUATMATH_H #define SSEQUATMATH_H #ifdef _WIN32 #pragma once #endif #include "mathlib/ssemath.h" // Use this #define to allow SSE versions of Quaternion math // to exist on PC. // On PC, certain horizontal vector operations are not supported. // This causes the SSE implementation of quaternion math to mix the // vector and scalar floating point units, which is extremely // performance negative if you don't compile to native SSE2 (which // we don't as of Sept 1, 2007). So, it's best not to allow these // functions to exist at all. It's not good enough to simply replace // the contents of the functions with scalar math, because each call // to LoadAligned and StoreAligned will result in an unnecssary copy // of the quaternion, and several moves to and from the XMM registers. // // Basically, the problem you run into is that for efficient SIMD code, // you need to load the quaternions and vectors into SIMD registers and // keep them there as long as possible while doing only SIMD math, // whereas for efficient scalar code, each time you copy onto or ever // use a fltx4, it hoses your pipeline. So the difference has to be // in the management of temporary variables in the calling function, // not inside the math functions. // // If you compile assuming the presence of SSE2, the MSVC will abandon // the traditional x87 FPU operations altogether and make everything use // the SSE2 registers, which lessens this problem a little. // permitted only on 360, as we've done careful tuning on its Altivec math: #ifdef _X360 #define ALLOW_SIMD_QUATERNION_MATH 1 // not on PC! #endif //--------------------------------------------------------------------- // Load/store quaternions //--------------------------------------------------------------------- #ifndef _X360 #if ALLOW_SIMD_QUATERNION_MATH // Using STDC or SSE FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned &pSIMD) { fltx4 retval = LoadAlignedSIMD(pSIMD.Base()); return retval; } FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned *RESTRICT pSIMD) { fltx4 retval = LoadAlignedSIMD(pSIMD); return retval; } FORCEINLINE void StoreAlignedSIMD(QuaternionAligned *RESTRICT pSIMD, const fltx4 &a) { StoreAlignedSIMD(pSIMD->Base(), a); } #endif #else // for the transitional class -- load a QuaternionAligned FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned& pSIMD) { fltx4 retval = XMLoadVector4A(pSIMD.Base()); return retval; } FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned* RESTRICT pSIMD) { fltx4 retval = XMLoadVector4A(pSIMD); return retval; } FORCEINLINE void StoreAlignedSIMD(QuaternionAligned* RESTRICT pSIMD, const fltx4& a) { XMStoreVector4A(pSIMD->Base(), a); } #endif #if ALLOW_SIMD_QUATERNION_MATH //--------------------------------------------------------------------- // Make sure quaternions are within 180 degrees of one another, if not, reverse // q //--------------------------------------------------------------------- FORCEINLINE fltx4 QuaternionAlignSIMD(const fltx4 &p, const fltx4 &q) { // decide if one of the quaternions is backwards fltx4 a = SubSIMD(p, q); fltx4 b = AddSIMD(p, q); a = Dot4SIMD(a, a); b = Dot4SIMD(b, b); fltx4 cmp = CmpGtSIMD(a, b); fltx4 result = MaskedAssign(cmp, NegSIMD(q), q); return result; } //--------------------------------------------------------------------- // Normalize Quaternion //--------------------------------------------------------------------- #if USE_STDC_FOR_SIMD FORCEINLINE fltx4 QuaternionNormalizeSIMD(const fltx4 &q) { fltx4 radius, result; radius = Dot4SIMD(q, q); if (SubFloat(radius, 0)) // > FLT_EPSILON && ((radius < 1.0f - 4*FLT_EPSILON) || // (radius > 1.0f + 4*FLT_EPSILON)) { float iradius = 1.0f / sqrt(SubFloat(radius, 0)); result = ReplicateX4(iradius); result = MulSIMD(result, q); return result; } return q; } #else // SSE + X360 implementation FORCEINLINE fltx4 QuaternionNormalizeSIMD(const fltx4 &q) { fltx4 radius, result, mask; radius = Dot4SIMD(q, q); mask = CmpEqSIMD(radius, Four_Zeros); // all ones iff radius = 0 result = ReciprocalSqrtSIMD(radius); result = MulSIMD(result, q); return MaskedAssign(mask, q, result); // if radius was 0, just return q } #endif //--------------------------------------------------------------------- // 0.0 returns p, 1.0 return q. //--------------------------------------------------------------------- FORCEINLINE fltx4 QuaternionBlendNoAlignSIMD(const fltx4 &p, const fltx4 &q, float t) { fltx4 sclp, sclq, result; sclq = ReplicateX4(t); sclp = SubSIMD(Four_Ones, sclq); result = MulSIMD(sclp, p); result = MaddSIMD(sclq, q, result); return QuaternionNormalizeSIMD(result); } //--------------------------------------------------------------------- // Blend Quaternions //--------------------------------------------------------------------- FORCEINLINE fltx4 QuaternionBlendSIMD(const fltx4 &p, const fltx4 &q, float t) { // decide if one of the quaternions is backwards fltx4 q2, result; q2 = QuaternionAlignSIMD(p, q); result = QuaternionBlendNoAlignSIMD(p, q2, t); return result; } //--------------------------------------------------------------------- // Multiply Quaternions //--------------------------------------------------------------------- #ifndef _X360 // SSE and STDC FORCEINLINE fltx4 QuaternionMultSIMD(const fltx4 &p, const fltx4 &q) { // decide if one of the quaternions is backwards fltx4 q2, result; q2 = QuaternionAlignSIMD(p, q); SubFloat(result, 0) = SubFloat(p, 0) * SubFloat(q2, 3) + SubFloat(p, 1) * SubFloat(q2, 2) - SubFloat(p, 2) * SubFloat(q2, 1) + SubFloat(p, 3) * SubFloat(q2, 0); SubFloat(result, 1) = -SubFloat(p, 0) * SubFloat(q2, 2) + SubFloat(p, 1) * SubFloat(q2, 3) + SubFloat(p, 2) * SubFloat(q2, 0) + SubFloat(p, 3) * SubFloat(q2, 1); SubFloat(result, 2) = SubFloat(p, 0) * SubFloat(q2, 1) - SubFloat(p, 1) * SubFloat(q2, 0) + SubFloat(p, 2) * SubFloat(q2, 3) + SubFloat(p, 3) * SubFloat(q2, 2); SubFloat(result, 3) = -SubFloat(p, 0) * SubFloat(q2, 0) - SubFloat(p, 1) * SubFloat(q2, 1) - SubFloat(p, 2) * SubFloat(q2, 2) + SubFloat(p, 3) * SubFloat(q2, 3); return result; } #else // X360 extern const fltx4 g_QuatMultRowSign[4]; FORCEINLINE fltx4 QuaternionMultSIMD(const fltx4 &p, const fltx4 &q) { fltx4 q2, row, result; q2 = QuaternionAlignSIMD(p, q); row = XMVectorSwizzle(q2, 3, 2, 1, 0); row = MulSIMD(row, g_QuatMultRowSign[0]); result = Dot4SIMD(row, p); row = XMVectorSwizzle(q2, 2, 3, 0, 1); row = MulSIMD(row, g_QuatMultRowSign[1]); row = Dot4SIMD(row, p); result = __vrlimi(result, row, 4, 0); row = XMVectorSwizzle(q2, 1, 0, 3, 2); row = MulSIMD(row, g_QuatMultRowSign[2]); row = Dot4SIMD(row, p); result = __vrlimi(result, row, 2, 0); row = MulSIMD(q2, g_QuatMultRowSign[3]); row = Dot4SIMD(row, p); result = __vrlimi(result, row, 1, 0); return result; } #endif //--------------------------------------------------------------------- // Quaternion scale //--------------------------------------------------------------------- #ifndef _X360 // SSE and STDC FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4 &p, float t) { float r; fltx4 q; // FIXME: nick, this isn't overly sensitive to accuracy, and it may be // faster to use the cos part (w) of the quaternion // (sin(omega)*N,cos(omega)) to figure the new scale. float sinom = sqrt(SubFloat(p, 0) * SubFloat(p, 0) + SubFloat(p, 1) * SubFloat(p, 1) + SubFloat(p, 2) * SubFloat(p, 2)); sinom = min(sinom, 1.f); float sinsom = sin(asin(sinom) * t); t = sinsom / (sinom + FLT_EPSILON); SubFloat(q, 0) = t * SubFloat(p, 0); SubFloat(q, 1) = t * SubFloat(p, 1); SubFloat(q, 2) = t * SubFloat(p, 2); // rescale rotation r = 1.0f - sinsom * sinsom; // Assert( r >= 0 ); if (r < 0.0f) r = 0.0f; r = sqrt(r); // keep sign of rotation SubFloat(q, 3) = fsel(SubFloat(p, 3), r, -r); return q; } #else // X360 FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4 &p, float t) { fltx4 sinom = Dot3SIMD(p, p); sinom = SqrtSIMD(sinom); sinom = MinSIMD(sinom, Four_Ones); fltx4 sinsom = ArcSinSIMD(sinom); fltx4 t4 = ReplicateX4(t); sinsom = MulSIMD(sinsom, t4); sinsom = SinSIMD(sinsom); sinom = AddSIMD(sinom, Four_Epsilons); sinom = ReciprocalSIMD(sinom); t4 = MulSIMD(sinsom, sinom); fltx4 result = MulSIMD(p, t4); // rescale rotation sinsom = MulSIMD(sinsom, sinsom); fltx4 r = SubSIMD(Four_Ones, sinsom); r = MaxSIMD(r, Four_Zeros); r = SqrtSIMD(r); // keep sign of rotation fltx4 cmp = CmpGeSIMD(p, Four_Zeros); r = MaskedAssign(cmp, r, NegSIMD(r)); result = __vrlimi(result, r, 1, 0); return result; } #endif //----------------------------------------------------------------------------- // Quaternion sphereical linear interpolation //----------------------------------------------------------------------------- #ifndef _X360 // SSE and STDC FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD(const fltx4 &p, const fltx4 &q, float t) { float omega, cosom, sinom, sclp, sclq; fltx4 result; // 0.0 returns p, 1.0 return q. cosom = SubFloat(p, 0) * SubFloat(q, 0) + SubFloat(p, 1) * SubFloat(q, 1) + SubFloat(p, 2) * SubFloat(q, 2) + SubFloat(p, 3) * SubFloat(q, 3); if ((1.0f + cosom) > 0.000001f) { if ((1.0f - cosom) > 0.000001f) { omega = acos(cosom); sinom = sin(omega); sclp = sin((1.0f - t) * omega) / sinom; sclq = sin(t * omega) / sinom; } else { // TODO: add short circuit for cosom == 1.0f? sclp = 1.0f - t; sclq = t; } SubFloat(result, 0) = sclp * SubFloat(p, 0) + sclq * SubFloat(q, 0); SubFloat(result, 1) = sclp * SubFloat(p, 1) + sclq * SubFloat(q, 1); SubFloat(result, 2) = sclp * SubFloat(p, 2) + sclq * SubFloat(q, 2); SubFloat(result, 3) = sclp * SubFloat(p, 3) + sclq * SubFloat(q, 3); } else { SubFloat(result, 0) = -SubFloat(q, 1); SubFloat(result, 1) = SubFloat(q, 0); SubFloat(result, 2) = -SubFloat(q, 3); SubFloat(result, 3) = SubFloat(q, 2); sclp = sin((1.0f - t) * (0.5f * M_PI)); sclq = sin(t * (0.5f * M_PI)); SubFloat(result, 0) = sclp * SubFloat(p, 0) + sclq * SubFloat(result, 0); SubFloat(result, 1) = sclp * SubFloat(p, 1) + sclq * SubFloat(result, 1); SubFloat(result, 2) = sclp * SubFloat(p, 2) + sclq * SubFloat(result, 2); } return result; } #else // X360 FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD(const fltx4 &p, const fltx4 &q, float t) { return XMQuaternionSlerp(p, q, t); } #endif FORCEINLINE fltx4 QuaternionSlerpSIMD(const fltx4 &p, const fltx4 &q, float t) { fltx4 q2, result; q2 = QuaternionAlignSIMD(p, q); result = QuaternionSlerpNoAlignSIMD(p, q2, t); return result; } #endif // ALLOW_SIMD_QUATERNION_MATH #endif // SSEQUATMATH_H