353 lines
12 KiB
C
353 lines
12 KiB
C
//========= Copyright Valve Corporation, All rights reserved. ============//
|
|
//
|
|
// Purpose: - defines SIMD "structure of arrays" classes and functions.
|
|
//
|
|
//===========================================================================//
|
|
#ifndef SSEQUATMATH_H
|
|
#define SSEQUATMATH_H
|
|
|
|
#ifdef _WIN32
|
|
#pragma once
|
|
#endif
|
|
|
|
#include "mathlib/ssemath.h"
|
|
|
|
// Use this #define to allow SSE versions of Quaternion math
|
|
// to exist on PC.
|
|
// On PC, certain horizontal vector operations are not supported.
|
|
// This causes the SSE implementation of quaternion math to mix the
|
|
// vector and scalar floating point units, which is extremely
|
|
// performance negative if you don't compile to native SSE2 (which
|
|
// we don't as of Sept 1, 2007). So, it's best not to allow these
|
|
// functions to exist at all. It's not good enough to simply replace
|
|
// the contents of the functions with scalar math, because each call
|
|
// to LoadAligned and StoreAligned will result in an unnecessary copy
|
|
// of the quaternion, and several moves to and from the XMM registers.
|
|
//
|
|
// Basically, the problem you run into is that for efficient SIMD code,
|
|
// you need to load the quaternions and vectors into SIMD registers and
|
|
// keep them there as long as possible while doing only SIMD math,
|
|
// whereas for efficient scalar code, each time you copy onto or ever
|
|
// use a fltx4, it hoses your pipeline. So the difference has to be
|
|
// in the management of temporary variables in the calling function,
|
|
// not inside the math functions.
|
|
//
|
|
// If you compile assuming the presence of SSE2, the MSVC will abandon
|
|
// the traditional x87 FPU operations altogether and make everything use
|
|
// the SSE2 registers, which lessens this problem a little.
|
|
|
|
// permitted only on 360, as we've done careful tuning on its Altivec math:
|
|
#ifdef _X360
|
|
#define ALLOW_SIMD_QUATERNION_MATH 1 // not on PC!
|
|
#endif
|
|
|
|
//---------------------------------------------------------------------
|
|
// Load/store quaternions
|
|
//---------------------------------------------------------------------
|
|
#ifndef _X360
|
|
#if ALLOW_SIMD_QUATERNION_MATH
|
|
// Using STDC or SSE
|
|
// Loads a QuaternionAligned, passed by reference, as an fltx4.
// The quaternion's Base() storage is handed to another LoadAlignedSIMD
// overload (declared outside this file) and its result is returned as-is.
FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned &pSIMD) {
  return LoadAlignedSIMD(pSIMD.Base());
}
|
|
|
|
// Loads a QuaternionAligned, passed by pointer, as an fltx4.
//
// pSIMD must be non-null; it is dereferenced unconditionally.
//
// The quaternion's Base() storage is forwarded to the raw-storage
// LoadAlignedSIMD overload, exactly as the by-reference overload and
// StoreAlignedSIMD do. Forwarding pSIMD itself would be wrong: an argument of
// type `const QuaternionAligned *` is an exact match for THIS overload, so the
// call would resolve back to this function and recurse without end.
FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned *RESTRICT pSIMD) {
  return LoadAlignedSIMD(pSIMD->Base());
}
|
|
|
|
// Stores the fltx4 `a` into the QuaternionAligned pointed to by pSIMD.
// pSIMD is dereferenced unconditionally; its Base() storage is passed to
// another StoreAlignedSIMD overload (declared outside this file).
FORCEINLINE void StoreAlignedSIMD(QuaternionAligned *RESTRICT pSIMD,
                                  const fltx4 &a) {
  StoreAlignedSIMD((*pSIMD).Base(), a);
}
|
|
#endif
|
|
#else
|
|
|
|
// for the transitional class -- load a QuaternionAligned
|
|
// X360 build: loads a QuaternionAligned, passed by reference, as an fltx4
// by giving its Base() storage to XMLoadVector4A.
FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned& pSIMD) {
  return XMLoadVector4A(pSIMD.Base());
}
|
|
|
|
// X360 build: loads a QuaternionAligned, passed by pointer, as an fltx4.
// The object pointer itself (not Base()) is handed to XMLoadVector4A.
// NOTE(review): this presumably relies on the quaternion's four floats sitting
// at the start of the object -- confirm against the QuaternionAligned layout.
FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned* RESTRICT pSIMD) {
  return XMLoadVector4A(pSIMD);
}
|
|
|
|
// X360 build: stores the fltx4 `a` into the QuaternionAligned pointed to by
// pSIMD, writing through its Base() storage with XMStoreVector4A.
// pSIMD is dereferenced unconditionally.
FORCEINLINE void StoreAlignedSIMD(QuaternionAligned* RESTRICT pSIMD,
                                  const fltx4& a) {
  XMStoreVector4A((*pSIMD).Base(), a);
}
|
|
|
|
#endif
|
|
|
|
#if ALLOW_SIMD_QUATERNION_MATH
|
|
//---------------------------------------------------------------------
|
|
// Make sure quaternions are within 180 degrees of one another, if not, reverse
|
|
// q
|
|
//---------------------------------------------------------------------
|
|
// Returns q, or -q when q is "backwards" relative to p.
//
// The test compares the 4-component dot product of (p - q) with itself
// against that of (p + q) with itself: when the difference is the larger of
// the two, the negated q is selected; otherwise q is returned unchanged.
FORCEINLINE fltx4 QuaternionAlignSIMD(const fltx4 &p, const fltx4 &q) {
  const fltx4 diff = SubSIMD(p, q);
  const fltx4 sum = AddSIMD(p, q);

  const fltx4 diffDot = Dot4SIMD(diff, diff);
  const fltx4 sumDot = Dot4SIMD(sum, sum);

  // Mask is set where (p - q).(p - q) > (p + q).(p + q).
  const fltx4 flipMask = CmpGtSIMD(diffDot, sumDot);
  return MaskedAssign(flipMask, NegSIMD(q), q);
}
|
|
|
|
//---------------------------------------------------------------------
|
|
// Normalize Quaternion
|
|
//---------------------------------------------------------------------
|
|
#if USE_STDC_FOR_SIMD
|
|
|
|
// Scalar (USE_STDC_FOR_SIMD) quaternion normalize.
//
// Computes the 4-component dot product of q with itself; when its first
// component is exactly zero, q is returned untouched. Otherwise every
// component of q is multiplied by 1 / sqrt(dot).
// Note that no epsilon test is applied: any non-zero dot product (including
// one that is already 1) takes the rescale path.
FORCEINLINE fltx4 QuaternionNormalizeSIMD(const fltx4 &q) {
  fltx4 lenSq = Dot4SIMD(q, q);

  // Nothing to scale by -- hand the input back unchanged.
  if (!SubFloat(lenSq, 0)) {
    return q;
  }

  const float invLen = 1.0f / sqrt(SubFloat(lenSq, 0));
  return MulSIMD(ReplicateX4(invLen), q);
}
|
|
|
|
#else
|
|
|
|
// SSE + X360 implementation
|
|
// Branch-free quaternion normalize (SSE and X360 builds).
//
// q is multiplied by the reciprocal square root of its 4-component dot
// product with itself. Where that dot product compares equal to zero, the
// original q is selected instead of the scaled value.
FORCEINLINE fltx4 QuaternionNormalizeSIMD(const fltx4 &q) {
  const fltx4 lenSq = Dot4SIMD(q, q);
  const fltx4 zeroMask = CmpEqSIMD(lenSq, Four_Zeros);  // set iff lenSq == 0
  const fltx4 scaled = MulSIMD(ReciprocalSqrtSIMD(lenSq), q);

  // Keep q where the length was zero, otherwise take the normalized value.
  return MaskedAssign(zeroMask, q, scaled);
}
|
|
|
|
#endif
|
|
|
|
//---------------------------------------------------------------------
|
|
// 0.0 returns p, 1.0 returns q.
|
|
//---------------------------------------------------------------------
|
|
// Normalized linear blend of two quaternions, without aligning q to p first.
//
// Computes (1 - t) * p + t * q component-wise and passes the sum through
// QuaternionNormalizeSIMD. t = 0 weights p fully, t = 1 weights q fully.
FORCEINLINE fltx4 QuaternionBlendNoAlignSIMD(const fltx4 &p, const fltx4 &q,
                                             float t) {
  const fltx4 weightQ = ReplicateX4(t);
  const fltx4 weightP = SubSIMD(Four_Ones, weightQ);

  // weightQ * q + (weightP * p)
  const fltx4 mixed = MaddSIMD(weightQ, q, MulSIMD(weightP, p));
  return QuaternionNormalizeSIMD(mixed);
}
|
|
|
|
//---------------------------------------------------------------------
|
|
// Blend Quaternions
|
|
//---------------------------------------------------------------------
|
|
// Normalized linear blend of p and q.
// q is first passed through QuaternionAlignSIMD (which may negate it relative
// to p); the aligned value is then blended by QuaternionBlendNoAlignSIMD.
FORCEINLINE fltx4 QuaternionBlendSIMD(const fltx4 &p, const fltx4 &q, float t) {
  const fltx4 alignedQ = QuaternionAlignSIMD(p, q);
  return QuaternionBlendNoAlignSIMD(p, alignedQ, t);
}
|
|
|
|
//---------------------------------------------------------------------
|
|
// Multiply Quaternions
|
|
//---------------------------------------------------------------------
|
|
#ifndef _X360
|
|
|
|
// SSE and STDC
|
|
// Quaternion product of p and q, computed with scalar math (SSE/STDC builds).
//
// q is first passed through QuaternionAlignSIMD (which may negate it relative
// to p). Components are read and written with SubFloat using indices 0..3;
// the sign pattern of the w row (index 3) indicates the layout is (x, y, z, w).
FORCEINLINE fltx4 QuaternionMultSIMD(const fltx4 &p, const fltx4 &q) {
  const fltx4 q2 = QuaternionAlignSIMD(p, q);

  const float px = SubFloat(p, 0);
  const float py = SubFloat(p, 1);
  const float pz = SubFloat(p, 2);
  const float pw = SubFloat(p, 3);

  const float qx = SubFloat(q2, 0);
  const float qy = SubFloat(q2, 1);
  const float qz = SubFloat(q2, 2);
  const float qw = SubFloat(q2, 3);

  fltx4 result;
  SubFloat(result, 0) = px * qw + py * qz - pz * qy + pw * qx;
  SubFloat(result, 1) = -px * qz + py * qw + pz * qx + pw * qy;
  SubFloat(result, 2) = px * qy - py * qx + pz * qw + pw * qz;
  SubFloat(result, 3) = -px * qx - py * qy - pz * qz + pw * qw;
  return result;
}
|
|
|
|
#else
|
|
|
|
// X360
|
|
extern const fltx4 g_QuatMultRowSign[4];
|
|
FORCEINLINE fltx4 QuaternionMultSIMD(const fltx4 &p, const fltx4 &q) {
|
|
fltx4 q2, row, result;
|
|
q2 = QuaternionAlignSIMD(p, q);
|
|
|
|
row = XMVectorSwizzle(q2, 3, 2, 1, 0);
|
|
row = MulSIMD(row, g_QuatMultRowSign[0]);
|
|
result = Dot4SIMD(row, p);
|
|
|
|
row = XMVectorSwizzle(q2, 2, 3, 0, 1);
|
|
row = MulSIMD(row, g_QuatMultRowSign[1]);
|
|
row = Dot4SIMD(row, p);
|
|
result = __vrlimi(result, row, 4, 0);
|
|
|
|
row = XMVectorSwizzle(q2, 1, 0, 3, 2);
|
|
row = MulSIMD(row, g_QuatMultRowSign[2]);
|
|
row = Dot4SIMD(row, p);
|
|
result = __vrlimi(result, row, 2, 0);
|
|
|
|
row = MulSIMD(q2, g_QuatMultRowSign[3]);
|
|
row = Dot4SIMD(row, p);
|
|
result = __vrlimi(result, row, 1, 0);
|
|
return result;
|
|
}
|
|
|
|
#endif
|
|
|
|
//---------------------------------------------------------------------
|
|
// Quaternion scale
|
|
//---------------------------------------------------------------------
|
|
#ifndef _X360
|
|
|
|
// SSE and STDC
|
|
// Scales the rotation stored in quaternion p by the factor t, using scalar
// math (SSE/STDC builds).
//
// With sinom = min(|p.xyz|, 1), the new vector part is
// p.xyz * sin(asin(sinom) * t) / (sinom + FLT_EPSILON), and the new w is
// sqrt(1 - sin(...)^2), clamped so the sqrt argument is never negative.
// The sign of w is chosen by fsel on the original p.w.
//
// FIXME (carried over from the original): this isn't overly sensitive to
// accuracy, and it may be faster to use the cos part (w) of the quaternion
// (sin(omega)*N, cos(omega)) to figure the new scale.
FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4 &p, float t) {
  const float px = SubFloat(p, 0);
  const float py = SubFloat(p, 1);
  const float pz = SubFloat(p, 2);

  float sinom = sqrt(px * px + py * py + pz * pz);
  sinom = min(sinom, 1.f);

  const float sinsom = sin(asin(sinom) * t);

  // FLT_EPSILON keeps the division finite when the vector part is zero.
  const float scale = sinsom / (sinom + FLT_EPSILON);

  fltx4 q;
  SubFloat(q, 0) = scale * px;
  SubFloat(q, 1) = scale * py;
  SubFloat(q, 2) = scale * pz;

  // Rescale rotation: rebuild w from the scaled sine, never letting the
  // sqrt argument go below zero.
  float wSq = 1.0f - sinsom * sinsom;
  if (wSq < 0.0f) {
    wSq = 0.0f;
  }
  const float w = sqrt(wSq);

  // Keep sign of rotation.
  SubFloat(q, 3) = fsel(SubFloat(p, 3), w, -w);
  return q;
}
|
|
|
|
#else
|
|
|
|
// X360
|
|
// Scales the rotation stored in quaternion p by the factor t (X360 build).
//
// With sinom = min(sqrt(Dot3(p, p)), 1), p is multiplied by
// sin(asin(sinom) * t) / (sinom + epsilon). A replacement last component is
// computed as sqrt(max(1 - sin(...)^2, 0)), negated in lanes where p < 0, and
// merged into the result with __vrlimi using mask 1.
// NOTE(review): mask 1 presumably selects the w lane -- confirm against the
// VMX128 intrinsic documentation.
FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4 &p, float t) {
  const fltx4 sinOmega = MinSIMD(SqrtSIMD(Dot3SIMD(p, p)), Four_Ones);

  const fltx4 angle = ArcSinSIMD(sinOmega);
  const fltx4 sinScaled = SinSIMD(MulSIMD(angle, ReplicateX4(t)));

  // sinScaled / (sinOmega + epsilon), done as a multiply by the reciprocal.
  const fltx4 invSinOmega = ReciprocalSIMD(AddSIMD(sinOmega, Four_Epsilons));
  const fltx4 scale = MulSIMD(sinScaled, invSinOmega);
  fltx4 result = MulSIMD(p, scale);

  // Rescale rotation: sqrt(max(1 - sinScaled^2, 0)).
  const fltx4 oneMinusSq = SubSIMD(Four_Ones, MulSIMD(sinScaled, sinScaled));
  const fltx4 magnitude = SqrtSIMD(MaxSIMD(oneMinusSq, Four_Zeros));

  // Keep sign of rotation: positive where p >= 0, negative elsewhere.
  const fltx4 nonNegative = CmpGeSIMD(p, Four_Zeros);
  const fltx4 signedMag = MaskedAssign(nonNegative, magnitude, NegSIMD(magnitude));

  result = __vrlimi(result, signedMag, 1, 0);
  return result;
}
|
|
|
|
#endif
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Quaternion spherical linear interpolation
|
|
//-----------------------------------------------------------------------------
|
|
#ifndef _X360
|
|
|
|
// SSE and STDC
|
|
// Spherical linear interpolation between p and q using scalar math
// (SSE/STDC builds). q is NOT aligned to p first.
// t = 0.0 returns p, t = 1.0 returns q.
//
// Three cases, chosen from cosom = dot4(p, q):
//  * 1 + cosom is not above 1e-6 (p and q nearly opposite): q is replaced by
//    the permuted vector (-q[1], q[0], -q[3], q[2]) and the first three
//    components are blended with quarter-turn sine weights. The fourth
//    component is left as q[2], unblended.
//  * 1 - cosom is above 1e-6: standard slerp weights sin((1-t)w)/sin(w) and
//    sin(t w)/sin(w), with w = acos(cosom).
//  * otherwise (p and q nearly identical): plain linear weights (1 - t, t).
FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD(const fltx4 &p, const fltx4 &q,
                                             float t) {
  fltx4 result;

  const float cosom =
      SubFloat(p, 0) * SubFloat(q, 0) + SubFloat(p, 1) * SubFloat(q, 1) +
      SubFloat(p, 2) * SubFloat(q, 2) + SubFloat(p, 3) * SubFloat(q, 3);

  // Written as a negated '>' so a NaN cosom lands in this branch, as before.
  if (!((1.0f + cosom) > 0.000001f)) {
    const float perp0 = -SubFloat(q, 1);
    const float perp1 = SubFloat(q, 0);
    const float perp2 = -SubFloat(q, 3);
    const float perp3 = SubFloat(q, 2);

    const float weightP = sin((1.0f - t) * (0.5f * M_PI));
    const float weightQ = sin(t * (0.5f * M_PI));

    SubFloat(result, 0) = weightP * SubFloat(p, 0) + weightQ * perp0;
    SubFloat(result, 1) = weightP * SubFloat(p, 1) + weightQ * perp1;
    SubFloat(result, 2) = weightP * SubFloat(p, 2) + weightQ * perp2;
    SubFloat(result, 3) = perp3;  // fourth component is not blended
    return result;
  }

  float weightP;
  float weightQ;
  if ((1.0f - cosom) > 0.000001f) {
    const float omega = acos(cosom);
    const float sinom = sin(omega);
    weightP = sin((1.0f - t) * omega) / sinom;
    weightQ = sin(t * omega) / sinom;
  } else {
    // TODO: add short circuit for cosom == 1.0f?
    weightP = 1.0f - t;
    weightQ = t;
  }

  for (int i = 0; i < 4; ++i) {
    SubFloat(result, i) = weightP * SubFloat(p, i) + weightQ * SubFloat(q, i);
  }
  return result;
}
|
|
|
|
#else
|
|
|
|
// X360
|
|
// Spherical linear interpolation between p and q (X360 build).
// q is NOT aligned to p first; the work is delegated entirely to
// XMQuaternionSlerp.
FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD(const fltx4 &p, const fltx4 &q,
                                             float t) {
  const fltx4 interpolated = XMQuaternionSlerp(p, q, t);
  return interpolated;
}
|
|
|
|
#endif
|
|
|
|
// Spherical linear interpolation between p and q.
// q is first passed through QuaternionAlignSIMD (which may negate it relative
// to p); the aligned value is then interpolated by QuaternionSlerpNoAlignSIMD.
FORCEINLINE fltx4 QuaternionSlerpSIMD(const fltx4 &p, const fltx4 &q, float t) {
  const fltx4 alignedQ = QuaternionAlignSIMD(p, q);
  return QuaternionSlerpNoAlignSIMD(p, alignedQ, t);
}
|
|
|
|
#endif // ALLOW_SIMD_QUATERNION_MATH
|
|
|
|
#endif // SSEQUATMATH_H
|