This repository has been archived on 2024-06-13. You can view files and clone it, but cannot push or open issues or pull requests.
2020-08-04 13:13:01 -04:00

353 lines
12 KiB
C

//========= Copyright Valve Corporation, All rights reserved. ============//
//
// Purpose: - defines SIMD "structure of arrays" classes and functions.
//
//===========================================================================//
#ifndef SSEQUATMATH_H
#define SSEQUATMATH_H
#ifdef _WIN32
#pragma once
#endif
#include "mathlib/ssemath.h"
// Use this #define to allow SSE versions of Quaternion math
// to exist on PC.
// On PC, certain horizontal vector operations are not supported.
// This causes the SSE implementation of quaternion math to mix the
// vector and scalar floating point units, which is extremely
// performance negative if you don't compile to native SSE2 (which
// we don't as of Sept 1, 2007). So, it's best not to allow these
// functions to exist at all. It's not good enough to simply replace
// the contents of the functions with scalar math, because each call
// to LoadAligned and StoreAligned will result in an unnecssary copy
// of the quaternion, and several moves to and from the XMM registers.
//
// Basically, the problem you run into is that for efficient SIMD code,
// you need to load the quaternions and vectors into SIMD registers and
// keep them there as long as possible while doing only SIMD math,
// whereas for efficient scalar code, each time you copy onto or ever
// use a fltx4, it hoses your pipeline. So the difference has to be
// in the management of temporary variables in the calling function,
// not inside the math functions.
//
// If you compile assuming the presence of SSE2, the MSVC will abandon
// the traditional x87 FPU operations altogether and make everything use
// the SSE2 registers, which lessens this problem a little.
// permitted only on 360, as we've done careful tuning on its Altivec math:
#ifdef _X360
#define ALLOW_SIMD_QUATERNION_MATH 1 // not on PC!
#endif
//---------------------------------------------------------------------
// Load/store quaternions
//---------------------------------------------------------------------
#ifndef _X360
#if ALLOW_SIMD_QUATERNION_MATH
// Using STDC or SSE
FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned &pSIMD) {
fltx4 retval = LoadAlignedSIMD(pSIMD.Base());
return retval;
}
FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned *RESTRICT pSIMD) {
fltx4 retval = LoadAlignedSIMD(pSIMD);
return retval;
}
FORCEINLINE void StoreAlignedSIMD(QuaternionAligned *RESTRICT pSIMD,
const fltx4 &a) {
StoreAlignedSIMD(pSIMD->Base(), a);
}
#endif
#else
// for the transitional class -- load a QuaternionAligned
FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned& pSIMD) {
fltx4 retval = XMLoadVector4A(pSIMD.Base());
return retval;
}
FORCEINLINE fltx4 LoadAlignedSIMD(const QuaternionAligned* RESTRICT pSIMD) {
fltx4 retval = XMLoadVector4A(pSIMD);
return retval;
}
FORCEINLINE void StoreAlignedSIMD(QuaternionAligned* RESTRICT pSIMD,
const fltx4& a) {
XMStoreVector4A(pSIMD->Base(), a);
}
#endif
#if ALLOW_SIMD_QUATERNION_MATH
//---------------------------------------------------------------------
// Make sure quaternions are within 180 degrees of one another, if not, reverse
// q
//---------------------------------------------------------------------
FORCEINLINE fltx4 QuaternionAlignSIMD(const fltx4 &p, const fltx4 &q) {
// decide if one of the quaternions is backwards
fltx4 a = SubSIMD(p, q);
fltx4 b = AddSIMD(p, q);
a = Dot4SIMD(a, a);
b = Dot4SIMD(b, b);
fltx4 cmp = CmpGtSIMD(a, b);
fltx4 result = MaskedAssign(cmp, NegSIMD(q), q);
return result;
}
//---------------------------------------------------------------------
// Normalize Quaternion
//---------------------------------------------------------------------
#if USE_STDC_FOR_SIMD
FORCEINLINE fltx4 QuaternionNormalizeSIMD(const fltx4 &q) {
fltx4 radius, result;
radius = Dot4SIMD(q, q);
if (SubFloat(radius,
0)) // > FLT_EPSILON && ((radius < 1.0f - 4*FLT_EPSILON) ||
// (radius > 1.0f + 4*FLT_EPSILON))
{
float iradius = 1.0f / sqrt(SubFloat(radius, 0));
result = ReplicateX4(iradius);
result = MulSIMD(result, q);
return result;
}
return q;
}
#else
// SSE + X360 implementation
FORCEINLINE fltx4 QuaternionNormalizeSIMD(const fltx4 &q) {
fltx4 radius, result, mask;
radius = Dot4SIMD(q, q);
mask = CmpEqSIMD(radius, Four_Zeros); // all ones iff radius = 0
result = ReciprocalSqrtSIMD(radius);
result = MulSIMD(result, q);
return MaskedAssign(mask, q, result); // if radius was 0, just return q
}
#endif
//---------------------------------------------------------------------
// 0.0 returns p, 1.0 return q.
//---------------------------------------------------------------------
FORCEINLINE fltx4 QuaternionBlendNoAlignSIMD(const fltx4 &p, const fltx4 &q,
float t) {
fltx4 sclp, sclq, result;
sclq = ReplicateX4(t);
sclp = SubSIMD(Four_Ones, sclq);
result = MulSIMD(sclp, p);
result = MaddSIMD(sclq, q, result);
return QuaternionNormalizeSIMD(result);
}
//---------------------------------------------------------------------
// Blend Quaternions
//---------------------------------------------------------------------
FORCEINLINE fltx4 QuaternionBlendSIMD(const fltx4 &p, const fltx4 &q, float t) {
// decide if one of the quaternions is backwards
fltx4 q2, result;
q2 = QuaternionAlignSIMD(p, q);
result = QuaternionBlendNoAlignSIMD(p, q2, t);
return result;
}
//---------------------------------------------------------------------
// Multiply Quaternions
//---------------------------------------------------------------------
#ifndef _X360
// SSE and STDC
FORCEINLINE fltx4 QuaternionMultSIMD(const fltx4 &p, const fltx4 &q) {
// decide if one of the quaternions is backwards
fltx4 q2, result;
q2 = QuaternionAlignSIMD(p, q);
SubFloat(result, 0) =
SubFloat(p, 0) * SubFloat(q2, 3) + SubFloat(p, 1) * SubFloat(q2, 2) -
SubFloat(p, 2) * SubFloat(q2, 1) + SubFloat(p, 3) * SubFloat(q2, 0);
SubFloat(result, 1) =
-SubFloat(p, 0) * SubFloat(q2, 2) + SubFloat(p, 1) * SubFloat(q2, 3) +
SubFloat(p, 2) * SubFloat(q2, 0) + SubFloat(p, 3) * SubFloat(q2, 1);
SubFloat(result, 2) =
SubFloat(p, 0) * SubFloat(q2, 1) - SubFloat(p, 1) * SubFloat(q2, 0) +
SubFloat(p, 2) * SubFloat(q2, 3) + SubFloat(p, 3) * SubFloat(q2, 2);
SubFloat(result, 3) =
-SubFloat(p, 0) * SubFloat(q2, 0) - SubFloat(p, 1) * SubFloat(q2, 1) -
SubFloat(p, 2) * SubFloat(q2, 2) + SubFloat(p, 3) * SubFloat(q2, 3);
return result;
}
#else
// X360
extern const fltx4 g_QuatMultRowSign[4];
FORCEINLINE fltx4 QuaternionMultSIMD(const fltx4 &p, const fltx4 &q) {
fltx4 q2, row, result;
q2 = QuaternionAlignSIMD(p, q);
row = XMVectorSwizzle(q2, 3, 2, 1, 0);
row = MulSIMD(row, g_QuatMultRowSign[0]);
result = Dot4SIMD(row, p);
row = XMVectorSwizzle(q2, 2, 3, 0, 1);
row = MulSIMD(row, g_QuatMultRowSign[1]);
row = Dot4SIMD(row, p);
result = __vrlimi(result, row, 4, 0);
row = XMVectorSwizzle(q2, 1, 0, 3, 2);
row = MulSIMD(row, g_QuatMultRowSign[2]);
row = Dot4SIMD(row, p);
result = __vrlimi(result, row, 2, 0);
row = MulSIMD(q2, g_QuatMultRowSign[3]);
row = Dot4SIMD(row, p);
result = __vrlimi(result, row, 1, 0);
return result;
}
#endif
//---------------------------------------------------------------------
// Quaternion scale
//---------------------------------------------------------------------
#ifndef _X360
// SSE and STDC
FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4 &p, float t) {
float r;
fltx4 q;
// FIXME: nick, this isn't overly sensitive to accuracy, and it may be
// faster to use the cos part (w) of the quaternion
// (sin(omega)*N,cos(omega)) to figure the new scale.
float sinom =
sqrt(SubFloat(p, 0) * SubFloat(p, 0) + SubFloat(p, 1) * SubFloat(p, 1) +
SubFloat(p, 2) * SubFloat(p, 2));
sinom = min(sinom, 1.f);
float sinsom = sin(asin(sinom) * t);
t = sinsom / (sinom + FLT_EPSILON);
SubFloat(q, 0) = t * SubFloat(p, 0);
SubFloat(q, 1) = t * SubFloat(p, 1);
SubFloat(q, 2) = t * SubFloat(p, 2);
// rescale rotation
r = 1.0f - sinsom * sinsom;
// Assert( r >= 0 );
if (r < 0.0f) r = 0.0f;
r = sqrt(r);
// keep sign of rotation
SubFloat(q, 3) = fsel(SubFloat(p, 3), r, -r);
return q;
}
#else
// X360
FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4 &p, float t) {
fltx4 sinom = Dot3SIMD(p, p);
sinom = SqrtSIMD(sinom);
sinom = MinSIMD(sinom, Four_Ones);
fltx4 sinsom = ArcSinSIMD(sinom);
fltx4 t4 = ReplicateX4(t);
sinsom = MulSIMD(sinsom, t4);
sinsom = SinSIMD(sinsom);
sinom = AddSIMD(sinom, Four_Epsilons);
sinom = ReciprocalSIMD(sinom);
t4 = MulSIMD(sinsom, sinom);
fltx4 result = MulSIMD(p, t4);
// rescale rotation
sinsom = MulSIMD(sinsom, sinsom);
fltx4 r = SubSIMD(Four_Ones, sinsom);
r = MaxSIMD(r, Four_Zeros);
r = SqrtSIMD(r);
// keep sign of rotation
fltx4 cmp = CmpGeSIMD(p, Four_Zeros);
r = MaskedAssign(cmp, r, NegSIMD(r));
result = __vrlimi(result, r, 1, 0);
return result;
}
#endif
//-----------------------------------------------------------------------------
// Quaternion sphereical linear interpolation
//-----------------------------------------------------------------------------
#ifndef _X360
// SSE and STDC
FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD(const fltx4 &p, const fltx4 &q,
float t) {
float omega, cosom, sinom, sclp, sclq;
fltx4 result;
// 0.0 returns p, 1.0 return q.
cosom = SubFloat(p, 0) * SubFloat(q, 0) + SubFloat(p, 1) * SubFloat(q, 1) +
SubFloat(p, 2) * SubFloat(q, 2) + SubFloat(p, 3) * SubFloat(q, 3);
if ((1.0f + cosom) > 0.000001f) {
if ((1.0f - cosom) > 0.000001f) {
omega = acos(cosom);
sinom = sin(omega);
sclp = sin((1.0f - t) * omega) / sinom;
sclq = sin(t * omega) / sinom;
} else {
// TODO: add short circuit for cosom == 1.0f?
sclp = 1.0f - t;
sclq = t;
}
SubFloat(result, 0) = sclp * SubFloat(p, 0) + sclq * SubFloat(q, 0);
SubFloat(result, 1) = sclp * SubFloat(p, 1) + sclq * SubFloat(q, 1);
SubFloat(result, 2) = sclp * SubFloat(p, 2) + sclq * SubFloat(q, 2);
SubFloat(result, 3) = sclp * SubFloat(p, 3) + sclq * SubFloat(q, 3);
} else {
SubFloat(result, 0) = -SubFloat(q, 1);
SubFloat(result, 1) = SubFloat(q, 0);
SubFloat(result, 2) = -SubFloat(q, 3);
SubFloat(result, 3) = SubFloat(q, 2);
sclp = sin((1.0f - t) * (0.5f * M_PI));
sclq = sin(t * (0.5f * M_PI));
SubFloat(result, 0) =
sclp * SubFloat(p, 0) + sclq * SubFloat(result, 0);
SubFloat(result, 1) =
sclp * SubFloat(p, 1) + sclq * SubFloat(result, 1);
SubFloat(result, 2) =
sclp * SubFloat(p, 2) + sclq * SubFloat(result, 2);
}
return result;
}
#else
// X360
FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD(const fltx4 &p, const fltx4 &q,
float t) {
return XMQuaternionSlerp(p, q, t);
}
#endif
FORCEINLINE fltx4 QuaternionSlerpSIMD(const fltx4 &p, const fltx4 &q, float t) {
fltx4 q2, result;
q2 = QuaternionAlignSIMD(p, q);
result = QuaternionSlerpNoAlignSIMD(p, q2, t);
return result;
}
#endif // ALLOW_SIMD_QUATERNION_MATH
#endif // SSEQUATMATH_H