//========= Copyright Valve Corporation, All rights reserved. ============//
//
// Purpose: - defines SIMD "structure of arrays" classes and functions.
//
//===========================================================================//
#ifndef SSEMATH_H
#define SSEMATH_H

#if defined(_X360)
#include <xboxmath.h>
#else
#include <xmmintrin.h>
#endif

#include "mathlib.h"
#include "vector.h"

#if defined(GNUC)
#define USE_STDC_FOR_SIMD 0
#else
#define USE_STDC_FOR_SIMD 0
#endif

#if (!defined(_X360) && (USE_STDC_FOR_SIMD == 0))
#define _SSE1 1
#endif

// I thought about defining a class/union for the SIMD packed floats instead of
// using fltx4, but decided against it because (a) the nature of SIMD code which
// includes comparisons is to blur the relationship between packed floats and
// packed integer types and (b) not sure that the compiler would handle
// generating good code for the intrinsics.

#if USE_STDC_FOR_SIMD

typedef union {
    float m128_f32[4];
    uint32 m128_u32[4];
} fltx4;

typedef fltx4 i32x4;
typedef fltx4 u32x4;

#elif (defined(_X360))

typedef union {
    // This union allows float/int access (which generally shouldn't be done in
    // inner loops)
    __vector4 vmx;
    float m128_f32[4];
    uint32 m128_u32[4];
} fltx4_union;

typedef __vector4 fltx4;
typedef __vector4 i32x4;  // a VMX register; just a way of making it explicit
                          // that we're doing integer ops.
typedef __vector4 u32x4;  // a VMX register; just a way of making it explicit
                          // that we're doing unsigned integer ops.

#else

typedef __m128 fltx4;
typedef __m128 i32x4;
typedef __m128 u32x4;

#endif

// The FLTX4 type is a fltx4 used as a parameter to a function.
// On the 360, the best way to do this is pass-by-copy on the registers.
// On the PC, the best way is to pass by const reference.
// The compiler will sometimes, but not always, replace a pass-by-const-ref
// with a pass-in-reg on the 360; to avoid this confusion, you can
// explicitly use a FLTX4 as the parameter type.
#ifdef _X360
typedef __vector4 FLTX4;
#else
typedef const fltx4 &FLTX4;
#endif

// A 16-byte aligned int32 datastructure
// (for use when writing out fltx4's as SIGNED
// ints).
struct ALIGN16 intx4 {
    int32 m_i32[4];

    inline int &operator[](int which) { return m_i32[which]; }

    inline const int &operator[](int which) const { return m_i32[which]; }

    inline int32 *Base() { return m_i32; }

    inline const int32 *Base() const { return m_i32; }

    inline const bool operator==(const intx4 &other) const {
        return m_i32[0] == other.m_i32[0] && m_i32[1] == other.m_i32[1] &&
               m_i32[2] == other.m_i32[2] && m_i32[3] == other.m_i32[3];
    }
} ALIGN16_POST;

#if defined(_DEBUG) && defined(_X360)
FORCEINLINE void TestVPUFlags() {
    // Check that the VPU is in the appropriate (Java-compliant) mode (see 3.2.1
    // in altivec_pem.pdf on xds.xbox.com)
    __vector4 a;
    __asm
    {
		mfvscr	a;
    }
    unsigned int *flags = (unsigned int *)&a;
    unsigned int controlWord = flags[3];
    Assert(controlWord == 0);
}
#else   // _DEBUG
FORCEINLINE void TestVPUFlags() {}
#endif  // _DEBUG

// useful constants in SIMD packed float format:
// (note: some of these aren't stored on the 360,
// but are manufactured directly in one or two
// instructions, saving a load and possible L2
// miss.)
#ifndef _X360
extern const fltx4 Four_Zeros;       // 0 0 0 0
extern const fltx4 Four_Ones;        // 1 1 1 1
extern const fltx4 Four_Twos;        // 2 2 2 2
extern const fltx4 Four_Threes;      // 3 3 3 3
extern const fltx4 Four_Fours;       // guess.
extern const fltx4 Four_Point225s;   // .225 .225 .225 .225
extern const fltx4 Four_PointFives;  // .5 .5 .5 .5
extern const fltx4
    Four_Epsilons;  // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON
extern const fltx4 Four_2ToThe21s;  // (1<<21)..
extern const fltx4 Four_2ToThe22s;  // (1<<22)..
extern const fltx4 Four_2ToThe23s;  // (1<<23)..
extern const fltx4 Four_2ToThe24s;  // (1<<24)..
extern const fltx4 Four_Origin;  // 0 0 0 1 (origin point, like vr0 on the PS2)
extern const fltx4 Four_NegativeOnes;  // -1 -1 -1 -1
#else
#define Four_Zeros XMVectorZero()     // 0 0 0 0
#define Four_Ones XMVectorSplatOne()  // 1 1 1 1
extern const fltx4 Four_Twos;        // 2 2 2 2
extern const fltx4 Four_Threes;      // 3 3 3 3
extern const fltx4 Four_Fours;       // guess.
extern const fltx4 Four_Point225s;   // .225 .225 .225 .225
extern const fltx4 Four_PointFives;  // .5 .5 .5 .5
extern const fltx4
    Four_Epsilons;  // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON
extern const fltx4 Four_2ToThe21s;  // (1<<21)..
extern const fltx4 Four_2ToThe22s;  // (1<<22)..
extern const fltx4 Four_2ToThe23s;  // (1<<23)..
extern const fltx4 Four_2ToThe24s;  // (1<<24)..
extern const fltx4 Four_Origin;  // 0 0 0 1 (origin point, like vr0 on the PS2)
extern const fltx4 Four_NegativeOnes;  // -1 -1 -1 -1
#endif
extern const fltx4 Four_FLT_MAX;  // FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX
extern const fltx4
    Four_Negative_FLT_MAX;       // -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX
extern const fltx4 g_SIMD_0123;  // 0 1 2 3 as float

// external aligned integer constants
extern const ALIGN16 int32
    g_SIMD_clear_signmask[] ALIGN16_POST;                      // 0x7fffffff x 4
extern const ALIGN16 int32 g_SIMD_signmask[] ALIGN16_POST;     // 0x80000000 x 4
extern const ALIGN16 int32 g_SIMD_lsbmask[] ALIGN16_POST;      // 0xfffffffe x 4
extern const ALIGN16 int32 g_SIMD_clear_wmask[] ALIGN16_POST;  // -1 -1 -1 0
extern const ALIGN16 int32 g_SIMD_ComponentMask
    [4][4] ALIGN16_POST;  // [0xFFFFFFFF 0 0 0], [0 0xFFFFFFFF 0 0], [0 0
                          // 0xFFFFFFFF 0], [0 0 0 0xFFFFFFFF]
extern const ALIGN16 int32 g_SIMD_AllOnesMask[] ALIGN16_POST;    // ~0,~0,~0,~0
extern const ALIGN16 int32 g_SIMD_Low16BitsMask[] ALIGN16_POST;  // 0xffff x 4

// this mask is used for skipping the tail of things. If you have N elements in
// an array, and wish to mask out the tail, g_SIMD_SkipTailMask[N & 3] what you
// want to use for the last iteration.
extern const int32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST;

// Define prefetch macros.
// The characteristics of cache and prefetch are completely
// different between the different platforms, so you DO NOT
// want to just define one macro that maps to every platform
// intrinsic under the hood -- you need to prefetch at different
// intervals between x86 and PPC, for example, and that is
// a higher level code change.
// On the other hand, I'm tired of typing #ifdef _X360
// all over the place, so this is just a nop on Intel, PS3.
#ifdef _X360
#define PREFETCH360(address, offset) __dcbt(offset, address)
#else
#define PREFETCH360(x, y)  // nothing
#endif

#if USE_STDC_FOR_SIMD

//---------------------------------------------------------------------
// Standard C (fallback/Linux) implementation (only there for compat - slow)
//---------------------------------------------------------------------

FORCEINLINE float SubFloat(const fltx4 &a, int idx) { return a.m128_f32[idx]; }

FORCEINLINE float &SubFloat(fltx4 &a, int idx) { return a.m128_f32[idx]; }

FORCEINLINE uint32 SubInt(const fltx4 &a, int idx) { return a.m128_u32[idx]; }

FORCEINLINE uint32 &SubInt(fltx4 &a, int idx) { return a.m128_u32[idx]; }

// Return one in the fastest way -- on the x360, faster even than loading.
FORCEINLINE fltx4 LoadZeroSIMD(void) { return Four_Zeros; }

// Return one in the fastest way -- on the x360, faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD(void) { return Four_Ones; }

FORCEINLINE fltx4 SplatXSIMD(const fltx4 &a) {
    fltx4 retVal;
    SubFloat(retVal, 0) = SubFloat(a, 0);
    SubFloat(retVal, 1) = SubFloat(a, 0);
    SubFloat(retVal, 2) = SubFloat(a, 0);
    SubFloat(retVal, 3) = SubFloat(a, 0);
    return retVal;
}

FORCEINLINE fltx4 SplatYSIMD(fltx4 a) {
    fltx4 retVal;
    SubFloat(retVal, 0) = SubFloat(a, 1);
    SubFloat(retVal, 1) = SubFloat(a, 1);
    SubFloat(retVal, 2) = SubFloat(a, 1);
    SubFloat(retVal, 3) = SubFloat(a, 1);
    return retVal;
}

FORCEINLINE fltx4 SplatZSIMD(fltx4 a) {
    fltx4 retVal;
    SubFloat(retVal, 0) = SubFloat(a, 2);
    SubFloat(retVal, 1) = SubFloat(a, 2);
    SubFloat(retVal, 2) = SubFloat(a, 2);
    SubFloat(retVal, 3) = SubFloat(a, 2);
    return retVal;
}

FORCEINLINE fltx4 SplatWSIMD(fltx4 a) {
    fltx4 retVal;
    SubFloat(retVal, 0) = SubFloat(a, 3);
    SubFloat(retVal, 1) = SubFloat(a, 3);
    SubFloat(retVal, 2) = SubFloat(a, 3);
    SubFloat(retVal, 3) = SubFloat(a, 3);
    return retVal;
}

FORCEINLINE fltx4 SetXSIMD(const fltx4 &a, const fltx4 &x) {
    fltx4 result = a;
    SubFloat(result, 0) = SubFloat(x, 0);
    return result;
}

FORCEINLINE fltx4 SetYSIMD(const fltx4 &a, const fltx4 &y) {
    fltx4 result = a;
    SubFloat(result, 1) = SubFloat(y, 1);
    return result;
}

FORCEINLINE fltx4 SetZSIMD(const fltx4 &a, const fltx4 &z) {
    fltx4 result = a;
    SubFloat(result, 2) = SubFloat(z, 2);
    return result;
}

FORCEINLINE fltx4 SetWSIMD(const fltx4 &a, const fltx4 &w) {
    fltx4 result = a;
    SubFloat(result, 3) = SubFloat(w, 3);
    return result;
}

FORCEINLINE fltx4 SetComponentSIMD(const fltx4 &a, int nComponent,
                                   float flValue) {
    fltx4 result = a;
    SubFloat(result, nComponent) = flValue;
    return result;
}

// a b c d -> b c d a
FORCEINLINE fltx4 RotateLeft(const fltx4 &a) {
    fltx4 retVal;
    SubFloat(retVal, 0) = SubFloat(a, 1);
    SubFloat(retVal, 1) = SubFloat(a, 2);
    SubFloat(retVal, 2) = SubFloat(a, 3);
    SubFloat(retVal, 3) = SubFloat(a, 0);
    return retVal;
}

// a b c d -> c d a b
FORCEINLINE fltx4 RotateLeft2(const fltx4 &a) {
    fltx4 retVal;
    SubFloat(retVal, 0) = SubFloat(a, 2);
    SubFloat(retVal, 1) = SubFloat(a, 3);
    SubFloat(retVal, 2) = SubFloat(a, 0);
    SubFloat(retVal, 3) = SubFloat(a, 1);
    return retVal;
}

#define BINOP(op)                                             \
    fltx4 retVal;                                             \
    SubFloat(retVal, 0) = (SubFloat(a, 0) op SubFloat(b, 0)); \
    SubFloat(retVal, 1) = (SubFloat(a, 1) op SubFloat(b, 1)); \
    SubFloat(retVal, 2) = (SubFloat(a, 2) op SubFloat(b, 2)); \
    SubFloat(retVal, 3) = (SubFloat(a, 3) op SubFloat(b, 3)); \
    return retVal;

#define IBINOP(op)                                      \
    fltx4 retVal;                                       \
    SubInt(retVal, 0) = (SubInt(a, 0) op SubInt(b, 0)); \
    SubInt(retVal, 1) = (SubInt(a, 1) op SubInt(b, 1)); \
    SubInt(retVal, 2) = (SubInt(a, 2) op SubInt(b, 2)); \
    SubInt(retVal, 3) = (SubInt(a, 3) op SubInt(b, 3)); \
    return retVal;

FORCEINLINE fltx4 AddSIMD(const fltx4 &a, const fltx4 &b) { BINOP(+); }

FORCEINLINE fltx4 SubSIMD(const fltx4 &a, const fltx4 &b)  // a-b
{
    BINOP(-);
};

FORCEINLINE fltx4 MulSIMD(const fltx4 &a, const fltx4 &b)  // a*b
{
    BINOP(*);
}

FORCEINLINE fltx4 DivSIMD(const fltx4 &a, const fltx4 &b)  // a/b
{
    BINOP(/);
}

FORCEINLINE fltx4 MaddSIMD(const fltx4 &a, const fltx4 &b,
                           const fltx4 &c)  // a*b + c
{
    return AddSIMD(MulSIMD(a, b), c);
}

FORCEINLINE fltx4 MsubSIMD(const fltx4 &a, const fltx4 &b,
                           const fltx4 &c)  // c - a*b
{
    return SubSIMD(c, MulSIMD(a, b));
};

FORCEINLINE fltx4 SinSIMD(const fltx4 &radians) {
    fltx4 result;
    SubFloat(result, 0) = sin(SubFloat(radians, 0));
    SubFloat(result, 1) = sin(SubFloat(radians, 1));
    SubFloat(result, 2) = sin(SubFloat(radians, 2));
    SubFloat(result, 3) = sin(SubFloat(radians, 3));
    return result;
}

FORCEINLINE void SinCos3SIMD(fltx4 &sine, fltx4 &cosine, const fltx4 &radians) {
    SinCos(SubFloat(radians, 0), &SubFloat(sine, 0), &SubFloat(cosine, 0));
    SinCos(SubFloat(radians, 1), &SubFloat(sine, 1), &SubFloat(cosine, 1));
    SinCos(SubFloat(radians, 2), &SubFloat(sine, 2), &SubFloat(cosine, 2));
}

FORCEINLINE void SinCosSIMD(fltx4 &sine, fltx4 &cosine, const fltx4 &radians) {
    SinCos(SubFloat(radians, 0), &SubFloat(sine, 0), &SubFloat(cosine, 0));
    SinCos(SubFloat(radians, 1), &SubFloat(sine, 1), &SubFloat(cosine, 1));
    SinCos(SubFloat(radians, 2), &SubFloat(sine, 2), &SubFloat(cosine, 2));
    SinCos(SubFloat(radians, 3), &SubFloat(sine, 3), &SubFloat(cosine, 3));
}

FORCEINLINE fltx4 ArcSinSIMD(const fltx4 &sine) {
    fltx4 result;
    SubFloat(result, 0) = asin(SubFloat(sine, 0));
    SubFloat(result, 1) = asin(SubFloat(sine, 1));
    SubFloat(result, 2) = asin(SubFloat(sine, 2));
    SubFloat(result, 3) = asin(SubFloat(sine, 3));
    return result;
}

FORCEINLINE fltx4 ArcCosSIMD(const fltx4 &cs) {
    fltx4 result;
    SubFloat(result, 0) = acos(SubFloat(cs, 0));
    SubFloat(result, 1) = acos(SubFloat(cs, 1));
    SubFloat(result, 2) = acos(SubFloat(cs, 2));
    SubFloat(result, 3) = acos(SubFloat(cs, 3));
    return result;
}

// tan^1(a/b) .. ie, pass sin in as a and cos in as b
FORCEINLINE fltx4 ArcTan2SIMD(const fltx4 &a, const fltx4 &b) {
    fltx4 result;
    SubFloat(result, 0) = atan2(SubFloat(a, 0), SubFloat(b, 0));
    SubFloat(result, 1) = atan2(SubFloat(a, 1), SubFloat(b, 1));
    SubFloat(result, 2) = atan2(SubFloat(a, 2), SubFloat(b, 2));
    SubFloat(result, 3) = atan2(SubFloat(a, 3), SubFloat(b, 3));
    return result;
}

FORCEINLINE fltx4 MaxSIMD(const fltx4 &a, const fltx4 &b)  // max(a,b)
{
    fltx4 retVal;
    SubFloat(retVal, 0) = max(SubFloat(a, 0), SubFloat(b, 0));
    SubFloat(retVal, 1) = max(SubFloat(a, 1), SubFloat(b, 1));
    SubFloat(retVal, 2) = max(SubFloat(a, 2), SubFloat(b, 2));
    SubFloat(retVal, 3) = max(SubFloat(a, 3), SubFloat(b, 3));
    return retVal;
}

FORCEINLINE fltx4 MinSIMD(const fltx4 &a, const fltx4 &b)  // min(a,b)
{
    fltx4 retVal;
    SubFloat(retVal, 0) = min(SubFloat(a, 0), SubFloat(b, 0));
    SubFloat(retVal, 1) = min(SubFloat(a, 1), SubFloat(b, 1));
    SubFloat(retVal, 2) = min(SubFloat(a, 2), SubFloat(b, 2));
    SubFloat(retVal, 3) = min(SubFloat(a, 3), SubFloat(b, 3));
    return retVal;
}

FORCEINLINE fltx4 AndSIMD(const fltx4 &a, const fltx4 &b)  // a & b
{
    IBINOP(&);
}

FORCEINLINE fltx4 AndNotSIMD(const fltx4 &a, const fltx4 &b)  // ~a & b
{
    fltx4 retVal;
    SubInt(retVal, 0) = ~SubInt(a, 0) & SubInt(b, 0);
    SubInt(retVal, 1) = ~SubInt(a, 1) & SubInt(b, 1);
    SubInt(retVal, 2) = ~SubInt(a, 2) & SubInt(b, 2);
    SubInt(retVal, 3) = ~SubInt(a, 3) & SubInt(b, 3);
    return retVal;
}

FORCEINLINE fltx4 XorSIMD(const fltx4 &a, const fltx4 &b)  // a ^ b
{
    IBINOP(^);
}

FORCEINLINE fltx4 OrSIMD(const fltx4 &a, const fltx4 &b)  // a | b
{
    IBINOP(|);
}

FORCEINLINE fltx4 NegSIMD(const fltx4 &a)  // negate: -a
{
    fltx4 retval;
    SubFloat(retval, 0) = -SubFloat(a, 0);
    SubFloat(retval, 1) = -SubFloat(a, 1);
    SubFloat(retval, 2) = -SubFloat(a, 2);
    SubFloat(retval, 3) = -SubFloat(a, 3);

    return retval;
}

FORCEINLINE bool IsAllZeros(const fltx4 &a)  // all floats of a zero?
{
    return (SubFloat(a, 0) == 0.0) && (SubFloat(a, 1) == 0.0) &&
           (SubFloat(a, 2) == 0.0) && (SubFloat(a, 3) == 0.0);
}

// for branching when a.xyzw > b.xyzw
FORCEINLINE bool IsAllGreaterThan(const fltx4 &a, const fltx4 &b) {
    return SubFloat(a, 0) > SubFloat(b, 0) && SubFloat(a, 1) > SubFloat(b, 1) &&
           SubFloat(a, 2) > SubFloat(b, 2) && SubFloat(a, 3) > SubFloat(b, 3);
}

// for branching when a.xyzw >= b.xyzw
FORCEINLINE bool IsAllGreaterThanOrEq(const fltx4 &a, const fltx4 &b) {
    return SubFloat(a, 0) >= SubFloat(b, 0) &&
           SubFloat(a, 1) >= SubFloat(b, 1) &&
           SubFloat(a, 2) >= SubFloat(b, 2) && SubFloat(a, 3) >= SubFloat(b, 3);
}

// For branching if all a.xyzw == b.xyzw
FORCEINLINE bool IsAllEqual(const fltx4 &a, const fltx4 &b) {
    return SubFloat(a, 0) == SubFloat(b, 0) &&
           SubFloat(a, 1) == SubFloat(b, 1) &&
           SubFloat(a, 2) == SubFloat(b, 2) && SubFloat(a, 3) == SubFloat(b, 3);
}

FORCEINLINE int TestSignSIMD(
    const fltx4 &a)  // mask of which floats have the high bit set
{
    int nRet = 0;

    nRet |= (SubInt(a, 0) & 0x80000000) >> 31;  // sign(x) -> bit 0
    nRet |= (SubInt(a, 1) & 0x80000000) >> 30;  // sign(y) -> bit 1
    nRet |= (SubInt(a, 2) & 0x80000000) >> 29;  // sign(z) -> bit 2
    nRet |= (SubInt(a, 3) & 0x80000000) >> 28;  // sign(w) -> bit 3

    return nRet;
}

FORCEINLINE bool IsAnyNegative(
    const fltx4 &a)  // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
    return (0 != TestSignSIMD(a));
}

FORCEINLINE fltx4 CmpEqSIMD(const fltx4 &a, const fltx4 &b)  // (a==b) ? ~0:0
{
    fltx4 retVal;
    SubInt(retVal, 0) = (SubFloat(a, 0) == SubFloat(b, 0)) ? ~0 : 0;
    SubInt(retVal, 1) = (SubFloat(a, 1) == SubFloat(b, 1)) ? ~0 : 0;
    SubInt(retVal, 2) = (SubFloat(a, 2) == SubFloat(b, 2)) ? ~0 : 0;
    SubInt(retVal, 3) = (SubFloat(a, 3) == SubFloat(b, 3)) ? ~0 : 0;
    return retVal;
}

FORCEINLINE fltx4 CmpGtSIMD(const fltx4 &a, const fltx4 &b)  // (a>b) ? ~0:0
{
    fltx4 retVal;
    SubInt(retVal, 0) = (SubFloat(a, 0) > SubFloat(b, 0)) ? ~0 : 0;
    SubInt(retVal, 1) = (SubFloat(a, 1) > SubFloat(b, 1)) ? ~0 : 0;
    SubInt(retVal, 2) = (SubFloat(a, 2) > SubFloat(b, 2)) ? ~0 : 0;
    SubInt(retVal, 3) = (SubFloat(a, 3) > SubFloat(b, 3)) ? ~0 : 0;
    return retVal;
}

FORCEINLINE fltx4 CmpGeSIMD(const fltx4 &a, const fltx4 &b)  // (a>=b) ? ~0:0
{
    fltx4 retVal;
    SubInt(retVal, 0) = (SubFloat(a, 0) >= SubFloat(b, 0)) ? ~0 : 0;
    SubInt(retVal, 1) = (SubFloat(a, 1) >= SubFloat(b, 1)) ? ~0 : 0;
    SubInt(retVal, 2) = (SubFloat(a, 2) >= SubFloat(b, 2)) ? ~0 : 0;
    SubInt(retVal, 3) = (SubFloat(a, 3) >= SubFloat(b, 3)) ? ~0 : 0;
    return retVal;
}

FORCEINLINE fltx4 CmpLtSIMD(const fltx4 &a, const fltx4 &b)  // (a<b) ? ~0:0
{
    fltx4 retVal;
    SubInt(retVal, 0) = (SubFloat(a, 0) < SubFloat(b, 0)) ? ~0 : 0;
    SubInt(retVal, 1) = (SubFloat(a, 1) < SubFloat(b, 1)) ? ~0 : 0;
    SubInt(retVal, 2) = (SubFloat(a, 2) < SubFloat(b, 2)) ? ~0 : 0;
    SubInt(retVal, 3) = (SubFloat(a, 3) < SubFloat(b, 3)) ? ~0 : 0;
    return retVal;
}

FORCEINLINE fltx4 CmpLeSIMD(const fltx4 &a, const fltx4 &b)  // (a<=b) ? ~0:0
{
    fltx4 retVal;
    SubInt(retVal, 0) = (SubFloat(a, 0) <= SubFloat(b, 0)) ? ~0 : 0;
    SubInt(retVal, 1) = (SubFloat(a, 1) <= SubFloat(b, 1)) ? ~0 : 0;
    SubInt(retVal, 2) = (SubFloat(a, 2) <= SubFloat(b, 2)) ? ~0 : 0;
    SubInt(retVal, 3) = (SubFloat(a, 3) <= SubFloat(b, 3)) ? ~0 : 0;
    return retVal;
}

FORCEINLINE fltx4
CmpInBoundsSIMD(const fltx4 &a, const fltx4 &b)  // (a <= b && a >= -b) ? ~0 : 0
{
    fltx4 retVal;
    SubInt(retVal, 0) =
        (SubFloat(a, 0) <= SubFloat(b, 0) && SubFloat(a, 0) >= -SubFloat(b, 0))
            ? ~0
            : 0;
    SubInt(retVal, 1) =
        (SubFloat(a, 1) <= SubFloat(b, 1) && SubFloat(a, 1) >= -SubFloat(b, 1))
            ? ~0
            : 0;
    SubInt(retVal, 2) =
        (SubFloat(a, 2) <= SubFloat(b, 2) && SubFloat(a, 2) >= -SubFloat(b, 2))
            ? ~0
            : 0;
    SubInt(retVal, 3) =
        (SubFloat(a, 3) <= SubFloat(b, 3) && SubFloat(a, 3) >= -SubFloat(b, 3))
            ? ~0
            : 0;
    return retVal;
}

FORCEINLINE fltx4 MaskedAssign(const fltx4 &ReplacementMask,
                               const fltx4 &NewValue, const fltx4 &OldValue) {
    return OrSIMD(AndSIMD(ReplacementMask, NewValue),
                  AndNotSIMD(ReplacementMask, OldValue));
}

FORCEINLINE fltx4 ReplicateX4(float flValue)  //  a,a,a,a
{
    fltx4 retVal;
    SubFloat(retVal, 0) = flValue;
    SubFloat(retVal, 1) = flValue;
    SubFloat(retVal, 2) = flValue;
    SubFloat(retVal, 3) = flValue;
    return retVal;
}

/// replicate a single 32 bit integer value to all 4 components of an m128
FORCEINLINE fltx4 ReplicateIX4(int nValue) {
    fltx4 retVal;
    SubInt(retVal, 0) = nValue;
    SubInt(retVal, 1) = nValue;
    SubInt(retVal, 2) = nValue;
    SubInt(retVal, 3) = nValue;
    return retVal;
}

// Round towards positive infinity
FORCEINLINE fltx4 CeilSIMD(const fltx4 &a) {
    fltx4 retVal;
    SubFloat(retVal, 0) = ceil(SubFloat(a, 0));
    SubFloat(retVal, 1) = ceil(SubFloat(a, 1));
    SubFloat(retVal, 2) = ceil(SubFloat(a, 2));
    SubFloat(retVal, 3) = ceil(SubFloat(a, 3));
    return retVal;
}

// Round towards negative infinity
FORCEINLINE fltx4 FloorSIMD(const fltx4 &a) {
    fltx4 retVal;
    SubFloat(retVal, 0) = floor(SubFloat(a, 0));
    SubFloat(retVal, 1) = floor(SubFloat(a, 1));
    SubFloat(retVal, 2) = floor(SubFloat(a, 2));
    SubFloat(retVal, 3) = floor(SubFloat(a, 3));
    return retVal;
}

FORCEINLINE fltx4 SqrtEstSIMD(const fltx4 &a)  // sqrt(a), more or less
{
    fltx4 retVal;
    SubFloat(retVal, 0) = sqrt(SubFloat(a, 0));
    SubFloat(retVal, 1) = sqrt(SubFloat(a, 1));
    SubFloat(retVal, 2) = sqrt(SubFloat(a, 2));
    SubFloat(retVal, 3) = sqrt(SubFloat(a, 3));
    return retVal;
}

FORCEINLINE fltx4 SqrtSIMD(const fltx4 &a)  // sqrt(a)
{
    fltx4 retVal;
    SubFloat(retVal, 0) = sqrt(SubFloat(a, 0));
    SubFloat(retVal, 1) = sqrt(SubFloat(a, 1));
    SubFloat(retVal, 2) = sqrt(SubFloat(a, 2));
    SubFloat(retVal, 3) = sqrt(SubFloat(a, 3));
    return retVal;
}

FORCEINLINE fltx4
ReciprocalSqrtEstSIMD(const fltx4 &a)  // 1/sqrt(a), more or less
{
    fltx4 retVal;
    SubFloat(retVal, 0) = 1.0 / sqrt(SubFloat(a, 0));
    SubFloat(retVal, 1) = 1.0 / sqrt(SubFloat(a, 1));
    SubFloat(retVal, 2) = 1.0 / sqrt(SubFloat(a, 2));
    SubFloat(retVal, 3) = 1.0 / sqrt(SubFloat(a, 3));
    return retVal;
}

FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD(const fltx4 &a) {
    fltx4 retVal;
    SubFloat(retVal, 0) =
        1.0 / sqrt(SubFloat(a, 0) != 0.0f ? SubFloat(a, 0) : FLT_EPSILON);
    SubFloat(retVal, 1) =
        1.0 / sqrt(SubFloat(a, 1) != 0.0f ? SubFloat(a, 1) : FLT_EPSILON);
    SubFloat(retVal, 2) =
        1.0 / sqrt(SubFloat(a, 2) != 0.0f ? SubFloat(a, 2) : FLT_EPSILON);
    SubFloat(retVal, 3) =
        1.0 / sqrt(SubFloat(a, 3) != 0.0f ? SubFloat(a, 3) : FLT_EPSILON);
    return retVal;
}

FORCEINLINE fltx4 ReciprocalSqrtSIMD(const fltx4 &a)  // 1/sqrt(a)
{
    fltx4 retVal;
    SubFloat(retVal, 0) = 1.0 / sqrt(SubFloat(a, 0));
    SubFloat(retVal, 1) = 1.0 / sqrt(SubFloat(a, 1));
    SubFloat(retVal, 2) = 1.0 / sqrt(SubFloat(a, 2));
    SubFloat(retVal, 3) = 1.0 / sqrt(SubFloat(a, 3));
    return retVal;
}

FORCEINLINE fltx4 ReciprocalEstSIMD(const fltx4 &a)  // 1/a, more or less
{
    fltx4 retVal;
    SubFloat(retVal, 0) = 1.0 / SubFloat(a, 0);
    SubFloat(retVal, 1) = 1.0 / SubFloat(a, 1);
    SubFloat(retVal, 2) = 1.0 / SubFloat(a, 2);
    SubFloat(retVal, 3) = 1.0 / SubFloat(a, 3);
    return retVal;
}

FORCEINLINE fltx4 ReciprocalSIMD(const fltx4 &a)  // 1/a
{
    fltx4 retVal;
    SubFloat(retVal, 0) = 1.0 / SubFloat(a, 0);
    SubFloat(retVal, 1) = 1.0 / SubFloat(a, 1);
    SubFloat(retVal, 2) = 1.0 / SubFloat(a, 2);
    SubFloat(retVal, 3) = 1.0 / SubFloat(a, 3);
    return retVal;
}

/// 1/x for all 4 values.
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalEstSaturateSIMD(const fltx4 &a) {
    fltx4 retVal;
    SubFloat(retVal, 0) =
        1.0 / (SubFloat(a, 0) == 0.0f ? FLT_EPSILON : SubFloat(a, 0));
    SubFloat(retVal, 1) =
        1.0 / (SubFloat(a, 1) == 0.0f ? FLT_EPSILON : SubFloat(a, 1));
    SubFloat(retVal, 2) =
        1.0 / (SubFloat(a, 2) == 0.0f ? FLT_EPSILON : SubFloat(a, 2));
    SubFloat(retVal, 3) =
        1.0 / (SubFloat(a, 3) == 0.0f ? FLT_EPSILON : SubFloat(a, 3));
    return retVal;
}

FORCEINLINE fltx4 ReciprocalSaturateSIMD(const fltx4 &a) {
    fltx4 retVal;
    SubFloat(retVal, 0) =
        1.0 / (SubFloat(a, 0) == 0.0f ? FLT_EPSILON : SubFloat(a, 0));
    SubFloat(retVal, 1) =
        1.0 / (SubFloat(a, 1) == 0.0f ? FLT_EPSILON : SubFloat(a, 1));
    SubFloat(retVal, 2) =
        1.0 / (SubFloat(a, 2) == 0.0f ? FLT_EPSILON : SubFloat(a, 2));
    SubFloat(retVal, 3) =
        1.0 / (SubFloat(a, 3) == 0.0f ? FLT_EPSILON : SubFloat(a, 3));
    return retVal;
}

// 2^x for all values (the antilog)
FORCEINLINE fltx4 ExpSIMD(const fltx4 &toPower) {
    fltx4 retVal;
    SubFloat(retVal, 0) = powf(2, SubFloat(toPower, 0));
    SubFloat(retVal, 1) = powf(2, SubFloat(toPower, 1));
    SubFloat(retVal, 2) = powf(2, SubFloat(toPower, 2));
    SubFloat(retVal, 3) = powf(2, SubFloat(toPower, 3));

    return retVal;
}

FORCEINLINE fltx4 Dot3SIMD(const fltx4 &a, const fltx4 &b) {
    float flDot = SubFloat(a, 0) * SubFloat(b, 0) +
                  SubFloat(a, 1) * SubFloat(b, 1) +
                  SubFloat(a, 2) * SubFloat(b, 2);
    return ReplicateX4(flDot);
}

FORCEINLINE fltx4 Dot4SIMD(const fltx4 &a, const fltx4 &b) {
    float flDot =
        SubFloat(a, 0) * SubFloat(b, 0) + SubFloat(a, 1) * SubFloat(b, 1) +
        SubFloat(a, 2) * SubFloat(b, 2) + SubFloat(a, 3) * SubFloat(b, 3);
    return ReplicateX4(flDot);
}

// Clamps the components of a vector to a specified minimum and maximum range.
FORCEINLINE fltx4 ClampVectorSIMD(FLTX4 in, FLTX4 min, FLTX4 max) {
    return MaxSIMD(min, MinSIMD(max, in));
}

// Squelch the w component of a vector to +0.0.
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
FORCEINLINE fltx4 SetWToZeroSIMD(const fltx4 &a) {
    fltx4 retval;
    retval = a;
    SubFloat(retval, 0) = 0;
    return retval;
}

FORCEINLINE fltx4 LoadUnalignedSIMD(const void *pSIMD) {
    return *(reinterpret_cast<const fltx4 *>(pSIMD));
}

FORCEINLINE fltx4 LoadUnaligned3SIMD(const void *pSIMD) {
    return *(reinterpret_cast<const fltx4 *>(pSIMD));
}

FORCEINLINE fltx4 LoadAlignedSIMD(const void *pSIMD) {
    return *(reinterpret_cast<const fltx4 *>(pSIMD));
}

// for the transitional class -- load a 3-by VectorAligned and squash its w
// component
FORCEINLINE fltx4 LoadAlignedSIMD(const VectorAligned &pSIMD) {
    fltx4 retval = LoadAlignedSIMD(pSIMD.Base());
    // squelch w
    SubInt(retval, 3) = 0;
    return retval;
}

FORCEINLINE void StoreAlignedSIMD(float *pSIMD, const fltx4 &a) {
    *(reinterpret_cast<fltx4 *>(pSIMD)) = a;
}

FORCEINLINE void StoreUnalignedSIMD(float *pSIMD, const fltx4 &a) {
    *(reinterpret_cast<fltx4 *>(pSIMD)) = a;
}

FORCEINLINE void StoreUnaligned3SIMD(float *pSIMD, const fltx4 &a) {
    *pSIMD = SubFloat(a, 0);
    *(pSIMD + 1) = SubFloat(a, 1);
    *(pSIMD + 2) = SubFloat(a, 2);
}

// strongly typed -- syntactic castor oil used for typechecking as we transition
// to SIMD
FORCEINLINE void StoreAligned3SIMD(VectorAligned *RESTRICT pSIMD,
                                   const fltx4 &a) {
    StoreAlignedSIMD(pSIMD->Base(), a);
}

FORCEINLINE void TransposeSIMD(fltx4 &x, fltx4 &y, fltx4 &z, fltx4 &w) {
#define SWAP_FLOATS(_a_, _ia_, _b_, _ib_)          \
    {                                              \
        float tmp = SubFloat(_a_, _ia_);           \
        SubFloat(_a_, _ia_) = SubFloat(_b_, _ib_); \
        SubFloat(_b_, _ib_) = tmp;                 \
    }
    SWAP_FLOATS(x, 1, y, 0);
    SWAP_FLOATS(x, 2, z, 0);
    SWAP_FLOATS(x, 3, w, 0);
    SWAP_FLOATS(y, 2, z, 1);
    SWAP_FLOATS(y, 3, w, 1);
    SWAP_FLOATS(z, 3, w, 2);
}

// find the lowest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
FORCEINLINE fltx4 FindLowestSIMD3(const fltx4 &a) {
    float lowest = min(min(SubFloat(a, 0), SubFloat(a, 1)), SubFloat(a, 2));
    return ReplicateX4(lowest);
}

// find the highest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
FORCEINLINE fltx4 FindHighestSIMD3(const fltx4 &a) {
    float highest = max(max(SubFloat(a, 0), SubFloat(a, 1)), SubFloat(a, 2));
    return ReplicateX4(highest);
}

// Fixed-point conversion and save as SIGNED INTS.
// pDest->x = Int (vSrc.x)
// note: some architectures have means of doing
// fixed point conversion when the fix depth is
// specified as an immediate.. but there is no way
// to guarantee an immediate as a parameter to function
// like this.
FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 *RESTRICT pDest,
                                        const fltx4 &vSrc) {
    (*pDest)[0] = SubFloat(vSrc, 0);
    (*pDest)[1] = SubFloat(vSrc, 1);
    (*pDest)[2] = SubFloat(vSrc, 2);
    (*pDest)[3] = SubFloat(vSrc, 3);
}

// ------------------------------------
// INTEGER SIMD OPERATIONS.
// ------------------------------------
// splat all components of a vector to a signed immediate int number.
FORCEINLINE fltx4 IntSetImmediateSIMD(int nValue) {
    fltx4 retval;
    SubInt(retval, 0) = SubInt(retval, 1) = SubInt(retval, 2) =
        SubInt(retval, 3) = nValue;
    return retval;
}

// Load 4 aligned words into a SIMD register
FORCEINLINE i32x4 LoadAlignedIntSIMD(const void *RESTRICT pSIMD) {
    return *(reinterpret_cast<const i32x4 *>(pSIMD));
}

// Load 4 unaligned words into a SIMD register
FORCEINLINE i32x4 LoadUnalignedIntSIMD(const void *RESTRICT pSIMD) {
    return *(reinterpret_cast<const i32x4 *>(pSIMD));
}

// save into four words, 16-byte aligned
FORCEINLINE void StoreAlignedIntSIMD(int32 *pSIMD, const fltx4 &a) {
    *(reinterpret_cast<i32x4 *>(pSIMD)) = a;
}

FORCEINLINE void StoreAlignedIntSIMD(intx4 &pSIMD, const fltx4 &a) {
    *(reinterpret_cast<i32x4 *>(pSIMD.Base())) = a;
}

FORCEINLINE void StoreUnalignedIntSIMD(int32 *pSIMD, const fltx4 &a) {
    *(reinterpret_cast<i32x4 *>(pSIMD)) = a;
}

// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD(const u32x4 &vSrcA) {
    Assert(0); /* pc has no such operation */
    fltx4 retval;
    SubFloat(retval, 0) = ((float)SubInt(retval, 0));
    SubFloat(retval, 1) = ((float)SubInt(retval, 1));
    SubFloat(retval, 2) = ((float)SubInt(retval, 2));
    SubFloat(retval, 3) = ((float)SubInt(retval, 3));
    return retval;
}

#if 0 /* pc has no such op */
// Take a fltx4 containing fixed-point sints and 
// return them as single precision floats. No 
// fixed point conversion is done.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
{
	fltx4 retval;
	SubFloat( retval, 0 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[0])) );
	SubFloat( retval, 1 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[1])) );
	SubFloat( retval, 2 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[2])) );
	SubFloat( retval, 3 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[3])) );
	return retval;
}


/*
  works on fltx4's as if they are four uints.
  the first parameter contains the words to be shifted,
  the second contains the amount to shift by AS INTS

  for i = 0 to 3
  shift = vSrcB_i*32:(i*32)+4
  vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
*/
FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB)
{
	i32x4 retval;
	SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0);
	SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1);
	SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2);
	SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3);


	return retval;
}
#endif

#elif (defined(_X360))

//---------------------------------------------------------------------
// X360 implementation
//---------------------------------------------------------------------

FORCEINLINE float &FloatSIMD(fltx4 &a, int idx) {
    fltx4_union &a_union = (fltx4_union &)a;
    return a_union.m128_f32[idx];
}

FORCEINLINE unsigned int &UIntSIMD(fltx4 &a, int idx) {
    fltx4_union &a_union = (fltx4_union &)a;
    return a_union.m128_u32[idx];
}

FORCEINLINE fltx4 AddSIMD(const fltx4 &a, const fltx4 &b) {
    return __vaddfp(a, b);
}

FORCEINLINE fltx4 SubSIMD(const fltx4 &a, const fltx4 &b)  // a-b
{
    return __vsubfp(a, b);
}

FORCEINLINE fltx4 MulSIMD(const fltx4 &a, const fltx4 &b)  // a*b
{
    return __vmulfp(a, b);
}

FORCEINLINE fltx4 MaddSIMD(const fltx4 &a, const fltx4 &b,
                           const fltx4 &c)  // a*b + c
{
    return __vmaddfp(a, b, c);
}

FORCEINLINE fltx4 MsubSIMD(const fltx4 &a, const fltx4 &b,
                           const fltx4 &c)  // c - a*b
{
    return __vnmsubfp(a, b, c);
};

FORCEINLINE fltx4 Dot3SIMD(const fltx4 &a, const fltx4 &b) {
    return __vmsum3fp(a, b);
}

FORCEINLINE fltx4 Dot4SIMD(const fltx4 &a, const fltx4 &b) {
    return __vmsum4fp(a, b);
}

FORCEINLINE fltx4 SinSIMD(const fltx4 &radians) { return XMVectorSin(radians); }

FORCEINLINE void SinCos3SIMD(fltx4 &sine, fltx4 &cosine, const fltx4 &radians) {
    XMVectorSinCos(&sine, &cosine, radians);
}

FORCEINLINE void SinCosSIMD(fltx4 &sine, fltx4 &cosine, const fltx4 &radians) {
    XMVectorSinCos(&sine, &cosine, radians);
}

FORCEINLINE void CosSIMD(fltx4 &cosine, const fltx4 &radians) {
    cosine = XMVectorCos(radians);
}

FORCEINLINE fltx4 ArcSinSIMD(const fltx4 &sine) { return XMVectorASin(sine); }

FORCEINLINE fltx4 ArcCosSIMD(const fltx4 &cs) { return XMVectorACos(cs); }

// tan^1(a/b) .. ie, pass sin in as a and cos in as b
FORCEINLINE fltx4 ArcTan2SIMD(const fltx4 &a, const fltx4 &b) {
    return XMVectorATan2(a, b);
}

// DivSIMD defined further down, since it uses ReciprocalSIMD

FORCEINLINE fltx4 MaxSIMD(const fltx4 &a, const fltx4 &b)  // max(a,b)
{
    return __vmaxfp(a, b);
}

FORCEINLINE fltx4 MinSIMD(const fltx4 &a, const fltx4 &b)  // min(a,b)
{
    return __vminfp(a, b);
}

FORCEINLINE fltx4 AndSIMD(const fltx4 &a, const fltx4 &b)  // a & b
{
    return __vand(a, b);
}

FORCEINLINE fltx4 AndNotSIMD(const fltx4 &a, const fltx4 &b)  // ~a & b
{
    // NOTE: a and b are swapped in the call: SSE complements the first
    // argument, VMX the second
    return __vandc(b, a);
}

FORCEINLINE fltx4 XorSIMD(const fltx4 &a, const fltx4 &b)  // a ^ b
{
    return __vxor(a, b);
}

FORCEINLINE fltx4 OrSIMD(const fltx4 &a, const fltx4 &b)  // a | b
{
    return __vor(a, b);
}

FORCEINLINE fltx4 NegSIMD(const fltx4 &a)  // negate: -a
{
    return XMVectorNegate(a);
}

FORCEINLINE bool IsAllZeros(const fltx4 &a)  // all floats of a zero?
{
    unsigned int equalFlags = 0;
    __vcmpeqfpR(a, Four_Zeros, &equalFlags);
    return XMComparisonAllTrue(equalFlags);
}

FORCEINLINE bool IsAnyZeros(const fltx4 &a)  // any floats are zero?
{
    unsigned int conditionregister;
    XMVectorEqualR(&conditionregister, a, XMVectorZero());
    return XMComparisonAnyTrue(conditionregister);
}

FORCEINLINE bool IsAnyXYZZero(const fltx4 &a)  // are any of x,y,z zero?
{
    // copy a's x component into w, in case w was zero.
    fltx4 temp = __vrlimi(a, a, 1, 1);
    unsigned int conditionregister;
    XMVectorEqualR(&conditionregister, temp, XMVectorZero());
    return XMComparisonAnyTrue(conditionregister);
}

// for branching when a.xyzw > b.xyzw
FORCEINLINE bool IsAllGreaterThan(const fltx4 &a, const fltx4 &b) {
    unsigned int cr;
    XMVectorGreaterR(&cr, a, b);
    return XMComparisonAllTrue(cr);
}

// for branching when a.xyzw >= b.xyzw
FORCEINLINE bool IsAllGreaterThanOrEq(const fltx4 &a, const fltx4 &b) {
    unsigned int cr;
    XMVectorGreaterOrEqualR(&cr, a, b);
    return XMComparisonAllTrue(cr);
}

// For branching if all a.xyzw == b.xyzw
FORCEINLINE bool IsAllEqual(const fltx4 &a, const fltx4 &b) {
    unsigned int cr;
    XMVectorEqualR(&cr, a, b);
    return XMComparisonAllTrue(cr);
}

FORCEINLINE int TestSignSIMD(
    const fltx4 &a)  // mask of which floats have the high bit set
{
    // NOTE: this maps to SSE way better than it does to VMX (most code uses
    // IsAnyNegative(), though)
    int nRet = 0;

    const fltx4_union &a_union = (const fltx4_union &)a;
    nRet |= (a_union.m128_u32[0] & 0x80000000) >> 31;  // sign(x) -> bit 0
    nRet |= (a_union.m128_u32[1] & 0x80000000) >> 30;  // sign(y) -> bit 1
    nRet |= (a_union.m128_u32[2] & 0x80000000) >> 29;  // sign(z) -> bit 2
    nRet |= (a_union.m128_u32[3] & 0x80000000) >> 28;  // sign(w) -> bit 3

    return nRet;
}

// Squelch the w component of a vector to +0.0.
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
FORCEINLINE fltx4 SetWToZeroSIMD(const fltx4 &a) {
    return __vrlimi(a, __vzero(), 1, 0);
}

FORCEINLINE bool IsAnyNegative(
    const fltx4 &a)  // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
    // NOTE: this tests the top bits of each vector element using integer math
    //       (so it ignores NaNs - it will return true for "-NaN")
    unsigned int equalFlags = 0;
    fltx4 signMask =
        __vspltisw(-1);  // 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF (low
                         // order 5 bits of each element = 31)
    signMask = __vslw(signMask,
                      signMask);  // 0x80000000 0x80000000 0x80000000 0x80000000
    __vcmpequwR(Four_Zeros, __vand(signMask, a), &equalFlags);
    return !XMComparisonAllTrue(equalFlags);
}

FORCEINLINE fltx4 CmpEqSIMD(const fltx4 &a, const fltx4 &b)  // (a==b) ? ~0:0
{
    return __vcmpeqfp(a, b);
}

FORCEINLINE fltx4 CmpGtSIMD(const fltx4 &a, const fltx4 &b)  // (a>b) ? ~0:0
{
    return __vcmpgtfp(a, b);
}

FORCEINLINE fltx4 CmpGeSIMD(const fltx4 &a, const fltx4 &b)  // (a>=b) ? ~0:0
{
    return __vcmpgefp(a, b);
}

FORCEINLINE fltx4 CmpLtSIMD(const fltx4 &a, const fltx4 &b)  // (a<b) ? ~0:0
{
    return __vcmpgtfp(b, a);
}

FORCEINLINE fltx4 CmpLeSIMD(const fltx4 &a, const fltx4 &b)  // (a<=b) ? ~0:0
{
    return __vcmpgefp(b, a);
}

FORCEINLINE fltx4
CmpInBoundsSIMD(const fltx4 &a, const fltx4 &b)  // (a <= b && a >= -b) ? ~0 : 0
{
    return XMVectorInBounds(a, b);
}

// returned[i] = ReplacementMask[i] == 0 ? OldValue : NewValue
FORCEINLINE fltx4 MaskedAssign(const fltx4 &ReplacementMask,
                               const fltx4 &NewValue, const fltx4 &OldValue) {
    return __vsel(OldValue, NewValue, ReplacementMask);
}

// AKA "Broadcast", "Splat"
FORCEINLINE fltx4 ReplicateX4(float flValue)  //  a,a,a,a
{
    // NOTE: if flValue comes from a register, this causes a Load-Hit-Store
    // stall (don't mix fpu/vpu math!)
    float *pValue = &flValue;
    Assert(pValue);
    Assert(((unsigned int)pValue & 3) == 0);
    return __vspltw(__lvlx(pValue, 0), 0);
}

FORCEINLINE fltx4 ReplicateX4(const float *pValue)  //  a,a,a,a
{
    Assert(pValue);
    return __vspltw(__lvlx(pValue, 0), 0);
}

/// replicate a single 32 bit integer value to all 4 components of an m128
FORCEINLINE fltx4 ReplicateIX4(int nValue) {
    // NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall
    // (should not mix ints with fltx4s!)
    int *pValue = &nValue;
    Assert(pValue);
    Assert(((unsigned int)pValue & 3) == 0);
    return __vspltw(__lvlx(pValue, 0), 0);
}

// Round towards positive infinity
FORCEINLINE fltx4 CeilSIMD(const fltx4 &a) { return __vrfip(a); }

// Round towards nearest integer
FORCEINLINE fltx4 RoundSIMD(const fltx4 &a) { return __vrfin(a); }

// Round towards negative infinity
FORCEINLINE fltx4 FloorSIMD(const fltx4 &a) { return __vrfim(a); }

FORCEINLINE fltx4 SqrtEstSIMD(const fltx4 &a)  // sqrt(a), more or less
{
    // This is emulated from rsqrt
    return XMVectorSqrtEst(a);
}

FORCEINLINE fltx4 SqrtSIMD(const fltx4 &a)  // sqrt(a)
{
    // This is emulated from rsqrt
    return XMVectorSqrt(a);
}

FORCEINLINE fltx4
ReciprocalSqrtEstSIMD(const fltx4 &a)  // 1/sqrt(a), more or less
{
    return __vrsqrtefp(a);
}

FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD(const fltx4 &a) {
    // Convert zeros to epsilons
    fltx4 zero_mask = CmpEqSIMD(a, Four_Zeros);
    fltx4 a_safe = OrSIMD(a, AndSIMD(Four_Epsilons, zero_mask));
    return ReciprocalSqrtEstSIMD(a_safe);
}

FORCEINLINE fltx4 ReciprocalSqrtSIMD(const fltx4 &a)  // 1/sqrt(a)
{
    // This uses Newton-Raphson to improve the HW result
    return XMVectorReciprocalSqrt(a);
}

FORCEINLINE fltx4 ReciprocalEstSIMD(const fltx4 &a)  // 1/a, more or less
{
    return __vrefp(a);
}

/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton
/// iteration. No error checking!
FORCEINLINE fltx4 ReciprocalSIMD(const fltx4 &a)  // 1/a
{
    // This uses Newton-Raphson to improve the HW result
    return XMVectorReciprocal(a);
}

// FIXME: on 360, this is very slow, since it uses ReciprocalSIMD (do we need
// DivEstSIMD?)
FORCEINLINE fltx4 DivSIMD(const fltx4 &a, const fltx4 &b)  // a/b
{
    return MulSIMD(ReciprocalSIMD(b), a);
}

/// 1/x for all 4 values.
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalEstSaturateSIMD(const fltx4 &a) {
    // Convert zeros to epsilons
    fltx4 zero_mask = CmpEqSIMD(a, Four_Zeros);
    fltx4 a_safe = OrSIMD(a, AndSIMD(Four_Epsilons, zero_mask));
    return ReciprocalEstSIMD(a_safe);
}

FORCEINLINE fltx4 ReciprocalSaturateSIMD(const fltx4 &a) {
    // Convert zeros to epsilons
    fltx4 zero_mask = CmpEqSIMD(a, Four_Zeros);
    fltx4 a_safe = OrSIMD(a, AndSIMD(Four_Epsilons, zero_mask));
    return ReciprocalSIMD(a_safe);

    // FIXME: This could be faster (BUT: it doesn't preserve the sign of -0.0,
    // whereas the above does) fltx4 zeroMask = CmpEqSIMD( Four_Zeros, a );
    // fltx4 a_safe = XMVectorSelect( a, Four_Epsilons, zeroMask );
    // return ReciprocalSIMD( a_safe );
}

// CHRISG: is it worth doing integer bitfiddling for this?
// 2^x for all values (the antilog)
FORCEINLINE fltx4 ExpSIMD(const fltx4 &toPower) { return XMVectorExp(toPower); }

// Clamps the components of a vector to a specified minimum and maximum range.
FORCEINLINE fltx4 ClampVectorSIMD(FLTX4 in, FLTX4 min, FLTX4 max) {
    return XMVectorClamp(in, min, max);
}

FORCEINLINE fltx4 LoadUnalignedSIMD(const void *pSIMD) {
    return XMLoadVector4(pSIMD);
}

// load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec).
FORCEINLINE fltx4 LoadUnaligned3SIMD(const void *pSIMD) {
    return XMLoadVector3(pSIMD);
}

FORCEINLINE fltx4 LoadAlignedSIMD(const void *pSIMD) {
    return *(reinterpret_cast<const fltx4 *>(pSIMD));
}

// for the transitional class -- load a 3-by VectorAligned and squash its w
// component
FORCEINLINE fltx4 LoadAlignedSIMD(const VectorAligned &pSIMD) {
    fltx4 out = XMLoadVector3A(pSIMD.Base());
    // squelch w
    return __vrlimi(out, __vzero(), 1, 0);
}

// for the transitional class -- load a 3-by VectorAligned and squash its w
// component
FORCEINLINE fltx4 LoadAlignedSIMD(const VectorAligned *RESTRICT pSIMD) {
    fltx4 out = XMLoadVector3A(pSIMD);
    // squelch w
    return __vrlimi(out, __vzero(), 1, 0);
}

FORCEINLINE void StoreAlignedSIMD(float *pSIMD, const fltx4 &a) {
    *(reinterpret_cast<fltx4 *>(pSIMD)) = a;
}

FORCEINLINE void StoreUnalignedSIMD(float *pSIMD, const fltx4 &a) {
    XMStoreVector4(pSIMD, a);
}

FORCEINLINE void StoreUnaligned3SIMD(float *pSIMD, const fltx4 &a) {
    XMStoreVector3(pSIMD, a);
}

// strongly typed -- for typechecking as we transition to SIMD
FORCEINLINE void StoreAligned3SIMD(VectorAligned *RESTRICT pSIMD,
                                   const fltx4 &a) {
    XMStoreVector3A(pSIMD->Base(), a);
}

// Fixed-point conversion and save as SIGNED INTS.
// pDest->x = Int (vSrc.x)
// note: some architectures have means of doing
// fixed point conversion when the fix depth is
// specified as an immediate.. but there is no way
// to guarantee an immediate as a parameter to function
// like this.
FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 *RESTRICT pDest,
                                        const fltx4 &vSrc) {
    fltx4 asInt = __vctsxs(vSrc, 0);
    XMStoreVector4A(pDest->Base(), asInt);
}

FORCEINLINE void TransposeSIMD(fltx4 &x, fltx4 &y, fltx4 &z, fltx4 &w) {
    XMMATRIX xyzwMatrix = _XMMATRIX(x, y, z, w);
    xyzwMatrix = XMMatrixTranspose(xyzwMatrix);
    x = xyzwMatrix.r[0];
    y = xyzwMatrix.r[1];
    z = xyzwMatrix.r[2];
    w = xyzwMatrix.r[3];
}

// Return one in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadZeroSIMD(void) { return XMVectorZero(); }

// Return one in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD(void) { return XMVectorSplatOne(); }

FORCEINLINE fltx4 SplatXSIMD(fltx4 a) { return XMVectorSplatX(a); }

FORCEINLINE fltx4 SplatYSIMD(fltx4 a) { return XMVectorSplatY(a); }

FORCEINLINE fltx4 SplatZSIMD(fltx4 a) { return XMVectorSplatZ(a); }

FORCEINLINE fltx4 SplatWSIMD(fltx4 a) { return XMVectorSplatW(a); }

FORCEINLINE fltx4 SetXSIMD(const fltx4 &a, const fltx4 &x) {
    fltx4 result = __vrlimi(a, x, 8, 0);
    return result;
}

FORCEINLINE fltx4 SetYSIMD(const fltx4 &a, const fltx4 &y) {
    fltx4 result = __vrlimi(a, y, 4, 0);
    return result;
}

FORCEINLINE fltx4 SetZSIMD(const fltx4 &a, const fltx4 &z) {
    fltx4 result = __vrlimi(a, z, 2, 0);
    return result;
}

FORCEINLINE fltx4 SetWSIMD(const fltx4 &a, const fltx4 &w) {
    fltx4 result = __vrlimi(a, w, 1, 0);
    return result;
}

FORCEINLINE fltx4 SetComponentSIMD(const fltx4 &a, int nComponent,
                                   float flValue) {
    static int s_nVrlimiMask[4] = {8, 4, 2, 1};
    fltx4 val = ReplicateX4(flValue);
    fltx4 result = __vrlimi(a, val, s_nVrlimiMask[nComponent], 0);
    return result;
}

FORCEINLINE fltx4 RotateLeft(const fltx4 &a) {
    fltx4 compareOne = a;
    return __vrlimi(compareOne, a, 8 | 4 | 2 | 1, 1);
}

FORCEINLINE fltx4 RotateLeft2(const fltx4 &a) {
    fltx4 compareOne = a;
    return __vrlimi(compareOne, a, 8 | 4 | 2 | 1, 2);
}

// find the lowest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Though this is only five instructions long,
// they are all dependent, making this stall city.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindLowestSIMD3(const fltx4 &a) {
    // a is [x,y,z,G] (where G is garbage)
    // rotate left by one
    fltx4 compareOne = a;
    compareOne = __vrlimi(compareOne, a, 8 | 4, 1);
    // compareOne is [y,z,G,G]
    fltx4 retval = MinSIMD(a, compareOne);
    // retVal is [min(x,y), min(y,z), G, G]
    compareOne = __vrlimi(compareOne, a, 8, 2);
    // compareOne is [z, G, G, G]
    retval = MinSIMD(retval, compareOne);
    // retVal = [ min(min(x,y),z), G, G, G ]

    // splat the x component out to the whole vector and return
    return SplatXSIMD(retval);
}

// find the highest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Though this is only five instructions long,
// they are all dependent, making this stall city.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindHighestSIMD3(const fltx4 &a) {
    // a is [x,y,z,G] (where G is garbage)
    // rotate left by one
    fltx4 compareOne = a;
    compareOne = __vrlimi(compareOne, a, 8 | 4, 1);
    // compareOne is [y,z,G,G]
    fltx4 retval = MaxSIMD(a, compareOne);
    // retVal is [max(x,y), max(y,z), G, G]
    compareOne = __vrlimi(compareOne, a, 8, 2);
    // compareOne is [z, G, G, G]
    retval = MaxSIMD(retval, compareOne);
    // retVal = [ max(max(x,y),z), G, G, G ]

    // splat the x component out to the whole vector and return
    return SplatXSIMD(retval);
}

// Transform many (horizontal) points in-place by a 3x4 matrix,
// here already loaded onto three fltx4 registers.
// The points must be stored as 16-byte aligned. They are points
// and not vectors because we assume the w-component to be 1.
// To spare yourself the annoyance of loading the matrix yourself,
// use one of the overloads below.
void TransformManyPointsBy(VectorAligned *RESTRICT pVectors,
                           unsigned int numVectors, FLTX4 mRow1, FLTX4 mRow2,
                           FLTX4 mRow3);

// Transform many (horizontal) points in-place by a 3x4 matrix.
// The points must be stored as 16-byte aligned. They are points
// and not vectors because we assume the w-component to be 1.
// In this function, the matrix need not be aligned.
FORCEINLINE void TransformManyPointsBy(VectorAligned *RESTRICT pVectors,
                                       unsigned int numVectors,
                                       const matrix3x4_t &pMatrix) {
    return TransformManyPointsBy(
        pVectors, numVectors, LoadUnalignedSIMD(pMatrix[0]),
        LoadUnalignedSIMD(pMatrix[1]), LoadUnalignedSIMD(pMatrix[2]));
}

// Transform many (horizontal) points in-place by a 3x4 matrix.
// The points must be stored as 16-byte aligned. They are points
// and not vectors because we assume the w-component to be 1.
// In this function, the matrix must itself be aligned on a 16-byte
// boundary.
FORCEINLINE void TransformManyPointsByA(VectorAligned *RESTRICT pVectors,
                                        unsigned int numVectors,
                                        const matrix3x4_t &pMatrix) {
    return TransformManyPointsBy(
        pVectors, numVectors, LoadAlignedSIMD(pMatrix[0]),
        LoadAlignedSIMD(pMatrix[1]), LoadAlignedSIMD(pMatrix[2]));
}

// ------------------------------------
// INTEGER SIMD OPERATIONS.
// ------------------------------------

// Load 4 aligned words into a SIMD register
FORCEINLINE i32x4 LoadAlignedIntSIMD(const void *RESTRICT pSIMD) {
    return XMLoadVector4A(pSIMD);
}

// Load 4 unaligned words into a SIMD register
FORCEINLINE i32x4 LoadUnalignedIntSIMD(const void *RESTRICT pSIMD) {
    return XMLoadVector4(pSIMD);
}

// save into four words, 16-byte aligned
FORCEINLINE void StoreAlignedIntSIMD(int32 *pSIMD, const fltx4 &a) {
    *(reinterpret_cast<i32x4 *>(pSIMD)) = a;
}

FORCEINLINE void StoreAlignedIntSIMD(intx4 &pSIMD, const fltx4 &a) {
    *(reinterpret_cast<i32x4 *>(pSIMD.Base())) = a;
}

FORCEINLINE void StoreUnalignedIntSIMD(int32 *pSIMD, const fltx4 &a) {
    XMStoreVector4(pSIMD, a);
}

// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD(const i32x4 &vSrcA) {
    return __vcfux(vSrcA, 0);
}

// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD(const i32x4 &vSrcA) {
    return __vcfsx(vSrcA, 0);
}

// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. Each uint
// will be divided by 2^immed after conversion
// (eg, this is fixed point math).
/* as if:
   FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned
   int uImmed )
   {
   return __vcfux( vSrcA, uImmed );
   }
*/
#define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) \
    (__vcfux((vSrcA), (uImmed)))

// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. Each int
// will be divided by 2^immed (eg, this is fixed point
// math).
/* as if:
   FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int
   uImmed )
   {
   return __vcfsx( vSrcA, uImmed );
   }
*/
#define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) \
    (__vcfsx((vSrcA), (uImmed)))

// set all components of a vector to a signed immediate int number.
/* as if:
   FORCEINLINE fltx4 IntSetImmediateSIMD(int toImmediate)
   {
   return __vspltisw( toImmediate );
   }
*/
#define IntSetImmediateSIMD(x) (__vspltisw(x))

/*
  works on fltx4's as if they are four uints.
  the first parameter contains the words to be shifted,
  the second contains the amount to shift by AS INTS

  for i = 0 to 3
  shift = vSrcB_i*32:(i*32)+4
  vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
*/
FORCEINLINE fltx4 IntShiftLeftWordSIMD(fltx4 vSrcA, fltx4 vSrcB) {
    return __vslw(vSrcA, vSrcB);
}

FORCEINLINE float SubFloat(const fltx4 &a, int idx) {
    // NOTE: if the output goes into a register, this causes a Load-Hit-Store
    // stall (don't mix fpu/vpu math!)
    const fltx4_union &a_union = (const fltx4_union &)a;
    return a_union.m128_f32[idx];
}

FORCEINLINE float &SubFloat(fltx4 &a, int idx) {
    fltx4_union &a_union = (fltx4_union &)a;
    return a_union.m128_f32[idx];
}

FORCEINLINE uint32 SubFloatConvertToInt(const fltx4 &a, int idx) {
    fltx4 t = __vctuxs(a, 0);
    const fltx4_union &a_union = (const fltx4_union &)t;
    return a_union.m128_u32[idx];
}

FORCEINLINE uint32 SubInt(const fltx4 &a, int idx) {
    const fltx4_union &a_union = (const fltx4_union &)a;
    return a_union.m128_u32[idx];
}

FORCEINLINE uint32 &SubInt(fltx4 &a, int idx) {
    fltx4_union &a_union = (fltx4_union &)a;
    return a_union.m128_u32[idx];
}

#else

//---------------------------------------------------------------------
// Intel/SSE implementation
//---------------------------------------------------------------------

FORCEINLINE void StoreAlignedSIMD(float *RESTRICT pSIMD, const fltx4 &a) {
    _mm_store_ps(pSIMD, a);
}

FORCEINLINE void StoreUnalignedSIMD(float *RESTRICT pSIMD, const fltx4 &a) {
    _mm_storeu_ps(pSIMD, a);
}

FORCEINLINE fltx4 RotateLeft(const fltx4 &a);
FORCEINLINE fltx4 RotateLeft2(const fltx4 &a);

FORCEINLINE void StoreUnaligned3SIMD(float *pSIMD, const fltx4 &a) {
    _mm_store_ss(pSIMD, a);
    _mm_store_ss(pSIMD + 1, RotateLeft(a));
    _mm_store_ss(pSIMD + 2, RotateLeft2(a));
}

// strongly typed -- syntactic castor oil used for typechecking as we transition
// to SIMD
FORCEINLINE void StoreAligned3SIMD(VectorAligned *RESTRICT pSIMD,
                                   const fltx4 &a) {
    StoreAlignedSIMD(pSIMD->Base(), a);
}

FORCEINLINE fltx4 LoadAlignedSIMD(const void *pSIMD) {
    return _mm_load_ps(reinterpret_cast<const float *>(pSIMD));
}

FORCEINLINE fltx4 AndSIMD(const fltx4 &a, const fltx4 &b)  // a & b
{
    return _mm_and_ps(a, b);
}

FORCEINLINE fltx4 AndNotSIMD(const fltx4 &a, const fltx4 &b)  // ~a & b
{
    return _mm_andnot_ps(a, b);
}

FORCEINLINE fltx4 XorSIMD(const fltx4 &a, const fltx4 &b)  // a ^ b
{
    return _mm_xor_ps(a, b);
}

FORCEINLINE fltx4 OrSIMD(const fltx4 &a, const fltx4 &b)  // a | b
{
    return _mm_or_ps(a, b);
}

// Squelch the w component of a vector to +0.0.
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
FORCEINLINE fltx4 SetWToZeroSIMD(const fltx4 &a) {
    return AndSIMD(a, LoadAlignedSIMD(g_SIMD_clear_wmask));
}

// for the transitional class -- load a 3-by VectorAligned and squash its w
// component
FORCEINLINE fltx4 LoadAlignedSIMD(const VectorAligned &pSIMD) {
    return SetWToZeroSIMD(LoadAlignedSIMD(pSIMD.Base()));
}

FORCEINLINE fltx4 LoadUnalignedSIMD(const void *pSIMD) {
    return _mm_loadu_ps(reinterpret_cast<const float *>(pSIMD));
}

FORCEINLINE fltx4 LoadUnaligned3SIMD(const void *pSIMD) {
    return _mm_loadu_ps(reinterpret_cast<const float *>(pSIMD));
}

/// replicate a single 32 bit integer value to all 4 components of an m128
FORCEINLINE fltx4 ReplicateIX4(int i) {
    fltx4 value = _mm_set_ss(*((float *)&i));
    ;
    return _mm_shuffle_ps(value, value, 0);
}

FORCEINLINE fltx4 ReplicateX4(float flValue) {
    __m128 value = _mm_set_ss(flValue);
    return _mm_shuffle_ps(value, value, 0);
}

FORCEINLINE float SubFloat(const fltx4 &a, int idx) {
    // NOTE: if the output goes into a register, this causes a Load-Hit-Store
    // stall (don't mix fpu/vpu math!)
#ifndef POSIX
    return a.m128_f32[idx];
#else
    return (reinterpret_cast<float const *>(&a))[idx];
#endif
}

FORCEINLINE float &SubFloat(fltx4 &a, int idx) {
#ifndef POSIX
    return a.m128_f32[idx];
#else
    return (reinterpret_cast<float *>(&a))[idx];
#endif
}

FORCEINLINE uint32 SubFloatConvertToInt(const fltx4 &a, int idx) {
    return (uint32)SubFloat(a, idx);
}

FORCEINLINE uint32 SubInt(const fltx4 &a, int idx) {
#ifndef POSIX
    return a.m128_u32[idx];
#else
    return (reinterpret_cast<uint32 const *>(&a))[idx];
#endif
}

FORCEINLINE uint32 &SubInt(fltx4 &a, int idx) {
#ifndef POSIX
    return a.m128_u32[idx];
#else
    return (reinterpret_cast<uint32 *>(&a))[idx];
#endif
}

// Return one in the fastest way -- on the x360, faster even than loading.
FORCEINLINE fltx4 LoadZeroSIMD(void) { return Four_Zeros; }

// Return one in the fastest way -- on the x360, faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD(void) { return Four_Ones; }

FORCEINLINE fltx4 MaskedAssign(const fltx4 &ReplacementMask,
                               const fltx4 &NewValue, const fltx4 &OldValue) {
    return OrSIMD(AndSIMD(ReplacementMask, NewValue),
                  AndNotSIMD(ReplacementMask, OldValue));
}

// remember, the SSE numbers its words 3 2 1 0
// The way we want to specify shuffles is backwards from the default
// MM_SHUFFLE_REV is in array index order (default is reversed)
#define MM_SHUFFLE_REV(a, b, c, d) _MM_SHUFFLE(d, c, b, a)

FORCEINLINE fltx4 SplatXSIMD(fltx4 const &a) {
    return _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(0, 0, 0, 0));
}

FORCEINLINE fltx4 SplatYSIMD(fltx4 const &a) {
    return _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(1, 1, 1, 1));
}

FORCEINLINE fltx4 SplatZSIMD(fltx4 const &a) {
    return _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(2, 2, 2, 2));
}

FORCEINLINE fltx4 SplatWSIMD(fltx4 const &a) {
    return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3));
}

FORCEINLINE fltx4 SetXSIMD(const fltx4 &a, const fltx4 &x) {
    fltx4 result = MaskedAssign(LoadAlignedSIMD(g_SIMD_ComponentMask[0]), x, a);
    return result;
}

FORCEINLINE fltx4 SetYSIMD(const fltx4 &a, const fltx4 &y) {
    fltx4 result = MaskedAssign(LoadAlignedSIMD(g_SIMD_ComponentMask[1]), y, a);
    return result;
}

FORCEINLINE fltx4 SetZSIMD(const fltx4 &a, const fltx4 &z) {
    fltx4 result = MaskedAssign(LoadAlignedSIMD(g_SIMD_ComponentMask[2]), z, a);
    return result;
}

FORCEINLINE fltx4 SetWSIMD(const fltx4 &a, const fltx4 &w) {
    fltx4 result = MaskedAssign(LoadAlignedSIMD(g_SIMD_ComponentMask[3]), w, a);
    return result;
}

FORCEINLINE fltx4 SetComponentSIMD(const fltx4 &a, int nComponent,
                                   float flValue) {
    fltx4 val = ReplicateX4(flValue);
    fltx4 result =
        MaskedAssign(LoadAlignedSIMD(g_SIMD_ComponentMask[nComponent]), val, a);
    return result;
}

// a b c d -> b c d a
FORCEINLINE fltx4 RotateLeft(const fltx4 &a) {
    return _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(1, 2, 3, 0));
}

// a b c d -> c d a b
FORCEINLINE fltx4 RotateLeft2(const fltx4 &a) {
    return _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(2, 3, 0, 1));
}

// a b c d -> d a b c
FORCEINLINE fltx4 RotateRight(const fltx4 &a) {
    return _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1));
}

// a b c d -> c d a b
FORCEINLINE fltx4 RotateRight2(const fltx4 &a) {
    return _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2));
}

FORCEINLINE fltx4 AddSIMD(const fltx4 &a, const fltx4 &b)  // a+b
{
    return _mm_add_ps(a, b);
};

FORCEINLINE fltx4 SubSIMD(const fltx4 &a, const fltx4 &b)  // a-b
{
    return _mm_sub_ps(a, b);
};

FORCEINLINE fltx4 MulSIMD(const fltx4 &a, const fltx4 &b)  // a*b
{
    return _mm_mul_ps(a, b);
};

FORCEINLINE fltx4 DivSIMD(const fltx4 &a, const fltx4 &b)  // a/b
{
    return _mm_div_ps(a, b);
};

FORCEINLINE fltx4 MaddSIMD(const fltx4 &a, const fltx4 &b,
                           const fltx4 &c)  // a*b + c
{
    return AddSIMD(MulSIMD(a, b), c);
}

FORCEINLINE fltx4 MsubSIMD(const fltx4 &a, const fltx4 &b,
                           const fltx4 &c)  // c - a*b
{
    return SubSIMD(c, MulSIMD(a, b));
};

FORCEINLINE fltx4 Dot3SIMD(const fltx4 &a, const fltx4 &b) {
    fltx4 m = MulSIMD(a, b);
    float flDot = SubFloat(m, 0) + SubFloat(m, 1) + SubFloat(m, 2);
    return ReplicateX4(flDot);
}

FORCEINLINE fltx4 Dot4SIMD(const fltx4 &a, const fltx4 &b) {
    fltx4 m = MulSIMD(a, b);
    float flDot =
        SubFloat(m, 0) + SubFloat(m, 1) + SubFloat(m, 2) + SubFloat(m, 3);
    return ReplicateX4(flDot);
}

// TODO: implement as four-way Taylor series (see xbox implementation)
FORCEINLINE fltx4 SinSIMD(const fltx4 &radians) {
    fltx4 result;
    SubFloat(result, 0) = sin(SubFloat(radians, 0));
    SubFloat(result, 1) = sin(SubFloat(radians, 1));
    SubFloat(result, 2) = sin(SubFloat(radians, 2));
    SubFloat(result, 3) = sin(SubFloat(radians, 3));
    return result;
}

FORCEINLINE void SinCos3SIMD(fltx4 &sine, fltx4 &cosine, const fltx4 &radians) {
    // FIXME: Make a fast SSE version
    SinCos(SubFloat(radians, 0), &SubFloat(sine, 0), &SubFloat(cosine, 0));
    SinCos(SubFloat(radians, 1), &SubFloat(sine, 1), &SubFloat(cosine, 1));
    SinCos(SubFloat(radians, 2), &SubFloat(sine, 2), &SubFloat(cosine, 2));
}

FORCEINLINE void SinCosSIMD(fltx4 &sine, fltx4 &cosine,
                            const fltx4 &radians)  // a*b + c
{
    // FIXME: Make a fast SSE version
    SinCos(SubFloat(radians, 0), &SubFloat(sine, 0), &SubFloat(cosine, 0));
    SinCos(SubFloat(radians, 1), &SubFloat(sine, 1), &SubFloat(cosine, 1));
    SinCos(SubFloat(radians, 2), &SubFloat(sine, 2), &SubFloat(cosine, 2));
    SinCos(SubFloat(radians, 3), &SubFloat(sine, 3), &SubFloat(cosine, 3));
}

// TODO: implement as four-way Taylor series (see xbox implementation)
FORCEINLINE fltx4 ArcSinSIMD(const fltx4 &sine) {
    // FIXME: Make a fast SSE version
    fltx4 result;
    SubFloat(result, 0) = asin(SubFloat(sine, 0));
    SubFloat(result, 1) = asin(SubFloat(sine, 1));
    SubFloat(result, 2) = asin(SubFloat(sine, 2));
    SubFloat(result, 3) = asin(SubFloat(sine, 3));
    return result;
}

FORCEINLINE fltx4 ArcCosSIMD(const fltx4 &cs) {
    fltx4 result;
    SubFloat(result, 0) = acos(SubFloat(cs, 0));
    SubFloat(result, 1) = acos(SubFloat(cs, 1));
    SubFloat(result, 2) = acos(SubFloat(cs, 2));
    SubFloat(result, 3) = acos(SubFloat(cs, 3));
    return result;
}

// tan^1(a/b) .. ie, pass sin in as a and cos in as b
FORCEINLINE fltx4 ArcTan2SIMD(const fltx4 &a, const fltx4 &b) {
    fltx4 result;
    SubFloat(result, 0) = atan2(SubFloat(a, 0), SubFloat(b, 0));
    SubFloat(result, 1) = atan2(SubFloat(a, 1), SubFloat(b, 1));
    SubFloat(result, 2) = atan2(SubFloat(a, 2), SubFloat(b, 2));
    SubFloat(result, 3) = atan2(SubFloat(a, 3), SubFloat(b, 3));
    return result;
}

FORCEINLINE fltx4 NegSIMD(const fltx4 &a)  // negate: -a
{
    return SubSIMD(LoadZeroSIMD(), a);
}

FORCEINLINE int TestSignSIMD(
    const fltx4 &a)  // mask of which floats have the high bit set
{
    return _mm_movemask_ps(a);
}

FORCEINLINE bool IsAnyNegative(
    const fltx4 &a)  // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
    return (0 != TestSignSIMD(a));
}

FORCEINLINE fltx4 CmpEqSIMD(const fltx4 &a, const fltx4 &b)  // (a==b) ? ~0:0
{
    return _mm_cmpeq_ps(a, b);
}

FORCEINLINE fltx4 CmpGtSIMD(const fltx4 &a, const fltx4 &b)  // (a>b) ? ~0:0
{
    return _mm_cmpgt_ps(a, b);
}

FORCEINLINE fltx4 CmpGeSIMD(const fltx4 &a, const fltx4 &b)  // (a>=b) ? ~0:0
{
    return _mm_cmpge_ps(a, b);
}

FORCEINLINE fltx4 CmpLtSIMD(const fltx4 &a, const fltx4 &b)  // (a<b) ? ~0:0
{
    return _mm_cmplt_ps(a, b);
}

FORCEINLINE fltx4 CmpLeSIMD(const fltx4 &a, const fltx4 &b)  // (a<=b) ? ~0:0
{
    return _mm_cmple_ps(a, b);
}

// for branching when a.xyzw > b.xyzw
FORCEINLINE bool IsAllGreaterThan(const fltx4 &a, const fltx4 &b) {
    return TestSignSIMD(CmpLeSIMD(a, b)) == 0;
}

// for branching when a.xyzw >= b.xyzw
FORCEINLINE bool IsAllGreaterThanOrEq(const fltx4 &a, const fltx4 &b) {
    return TestSignSIMD(CmpLtSIMD(a, b)) == 0;
}

// For branching if all a.xyzw == b.xyzw
FORCEINLINE bool IsAllEqual(const fltx4 &a, const fltx4 &b) {
    return TestSignSIMD(CmpEqSIMD(a, b)) == 0xf;
}

FORCEINLINE fltx4
CmpInBoundsSIMD(const fltx4 &a, const fltx4 &b)  // (a <= b && a >= -b) ? ~0 : 0
{
    return AndSIMD(CmpLeSIMD(a, b), CmpGeSIMD(a, NegSIMD(b)));
}

FORCEINLINE fltx4 MinSIMD(const fltx4 &a, const fltx4 &b)  // min(a,b)
{
    return _mm_min_ps(a, b);
}

FORCEINLINE fltx4 MaxSIMD(const fltx4 &a, const fltx4 &b)  // max(a,b)
{
    return _mm_max_ps(a, b);
}

// SSE lacks rounding operations.
// Really.
// You can emulate them by setting the rounding mode for the
// whole processor and then converting to int, and then back again.
// But every time you set the rounding mode, you clear out the
// entire pipeline. So, I can't do them per operation. You
// have to do it once, before the loop that would call these.
// Round towards positive infinity
FORCEINLINE fltx4 CeilSIMD(const fltx4 &a) {
    fltx4 retVal;
    SubFloat(retVal, 0) = ceil(SubFloat(a, 0));
    SubFloat(retVal, 1) = ceil(SubFloat(a, 1));
    SubFloat(retVal, 2) = ceil(SubFloat(a, 2));
    SubFloat(retVal, 3) = ceil(SubFloat(a, 3));
    return retVal;
}

fltx4 fabs(const fltx4 &x);
// Round towards negative infinity
// This is the implementation that was here before; it assumes
// you are in round-to-floor mode, which I guess is usually the
// case for us vis-a-vis SSE. It's totally unnecessary on
// VMX, which has a native floor op.
FORCEINLINE fltx4 FloorSIMD(const fltx4 &val) {
    fltx4 fl4Abs = fabs(val);
    fltx4 ival = SubSIMD(AddSIMD(fl4Abs, Four_2ToThe23s), Four_2ToThe23s);
    ival =
        MaskedAssign(CmpGtSIMD(ival, fl4Abs), SubSIMD(ival, Four_Ones), ival);
    return XorSIMD(ival, XorSIMD(val, fl4Abs));  // restore sign bits
}

inline bool IsAllZeros(const fltx4 &var) {
    return TestSignSIMD(CmpEqSIMD(var, Four_Zeros)) == 0xF;
}

FORCEINLINE fltx4 SqrtEstSIMD(const fltx4 &a)  // sqrt(a), more or less
{
    return _mm_sqrt_ps(a);
}

FORCEINLINE fltx4 SqrtSIMD(const fltx4 &a)  // sqrt(a)
{
    return _mm_sqrt_ps(a);
}

FORCEINLINE fltx4
ReciprocalSqrtEstSIMD(const fltx4 &a)  // 1/sqrt(a), more or less
{
    return _mm_rsqrt_ps(a);
}

FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD(const fltx4 &a) {
    fltx4 zero_mask = CmpEqSIMD(a, Four_Zeros);
    fltx4 ret = OrSIMD(a, AndSIMD(Four_Epsilons, zero_mask));
    ret = ReciprocalSqrtEstSIMD(ret);
    return ret;
}

/// uses newton iteration for higher precision results than
/// ReciprocalSqrtEstSIMD
FORCEINLINE fltx4 ReciprocalSqrtSIMD(const fltx4 &a)  // 1/sqrt(a)
{
    fltx4 guess = ReciprocalSqrtEstSIMD(a);
    // newton iteration for 1/sqrt(a) : y(n+1) = 1/2 (y(n)*(3-a*y(n)^2));
    guess =
        MulSIMD(guess, SubSIMD(Four_Threes, MulSIMD(a, MulSIMD(guess, guess))));
    guess = MulSIMD(Four_PointFives, guess);
    return guess;
}

FORCEINLINE fltx4 ReciprocalEstSIMD(const fltx4 &a)  // 1/a, more or less
{
    return _mm_rcp_ps(a);
}

/// 1/x for all 4 values, more or less
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalEstSaturateSIMD(const fltx4 &a) {
    fltx4 zero_mask = CmpEqSIMD(a, Four_Zeros);
    fltx4 ret = OrSIMD(a, AndSIMD(Four_Epsilons, zero_mask));
    ret = ReciprocalEstSIMD(ret);
    return ret;
}

/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton
/// iteration. No error checking!
FORCEINLINE fltx4 ReciprocalSIMD(const fltx4 &a)  // 1/a
{
    fltx4 ret = ReciprocalEstSIMD(a);
    // newton iteration is: Y(n+1) = 2*Y(n)-a*Y(n)^2
    ret = SubSIMD(AddSIMD(ret, ret), MulSIMD(a, MulSIMD(ret, ret)));
    return ret;
}

/// 1/x for all 4 values.
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalSaturateSIMD(const fltx4 &a) {
    fltx4 zero_mask = CmpEqSIMD(a, Four_Zeros);
    fltx4 ret = OrSIMD(a, AndSIMD(Four_Epsilons, zero_mask));
    ret = ReciprocalSIMD(ret);
    return ret;
}

// CHRISG: is it worth doing integer bitfiddling for this?
// 2^x for all values (the antilog)
FORCEINLINE fltx4 ExpSIMD(const fltx4 &toPower) {
    fltx4 retval;
    SubFloat(retval, 0) = powf(2, SubFloat(toPower, 0));
    SubFloat(retval, 1) = powf(2, SubFloat(toPower, 1));
    SubFloat(retval, 2) = powf(2, SubFloat(toPower, 2));
    SubFloat(retval, 3) = powf(2, SubFloat(toPower, 3));

    return retval;
}

// Clamps the components of a vector to a specified minimum and maximum range.
FORCEINLINE fltx4 ClampVectorSIMD(FLTX4 in, FLTX4 min, FLTX4 max) {
    return MaxSIMD(min, MinSIMD(max, in));
}

FORCEINLINE void TransposeSIMD(fltx4 &x, fltx4 &y, fltx4 &z, fltx4 &w) {
    _MM_TRANSPOSE4_PS(x, y, z, w);
}

FORCEINLINE fltx4 FindLowestSIMD3(const fltx4 &a) {
    // a is [x,y,z,G] (where G is garbage)
    // rotate left by one
    fltx4 compareOne = RotateLeft(a);
    // compareOne is [y,z,G,x]
    fltx4 retval = MinSIMD(a, compareOne);
    // retVal is [min(x,y), ... ]
    compareOne = RotateLeft2(a);
    // compareOne is [z, G, x, y]
    retval = MinSIMD(retval, compareOne);
    // retVal = [ min(min(x,y),z)..]
    // splat the x component out to the whole vector and return
    return SplatXSIMD(retval);
}

FORCEINLINE fltx4 FindHighestSIMD3(const fltx4 &a) {
    // a is [x,y,z,G] (where G is garbage)
    // rotate left by one
    fltx4 compareOne = RotateLeft(a);
    // compareOne is [y,z,G,x]
    fltx4 retval = MaxSIMD(a, compareOne);
    // retVal is [max(x,y), ... ]
    compareOne = RotateLeft2(a);
    // compareOne is [z, G, x, y]
    retval = MaxSIMD(retval, compareOne);
    // retVal = [ max(max(x,y),z)..]
    // splat the x component out to the whole vector and return
    return SplatXSIMD(retval);
}

// ------------------------------------
// INTEGER SIMD OPERATIONS.
// ------------------------------------

#if 0 /* pc does not have these ops */
// splat all components of a vector to a signed immediate int number.
FORCEINLINE fltx4 IntSetImmediateSIMD(int to)
{
	//CHRISG: SSE2 has this, but not SSE1. What to do?
	fltx4 retval;
	SubInt( retval, 0 ) = to;
	SubInt( retval, 1 ) = to;
	SubInt( retval, 2 ) = to;
	SubInt( retval, 3 ) = to;
	return retval;
}
#endif

// Load 4 aligned words into a SIMD register
FORCEINLINE i32x4 LoadAlignedIntSIMD(const void *RESTRICT pSIMD) {
    return _mm_load_ps(reinterpret_cast<const float *>(pSIMD));
}

// Load 4 unaligned words into a SIMD register
FORCEINLINE i32x4 LoadUnalignedIntSIMD(const void *RESTRICT pSIMD) {
    return _mm_loadu_ps(reinterpret_cast<const float *>(pSIMD));
}

// save into four words, 16-byte aligned
FORCEINLINE void StoreAlignedIntSIMD(int32 *RESTRICT pSIMD, const fltx4 &a) {
    _mm_store_ps(reinterpret_cast<float *>(pSIMD), a);
}

FORCEINLINE void StoreAlignedIntSIMD(intx4 &pSIMD, const fltx4 &a) {
    _mm_store_ps(reinterpret_cast<float *>(pSIMD.Base()), a);
}

FORCEINLINE void StoreUnalignedIntSIMD(int32 *RESTRICT pSIMD, const fltx4 &a) {
    _mm_storeu_ps(reinterpret_cast<float *>(pSIMD), a);
}

// CHRISG: the conversion functions all seem to operate on m64's only...
// how do we make them work here?

// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD(const u32x4 &vSrcA) {
    fltx4 retval;
    SubFloat(retval, 0) = ((float)SubInt(retval, 0));
    SubFloat(retval, 1) = ((float)SubInt(retval, 1));
    SubFloat(retval, 2) = ((float)SubInt(retval, 2));
    SubFloat(retval, 3) = ((float)SubInt(retval, 3));
    return retval;
}

// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD(const i32x4 &vSrcA) {
    fltx4 retval;
    SubFloat(retval, 0) = ((float)(reinterpret_cast<const int32 *>(&vSrcA)[0]));
    SubFloat(retval, 1) = ((float)(reinterpret_cast<const int32 *>(&vSrcA)[1]));
    SubFloat(retval, 2) = ((float)(reinterpret_cast<const int32 *>(&vSrcA)[2]));
    SubFloat(retval, 3) = ((float)(reinterpret_cast<const int32 *>(&vSrcA)[3]));
    return retval;
}

/*
  works on fltx4's as if they are four uints.
  the first parameter contains the words to be shifted,
  the second contains the amount to shift by AS INTS

  for i = 0 to 3
  shift = vSrcB_i*32:(i*32)+4
  vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
*/
FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB) {
    i32x4 retval;
    SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0);
    SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1);
    SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2);
    SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3);

    return retval;
}

// Fixed-point conversion and save as SIGNED INTS.
// pDest->x = Int (vSrc.x)
// note: some architectures have means of doing
// fixed point conversion when the fix depth is
// specified as an immediate.. but there is no way
// to guarantee an immediate as a parameter to function
// like this.
FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 *RESTRICT pDest,
                                        const fltx4 &vSrc) {
    __m64 bottom = _mm_cvttps_pi32(vSrc);
    __m64 top = _mm_cvttps_pi32(_mm_movehl_ps(vSrc, vSrc));

    *reinterpret_cast<__m64 *>(&(*pDest)[0]) = bottom;
    *reinterpret_cast<__m64 *>(&(*pDest)[2]) = top;

    _mm_empty();
}

#endif

/// class FourVectors stores 4 independent vectors for use in SIMD processing.
/// These vectors are stored in the format x x x x y y y y z z z z so that they
/// can be efficiently SIMD-accelerated.
class ALIGN16 FourVectors {
   public:
    fltx4 x, y, z;

    FORCEINLINE void DuplicateVector(
        Vector const &v)  //< set all 4 vectors to the same vector value
    {
        x = ReplicateX4(v.x);
        y = ReplicateX4(v.y);
        z = ReplicateX4(v.z);
    }

    FORCEINLINE fltx4 const &operator[](int idx) const { return *((&x) + idx); }

    FORCEINLINE fltx4 &operator[](int idx) { return *((&x) + idx); }

    FORCEINLINE void operator+=(
        FourVectors const &b)  //< add 4 vectors to another 4 vectors
    {
        x = AddSIMD(x, b.x);
        y = AddSIMD(y, b.y);
        z = AddSIMD(z, b.z);
    }

    FORCEINLINE void operator-=(
        FourVectors const &b)  //< subtract 4 vectors from another 4
    {
        x = SubSIMD(x, b.x);
        y = SubSIMD(y, b.y);
        z = SubSIMD(z, b.z);
    }

    FORCEINLINE void operator*=(
        FourVectors const &b)  //< scale all four vectors per component scale
    {
        x = MulSIMD(x, b.x);
        y = MulSIMD(y, b.y);
        z = MulSIMD(z, b.z);
    }

    FORCEINLINE void operator*=(const fltx4 &scale)  //< scale
    {
        x = MulSIMD(x, scale);
        y = MulSIMD(y, scale);
        z = MulSIMD(z, scale);
    }

    FORCEINLINE void operator*=(float scale)  //< uniformly scale all 4 vectors
    {
        fltx4 scalepacked = ReplicateX4(scale);
        *this *= scalepacked;
    }

    FORCEINLINE fltx4 operator*(FourVectors const &b) const  //< 4 dot products
    {
        fltx4 dot = MulSIMD(x, b.x);
        dot = MaddSIMD(y, b.y, dot);
        dot = MaddSIMD(z, b.z, dot);
        return dot;
    }

    FORCEINLINE fltx4 operator*(
        Vector const &b) const  //< dot product all 4 vectors with 1 vector
    {
        fltx4 dot = MulSIMD(x, ReplicateX4(b.x));
        dot = MaddSIMD(y, ReplicateX4(b.y), dot);
        dot = MaddSIMD(z, ReplicateX4(b.z), dot);
        return dot;
    }

    FORCEINLINE void VProduct(
        FourVectors const &b)  //< component by component mul
    {
        x = MulSIMD(x, b.x);
        y = MulSIMD(y, b.y);
        z = MulSIMD(z, b.z);
    }
    FORCEINLINE void MakeReciprocal(void)  //< (x,y,z)=(1/x,1/y,1/z)
    {
        x = ReciprocalSIMD(x);
        y = ReciprocalSIMD(y);
        z = ReciprocalSIMD(z);
    }

    FORCEINLINE void MakeReciprocalSaturate(
        void)  //< (x,y,z)=(1/x,1/y,1/z), 1/0=1.0e23
    {
        x = ReciprocalSaturateSIMD(x);
        y = ReciprocalSaturateSIMD(y);
        z = ReciprocalSaturateSIMD(z);
    }

    // Assume the given matrix is a rotation, and rotate these vectors by it.
    // If you have a long list of FourVectors structures that you all want
    // to rotate by the same matrix, use FourVectors::RotateManyBy() instead.
    inline void RotateBy(const matrix3x4_t &matrix);

    /// You can use this to rotate a long array of FourVectors all by the same
    /// matrix. The first parameter is the head of the array. The second is the
    /// number of vectors to rotate. The third is the matrix.
    static void RotateManyBy(FourVectors *RESTRICT pVectors,
                             unsigned int numVectors,
                             const matrix3x4_t &rotationMatrix);

    /// Assume the vectors are points, and transform them in place by the
    /// matrix.
    inline void TransformBy(const matrix3x4_t &matrix);

    /// You can use this to Transform a long array of FourVectors all by the
    /// same matrix. The first parameter is the head of the array. The second is
    /// the number of vectors to rotate. The third is the matrix. The fourth is
    /// the output buffer, which must not overlap the pVectors buffer. This is
    /// not an in-place transformation.
    static void TransformManyBy(FourVectors *RESTRICT pVectors,
                                unsigned int numVectors,
                                const matrix3x4_t &rotationMatrix,
                                FourVectors *RESTRICT pOut);

    /// You can use this to Transform a long array of FourVectors all by the
    /// same matrix. The first parameter is the head of the array. The second is
    /// the number of vectors to rotate. The third is the matrix. The fourth is
    /// the output buffer, which must not overlap the pVectors buffer. This is
    /// an in-place transformation.
    static void TransformManyBy(FourVectors *RESTRICT pVectors,
                                unsigned int numVectors,
                                const matrix3x4_t &rotationMatrix);

    // X(),Y(),Z() - get at the desired component of the i'th (0..3) vector.
    FORCEINLINE const float &X(int idx) const {
        // NOTE: if the output goes into a register, this causes a
        // Load-Hit-Store stall (don't mix fpu/vpu math!)
        return SubFloat((fltx4 &)x, idx);
    }

    FORCEINLINE const float &Y(int idx) const {
        return SubFloat((fltx4 &)y, idx);
    }

    FORCEINLINE const float &Z(int idx) const {
        return SubFloat((fltx4 &)z, idx);
    }

    FORCEINLINE float &X(int idx) { return SubFloat(x, idx); }

    FORCEINLINE float &Y(int idx) { return SubFloat(y, idx); }

    FORCEINLINE float &Z(int idx) { return SubFloat(z, idx); }

    FORCEINLINE Vector Vec(int idx) const  //< unpack one of the vectors
    {
        return Vector(X(idx), Y(idx), Z(idx));
    }

    FourVectors(void) {}

    FourVectors(FourVectors const &src) {
        x = src.x;
        y = src.y;
        z = src.z;
    }

    FORCEINLINE void operator=(FourVectors const &src) {
        x = src.x;
        y = src.y;
        z = src.z;
    }

    /// LoadAndSwizzle - load 4 Vectors into a FourVectors, performing transpose
    /// op
    FORCEINLINE void LoadAndSwizzle(Vector const &a, Vector const &b,
                                    Vector const &c, Vector const &d) {
        // TransposeSIMD has large sub-expressions that the compiler can't
        // eliminate on x360 use an unfolded implementation here
#if _X360
        fltx4 tx = LoadUnalignedSIMD(&a.x);
        fltx4 ty = LoadUnalignedSIMD(&b.x);
        fltx4 tz = LoadUnalignedSIMD(&c.x);
        fltx4 tw = LoadUnalignedSIMD(&d.x);
        fltx4 r0 = __vmrghw(tx, tz);
        fltx4 r1 = __vmrghw(ty, tw);
        fltx4 r2 = __vmrglw(tx, tz);
        fltx4 r3 = __vmrglw(ty, tw);

        x = __vmrghw(r0, r1);
        y = __vmrglw(r0, r1);
        z = __vmrghw(r2, r3);
#else
        x = LoadUnalignedSIMD(&(a.x));
        y = LoadUnalignedSIMD(&(b.x));
        z = LoadUnalignedSIMD(&(c.x));
        fltx4 w = LoadUnalignedSIMD(&(d.x));
        // now, matrix is:
        // x y z ?
        // x y z ?
        // x y z ?
        // x y z ?
        TransposeSIMD(x, y, z, w);
#endif
    }

    /// LoadAndSwizzleAligned - load 4 Vectors into a FourVectors, performing
    /// transpose op. all 4 vectors must be 128 bit boundary
    FORCEINLINE void LoadAndSwizzleAligned(const float *RESTRICT a,
                                           const float *RESTRICT b,
                                           const float *RESTRICT c,
                                           const float *RESTRICT d) {
#if _X360
        fltx4 tx = LoadAlignedSIMD(a);
        fltx4 ty = LoadAlignedSIMD(b);
        fltx4 tz = LoadAlignedSIMD(c);
        fltx4 tw = LoadAlignedSIMD(d);
        fltx4 r0 = __vmrghw(tx, tz);
        fltx4 r1 = __vmrghw(ty, tw);
        fltx4 r2 = __vmrglw(tx, tz);
        fltx4 r3 = __vmrglw(ty, tw);

        x = __vmrghw(r0, r1);
        y = __vmrglw(r0, r1);
        z = __vmrghw(r2, r3);
#else
        x = LoadAlignedSIMD(a);
        y = LoadAlignedSIMD(b);
        z = LoadAlignedSIMD(c);
        fltx4 w = LoadAlignedSIMD(d);
        // now, matrix is:
        // x y z ?
        // x y z ?
        // x y z ?
        // x y z ?
        TransposeSIMD(x, y, z, w);
#endif
    }

    FORCEINLINE void LoadAndSwizzleAligned(Vector const &a, Vector const &b,
                                           Vector const &c, Vector const &d) {
        LoadAndSwizzleAligned(&a.x, &b.x, &c.x, &d.x);
    }

    /// return the squared length of all 4 vectors
    FORCEINLINE fltx4 length2(void) const { return (*this) * (*this); }

    /// return the approximate length of all 4 vectors. uses the sqrt
    /// approximation instruction
    FORCEINLINE fltx4 length(void) const { return SqrtEstSIMD(length2()); }

    /// normalize all 4 vectors in place. not mega-accurate (uses reciprocal
    /// approximation instruction)
    FORCEINLINE void VectorNormalizeFast(void) {
        fltx4 mag_sq = (*this) * (*this);          // length^2
        (*this) *= ReciprocalSqrtEstSIMD(mag_sq);  // *(1.0/sqrt(length^2))
    }

    /// normalize all 4 vectors in place.
    FORCEINLINE void VectorNormalize(void) {
        fltx4 mag_sq = (*this) * (*this);       // length^2
        (*this) *= ReciprocalSqrtSIMD(mag_sq);  // *(1.0/sqrt(length^2))
    }

    /// construct a FourVectors from 4 separate Vectors
    FORCEINLINE FourVectors(Vector const &a, Vector const &b, Vector const &c,
                            Vector const &d) {
        LoadAndSwizzle(a, b, c, d);
    }

    /// construct a FourVectors from 4 separate Vectors
    FORCEINLINE FourVectors(VectorAligned const &a, VectorAligned const &b,
                            VectorAligned const &c, VectorAligned const &d) {
        LoadAndSwizzleAligned(a, b, c, d);
    }

    FORCEINLINE fltx4 DistToSqr(FourVectors const &pnt) {
        fltx4 fl4dX = SubSIMD(pnt.x, x);
        fltx4 fl4dY = SubSIMD(pnt.y, y);
        fltx4 fl4dZ = SubSIMD(pnt.z, z);
        return AddSIMD(MulSIMD(fl4dX, fl4dX),
                       AddSIMD(MulSIMD(fl4dY, fl4dY), MulSIMD(fl4dZ, fl4dZ)));
    }

    FORCEINLINE fltx4 TValueOfClosestPointOnLine(FourVectors const &p0,
                                                 FourVectors const &p1) const {
        FourVectors lineDelta = p1;
        lineDelta -= p0;
        fltx4 OOlineDirDotlineDir = ReciprocalSIMD(p1 * p1);
        FourVectors v4OurPnt = *this;
        v4OurPnt -= p0;
        return MulSIMD(OOlineDirDotlineDir, v4OurPnt * lineDelta);
    }

    FORCEINLINE fltx4 DistSqrToLineSegment(FourVectors const &p0,
                                           FourVectors const &p1) const {
        FourVectors lineDelta = p1;
        FourVectors v4OurPnt = *this;
        v4OurPnt -= p0;
        lineDelta -= p0;

        fltx4 OOlineDirDotlineDir = ReciprocalSIMD(lineDelta * lineDelta);

        fltx4 fl4T = MulSIMD(OOlineDirDotlineDir, v4OurPnt * lineDelta);

        fl4T = MinSIMD(fl4T, Four_Ones);
        fl4T = MaxSIMD(fl4T, Four_Zeros);
        lineDelta *= fl4T;
        return v4OurPnt.DistToSqr(lineDelta);
    }
};

/// form 4 cross products
inline FourVectors operator^(const FourVectors &a, const FourVectors &b) {
    FourVectors ret;
    ret.x = SubSIMD(MulSIMD(a.y, b.z), MulSIMD(a.z, b.y));
    ret.y = SubSIMD(MulSIMD(a.z, b.x), MulSIMD(a.x, b.z));
    ret.z = SubSIMD(MulSIMD(a.x, b.y), MulSIMD(a.y, b.x));
    return ret;
}

/// component-by-componentwise MAX operator
inline FourVectors maximum(const FourVectors &a, const FourVectors &b) {
    FourVectors ret;
    ret.x = MaxSIMD(a.x, b.x);
    ret.y = MaxSIMD(a.y, b.y);
    ret.z = MaxSIMD(a.z, b.z);
    return ret;
}

/// component-by-componentwise MIN operator
inline FourVectors minimum(const FourVectors &a, const FourVectors &b) {
    FourVectors ret;
    ret.x = MinSIMD(a.x, b.x);
    ret.y = MinSIMD(a.y, b.y);
    ret.z = MinSIMD(a.z, b.z);
    return ret;
}

/// calculate reflection vector. incident and normal dir assumed normalized
FORCEINLINE FourVectors VectorReflect(const FourVectors &incident,
                                      const FourVectors &normal) {
    FourVectors ret = incident;
    fltx4 iDotNx2 = incident * normal;
    iDotNx2 = AddSIMD(iDotNx2, iDotNx2);
    FourVectors nPart = normal;
    nPart *= iDotNx2;
    ret -= nPart;  // i-2(n*i)n
    return ret;
}

/// calculate slide vector. removes all components of a vector which are
/// perpendicular to a normal vector.
FORCEINLINE FourVectors VectorSlide(const FourVectors &incident,
                                    const FourVectors &normal) {
    FourVectors ret = incident;
    fltx4 iDotN = incident * normal;
    FourVectors nPart = normal;
    nPart *= iDotN;
    ret -= nPart;  // i-(n*i)n
    return ret;
}

// Assume the given matrix is a rotation, and rotate these vectors by it.
// If you have a long list of FourVectors structures that you all want
// to rotate by the same matrix, use FourVectors::RotateManyBy() instead.
void FourVectors::RotateBy(const matrix3x4_t &matrix) {
    // Splat out each of the entries in the matrix to a fltx4. Do this
    // in the order that we will need them, to hide latency. I'm
    // avoiding making an array of them, so that they'll remain in
    // registers.
    fltx4 matSplat00, matSplat01, matSplat02, matSplat10, matSplat11,
        matSplat12, matSplat20, matSplat21, matSplat22;

    {
        // Load the matrix into local vectors. Sadly, matrix3x4_ts are
        // often unaligned. The w components will be the tranpose row of
        // the matrix, but we don't really care about that.
        fltx4 matCol0 = LoadUnalignedSIMD(matrix[0]);
        fltx4 matCol1 = LoadUnalignedSIMD(matrix[1]);
        fltx4 matCol2 = LoadUnalignedSIMD(matrix[2]);

        matSplat00 = SplatXSIMD(matCol0);
        matSplat01 = SplatYSIMD(matCol0);
        matSplat02 = SplatZSIMD(matCol0);

        matSplat10 = SplatXSIMD(matCol1);
        matSplat11 = SplatYSIMD(matCol1);
        matSplat12 = SplatZSIMD(matCol1);

        matSplat20 = SplatXSIMD(matCol2);
        matSplat21 = SplatYSIMD(matCol2);
        matSplat22 = SplatZSIMD(matCol2);
    }

    // Trust in the compiler to schedule these operations correctly:
    fltx4 outX, outY, outZ;
    outX = AddSIMD(AddSIMD(MulSIMD(x, matSplat00), MulSIMD(y, matSplat01)),
                   MulSIMD(z, matSplat02));
    outY = AddSIMD(AddSIMD(MulSIMD(x, matSplat10), MulSIMD(y, matSplat11)),
                   MulSIMD(z, matSplat12));
    outZ = AddSIMD(AddSIMD(MulSIMD(x, matSplat20), MulSIMD(y, matSplat21)),
                   MulSIMD(z, matSplat22));

    x = outX;
    y = outY;
    z = outZ;
}

// Assume the given matrix is a rotation, and rotate these vectors by it.
// If you have a long list of FourVectors structures that you all want
// to rotate by the same matrix, use FourVectors::RotateManyBy() instead.
void FourVectors::TransformBy(const matrix3x4_t &matrix) {
    // Splat out each of the entries in the matrix to a fltx4. Do this
    // in the order that we will need them, to hide latency. I'm
    // avoiding making an array of them, so that they'll remain in
    // registers.
    fltx4 matSplat00, matSplat01, matSplat02, matSplat10, matSplat11,
        matSplat12, matSplat20, matSplat21, matSplat22;

    {
        // Load the matrix into local vectors. Sadly, matrix3x4_ts are
        // often unaligned. The w components will be the tranpose row of
        // the matrix, but we don't really care about that.
        fltx4 matCol0 = LoadUnalignedSIMD(matrix[0]);
        fltx4 matCol1 = LoadUnalignedSIMD(matrix[1]);
        fltx4 matCol2 = LoadUnalignedSIMD(matrix[2]);

        matSplat00 = SplatXSIMD(matCol0);
        matSplat01 = SplatYSIMD(matCol0);
        matSplat02 = SplatZSIMD(matCol0);

        matSplat10 = SplatXSIMD(matCol1);
        matSplat11 = SplatYSIMD(matCol1);
        matSplat12 = SplatZSIMD(matCol1);

        matSplat20 = SplatXSIMD(matCol2);
        matSplat21 = SplatYSIMD(matCol2);
        matSplat22 = SplatZSIMD(matCol2);
    }

    // Trust in the compiler to schedule these operations correctly:
    fltx4 outX, outY, outZ;

    outX = MaddSIMD(z, matSplat02,
                    AddSIMD(MulSIMD(x, matSplat00), MulSIMD(y, matSplat01)));
    outY = MaddSIMD(z, matSplat12,
                    AddSIMD(MulSIMD(x, matSplat10), MulSIMD(y, matSplat11)));
    outZ = MaddSIMD(z, matSplat22,
                    AddSIMD(MulSIMD(x, matSplat20), MulSIMD(y, matSplat21)));

    x = AddSIMD(outX, ReplicateX4(matrix[0][3]));
    y = AddSIMD(outY, ReplicateX4(matrix[1][3]));
    z = AddSIMD(outZ, ReplicateX4(matrix[2][3]));
}

/// quick, low quality perlin-style noise() function suitable for real time use.
/// return value is -1..1. Only reliable around +/- 1 million or so.
fltx4 NoiseSIMD(const fltx4 &x, const fltx4 &y, const fltx4 &z);
fltx4 NoiseSIMD(FourVectors const &v);

// vector valued noise direction
FourVectors DNoiseSIMD(FourVectors const &v);

// vector value "curl" noise function. see
// http://hyperphysics.phy-astr.gsu.edu/hbase/curl.html
FourVectors CurlNoiseSIMD(FourVectors const &v);

/// calculate the absolute value of a packed single
inline fltx4 fabs(const fltx4 &x) {
    return AndSIMD(x, LoadAlignedSIMD(g_SIMD_clear_signmask));
}

/// negate all four components of a SIMD packed single
inline fltx4 fnegate(const fltx4 &x) {
    return XorSIMD(x, LoadAlignedSIMD(g_SIMD_signmask));
}

fltx4 Pow_FixedPoint_Exponent_SIMD(const fltx4 &x, int exponent);

// PowSIMD - raise a SIMD register to a power.  This is analogous to the C pow()
// function, with some restictions: fractional exponents are only handled with 2
// bits of precision. Basically, fractions of 0,.25,.5, and .75 are handled.
// PowSIMD(x,.30) will be the same as PowSIMD(x,.25). negative and fractional
// powers are handled by the SIMD reciprocal and square root approximation
// instructions and so are not especially accurate ----Note that this routine
// does not raise numeric exceptions because it uses SIMD--- This routine is
// O(log2(exponent)).
inline fltx4 PowSIMD(const fltx4 &x, float exponent) {
    return Pow_FixedPoint_Exponent_SIMD(x, (int)(4.0 * exponent));
}

// random number generation - generate 4 random numbers quickly.

void SeedRandSIMD(uint32 seed);    // seed the random # generator
fltx4 RandSIMD(int nContext = 0);  // return 4 numbers in the 0..1 range

// for multithreaded, you need to use these and use the argument form of
// RandSIMD:
int GetSIMDRandContext(void);
void ReleaseSIMDRandContext(int nContext);

FORCEINLINE fltx4 RandSignedSIMD(void)  // -1..1
{
    return SubSIMD(MulSIMD(Four_Twos, RandSIMD()), Four_Ones);
}

// SIMD versions of mathlib simplespline functions
// hermite basis function for smooth interpolation
// Similar to Gain() above, but very cheap to call
// value should be between 0 & 1 inclusive
inline fltx4 SimpleSpline(const fltx4 &value) {
    // Arranged to avoid a data dependency between these two MULs:
    fltx4 valueDoubled = MulSIMD(value, Four_Twos);
    fltx4 valueSquared = MulSIMD(value, value);

    // Nice little ease-in, ease-out spline-like curve
    return SubSIMD(MulSIMD(Four_Threes, valueSquared),
                   MulSIMD(valueDoubled, valueSquared));
}

// remaps a value in [startInterval, startInterval+rangeInterval] from linear to
// spline using SimpleSpline
inline fltx4 SimpleSplineRemapValWithDeltas(const fltx4 &val, const fltx4 &A,
                                            const fltx4 &BMinusA,
                                            const fltx4 &OneOverBMinusA,
                                            const fltx4 &C,
                                            const fltx4 &DMinusC) {
    // 	if ( A == B )
    // 		return val >= B ? D : C;
    fltx4 cVal = MulSIMD(SubSIMD(val, A), OneOverBMinusA);
    return AddSIMD(C, MulSIMD(DMinusC, SimpleSpline(cVal)));
}

inline fltx4 SimpleSplineRemapValWithDeltasClamped(
    const fltx4 &val, const fltx4 &A, const fltx4 &BMinusA,
    const fltx4 &OneOverBMinusA, const fltx4 &C, const fltx4 &DMinusC) {
    // 	if ( A == B )
    // 		return val >= B ? D : C;
    fltx4 cVal = MulSIMD(SubSIMD(val, A), OneOverBMinusA);
    cVal = MinSIMD(Four_Ones, MaxSIMD(Four_Zeros, cVal));
    return AddSIMD(C, MulSIMD(DMinusC, SimpleSpline(cVal)));
}

FORCEINLINE fltx4 FracSIMD(const fltx4 &val) {
    fltx4 fl4Abs = fabs(val);
    fltx4 ival = SubSIMD(AddSIMD(fl4Abs, Four_2ToThe23s), Four_2ToThe23s);
    ival =
        MaskedAssign(CmpGtSIMD(ival, fl4Abs), SubSIMD(ival, Four_Ones), ival);
    return XorSIMD(SubSIMD(fl4Abs, ival),
                   XorSIMD(val, fl4Abs));  // restore sign bits
}

FORCEINLINE fltx4 Mod2SIMD(const fltx4 &val) {
    fltx4 fl4Abs = fabs(val);
    fltx4 ival = SubSIMD(AndSIMD(LoadAlignedSIMD((float *)g_SIMD_lsbmask),
                                 AddSIMD(fl4Abs, Four_2ToThe23s)),
                         Four_2ToThe23s);
    ival =
        MaskedAssign(CmpGtSIMD(ival, fl4Abs), SubSIMD(ival, Four_Twos), ival);
    return XorSIMD(SubSIMD(fl4Abs, ival),
                   XorSIMD(val, fl4Abs));  // restore sign bits
}

FORCEINLINE fltx4 Mod2SIMDPositiveInput(const fltx4 &val) {
    fltx4 ival = SubSIMD(
        AndSIMD(LoadAlignedSIMD(g_SIMD_lsbmask), AddSIMD(val, Four_2ToThe23s)),
        Four_2ToThe23s);
    ival = MaskedAssign(CmpGtSIMD(ival, val), SubSIMD(ival, Four_Twos), ival);
    return SubSIMD(val, ival);
}

// approximate sin of an angle, with -1..1 representing the whole sin wave
// period instead of -pi..pi. no range reduction is done - for values outside of
// 0..1 you won't like the results
FORCEINLINE fltx4 _SinEst01SIMD(const fltx4 &val) {
    // really rough approximation - x*(4-x*4) - a parabola. s(0) = 0, s(.5) = 1,
    // s(1)=0, smooth in-between. sufficient for simple oscillation.
    return MulSIMD(val, SubSIMD(Four_Fours, MulSIMD(val, Four_Fours)));
}

FORCEINLINE fltx4 _Sin01SIMD(const fltx4 &val) {
    // not a bad approximation : parabola always over-estimates. Squared
    // parabola always underestimates. So lets blend between them:  goodsin =
    // badsin + .225*( badsin^2-badsin)
    fltx4 fl4BadEst =
        MulSIMD(val, SubSIMD(Four_Fours, MulSIMD(val, Four_Fours)));
    return AddSIMD(MulSIMD(Four_Point225s,
                           SubSIMD(MulSIMD(fl4BadEst, fl4BadEst), fl4BadEst)),
                   fl4BadEst);
}

// full range useable implementations
FORCEINLINE fltx4 SinEst01SIMD(const fltx4 &val) {
    fltx4 fl4Abs = fabs(val);
    fltx4 fl4Reduced2 = Mod2SIMDPositiveInput(fl4Abs);
    fltx4 fl4OddMask = CmpGeSIMD(fl4Reduced2, Four_Ones);
    fltx4 fl4val = SubSIMD(fl4Reduced2, AndSIMD(Four_Ones, fl4OddMask));
    fltx4 fl4Sin = _SinEst01SIMD(fl4val);
    fl4Sin = XorSIMD(fl4Sin, AndSIMD(LoadAlignedSIMD(g_SIMD_signmask),
                                     XorSIMD(val, fl4OddMask)));
    return fl4Sin;
}

FORCEINLINE fltx4 Sin01SIMD(const fltx4 &val) {
    fltx4 fl4Abs = fabs(val);
    fltx4 fl4Reduced2 = Mod2SIMDPositiveInput(fl4Abs);
    fltx4 fl4OddMask = CmpGeSIMD(fl4Reduced2, Four_Ones);
    fltx4 fl4val = SubSIMD(fl4Reduced2, AndSIMD(Four_Ones, fl4OddMask));
    fltx4 fl4Sin = _Sin01SIMD(fl4val);
    fl4Sin = XorSIMD(fl4Sin, AndSIMD(LoadAlignedSIMD(g_SIMD_signmask),
                                     XorSIMD(val, fl4OddMask)));
    return fl4Sin;
}

// Schlick style Bias approximation see graphics gems 4 : bias(t,a)= t/(
// (1/a-2)*(1-t)+1)

FORCEINLINE fltx4 PreCalcBiasParameter(const fltx4 &bias_parameter) {
    // convert perlin-style-bias parameter to the value right for the
    // approximation
    return SubSIMD(ReciprocalSIMD(bias_parameter), Four_Twos);
}

FORCEINLINE fltx4 BiasSIMD(const fltx4 &val, const fltx4 &precalc_param) {
    // similar to bias function except pass precalced bias value from calling
    // PreCalcBiasParameter.

    //!!speed!! use reciprocal est?
    //!!speed!! could save one op by precalcing _2_ values
    return DivSIMD(val, AddSIMD(MulSIMD(precalc_param, SubSIMD(Four_Ones, val)),
                                Four_Ones));
}

//-----------------------------------------------------------------------------
// Box/plane test
// NOTE: The w component of emins + emaxs must be 1 for this to work
//-----------------------------------------------------------------------------
FORCEINLINE int BoxOnPlaneSideSIMD(const fltx4 &emins, const fltx4 &emaxs,
                                   const cplane_t *p, float tolerance = 0.f) {
    fltx4 corners[2];
    fltx4 normal = LoadUnalignedSIMD(p->normal.Base());
    fltx4 dist = ReplicateX4(-p->dist);
    normal = SetWSIMD(normal, dist);
    fltx4 t4 = ReplicateX4(tolerance);
    fltx4 negt4 = ReplicateX4(-tolerance);
    fltx4 cmp = CmpGeSIMD(normal, Four_Zeros);
    corners[0] = MaskedAssign(cmp, emaxs, emins);
    corners[1] = MaskedAssign(cmp, emins, emaxs);
    fltx4 dot1 = Dot4SIMD(normal, corners[0]);
    fltx4 dot2 = Dot4SIMD(normal, corners[1]);
    cmp = CmpGeSIMD(dot1, t4);
    fltx4 cmp2 = CmpGtSIMD(negt4, dot2);
    fltx4 result = MaskedAssign(cmp, Four_Ones, Four_Zeros);
    fltx4 result2 = MaskedAssign(cmp2, Four_Twos, Four_Zeros);
    result = AddSIMD(result, result2);
    intx4 sides;
    ConvertStoreAsIntsSIMD(&sides, result);
    return sides[0];
}

#endif  // _ssemath_h