Dreamcast: simplify code a bit more

2025-09-15 10:35:11 -04:00 · 2024-06-21 19:25:02 +10:00 · 2024-06-21 19:25:02 +10:00 · 655850e81a
commit 655850e81a
parent e91ce96900
9 changed files with 75 additions and 754 deletions
--- a/src/AudioBackend.c
+++ b/src/AudioBackend.c
@ -1435,7 +1435,7 @@ cc_result Audio_AllocChunks(cc_uint32 size, struct AudioChunk* chunks, int numCh
 	void* dst = memalign(32, size * numChunks);
 	if (!dst) return ERR_OUT_OF_MEMORY;
 	totalSize += size * numChunks;
-	Platform_Log3("ALLOC: %i X %i (%i)", &size, &numChunks, &totalSize);
+	//Platform_Log3("ALLOC: %i X %i (%i)", &size, &numChunks, &totalSize);
 	for (int i = 0; i < numChunks; i++) 
 	{
--- a/src/Graphics_Dreamcast.c
+++ b/src/Graphics_Dreamcast.c
@ -18,7 +18,7 @@ static cc_bool renderingDisabled;
 /*########################################################################################################################*
 *---------------------------------------------------------General---------------------------------------------------------*
 *#########################################################################################################################*/
-static int InitPowerVR(void) {
+static void InitPowerVR(void) {
 	cc_bool autosort = false; // Turn off auto sorting to match traditional GPU behaviour
 	cc_bool fsaa     = false;
 	AUTOSORT_ENABLED = autosort;
@ -507,7 +507,7 @@ cc_bool Gfx_WarnIfNecessary(void) {
 /*########################################################################################################################*
 *----------------------------------------------------------Drawing--------------------------------------------------------*
 *#########################################################################################################################*/
-extern void apply_poly_header(pvr_poly_hdr_t* header, PolyList* activePolyList);
+extern void apply_poly_header(pvr_poly_hdr_t* header, int list_type);
 extern Vertex* DrawColouredQuads(const void* src, Vertex* dst, int numQuads);
 extern Vertex* DrawTexturedQuads(const void* src, Vertex* dst, int numQuads);
@ -522,7 +522,7 @@ void DrawQuads(int count, void* src) {
 	Vertex* beg = aligned_vector_reserve(&output->vector, vec->size + (header_required) + count);
 	if (header_required) {
-		apply_poly_header((pvr_poly_hdr_t*)beg, output);
+		apply_poly_header((pvr_poly_hdr_t*)beg, output->list_type);
 		STATE_DIRTY = GL_FALSE;
 		beg++; 
 		vec->size += 1;
@ -632,9 +632,13 @@ void Gfx_SetViewport(int x, int y, int w, int h) {
 	}
 	STATE_DIRTY = GL_TRUE;
 	glViewport(x, y, w, h);
 	glScissor (x, y, w, h);
 	VIEWPORT.hwidth  = w *  0.5f;
 	VIEWPORT.hheight = h * -0.5f;
 	VIEWPORT.x_plus_hwidth  = x + w * 0.5f;
 	VIEWPORT.y_plus_hheight = y + h * 0.5f;
 	VP_COL_HWIDTH  = VP_TEX_HWIDTH  = w *  0.5f;
 	VP_COL_HHEIGHT = VP_TEX_HHEIGHT = h * -0.5f;
--- a/src/Platform_PSP.c
+++ b/src/Platform_PSP.c
@ -327,7 +327,7 @@ cc_result Socket_Create(cc_socket* s, cc_sockaddr* addr, cc_bool nonblocking) {
 cc_result Socket_Connect(cc_socket s, cc_sockaddr* addr) {
 	struct sockaddr* raw = (struct sockaddr*)addr->data;
-	int res = sceNetInetConnect(*s, raw, addr->size);
+	int res = sceNetInetConnect(s, raw, addr->size);
 	return res < 0 ? sceNetInetGetErrno() : 0;
 }
--- a/third_party/gldc/src/aligned_vector.h
+++ b/third_party/gldc/src/aligned_vector.h
@ -112,14 +112,3 @@ AV_FORCE_INLINE void* aligned_vector_extend(AlignedVector* vector, const uint32_
    return ret;
 }
 /* Empties the vector by resetting its element count to zero.
  * Only the size field is written here; capacity and data are left as they are. */
 AV_FORCE_INLINE void aligned_vector_clear(AlignedVector* vector){
    vector->size = 0;
 }
 /* Puts a vector header into its empty state: no storage pointer, zero capacity, zero elements. */
 AV_FORCE_INLINE void aligned_vector_init(AlignedVector* vector) {
    vector->data     = NULL;
    vector->capacity = 0;
    vector->size     = 0;
 }
--- a/third_party/gldc/src/flush.c
+++ b/third_party/gldc/src/flush.c
@ -1,52 +0,0 @@
 #include <stdbool.h>
 #include "gldc.h"
 PolyList OP_LIST;
 PolyList PT_LIST;
 PolyList TR_LIST;
 /* Initialises the context and texture state, then prepares the three polygon lists
  * (OP, PT, TR): each gets its PVR list type, an initialised vector header and an
  * up-front reservation. */
 void glKosInit() {
    _glInitContext();
    _glInitTextures();
    /* OP list */
    OP_LIST.list_type = PVR_LIST_OP_POLY;
    aligned_vector_init(&OP_LIST.vector);
    aligned_vector_reserve(&OP_LIST.vector, 1024 * 3);
    /* PT list (half the reservation of the other two) */
    PT_LIST.list_type = PVR_LIST_PT_POLY;
    aligned_vector_init(&PT_LIST.vector);
    aligned_vector_reserve(&PT_LIST.vector,  512 * 3);
    /* TR list */
    TR_LIST.list_type = PVR_LIST_TR_POLY;
    aligned_vector_init(&TR_LIST.vector);
    aligned_vector_reserve(&TR_LIST.vector, 1024 * 3);
 }
 /* Submits the queued polygon lists to the PVR as one scene and then empties them.
  * A list is only opened when it holds at least 3 entries. */
 void glKosSwapBuffers() {
    /* Scissor is applied with the 'force' argument set */
    _glApplyScissor(true);
    pvr_scene_begin();
    /* OP list */
    if (OP_LIST.vector.size >= 3) {
        pvr_list_begin(PVR_LIST_OP_POLY);
        SceneListSubmit((Vertex*)OP_LIST.vector.data, OP_LIST.vector.size);
        pvr_list_finish();
    }
    /* PT list */
    if (PT_LIST.vector.size >= 3) {
        pvr_list_begin(PVR_LIST_PT_POLY);
        SceneListSubmit((Vertex*)PT_LIST.vector.data, PT_LIST.vector.size);
        pvr_list_finish();
    }
    /* TR list */
    if (TR_LIST.vector.size >= 3) {
        pvr_list_begin(PVR_LIST_TR_POLY);
        SceneListSubmit((Vertex*)TR_LIST.vector.data, TR_LIST.vector.size);
        pvr_list_finish();
    }
    pvr_scene_finish();
    /* Everything queued has been handed over (or skipped); start over with empty lists */
    OP_LIST.vector.size = 0;
    PT_LIST.vector.size = 0;
    TR_LIST.vector.size = 0;
 }
--- a/third_party/gldc/src/gldc.h
+++ b/third_party/gldc/src/gldc.h
@ -2,15 +2,10 @@
 #define PRIVATE_H
 #include <stdint.h>
 #include <stdio.h>
 #include <kos.h>
 #include <dc/pvr.h>
 #include "aligned_vector.h"
 #define MAX_TEXTURE_COUNT 768
 #define GL_SCISSOR_TEST     0x0008
 #define GL_NEAREST          0x2600
 #define GL_LINEAR           0x2601
 #define GL_OUT_OF_MEMORY    0x0505
@ -33,7 +28,6 @@ void   gldcBindTexture(GLuint texture);
 int  gldcAllocTexture(int w, int h, int format);
 void gldcGetTexture(void** data, int* width, int* height);
 void glViewport(int x, int y, int width, int height);
 void glScissor( int x, int y, int width, int height);
 void glKosInit();
@ -106,8 +100,6 @@ typedef struct {
 } __attribute__((aligned(32))) TextureObject;
 void _glInitContext();
 void _glInitSubmissionTarget();
 void _glInitTextures();
 extern TextureObject* TEXTURE_ACTIVE;
@ -152,10 +144,6 @@ void _glApplyScissor(int force);
 extern GLboolean STATE_DIRTY;
 #define MIN(a,b) (((a)<(b))?(a):(b))
 #define MAX(a,b) (((a)>(b))?(a):(b))
 #define CLAMP( X, _MIN, _MAX )  ( (X)<(_MIN) ? (_MIN) : ((X)>(_MAX) ? (_MAX) : (X)) )
 void SceneListSubmit(Vertex* v2, int n);
 static inline int DimensionFlag(int w) {
--- a/third_party/gldc/src/sh4_math.h
+++ b/third_party/gldc/src/sh4_math.h
@ -133,9 +133,6 @@ static const ALL_FLOATS_STRUCT MATH_identity_matrix = {1.0f, 0.0f, 0.0f, 0.0f, 0
  // a*b+c
  float MATH_fmac(float a, float b, float c)
  // a*b-c
  float MATH_fmac_Dec(float a, float b, float c)
  // fminf() - return the min of two floats
  // This doesn't check for NaN
  float MATH_Fast_Fminf(float a, float b)
@ -205,19 +202,6 @@ static inline __attribute__((always_inline)) float MATH_fmac(float a, float b, f
  return c;
 }
 // a*b-c
 // Computes (a * b) - c in two instructions: 'c' is negated in place, then the
 // multiply-accumulate adds fr0 * b into it. The updated 'c' is the return value.
 // NOTE(review): 'a' is bound through the "w" constraint while the asm names fr0
 // directly -- presumably "w" pins the operand to fr0; confirm against GCC's SH constraints.
 static inline __attribute__((always_inline)) float MATH_fmac_Dec(float a, float b, float c)
 {
  asm volatile ("fneg %[floatc]\n\t"
    "fmac fr0, %[floatb], %[floatc]\n"
    : [floatc] "+&f" (c) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
    : "w" (a), [floatb] "f" (b) // inputs
    : // no clobbers
  );
  return c;
 }
 // Fast fminf() - return the min of two floats
 // This doesn't check for NaN
 static inline __attribute__((always_inline)) float MATH_Fast_Fminf(float a, float b)
@ -833,18 +817,6 @@ static inline __attribute__((always_inline)) _Complex float MATH_fsca_Float_Rad(
  // Sum of Squares (w^2 + x^2 + y^2 + z^2)
  float MATH_Sum_of_Squares(float w, float x, float y, float z)
  // Cross product with bonus multiply (vec X vec = orthogonal vec, with an extra a*b=c)
  RETURN_VECTOR_STRUCT MATH_Cross_Product_with_Mult(float x1, float x2, float x3, float y1, float y2, float y3, float a, float b)
  // Cross product (vec X vec = orthogonal vec)
  RETURN_VECTOR_STRUCT MATH_Cross_Product(float x1, float x2, float x3, float y1, float y2, float y3)
  // Outer product (vec (X) vec = 4x4 matrix)
  void MATH_Outer_Product(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4)
  // Matrix transform (4x4 matrix * 4x1 vec = 4x1 vec)
  RETURN_VECTOR_STRUCT MATH_Matrix_Transform(float x1, float x2, float x3, float x4)
  // 4x4 Matrix transpose (XMTRX^T)
  void MATH_Matrix_Transpose(void)
@ -953,434 +925,6 @@ static inline __attribute__((always_inline)) float MATH_Sum_of_Squares(float w,
  return __z;
 }
 // Cross product: vec X vec = orthogonal vec
 //   _    _       _    _       _    _
 //  |  x1  |     |  y1  |     |  z1  |
 //  |  x2  |  X  |  y2  |  =  |  z2  |
 //  |_ x3 _|     |_ y3 _|     |_ z3 _|
 //
 // With bonus multiply:
 //
 //      a     *     b      =      c
 //
 // IMPORTANT USAGE INFORMATION (cross product):
 //
 // Return vector struct maps as below to the above diagram:
 //
 //  typedef struct {
 //   float z1;
 //   float z2;
 //   float z3;
 //   float z4; // c is stored in z4, and c = a*b if using 'with mult' version (else c = 0)
 // } RETURN_VECTOR_STRUCT;
 //
 //  For people familiar with the unit vector notation, z1 == 'i', z2 == 'j',
 //  and z3 == 'k'.
 //
 // The cross product matrix will also be stored in XMTRX after this, so calling
 // MATH_Matrix_Transform() on a vector after using this function will do a cross
 // product with the same x1-x3 values and a multiply with the same 'a' value
 // as used in this function. In this situation, 'a' will be multiplied with
 // the x4 parameter of MATH_Matrix_Transform(). a = 0 if not using the 'with mult'
 // version of the cross product function.
 //
 // For reference, XMTRX will look like this:
 //
 //  [  0 -x3 x2 0 ]
 //  [  x3 0 -x1 0 ]
 //  [ -x2 x1 0  0 ]
 //  [  0  0  0  a ] (<-- a = 0 if not using 'with mult')
 //
 // Similarly to how the sine and cosine functions use fsca and return 2 floats,
 // the cross product functions actually return 4 floats. The first 3 are the
 // cross product output, and the 4th is a*b. The SH4 only multiplies 4x4
 // matrices with 4x1 vectors, which is why the output is like that--but it means
 // we also get a bonus float multiplication while we do our cross product!
 //
 // Please do not call this function directly (notice the weird syntax); call
 // MATH_Cross_Product() or MATH_Cross_Product_with_Mult() instead.
 // Internal worker for the 'with mult' cross product.
 // Note the parameter order: (x3, a, y3, b, x2, x1, y1, y2), not (x1, x2, x3, ...).
 // Stages (y1, y2, y3, b) in the back bank, builds the cross product matrix (plus 'a')
 // in the front bank, then swaps banks and runs a single ftrv. The four resulting
 // floats are returned as {z1, z2, z3, c}.
 static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT xMATH_do_Cross_Product_with_Mult(float x3, float a, float y3, float b, float x2, float x1, float y1, float y2)
 {
  // FR4-FR11 are the regs that are passed in, in that order.
  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.
  // Temporary variables are necessary per GCC to avoid clobbering:
  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
  float tx1 = x1;
  float tx2 = x2;
  float tx3 = x3;
  float ta = a;
  float ty1 = y1;
  float ty2 = y2;
  float ty3 = y3;
  float tb = b;
  register float __x1 __asm__("fr9") = tx1; // need to negate (need to move to fr6, then negate fr9)
  register float __x2 __asm__("fr8") = tx2; // in place for matrix (need to move to fr2 then negate fr2)
  register float __x3 __asm__("fr4") = tx3; // need to negate (move to fr1 first, then negate fr4)
  register float __a __asm__("fr5") = ta;
  register float __y1 __asm__("fr10") = ty1;
  register float __y2 __asm__("fr11") = ty2;
  register float __y3 __asm__("fr6") = ty3;
  register float __b __asm__("fr7") = tb;
  register float __z1 __asm__("fr0") = 0.0f; // z1
  register float __z2 __asm__("fr1") = 0.0f; // z2 (not moving x3 here yet since a double 0 is needed)
  register float __z3 __asm__("fr2") = tx2; // z3 (this handles putting x2 in fr2)
  register float __c __asm__("fr3") = 0.0f; // c
  // This actually does a matrix transform to do the cross product.
  // It's this:
  //                   _    _       _            _
  //  [  0 -x3 x2 0 ] |  y1  |     | -x3y2 + x2y3 |
  //  [  x3 0 -x1 0 ] |  y2  |  =  |  x3y1 - x1y3 |
  //  [ -x2 x1 0  0 ] |  y3  |     | -x2y1 + x1y2 |
  //  [  0  0  0  a ] |_ b  _|     |_      c     _|
  //
  asm volatile (
    // set up back bank's FV0
    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)
    // Save FR12-FR15, which are supposed to be preserved across function calls.
    // This stops them from getting clobbered and saves 4 stack pushes (memory accesses).
    "fmov DR12, XD12\n\t"
    "fmov DR14, XD14\n\t"
    "fmov DR10, XD0\n\t" // fmov 'y1' and 'y2' from FR10, FR11 into position (XF0, XF1)
    "fmov DR6, XD2\n\t" // fmov 'y3' and 'b' from FR6, FR7 into position (XF2, XF3)
    // pair move zeros for some speed in setting up front bank for matrix
    "fmov DR0, DR10\n\t" // clear FR10, FR11
    "fmov DR0, DR12\n\t" // clear FR12, FR13
    "fschg\n\t" // switch back to single moves
    // prepare front bank for XMTRX
    "fmov FR5, FR15\n\t" // fmov 'a' into position
    "fmov FR0, FR14\n\t" // clear out FR14
    "fmov FR0, FR7\n\t" // clear out FR7
    "fmov FR0, FR5\n\t" // clear out FR5
    "fneg FR2\n\t" // set up 'x2'
    "fmov FR9, FR6\n\t" // set up 'x1'
    "fneg FR9\n\t"
    "fmov FR4, FR1\n\t" // set up 'x3'
    "fneg FR4\n\t"
    // flip banks and matrix multiply
    "frchg\n\t"
    "ftrv XMTRX, FV0\n"
  : "+&w" (__z1), "+&f" (__z2), "+&f" (__z3), "+&f" (__c) // output (using FV0)
  : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__y1), "f" (__y2), "f" (__y3), "f" (__a), "f" (__b) // inputs
  : // clobbers (all of the float regs get clobbered, except for FR12-FR15 which were specially preserved)
  );
  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __c};
  return output;
 }
 // Please do not call this function directly (notice the weird syntax); call
 // MATH_Cross_Product() or MATH_Cross_Product_with_Mult() instead.
 // Internal worker for the plain cross product (no bonus multiply).
 // Note the parameter order: (x3, zero, x1, y3, x2, x1_2, y1, y2). 'x1' is taken twice
 // (once as 'x1', once as 'x1_2') and 'zero' supplies the value used to clear registers.
 // The four resulting floats are returned as {z1, z2, z3, c}.
 static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT xMATH_do_Cross_Product(float x3, float zero, float x1, float y3, float x2, float x1_2, float y1, float y2)
 {
  // FR4-FR11 are the regs that are passed in, in that order.
  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.
  // Temporary variables are necessary per GCC to avoid clobbering:
  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
  float tx1 = x1;
  float tx2 = x2;
  float tx3 = x3;
  float tx1_2 = x1_2;
  float ty1 = y1;
  float ty2 = y2;
  float ty3 = y3;
  float tzero = zero;
  register float __x1 __asm__("fr6") = tx1; // in place
  register float __x2 __asm__("fr8") = tx2; // in place (fmov to fr2, negate fr2)
  register float __x3 __asm__("fr4") = tx3; // need to negate (fmov to fr1, negate fr4)
  register float __zero __asm__("fr5") = tzero; // in place
  register float __x1_2 __asm__("fr9") = tx1_2; // need to negate
  register float __y1 __asm__("fr10") = ty1;
  register float __y2 __asm__("fr11") = ty2;
  // no __y3 needed in this function
  register float __z1 __asm__("fr0") = tzero; // z1
  register float __z2 __asm__("fr1") = tzero; // z2
  register float __z3 __asm__("fr2") = ty3; // z3
  register float __c __asm__("fr3") = tzero; // c
  // This actually does a matrix transform to do the cross product.
  // It's this:
  //                   _    _       _            _
  //  [  0 -x3 x2 0 ] |  y1  |     | -x3y2 + x2y3 |
  //  [  x3 0 -x1 0 ] |  y2  |  =  |  x3y1 - x1y3 |
  //  [ -x2 x1 0  0 ] |  y3  |     | -x2y1 + x1y2 |
  //  [  0  0  0  0 ] |_ 0  _|     |_      0     _|
  //
  asm volatile (
    // zero out FR7. For some reason, if this is done in C after __z3 is set:
    // register float __y3 __asm__("fr7") = tzero;
    // then GCC will emit a spurious stack push (pushing FR12). So just zero it here.
    "fmov FR5, FR7\n\t"
    // set up back bank's FV0
    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)
    // Save FR12-FR15, which are supposed to be preserved across function calls.
    // This stops them from getting clobbered and saves 4 stack pushes (memory accesses).
    "fmov DR12, XD12\n\t"
    "fmov DR14, XD14\n\t"
    "fmov DR10, XD0\n\t" // fmov 'y1' and 'y2' from FR10, FR11 into position (XF0, XF1)
    "fmov DR2, XD2\n\t" // fmov 'y3' and '0' from FR2, FR3 into position (XF2, XF3)
    // pair move zeros for some speed in setting up front bank for matrix
    "fmov DR0, DR10\n\t" // clear FR10, FR11
    "fmov DR0, DR12\n\t" // clear FR12, FR13
    "fmov DR0, DR14\n\t" // clear FR14, FR15
    "fschg\n\t" // switch back to single moves
    // prepare front bank for XMTRX
    "fneg FR9\n\t" // set up 'x1'
    "fmov FR8, FR2\n\t" // set up 'x2'
    "fneg FR2\n\t"
    "fmov FR4, FR1\n\t" // set up 'x3'
    "fneg FR4\n\t"
    // flip banks and matrix multiply
    "frchg\n\t"
    "ftrv XMTRX, FV0\n"
  : "+&w" (__z1), "+&f" (__z2), "+&f" (__z3), "+&f" (__c) // output (using FV0)
  : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__y1), "f" (__y2), "f" (__zero), "f" (__x1_2) // inputs
  : "fr7" // clobbers (all of the float regs get clobbered, except for FR12-FR15 which were specially preserved)
  );
  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __c};
  return output;
 }
 //------------------------------------------------------------------------------
 // Functions that wrap the xMATH_do_Cross_Product[_with_Mult]() functions to make
 // it easier to organize parameters
 //------------------------------------------------------------------------------
 // Cross product with a bonus float multiply (c = a * b)
 // Takes the two 3-vectors (x1..x3, y1..y3) and the multiply operands (a, b) in natural order
 // and forwards them in the order the worker declares: (x3, a, y3, b, x2, x1, y1, y2).
 // Returns the worker's RETURN_VECTOR_STRUCT unchanged.
 static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Cross_Product_with_Mult(float x1, float x2, float x3, float y1, float y2, float y3, float a, float b)
 {
  return xMATH_do_Cross_Product_with_Mult(x3, a, y3, b, x2, x1, y1, y2);
 }
 // Plain cross product; does not use the bonus float multiply (c = 0 and a in the cross product matrix will be 0)
 // This is a tiny bit faster than 'with_mult' (about 2 cycles faster)
 // Forwards to the worker in its declared order (x3, zero, x1, y3, x2, x1_2, y1, y2):
 // a literal 0.0f is supplied for 'zero' and x1 is passed twice (as 'x1' and as 'x1_2').
 // Returns the worker's RETURN_VECTOR_STRUCT unchanged.
 static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Cross_Product(float x1, float x2, float x3, float y1, float y2, float y3)
 {
  return xMATH_do_Cross_Product(x3, 0.0f, x1, y3, x2, x1, y1, y2);
 }
 // Outer product: vec (X) vec = matrix
 //   _    _
 //  |  x1  |
 //  |  x2  |  (X)  [ y1 y2 y3 y4 ] = 4x4 matrix
 //  |  x3  |
 //  |_ x4 _|
 //
 // This returns the floats in the back bank (XF0-15), which are inaccessible
 // outside of using frchg or paired-move fmov. It's meant to set up a matrix for
 // use with other matrix functions. GCC also does not touch the XFn bank.
 // This will also wipe out anything stored in the float registers, as it uses the
 // whole FPU register file (all 32 of the float registers).
 // Takes the column vector in x1-x4 and the row vector in y1-y4. There is no C-level
 // return value and the asm block declares no outputs; the product is left in the
 // XF registers by the final frchg.
 static inline __attribute__((always_inline)) void MATH_Outer_Product(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4)
 {
  // FR4-FR11 are the regs that are passed in, in that order.
  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.
  // Temporary variables are necessary per GCC to avoid clobbering:
  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
  float tx1 = x1;
  float tx2 = x2;
  float tx3 = x3;
  float tx4 = x4;
  float ty1 = y1;
  float ty2 = y2;
  float ty3 = y3;
  float ty4 = y4;
  // vector FV4
  register float __x1 __asm__("fr4") = tx1;
  register float __x2 __asm__("fr5") = tx2;
  register float __x3 __asm__("fr6") = tx3;
  register float __x4 __asm__("fr7") = tx4;
  // vector FV8
  register float __y1 __asm__("fr8") = ty1;
  register float __y2 __asm__("fr9") = ty2;
  register float __y3 __asm__("fr10") = ty3; // in place already
  register float __y4 __asm__("fr11") = ty4;
  // This actually does a 4x4 matrix multiply to do the outer product.
  // It's this:
  //
  //  [ x1 x1 x1 x1 ] [ y1 0 0 0 ]     [ x1y1 x1y2 x1y3 x1y4 ]
  //  [ x2 x2 x2 x2 ] [ 0 y2 0 0 ]  =  [ x2y1 x2y2 x2y3 x2y4 ]
  //  [ x3 x3 x3 x3 ] [ 0 0 y3 0 ]     [ x3y1 x3y2 x3y3 x3y4 ]
  //  [ x4 x4 x4 x4 ] [ 0 0 0 y4 ]     [ x4y1 x4y2 x4y3 x4y4 ]
  //
  asm volatile (
    // zero out unoccupied front floats to make a double 0 in DR2
    "fldi0 FR2\n\t"
    "fmov FR2, FR3\n\t"
    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)
    // fmov 'x1' and 'x2' from FR4, FR5 into position (XF0,4,8,12, XF1,5,9,13)
    "fmov DR4, XD0\n\t"
    "fmov DR4, XD4\n\t"
    "fmov DR4, XD8\n\t"
    "fmov DR4, XD12\n\t"
    // fmov 'x3' and 'x4' from FR6, FR7 into position (XF2,6,10,14, XF3,7,11,15)
    "fmov DR6, XD2\n\t"
    "fmov DR6, XD6\n\t"
    "fmov DR6, XD10\n\t"
    "fmov DR6, XD14\n\t"
    // set up front floats (y1-y4)
    "fmov DR8, DR0\n\t"
    "fmov DR8, DR4\n\t"
    "fmov DR10, DR14\n\t"
    // finish zeroing out front floats
    "fmov DR2, DR6\n\t"
    "fmov DR2, DR8\n\t"
    "fmov DR2, DR12\n\t"
    "fschg\n\t" // switch back to single-move mode
    // zero out remaining values and matrix multiply 4x4
    "fmov FR2, FR1\n\t"
    "ftrv XMTRX, FV0\n\t"
    "fmov FR6, FR4\n\t"
    "ftrv XMTRX, FV4\n\t"
    "fmov FR8, FR11\n\t"
    "ftrv XMTRX, FV8\n\t"
    "fmov FR12, FR14\n\t"
    "ftrv XMTRX, FV12\n\t"
    // Save output in XF regs
    "frchg\n"
  : // no outputs
  : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__x4), "f" (__y1), "f" (__y2), "f" (__y3), "f" (__y4) // inputs
  : "fr0", "fr1", "fr2", "fr3", "fr12", "fr13", "fr14", "fr15" // clobbers, can't avoid it
  );
  // GCC will restore FR12-FR15 from the stack after this, so we really can't keep the output in the front bank.
 }
 // Matrix transform: matrix * vector = vector
 //                   _    _       _    _
 //  [ ----------- ] |  x1  |     |  z1  |
 //  [ ---XMTRX--- ] |  x2  |  =  |  z2  |
 //  [ ----------- ] |  x3  |     |  z3  |
 //  [ ----------- ] |_ x4 _|     |_ z4 _|
 //
 // IMPORTANT USAGE INFORMATION (matrix transform):
 //
 // Return vector struct maps 1:1 to the above diagram:
 //
 //  typedef struct {
 //   float z1;
 //   float z2;
 //   float z3;
 //   float z4;
 // } RETURN_VECTOR_STRUCT;
 //
 // Similarly to how the sine and cosine functions use fsca and return 2 floats,
 // the matrix transform function actually returns 4 floats. The SH4 only multiplies
 // 4x4 matrices with 4x1 vectors, which is why the output is like that.
 //
 // Multiply a matrix stored in the back bank (XMTRX) with an input vector
 // Places x1-x4 in fr0-fr3 (FV0), executes one ftrv against XMTRX, and returns the
 // four resulting floats as {z1, z2, z3, z4}. XMTRX itself is not written by this block.
 static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Matrix_Transform(float x1, float x2, float x3, float x4)
 {
  // The floats comprising FV4 are the regs that are passed in.
  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.
  // Temporary variables are necessary per GCC to avoid clobbering:
  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
  float tx1 = x1;
  float tx2 = x2;
  float tx3 = x3;
  float tx4 = x4;
  // output vector FV0
  register float __z1 __asm__("fr0") = tx1;
  register float __z2 __asm__("fr1") = tx2;
  register float __z3 __asm__("fr2") = tx3;
  register float __z4 __asm__("fr3") = tx4;
  asm volatile ("ftrv XMTRX, FV0\n\t"
    // have to do this to obey SH4 calling convention--output returned in FV0
    : "+w" (__z1), "+f" (__z2), "+f" (__z3), "+f" (__z4) // outputs, "+" means r/w
    : // no inputs
    : // no clobbers
  );
  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __z4};
  return output;
 }
 // Matrix Transpose
 //
 // This does a matrix transpose on the matrix in XMTRX, which swaps rows with
 // columns as follows (math notation is [XMTRX]^T):
 //
 //  [ a b c d ] T   [ a e i m ]
 //  [ e f g h ]  =  [ b f j n ]
 //  [ i j k l ]     [ c g k o ]
 //  [ m n o p ]     [ d h l p ]
 //
 // PLEASE NOTE: It is faster to avoid the need for a transpose altogether by
 // structuring matrices and vectors accordingly.
 // Takes no arguments and returns nothing. Swaps banks, exchanges the six off-diagonal
 // register pairs using FPUL as the scratch slot (hence the "fpul" clobber), then
 // swaps banks back.
 static inline __attribute__((always_inline)) void MATH_Matrix_Transpose(void)
 {
  asm volatile (
    "frchg\n\t" // fmov for singles only works on front bank
    // FR0, FR5, FR10, and FR15 are already in place
    // swap FR1 and FR4
    "flds FR1, FPUL\n\t"
    "fmov FR4, FR1\n\t"
    "fsts FPUL, FR4\n\t"
    // swap FR2 and FR8
    "flds FR2, FPUL\n\t"
    "fmov FR8, FR2\n\t"
    "fsts FPUL, FR8\n\t"
    // swap FR3 and FR12
    "flds FR3, FPUL\n\t"
    "fmov FR12, FR3\n\t"
    "fsts FPUL, FR12\n\t"
    // swap FR6 and FR9
    "flds FR6, FPUL\n\t"
    "fmov FR9, FR6\n\t"
    "fsts FPUL, FR9\n\t"
    // swap FR7 and FR13
    "flds FR7, FPUL\n\t"
    "fmov FR13, FR7\n\t"
    "fsts FPUL, FR13\n\t"
    // swap FR11 and FR14
    "flds FR11, FPUL\n\t"
    "fmov FR14, FR11\n\t"
    "fsts FPUL, FR14\n\t"
    // restore XMTRX to back bank
    "frchg\n"
    : // no outputs
    : // no inputs
    : "fpul" // clobbers
  );
 }
 // Matrix product: matrix * matrix = matrix
 //
 // These use the whole dang floating point unit.
@ -1582,73 +1126,14 @@ static inline __attribute__((always_inline)) ALL_FLOATS_STRUCT * MATH_Store_XMTR
 //
 /*
  //------------------------------------------------------------------------------
  // Commonly useful functions
  //------------------------------------------------------------------------------
  // Returns 1 if point 't' is inside triangle with vertices 'v0', 'v1', and 'v2', and 0 if not
  int MATH_Is_Point_In_Triangle(float v0x, float v0y, float v1x, float v1y, float v2x, float v2y, float ptx, float pty)
  //------------------------------------------------------------------------------
  // Interpolation
  //------------------------------------------------------------------------------
  // Linear interpolation
  float MATH_Lerp(float a, float b, float t)
  // Spherical interpolation ('theta' in fsca units)
  float MATH_Slerp(float a, float b, float t, float theta)
  //------------------------------------------------------------------------------
  // Fast Sinc functions (unnormalized, sin(x)/x version)
  //------------------------------------------------------------------------------
  // Just pass in MATH_pi * x for normalized versions :)
  // Sinc function (fsca units)
  float MATH_Fast_Sincf(float x)
  // Sinc function (degrees)
  float MATH_Fast_Sincf_Deg(float x)
  // Sinc function (rads)
  float MATH_Fast_Sincf_Rad(float x)
 */
 //------------------------------------------------------------------------------
 // Commonly useful functions
 //------------------------------------------------------------------------------
 // Returns 1 if point 'pt' is inside triangle with vertices 'v0', 'v1', and 'v2', and 0 if not
 // Determines triangle center using barycentric coordinate transformation
 // Adapted from: https://stackoverflow.com/questions/2049582/how-to-determine-if-a-point-is-in-a-2d-triangle
 // Specifically the answer by user 'adreasdr' in addition to the comment by user 'urraka' on the answer from user 'Andreas Brinck'
 //
 // The notation here assumes v0x is the x-component of v0, v0y is the y-component of v0, etc.
 //
 static inline __attribute__((always_inline)) int MATH_Is_Point_In_Triangle(float v0x, float v0y, float v1x, float v1y, float v2x, float v2y, float ptx, float pty)
 {
  // Three MATH_fipr evaluations: the two barycentric terms for 'pt', and the area term of the triangle
  float s = MATH_fipr(v0y, -v0x, v2y - v0y, v0x - v2x, v2x, v2y, ptx, pty);
  float t = MATH_fipr(v0x, -v0y, v0y - v1y, v1x - v0x, v1y, v1x, ptx, pty);
  float area = MATH_fipr(-v1y, v0y, v0x, v1x, v2x, -v1x + v2x, v1y - v2y, v2y);
  // The area term changes sign with the triangle's winding; flip all three together
  // so the test below works for either winding
  if(area < 0.0f)
  {
    s = -s;
    t = -t;
    area = -area;
  }
  // Inside only when both terms are strictly positive and their sum stays below the area term
  return ((s > 0.0f) && (t > 0.0f) && (area > (s + t))) ? 1 : 0;
 }
 //------------------------------------------------------------------------------
 // Interpolation
 //------------------------------------------------------------------------------
@ -1659,123 +1144,6 @@ static inline __attribute__((always_inline)) float MATH_Lerp(float a, float b, f
  return MATH_fmac(t, (b-a), a);
 }
 // Spherical interpolation ('theta' in fsca units)
 static inline __attribute__((always_inline)) float MATH_Slerp(float a, float b, float t, float theta)
 {
  // 'a' is a component of v0 and 'b' the matching component of v1. The textbook form
  //   v = ( v0 * sin(theta - t * theta) + v1 * sin(t * theta) ) / sin(theta)
  // is rewritten with sin(x + y) = sin(x)cos(y) + cos(x)sin(y) and sin(-x) = -sin(x) into
  //   v = v0 * cos(-t * theta) + ( v0 * ( cos(theta) * sin(-t * theta) ) - sin(-t * theta) * v1 ) / sin(theta)
  // which needs just two fsca evaluations: one at theta and one at -t * theta.
  // In the _Complex variant the sine is read from the real part and the cosine from the imaginary part.
  // (Using MATH_fsca_Int() with integer 'theta' and 't' would be faster still.)
 #if __GNUC__ <= GNUC_FSCA_ERROR_VERSION
  RETURN_FSCA_STRUCT at_theta = MATH_fsca_Float(theta);
  float sin_theta = at_theta.sine;
  float cos_theta = at_theta.cosine;
  RETURN_FSCA_STRUCT at_neg_t_theta = MATH_fsca_Float(-t * theta);
  float sin_neg_t_theta = at_neg_t_theta.sine;
  float cos_neg_t_theta = at_neg_t_theta.cosine;
 #else
  _Complex float at_theta = MATH_fsca_Float(theta);
  float sin_theta = __real__ at_theta;
  float cos_theta = __imag__ at_theta;
  _Complex float at_neg_t_theta = MATH_fsca_Float(-t * theta);
  float sin_neg_t_theta = __real__ at_neg_t_theta;
  float cos_neg_t_theta = __imag__ at_neg_t_theta;
 #endif
  float numerator = a * cos_theta * sin_neg_t_theta - sin_neg_t_theta * b;
  return a * cos_neg_t_theta + MATH_Fast_Divide(numerator, sin_theta);
 }
 //------------------------------------------------------------------------------
 // Fast Sinc (unnormalized, sin(x)/x version)
 //------------------------------------------------------------------------------
 //
 // Just pass in MATH_pi * x for normalized versions :)
 //
 // Sinc function (fsca units)
 static inline __attribute__((always_inline)) float MATH_Fast_Sincf(float x)
 {
  // sin(x)/x with the x == 0 case answered directly as 1
  float result = 1.0f;
  if(x != 0.0f)
  {
 #if __GNUC__ <= GNUC_FSCA_ERROR_VERSION
    RETURN_FSCA_STRUCT sc = MATH_fsca_Float(x);
    float sin_x = sc.sine;
 #else
    _Complex float sc = MATH_fsca_Float(x);
    float sin_x = __real__ sc;
 #endif
    result = MATH_Fast_Divide(sin_x, x);
  }
  return result;
 }
 // Sinc function (degrees)
 static inline __attribute__((always_inline)) float MATH_Fast_Sincf_Deg(float x)
 {
  // Same as the fsca-unit version, but the sine comes from MATH_fsca_Float_Deg
  float result = 1.0f;
  if(x != 0.0f)
  {
 #if __GNUC__ <= GNUC_FSCA_ERROR_VERSION
    RETURN_FSCA_STRUCT sc = MATH_fsca_Float_Deg(x);
    float sin_x = sc.sine;
 #else
    _Complex float sc = MATH_fsca_Float_Deg(x);
    float sin_x = __real__ sc;
 #endif
    result = MATH_Fast_Divide(sin_x, x);
  }
  return result;
 }
 // Sinc function (rads)
 static inline __attribute__((always_inline)) float MATH_Fast_Sincf_Rad(float x)
 {
  // Same as the fsca-unit version, but the sine comes from MATH_fsca_Float_Rad
  float result = 1.0f;
  if(x != 0.0f)
  {
 #if __GNUC__ <= GNUC_FSCA_ERROR_VERSION
    RETURN_FSCA_STRUCT sc = MATH_fsca_Float_Rad(x);
    float sin_x = sc.sine;
 #else
    _Complex float sc = MATH_fsca_Float_Rad(x);
    float sin_x = __real__ sc;
 #endif
    result = MATH_Fast_Divide(sin_x, x);
  }
  return result;
 }
 //==============================================================================
 // Miscellaneous Snippets
 //==============================================================================
--- a/third_party/gldc/src/state.c
+++ b/third_party/gldc/src/state.c
@ -1,8 +1,14 @@
 #include <stdbool.h>
 #include <string.h>
 #include <stdio.h>
 #include <kos.h>
 #include <dc/pvr.h>
 #include "gldc.h"
 #define MIN(a,b) (((a)<(b))?(a):(b))
 #define MAX(a,b) (((a)>(b))?(a):(b))
 #define CLAMP( X, _MIN, _MAX )  ( (X)<(_MIN) ? (_MIN) : ((X)>(_MAX) ? (_MAX) : (X)) )
 GLboolean STATE_DIRTY = GL_TRUE;
 GLboolean DEPTH_TEST_ENABLED = GL_FALSE;
@ -21,37 +27,67 @@ GLboolean BLEND_ENABLED = GL_FALSE;
 GLboolean TEXTURES_ENABLED = GL_FALSE;
 GLboolean AUTOSORT_ENABLED = GL_FALSE;
 PolyList OP_LIST;
 PolyList PT_LIST;
 PolyList TR_LIST;
 Viewport VIEWPORT;
 /* Scissor rectangle most recently requested via glScissor (reset to the full video mode in glKosInit). */
 static struct {
    int x;
    int y;
    int width;
    int height;
    /* Cleared by glScissor when the rectangle changes; set to true at the end of _glApplyScissor. */
    GLboolean applied;
-} scissor_rect = {0, 0, 640, 480, false};
+} scissor_rect;
-void _glInitContext() {
+void glKosInit() {
    /* Start with a scissor rectangle that covers the whole current video mode. */
    scissor_rect.x = 0;
    scissor_rect.y = 0;
    scissor_rect.width  = vid_mode->width;
    scissor_rect.height = vid_mode->height;
    _glInitTextures();
    /* Tag each poly list with the PVR list type it corresponds to. */
    OP_LIST.list_type = PVR_LIST_OP_POLY;
    PT_LIST.list_type = PVR_LIST_PT_POLY;
    TR_LIST.list_type = PVR_LIST_TR_POLY;
    /* Presumably pre-allocates storage for each list's vector up front - confirm against aligned_vector_reserve. */
    aligned_vector_reserve(&OP_LIST.vector, 1024 * 3);
    aligned_vector_reserve(&PT_LIST.vector,  512 * 3);
    aligned_vector_reserve(&TR_LIST.vector, 1024 * 3);
 }
 void glKosSwapBuffers() {
    _glApplyScissor(true);
    pvr_scene_begin();   
        if (OP_LIST.vector.size > 2) {
            pvr_list_begin(PVR_LIST_OP_POLY);
            SceneListSubmit((Vertex*)OP_LIST.vector.data, OP_LIST.vector.size);
            pvr_list_finish();
        }
        if (PT_LIST.vector.size > 2) {
            pvr_list_begin(PVR_LIST_PT_POLY);
            SceneListSubmit((Vertex*)PT_LIST.vector.data, PT_LIST.vector.size);
            pvr_list_finish();
        }
        if (TR_LIST.vector.size > 2) {
            pvr_list_begin(PVR_LIST_TR_POLY);
            SceneListSubmit((Vertex*)TR_LIST.vector.data, TR_LIST.vector.size);
            pvr_list_finish();
        }
    pvr_scene_finish();
    OP_LIST.vector.size = 0;
    PT_LIST.vector.size = 0;
    TR_LIST.vector.size = 0;
 }
 /* Stores a new scissor rectangle and re-applies it (non-forced).
  * Returns immediately when the requested rectangle equals the stored one. */
 void glScissor(int x, int y, int width, int height) {
    int changed = scissor_rect.x      != x     ||
                  scissor_rect.y      != y     ||
                  scissor_rect.width  != width ||
                  scissor_rect.height != height;

    if (!changed) return;

    scissor_rect.x       = x;
    scissor_rect.y       = y;
    scissor_rect.width   = width;
    scissor_rect.height  = height;
    scissor_rect.applied = false;

    STATE_DIRTY = GL_TRUE; // FIXME: do we need this?
    _glApplyScissor(false);
 }
@ -87,27 +123,27 @@ void _glApplyScissor(int force) {
    PVRTileClipCommand c;
-    int miny, maxx, maxy;
+    int sx, sy, ex, ey;
    int scissor_width  = MAX(MIN(scissor_rect.width,  vid_mode->width),  0);
    int scissor_height = MAX(MIN(scissor_rect.height, vid_mode->height), 0);
    /* force the origin to the lower left-hand corner of the screen */
-    miny = (vid_mode->height - scissor_height) - scissor_rect.y;
+	sx = scissor_rect.x;
-    maxx = (scissor_width + scissor_rect.x);
+    sy = (vid_mode->height - scissor_height) - scissor_rect.y;
-    maxy = (scissor_height + miny);
+    ex = sx + scissor_width;
    ey = sy + scissor_height;
    /* load command structure while mapping screen coords to TA tiles */
    c.flags = PVR_CMD_USERCLIP;
    c.d1 = c.d2 = c.d3 = 0;
-    uint16_t vw = vid_mode->width >> 5;
+    uint16_t vw = vid_mode->width  >> 5;
    uint16_t vh = vid_mode->height >> 5;
-    c.sx = CLAMP(scissor_rect.x >> 5, 0, vw);
+    c.sx = CLAMP(sx >> 5, 0, vw);
-    c.sy = CLAMP(miny >> 5, 0, vh);
+    c.sy = CLAMP(sy >> 5, 0, vh);
-    c.ex = CLAMP((maxx >> 5) - 1, 0, vw);
+    c.ex = CLAMP((ex >> 5) - 1, 0, vw);
-    c.ey = CLAMP((maxy >> 5) - 1, 0, vh);
+    c.ey = CLAMP((ey >> 5) - 1, 0, vh);
    aligned_vector_push_back(&OP_LIST.vector, &c, 1);
    aligned_vector_push_back(&PT_LIST.vector, &c, 1);
@ -116,30 +152,16 @@ void _glApplyScissor(int force) {
    scissor_rect.applied = true;
 }
 Viewport VIEWPORT;
-/* Set the GL viewport */
+void apply_poly_header(pvr_poly_hdr_t* dst, int list_type) {
-void glViewport(int x, int y, int width, int height) {
+    TextureObject* tx1 = TEXTURE_ACTIVE;
    VIEWPORT.hwidth  = width  *  0.5f;
    VIEWPORT.hheight = height * -0.5f;
    VIEWPORT.x_plus_hwidth  = x + width  * 0.5f;
    VIEWPORT.y_plus_hheight = y + height * 0.5f;
 }
 void apply_poly_header(pvr_poly_hdr_t* dst, PolyList* activePolyList) {
    const TextureObject *tx1 = TEXTURE_ACTIVE;
    uint32_t txr_base;
    TRACE();
    int list_type = activePolyList->list_type;
    int gen_color_clamp = PVR_CLRCLAMP_DISABLE;
    int gen_culling = CULLING_ENABLED    ? PVR_CULLING_CW : PVR_CULLING_SMALL;
    int depth_comp  = DEPTH_TEST_ENABLED ? PVR_DEPTHCMP_GEQUAL : PVR_DEPTHCMP_ALWAYS;
    int depth_write = DEPTH_MASK_ENABLED ? PVR_DEPTHWRITE_ENABLE : PVR_DEPTHWRITE_DISABLE;
    int gen_shading   = SHADE_MODEL;
    int gen_clip_mode = SCISSOR_TEST_ENABLED ? PVR_USERCLIP_INSIDE : PVR_USERCLIP_DISABLE;
    int gen_fog_type  = FOG_ENABLED          ? PVR_FOG_TABLE : PVR_FOG_DISABLE;
@ -179,7 +201,7 @@ void apply_poly_header(pvr_poly_hdr_t* dst, PolyList* activePolyList) {
    /* Or in the list type, shading type, color and UV formats */
    dst->cmd |= (list_type             << PVR_TA_CMD_TYPE_SHIFT)     & PVR_TA_CMD_TYPE_MASK;
    dst->cmd |= (PVR_CLRFMT_ARGBPACKED << PVR_TA_CMD_CLRFMT_SHIFT)   & PVR_TA_CMD_CLRFMT_MASK;
-    dst->cmd |= (gen_shading           << PVR_TA_CMD_SHADE_SHIFT)    & PVR_TA_CMD_SHADE_MASK;
+    dst->cmd |= (SHADE_MODEL           << PVR_TA_CMD_SHADE_SHIFT)    & PVR_TA_CMD_SHADE_MASK;
    dst->cmd |= (PVR_UVFMT_32BIT       << PVR_TA_CMD_UVFMT_SHIFT)    & PVR_TA_CMD_UVFMT_MASK;
    dst->cmd |= (gen_clip_mode         << PVR_TA_CMD_USERCLIP_SHIFT) & PVR_TA_CMD_USERCLIP_MASK;
--- a/third_party/gldc/src/texture.c
+++ b/third_party/gldc/src/texture.c
@ -2,6 +2,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <kos.h>
 #include <dc/pvr.h>
 #include "gldc.h"
 #include "yalloc/yalloc.h"