diff --git a/src/AudioBackend.c b/src/AudioBackend.c
index 71bb1ba62..1f45a13c0 100644
--- a/src/AudioBackend.c
+++ b/src/AudioBackend.c
@@ -1435,7 +1435,7 @@ cc_result Audio_AllocChunks(cc_uint32 size, struct AudioChunk* chunks, int numCh
 	void* dst = memalign(32, size * numChunks);
 	if (!dst) return ERR_OUT_OF_MEMORY;
 	totalSize += size * numChunks;
-	Platform_Log3("ALLOC: %i X %i (%i)", &size, &numChunks, &totalSize);
+	//Platform_Log3("ALLOC: %i X %i (%i)", &size, &numChunks, &totalSize);
 
 	for (int i = 0; i < numChunks; i++) 
 	{
diff --git a/src/Graphics_Dreamcast.c b/src/Graphics_Dreamcast.c
index 1085bc4c5..58d4d7190 100644
--- a/src/Graphics_Dreamcast.c
+++ b/src/Graphics_Dreamcast.c
@@ -18,7 +18,7 @@ static cc_bool renderingDisabled;
 /*########################################################################################################################*
 *---------------------------------------------------------General---------------------------------------------------------*
 *#########################################################################################################################*/
-static int InitPowerVR(void) {
+static void InitPowerVR(void) {
 	cc_bool autosort = false; // Turn off auto sorting to match traditional GPU behaviour
 	cc_bool fsaa     = false;
 	AUTOSORT_ENABLED = autosort;
@@ -507,7 +507,7 @@ cc_bool Gfx_WarnIfNecessary(void) {
 /*########################################################################################################################*
 *----------------------------------------------------------Drawing--------------------------------------------------------*
 *#########################################################################################################################*/
-extern void apply_poly_header(pvr_poly_hdr_t* header, PolyList* activePolyList);
+extern void apply_poly_header(pvr_poly_hdr_t* header, int list_type);
 
 extern Vertex* DrawColouredQuads(const void* src, Vertex* dst, int numQuads);
 extern Vertex* DrawTexturedQuads(const void* src, Vertex* dst, int numQuads);
@@ -522,7 +522,7 @@ void DrawQuads(int count, void* src) {
 	Vertex* beg = aligned_vector_reserve(&output->vector, vec->size + (header_required) + count);
 
 	if (header_required) {
-		apply_poly_header((pvr_poly_hdr_t*)beg, output);
+		apply_poly_header((pvr_poly_hdr_t*)beg, output->list_type);
 		STATE_DIRTY = GL_FALSE;
 		beg++; 
 		vec->size += 1;
@@ -632,9 +632,13 @@ void Gfx_SetViewport(int x, int y, int w, int h) {
 	}
 	STATE_DIRTY = GL_TRUE;
 	
-	glViewport(x, y, w, h);
 	glScissor (x, y, w, h);
 
+	VIEWPORT.hwidth  = w *  0.5f;
+	VIEWPORT.hheight = h * -0.5f;
+	VIEWPORT.x_plus_hwidth  = x + w * 0.5f;
+	VIEWPORT.y_plus_hheight = y + h * 0.5f;
+
 	VP_COL_HWIDTH  = VP_TEX_HWIDTH  = w *  0.5f;
 	VP_COL_HHEIGHT = VP_TEX_HHEIGHT = h * -0.5f;
 
diff --git a/src/Platform_PSP.c b/src/Platform_PSP.c
index 605113f38..3313953ab 100644
--- a/src/Platform_PSP.c
+++ b/src/Platform_PSP.c
@@ -327,7 +327,7 @@ cc_result Socket_Create(cc_socket* s, cc_sockaddr* addr, cc_bool nonblocking) {
 cc_result Socket_Connect(cc_socket s, cc_sockaddr* addr) {
 	struct sockaddr* raw = (struct sockaddr*)addr->data;
 	
-	int res = sceNetInetConnect(*s, raw, addr->size);
+	int res = sceNetInetConnect(s, raw, addr->size);
 	return res < 0 ? sceNetInetGetErrno() : 0;
 }
 
diff --git a/third_party/gldc/src/aligned_vector.h b/third_party/gldc/src/aligned_vector.h
index 30dd4ae7e..9ed0f6bf8 100644
--- a/third_party/gldc/src/aligned_vector.h
+++ b/third_party/gldc/src/aligned_vector.h
@@ -112,14 +112,3 @@ AV_FORCE_INLINE void* aligned_vector_extend(AlignedVector* vector, const uint32_
     return ret;
 }
 
-AV_FORCE_INLINE void aligned_vector_clear(AlignedVector* vector){
-    vector->size = 0;
-}
-
-AV_FORCE_INLINE void aligned_vector_init(AlignedVector* vector) {
-    /* Now initialize the header*/
-    vector->size = 0;
-    vector->capacity = 0;
-    vector->data = NULL;
-}
-
diff --git a/third_party/gldc/src/flush.c b/third_party/gldc/src/flush.c
deleted file mode 100644
index 5c274136c..000000000
--- a/third_party/gldc/src/flush.c
+++ /dev/null
@@ -1,52 +0,0 @@
-#include <stdbool.h>
-#include "gldc.h"
-
-PolyList OP_LIST;
-PolyList PT_LIST;
-PolyList TR_LIST;
-
-void glKosInit() {
-    _glInitContext();
-    _glInitTextures();
-
-    OP_LIST.list_type = PVR_LIST_OP_POLY;
-    PT_LIST.list_type = PVR_LIST_PT_POLY;
-    TR_LIST.list_type = PVR_LIST_TR_POLY;
-
-    aligned_vector_init(&OP_LIST.vector);
-    aligned_vector_init(&PT_LIST.vector);
-    aligned_vector_init(&TR_LIST.vector);
-
-    aligned_vector_reserve(&OP_LIST.vector, 1024 * 3);
-    aligned_vector_reserve(&PT_LIST.vector,  512 * 3);
-    aligned_vector_reserve(&TR_LIST.vector, 1024 * 3);
-}
-
-
-void glKosSwapBuffers() {
-    _glApplyScissor(true);
-
-    pvr_scene_begin();   
-        if(OP_LIST.vector.size > 2) {
-            pvr_list_begin(PVR_LIST_OP_POLY);
-            SceneListSubmit((Vertex*)OP_LIST.vector.data, OP_LIST.vector.size);
-            pvr_list_finish();
-        }
-
-        if(PT_LIST.vector.size > 2) {
-            pvr_list_begin(PVR_LIST_PT_POLY);
-            SceneListSubmit((Vertex*)PT_LIST.vector.data, PT_LIST.vector.size);
-            pvr_list_finish();
-        }
-
-        if(TR_LIST.vector.size > 2) {
-            pvr_list_begin(PVR_LIST_TR_POLY);
-            SceneListSubmit((Vertex*)TR_LIST.vector.data, TR_LIST.vector.size);
-            pvr_list_finish();
-        }
-    pvr_scene_finish();
-    
-    OP_LIST.vector.size = 0;
-    PT_LIST.vector.size = 0;
-    TR_LIST.vector.size = 0;
-}
diff --git a/third_party/gldc/src/gldc.h b/third_party/gldc/src/gldc.h
index 1c9846a6e..2e2894573 100644
--- a/third_party/gldc/src/gldc.h
+++ b/third_party/gldc/src/gldc.h
@@ -2,15 +2,10 @@
 #define PRIVATE_H
 
 #include <stdint.h>
-#include <stdio.h>
-#include <kos.h>
-#include <dc/pvr.h>
 #include "aligned_vector.h"
 
 #define MAX_TEXTURE_COUNT 768
 
-
-#define GL_SCISSOR_TEST     0x0008
 #define GL_NEAREST          0x2600
 #define GL_LINEAR           0x2601
 #define GL_OUT_OF_MEMORY    0x0505
@@ -33,7 +28,6 @@ void   gldcBindTexture(GLuint texture);
 int  gldcAllocTexture(int w, int h, int format);
 void gldcGetTexture(void** data, int* width, int* height);
 
-void glViewport(int x, int y, int width, int height);
 void glScissor( int x, int y, int width, int height);
 
 void glKosInit();
@@ -106,8 +100,6 @@ typedef struct {
 } __attribute__((aligned(32))) TextureObject;
 
 
-void _glInitContext();
-void _glInitSubmissionTarget();
 void _glInitTextures();
 
 extern TextureObject* TEXTURE_ACTIVE;
@@ -152,10 +144,6 @@ void _glApplyScissor(int force);
 
 extern GLboolean STATE_DIRTY;
 
-#define MIN(a,b) (((a)<(b))?(a):(b))
-#define MAX(a,b) (((a)>(b))?(a):(b))
-#define CLAMP( X, _MIN, _MAX )  ( (X)<(_MIN) ? (_MIN) : ((X)>(_MAX) ? (_MAX) : (X)) )
-
 void SceneListSubmit(Vertex* v2, int n);
 
 static inline int DimensionFlag(int w) {
diff --git a/third_party/gldc/src/sh4_math.h b/third_party/gldc/src/sh4_math.h
index c12c30d99..34359f6d2 100644
--- a/third_party/gldc/src/sh4_math.h
+++ b/third_party/gldc/src/sh4_math.h
@@ -133,9 +133,6 @@ static const ALL_FLOATS_STRUCT MATH_identity_matrix = {1.0f, 0.0f, 0.0f, 0.0f, 0
   // a*b+c
   float MATH_fmac(float a, float b, float c)
 
-  // a*b-c
-  float MATH_fmac_Dec(float a, float b, float c)
-
   // fminf() - return the min of two floats
   // This doesn't check for NaN
   float MATH_Fast_Fminf(float a, float b)
@@ -205,19 +202,6 @@ static inline __attribute__((always_inline)) float MATH_fmac(float a, float b, f
   return c;
 }
 
-// a*b-c
-static inline __attribute__((always_inline)) float MATH_fmac_Dec(float a, float b, float c)
-{
-  asm volatile ("fneg %[floatc]\n\t"
-    "fmac fr0, %[floatb], %[floatc]\n"
-    : [floatc] "+&f" (c) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
-    : "w" (a), [floatb] "f" (b) // inputs
-    : // no clobbers
-  );
-
-  return c;
-}
-
 // Fast fminf() - return the min of two floats
 // This doesn't check for NaN
 static inline __attribute__((always_inline)) float MATH_Fast_Fminf(float a, float b)
@@ -833,18 +817,6 @@ static inline __attribute__((always_inline)) _Complex float MATH_fsca_Float_Rad(
   // Sum of Squares (w^2 + x^2 + y^2 + z^2)
   float MATH_Sum_of_Squares(float w, float x, float y, float z)
 
-  // Cross product with bonus multiply (vec X vec = orthogonal vec, with an extra a*b=c)
-  RETURN_VECTOR_STRUCT MATH_Cross_Product_with_Mult(float x1, float x2, float x3, float y1, float y2, float y3, float a, float b)
-
-  // Cross product (vec X vec = orthogonal vec)
-  RETURN_VECTOR_STRUCT MATH_Cross_Product(float x1, float x2, float x3, float y1, float y2, float y3)
-
-  // Outer product (vec (X) vec = 4x4 matrix)
-  void MATH_Outer_Product(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4)
-
-  // Matrix transform (4x4 matrix * 4x1 vec = 4x1 vec)
-  RETURN_VECTOR_STRUCT MATH_Matrix_Transform(float x1, float x2, float x3, float x4)
-
   // 4x4 Matrix transpose (XMTRX^T)
   void MATH_Matrix_Transpose(void)
 
@@ -953,434 +925,6 @@ static inline __attribute__((always_inline)) float MATH_Sum_of_Squares(float w,
   return __z;
 }
 
-// Cross product: vec X vec = orthogonal vec
-//   _    _       _    _       _    _
-//  |  x1  |     |  y1  |     |  z1  |
-//  |  x2  |  X  |  y2  |  =  |  z2  |
-//  |_ x3 _|     |_ y3 _|     |_ z3 _|
-//
-// With bonus multiply:
-//
-//      a     *     b      =      c
-//
-// IMPORTANT USAGE INFORMATION (cross product):
-//
-// Return vector struct maps as below to the above diagram:
-//
-//  typedef struct {
-//   float z1;
-//   float z2;
-//   float z3;
-//   float z4; // c is stored in z4, and c = a*b if using 'with mult' version (else c = 0)
-// } RETURN_VECTOR_STRUCT;
-//
-//  For people familiar with the unit vector notation, z1 == 'i', z2 == 'j',
-//  and z3 == 'k'.
-//
-// The cross product matrix will also be stored in XMTRX after this, so calling
-// MATH_Matrix_Transform() on a vector after using this function will do a cross
-// product with the same x1-x3 values and a multiply with the same 'a' value
-// as used in this function. In this a situation, 'a' will be multiplied with
-// the x4 parameter of MATH_Matrix_Transform(). a = 0 if not using the 'with mult'
-// version of the cross product function.
-//
-// For reference, XMTRX will look like this:
-//
-//  [  0 -x3 x2 0 ]
-//  [  x3 0 -x1 0 ]
-//  [ -x2 x1 0  0 ]
-//  [  0  0  0  a ] (<-- a = 0 if not using 'with mult')
-//
-// Similarly to how the sine and cosine functions use fsca and return 2 floats,
-// the cross product functions actually return 4 floats. The first 3 are the
-// cross product output, and the 4th is a*b. The SH4 only multiplies 4x4
-// matrices with 4x1 vectors, which is why the output is like that--but it means
-// we also get a bonus float multiplication while we do our cross product!
-//
-
-// Please do not call this function directly (notice the weird syntax); call
-// MATH_Cross_Product() or MATH_Cross_Product_with_Mult() instead.
-static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT xMATH_do_Cross_Product_with_Mult(float x3, float a, float y3, float b, float x2, float x1, float y1, float y2)
-{
-  // FR4-FR11 are the regs that are passed in, in that order.
-  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.
-
-  // Temporary variables are necessary per GCC to avoid clobbering:
-  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
-
-  float tx1 = x1;
-  float tx2 = x2;
-  float tx3 = x3;
-  float ta = a;
-
-  float ty1 = y1;
-  float ty2 = y2;
-  float ty3 = y3;
-  float tb = b;
-
-  register float __x1 __asm__("fr9") = tx1; // need to negate (need to move to fr6, then negate fr9)
-  register float __x2 __asm__("fr8") = tx2; // in place for matrix (need to move to fr2 then negate fr2)
-  register float __x3 __asm__("fr4") = tx3; // need to negate (move to fr1 first, then negate fr4)
-  register float __a __asm__("fr5") = ta;
-
-  register float __y1 __asm__("fr10") = ty1;
-  register float __y2 __asm__("fr11") = ty2;
-  register float __y3 __asm__("fr6") = ty3;
-  register float __b __asm__("fr7") = tb;
-
-  register float __z1 __asm__("fr0") = 0.0f; // z1
-  register float __z2 __asm__("fr1") = 0.0f; // z2 (not moving x3 here yet since a double 0 is needed)
-  register float __z3 __asm__("fr2") = tx2; // z3 (this handles putting x2 in fr2)
-  register float __c __asm__("fr3") = 0.0f; // c
-
-  // This actually does a matrix transform to do the cross product.
-  // It's this:
-  //                   _    _       _            _
-  //  [  0 -x3 x2 0 ] |  y1  |     | -x3y2 + x2y3 |
-  //  [  x3 0 -x1 0 ] |  y2  |  =  |  x3y1 - x1y3 |
-  //  [ -x2 x1 0  0 ] |  y3  |     | -x2y1 + x1y2 |
-  //  [  0  0  0  a ] |_ b  _|     |_      c     _|
-  //
-
-  asm volatile (
-    // set up back bank's FV0
-    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)
-
-    // Save FR12-FR15, which are supposed to be preserved across functions calls.
-    // This stops them from getting clobbered and saves 4 stack pushes (memory accesses).
-    "fmov DR12, XD12\n\t"
-    "fmov DR14, XD14\n\t"
-
-    "fmov DR10, XD0\n\t" // fmov 'y1' and 'y2' from FR10, FR11 into position (XF0, XF1)
-    "fmov DR6, XD2\n\t" // fmov 'y3' and 'b' from FR6, FR7 into position (XF2, XF3)
-
-    // pair move zeros for some speed in setting up front bank for matrix
-    "fmov DR0, DR10\n\t" // clear FR10, FR11
-    "fmov DR0, DR12\n\t" // clear FR12, FR13
-    "fschg\n\t" // switch back to single moves
-    // prepare front bank for XMTRX
-    "fmov FR5, FR15\n\t" // fmov 'a' into position
-    "fmov FR0, FR14\n\t" // clear out FR14
-    "fmov FR0, FR7\n\t" // clear out FR7
-    "fmov FR0, FR5\n\t" // clear out FR5
-
-    "fneg FR2\n\t" // set up 'x2'
-    "fmov FR9, FR6\n\t" // set up 'x1'
-    "fneg FR9\n\t"
-    "fmov FR4, FR1\n\t" // set up 'x3'
-    "fneg FR4\n\t"
-    // flip banks and matrix multiply
-    "frchg\n\t"
-    "ftrv XMTRX, FV0\n"
-  : "+&w" (__z1), "+&f" (__z2), "+&f" (__z3), "+&f" (__c) // output (using FV0)
-  : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__y1), "f" (__y2), "f" (__y3), "f" (__a), "f" (__b) // inputs
-  : // clobbers (all of the float regs get clobbered, except for FR12-FR15 which were specially preserved)
-  );
-
-  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __c};
-  return output;
-}
-
-// Please do not call this function directly (notice the weird syntax); call
-// MATH_Cross_Product() or MATH_Cross_Product_with_Mult() instead.
-static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT xMATH_do_Cross_Product(float x3, float zero, float x1, float y3, float x2, float x1_2, float y1, float y2)
-{
-  // FR4-FR11 are the regs that are passed in, in that order.
-  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.
-
-  // Temporary variables are necessary per GCC to avoid clobbering:
-  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
-
-  float tx1 = x1;
-  float tx2 = x2;
-  float tx3 = x3;
-  float tx1_2 = x1_2;
-
-  float ty1 = y1;
-  float ty2 = y2;
-  float ty3 = y3;
-  float tzero = zero;
-
-  register float __x1 __asm__("fr6") = tx1; // in place
-  register float __x2 __asm__("fr8") = tx2; // in place (fmov to fr2, negate fr2)
-  register float __x3 __asm__("fr4") = tx3; // need to negate (fmov to fr1, negate fr4)
-
-  register float __zero __asm__("fr5") = tzero; // in place
-  register float __x1_2 __asm__("fr9") = tx1_2; // need to negate
-
-  register float __y1 __asm__("fr10") = ty1;
-  register float __y2 __asm__("fr11") = ty2;
-  // no __y3 needed in this function
-
-  register float __z1 __asm__("fr0") = tzero; // z1
-  register float __z2 __asm__("fr1") = tzero; // z2
-  register float __z3 __asm__("fr2") = ty3; // z3
-  register float __c __asm__("fr3") = tzero; // c
-
-  // This actually does a matrix transform to do the cross product.
-  // It's this:
-  //                   _    _       _            _
-  //  [  0 -x3 x2 0 ] |  y1  |     | -x3y2 + x2y3 |
-  //  [  x3 0 -x1 0 ] |  y2  |  =  |  x3y1 - x1y3 |
-  //  [ -x2 x1 0  0 ] |  y3  |     | -x2y1 + x1y2 |
-  //  [  0  0  0  0 ] |_ 0  _|     |_      0     _|
-  //
-
-  asm volatile (
-    // zero out FR7. For some reason, if this is done in C after __z3 is set:
-    // register float __y3 __asm__("fr7") = tzero;
-    // then GCC will emit a spurious stack push (pushing FR12). So just zero it here.
-    "fmov FR5, FR7\n\t"
-    // set up back bank's FV0
-    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)
-
-    // Save FR12-FR15, which are supposed to be preserved across functions calls.
-    // This stops them from getting clobbered and saves 4 stack pushes (memory accesses).
-    "fmov DR12, XD12\n\t"
-    "fmov DR14, XD14\n\t"
-
-    "fmov DR10, XD0\n\t" // fmov 'y1' and 'y2' from FR10, FR11 into position (XF0, XF1)
-    "fmov DR2, XD2\n\t" // fmov 'y3' and '0' from FR2, FR3 into position (XF2, XF3)
-
-    // pair move zeros for some speed in setting up front bank for matrix
-    "fmov DR0, DR10\n\t" // clear FR10, FR11
-    "fmov DR0, DR12\n\t" // clear FR12, FR13
-    "fmov DR0, DR14\n\t" // clear FR14, FR15
-    "fschg\n\t" // switch back to single moves
-    // prepare front bank for XMTRX
-    "fneg FR9\n\t" // set up 'x1'
-    "fmov FR8, FR2\n\t" // set up 'x2'
-    "fneg FR2\n\t"
-    "fmov FR4, FR1\n\t" // set up 'x3'
-    "fneg FR4\n\t"
-    // flip banks and matrix multiply
-    "frchg\n\t"
-    "ftrv XMTRX, FV0\n"
-  : "+&w" (__z1), "+&f" (__z2), "+&f" (__z3), "+&f" (__c) // output (using FV0)
-  : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__y1), "f" (__y2), "f" (__zero), "f" (__x1_2) // inputs
-  : "fr7" // clobbers (all of the float regs get clobbered, except for FR12-FR15 which were specially preserved)
-  );
-
-  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __c};
-  return output;
-}
-
-//------------------------------------------------------------------------------
-// Functions that wrap the xMATH_do_Cross_Product[_with_Mult]() functions to make
-// it easier to organize parameters
-//------------------------------------------------------------------------------
-
-// Cross product with a bonus float multiply (c = a * b)
-static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Cross_Product_with_Mult(float x1, float x2, float x3, float y1, float y2, float y3, float a, float b)
-{
-  return xMATH_do_Cross_Product_with_Mult(x3, a, y3, b, x2, x1, y1, y2);
-}
-
-// Plain cross product; does not use the bonus float multiply (c = 0 and a in the cross product matrix will be 0)
-// This is a tiny bit faster than 'with_mult' (about 2 cycles faster)
-static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Cross_Product(float x1, float x2, float x3, float y1, float y2, float y3)
-{
-  return xMATH_do_Cross_Product(x3, 0.0f, x1, y3, x2, x1, y1, y2);
-}
-
-// Outer product: vec (X) vec = matrix
-//   _    _
-//  |  x1  |
-//  |  x2  |  (X)  [ y1 y2 y3 y4 ] = 4x4 matrix
-//  |  x3  |
-//  |_ x4 _|
-//
-// This returns the floats in the back bank (XF0-15), which are inaccessible
-// outside of using frchg or paired-move fmov. It's meant to set up a matrix for
-// use with other matrix functions. GCC also does not touch the XFn bank.
-// This will also wipe out anything stored in the float registers, as it uses the
-// whole FPU register file (all 32 of the float registers).
-static inline __attribute__((always_inline)) void MATH_Outer_Product(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4)
-{
-  // FR4-FR11 are the regs that are passed in, in that order.
-  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.
-
-  // Temporary variables are necessary per GCC to avoid clobbering:
-  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
-
-  float tx1 = x1;
-  float tx2 = x2;
-  float tx3 = x3;
-  float tx4 = x4;
-
-  float ty1 = y1;
-  float ty2 = y2;
-  float ty3 = y3;
-  float ty4 = y4;
-
-  // vector FV4
-  register float __x1 __asm__("fr4") = tx1;
-  register float __x2 __asm__("fr5") = tx2;
-  register float __x3 __asm__("fr6") = tx3;
-  register float __x4 __asm__("fr7") = tx4;
-
-  // vector FV8
-  register float __y1 __asm__("fr8") = ty1;
-  register float __y2 __asm__("fr9") = ty2;
-  register float __y3 __asm__("fr10") = ty3; // in place already
-  register float __y4 __asm__("fr11") = ty4;
-
-  // This actually does a 4x4 matrix multiply to do the outer product.
-  // It's this:
-  //
-  //  [ x1 x1 x1 x1 ] [ y1 0 0 0 ]     [ x1y1 x1y2 x1y3 x1y4 ]
-  //  [ x2 x2 x2 x2 ] [ 0 y2 0 0 ]  =  [ x2y1 x2y2 x2y3 x2y4 ]
-  //  [ x3 x3 x3 x3 ] [ 0 0 y3 0 ]     [ x3y1 x3y2 x3y3 x3y4 ]
-  //  [ x4 x4 x4 x4 ] [ 0 0 0 y4 ]     [ x4y1 x4y2 x4y3 x4y4 ]
-  //
-
-  asm volatile (
-    // zero out unoccupied front floats to make a double 0 in DR2
-    "fldi0 FR2\n\t"
-    "fmov FR2, FR3\n\t"
-    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)
-    // fmov 'x1' and 'x2' from FR4, FR5 into position (XF0,4,8,12, XF1,5,9,13)
-    "fmov DR4, XD0\n\t"
-    "fmov DR4, XD4\n\t"
-    "fmov DR4, XD8\n\t"
-    "fmov DR4, XD12\n\t"
-    // fmov 'x3' and 'x4' from FR6, FR7 into position (XF2,6,10,14, XF3,7,11,15)
-    "fmov DR6, XD2\n\t"
-    "fmov DR6, XD6\n\t"
-    "fmov DR6, XD10\n\t"
-    "fmov DR6, XD14\n\t"
-    // set up front floats (y1-y4)
-    "fmov DR8, DR0\n\t"
-    "fmov DR8, DR4\n\t"
-    "fmov DR10, DR14\n\t"
-    // finish zeroing out front floats
-    "fmov DR2, DR6\n\t"
-    "fmov DR2, DR8\n\t"
-    "fmov DR2, DR12\n\t"
-    "fschg\n\t" // switch back to single-move mode
-    // zero out remaining values and matrix multiply 4x4
-    "fmov FR2, FR1\n\t"
-    "ftrv XMTRX, FV0\n\t"
-
-    "fmov FR6, FR4\n\t"
-    "ftrv XMTRX, FV4\n\t"
-
-    "fmov FR8, FR11\n\t"
-    "ftrv XMTRX, FV8\n\t"
-
-    "fmov FR12, FR14\n\t"
-    "ftrv XMTRX, FV12\n\t"
-    // Save output in XF regs
-    "frchg\n"
-  : // no outputs
-  : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__x4), "f" (__y1), "f" (__y2), "f" (__y3), "f" (__y4) // inputs
-  : "fr0", "fr1", "fr2", "fr3", "fr12", "fr13", "fr14", "fr15" // clobbers, can't avoid it
-  );
-  // GCC will restore FR12-FR15 from the stack after this, so we really can't keep the output in the front bank.
-}
-
-// Matrix transform: matrix * vector = vector
-//                   _    _       _    _
-//  [ ----------- ] |  x1  |     |  z1  |
-//  [ ---XMTRX--- ] |  x2  |  =  |  z2  |
-//  [ ----------- ] |  x3  |     |  z3  |
-//  [ ----------- ] |_ x4 _|     |_ z4 _|
-//
-// IMPORTANT USAGE INFORMATION (matrix transform):
-//
-// Return vector struct maps 1:1 to the above diagram:
-//
-//  typedef struct {
-//   float z1;
-//   float z2;
-//   float z3;
-//   float z4;
-// } RETURN_VECTOR_STRUCT;
-//
-// Similarly to how the sine and cosine functions use fsca and return 2 floats,
-// the matrix transform function actually returns 4 floats. The SH4 only multiplies
-// 4x4 matrices with 4x1 vectors, which is why the output is like that.
-//
-// Multiply a matrix stored in the back bank (XMTRX) with an input vector
-static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Matrix_Transform(float x1, float x2, float x3, float x4)
-{
-  // The floats comprising FV4 are the regs that are passed in.
-  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.
-
-  // Temporary variables are necessary per GCC to avoid clobbering:
-  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
-
-  float tx1 = x1;
-  float tx2 = x2;
-  float tx3 = x3;
-  float tx4 = x4;
-
-  // output vector FV0
-  register float __z1 __asm__("fr0") = tx1;
-  register float __z2 __asm__("fr1") = tx2;
-  register float __z3 __asm__("fr2") = tx3;
-  register float __z4 __asm__("fr3") = tx4;
-
-  asm volatile ("ftrv XMTRX, FV0\n\t"
-    // have to do this to obey SH4 calling convention--output returned in FV0
-    : "+w" (__z1), "+f" (__z2), "+f" (__z3), "+f" (__z4) // outputs, "+" means r/w
-    : // no inputs
-    : // no clobbers
-  );
-
-  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __z4};
-  return output;
-}
-
-// Matrix Transpose
-//
-// This does a matrix transpose on the matrix in XMTRX, which swaps rows with
-// columns as follows (math notation is [XMTRX]^T):
-//
-//  [ a b c d ] T   [ a e i m ]
-//  [ e f g h ]  =  [ b f j n ]
-//  [ i j k l ]     [ c g k o ]
-//  [ m n o p ]     [ d h l p ]
-//
-// PLEASE NOTE: It is faster to avoid the need for a transpose altogether by
-// structuring matrices and vectors accordingly.
-static inline __attribute__((always_inline)) void MATH_Matrix_Transpose(void)
-{
-  asm volatile (
-    "frchg\n\t" // fmov for singles only works on front bank
-    // FR0, FR5, FR10, and FR15 are already in place
-    // swap FR1 and FR4
-    "flds FR1, FPUL\n\t"
-    "fmov FR4, FR1\n\t"
-    "fsts FPUL, FR4\n\t"
-    // swap FR2 and FR8
-    "flds FR2, FPUL\n\t"
-    "fmov FR8, FR2\n\t"
-    "fsts FPUL, FR8\n\t"
-    // swap FR3 and FR12
-    "flds FR3, FPUL\n\t"
-    "fmov FR12, FR3\n\t"
-    "fsts FPUL, FR12\n\t"
-    // swap FR6 and FR9
-    "flds FR6, FPUL\n\t"
-    "fmov FR9, FR6\n\t"
-    "fsts FPUL, FR9\n\t"
-    // swap FR7 and FR13
-    "flds FR7, FPUL\n\t"
-    "fmov FR13, FR7\n\t"
-    "fsts FPUL, FR13\n\t"
-    // swap FR11 and FR14
-    "flds FR11, FPUL\n\t"
-    "fmov FR14, FR11\n\t"
-    "fsts FPUL, FR14\n\t"
-    // restore XMTRX to back bank
-    "frchg\n"
-    : // no outputs
-    : // no inputs
-    : "fpul" // clobbers
-  );
-}
-
 // Matrix product: matrix * matrix = matrix
 //
 // These use the whole dang floating point unit.
@@ -1582,73 +1126,14 @@ static inline __attribute__((always_inline)) ALL_FLOATS_STRUCT * MATH_Store_XMTR
 //
 /*
 
-  //------------------------------------------------------------------------------
-  // Commonly useful functions
-  //------------------------------------------------------------------------------
-
-  // Returns 1 if point 't' is inside triangle with vertices 'v0', 'v1', and 'v2', and 0 if not
-  int MATH_Is_Point_In_Triangle(float v0x, float v0y, float v1x, float v1y, float v2x, float v2y, float ptx, float pty)
-
   //------------------------------------------------------------------------------
   // Interpolation
   //------------------------------------------------------------------------------
 
   // Linear interpolation
   float MATH_Lerp(float a, float b, float t)
-
-  // Speherical interpolation ('theta' in fsca units)
-  float MATH_Slerp(float a, float b, float t, float theta)
-
-  //------------------------------------------------------------------------------
-  // Fast Sinc functions (unnormalized, sin(x)/x version)
-  //------------------------------------------------------------------------------
-  // Just pass in MATH_pi * x for normalized versions :)
-
-  // Sinc function (fsca units)
-  float MATH_Fast_Sincf(float x)
-
-  // Sinc function (degrees)
-  float MATH_Fast_Sincf_Deg(float x)
-
-  // Sinc function (rads)
-  float MATH_Fast_Sincf_Rad(float x)
-
 */
 
-//------------------------------------------------------------------------------
-// Commonly useful functions
-//------------------------------------------------------------------------------
-
-// Returns 1 if point 'pt' is inside triangle with vertices 'v0', 'v1', and 'v2', and 0 if not
-// Determines triangle center using barycentric coordinate transformation
-// Adapted from: https://stackoverflow.com/questions/2049582/how-to-determine-if-a-point-is-in-a-2d-triangle
-// Specifically the answer by user 'adreasdr' in addition to the comment by user 'urraka' on the answer from user 'Andreas Brinck'
-//
-// The notation here assumes v0x is the x-component of v0, v0y is the y-component of v0, etc.
-//
-static inline __attribute__((always_inline)) int MATH_Is_Point_In_Triangle(float v0x, float v0y, float v1x, float v1y, float v2x, float v2y, float ptx, float pty)
-{
-  float sdot = MATH_fipr(v0y, -v0x, v2y - v0y, v0x - v2x, v2x, v2y, ptx, pty);
-  float tdot = MATH_fipr(v0x, -v0y, v0y - v1y, v1x - v0x, v1y, v1x, ptx, pty);
-
-  float areadot = MATH_fipr(-v1y, v0y, v0x, v1x, v2x, -v1x + v2x, v1y - v2y, v2y);
-
-  // 'areadot' could be negative depending on the winding of the triangle
-  if(areadot < 0.0f)
-  {
-    sdot *= -1.0f;
-    tdot *= -1.0f;
-    areadot *= -1.0f;
-  }
-
-  if( (sdot > 0.0f) && (tdot > 0.0f) && (areadot > (sdot + tdot)) )
-  {
-    return 1;
-  }
-
-  return 0;
-}
-
 //------------------------------------------------------------------------------
 // Interpolation
 //------------------------------------------------------------------------------
@@ -1659,123 +1144,6 @@ static inline __attribute__((always_inline)) float MATH_Lerp(float a, float b, f
   return MATH_fmac(t, (b-a), a);
 }
 
-// Speherical interpolation ('theta' in fsca units)
-static inline __attribute__((always_inline)) float MATH_Slerp(float a, float b, float t, float theta)
-{
-  // a is an element of v0, b is an element of v1
-  // v = ( v0 * sin(theta - t * theta) + v1 * sin(t * theta) ) / sin(theta)
-  // by using sine/cosine identities and properties, this can be optimized to:
-  // v = v0 * cos(-t * theta) + ( v0 * ( cos(theta) * sin(-t * theta) ) - sin(-t * theta) * v1 ) / sin(theta)
-  // which only requires two calls to fsca.
-  // Specifically, sin(a + b) = sin(a)cos(b) + cos(a)sin(b) & sin(-a) = -sin(a)
-
-  // MATH_fsca_* functions return reverse-ordered complex numbers for speed reasons (i.e. normally sine is the imaginary part)
-  // This could be made even faster by using MATH_fsca_Int() with 'theta' and 't' as unsigned ints
-
-#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION
-
-  RETURN_FSCA_STRUCT sine_cosine = MATH_fsca_Float(theta);
-  float sine_value_theta = sine_cosine.sine;
-  float cosine_value_theta = sine_cosine.cosine;
-
-  RETURN_FSCA_STRUCT sine_cosine2 = MATH_fsca_Float(-t * theta);
-  float sine_value_minus_t_theta = sine_cosine2.sine;
-  float cosine_value_minus_t_theta = sine_cosine2.cosine;
-
-#else
-
-  _Complex float sine_cosine = MATH_fsca_Float(theta);
-  float sine_value_theta = __real__ sine_cosine;
-  float cosine_value_theta = __imag__ sine_cosine;
-
-  _Complex float sine_cosine2 = MATH_fsca_Float(-t * theta);
-  float sine_value_minus_t_theta = __real__ sine_cosine2;
-  float cosine_value_minus_t_theta = __imag__ sine_cosine2;
-
-#endif
-
-  float numer = a * cosine_value_theta * sine_value_minus_t_theta - sine_value_minus_t_theta * b;
-  float output_float = a * cosine_value_minus_t_theta + MATH_Fast_Divide(numer, sine_value_theta);
-
-  return output_float;
-}
-
-//------------------------------------------------------------------------------
-// Fast Sinc (unnormalized, sin(x)/x version)
-//------------------------------------------------------------------------------
-//
-// Just pass in MATH_pi * x for normalized versions :)
-//
-
-// Sinc function (fsca units)
-static inline __attribute__((always_inline)) float MATH_Fast_Sincf(float x)
-{
-  if(x == 0.0f)
-  {
-    return 1.0f;
-  }
-
-#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION
-
-  RETURN_FSCA_STRUCT sine_cosine = MATH_fsca_Float(x);
-  float sine_value = sine_cosine.sine;
-
-#else
-
-  _Complex float sine_cosine = MATH_fsca_Float(x);
-  float sine_value = __real__ sine_cosine;
-
-#endif
-
-  return MATH_Fast_Divide(sine_value, x);
-}
-
-// Sinc function (degrees)
-static inline __attribute__((always_inline)) float MATH_Fast_Sincf_Deg(float x)
-{
-  if(x == 0.0f)
-  {
-    return 1.0f;
-  }
-
-#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION
-
-  RETURN_FSCA_STRUCT sine_cosine = MATH_fsca_Float_Deg(x);
-  float sine_value = sine_cosine.sine;
-
-#else
-
-  _Complex float sine_cosine = MATH_fsca_Float_Deg(x);
-  float sine_value = __real__ sine_cosine;
-
-#endif
-
-  return MATH_Fast_Divide(sine_value, x);
-}
-
-// Sinc function (rads)
-static inline __attribute__((always_inline)) float MATH_Fast_Sincf_Rad(float x)
-{
-  if(x == 0.0f)
-  {
-    return 1.0f;
-  }
-
-#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION
-
-  RETURN_FSCA_STRUCT sine_cosine = MATH_fsca_Float_Rad(x);
-  float sine_value = sine_cosine.sine;
-
-#else
-
-  _Complex float sine_cosine = MATH_fsca_Float_Rad(x);
-  float sine_value = __real__ sine_cosine;
-
-#endif
-
-  return MATH_Fast_Divide(sine_value, x);
-}
-
 //==============================================================================
 // Miscellaneous Snippets
 //==============================================================================
diff --git a/third_party/gldc/src/state.c b/third_party/gldc/src/state.c
index 74e77248a..c9d6588a4 100644
--- a/third_party/gldc/src/state.c
+++ b/third_party/gldc/src/state.c
@@ -1,8 +1,14 @@
 #include <stdbool.h>
 #include <string.h>
 #include <stdio.h>
+#include <kos.h>
+#include <dc/pvr.h>
 #include "gldc.h"
 
+#define MIN(a,b) (((a)<(b))?(a):(b)) /* smaller of a and b; an argument may be evaluated twice. NOTE(review): no #ifndef guard - confirm the headers included above do not already define MIN/MAX */
+#define MAX(a,b) (((a)>(b))?(a):(b)) /* larger of a and b; an argument may be evaluated twice */
+#define CLAMP( X, _MIN, _MAX )  ( (X)<(_MIN) ? (_MIN) : ((X)>(_MAX) ? (_MAX) : (X)) ) /* X limited to the range [_MIN, _MAX]; X may be evaluated up to three times */
+
 GLboolean STATE_DIRTY = GL_TRUE;
 
 GLboolean DEPTH_TEST_ENABLED = GL_FALSE;
@@ -21,37 +27,67 @@ GLboolean BLEND_ENABLED = GL_FALSE;
 GLboolean TEXTURES_ENABLED = GL_FALSE;
 GLboolean AUTOSORT_ENABLED = GL_FALSE;
 
+PolyList OP_LIST; /* list_type is set to PVR_LIST_OP_POLY in glKosInit */
+PolyList PT_LIST; /* list_type is set to PVR_LIST_PT_POLY in glKosInit */
+PolyList TR_LIST; /* list_type is set to PVR_LIST_TR_POLY in glKosInit */
+Viewport VIEWPORT; /* global viewport state. NOTE(review): no writer is visible in this file - confirm where it is populated */
+
 static struct {
     int x;
     int y;
     int width;
     int height;
     GLboolean applied;
-} scissor_rect = {0, 0, 640, 480, false};
+} scissor_rect; /* no initializer: static storage starts zeroed, so width/height stay 0 until glKosInit copies them from vid_mode */
 
-void _glInitContext() {
-    scissor_rect.x = 0;
-    scissor_rect.y = 0;
+void glKosInit() { /* GL setup: sizes the scissor rect to the current video mode, initialises textures and prepares the three poly lists. scissor_rect.x/y are not assigned here; they rely on the static struct starting zeroed. */
     scissor_rect.width  = vid_mode->width;
     scissor_rect.height = vid_mode->height;
+    _glInitTextures(); /* texture setup; implemented outside the code visible here */
+
+    OP_LIST.list_type = PVR_LIST_OP_POLY; /* record the PVR list type on each poly list */
+    PT_LIST.list_type = PVR_LIST_PT_POLY;
+    TR_LIST.list_type = PVR_LIST_TR_POLY;
+
+    aligned_vector_reserve(&OP_LIST.vector, 1024 * 3); /* NOTE(review): presumably pre-reserves capacity counted in elements - confirm the unit against aligned_vector_reserve */
+    aligned_vector_reserve(&PT_LIST.vector,  512 * 3); /* PT list reserves half as much as the OP and TR lists */
+    aligned_vector_reserve(&TR_LIST.vector, 1024 * 3);
+}
+
+void glKosSwapBuffers() { /* Submits the queued OP, PT and TR lists to the PVR inside a single scene, then resets all three lists to empty. */
+    _glApplyScissor(true); /* true is passed as the 'force' argument */
+
+    pvr_scene_begin();   /* the list submissions below all happen before pvr_scene_finish() */
+        if (OP_LIST.vector.size > 2) { /* a list holding 2 or fewer entries is not submitted. NOTE(review): presumably such a list holds no drawable geometry - confirm */
+            pvr_list_begin(PVR_LIST_OP_POLY);
+            SceneListSubmit((Vertex*)OP_LIST.vector.data, OP_LIST.vector.size); /* hands over the raw vector storage together with its element count */
+            pvr_list_finish();
+        }
+
+        if (PT_LIST.vector.size > 2) { /* same size check as the OP list */
+            pvr_list_begin(PVR_LIST_PT_POLY);
+            SceneListSubmit((Vertex*)PT_LIST.vector.data, PT_LIST.vector.size);
+            pvr_list_finish();
+        }
+
+        if (TR_LIST.vector.size > 2) { /* same size check as the OP list */
+            pvr_list_begin(PVR_LIST_TR_POLY);
+            SceneListSubmit((Vertex*)TR_LIST.vector.data, TR_LIST.vector.size);
+            pvr_list_finish();
+        }
+    pvr_scene_finish();
+    /* Only the element counts are reset below; nothing here frees or shrinks the vectors. */
+    OP_LIST.vector.size = 0;
+    PT_LIST.vector.size = 0;
+    TR_LIST.vector.size = 0;
 }
 
 void glScissor(int x, int y, int width, int height) {
-
-    if(scissor_rect.x == x &&
-        scissor_rect.y == y &&
-        scissor_rect.width == width &&
-        scissor_rect.height == height) {
-        return;
-    }
-
     scissor_rect.x = x;
     scissor_rect.y = y;
     scissor_rect.width = width;
     scissor_rect.height = height;
     scissor_rect.applied = false;
-    STATE_DIRTY = GL_TRUE; // FIXME: do we need this?
-
     _glApplyScissor(false);
 }
 
@@ -87,27 +123,27 @@ void _glApplyScissor(int force) {
 
     PVRTileClipCommand c;
 
-    int miny, maxx, maxy;
-
+    int sx, sy, ex, ey;
     int scissor_width  = MAX(MIN(scissor_rect.width,  vid_mode->width),  0);
     int scissor_height = MAX(MIN(scissor_rect.height, vid_mode->height), 0);
 
     /* force the origin to the lower left-hand corner of the screen */
-    miny = (vid_mode->height - scissor_height) - scissor_rect.y;
-    maxx = (scissor_width + scissor_rect.x);
-    maxy = (scissor_height + miny);
+	sx = scissor_rect.x;
+    sy = (vid_mode->height - scissor_height) - scissor_rect.y;
+    ex = sx + scissor_width;
+    ey = sy + scissor_height;
 
     /* load command structure while mapping screen coords to TA tiles */
     c.flags = PVR_CMD_USERCLIP;
     c.d1 = c.d2 = c.d3 = 0;
 
-    uint16_t vw = vid_mode->width >> 5;
+    uint16_t vw = vid_mode->width  >> 5;
     uint16_t vh = vid_mode->height >> 5;
 
-    c.sx = CLAMP(scissor_rect.x >> 5, 0, vw);
-    c.sy = CLAMP(miny >> 5, 0, vh);
-    c.ex = CLAMP((maxx >> 5) - 1, 0, vw);
-    c.ey = CLAMP((maxy >> 5) - 1, 0, vh);
+    c.sx = CLAMP(sx >> 5, 0, vw);
+    c.sy = CLAMP(sy >> 5, 0, vh);
+    c.ex = CLAMP((ex >> 5) - 1, 0, vw);
+    c.ey = CLAMP((ey >> 5) - 1, 0, vh);
 
     aligned_vector_push_back(&OP_LIST.vector, &c, 1);
     aligned_vector_push_back(&PT_LIST.vector, &c, 1);
@@ -116,30 +152,16 @@ void _glApplyScissor(int force) {
     scissor_rect.applied = true;
 }
 
-Viewport VIEWPORT;
 
-/* Set the GL viewport */
-void glViewport(int x, int y, int width, int height) {
-    VIEWPORT.hwidth  = width  *  0.5f;
-    VIEWPORT.hheight = height * -0.5f;
-    VIEWPORT.x_plus_hwidth  = x + width  * 0.5f;
-    VIEWPORT.y_plus_hheight = y + height * 0.5f;
-}
-
-
-void apply_poly_header(pvr_poly_hdr_t* dst, PolyList* activePolyList) {
-    const TextureObject *tx1 = TEXTURE_ACTIVE;
+void apply_poly_header(pvr_poly_hdr_t* dst, int list_type) {
+    TextureObject* tx1 = TEXTURE_ACTIVE;
     uint32_t txr_base;
-    TRACE();
-
-    int list_type = activePolyList->list_type;
     int gen_color_clamp = PVR_CLRCLAMP_DISABLE;
 
     int gen_culling = CULLING_ENABLED    ? PVR_CULLING_CW : PVR_CULLING_SMALL;
     int depth_comp  = DEPTH_TEST_ENABLED ? PVR_DEPTHCMP_GEQUAL : PVR_DEPTHCMP_ALWAYS;
     int depth_write = DEPTH_MASK_ENABLED ? PVR_DEPTHWRITE_ENABLE : PVR_DEPTHWRITE_DISABLE;
 
-    int gen_shading   = SHADE_MODEL;
     int gen_clip_mode = SCISSOR_TEST_ENABLED ? PVR_USERCLIP_INSIDE : PVR_USERCLIP_DISABLE;
     int gen_fog_type  = FOG_ENABLED          ? PVR_FOG_TABLE : PVR_FOG_DISABLE;
 
@@ -179,7 +201,7 @@ void apply_poly_header(pvr_poly_hdr_t* dst, PolyList* activePolyList) {
     /* Or in the list type, shading type, color and UV formats */
     dst->cmd |= (list_type             << PVR_TA_CMD_TYPE_SHIFT)     & PVR_TA_CMD_TYPE_MASK;
     dst->cmd |= (PVR_CLRFMT_ARGBPACKED << PVR_TA_CMD_CLRFMT_SHIFT)   & PVR_TA_CMD_CLRFMT_MASK;
-    dst->cmd |= (gen_shading           << PVR_TA_CMD_SHADE_SHIFT)    & PVR_TA_CMD_SHADE_MASK;
+    dst->cmd |= (SHADE_MODEL           << PVR_TA_CMD_SHADE_SHIFT)    & PVR_TA_CMD_SHADE_MASK;
     dst->cmd |= (PVR_UVFMT_32BIT       << PVR_TA_CMD_UVFMT_SHIFT)    & PVR_TA_CMD_UVFMT_MASK;
     dst->cmd |= (gen_clip_mode         << PVR_TA_CMD_USERCLIP_SHIFT) & PVR_TA_CMD_USERCLIP_MASK;
 
diff --git a/third_party/gldc/src/texture.c b/third_party/gldc/src/texture.c
index 27a5f9975..7f32a4355 100644
--- a/third_party/gldc/src/texture.c
+++ b/third_party/gldc/src/texture.c
@@ -2,6 +2,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <kos.h>
+#include <dc/pvr.h>
 
 #include "gldc.h"
 #include "yalloc/yalloc.h"