diff --git a/misc/n64/gpu.c b/misc/n64/gpu.c
index eb5b33ec5..ec488d930 100644
--- a/misc/n64/gpu.c
+++ b/misc/n64/gpu.c
@@ -41,8 +41,6 @@ enum {
 };
 
 typedef struct {
-	int16_t  mvp_matrix_i[4][4];
-    uint16_t mvp_matrix_f[4][4];
     int16_t vp_scale[4];
     int16_t vp_offset[4];
     uint16_t tex_size[2];
diff --git a/misc/n64/rsp_gpu.S b/misc/n64/rsp_gpu.S
index 4dd7191f7..df03c84d1 100644
--- a/misc/n64/rsp_gpu.S
+++ b/misc/n64/rsp_gpu.S
@@ -1,9 +1,40 @@
 #include <rsp_queue.inc>
 #include <rdpq_macros.h>
-#define MATRIX_SIZE           64
+
 #define GUARD_BAND_FACTOR 2
 
-    .data
+// Fixed W used for every vertex: 1 << VTX_SHIFT, keep in sync with gpu.c
+#define ONE_W K32
+
+#define xxxxXXXX h0
+#define yyyyYYYY h1
+#define zzzzZZZZ h2
+#define wwwwWWWW h3
+
+
+#define SCREEN_VTX_CS_POSi          0     // X, Y, Z, W (all 32-bit)
+#define SCREEN_VTX_CS_POSf          8     // X, Y, Z, W (all 32-bit)
+#define SCREEN_VTX_X               16
+#define SCREEN_VTX_Y               18
+#define SCREEN_VTX_Z               20
+#define SCREEN_VTX_CLIP_CODE       22
+#define SCREEN_VTX_PADDING         23
+#define SCREEN_VTX_RGBA            24
+#define SCREEN_VTX_S_T             28     // 28 S, 30 T
+#define SCREEN_VTX_W               32     // FIXME: this is duplicated in CS_POS
+#define SCREEN_VTX_INVW            36     // 32-bit
+#define SCREEN_VTX_SIZE            40
+
+// Offsets 0-39 are the same as the SCREEN_VTX_* layout above
+#define PRIM_VTX_TRCODE            40    // trivial-reject clipping flags (against -w/+w)
+#define PRIM_VTX_SIZE              48
+
+#define V0_OFFSET 0 * PRIM_VTX_SIZE
+#define V1_OFFSET 1 * PRIM_VTX_SIZE
+#define V2_OFFSET 2 * PRIM_VTX_SIZE
+#define V3_OFFSET 3 * PRIM_VTX_SIZE
+
+.data
 
     RSPQ_BeginOverlayHeader
         RSPQ_DefineCommand GPUCmd_SetByte,       8   # 0x0
@@ -23,9 +54,9 @@ BANNER1: .ascii "Rasky & Snacchus"
 
     RSPQ_BeginSavedState
 
+    GPU_MATRIX_MVP:         .ds.b   128
 GL_STATE:
-    # This is the GL state that is also used by the pipeline.
-    GL_MATRIX_MVP:          .ds.b   MATRIX_SIZE
+    # This is the GL state that is updated by the CPU via GPUCmd_Set commands
     GL_VIEWPORT_SCALE:      .half   0,0,0,0
     GL_VIEWPORT_OFFSET:     .half   0,0,0,0
     GL_STATE_TEX_SIZE:      .half   0,0
@@ -39,29 +70,12 @@ GL_STATE:
 CLIP_CODE_FACTORS:      .half 1, 1, GUARD_BAND_FACTOR, GUARD_BAND_FACTOR
 DRAW_TRI_RA:            .word 0
 
-#define SCREEN_VTX_CS_POSi          0     // X, Y, Z, W (all 32-bit)
-#define SCREEN_VTX_CS_POSf          8     // X, Y, Z, W (all 32-bit)
-#define SCREEN_VTX_X               16
-#define SCREEN_VTX_Y               18
-#define SCREEN_VTX_Z               20
-#define SCREEN_VTX_CLIP_CODE       22
-#define SCREEN_VTX_PADDING         23
-#define SCREEN_VTX_RGBA            24
-#define SCREEN_VTX_S_T             28     // 28 S, 30 T
-#define SCREEN_VTX_W               32     // FIXME: this is duplicated in CS_POS
-#define SCREEN_VTX_INVW            36     // 32-bit
-#define SCREEN_VTX_SIZE            40
-
-	.bss
+.bss
     .align 3
-#define VERTEX_CACHE_SIZE     4
-//0-39 same as screenvtx
-#define PRIM_VTX_TRCODE            40    // trivial-reject clipping flags (against -w/+w)
-#define PRIM_VTX_SIZE              42
 
-VERTEX_CACHE:   .dcb.b      PRIM_VTX_SIZE * VERTEX_CACHE_SIZE
+VERTEX_CACHE:   .dcb.b      PRIM_VTX_SIZE * 4
 
-    .text
+.text
 
     .func GPUCmd_SetByte
 GPUCmd_SetByte:
@@ -104,29 +118,47 @@ GPUCmd_MatrixLoad:
     #define src         s6
     #define dst         s7
 
-    #define vrhs01_i     $v02
-    #define vrhs01_f     $v03
-    #define vrhs23_i     $v04
-    #define vrhs23_f     $v05
+    #define vmat0_i  $v02
+    #define vmat1_i  $v03
+    #define vmat2_i  $v04
+    #define vmat3_i  $v05
+    #define vmat0_f  $v06
+    #define vmat1_f  $v07
+    #define vmat2_f  $v08
+    #define vmat3_f  $v09
 
     addi src, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64
-    addi dst, zero, %lo(GL_MATRIX_MVP)
+    addi dst, zero, %lo(GPU_MATRIX_MVP)
 
-    # Load the matrix from command parameters (misaligned)
-    lqv vrhs01_i, 0x00,src
-    lrv vrhs01_i, 0x10,src
-    lqv vrhs23_i, 0x10,src
-    lrv vrhs23_i, 0x20,src
-    lqv vrhs01_f, 0x20,src
-    lrv vrhs01_f, 0x30,src
-    lqv vrhs23_f, 0x30,src
-    lrv vrhs23_f, 0x40,src
+    // Load the matrix from command parameters
+    ldv vmat0_i, 0x00,src
+    ldv vmat1_i, 0x08,src
+    ldv vmat2_i, 0x10,src
+    ldv vmat3_i, 0x18,src
+    ldv vmat0_f, 0x20,src
+    ldv vmat1_f, 0x28,src
+    ldv vmat2_f, 0x30,src
+    ldv vmat3_f, 0x38,src
 
-    sqv vrhs01_i, 0x00,dst
-    sqv vrhs23_i, 0x10,dst
-    sqv vrhs01_f, 0x20,dst
+	// Store the matrix with each row written twice (low and high half of a vector),
+	// so that T&L can load it with lqv and transform two vertices at once
+    sdv vmat0_i, 0x00,dst
+    sdv vmat0_i, 0x08,dst
+    sdv vmat1_i, 0x10,dst
+    sdv vmat1_i, 0x18,dst
+    sdv vmat2_i, 0x20,dst
+    sdv vmat2_i, 0x28,dst
+    sdv vmat3_i, 0x30,dst
+    sdv vmat3_i, 0x38,dst
+    sdv vmat0_f, 0x40,dst
+    sdv vmat0_f, 0x48,dst
+    sdv vmat1_f, 0x50,dst
+    sdv vmat1_f, 0x58,dst
+    sdv vmat2_f, 0x60,dst
+    sdv vmat2_f, 0x68,dst
+    sdv vmat3_f, 0x70,dst
     jr ra
-    sqv vrhs23_f, 0x30,dst
+    sdv vmat3_f, 0x78,dst
 
 #undef src
 #undef dst
@@ -138,7 +170,6 @@ GPUCmd_DrawQuad:
     #define vtx         a0
     #define mtx_ptr     s0
     #define src_ptr     s4
-	#define vcount      s3
 
     #define v___        $v01
 
@@ -164,74 +195,134 @@ GPUCmd_DrawQuad:
 
     addi src_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64
     li vtx, %lo(VERTEX_CACHE)
-	li vcount, 4
 
-    li mtx_ptr, %lo(GL_MATRIX_MVP)
-    ldv vmtx0_i.e0,  0x00,mtx_ptr
-    ldv vmtx1_i.e0,  0x08,mtx_ptr
-    ldv vmtx2_i.e0,  0x10,mtx_ptr
-    ldv vmtx3_i.e0,  0x18,mtx_ptr
-    ldv vmtx0_f.e0,  0x20,mtx_ptr
-    ldv vmtx1_f.e0,  0x28,mtx_ptr
-    ldv vmtx2_f.e0,  0x30,mtx_ptr
-    ldv vmtx3_f.e0,  0x38,mtx_ptr
+    li mtx_ptr, %lo(GPU_MATRIX_MVP)
+    lqv vmtx0_i, 0x00,mtx_ptr // [m00.I m01.I m02.I m03.I  m00.I m01.I m02.I m03.I]
+    lqv vmtx1_i, 0x10,mtx_ptr // etc
+    lqv vmtx2_i, 0x20,mtx_ptr
+    lqv vmtx3_i, 0x30,mtx_ptr
+    lqv vmtx0_f, 0x40,mtx_ptr
+    lqv vmtx1_f, 0x50,mtx_ptr
+    lqv vmtx2_f, 0x60,mtx_ptr
+    lqv vmtx3_f, 0x70,mtx_ptr
 
-upload_vertex:
-	ldv vpos,  0, src_ptr # Load X, Y, Z, W
-	llv vcol,  8, src_ptr # Load RGBA
-	llv vtex, 12, src_ptr # Load U, V
+### VERTICES 0 and 1 (transformed together, one per vector half)
+	ldv vpos.e0,  0, src_ptr // Load v0 X, Y, Z
+	ldv vpos.e4, 16, src_ptr // Load v1 X, Y, Z
 
 	# matrix multiply
-    vmudn v___,      vmtx0_f, vpos.h0
-    vmadh v___,      vmtx0_i, vpos.h0
-    vmadn v___,      vmtx1_f, vpos.h1
-    vmadh v___,      vmtx1_i, vpos.h1
-    vmadn v___,      vmtx2_f, vpos.h2
-    vmadh v___,      vmtx2_i, vpos.h2
-    vmadn v___,      vmtx3_f, vpos.h3
-    vmadh vcspos_i,  vmtx3_i, vpos.h3
+    vmudn v___,      vmtx0_f, vpos.xxxxXXXX
+    vmadh v___,      vmtx0_i, vpos.xxxxXXXX
+    vmadn v___,      vmtx1_f, vpos.yyyyYYYY
+    vmadh v___,      vmtx1_i, vpos.yyyyYYYY
+    vmadn v___,      vmtx2_f, vpos.zzzzZZZZ
+    vmadh v___,      vmtx2_i, vpos.zzzzZZZZ
+    vmadn v___,      vmtx3_f, ONE_W
+    vmadh vcspos_i,  vmtx3_i, ONE_W
     vmadn vcspos_f,  vzero,   vzero
 
-    slv vcol, SCREEN_VTX_RGBA, vtx
-    slv vtex, SCREEN_VTX_S_T,  vtx
+	llv vcol.e0,  8, src_ptr // Load v0 RGBA
+	llv vtex.e0, 12, src_ptr // Load v0 U, V
+	llv vcol.e2, 24, src_ptr // Load v1 RGBA
+	llv vtex.e2, 28, src_ptr // Load v1 U, V
 
     # 32-bit right shift by 5, to keep the clip space coordinates unscaled
     vmudm vcspos_i, vcspos_i, K2048
     vmadl vcspos_f, vcspos_f, K2048
 
-	addi vcount,  -1
-	addi src_ptr, 16
-
-    sdv vcspos_i, SCREEN_VTX_CS_POSi,vtx
-    sdv vcspos_f, SCREEN_VTX_CS_POSf,vtx
+    slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx
+    slv vtex.e0, SCREEN_VTX_S_T  + V0_OFFSET, vtx
+    slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx
+    slv vtex.e2, SCREEN_VTX_S_T  + V1_OFFSET, vtx
 
     # Calculate and store clipping flags against CS.W.
     # These will be used for trivial rejections.
-    vch v___, vcspos_i, vcspos_i.w
-    vcl v___, vcspos_f, vcspos_f.w
+    vch v___, vcspos_i, vcspos_i.wwwwWWWW
+    vcl v___, vcspos_f, vcspos_f.wwwwWWWW
+
     cfc2 t0, COP2_CTRL_VCC
-    andi t0, 0x707   # Isolate X/Y/Z flags
+    sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V0_OFFSET, vtx
+    sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V0_OFFSET, vtx
+    sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V1_OFFSET, vtx
+    sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V1_OFFSET, vtx
 
-    # Compress flags to 8 bit
-    srl t1, t0, 5
-    andi t0, 0x7
-    or t0, t1
-    sb t0, PRIM_VTX_TRCODE(vtx)
+###################### VERTICES 2 and 3 (interleaved with clip flag packing for vertices 0 and 1)
+	ldv vpos.e0, 32, src_ptr // Load v2 X, Y, Z
+	ldv vpos.e4, 48, src_ptr // Load v3 X, Y, Z
+    andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags
+    srl  t1, t2, 5     // Shift hi flags to be aligned next to lo flags
+    andi t2, t2, 0x7   // Isolate lo clip flags
+    or   t2, t1        // Merge clip flags (compressed to 6 bits)
 
-	bnez vcount, upload_vertex
-	addi vtx, PRIM_VTX_SIZE
+	# matrix multiply
+    vmudn v___,      vmtx0_f, vpos.xxxxXXXX
+    vmadh v___,      vmtx0_i, vpos.xxxxXXXX
+    vmadn v___,      vmtx1_f, vpos.yyyyYYYY
+    sb   t2, (PRIM_VTX_TRCODE + V0_OFFSET)(vtx)
+    vmadh v___,      vmtx1_i, vpos.yyyyYYYY
+	srl  t0, t0, 4
+    vmadn v___,      vmtx2_f, vpos.zzzzZZZZ
+    andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags
+    vmadh v___,      vmtx2_i, vpos.zzzzZZZZ
+    srl  t1, t2, 5     // Shift hi flags to be aligned next to lo flags
+    vmadn v___,      vmtx3_f, ONE_W
+    andi t2, t2, 0x7   // Isolate lo clip flags
+    vmadh vcspos_i,  vmtx3_i, ONE_W
+    or   t2, t1        // Merge clip flags (compressed to 6 bits)
+    vmadn vcspos_f,  vzero,   vzero
+    sb   t2, (PRIM_VTX_TRCODE + V1_OFFSET)(vtx)
 
+	llv vcol.e4, 40, src_ptr # Load v2 RGBA
+	llv vtex.e4, 44, src_ptr # Load v2 U, V
+	llv vcol.e6, 56, src_ptr # Load v3 RGBA
+	llv vtex.e6, 60, src_ptr # Load v3 U, V
+
+    # 32-bit right shift by 5, to keep the clip space coordinates unscaled
+    vmudm vcspos_i, vcspos_i, K2048
+    vmadl vcspos_f, vcspos_f, K2048
+
+    slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx
+    slv vtex.e4, SCREEN_VTX_S_T  + V2_OFFSET, vtx
+    slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx
+    slv vtex.e6, SCREEN_VTX_S_T  + V3_OFFSET, vtx
+
+    # Calculate and store clipping flags against CS.W.
+    # These will be used for trivial rejections.
+    vch v___, vcspos_i, vcspos_i.wwwwWWWW
+    vcl v___, vcspos_f, vcspos_f.wwwwWWWW
+
+    cfc2 t0, COP2_CTRL_VCC
+    sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V2_OFFSET, vtx
+    sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V2_OFFSET, vtx
+
+    andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags
+    srl  t1, t2, 5     // Shift hi flags to be aligned next to lo flags
+    andi t2, t2, 0x7   // Isolate lo clip flags
+    or   t2, t1        // Merge clip flags (compressed to 6 bits)
+    sb   t2, (PRIM_VTX_TRCODE + V2_OFFSET)(vtx)
+
+###################### VERTEX 3 (store clip space position and clip flags)
+
+    sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx
+    sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx
+
+	srl  t0, t0, 4
+    andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags
+    srl  t1, t2, 5     // Shift hi flags to be aligned next to lo flags
+    andi t2, t2, 0x7   // Isolate lo clip flags
+    or   t2, t1        // Merge clip flags (compressed to 6 bits)
+    sb   t2, (PRIM_VTX_TRCODE + V3_OFFSET)(vtx)
 	
 	# now do the actual drawing
-	li a1, %lo(VERTEX_CACHE) + 0*PRIM_VTX_SIZE
-	li a2, %lo(VERTEX_CACHE) + 1*PRIM_VTX_SIZE
+	li a1, %lo(VERTEX_CACHE) + V0_OFFSET
+	li a2, %lo(VERTEX_CACHE) + V1_OFFSET
 	jal GPUCmd_DrawTriangle
-	li a3, %lo(VERTEX_CACHE) + 2*PRIM_VTX_SIZE
+	li a3, %lo(VERTEX_CACHE) + V2_OFFSET
 
-	li a1, %lo(VERTEX_CACHE) + 0*PRIM_VTX_SIZE
-	li a2, %lo(VERTEX_CACHE) + 2*PRIM_VTX_SIZE
+	li a1, %lo(VERTEX_CACHE) + V0_OFFSET
+	li a2, %lo(VERTEX_CACHE) + V2_OFFSET
 	jal GPUCmd_DrawTriangle
-	li a3, %lo(VERTEX_CACHE) + 3*PRIM_VTX_SIZE
+	li a3, %lo(VERTEX_CACHE) + V3_OFFSET
 
     j RSPQ_Loop
     nop
@@ -352,13 +443,14 @@ GL_CalcClipCodes:
     
     vch v___, vguard_i, vguard_i.w
     vcl v___, vguard_f, vguard_f.w
+
     cfc2 t0, COP2_CTRL_VCC
-    andi t0, 0x707
-    srl t1, t0, 5
-    andi t0, 0x7
-    or t0, t1
+    andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags
+    srl  t1, t2, 5     // Shift hi flags to be aligned next to lo flags
+    andi t2, t2, 0x7   // Isolate lo clip flags
+    or   t2, t1        // Merge clip flags (compressed to 6 bits)
     jr ra
-    sb t0,  SCREEN_VTX_CLIP_CODE(dst)
+    sb t2,  SCREEN_VTX_CLIP_CODE(dst)
 
     #undef dst
     #undef vcspos_i