Merge pull request #1356 from ClassiCube/N64Optim

Optimise GPU backend for N64
2025-09-08 23:10:52 -04:00 · 2025-05-02 22:02:43 +10:00 · 2025-05-02 22:02:43 +10:00 · 8b36940221
commit 8b36940221
parent c770b22b38 a389afe03a
6 changed files with 1284 additions and 121 deletions
--- a/misc/n64/Makefile
+++ b/misc/n64/Makefile
@ -1,23 +1,28 @@
 BUILD_DIR		= build-n64
-SOURCE_DIR	 	= src
+SOURCE_DIR	 	= misc/n64
 N64_ROM_TITLE 	= "ClassiCube"
 N64_ROM_RTC 	= true
 TARGET			= ClassiCube-n64
-N64_MKDFS_ROOT	= "misc/n64"
+N64_MKDFS_ROOT	= "misc/n64/files"

 CFILES 	:= $(notdir $(wildcard src/*.c))
-OFILES 	:= $(CFILES:.c=.o)
+OFILES 	:= $(CFILES:.c=.o) rsp_gpu.o
 OBJS 	:= $(addprefix $(BUILD_DIR)/,$(OFILES))
 CFLAGS	:= -Wno-error=missing-braces -Wno-error=strict-aliasing -Wno-error=incompatible-pointer-types

 default: $(TARGET).z64 # default goal: the final ROM image

+$(BUILD_DIR)/%.o: src/%.c # C sources are taken from src/ (see CFILES), not from SOURCE_DIR, so they get an explicit rule
+	@mkdir -p $(dir $@)
+	@echo "    [CC] $<"
+	$(CC) -c $(CFLAGS) -o $@ $<
+
 include $(N64_INST)/include/n64.mk # NOTE(review): presumably supplies the rule that builds rsp_gpu.o from $(SOURCE_DIR)/rsp_gpu.S - confirm

 $(TARGET).z64: N64_ROM_TITLE = "ClassiCube"
 $(TARGET).z64: $(BUILD_DIR)/filesystem.dfs

-$(BUILD_DIR)/filesystem.dfs: misc/n64/default.zip
+$(BUILD_DIR)/filesystem.dfs: misc/n64/files/default.zip # the DFS image depends on the default.zip inside N64_MKDFS_ROOT

 $(BUILD_DIR)/ClassiCube-n64.elf: $(OBJS) # OBJS = every src/*.c object plus rsp_gpu.o

--- a/misc/n64/files/default.zip
+++ b/misc/n64/files/default.zip
--- a/misc/n64/gpu.c
+++ b/misc/n64/gpu.c
@ -0,0 +1,242 @@
+#include "rspq.h"
+#include "rdpq.h"
+#include "rdpq_rect.h"
+#include "rdpq_mode.h"
+#include "rdpq_debug.h"
+#include "display.h"
+
+// This is a severely cutdown version of libdragon's OpenGL implementation
+#define VTX_SHIFT 5 // fractional bits used by upload_vertex when converting positions to 16-bit fixed point
+#define TEX_SHIFT 8 // fractional bits used by upload_vertex when converting texture coordinates to 16-bit fixed point
+
+static uint32_t gpup_id; // overlay ID returned by rspq_overlay_register (assigned in gpu_init)
+//DEFINE_RSP_UCODE(rsp_gpu);
+extern uint8_t _binary_build_n64_rsp_gpu_text_bin_start[]; // start/end symbols of the ucode text, data and meta blobs
+extern uint8_t _binary_build_n64_rsp_gpu_data_bin_start[]; // NOTE(review): presumably emitted by the build for rsp_gpu.S in build-n64 - confirm
+extern uint8_t _binary_build_n64_rsp_gpu_meta_bin_start[];
+extern uint8_t _binary_build_n64_rsp_gpu_text_bin_end[0];
+extern uint8_t _binary_build_n64_rsp_gpu_data_bin_end[0];
+extern uint8_t _binary_build_n64_rsp_gpu_meta_bin_end[0];
+
+static rsp_ucode_t rsp_gpu = (rsp_ucode_t){ // ucode descriptor handed to rspq_overlay_register in gpu_init
+	.code     = _binary_build_n64_rsp_gpu_text_bin_start,
+	.code_end = _binary_build_n64_rsp_gpu_text_bin_end,
+	.data     = _binary_build_n64_rsp_gpu_data_bin_start,
+	.data_end = _binary_build_n64_rsp_gpu_data_bin_end,
+	.meta     = _binary_build_n64_rsp_gpu_meta_bin_start,
+	.meta_end = _binary_build_n64_rsp_gpu_meta_bin_end,
+	.name     = "rsp_gpu"
+};
+
+enum { // command indices; these must match the RSPQ_DefineCommand order in rsp_gpu.S
+    GPU_CMD_SET_BYTE         = 0x0, // store an 8-bit value at a byte offset into the RSP-side state
+    GPU_CMD_SET_SHORT        = 0x1, // store a 16-bit value
+    GPU_CMD_SET_WORD         = 0x2, // store a 32-bit value
+    GPU_CMD_SET_LONG         = 0x3, // store a 64-bit value (sent as two 32-bit words)
+
+    GPU_CMD_DRAW_QUAD        = 0x4, // four packed vertices, drawn as two triangles
+    GPU_CMD_MATRIX_LOAD      = 0x5, // replace the MVP matrix
+
+	GPU_CMD_PUSH_RDP         = 0x6, // forward one raw RDP command
+};
+
+typedef struct { // CPU-side mirror of GL_STATE in rsp_gpu.S; only used here for offsetof(), so field order and sizes must match
+	int16_t  mvp_matrix_i[4][4]; // MVP matrix, integer halves (GL_MATRIX_MVP, first 32 bytes)
+    uint16_t mvp_matrix_f[4][4]; // MVP matrix, fractional halves (GL_MATRIX_MVP, last 32 bytes)
+    int16_t vp_scale[4];  // viewport scale, written by gpuViewport / gpuDepthRange
+    int16_t vp_offset[4]; // viewport offset, written by gpuViewport / gpuDepthRange
+    uint16_t tex_size[2]; // texture width, height - written by gpuSetTexSize
+    uint16_t tex_offset[2];
+    uint16_t tri_cmd;  // triangle command word, written by gpuUpdateFormat
+    uint16_t tri_cull; // culling mode, written by gpuSetCullFace
+} __attribute__((aligned(8), packed)) gpu_state;
+
+/* Queues a command that stores an 8-bit value `offset` bytes into the RSP-side state. */
+static inline __attribute__((always_inline))
+void gpu_set_byte(uint32_t offset, uint8_t value)
+{
+    rspq_write(gpup_id, GPU_CMD_SET_BYTE, offset, value);
+}
+
+/* Queues a command that stores a 16-bit value `offset` bytes into the RSP-side state. */
+static inline __attribute__((always_inline))
+void gpu_set_short(uint32_t offset, uint16_t value)
+{
+    rspq_write(gpup_id, GPU_CMD_SET_SHORT, offset, value);
+}
+
+/* Queues a command that stores a 32-bit value `offset` bytes into the RSP-side state. */
+static inline __attribute__((always_inline))
+void gpu_set_word(uint32_t offset, uint32_t value)
+{
+    rspq_write(gpup_id, GPU_CMD_SET_WORD, offset, value);
+}
+
+/* Queues a command that stores a 64-bit value `offset` bytes into the RSP-side state.
+ * The value travels as two 32-bit command words, upper half first. */
+static inline __attribute__((always_inline))
+void gpu_set_long(uint32_t offset, uint64_t value)
+{
+    rspq_write(gpup_id, GPU_CMD_SET_LONG, offset, value >> 32, value & 0xFFFFFFFF);
+}
+
+#define RDP_CMD_SYNC_PIPE       0xE7000000 // first word of the RDP commands pushed from Gfx_Create via gpu_push_rdp
+#define RDP_CMD_SET_BLEND_COLOR 0xF9000000
+
+__attribute__((always_inline))
+static inline void gpu_push_rdp(uint32_t a1, uint64_t a2) // a1, a2: first and second word of the RDP command
+{
+    // NOTE(review): a2 is declared 64-bit, but GPUCmd_PushRDP is a 12-byte command, so presumably only its low 32 bits are sent - confirm
+    rdpq_write(2, gpup_id, GPU_CMD_PUSH_RDP, 0, a1, a2);
+}
+
+
+static float gpu_vp_scale[3];  // viewport scale for X, Y (gpuViewport) and Z (gpuDepthRange)
+static float gpu_vp_offset[3]; // viewport offset for X, Y (gpuViewport) and Z (gpuDepthRange)
+static bool  gpu_texturing;    // when false, upload_vertex sends zero texture coordinates
+static void* gpu_pointer;      // base address of the vertex data read by upload_vertex
+static int   gpu_stride;       // byte distance between consecutive vertices in gpu_pointer
+
+#define GPU_ATTR_Z     (1 <<  8) // attribute bits OR-ed into the triangle command by gpuUpdateFormat
+#define GPU_ATTR_TEX   (1 <<  9)
+#define GPU_ATTR_SHADE (1 << 10)
+#define GPU_ATTR_EDGE  (1 << 11)
+static bool gpu_attr_z, gpu_attr_tex; // select GPU_ATTR_Z / GPU_ATTR_TEX in gpuUpdateFormat
+
+/* Rebuilds the triangle command word from gpu_attr_z / gpu_attr_tex and uploads it
+ * to the tri_cmd field of the RSP-side state. */
+static void gpuUpdateFormat(void)
+{
+	// GPU_ATTR_SHADE and GPU_ATTR_EDGE are always set; depth and texture are optional
+	const uint16_t depth_bit   = gpu_attr_z   ? GPU_ATTR_Z   : 0;
+	const uint16_t texture_bit = gpu_attr_tex ? GPU_ATTR_TEX : 0;
+	const uint16_t tri_cmd     = 0xC000 | GPU_ATTR_SHADE | GPU_ATTR_EDGE | depth_bit | texture_bit;
+
+	gpu_set_short(offsetof(gpu_state, tri_cmd), tri_cmd);
+}
+
+/* Uploads the current texture dimensions to the RSP-side state:
+ * width goes into tex_size[0] (upper 16 bits of the word), height into tex_size[1]. */
+static void gpuSetTexSize(uint16_t width, uint16_t height)
+{
+    // Widen before shifting: a uint16_t promotes to int, and shifting a value
+    // >= 0x8000 left by 16 would overflow a signed int (undefined behaviour)
+    uint32_t packed = ((uint32_t)width << 16) | (uint32_t)height;
+
+    gpu_set_word(offsetof(gpu_state, tex_size[0]), packed);
+}
+
+
+/* Packs consecutive pairs of 16-bit values into 32-bit words (first value of each pair
+ * in the upper half) and appends every word to the command being built.
+ * Each step reads s[i] and s[i+1], so count is expected to be even. */
+static inline void write_shorts(rspq_write_t *w, const uint16_t *s, uint32_t count)
+{
+    uint32_t pos = 0;
+
+    while (pos < count)
+    {
+        uint32_t upper = s[pos];
+        uint32_t lower = s[pos + 1];
+
+        rspq_write_arg(w, (upper << 16) | lower);
+        pos += 2;
+    }
+}
+
+/* Converts a 16-element float matrix to 16.16 fixed point and appends it to the command:
+ * first all 16 integer halves, then all 16 fractional halves. */
+static inline void gpu_matrix_write(rspq_write_t* w, const float* m)
+{
+    uint16_t int_parts[16];
+    uint16_t frac_parts[16];
+    uint32_t idx = 0;
+
+    while (idx < 16)
+    {
+        int32_t  fixed = m[idx] * (1<<16);
+        uint32_t bits  = (uint32_t)fixed;
+
+        int_parts[idx]  = (uint16_t)(bits >> 16);
+        frac_parts[idx] = (uint16_t)(bits & 0x0000FFFF);
+        idx++;
+    }
+
+    write_shorts(w, int_parts,  16);
+    write_shorts(w, frac_parts, 16);
+}
+
+/* Sends the 16-float matrix `m` to the RSP as a GPU_CMD_MATRIX_LOAD command. */
+static void gpuLoadMatrix(const float* m)
+{
+    // 1 leading word + 16 words of packed matrix data = 17 words (68 bytes),
+    // the size declared for GPUCmd_MatrixLoad in rsp_gpu.S
+    enum { MATRIX_CMD_WORDS = 1 + 16 };
+
+    rspq_write_t cmd = rspq_write_begin(gpup_id, GPU_CMD_MATRIX_LOAD, MATRIX_CMD_WORDS);
+
+    rspq_write_arg(&cmd, 0); // padding
+    gpu_matrix_write(&cmd, m);
+    rspq_write_end(&cmd);
+}
+
+/* Appends one 32-bit word to the command: v1 in the upper 16 bits, v2 in the lower 16 bits. */
+static inline void put_word(rspq_write_t* s, uint16_t v1, uint16_t v2)
+{
+	// Widen before shifting: a uint16_t promotes to int, and shifting a value
+	// >= 0x8000 left by 16 would overflow a signed int (undefined behaviour)
+	uint32_t packed = ((uint32_t)v1 << 16) | (uint32_t)v2;
+
+	rspq_write_arg(s, packed);
+}
+
+/* Converts a float to 16-bit fixed point with `shift` fractional bits.
+ * The value goes through a signed 32-bit integer first: converting a negative float
+ * straight to an unsigned type is undefined behaviour in C, whereas int32_t -> uint16_t
+ * is well defined and keeps the low 16 bits of the two's complement value. */
+static inline uint16_t gpu_float_to_fixed16(float value, int shift)
+{
+	return (uint16_t)(int32_t)(value * (1 << shift));
+}
+
+/* Appends vertex `index` of the current vertex array to the command as 4 words (16 bytes):
+ *   word 0: X, Y       (VTX_SHIFT fractional bits)
+ *   word 1: Z, W = 1.0 (VTX_SHIFT fractional bits)
+ *   word 2: colour, copied verbatim from byte offset 12 of the vertex
+ *   word 3: U, V       (TEX_SHIFT fractional bits), or zero when texturing is off
+ * Source layout read here: 3 floats at offset 0, 32-bit colour at 12, 2 floats at 16. */
+static void upload_vertex(rspq_write_t* s, uint32_t index)
+{
+	// Cast before the arithmetic: pointer arithmetic on void* is a GNU extension
+	char* ptr = (char*)gpu_pointer + index * gpu_stride;
+
+	float* vtx = (float*)(ptr + 0);
+	put_word(s, gpu_float_to_fixed16(vtx[0], VTX_SHIFT),
+				gpu_float_to_fixed16(vtx[1], VTX_SHIFT));
+	put_word(s, gpu_float_to_fixed16(vtx[2], VTX_SHIFT),
+				gpu_float_to_fixed16(1.0f,   VTX_SHIFT));
+
+	uint32_t* col = (uint32_t*)(ptr + 12);
+	rspq_write_arg(s, *col);
+
+	if (gpu_texturing) {
+		float* tex = (float*)(ptr + 16);
+		put_word(s, gpu_float_to_fixed16(tex[0], TEX_SHIFT),
+					gpu_float_to_fixed16(tex[1], TEX_SHIFT));
+	} else {
+		put_word(s, 0, 0);
+	}
+}
+
+/* Draws `count` vertices starting at index `first` as quads, issuing one
+ * GPU_CMD_DRAW_QUAD command per group of four vertices. */
+static void gpuDrawArrays(uint32_t first, uint32_t count)
+{
+	uint32_t done = 0;
+
+	while (done < count)
+	{
+		// 1 leading word + 4 vertices * 4 words = 17 words
+		rspq_write_t cmd = rspq_write_begin(gpup_id, GPU_CMD_DRAW_QUAD, 17);
+		rspq_write_arg(&cmd, 0); // padding
+
+		upload_vertex(&cmd, first + done + 0);
+		upload_vertex(&cmd, first + done + 1);
+		upload_vertex(&cmd, first + done + 2);
+		upload_vertex(&cmd, first + done + 3);
+
+		rspq_write_end(&cmd);
+		done += 4;
+	}
+}
+
+/* Records the depth range [n, f] as the Z viewport scale/offset and uploads both
+ * to lane 2 of vp_scale / vp_offset in the RSP-side state.
+ * NOTE(review): Z is scaled by 4 here, while gpuViewport rewrites the same lanes
+ * using SCREEN_Z_SCALE - confirm which scale the ucode expects. */
+static void gpuDepthRange(float n, float f)
+{
+    float half_range = (f - n) * 0.5f;
+
+    gpu_vp_scale[2]  = half_range;
+    gpu_vp_offset[2] = n + half_range;
+
+    gpu_set_short(offsetof(gpu_state, vp_scale[2]),  gpu_vp_scale[2]  * 4);
+    gpu_set_short(offsetof(gpu_state, vp_offset[2]), gpu_vp_offset[2] * 4);
+}
+
+/* Sets the viewport rectangle (x, y, w, h in pixels) and uploads the fixed-point
+ * viewport scale and offset for X, Y and Z to the RSP-side state.
+ * The Z lanes reuse the values last stored by gpuDepthRange. */
+static void gpuViewport(int x, int y, int w, int h)
+{
+    gpu_vp_scale[0]  = w * 0.5f;
+    gpu_vp_scale[1]  = h * -0.5f;
+    gpu_vp_offset[0] = x + w * 0.5f;
+    gpu_vp_offset[1] = y + h * 0.5f;
+
+    // Screen coordinates are s13.2
+    #define SCREEN_XY_SCALE   4.0f
+    #define SCREEN_Z_SCALE    32767.0f
+
+    // The Y scale is negative for any positive height. Converting a negative float
+    // directly to uint16_t is undefined behaviour, so go through int32_t first:
+    // int32_t -> uint16_t is well defined and keeps the two's complement bit pattern
+    // that the signed 16-bit vp_scale / vp_offset lanes expect.
+
+    // * 2.0f to compensate for RSP reciprocal missing 1 bit
+    uint16_t scale_x  = (uint16_t)(int32_t)(gpu_vp_scale[0] * SCREEN_XY_SCALE * 2.0f);
+    uint16_t scale_y  = (uint16_t)(int32_t)(gpu_vp_scale[1] * SCREEN_XY_SCALE * 2.0f);
+    uint16_t scale_z  = (uint16_t)(int32_t)(gpu_vp_scale[2] * SCREEN_Z_SCALE  * 2.0f);
+
+    uint16_t offset_x = (uint16_t)(int32_t)(gpu_vp_offset[0] * SCREEN_XY_SCALE);
+    uint16_t offset_y = (uint16_t)(int32_t)(gpu_vp_offset[1] * SCREEN_XY_SCALE);
+    uint16_t offset_z = (uint16_t)(int32_t)(gpu_vp_offset[2] * SCREEN_Z_SCALE);
+
+    // Lanes are packed X, Y, Z from the most significant end; the fourth lane stays 0
+    gpu_set_long(
+        offsetof(gpu_state, vp_scale),
+        ((uint64_t)scale_x << 48) | ((uint64_t)scale_y << 32) | ((uint64_t)scale_z << 16));
+    gpu_set_long(
+        offsetof(gpu_state, vp_offset),
+        ((uint64_t)offset_x << 48) | ((uint64_t)offset_y << 32) | ((uint64_t)offset_z << 16));
+}
+
+/* Uploads the culling mode to the tri_cull field of the RSP-side state. */
+static void gpuSetCullFace(bool enabled) {
+	// 1 = cull backfaces
+	// 2 = don't cull
+	uint16_t cull_mode;
+
+	if (enabled) {
+		cull_mode = 1;
+	} else {
+		cull_mode = 2;
+	}
+	gpu_set_short(offsetof(gpu_state, tri_cull), cull_mode);
+}
+
+/* Registers the rsp_gpu overlay with the RSP queue, remembers its ID in gpup_id
+ * and uploads the default depth range [0, 1]. */
+static void gpu_init(void) {
+    gpup_id = rspq_overlay_register(&rsp_gpu);
+    gpuDepthRange(0, 1);
+}
+
+/* Calls rspq_wait() so queued commands finish, then unregisters the overlay. */
+static void gpu_close(void) {
+    rspq_wait();
+    rspq_overlay_unregister(gpup_id);
+}
--- a/misc/n64/rsp_gpu.S
+++ b/misc/n64/rsp_gpu.S
@ -0,0 +1,531 @@
+#include <rsp_queue.inc>
+#include <rdpq_macros.h>
+#define MATRIX_SIZE           64 // bytes: 32 of integer halves followed by 32 of fractional halves
+#define GUARD_BAND_FACTOR 2 // used by CLIP_CODE_FACTORS and by CLIP_PLANES
+
+    .data
+
+    RSPQ_BeginOverlayHeader
+        RSPQ_DefineCommand GPUCmd_SetByte,       8   # 0x0 (second argument is the command size in bytes)
+        RSPQ_DefineCommand GPUCmd_SetShort,      8   # 0x1
+        RSPQ_DefineCommand GPUCmd_SetWord,       8   # 0x2
+        RSPQ_DefineCommand GPUCmd_SetLong,       12  # 0x3
+
+        RSPQ_DefineCommand GPUCmd_DrawQuad,      68  # 0x4 (17 words, as sent by gpuDrawArrays)
+        RSPQ_DefineCommand GPUCmd_MatrixLoad,    68  # 0x5 (17 words, as sent by gpuLoadMatrix)
+
+        RSPQ_DefineCommand GPUCmd_PushRDP,       12  # 0x6
+    RSPQ_EndOverlayHeader
+
+    .align 4
+BANNER0: .ascii " RSP OpenGL T&L "
+BANNER1: .ascii "Rasky & Snacchus"
+
+    RSPQ_BeginSavedState
+
+GL_STATE:
+    # This is the GL state that is also used by the pipeline. Its layout is mirrored by gpu_state in gpu.c.
+    GL_MATRIX_MVP:          .ds.b   MATRIX_SIZE
+    GL_VIEWPORT_SCALE:      .half   0,0,0,0
+    GL_VIEWPORT_OFFSET:     .half   0,0,0,0
+    GL_STATE_TEX_SIZE:      .half   0,0
+    GL_STATE_TEX_OFFSET:    .half   0,0
+    GL_TRI_CMD:             .half   0
+    GL_TRI_CULL:            .half   0
+
+    RSPQ_EndSavedState
+
+    .align 4
+CLIP_CODE_FACTORS:      .half 1, 1, GUARD_BAND_FACTOR, GUARD_BAND_FACTOR    # per-lane multipliers loaded by GL_CalcClipCodes
+DRAW_TRI_RA:            .word 0                                             # return address saved by GPUCmd_DrawTriangle
+
+#define SCREEN_VTX_CS_POSi          0     // Clip space X, Y, Z, W: integer halves (4 x 16-bit)
+#define SCREEN_VTX_CS_POSf          8     // Clip space X, Y, Z, W: fractional halves (4 x 16-bit)
+#define SCREEN_VTX_X               16
+#define SCREEN_VTX_Y               18
+#define SCREEN_VTX_Z               20
+#define SCREEN_VTX_CLIP_CODE       22     // 8-bit, written by GL_CalcClipCodes
+#define SCREEN_VTX_PADDING         23     // 8-bit, holds the clip cache slot offset in GL_ClipTriangle
+#define SCREEN_VTX_RGBA            24
+#define SCREEN_VTX_S_T             28     // 28 S, 30 T
+#define SCREEN_VTX_W               32     // FIXME: this is duplicated in CS_POS
+#define SCREEN_VTX_INVW            36     // 32-bit
+#define SCREEN_VTX_SIZE            40
+
+	.bss
+    .align 3
+#define VERTEX_CACHE_SIZE     4          // GPUCmd_DrawQuad fills all four slots
+//0-39 same as screenvtx
+#define PRIM_VTX_TRCODE            40    // trivial-reject clipping flags (against -w/+w)
+#define PRIM_VTX_SIZE              42
+
+VERTEX_CACHE:   .dcb.b      PRIM_VTX_SIZE * VERTEX_CACHE_SIZE
+
+    .text
+
+    .func GPUCmd_SetByte
+GPUCmd_SetByte:                     # a0 = byte offset into GL_STATE, a1 = value (see gpu_set_byte in gpu.c)
+    jr ra
+    sb a1, %lo(GL_STATE)(a0)        # the store sits in the delay slot of the jr
+    .endfunc
+
+    .func GPUCmd_SetShort
+GPUCmd_SetShort:                    # same as GPUCmd_SetByte, 16-bit store
+    jr ra
+    sh a1, %lo(GL_STATE)(a0)
+    .endfunc
+
+    .func GPUCmd_SetWord
+GPUCmd_SetWord:                     # same as GPUCmd_SetByte, 32-bit store
+    jr ra
+    sw a1, %lo(GL_STATE) + 0(a0)
+    .endfunc
+
+    .func GPUCmd_SetLong
+GPUCmd_SetLong:                     # a1 = upper 32 bits, a2 = lower 32 bits (see gpu_set_long in gpu.c)
+    sw a2, %lo(GL_STATE) + 4(a0)
+    jr ra
+    sw a1, %lo(GL_STATE) + 0(a0)
+    .endfunc
+
+
+    .func GPUCmd_PushRDP
+GPUCmd_PushRDP:
+	# RDP command is expected in a0 and a1, but arrives in a1 and a2 (see gpu_push_rdp in gpu.c)
+    move a0, a1
+	move a1, a2
+
+    jal_and_j RDPQ_Write8, RDPQ_Finalize     # NOTE(review): presumably calls RDPQ_Write8 and then continues at RDPQ_Finalize - confirm
+    .endfunc
+
+
+    .func GPUCmd_MatrixLoad
+GPUCmd_MatrixLoad:
+    #define src         s6
+    #define dst         s7
+
+    #define vrhs01_i     $v02      // matrix integer halves, first 16 bytes
+    #define vrhs01_f     $v03      // matrix fractional halves, first 16 bytes
+    #define vrhs23_i     $v04      // matrix integer halves, last 16 bytes
+    #define vrhs23_f     $v05      // matrix fractional halves, last 16 bytes
+
+    # src = the 64 bytes of matrix data. NOTE(review): assumes rspq_dmem_buf_ptr points just past this command - confirm
+    addi src, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64
+    addi dst, zero, %lo(GL_MATRIX_MVP)
+
+    # Load the matrix from command parameters (misaligned), each 16-byte chunk needs an lqv/lrv pair
+    lqv vrhs01_i, 0x00,src
+    lrv vrhs01_i, 0x10,src
+    lqv vrhs23_i, 0x10,src
+    lrv vrhs23_i, 0x20,src
+    lqv vrhs01_f, 0x20,src
+    lrv vrhs01_f, 0x30,src
+    lqv vrhs23_f, 0x30,src
+    lrv vrhs23_f, 0x40,src
+
+    sqv vrhs01_i, 0x00,dst
+    sqv vrhs23_i, 0x10,dst
+    sqv vrhs01_f, 0x20,dst
+    jr ra
+    sqv vrhs23_f, 0x30,dst         # last store sits in the delay slot of the jr
+
+#undef src
+#undef dst
+    .endfunc
+
+    .align 3
+    .func GPUCmd_DrawQuad
+GPUCmd_DrawQuad:
+    #define vtx         a0         // current slot in VERTEX_CACHE
+    #define mtx_ptr     s0
+    #define src_ptr     s4         // current packed vertex in the command (16 bytes each)
+	#define vcount      s3         // vertices left to transform
+
+    #define v___        $v01
+
+    #define vmtx0_i     $v16       //  m00 m01 m02 m03
+    #define vmtx0_f     $v17
+    #define vmtx1_i     $v18       //  m10 m11 m12 m13
+    #define vmtx1_f     $v19
+    #define vmtx2_i     $v20       //  m20 m21 m22 m23
+    #define vmtx2_f     $v21
+    #define vmtx3_i     $v22       //  m30 m31 m32 m33
+    #define vmtx3_f     $v23
+
+    #define vpos        $v24
+    #define vcol        $v25
+    #define vtex        $v26
+    #define vcspos_i    $v28
+    #define vcspos_f    $v29
+
+    #define x  e0
+    #define y  e1
+    #define z  e2
+    #define w  e3
+
+    # src_ptr = the 4 x 16 bytes of vertex data. NOTE(review): assumes rspq_dmem_buf_ptr points just past this command - confirm
+    addi src_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64
+    li vtx, %lo(VERTEX_CACHE)
+	li vcount, 4
+
+    li mtx_ptr, %lo(GL_MATRIX_MVP)
+    ldv vmtx0_i.e0,  0x00,mtx_ptr
+    ldv vmtx1_i.e0,  0x08,mtx_ptr
+    ldv vmtx2_i.e0,  0x10,mtx_ptr
+    ldv vmtx3_i.e0,  0x18,mtx_ptr
+    ldv vmtx0_f.e0,  0x20,mtx_ptr
+    ldv vmtx1_f.e0,  0x28,mtx_ptr
+    ldv vmtx2_f.e0,  0x30,mtx_ptr
+    ldv vmtx3_f.e0,  0x38,mtx_ptr
+
+upload_vertex:
+	ldv vpos,  0, src_ptr # Load X, Y, Z, W
+	llv vcol,  8, src_ptr # Load RGBA
+	llv vtex, 12, src_ptr # Load U, V
+
+	# matrix multiply (MVP * position, 32-bit result split into vcspos_i / vcspos_f)
+    vmudn v___,      vmtx0_f, vpos.h0
+    vmadh v___,      vmtx0_i, vpos.h0
+    vmadn v___,      vmtx1_f, vpos.h1
+    vmadh v___,      vmtx1_i, vpos.h1
+    vmadn v___,      vmtx2_f, vpos.h2
+    vmadh v___,      vmtx2_i, vpos.h2
+    vmadn v___,      vmtx3_f, vpos.h3
+    vmadh vcspos_i,  vmtx3_i, vpos.h3
+    vmadn vcspos_f,  vzero,   vzero
+
+    slv vcol, SCREEN_VTX_RGBA, vtx
+    slv vtex, SCREEN_VTX_S_T,  vtx
+
+    # 32-bit right shift by 5, to keep the clip space coordinates unscaled (undoes VTX_SHIFT from gpu.c)
+    vmudm vcspos_i, vcspos_i, vshift8.e4
+    vmadl vcspos_f, vcspos_f, vshift8.e4
+
+	addi vcount,  -1
+	addi src_ptr, 16
+
+    sdv vcspos_i, SCREEN_VTX_CS_POSi,vtx
+    sdv vcspos_f, SCREEN_VTX_CS_POSf,vtx
+
+    # Calculate and store clipping flags against CS.W.
+    # These will be used for trivial rejections.
+    vch v___, vcspos_i, vcspos_i.w
+    vcl v___, vcspos_f, vcspos_f.w
+    cfc2 t0, COP2_CTRL_VCC
+    andi t0, 0x707   # Isolate X/Y/Z flags
+
+    # Compress flags to 8 bit (bits 8-10 move down to bits 3-5)
+    srl t1, t0, 5
+    andi t0, 0x7
+    or t0, t1
+    sb t0, PRIM_VTX_TRCODE(vtx)
+
+	bnez vcount, upload_vertex
+	addi vtx, PRIM_VTX_SIZE    # delay slot: advance to the next cache slot on every iteration
+
+	
+	# now do the actual drawing: the quad is split into triangles (0,1,2) and (0,2,3)
+	li a1, %lo(VERTEX_CACHE) + 0*PRIM_VTX_SIZE
+	li a2, %lo(VERTEX_CACHE) + 1*PRIM_VTX_SIZE
+	jal GPUCmd_DrawTriangle
+	li a3, %lo(VERTEX_CACHE) + 2*PRIM_VTX_SIZE
+
+	li a1, %lo(VERTEX_CACHE) + 0*PRIM_VTX_SIZE
+	li a2, %lo(VERTEX_CACHE) + 2*PRIM_VTX_SIZE
+	jal GPUCmd_DrawTriangle
+	li a3, %lo(VERTEX_CACHE) + 3*PRIM_VTX_SIZE
+
+    j RSPQ_Loop
+    nop
+    #undef src_ptr
+    #undef vtx
+
+    #undef x
+    #undef y
+    #undef z
+    #undef w
+
+    #undef v___     
+
+    #undef vmtx0_i   
+    #undef vmtx0_f  
+    #undef vmtx1_i   
+    #undef vmtx1_f  
+    #undef vmtx2_i   
+    #undef vmtx2_f  
+    #undef vmtx3_i   
+    #undef vmtx3_f  
+
+    #undef vpos      
+    #undef vcspos_i  
+    #undef vcspos_f  
+
+    .endfunc
+
+    ################################################################
+    # GL_CalcScreenSpace
+    #   Computes 1/W and the screen space position (clip pos * 1/W * viewport scale + viewport offset)
+    # Args:
+    #   s3   = Destination vertex address
+    #   $v02 = Clip space position (fractional part)
+    #   $v03 = Clip space position (integer part)
+    # Writes SCREEN_VTX_X (8 bytes), SCREEN_VTX_W, SCREEN_VTX_INVW and clears SCREEN_VTX_PADDING
+    ################################################################
+    .func GL_CalcScreenSpace
+GL_CalcScreenSpace:
+    #define dst          s3
+    #define vcspos_f     $v02
+    #define vcspos_i     $v03
+    #define vinvw_f      $v23
+    #define vinvw_i      $v24
+    #define vviewscale   $v25
+    #define vviewoff     $v26
+    #define vscreenpos_i $v27
+    #define vscreenpos_f $v28
+    #define v___         $v29
+    #define w            e3
+
+    # Calculate 32-bit inverse W
+    # TODO: NR?
+    vrcph vinvw_i.w, vcspos_i.w
+    vrcpl vinvw_f.w, vcspos_f.w
+    vrcph vinvw_i.w, vzero.e0
+
+    # Calculate screenspace coords
+    li t0, %lo(GL_VIEWPORT_SCALE)
+    ldv vviewscale, 0,t0
+    ldv vviewoff,   8,t0           # GL_VIEWPORT_OFFSET directly follows GL_VIEWPORT_SCALE
+
+    vmudl v___,         vcspos_f, vinvw_f.w
+    vmadm v___,         vcspos_i, vinvw_f.w
+    vmadn vscreenpos_f, vcspos_f, vinvw_i.w
+    vmadh vscreenpos_i, vcspos_i, vinvw_i.w
+
+    vmudn vscreenpos_f, vscreenpos_f, vviewscale
+    vmadh vscreenpos_i, vscreenpos_i, vviewscale
+    vadd vscreenpos_i, vviewoff
+
+    sdv vscreenpos_i, SCREEN_VTX_X     ,dst     # 8 bytes at offsets 16-23: X, Y, Z plus the CLIP_CODE and PADDING bytes
+    ssv vcspos_i.w,   SCREEN_VTX_W+0   ,dst 
+    ssv vcspos_f.w,   SCREEN_VTX_W+2   ,dst 
+    ssv vinvw_i.w,    SCREEN_VTX_INVW+0,dst
+    ssv vinvw_f.w,    SCREEN_VTX_INVW+2,dst
+    jr ra
+    sb zero,          SCREEN_VTX_PADDING(dst)   # delay slot
+
+    #undef dst
+    #undef vcspos_f
+    #undef vcspos_i
+    #undef vinvw_f
+    #undef vinvw_i
+    #undef vviewscale
+    #undef vviewoff
+    #undef vscreenpos_i
+    #undef vscreenpos_f
+    #undef v___
+    #undef w
+
+    .endfunc
+
+    ################################################################
+    # GL_CalcClipCodes
+    #   Scales the clip space position by CLIP_CODE_FACTORS, compares X/Y/Z against W
+    # Args:
+    #   s3   = Destination vertex address
+    #   $v02 = Clip space position (fractional part)
+    #   $v03 = Clip space position (integer part)
+    # Writes the resulting 6 flag bits to SCREEN_VTX_CLIP_CODE
+    ################################################################
+    .func GL_CalcClipCodes
+GL_CalcClipCodes:
+    #define dst          s3
+    #define vcspos_f     $v02
+    #define vcspos_i     $v03
+    #define vguard_f     $v27
+    #define vguard_i     $v28
+    #define v___         $v29
+    #define w            e3
+
+    li t0, %lo(CLIP_CODE_FACTORS)
+    ldv vguard_i,  0,t0
+
+    vmudn vguard_f, vcspos_f, vguard_i
+    vmadh vguard_i, vcspos_i, vguard_i
+    
+    vch v___, vguard_i, vguard_i.w
+    vcl v___, vguard_f, vguard_f.w
+    cfc2 t0, COP2_CTRL_VCC
+    andi t0, 0x707                 # keep the X/Y/Z lanes of both flag groups
+    srl t1, t0, 5                  # bits 8-10 move down to bits 3-5
+    andi t0, 0x7
+    or t0, t1
+    jr ra
+    sb t0,  SCREEN_VTX_CLIP_CODE(dst)      # delay slot
+
+    #undef dst
+    #undef vcspos_i
+    #undef vcspos_f
+    #undef vguard_i
+    #undef vguard_f
+    #undef v___
+    #undef w
+
+    .endfunc
+
+    ################################################################
+    # GL_TnL
+    #   Scales S/T by the texture size, sets the 0x80 flag in TRCODE, then runs
+    #   GL_CalcScreenSpace and GL_CalcClipCodes on the vertex
+    # Args:
+    #   s3 = address of the vertex in DMEM (usually within VERTEX_CACHE)
+    #
+    ################################################################
+    .func GL_TnL
+GL_TnL:
+    #define vtx          s3
+
+    #define v___         $v01
+    #define vcspos_f     $v02
+    #define vcspos_i     $v03
+    #define vtexsize     $v06
+    #define vtexoffset   $v07
+    #define vst          $v08
+    #define vst_i        $v28
+    #define vst_f        $v29
+    move ra2, ra
+
+    llv vst, SCREEN_VTX_S_T, vtx  # S + T
+
+    li t0, %lo(GL_STATE_TEX_SIZE)
+    llv vtexsize,   0,t0
+    llv vtexoffset, 4,t0          # loaded, but only referenced by the commented-out code below
+
+    # Scale texcoord by texsize and subtract offset (to correct for bilinear sampling if active)
+    #vmudn v___,  vst, vtexsize
+    # vmadh vst, vtexoffset, K1
+
+    #vmudn v___,  vst, vtexsize
+    #vmadh vst,   vtexoffset, K1
+    #vmudl vst,   vst, vtexsize
+
+	vmudh v___,  vst, vtexsize
+	vsar  vst_i, COP2_ACC_HI
+	vsar  vst_f, COP2_ACC_MD
+
+	vmudl vst_f, vst_f, K8192     # NOTE(review): 8192/65536 looks like a right shift by 3 of the 32-bit product - confirm
+	vmadm vst_i, vst_i, K8192
+	vmadn vst,   vzero, vzero
+
+    #undef vst_i
+    #undef vst_f
+
+    lbu t0,    PRIM_VTX_TRCODE(vtx)
+    slv vst,   SCREEN_VTX_S_T, vtx
+
+    ldv vcspos_f, SCREEN_VTX_CS_POSf,vtx
+    ldv vcspos_i, SCREEN_VTX_CS_POSi,vtx
+
+    # Mark this vertex as having T&L applied
+    ori t0, 0x80
+
+    jal GL_CalcScreenSpace
+    sb t0, PRIM_VTX_TRCODE(vtx)
+
+    j GL_CalcClipCodes             # tail call: GL_CalcClipCodes returns straight to our caller
+    move ra, ra2
+
+    #undef vcspos_f
+    #undef vcspos_i
+    #undef vtexsize
+    #undef vtexoffset
+
+    #undef vtx
+
+    #undef v___
+    #undef vrgba
+    #undef vst
+    #undef s
+
+    .endfunc
+
+
+    .func GPUCmd_DrawTriangle
+GPUCmd_DrawTriangle:
+    #define vtx1        a1         // a1-a3 = addresses of the three vertices (set up by GPUCmd_DrawQuad)
+    #define vtx2        a2
+    #define vtx3        a3
+    #define trcode1     t6
+    #define trcode2     t7
+    #define trcode3     t8
+    sw ra, %lo(DRAW_TRI_RA) # TODO find a register for this
+
+    # Trivial reject: if all the vertices are out of the same plane (at least one),
+    # the triangle is out of the viewport.
+    # NOTE: This deliberately uses lb instead of lbu so the sign bit is extended.
+    #       The MSB of each TR-code is a bit flag that is set if the vertex has already
+    #       had T&L applied once.
+    lb trcode1, PRIM_VTX_TRCODE(vtx1)
+    lb trcode2, PRIM_VTX_TRCODE(vtx2)
+    lb trcode3, PRIM_VTX_TRCODE(vtx3)
+    and t0, trcode1, trcode2
+    and t0, trcode3
+    andi t0, 0x3F
+    bnez t0, JrRa
+    nop
+
+    # Perform T&L for each vertex if we haven't already (a non-negative TR-code means the 0x80 flag is clear)
+    bgezal trcode1, GL_TnL
+    move s3, vtx1
+
+    bgezal trcode2, GL_TnL
+    move s3, vtx2
+
+    bgezal trcode3, GL_TnL
+    move s3, vtx3
+
+    lbu t0, SCREEN_VTX_CLIP_CODE(vtx1)
+    lbu t1, SCREEN_VTX_CLIP_CODE(vtx2)
+    lbu t2, SCREEN_VTX_CLIP_CODE(vtx3)
+    or t5, t0, t1
+    or t5, t2
+
+    move s1, zero                  # s1 = s2 = 0 makes the loop test below fall through after a single triangle
+    beqz t5, gl_draw_single_triangle
+    move s2, zero
+
+    jal GL_ClipTriangle
+    nop
+
+    beqz v1, gl_draw_triangle_end  # v1 = byte size of the clipped vertex list, 0 means nothing is left to draw
+    addi s2, -6
+    lhu s5, 0(s1)                  # s5 = first output vertex, shared by every triangle of the fan
+gl_draw_clipped_triangles_loop:
+    move vtx1, s5
+    lhu vtx2, 2(s1)
+    lhu vtx3, 4(s1)
+
+gl_draw_single_triangle:
+    addi vtx1, SCREEN_VTX_X
+    addi vtx2, SCREEN_VTX_X
+    addi vtx3, SCREEN_VTX_X
+    
+    lhu a0, %lo(GL_TRI_CMD)
+    lh  v0, %lo(GL_TRI_CULL)
+    jal RDPQ_Triangle
+    li s3, %lo(RDPQ_CMD_STAGING)
+
+    jal RDPQ_Send
+    li s4, %lo(RDPQ_CMD_STAGING)
+
+    blt s1, s2, gl_draw_clipped_triangles_loop
+    addi s1, 2
+
+gl_draw_triangle_end:
+	lw ra, %lo(DRAW_TRI_RA)
+    jr ra
+    nop
+
+    #undef vtx1
+    #undef vtx2
+    #undef vtx3
+    .endfunc
+
+#include "rsp_gpu_clipping.inc"
+#include <rsp_rdpq.inc>
--- a/misc/n64/rsp_gpu_clipping.inc
+++ b/misc/n64/rsp_gpu_clipping.inc
@ -0,0 +1,380 @@
+#define CLIPPING_PLANE_COUNT  6
+#define CLIPPING_CACHE_SIZE   9 // number of SCREEN_VTX_SIZE slots in CLIP_CACHE
+#define CLIPPING_PLANE_SIZE   8 // bytes per CLIP_PLANES row (4 halfwords)
+
+    .section .data.gl_clipping
+
+    .align 4
+CLIP_PLANES:    # row i is used when bit i of the clip flags is set (plane_flag starts at 1 and shifts left per row)
+    .half 1, 0, 0, GUARD_BAND_FACTOR
+    .half 0, 1, 0, GUARD_BAND_FACTOR
+    .half 0, 0, 1, 1
+    .half 1, 0, 0, -GUARD_BAND_FACTOR
+    .half 0, 1, 0, -GUARD_BAND_FACTOR
+    .half 0, 0, 1, -1
+
+    .align 4
+CACHE_OFFSETS: .half 2,4,6,8,10,12,14,16,18    # slot index * 2 for the 9 clip cache slots
+
+    .section .bss.gl_clipping
+
+CLIP_CACHE: .dcb.b     SCREEN_VTX_SIZE * CLIPPING_CACHE_SIZE
+CLIP_CACHE_END:
+
+CLIP_LISTS:     # two lists of 16-bit vertex addresses, swapped between input and output for every plane
+    CLIP_LIST0: .dcb.w  CLIPPING_CACHE_SIZE
+    CLIP_LIST1: .dcb.w  CLIPPING_CACHE_SIZE
+
+
+    .section .text.gl_clipping
+
+    ################################################################
+    # GL_ClipTriangle
+    #   Clip a triangle against the view-frustum by using the Sutherland-Hodgman algorithm
+    #   https://en.wikipedia.org/wiki/Sutherland%E2%80%93Hodgman_algorithm
+    # Args:
+    #   a1-a3 = Vertices
+    #   t5    = OR'd clip flags of the triangle's vertices
+    # Returns:
+    #   s1    = Pointer to list of output vertices (v1 = size of that list in bytes)
+    #   s2    = Pointer to end of list
+    ################################################################
+    .func GL_ClipTriangle
+GL_ClipTriangle:
+    #define out_count       v1
+    #define clip_flags      t5
+    #define plane_flag      t6
+    #define in_count        t7
+    #define in_end          t8
+    #define in_list         s0
+    #define out_list        s1
+    #define plane           s2
+    #define intersection    s3
+    #define cur_ptr         s4
+    #define prev_ptr        s5
+    #define cur_vtx         s6
+    #define prev_vtx        s7
+    #define p0              k0
+    #define p1              k1
+    #define vtx1            a1
+    #define vtx2            a2
+    #define vtx3            a3
+
+    #define vplane          $v01
+    #define vint_f          $v02
+    #define vint_i          $v03
+    #define vdot_i          $v04
+    #define vdot_f          $v05
+    #define vdiff_i         $v06
+    #define vdiff_f         $v07
+    #define va_i            $v08
+    #define va_f            $v09
+    #define vpos_i          $v10
+    #define vpos_f          $v11
+    #define vattr0          $v12
+    #define vattr1          $v13
+    #define voff0           $v14
+    #define voff1           $v15
+    #define vcache0         $v16
+    #define vcache1         $v17
+    #define v__             $v29
+
+    move ra2, ra
+
+    # Init in_list as empty
+    li in_list, %lo(CLIP_LIST0)
+    move in_count, zero
+
+    # Put three original vertices in the out_list
+    # (So after the initial swap they will be in the in_list)
+    li out_list, %lo(CLIP_LIST1)
+    sh vtx1, 0(out_list)
+    sh vtx2, 2(out_list)
+    sh vtx3, 4(out_list)
+    li out_count, 3*2
+
+    li plane, %lo(CLIP_PLANES)
+    li plane_flag, 1
+
+    # Load cache offsets    
+    li t0, %lo(CACHE_OFFSETS)
+    vxor voff1, voff1
+    lqv voff0,  0,t0
+    lsv voff1, 16,t0
+
+    # Temporarily use the RDP staging area as a map of which cache slots are used
+    # Init to zero
+    li t0, %lo(RDPQ_CMD_STAGING)
+    sqv vzero,  0,t0
+    sqv vzero, 16,t0
+
+    # Iterate over the 6 clipping planes
+gl_clip_plane_loop:
+    and t0, clip_flags, plane_flag
+    beqz t0, gl_clip_plane_loop_end
+    move t1, in_list
+
+    # Swap in and out lists
+
+    # If the out list is empty from the last iteration, 
+    # the triangle has no visible points and we are done
+    beqz out_count, gl_clip_return
+    move in_list, out_list
+    move out_list, t1
+    move in_count, out_count
+    move out_count, zero
+
+    # Iterate over the edges of the polygon in the input list
+    # The current edge is between cur_vtx and prev_vtx
+    move cur_ptr, in_list
+    add in_end, in_list, in_count
+    # Init the "previous" vertex to the last in the list for the wrap-around
+    addi prev_ptr, in_end, -2
+
+gl_clip_edge_loop:
+    #define cur_flag  t3
+    #define prev_flag t4
+
+    # Check which side of the plane the two vertices are on
+    lhu cur_vtx, 0(cur_ptr)
+    lhu prev_vtx, 0(prev_ptr)
+    lbu cur_flag, SCREEN_VTX_CLIP_CODE(cur_vtx)
+    lbu prev_flag, SCREEN_VTX_CLIP_CODE(prev_vtx)
+    and cur_flag, plane_flag
+    and prev_flag, plane_flag
+
+    # If they are on opposite sides, there is an intersection
+    xor t0, cur_flag, prev_flag
+    beqz t0, gl_clip_no_intersection
+    move p0, cur_vtx
+
+    # Swap the two points if necessary to make intersection calculation consistent
+    # This will make sure p0 is always inside and p1 is always outside
+    bnez prev_flag, gl_clip_no_swap
+    move p1, prev_vtx
+    xor p0, p0, p1
+    xor p1, p0, p1
+    xor p0, p0, p1
+
+    #undef prev_flag
+
+gl_clip_no_swap:
+    # Calculate intersection of the line segment and the plane
+
+    li t0, %lo(RDPQ_CMD_STAGING)
+    lqv vcache0,    0,t0
+    lqv vcache1,   16,t0
+
+    # Repeat plane coefficients twice
+    ldv vplane.e0,  0,plane
+    ldv vplane.e4,  0,plane
+
+    # vpos: x0  y0  z0  w0  x1  y1  z1  w1
+    ldv vpos_i.e0,  SCREEN_VTX_CS_POSi,p0
+    ldv vpos_f.e0,  SCREEN_VTX_CS_POSf,p0
+    ldv vpos_i.e4,  SCREEN_VTX_CS_POSi,p1
+    ldv vpos_f.e4,  SCREEN_VTX_CS_POSf,p1
+
+    # vint: x1  y1  z1  w1
+    ldv vint_i.e0,  SCREEN_VTX_CS_POSi,p1
+    ldv vint_f.e0,  SCREEN_VTX_CS_POSf,p1
+
+    # vattr0: r0  g0  b0  a0  s0  t0
+    luv vattr0.e0,  SCREEN_VTX_RGBA   ,p0
+    llv vattr0.e4,  SCREEN_VTX_S_T    ,p0
+
+    # vattr1: r1  g1  b1  a1  s1  t1
+    luv vattr1.e0,  SCREEN_VTX_RGBA   ,p1
+    llv vattr1.e4,  SCREEN_VTX_S_T    ,p1
+
+    # Find first free slot in clip cache
+
+    # Add the values from the "used slots map" to the cache offsets
+    # After this, each lane will contain the offset of its corresponding cache slot,
+    # but only if the slot is not used. If it is used, it will contain some large value.
+    vaddc vcache0, voff0
+    vaddc vcache1, voff1
+
+    # Look for the smallest value, which will end up in vcache.e0
+    # Because used slots are marked as large values, they will never be found.
+    vlt vcache0, vcache0.q1
+    vlt vcache0, vcache0.h2
+    vlt vcache0, vcache0.e4
+    vlt vcache0, vcache1.e0
+
+    mfc2 t0, vcache0.e0
+
+    # Mark slot as used by storing some large value (careful of overflows!)
+    li t1, 0xFF
+    sh t1, %lo(RDPQ_CMD_STAGING)-2(t0)
+
+    # t0 is the index multiplied by 2
+    # intersection = t0 * 20 = t0 * 16 + t0 * 4
+    sll intersection, t0, 4
+    sll t1, t0, 2
+    add intersection, t1
+
+    # CAUTION: intersection might point to the same address as either p0 or p1,
+    # because one of them is the previous point, which could have been marked unused
+    # in the previous iteration. As long as we don't access p0 or p1 after writing to
+    # intersection, this is fine.
+    addi intersection, %lo(CLIP_CACHE) - SCREEN_VTX_SIZE
+
+    # Store the cache offset in unused memory (used later when finding the cache slot to mark as unused)
+    sb t0, SCREEN_VTX_PADDING(intersection)
+
+    # Compute dot products of both positions with the clip plane
+    # vdot.e0: d0 = dot(p0, plane)
+    # vdot.e4: d1 = dot(p1, plane)
+    vmudn vdot_f, vpos_f, vplane
+    vmadh vdot_i, vpos_i, vplane
+    vaddc vdot_f, vdot_f.q1
+    vadd  vdot_i, vdot_i.q1
+    vaddc vdot_f, vdot_f.h2
+    vadd  vdot_i, vdot_i.h2
+
+    # d0 - d1
+    vsubc vdiff_f, vdot_f, vdot_f.e4
+    vsub  vdiff_i, vdot_i, vdot_i.e4
+
+    # 1 / (d0 - d1)
+    vrcph v__.e0,  vdiff_i.e0
+    vrcpl va_f.e0, vdiff_f.e0
+    vrcph va_i.e0, vzero.e0
+
+    # a = d0 / (d0 - d1)
+    vmudl v__,  va_f, vdot_f.e0
+    vmadm v__,  va_i, vdot_f.e0
+    vmadn va_f, va_f, vdot_i.e0
+
+    # Prepare 0x7FFF in va_i.e0
+    vsubc va_i, vshift8, K1
+
+    # a = min(a, 1)
+    vge  v__,  va_f, vzero
+    vmrg va_f, va_f, va_i.e0
+
+    # Account for right shift introduced by vrcp
+    vmudn va_f, va_f, K2
+
+    # p1 - p0
+    vsubc vint_f, vpos_f
+    vsub  vint_i, vpos_i
+    # attr1 - attr0
+    vsubc vattr1, vattr0
+
+    # Result of linear interpolation:
+    # p0 + a * (p1 - p0)
+    vmudl v__,    vint_f, va_f.e0
+    vmadm v__,    vint_i, va_f.e0
+    vmadn vint_f, vpos_f, K1
+    vmadh vint_i, vpos_i, K1
+
+    # a * (attr1 - attr0)
+    vmudm vattr1, vattr1, va_f.e0
+
+    # attr0 + a * (attr1 - attr0)
+    vaddc vattr0, vattr1
+
+    # Store results (GL_CalcClipCodes takes its input from vint_f/vint_i = $v02/$v03 and intersection = s3)
+    sdv vint_i.e0,  SCREEN_VTX_CS_POSi,intersection
+    sdv vint_f.e0,  SCREEN_VTX_CS_POSf,intersection
+    suv vattr0.e0,  SCREEN_VTX_RGBA   ,intersection
+    jal GL_CalcClipCodes
+    slv vattr0.e4,  SCREEN_VTX_S_T    ,intersection
+
+    # Add intersection to the output list
+    add t0, out_list, out_count
+    sh intersection, 0(t0)
+    addi out_count, 2
+
+gl_clip_no_intersection:
+    # If cur_vtx is inside, add it to the output list
+    bnez cur_flag, gl_clip_no_current
+    add t0, out_list, out_count
+    sh cur_vtx, 0(t0)
+    b gl_clip_edge_loop_end
+    addi out_count, 2
+
+    #undef cur_flag
+
+gl_clip_no_current:
+    # Check if the vertex is stored in the clip cache
+    lbu t0, SCREEN_VTX_PADDING(cur_vtx)
+    beqz t0, gl_clip_edge_loop_end
+    # Reset the padding field to zero, so the screen space values won't be recalculated below
+    sb zero, SCREEN_VTX_PADDING(cur_vtx)
+    # If so, mark it as unused
+    sh zero, %lo(RDPQ_CMD_STAGING)-2(t0)
+    
+gl_clip_edge_loop_end:
+    # Advance to the next edge
+    addi cur_ptr, 2
+    blt cur_ptr, in_end, gl_clip_edge_loop
+    addi prev_ptr, cur_ptr, -2
+
+gl_clip_plane_loop_end:
+    # Advance to the next clipping plane
+    sll plane_flag, 1
+    blt plane_flag, (1<<CLIPPING_PLANE_COUNT), gl_clip_plane_loop
+    addi plane, CLIPPING_PLANE_SIZE
+
+    #define cache_vtx s3
+    #define cache_end s5
+
+    # Calculate screen space values for new vertices (in the clip cache)
+    # TODO: maybe iterate over out_list instead
+    li cache_vtx, %lo(CLIP_CACHE)
+    li cache_end, %lo(CLIP_CACHE_END) - SCREEN_VTX_SIZE
+gl_clip_finalize_loop:
+    lbu t0, SCREEN_VTX_PADDING(cache_vtx)
+    neg t0
+
+    # Only calculate screen space values if the vertex is actually used (non-zero padding byte)
+    ldv vint_i,  SCREEN_VTX_CS_POSi,cache_vtx
+    bltzal t0, GL_CalcScreenSpace
+    ldv vint_f,  SCREEN_VTX_CS_POSf,cache_vtx
+
+    blt cache_vtx, cache_end, gl_clip_finalize_loop
+    addi cache_vtx, SCREEN_VTX_SIZE
+
+gl_clip_return:
+    # Done!
+    jr ra2
+    add s2, out_list, out_count
+
+    #undef cache_vtx
+    #undef cache_end
+    #undef clip_flags
+    #undef plane_flag
+    #undef in_count
+    #undef out_count
+    #undef in_end
+    #undef intersection
+    #undef in_list
+    #undef out_list
+    #undef plane
+    #undef cur_ptr
+    #undef prev_ptr
+    #undef cur_vtx
+    #undef prev_vtx
+    #undef p0
+    #undef p1
+    #undef vtx1
+    #undef vtx2
+    #undef vtx3
+    #undef vplane
+    #undef vpos_i
+    #undef vpos_f
+    #undef vdot_i
+    #undef vdot_f
+    #undef vdiff_i
+    #undef vdiff_f
+    #undef va_f
+    #undef vint_i
+    #undef vint_f
+    #undef vattr0
+    #undef vattr1
+    #undef v__
+
+    .endfunc
--- a/src/Graphics_N64.c
+++ b/src/Graphics_N64.c
@ -5,23 +5,34 @@
 #include "Logger.h"
 #include "Window.h"
 #include <libdragon.h>
-#include <GL/gl.h>
-#include <GL/gl_integration.h>
 #include <malloc.h>
-
-typedef void (*GL_SetupVBFunc)(void);
-static GL_SetupVBFunc gfx_setupVBFunc;
-
+#include <rspq_profile.h>
+#include "../misc/n64/gpu.c"

 /*########################################################################################################################*
 *---------------------------------------------------------General---------------------------------------------------------*
 *#########################################################################################################################*/
 static surface_t zbuffer;
+static GfxResourceID white_square;

 void Gfx_Create(void) {
-    gl_init();
+	rspq_init();
+	//rspq_profile_start();
+    rdpq_init();
    //rdpq_debug_start(); // TODO debug
    //rdpq_debug_log(true);
+
+	rdpq_set_mode_standard();
+	__rdpq_mode_change_som(SOM_TEXTURE_PERSP, SOM_TEXTURE_PERSP);
+	__rdpq_mode_change_som(SOM_ZMODE_MASK,    SOM_ZMODE_OPAQUE);
+	rdpq_mode_dithering(DITHER_SQUARE_SQUARE);
+
+    gpu_init();
+
+	// Set alpha compare threshold
+	gpu_push_rdp(RDP_CMD_SYNC_PIPE, 0);
+	gpu_push_rdp(RDP_CMD_SET_BLEND_COLOR, (0 << 24) | (0 << 16) | (0 << 8) | 127);
+
    zbuffer = surface_alloc(FMT_RGBA16, display_get_width(), display_get_height());
    
 	Gfx.MaxTexWidth  = 256;
@ -36,6 +47,9 @@ void Gfx_Create(void) {

 	Gfx.SupportsNonPowTwoTextures = true;
 	Gfx_RestoreState();
+
+	Gfx_SetFaceCulling(false);
+	Gfx_SetViewport(0, 0, Game.Width, Game.Height);
 }

 cc_bool Gfx_TryRestoreContext(void) {
@ -44,11 +58,9 @@ cc_bool Gfx_TryRestoreContext(void) {

 void Gfx_Free(void) {
 	Gfx_FreeState();
-	gl_close();
+	gpu_close(); // presumably undoes gpu_init() from Gfx_Create - resources are released first
 }

-#define gl_Toggle(cap) if (enabled) { glEnable(cap); } else { glDisable(cap); }
-

 /*########################################################################################################################*
 *-----------------------------------------------------------Misc----------------------------------------------------------*
@ -73,21 +85,17 @@ void Gfx_SetVSync(cc_bool vsync) {
 void Gfx_OnWindowResize(void) { }

 void Gfx_SetViewport(int x, int y, int w, int h) {
-	glViewport(x, Game.Height - h - y, w, h);
-}
-void Gfx_SetScissor (int x, int y, int w, int h) {
-	cc_bool enabled = x != 0 || y != 0 || w != Game.Width || h != Game.Height;
-	if (enabled) { glEnable(GL_SCISSOR_TEST); } else { glDisable(GL_SCISSOR_TEST); }
-
-	glScissor(x, Game.Height - h - y, w, h);
+	gpuViewport(x, y, w, h); // y is passed through as-is; the old GL call flipped it against Game.Height
 }

+// Limits rendering to the rectangle whose top-left corner is (x, y) and whose size is w x h
+void Gfx_SetScissor(int x, int y, int w, int h) {
+	// rdpq_set_scissor is given the far corner of the rectangle rather than its size
+	int right  = x + w;
+	int bottom = y + h;
+
+	rdpq_set_scissor(x, y, right, bottom);
+}

 void Gfx_BeginFrame(void) {
 	surface_t* disp = display_get();
    rdpq_attach(disp, &zbuffer);
    
-	gl_context_begin();
 	Platform_LogConst("GFX ctx beg");
 }

@ -113,9 +121,11 @@ void Gfx_ClearColor(PackedCol color) {

 void Gfx_EndFrame(void) {
 	Platform_LogConst("GFX ctx end");
-	gl_context_end();
    rdpq_detach_show();
-//Platform_LogConst("GFX END");
+	//Platform_LogConst("GFX END");
+
+	//rspq_profile_dump();
+	//rspq_profile_next_frame();
 }


@ -124,14 +134,32 @@ void Gfx_EndFrame(void) {
 *#########################################################################################################################*/
 typedef struct CCTexture {
 	surface_t surface;
-	GLuint textureID;
+	rspq_block_t* upload_block; // recorded by UploadTexture, replayed by Gfx_BindTexture
 } CCTexture;

+// Makes the given texture current; a NULL id selects the 1x1 white fallback texture instead
+void Gfx_BindTexture(GfxResourceID texId) {
+	CCTexture* tex = (CCTexture*)(texId ? texId : white_square);
+
+	// Replay the pre-recorded upload commands, then pass the texture's dimensions to the GPU layer
+	rspq_block_run(tex->upload_block);
+	gpuSetTexSize(tex->surface.width, tex->surface.height);
+}
+
 #define ALIGNUP8(size) (((size) + 7) & ~0x07)

-// A8 B8 G8 R8 > A1 B5 G5 B5
+// A8 B8 G8 R8 > A1 B5 G5 R5
 #define To16BitPixel(src) \
-	((src & 0x80) >> 7) | ((src & 0xF800) >> 10) | ((src & 0xF80000) >> 13) | ((src & 0xF8000000) >> 16);	
+	((src & 0x80) >> 7) | ((src & 0xF800) >> 10) | ((src & 0xF80000) >> 13) | ((src & 0xF8000000) >> 16);
+
+// Records the commands that upload tex's surface to TILE0 into a block that can be replayed later.
+// Any block previously stored in tex->upload_block is overwritten, not freed.
+static void UploadTexture(CCTexture* tex, rdpq_texparms_t* params) {
+	rspq_block_t* recorded;
+
+	rspq_block_begin();
+	{
+		rdpq_tex_multi_begin();
+		rdpq_tex_upload(TILE0, &tex->surface, params);
+		rdpq_tex_multi_end();
+	}
+	recorded = rspq_block_end();
+
+	// Replayed by Gfx_BindTexture each time the texture is bound
+	tex->upload_block = recorded;
+}

 GfxResourceID Gfx_AllocTexture(struct Bitmap* bmp, int rowWidth, cc_uint8 flags, cc_bool mipmaps) {
 	cc_bool bit16  = flags & TEXTURE_FLAG_LOWRES;
@ -141,15 +169,8 @@ GfxResourceID Gfx_AllocTexture(struct Bitmap* bmp, int rowWidth, cc_uint8 flags,
 	if (pitch * bmp->height > 4096) return 0;
 	
 	CCTexture* tex = Mem_Alloc(1, sizeof(CCTexture), "texture");
-	
-	glGenTextures(1, &tex->textureID);
-	glBindTexture(GL_TEXTURE_2D, tex->textureID);
-	// NOTE: Enabling these fixes textures, but seems to break on cen64
-	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, mipmaps ? GL_LINEAR : GL_NEAREST);
-	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, mipmaps ? GL_LINEAR : GL_NEAREST);
-	
-	tex->surface  = surface_alloc(bit16 ? FMT_RGBA16 : FMT_RGBA32, bmp->width, bmp->height);
-	surface_t* fb = &tex->surface;
+	tex->surface   = surface_alloc(bit16 ? FMT_RGBA16 : FMT_RGBA32, bmp->width, bmp->height);
+	surface_t* fb  = &tex->surface;
 		
 	if (bit16) {
 		cc_uint32* src = (cc_uint32*)bmp->scan0;
@ -172,33 +193,17 @@ GfxResourceID Gfx_AllocTexture(struct Bitmap* bmp, int rowWidth, cc_uint8 flags,
 						bmp, rowWidth * BITMAPCOLOR_SIZE);
 	}
 	
-	
 	rdpq_texparms_t params =
 	{
        .s.repeats = (flags & TEXTURE_FLAG_NONPOW2) ? 1 : REPEAT_INFINITE,
        .t.repeats = (flags & TEXTURE_FLAG_NONPOW2) ? 1 : REPEAT_INFINITE,
    };
-
-	// rdpq_tex_upload(TILE0, &tex->surface, &params);
-	glSurfaceTexImageN64(GL_TEXTURE_2D, 0, fb, &params);
+	UploadTexture(tex, &params);
 	return tex;
 }

-void Gfx_BindTexture(GfxResourceID texId) {
-	CCTexture* tex = (CCTexture*)texId;
-	GLuint glID = tex ? tex->textureID : 0;
-	//Platform_Log1("BIND: %i", &glID);
-	
-    //rdpq_debug_log(true);
-	glBindTexture(GL_TEXTURE_2D, glID);	
-   // rdpq_debug_log(false);
-}
-
 void Gfx_UpdateTexture(GfxResourceID texId, int x, int y, struct Bitmap* part, int rowWidth, cc_bool mipmaps) {
-	// TODO: Just memcpying doesn't actually work. maybe due to glSurfaceTexImageN64 caching the RSQ upload block?
-	// TODO: Is there a more optimised approach than just calling glSurfaceTexImageN64
 	CCTexture* tex = (CCTexture*)texId;
-	
 	surface_t* fb  = &tex->surface;
 	cc_uint32* src = (cc_uint32*)part->scan0 + x;
 	cc_uint8*  dst = (cc_uint8*)fb->buffer  + (x * 4) + (y * fb->stride);
@ -210,21 +215,22 @@ void Gfx_UpdateTexture(GfxResourceID texId, int x, int y, struct Bitmap* part, i
 				 part->width * 4);
 	}
 	
-	
-	glBindTexture(GL_TEXTURE_2D, tex->textureID);
 	rdpq_texparms_t params = (rdpq_texparms_t){
        .s.repeats = REPEAT_INFINITE,
        .t.repeats = REPEAT_INFINITE,
    };
-	// rdpq_tex_upload(TILE0, &tex->surface, &params);
-	glSurfaceTexImageN64(GL_TEXTURE_2D, 0, fb, &params);
+
+	rdpq_call_deferred((void (*)(void*))rspq_block_free, tex->upload_block);
+	UploadTexture(tex, &params);
 }

 void Gfx_DeleteTexture(GfxResourceID* texId) {
 	CCTexture* tex = (CCTexture*)(*texId);
 	if (!tex) return;
 	
-	glDeleteTextures(1, &tex->textureID);
+	if (tex->upload_block) rdpq_call_deferred((void (*)(void*))rspq_block_free, tex->upload_block);
+	surface_free(&tex->surface);
+
 	Mem_Free(tex);
 	*texId = NULL;
 }
@ -236,29 +242,46 @@ void Gfx_DisableMipmaps(void) { }
 /*########################################################################################################################*
 *-----------------------------------------------------State management----------------------------------------------------*
 *#########################################################################################################################*/
-void Gfx_SetFaceCulling(cc_bool enabled)   { gl_Toggle(GL_CULL_FACE); }
-static void SetAlphaBlend(cc_bool enabled) { gl_Toggle(GL_BLEND); }
-void Gfx_SetAlphaArgBlend(cc_bool enabled) { }
-
-static void SetColorWrite(cc_bool r, cc_bool g, cc_bool b, cc_bool a) {
-	//glColorMask(r, g, b, a); TODO
+void Gfx_SetFaceCulling(cc_bool enabled) { 
+	gpuSetCullFace(enabled); // forwarded to the GPU layer rather than set through an rdpq mode call
 }

-void Gfx_SetDepthWrite(cc_bool enabled) { glDepthMask(enabled); }
-void Gfx_SetDepthTest(cc_bool enabled) { gl_Toggle(GL_DEPTH_TEST); }
+// Switches between opaque rendering and RDPQ_BLENDER_MULTIPLY blending,
+//  selecting the matching Z mode (opaque / transparent) at the same time
+static void SetAlphaBlend(cc_bool enabled) {
+	rdpq_blender_t blender = 0;
+	uint64_t zmode = SOM_ZMODE_OPAQUE;
+
+	if (enabled) {
+		blender = RDPQ_BLENDER_MULTIPLY;
+		zmode   = SOM_ZMODE_TRANSPARENT;
+	}
+
+	rdpq_mode_blender(blender);
+	__rdpq_mode_change_som(SOM_ZMODE_MASK, zmode);
+}
+
+void Gfx_SetAlphaArgBlend(cc_bool enabled) { }
+
+// Turns threshold based alpha comparison on or off (the threshold itself is set up in Gfx_Create)
+static void SetAlphaTest(cc_bool enabled) {
+	if (enabled) {
+		__rdpq_mode_change_som(SOM_ALPHACOMPARE_MASK, SOM_ALPHACOMPARE_THRESHOLD);
+	} else {
+		__rdpq_mode_change_som(SOM_ALPHACOMPARE_MASK, 0);
+	}
+}
+
+static void SetColorWrite(cc_bool r, cc_bool g, cc_bool b, cc_bool a) {
+	// TODO: colour write masking is not implemented, so all four flags are ignored - was: gpuColorMask(r, g, b, a);
+}
+
+// Sets or clears the SOM_Z_WRITE bit of the RDP render mode
+void Gfx_SetDepthWrite(cc_bool enabled) {
+	uint64_t flag = enabled ? SOM_Z_WRITE : 0;
+	__rdpq_mode_change_som(SOM_Z_WRITE, flag);
+}
+
+// Sets or clears the SOM_Z_COMPARE bit of the RDP render mode, then mirrors the
+//  flag into gpu_attr_z and has the GPU layer refresh its format
+void Gfx_SetDepthTest(cc_bool enabled) {
+	uint64_t compare = 0;
+	if (enabled) compare = SOM_Z_COMPARE;
+
+	__rdpq_mode_change_som(SOM_Z_COMPARE, compare);
+
+	gpu_attr_z = enabled;
+	gpuUpdateFormat();
+}

 static void Gfx_FreeState(void) { FreeDefaultResources(); }
 static void Gfx_RestoreState(void) {
 	InitDefaultResources();
-	glEnableClientState(GL_VERTEX_ARRAY);
-	glEnableClientState(GL_COLOR_ARRAY);
 	gfx_format = -1;
-
-	glHint(GL_FOG_HINT, GL_NICEST);
-	glAlphaFunc(GL_GREATER, 0.5f);
-	glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
-	glDepthFunc(GL_LESS);
-	//glEnable(GL_RDPQ_TEXTURING_N64);
+	
+	// 1x1 dummy white texture
+	struct Bitmap bmp;
+	BitmapCol pixels[1] = { BITMAPCOLOR_WHITE };
+	Bitmap_Init(bmp, 1, 1, pixels);
+	white_square = Gfx_CreateTexture(&bmp, 0, false);
 }

 cc_bool Gfx_WarnIfNecessary(void) { return false; }
@ -348,8 +371,8 @@ static rspq_block_t* VB_GetCached(struct VertexBuffer* vb, int offset, int count
 		if (vb->cache.blocks[i]) continue;

 		rspq_block_begin();
-		gfx_setupVBFunc();
-		glDrawArrays(GL_QUADS, offset, count);
+		gpu_pointer = gfx_vb->vertices;
+		gpuDrawArrays(offset, count);
 		rspq_block_t* block = rspq_block_end();

 		vb->cache.blocks[i] = block;
@ -435,80 +458,64 @@ void Gfx_SetFogEnd(float value) {
 void Gfx_SetFogMode(FogFunc func) {
 }

-static void SetAlphaTest(cc_bool enabled) {
-	if (enabled) { glEnable(GL_ALPHA_TEST); } else { glDisable(GL_ALPHA_TEST); }
-}
-
 void Gfx_DepthOnlyRendering(cc_bool depthOnly) {
 	depthOnlyRendering = depthOnly; // TODO: Better approach? maybe using glBlendFunc instead?
 	cc_bool enabled    = !depthOnly;
+
 	//SetColorWrite(enabled & gfx_colorMask[0], enabled & gfx_colorMask[1], 
 	//			  enabled & gfx_colorMask[2], enabled & gfx_colorMask[3]);
-	if (enabled) { glEnable(GL_TEXTURE_2D); } else { glDisable(GL_TEXTURE_2D); }
+	gpu_attr_tex = enabled; // NOTE(review): ignores gpu_texturing, so this is also set for an untextured format - confirm intended
+	gpuUpdateFormat();
 }


 /*########################################################################################################################*
 *---------------------------------------------------------Matrices--------------------------------------------------------*
 *#########################################################################################################################*/
-static GLenum matrix_modes[3] = { GL_PROJECTION, GL_MODELVIEW, GL_TEXTURE };
-static int lastMatrix;
+static struct Matrix _view, _proj;

 void Gfx_LoadMatrix(MatrixType type, const struct Matrix* matrix) {
-	if (type != lastMatrix) { lastMatrix = type; glMatrixMode(matrix_modes[type]); }
+	if (type == MATRIX_VIEW) _view = *matrix; // cached so the combined matrix can be rebuilt when only one part changes
+	if (type == MATRIX_PROJ) _proj = *matrix;
 
-	if (matrix == &Matrix_Identity) {
-		glLoadIdentity();
-	} else {
-		glLoadMatrixf((const float*)matrix);
-	}
+	struct Matrix mvp __attribute__((aligned(64)));	
+	Matrix_Mul(&mvp, &_view, &_proj); // recombined on every call, whichever matrix type was passed in
+	gpuLoadMatrix((const float*)&mvp); // the GPU layer is only ever given the combined matrix
 }

 void Gfx_LoadMVP(const struct Matrix* view, const struct Matrix* proj, struct Matrix* mvp) {
-	Gfx_LoadMatrix(MATRIX_VIEW, view);
-	Gfx_LoadMatrix(MATRIX_PROJ, proj);
+	_proj = *proj; // keeps the cache used by Gfx_LoadMatrix in sync
+	_view = *view;
+
 	Matrix_Mul(mvp, view, proj);
+	gpuLoadMatrix((const float*)mvp); // one upload of the combined matrix, which is also returned through mvp
 }

-static struct Matrix texMatrix = Matrix_IdentityValue;
 void Gfx_EnableTextureOffset(float x, float y) {
-	texMatrix.row4.x = x; texMatrix.row4.y = y;
-	Gfx_LoadMatrix(2, &texMatrix);
+	// TODO: texture offsets are not implemented, so x and y are currently ignored
 }
 
-void Gfx_DisableTextureOffset(void) { Gfx_LoadMatrix(2, &Matrix_Identity); }
+void Gfx_DisableTextureOffset(void) { } // nothing to undo while Gfx_EnableTextureOffset is a stub


 /*########################################################################################################################*
 *--------------------------------------------------------Rendering--------------------------------------------------------*
 *#########################################################################################################################*/
-static void GL_SetupVbColoured(void) {
-	glVertexPointer(3, GL_FLOAT,        SIZEOF_VERTEX_COLOURED, (void*)(gfx_vb->vertices + 0));
-	glColorPointer(4, GL_UNSIGNED_BYTE, SIZEOF_VERTEX_COLOURED, (void*)(gfx_vb->vertices + 12));
-}
-
-static void GL_SetupVbTextured(void) {
-	glVertexPointer(3, GL_FLOAT,        SIZEOF_VERTEX_TEXTURED, (void*)(gfx_vb->vertices + 0));
-	glColorPointer(4, GL_UNSIGNED_BYTE, SIZEOF_VERTEX_TEXTURED, (void*)(gfx_vb->vertices + 12));
-	glTexCoordPointer(2, GL_FLOAT,      SIZEOF_VERTEX_TEXTURED, (void*)(gfx_vb->vertices + 16));
-}
-
 void Gfx_SetVertexFormat(VertexFormat fmt) {
 	if (fmt == gfx_format) return;
 	gfx_format = fmt;
 	gfx_stride = strideSizes[fmt];
+	gpu_stride = gfx_stride; // keeps the GPU layer's vertex stride in step with the current format
 
 	if (fmt == VERTEX_FORMAT_TEXTURED) {
-		glEnableClientState(GL_TEXTURE_COORD_ARRAY);
-		glEnable(GL_TEXTURE_2D);
-
-		gfx_setupVBFunc = GL_SetupVbTextured;
+		rdpq_mode_combiner(RDPQ_COMBINER_TEX_SHADE); // texture combined with the vertex (shade) colour
 	} else {
-		glDisableClientState(GL_TEXTURE_COORD_ARRAY);
-		glDisable(GL_TEXTURE_2D);
-
-		gfx_setupVBFunc = GL_SetupVbColoured;
+		rdpq_mode_combiner(RDPQ_COMBINER_SHADE); // vertex (shade) colour only
 	}
+
+	gpu_texturing = fmt == VERTEX_FORMAT_TEXTURED;
+	gpu_attr_tex = gpu_texturing; // NOTE(review): overrides whatever Gfx_DepthOnlyRendering last set - confirm intended
+	gpuUpdateFormat();
 }

 void Gfx_DrawVb_Lines(int verticesCount) {
@ -520,8 +527,8 @@ void Gfx_DrawVb_IndexedTris_Range(int verticesCount, int startVertex, DrawHints
 	if (block) {
 		rspq_block_run(block);
 	} else {
-		gfx_setupVBFunc();
-		glDrawArrays(GL_QUADS, startVertex, verticesCount);
+		gpu_pointer = gfx_vb->vertices;
+		gpuDrawArrays(startVertex, verticesCount);
 	}
 }

@ -531,8 +538,8 @@ void Gfx_DrawVb_IndexedTris(int verticesCount) {
 	if (block) {
 		rspq_block_run(block);
 	} else {
-		gfx_setupVBFunc();
-		glDrawArrays(GL_QUADS, 0, verticesCount);
+		gpu_pointer = gfx_vb->vertices;
+		gpuDrawArrays(0, verticesCount);
 	}
 }

@ -543,10 +550,8 @@ void Gfx_DrawIndexedTris_T2fC4b(int verticesCount, int startVertex) {
 	if (block) {
 		rspq_block_run(block);
 	} else {
-		glVertexPointer(3, GL_FLOAT,        SIZEOF_VERTEX_TEXTURED, (void*)(gfx_vb->vertices));
-		glColorPointer(4, GL_UNSIGNED_BYTE, SIZEOF_VERTEX_TEXTURED, (void*)(gfx_vb->vertices + 12));
-		glTexCoordPointer(2, GL_FLOAT,      SIZEOF_VERTEX_TEXTURED, (void*)(gfx_vb->vertices + 16));
-		glDrawArrays(GL_QUADS, startVertex, verticesCount);
+		gpu_pointer = gfx_vb->vertices;
+		gpuDrawArrays(startVertex, verticesCount);
 	}
 }
 #endif