diff --git a/misc/n64/gpu.c b/misc/n64/gpu.c index 201bf6ffb..b1611871b 100644 --- a/misc/n64/gpu.c +++ b/misc/n64/gpu.c @@ -30,7 +30,7 @@ static rsp_ucode_t rsp_gpu = (rsp_ucode_t){ enum { GPU_CMD_SET_SHORT = 0x0, - GPU_CMD_SET_WORD = 0x1, + GPU_CMD_SET_TEX_WORD = 0x1, GPU_CMD_SET_LONG = 0x2, GPU_CMD_DRAW_QUAD = 0x3, @@ -42,8 +42,8 @@ enum { typedef struct { int16_t vp_scale[4]; int16_t vp_offset[4]; - uint16_t tex_size[2]; - uint16_t tex_offset[2]; + uint16_t tex_size[8]; + uint16_t tex_offset[8]; uint16_t tri_cmd; uint16_t tri_cull; } __attribute__((aligned(8), packed)) gpu_state; @@ -55,9 +55,9 @@ static inline void gpu_set_short(uint32_t offset, uint16_t value) } __attribute__((always_inline)) -static inline void gpu_set_word(uint32_t offset, uint32_t value) +static inline void gpu_set_tex_word(uint32_t offset, uint32_t value) { - rspq_write(gpup_id, GPU_CMD_SET_WORD, offset, value); + rspq_write(gpup_id, GPU_CMD_SET_TEX_WORD, offset, value); } __attribute__((always_inline)) @@ -97,12 +97,12 @@ static void gpuUpdateFormat(void) static void gpuSetTexSize(uint16_t width, uint16_t height) { - gpu_set_word(offsetof(gpu_state, tex_size[0]), (width << 16) | height); + gpu_set_tex_word(offsetof(gpu_state, tex_size[0]), ((uint32_t)width << 16) | height); } static void gpuSetTexOffset(uint16_t width, uint16_t height) { - gpu_set_word(offsetof(gpu_state, tex_offset[0]), (width << 16) | height); + gpu_set_tex_word(offsetof(gpu_state, tex_offset[0]), ((uint32_t)width << 16) | height); } diff --git a/misc/n64/rsp_gpu.S b/misc/n64/rsp_gpu.S index 4a6969af4..c2d2effae 100644 --- a/misc/n64/rsp_gpu.S +++ b/misc/n64/rsp_gpu.S @@ -36,7 +36,7 @@ RSPQ_BeginOverlayHeader RSPQ_DefineCommand GPUCmd_SetShort, 8 # 0x0 - RSPQ_DefineCommand GPUCmd_SetWord, 8 # 0x1 + RSPQ_DefineCommand GPUCmd_SetTexWord, 8 # 0x1 RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x2 RSPQ_DefineCommand GPUCmd_DrawQuad, 68 # 0x3 @@ -56,8 +56,8 @@ GL_STATE: # This is the GL state that is updated by CPU via GPUCmd_Set commands 
GL_VIEWPORT_SCALE: .half 0,0,0,0 GL_VIEWPORT_OFFSET: .half 0,0,0,0 - GL_STATE_TEX_SIZE: .half 0,0 - GL_STATE_TEX_OFFSET: .half 0,0 + GL_STATE_TEX_SIZE: .half 0,0, 0,0, 0,0, 0,0 + GL_STATE_TEX_OFFSET: .half 0,0, 0,0, 0,0, 0,0 GL_TRI_CMD: .half 0 GL_TRI_CULL: .half 0 @@ -79,10 +79,14 @@ GPUCmd_SetShort: sh a1, %lo(GL_STATE)(a0) .endfunc - .func GPUCmd_SetWord -GPUCmd_SetWord: +// Store 4 times, so can be transformed by 4 vertices later + .func GPUCmd_SetTexWord +GPUCmd_SetTexWord: + sw a1, %lo(GL_STATE) + 0(a0) + sw a1, %lo(GL_STATE) + 4(a0) + sw a1, %lo(GL_STATE) + 8(a0) jr ra - sw a1, %lo(GL_STATE) + 0(a0) + sw a1, %lo(GL_STATE) + 12(a0) .endfunc .func GPUCmd_SetLong @@ -279,39 +283,8 @@ GL_TnL: #define v___ $v01 #define vcspos_f $v02 #define vcspos_i $v03 - #define vtexsize $v06 - #define vtexoffset $v07 - #define vst $v08 - #define vst_i $v28 - #define vst_f $v29 move ra2, ra - llv vst, SCREEN_VTX_S_T, vtx # S + T - - li t0, %lo(GL_STATE_TEX_SIZE) - llv vtexsize, 0,t0 - llv vtexoffset, 4,t0 - - # Scale texcoord by texsize and subtract offset (to correct for bilinear sampling if active) - #vmudn v___, vst, vtexsize - # vmadh vst, vtexoffset, K1 - - #vmudn v___, vst, vtexsize - #vmudl vst, vst, vtexsize - - vmudn vst_f, vst, vtexsize # ACC = vst * vtexsize, VST_F = ACC & 0xFFFF - #####vmadn vst_f, vtexoffset, K1 - vmadh vst_i, vzero, vzero # ACC += zero * zero, VST_I = ACC >> 16 - - // Shift texture coords right 5 bits - vmudm v___, vst_i, K2048 # ACC = (vst_i << 11) - vmadl vst, vst_f, K2048 # ACC += (vst_f << 11) >> 16, VST = ACC & 0xFFFF - - #undef vst_i - #undef vst_f - - slv vst, SCREEN_VTX_S_T, vtx - ldv vcspos_f, SCREEN_VTX_CS_POSf,vtx jal GL_CalcScreenSpace ldv vcspos_i, SCREEN_VTX_CS_POSi,vtx @@ -321,14 +294,11 @@ GL_TnL: #undef vcspos_f #undef vcspos_i - #undef vtexsize - #undef vtexoffset #undef vtx #undef v___ #undef vrgba - #undef vst #undef s .endfunc @@ -343,6 +313,11 @@ GPUCmd_DrawQuad: #define v___ $v01 + #define vst_i $v12 + #define vst_f $v13 + 
#define vtexsize $v14 + #define vtexoffset $v15 + #define vmtx0_i $v16 // m00 m01 m02 m03 #define vmtx0_f $v17 #define vmtx1_i $v18 // m10 m11 m12 m13 @@ -404,10 +379,11 @@ GPUCmd_DrawQuad: vmudm vcspos_i, vcspos_i, K2048 vmadl vcspos_f, vcspos_f, K2048 + li t6, %lo(GL_STATE_TEX_SIZE) + lqv vtexsize, 0x00, t6 slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx_ptr - slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx_ptr + lqv vtexoffset, 0x10, t6 slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx_ptr - slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx_ptr // Calculate and store clipping flags against CS.W. // These will be used for trivial rejections. @@ -450,10 +426,13 @@ GPUCmd_DrawQuad: vmudm vcspos_i, vcspos_i, K2048 vmadl vcspos_f, vcspos_f, K2048 + // Scale texcoord by texsize and subtract offset (to correct for bilinear sampling if active) + vmudn vst_f, vtex, vtexsize # ACC = vtex * vtexsize, VST_F = ACC & 0xFFFF + #vmadn vst_f,vtexoffset, K1 + vmadh vst_i, vzero, vzero # ACC += zero * zero, VST_I = ACC >> 16 + slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx_ptr - slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx_ptr slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx_ptr - slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx_ptr // Calculate and store clipping flags against CS.W. // These will be used for trivial rejections. 
@@ -466,13 +445,19 @@ GPUCmd_DrawQuad: sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx_ptr sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx_ptr + // Shift texture coords right 5 bits + vmudm v___, vst_i, K2048 # ACC = (vst_i << 11) + vmadl vtex, vst_f, K2048 # ACC += (vst_f << 11) >> 16, vtex = ACC & 0xFFFF + andi v2_cflags, tmp, XYZ_CLIP_FLAGS srl tmp, tmp, 4 andi v3_cflags, tmp, XYZ_CLIP_FLAGS #undef src_ptr - #undef vtx_ptr - #undef v___ + #undef vst_i + #undef vst_f + #undef vtexsize + #undef vtexoffset #undef vmtx0_i #undef vmtx0_f @@ -487,20 +472,24 @@ GPUCmd_DrawQuad: #undef vcspos_i #undef vcspos_f -// ######################## -// Trivial rejection check -// ######################## + // ### Trivial rejection check ### // If for any plane, all 4 vertices are outside the plane, // then the quad is out of the viewport and can be trivially rejected and tmp, v0_cflags, v1_cflags and tmp, v2_cflags and tmp, v3_cflags - bnez tmp, JrRa - nop + bnez tmp, JrRa // slv is delay slot + + // ### Perform rest of T&L ### + slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx_ptr + slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx_ptr + slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx_ptr + slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx_ptr + + #undef vtx_ptr + #undef v___ + #undef vtex -// ######################## -// Perform rest of T&L -// ######################## jal GL_TnL li s3, %lo(VERTEX_CACHE) + V0_OFFSET jal GL_TnL