diff --git a/misc/n64/gpu.c b/misc/n64/gpu.c index eb5b33ec5..ec488d930 100644 --- a/misc/n64/gpu.c +++ b/misc/n64/gpu.c @@ -41,8 +41,6 @@ enum { }; typedef struct { - int16_t mvp_matrix_i[4][4]; - uint16_t mvp_matrix_f[4][4]; int16_t vp_scale[4]; int16_t vp_offset[4]; uint16_t tex_size[2]; diff --git a/misc/n64/rsp_gpu.S b/misc/n64/rsp_gpu.S index 4dd7191f7..df03c84d1 100644 --- a/misc/n64/rsp_gpu.S +++ b/misc/n64/rsp_gpu.S @@ -1,9 +1,40 @@ #include #include -#define MATRIX_SIZE 64 + #define GUARD_BAND_FACTOR 2 - .data +// 1 << VTX_SHIFT, keep in sync with gpu.c +#define ONE_W K32 + +#define xxxxXXXX h0 +#define yyyyYYYY h1 +#define zzzzZZZZ h2 +#define wwwwWWWW h3 + + +#define SCREEN_VTX_CS_POSi 0 // X, Y, Z, W (all 32-bit) +#define SCREEN_VTX_CS_POSf 8 // X, Y, Z, W (all 32-bit) +#define SCREEN_VTX_X 16 +#define SCREEN_VTX_Y 18 +#define SCREEN_VTX_Z 20 +#define SCREEN_VTX_CLIP_CODE 22 +#define SCREEN_VTX_PADDING 23 +#define SCREEN_VTX_RGBA 24 +#define SCREEN_VTX_S_T 28 // 28 S, 30 T +#define SCREEN_VTX_W 32 // FIXME: this is duplicated in CS_POS +#define SCREEN_VTX_INVW 36 // 32-bit +#define SCREEN_VTX_SIZE 40 + +//0-39 same as screenvtx +#define PRIM_VTX_TRCODE 40 // trivial-reject clipping flags (against -w/+w) +#define PRIM_VTX_SIZE 48 + +#define V0_OFFSET 0 * PRIM_VTX_SIZE +#define V1_OFFSET 1 * PRIM_VTX_SIZE +#define V2_OFFSET 2 * PRIM_VTX_SIZE +#define V3_OFFSET 3 * PRIM_VTX_SIZE + +.data RSPQ_BeginOverlayHeader RSPQ_DefineCommand GPUCmd_SetByte, 8 # 0x0 @@ -23,9 +54,9 @@ BANNER1: .ascii "Rasky & Snacchus" RSPQ_BeginSavedState + GPU_MATRIX_MVP: .ds.b 128 GL_STATE: - # This is the GL state that is also used by the pipeline. - GL_MATRIX_MVP: .ds.b MATRIX_SIZE + # This is the GL state that is updated by CPU via GPUCmd_Set commands GL_VIEWPORT_SCALE: .half 0,0,0,0 GL_VIEWPORT_OFFSET: .half 0,0,0,0 GL_STATE_TEX_SIZE: .half 0,0 @@ -39,29 +70,12 @@ GL_STATE: CLIP_CODE_FACTORS: .half 1, 1, GUARD_BAND_FACTOR, GUARD_BAND_FACTOR DRAW_TRI_RA: .word 0 -#define SCREEN_VTX_CS_POSi 0 // X, Y, Z, W (all 32-bit) -#define SCREEN_VTX_CS_POSf 8 // X, Y, Z, W (all 32-bit) -#define SCREEN_VTX_X 16 -#define SCREEN_VTX_Y 18 -#define SCREEN_VTX_Z 20 -#define SCREEN_VTX_CLIP_CODE 22 -#define SCREEN_VTX_PADDING 23 -#define SCREEN_VTX_RGBA 24 -#define SCREEN_VTX_S_T 28 // 28 S, 30 T -#define SCREEN_VTX_W 32 // FIXME: this is duplicated in CS_POS -#define SCREEN_VTX_INVW 36 // 32-bit -#define SCREEN_VTX_SIZE 40 - - .bss +.bss .align 3 -#define VERTEX_CACHE_SIZE 4 -//0-39 same as screenvtx -#define PRIM_VTX_TRCODE 40 // trivial-reject clipping flags (against -w/+w) -#define PRIM_VTX_SIZE 42 -VERTEX_CACHE: .dcb.b PRIM_VTX_SIZE * VERTEX_CACHE_SIZE +VERTEX_CACHE: .dcb.b PRIM_VTX_SIZE * 4 - .text +.text .func GPUCmd_SetByte GPUCmd_SetByte: @@ -104,29 +118,47 @@ GPUCmd_MatrixLoad: #define src s6 #define dst s7 - #define vrhs01_i $v02 - #define vrhs01_f $v03 - #define vrhs23_i $v04 - #define vrhs23_f $v05 + #define vmat0_i $v02 + #define vmat1_i $v03 + #define vmat2_i $v04 + #define vmat3_i $v05 + #define vmat0_f $v06 + #define vmat1_f $v07 + #define vmat2_f $v08 + #define vmat3_f $v09 addi src, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64 - addi dst, zero, %lo(GL_MATRIX_MVP) + addi dst, zero, %lo(GPU_MATRIX_MVP) - # Load the matrix from command parameters (misaligned) - lqv vrhs01_i, 0x00,src - lrv vrhs01_i, 0x10,src - lqv vrhs23_i, 0x10,src - lrv vrhs23_i, 0x20,src - lqv vrhs01_f, 0x20,src - lrv vrhs01_f, 0x30,src - lqv vrhs23_f, 0x30,src - lrv vrhs23_f, 0x40,src + // Load the matrix from command parameters + ldv vmat0_i, 0x00,src + ldv vmat1_i, 0x08,src + ldv vmat2_i, 0x10,src + ldv vmat3_i, 0x18,src + ldv vmat0_f, 0x20,src + ldv vmat1_f, 0x28,src + ldv vmat2_f, 0x30,src + ldv vmat3_f, 0x38,src - sqv vrhs01_i, 0x00,dst - sqv vrhs23_i, 0x10,dst - sqv vrhs01_f, 0x20,dst + // Store the matrices, with each row stored twice + // This is used by T&L to transform two vertices at once + sdv vmat0_i, 0x00,dst + sdv vmat0_i, 0x08,dst + sdv vmat1_i, 0x10,dst + sdv vmat1_i, 0x18,dst + sdv vmat2_i, 0x20,dst + sdv vmat2_i, 0x28,dst + sdv vmat3_i, 0x30,dst + sdv vmat3_i, 0x38,dst + sdv vmat0_f, 0x40,dst + sdv vmat0_f, 0x48,dst + sdv vmat1_f, 0x50,dst + sdv vmat1_f, 0x58,dst + sdv vmat2_f, 0x60,dst + sdv vmat2_f, 0x68,dst + sdv vmat3_f, 0x70,dst jr ra - sqv vrhs23_f, 0x30,dst + sdv vmat3_f, 0x78,dst #undef src #undef dst @@ -138,7 +170,6 @@ GPUCmd_DrawQuad: #define vtx a0 #define mtx_ptr s0 #define src_ptr s4 - #define vcount s3 #define v___ $v01 @@ -164,74 +195,134 @@ GPUCmd_DrawQuad: addi src_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64 li vtx, %lo(VERTEX_CACHE) - li vcount, 4 - li mtx_ptr, %lo(GL_MATRIX_MVP) - ldv vmtx0_i.e0, 0x00,mtx_ptr - ldv vmtx1_i.e0, 0x08,mtx_ptr - ldv vmtx2_i.e0, 0x10,mtx_ptr - ldv vmtx3_i.e0, 0x18,mtx_ptr - ldv vmtx0_f.e0, 0x20,mtx_ptr - ldv vmtx1_f.e0, 0x28,mtx_ptr - ldv vmtx2_f.e0, 0x30,mtx_ptr - ldv vmtx3_f.e0, 0x38,mtx_ptr + li mtx_ptr, %lo(GPU_MATRIX_MVP) + lqv vmtx0_i, 0x00,mtx_ptr // [m00.I m01.I m02.I m03.I m00.I m01.I m02.I m03.I] + lqv vmtx1_i, 0x10,mtx_ptr // etc + lqv vmtx2_i, 0x20,mtx_ptr + lqv vmtx3_i, 0x30,mtx_ptr + lqv vmtx0_f, 0x40,mtx_ptr + lqv vmtx1_f, 0x50,mtx_ptr + lqv vmtx2_f, 0x60,mtx_ptr + lqv vmtx3_f, 0x70,mtx_ptr -upload_vertex: - ldv vpos, 0, src_ptr # Load X, Y, Z, W - llv vcol, 8, src_ptr # Load RGBA - llv vtex, 12, src_ptr # Load U, V +### VERTEX 0 + ldv vpos.e0, 0, src_ptr // Load v0 X, Y, Z + ldv vpos.e4, 16, src_ptr // Load v1 X, Y, Z # matrix multiply - vmudn v___, vmtx0_f, vpos.h0 - vmadh v___, vmtx0_i, vpos.h0 - vmadn v___, vmtx1_f, vpos.h1 - vmadh v___, vmtx1_i, vpos.h1 - vmadn v___, vmtx2_f, vpos.h2 - vmadh v___, vmtx2_i, vpos.h2 - vmadn v___, vmtx3_f, vpos.h3 - vmadh vcspos_i, vmtx3_i, vpos.h3 + vmudn v___, vmtx0_f, vpos.xxxxXXXX + vmadh v___, vmtx0_i, vpos.xxxxXXXX + vmadn v___, vmtx1_f, vpos.yyyyYYYY + vmadh v___, vmtx1_i, vpos.yyyyYYYY + vmadn v___, vmtx2_f, vpos.zzzzZZZZ + vmadh v___, vmtx2_i, vpos.zzzzZZZZ + vmadn v___, vmtx3_f, ONE_W + vmadh vcspos_i, vmtx3_i, ONE_W vmadn vcspos_f, vzero, vzero - slv vcol, SCREEN_VTX_RGBA, vtx - slv vtex, SCREEN_VTX_S_T, vtx + llv vcol.e0, 8, src_ptr // Load v0 RGBA + llv vtex.e0, 12, src_ptr // Load v0 U, V + llv vcol.e2, 24, src_ptr // Load v1 RGBA + llv vtex.e2, 28, src_ptr // Load v1 U, V # 32-bit right shift by 5, to keep the clip space coordinates unscaled vmudm vcspos_i, vcspos_i, K2048 vmadl vcspos_f, vcspos_f, K2048 - addi vcount, -1 - addi src_ptr, 16 - - sdv vcspos_i, SCREEN_VTX_CS_POSi,vtx - sdv vcspos_f, SCREEN_VTX_CS_POSf,vtx + slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx + slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx + slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx + slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx # Calculate and store clipping flags against CS.W. # These will be used for trivial rejections. - vch v___, vcspos_i, vcspos_i.w - vcl v___, vcspos_f, vcspos_f.w + vch v___, vcspos_i, vcspos_i.wwwwWWWW + vcl v___, vcspos_f, vcspos_f.wwwwWWWW + cfc2 t0, COP2_CTRL_VCC - andi t0, 0x707 # Isolate X/Y/Z flags + sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V0_OFFSET, vtx + sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V0_OFFSET, vtx + sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V1_OFFSET, vtx + sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V1_OFFSET, vtx - # Compress flags to 8 bit - srl t1, t0, 5 - andi t0, 0x7 - or t0, t1 - sb t0, PRIM_VTX_TRCODE(vtx) +###################### VERTEX 2 + ldv vpos.e0, 32, src_ptr // Load v2 X, Y, Z + ldv vpos.e4, 48, src_ptr // Load v3 X, Y, Z + andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags + srl t1, t2, 5 // Shift hi flags to be aligned next to lo flags + andi t2, t2, 0x7 // Isolate lo clip flags + or t2, t1 // Merge clip flags (compressed to 6 bits) - bnez vcount, upload_vertex - addi vtx, PRIM_VTX_SIZE + # matrix multiply + vmudn v___, vmtx0_f, vpos.xxxxXXXX + vmadh v___, vmtx0_i, vpos.xxxxXXXX + vmadn v___, vmtx1_f, vpos.yyyyYYYY + sb t2, (PRIM_VTX_TRCODE + V0_OFFSET)(vtx) + vmadh v___, vmtx1_i, vpos.yyyyYYYY + srl t0, t0, 4 + vmadn v___, vmtx2_f, vpos.zzzzZZZZ + andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags + vmadh v___, vmtx2_i, vpos.zzzzZZZZ + srl t1, t2, 5 // Shift hi flags to be aligned next to lo flags + vmadn v___, vmtx3_f, ONE_W + andi t2, t2, 0x7 // Isolate lo clip flags + vmadh vcspos_i, vmtx3_i, ONE_W + or t2, t1 // Merge clip flags (compressed to 6 bits) + vmadn vcspos_f, vzero, vzero + sb t2, (PRIM_VTX_TRCODE + V1_OFFSET)(vtx) + llv vcol.e4, 40, src_ptr # Load v2 RGBA + llv vtex.e4, 44, src_ptr # Load v2 U, V + llv vcol.e6, 56, src_ptr # Load v3 RGBA + llv vtex.e6, 60, src_ptr # Load v3 U, V + + # 32-bit right shift by 5, to keep the clip space coordinates unscaled + vmudm vcspos_i, vcspos_i, K2048 + vmadl vcspos_f, vcspos_f, K2048 + + slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx + slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx + slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx + slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx + + # Calculate and store clipping flags against CS.W. + # These will be used for trivial rejections. + vch v___, vcspos_i, vcspos_i.wwwwWWWW + vcl v___, vcspos_f, vcspos_f.wwwwWWWW + + cfc2 t0, COP2_CTRL_VCC + sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V2_OFFSET, vtx + sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V2_OFFSET, vtx + + andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags + srl t1, t2, 5 // Shift hi flags to be aligned next to lo flags + andi t2, t2, 0x7 // Isolate lo clip flags + or t2, t1 // Merge clip flags (compressed to 6 bits) + sb t2, (PRIM_VTX_TRCODE + V2_OFFSET)(vtx) + +###################### VERTEX 3 + + sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx + sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx + + srl t0, t0, 4 + andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags + srl t1, t2, 5 // Shift hi flags to be aligned next to lo flags + andi t2, t2, 0x7 // Isolate lo clip flags + or t2, t1 // Merge clip flags (compressed to 6 bits) + sb t2, (PRIM_VTX_TRCODE + V3_OFFSET)(vtx) # now do the actual drawing - li a1, %lo(VERTEX_CACHE) + 0*PRIM_VTX_SIZE - li a2, %lo(VERTEX_CACHE) + 1*PRIM_VTX_SIZE + li a1, %lo(VERTEX_CACHE) + V0_OFFSET + li a2, %lo(VERTEX_CACHE) + V1_OFFSET jal GPUCmd_DrawTriangle - li a3, %lo(VERTEX_CACHE) + 2*PRIM_VTX_SIZE + li a3, %lo(VERTEX_CACHE) + V2_OFFSET - li a1, %lo(VERTEX_CACHE) + 0*PRIM_VTX_SIZE - li a2, %lo(VERTEX_CACHE) + 2*PRIM_VTX_SIZE + li a1, %lo(VERTEX_CACHE) + V0_OFFSET + li a2, %lo(VERTEX_CACHE) + V2_OFFSET jal GPUCmd_DrawTriangle - li a3, %lo(VERTEX_CACHE) + 3*PRIM_VTX_SIZE + li a3, %lo(VERTEX_CACHE) + V3_OFFSET j RSPQ_Loop nop @@ -352,13 +443,14 @@ GL_CalcClipCodes: vch v___, vguard_i, vguard_i.w vcl v___, vguard_f, vguard_f.w + cfc2 t0, COP2_CTRL_VCC - andi t0, 0x707 - srl t1, t0, 5 - andi t0, 0x7 - or t0, t1 + andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags + srl t1, t2, 5 // Shift hi flags to be aligned next to lo flags + andi t2, t2, 0x7 // Isolate lo clip flags + or t2, t1 // Merge clip flags (compressed to 6 bits) jr ra - sb t0, SCREEN_VTX_CLIP_CODE(dst) + sb t2, SCREEN_VTX_CLIP_CODE(dst) #undef dst #undef vcspos_i