mirror of
https://github.com/ClassiCube/ClassiCube.git
synced 2025-09-12 00:56:40 -04:00
N64: Optimise T&L further (complex world down to 10.3 ms)
This commit is contained in:
parent
af4494284d
commit
da9b8209d6
@ -30,7 +30,7 @@ static rsp_ucode_t rsp_gpu = (rsp_ucode_t){
|
|||||||
|
|
||||||
enum {
|
enum {
|
||||||
GPU_CMD_SET_SHORT = 0x0,
|
GPU_CMD_SET_SHORT = 0x0,
|
||||||
GPU_CMD_SET_WORD = 0x1,
|
GPU_CMD_SET_TEX_WORD = 0x1,
|
||||||
GPU_CMD_SET_LONG = 0x2,
|
GPU_CMD_SET_LONG = 0x2,
|
||||||
|
|
||||||
GPU_CMD_DRAW_QUAD = 0x3,
|
GPU_CMD_DRAW_QUAD = 0x3,
|
||||||
@ -42,8 +42,8 @@ enum {
|
|||||||
typedef struct {
|
typedef struct {
|
||||||
int16_t vp_scale[4];
|
int16_t vp_scale[4];
|
||||||
int16_t vp_offset[4];
|
int16_t vp_offset[4];
|
||||||
uint16_t tex_size[2];
|
uint16_t tex_size[8];
|
||||||
uint16_t tex_offset[2];
|
uint16_t tex_offset[8];
|
||||||
uint16_t tri_cmd;
|
uint16_t tri_cmd;
|
||||||
uint16_t tri_cull;
|
uint16_t tri_cull;
|
||||||
} __attribute__((aligned(8), packed)) gpu_state;
|
} __attribute__((aligned(8), packed)) gpu_state;
|
||||||
@ -55,9 +55,9 @@ static inline void gpu_set_short(uint32_t offset, uint16_t value)
|
|||||||
}
|
}
|
||||||
|
|
||||||
__attribute__((always_inline))
|
__attribute__((always_inline))
|
||||||
static inline void gpu_set_word(uint32_t offset, uint32_t value)
|
static inline void gpu_set_tex_word(uint32_t offset, uint32_t value)
|
||||||
{
|
{
|
||||||
rspq_write(gpup_id, GPU_CMD_SET_WORD, offset, value);
|
rspq_write(gpup_id, GPU_CMD_SET_TEX_WORD, offset, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
__attribute__((always_inline))
|
__attribute__((always_inline))
|
||||||
@ -97,12 +97,12 @@ static void gpuUpdateFormat(void)
|
|||||||
|
|
||||||
static void gpuSetTexSize(uint16_t width, uint16_t height)
|
static void gpuSetTexSize(uint16_t width, uint16_t height)
|
||||||
{
|
{
|
||||||
gpu_set_word(offsetof(gpu_state, tex_size[0]), (width << 16) | height);
|
gpu_set_tex_word(offsetof(gpu_state, tex_size[0]), (width << 16) | height);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void gpuSetTexOffset(uint16_t width, uint16_t height)
|
static void gpuSetTexOffset(uint16_t width, uint16_t height)
|
||||||
{
|
{
|
||||||
gpu_set_word(offsetof(gpu_state, tex_offset[0]), (width << 16) | height);
|
gpu_set_tex_word(offsetof(gpu_state, tex_offset[0]), (width << 16) | height);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -36,7 +36,7 @@
|
|||||||
|
|
||||||
RSPQ_BeginOverlayHeader
|
RSPQ_BeginOverlayHeader
|
||||||
RSPQ_DefineCommand GPUCmd_SetShort, 8 # 0x0
|
RSPQ_DefineCommand GPUCmd_SetShort, 8 # 0x0
|
||||||
RSPQ_DefineCommand GPUCmd_SetWord, 8 # 0x1
|
RSPQ_DefineCommand GPUCmd_SetTexWord, 8 # 0x1
|
||||||
RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x2
|
RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x2
|
||||||
|
|
||||||
RSPQ_DefineCommand GPUCmd_DrawQuad, 68 # 0x3
|
RSPQ_DefineCommand GPUCmd_DrawQuad, 68 # 0x3
|
||||||
@ -56,8 +56,8 @@ GL_STATE:
|
|||||||
# This is the GL state that is updated by CPU via GPUCmd_Set commands
|
# This is the GL state that is updated by CPU via GPUCmd_Set commands
|
||||||
GL_VIEWPORT_SCALE: .half 0,0,0,0
|
GL_VIEWPORT_SCALE: .half 0,0,0,0
|
||||||
GL_VIEWPORT_OFFSET: .half 0,0,0,0
|
GL_VIEWPORT_OFFSET: .half 0,0,0,0
|
||||||
GL_STATE_TEX_SIZE: .half 0,0
|
GL_STATE_TEX_SIZE: .half 0,0, 0,0, 0,0, 0,0
|
||||||
GL_STATE_TEX_OFFSET: .half 0,0
|
GL_STATE_TEX_OFFSET: .half 0,0, 0,0, 0,0, 0,0
|
||||||
GL_TRI_CMD: .half 0
|
GL_TRI_CMD: .half 0
|
||||||
GL_TRI_CULL: .half 0
|
GL_TRI_CULL: .half 0
|
||||||
|
|
||||||
@ -79,10 +79,14 @@ GPUCmd_SetShort:
|
|||||||
sh a1, %lo(GL_STATE)(a0)
|
sh a1, %lo(GL_STATE)(a0)
|
||||||
.endfunc
|
.endfunc
|
||||||
|
|
||||||
.func GPUCmd_SetWord
|
// Store 4 times, so can be transformed by 4 vertices later
|
||||||
GPUCmd_SetWord:
|
.func GPUCmd_SetTexWord
|
||||||
jr ra
|
GPUCmd_SetTexWord:
|
||||||
sw a1, %lo(GL_STATE) + 0(a0)
|
sw a1, %lo(GL_STATE) + 0(a0)
|
||||||
|
sw a1, %lo(GL_STATE) + 4(a0)
|
||||||
|
sw a1, %lo(GL_STATE) + 8(a0)
|
||||||
|
jr ra
|
||||||
|
sw a1, %lo(GL_STATE) + 12(a0)
|
||||||
.endfunc
|
.endfunc
|
||||||
|
|
||||||
.func GPUCmd_SetLong
|
.func GPUCmd_SetLong
|
||||||
@ -279,39 +283,8 @@ GL_TnL:
|
|||||||
#define v___ $v01
|
#define v___ $v01
|
||||||
#define vcspos_f $v02
|
#define vcspos_f $v02
|
||||||
#define vcspos_i $v03
|
#define vcspos_i $v03
|
||||||
#define vtexsize $v06
|
|
||||||
#define vtexoffset $v07
|
|
||||||
#define vst $v08
|
|
||||||
#define vst_i $v28
|
|
||||||
#define vst_f $v29
|
|
||||||
move ra2, ra
|
move ra2, ra
|
||||||
|
|
||||||
llv vst, SCREEN_VTX_S_T, vtx # S + T
|
|
||||||
|
|
||||||
li t0, %lo(GL_STATE_TEX_SIZE)
|
|
||||||
llv vtexsize, 0,t0
|
|
||||||
llv vtexoffset, 4,t0
|
|
||||||
|
|
||||||
# Scale texcoord by texsize and subtract offset (to correct for bilinear sampling if active)
|
|
||||||
#vmudn v___, vst, vtexsize
|
|
||||||
# vmadh vst, vtexoffset, K1
|
|
||||||
|
|
||||||
#vmudn v___, vst, vtexsize
|
|
||||||
#vmudl vst, vst, vtexsize
|
|
||||||
|
|
||||||
vmudn vst_f, vst, vtexsize # ACC = vst * vtexsize, VST_F = ACC & 0xFFFF
|
|
||||||
#####vmadn vst_f, vtexoffset, K1
|
|
||||||
vmadh vst_i, vzero, vzero # ACC += zero * zero, VST_I = ACC >> 16
|
|
||||||
|
|
||||||
// Shift texture coords right 5 bits
|
|
||||||
vmudm v___, vst_i, K2048 # ACC = (vst_i << 11)
|
|
||||||
vmadl vst, vst_f, K2048 # ACC += (vst_f << 11) >> 16, VST = ACC & 0xFFFF
|
|
||||||
|
|
||||||
#undef vst_i
|
|
||||||
#undef vst_f
|
|
||||||
|
|
||||||
slv vst, SCREEN_VTX_S_T, vtx
|
|
||||||
|
|
||||||
ldv vcspos_f, SCREEN_VTX_CS_POSf,vtx
|
ldv vcspos_f, SCREEN_VTX_CS_POSf,vtx
|
||||||
jal GL_CalcScreenSpace
|
jal GL_CalcScreenSpace
|
||||||
ldv vcspos_i, SCREEN_VTX_CS_POSi,vtx
|
ldv vcspos_i, SCREEN_VTX_CS_POSi,vtx
|
||||||
@ -321,14 +294,11 @@ GL_TnL:
|
|||||||
|
|
||||||
#undef vcspos_f
|
#undef vcspos_f
|
||||||
#undef vcspos_i
|
#undef vcspos_i
|
||||||
#undef vtexsize
|
|
||||||
#undef vtexoffset
|
|
||||||
|
|
||||||
#undef vtx
|
#undef vtx
|
||||||
|
|
||||||
#undef v___
|
#undef v___
|
||||||
#undef vrgba
|
#undef vrgba
|
||||||
#undef vst
|
|
||||||
#undef s
|
#undef s
|
||||||
|
|
||||||
.endfunc
|
.endfunc
|
||||||
@ -343,6 +313,11 @@ GPUCmd_DrawQuad:
|
|||||||
|
|
||||||
#define v___ $v01
|
#define v___ $v01
|
||||||
|
|
||||||
|
#define vst_i $v12
|
||||||
|
#define vst_f $v13
|
||||||
|
#define vtexsize $v14
|
||||||
|
#define vtexoffset $v15
|
||||||
|
|
||||||
#define vmtx0_i $v16 // m00 m01 m02 m03
|
#define vmtx0_i $v16 // m00 m01 m02 m03
|
||||||
#define vmtx0_f $v17
|
#define vmtx0_f $v17
|
||||||
#define vmtx1_i $v18 // m10 m11 m12 m13
|
#define vmtx1_i $v18 // m10 m11 m12 m13
|
||||||
@ -404,10 +379,11 @@ GPUCmd_DrawQuad:
|
|||||||
vmudm vcspos_i, vcspos_i, K2048
|
vmudm vcspos_i, vcspos_i, K2048
|
||||||
vmadl vcspos_f, vcspos_f, K2048
|
vmadl vcspos_f, vcspos_f, K2048
|
||||||
|
|
||||||
|
li t6, %lo(GL_STATE_TEX_SIZE)
|
||||||
|
lqv vtexsize, 0x00, t6
|
||||||
slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx_ptr
|
slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx_ptr
|
||||||
slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx_ptr
|
lqv vtexoffset, 0x10, t6
|
||||||
slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx_ptr
|
slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx_ptr
|
||||||
slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx_ptr
|
|
||||||
|
|
||||||
// Calculate and store clipping flags against CS.W.
|
// Calculate and store clipping flags against CS.W.
|
||||||
// These will be used for trivial rejections.
|
// These will be used for trivial rejections.
|
||||||
@ -450,10 +426,13 @@ GPUCmd_DrawQuad:
|
|||||||
vmudm vcspos_i, vcspos_i, K2048
|
vmudm vcspos_i, vcspos_i, K2048
|
||||||
vmadl vcspos_f, vcspos_f, K2048
|
vmadl vcspos_f, vcspos_f, K2048
|
||||||
|
|
||||||
|
// Scale texcoord by texsize and subtract offset (to correct for bilinear sampling if active)
|
||||||
|
vmudn vst_f, vtex, vtexsize # ACC = vtex * vtexsize, VST_F = ACC & 0xFFFF
|
||||||
|
#vmadn vst_f,vtexoffset, K1
|
||||||
|
vmadh vst_i, vzero, vzero # ACC += zero * zero, VST_I = ACC >> 16
|
||||||
|
|
||||||
slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx_ptr
|
slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx_ptr
|
||||||
slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx_ptr
|
|
||||||
slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx_ptr
|
slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx_ptr
|
||||||
slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx_ptr
|
|
||||||
|
|
||||||
// Calculate and store clipping flags against CS.W.
|
// Calculate and store clipping flags against CS.W.
|
||||||
// These will be used for trivial rejections.
|
// These will be used for trivial rejections.
|
||||||
@ -466,13 +445,19 @@ GPUCmd_DrawQuad:
|
|||||||
sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx_ptr
|
sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx_ptr
|
||||||
sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx_ptr
|
sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx_ptr
|
||||||
|
|
||||||
|
// Shift texture coords right 5 bits
|
||||||
|
vmudm v___, vst_i, K2048 # ACC = (vst_i << 11)
|
||||||
|
vmadl vtex, vst_f, K2048 # ACC += (vst_f << 11) >> 16, vtex = ACC & 0xFFFF
|
||||||
|
|
||||||
andi v2_cflags, tmp, XYZ_CLIP_FLAGS
|
andi v2_cflags, tmp, XYZ_CLIP_FLAGS
|
||||||
srl tmp, tmp, 4
|
srl tmp, tmp, 4
|
||||||
andi v3_cflags, tmp, XYZ_CLIP_FLAGS
|
andi v3_cflags, tmp, XYZ_CLIP_FLAGS
|
||||||
|
|
||||||
#undef src_ptr
|
#undef src_ptr
|
||||||
#undef vtx_ptr
|
#undef vst_i
|
||||||
#undef v___
|
#undef vst_f
|
||||||
|
#undef vtexsize
|
||||||
|
#undef vtexoffset
|
||||||
|
|
||||||
#undef vmtx0_i
|
#undef vmtx0_i
|
||||||
#undef vmtx0_f
|
#undef vmtx0_f
|
||||||
@ -487,20 +472,24 @@ GPUCmd_DrawQuad:
|
|||||||
#undef vcspos_i
|
#undef vcspos_i
|
||||||
#undef vcspos_f
|
#undef vcspos_f
|
||||||
|
|
||||||
// ########################
|
// ### Trivial rejection check ###
|
||||||
// Trivial rejection check
|
|
||||||
// ########################
|
|
||||||
// If for any plane, all 4 vertices are outside the plane,
|
// If for any plane, all 4 vertices are outside the plane,
|
||||||
// then the quad is out of the viewport and can be trivially rejected
|
// then the quad is out of the viewport and can be trivially rejected
|
||||||
and tmp, v0_cflags, v1_cflags
|
and tmp, v0_cflags, v1_cflags
|
||||||
and tmp, v2_cflags
|
and tmp, v2_cflags
|
||||||
and tmp, v3_cflags
|
and tmp, v3_cflags
|
||||||
bnez tmp, JrRa
|
bnez tmp, JrRa // slv is delay slot
|
||||||
nop
|
|
||||||
|
// ### Perform rest of T&L ###
|
||||||
|
slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx_ptr
|
||||||
|
slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx_ptr
|
||||||
|
slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx_ptr
|
||||||
|
slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx_ptr
|
||||||
|
|
||||||
|
#undef vtx_ptr
|
||||||
|
#undef v___
|
||||||
|
#undef vtex
|
||||||
|
|
||||||
// ########################
|
|
||||||
// Perform rest of T&L
|
|
||||||
// ########################
|
|
||||||
jal GL_TnL
|
jal GL_TnL
|
||||||
li s3, %lo(VERTEX_CACHE) + V0_OFFSET
|
li s3, %lo(VERTEX_CACHE) + V0_OFFSET
|
||||||
jal GL_TnL
|
jal GL_TnL
|
||||||
|
Loading…
x
Reference in New Issue
Block a user