mirror of
https://github.com/ClassiCube/ClassiCube.git
synced 2025-08-03 10:47:39 -04:00
N64: Save 3 cycles in RSP T&L loop
This commit is contained in:
parent
00a1a49405
commit
af4494284d
@ -29,15 +29,14 @@ static rsp_ucode_t rsp_gpu = (rsp_ucode_t){
|
||||
};
|
||||
|
||||
enum {
|
||||
GPU_CMD_SET_BYTE = 0x0,
|
||||
GPU_CMD_SET_SHORT = 0x1,
|
||||
GPU_CMD_SET_WORD = 0x2,
|
||||
GPU_CMD_SET_LONG = 0x3,
|
||||
GPU_CMD_SET_SHORT = 0x0,
|
||||
GPU_CMD_SET_WORD = 0x1,
|
||||
GPU_CMD_SET_LONG = 0x2,
|
||||
|
||||
GPU_CMD_DRAW_QUAD = 0x4,
|
||||
GPU_CMD_MATRIX_LOAD = 0x5,
|
||||
GPU_CMD_DRAW_QUAD = 0x3,
|
||||
GPU_CMD_MATRIX_LOAD = 0x4,
|
||||
|
||||
GPU_CMD_PUSH_RDP = 0x6,
|
||||
GPU_CMD_PUSH_RDP = 0x5,
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
@ -49,12 +48,6 @@ typedef struct {
|
||||
uint16_t tri_cull;
|
||||
} __attribute__((aligned(8), packed)) gpu_state;
|
||||
|
||||
__attribute__((always_inline))
|
||||
static inline void gpu_set_byte(uint32_t offset, uint8_t value)
|
||||
{
|
||||
rspq_write(gpup_id, GPU_CMD_SET_BYTE, offset, value);
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
static inline void gpu_set_short(uint32_t offset, uint16_t value)
|
||||
{
|
||||
@ -82,9 +75,6 @@ static inline void gpu_push_rdp(uint32_t a1, uint64_t a2)
|
||||
rdpq_write(2, gpup_id, GPU_CMD_PUSH_RDP, 0, a1, a2);
|
||||
}
|
||||
|
||||
|
||||
static float gpu_vp_scale[3];
|
||||
static float gpu_vp_offset[3];
|
||||
static bool gpu_texturing;
|
||||
static void* gpu_pointer;
|
||||
static int gpu_stride;
|
||||
@ -191,34 +181,28 @@ static void gpuDrawArrays(uint32_t first, uint32_t count)
|
||||
}
|
||||
}
|
||||
|
||||
static void gpuDepthRange(float n, float f)
|
||||
{
|
||||
gpu_vp_scale[2] = (f - n) * 0.5f;
|
||||
gpu_vp_offset[2] = n + (f - n) * 0.5f;
|
||||
|
||||
gpu_set_short(offsetof(gpu_state, vp_scale[2]), gpu_vp_scale[2] * 4);
|
||||
gpu_set_short(offsetof(gpu_state, vp_offset[2]), gpu_vp_offset[2] * 4);
|
||||
}
|
||||
|
||||
static void gpuViewport(int x, int y, int w, int h)
|
||||
{
|
||||
gpu_vp_scale[0] = w * 0.5f;
|
||||
gpu_vp_scale[1] = h * -0.5f;
|
||||
gpu_vp_offset[0] = x + w * 0.5f;
|
||||
gpu_vp_offset[1] = y + h * 0.5f;
|
||||
float vp_scale_x = w * 0.5f;
|
||||
float vp_scale_y = h * -0.5f;
|
||||
float vp_scale_z = 0.5f;
|
||||
|
||||
float vp_offset_x = x + w * 0.5f;
|
||||
float vp_offset_y = y + h * 0.5f;
|
||||
float vp_offset_z = 0.5f;
|
||||
|
||||
// Screen coordinates are s13.2
|
||||
#define SCREEN_XY_SCALE 4.0f
|
||||
#define SCREEN_Z_SCALE 32767.0f
|
||||
|
||||
// * 2.0f to compensate for RSP reciprocal missing 1 bit
|
||||
uint16_t scale_x = gpu_vp_scale[0] * SCREEN_XY_SCALE * 2.0f;
|
||||
uint16_t scale_y = gpu_vp_scale[1] * SCREEN_XY_SCALE * 2.0f;
|
||||
uint16_t scale_z = gpu_vp_scale[2] * SCREEN_Z_SCALE * 2.0f;
|
||||
uint16_t scale_x = vp_scale_x * SCREEN_XY_SCALE * 2.0f;
|
||||
uint16_t scale_y = vp_scale_y * SCREEN_XY_SCALE * 2.0f;
|
||||
uint16_t scale_z = vp_scale_z * SCREEN_Z_SCALE * 2.0f;
|
||||
|
||||
uint16_t offset_x = gpu_vp_offset[0] * SCREEN_XY_SCALE;
|
||||
uint16_t offset_y = gpu_vp_offset[1] * SCREEN_XY_SCALE;
|
||||
uint16_t offset_z = gpu_vp_offset[2] * SCREEN_Z_SCALE;
|
||||
uint16_t offset_x = vp_offset_x * SCREEN_XY_SCALE;
|
||||
uint16_t offset_y = vp_offset_y * SCREEN_XY_SCALE;
|
||||
uint16_t offset_z = vp_offset_z * SCREEN_Z_SCALE;
|
||||
|
||||
gpu_set_long(
|
||||
offsetof(gpu_state, vp_scale),
|
||||
@ -236,7 +220,6 @@ static void gpuSetCullFace(bool enabled) {
|
||||
|
||||
static void gpu_init() {
|
||||
gpup_id = rspq_overlay_register(&rsp_gpu);
|
||||
gpuDepthRange(0, 1);
|
||||
}
|
||||
|
||||
static void gpu_close() {
|
||||
|
@ -35,15 +35,14 @@
|
||||
.data
|
||||
|
||||
RSPQ_BeginOverlayHeader
|
||||
RSPQ_DefineCommand GPUCmd_SetByte, 8 # 0x0
|
||||
RSPQ_DefineCommand GPUCmd_SetShort, 8 # 0x1
|
||||
RSPQ_DefineCommand GPUCmd_SetWord, 8 # 0x2
|
||||
RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x3
|
||||
RSPQ_DefineCommand GPUCmd_SetShort, 8 # 0x0
|
||||
RSPQ_DefineCommand GPUCmd_SetWord, 8 # 0x1
|
||||
RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x2
|
||||
|
||||
RSPQ_DefineCommand GPUCmd_DrawQuad, 68 # 0x4
|
||||
RSPQ_DefineCommand GPUCmd_MatrixLoad, 68 # 0x5
|
||||
RSPQ_DefineCommand GPUCmd_DrawQuad, 68 # 0x3
|
||||
RSPQ_DefineCommand GPUCmd_MatrixLoad, 68 # 0x4
|
||||
|
||||
RSPQ_DefineCommand GPUCmd_PushRDP, 12 # 0x6
|
||||
RSPQ_DefineCommand GPUCmd_PushRDP, 12 # 0x5
|
||||
RSPQ_EndOverlayHeader
|
||||
|
||||
.align 4
|
||||
@ -74,12 +73,6 @@ VERTEX_CACHE: .dcb.b SCREEN_VTX_SIZE * 4
|
||||
|
||||
.text
|
||||
|
||||
.func GPUCmd_SetByte
|
||||
GPUCmd_SetByte:
|
||||
jr ra
|
||||
sb a1, %lo(GL_STATE)(a0)
|
||||
.endfunc
|
||||
|
||||
.func GPUCmd_SetShort
|
||||
GPUCmd_SetShort:
|
||||
jr ra
|
||||
@ -344,7 +337,7 @@ GL_TnL:
|
||||
.align 3
|
||||
.func GPUCmd_DrawQuad
|
||||
GPUCmd_DrawQuad:
|
||||
#define vtx a0
|
||||
#define vtx_ptr a0
|
||||
#define mtx_ptr s0
|
||||
#define src_ptr s4
|
||||
|
||||
@ -373,9 +366,12 @@ GPUCmd_DrawQuad:
|
||||
// t5 is used by GL_ClipTriangle
|
||||
|
||||
addi src_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64
|
||||
li vtx, %lo(VERTEX_CACHE)
|
||||
li vtx_ptr, %lo(VERTEX_CACHE)
|
||||
li mtx_ptr, %lo(GPU_MATRIX_MVP)
|
||||
|
||||
ldv vpos.e0, 0, src_ptr // Load v0 X, Y, Z
|
||||
ldv vpos.e4, 16, src_ptr // Load v1 X, Y, Z
|
||||
|
||||
li mtx_ptr, %lo(GPU_MATRIX_MVP)
|
||||
lqv vmtx0_i, 0x00,mtx_ptr // [m00.I m01.I m02.I m03.I m00.I m01.I m02.I m03.I]
|
||||
lqv vmtx1_i, 0x10,mtx_ptr // etc
|
||||
lqv vmtx2_i, 0x20,mtx_ptr
|
||||
@ -388,9 +384,6 @@ GPUCmd_DrawQuad:
|
||||
// ########################
|
||||
// Vertex 0 and 1 transform
|
||||
// ########################
|
||||
ldv vpos.e0, 0, src_ptr // Load v0 X, Y, Z
|
||||
ldv vpos.e4, 16, src_ptr // Load v1 X, Y, Z
|
||||
|
||||
// matrix multiply
|
||||
vmudn v___, vmtx0_f, vpos.xxxxXXXX
|
||||
vmadh v___, vmtx0_i, vpos.xxxxXXXX
|
||||
@ -407,25 +400,25 @@ GPUCmd_DrawQuad:
|
||||
llv vcol.e2, 24, src_ptr // Load v1 RGBA
|
||||
llv vtex.e2, 28, src_ptr // Load v1 U, V
|
||||
|
||||
# 32-bit right shift by 5, to keep the clip space coordinates unscaled
|
||||
// 32-bit right shift by 5, to keep the clip space coordinates unscaled
|
||||
vmudm vcspos_i, vcspos_i, K2048
|
||||
vmadl vcspos_f, vcspos_f, K2048
|
||||
|
||||
slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx
|
||||
slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx
|
||||
slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx
|
||||
slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx
|
||||
slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx_ptr
|
||||
slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx_ptr
|
||||
slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx_ptr
|
||||
slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx_ptr
|
||||
|
||||
# Calculate and store clipping flags against CS.W.
|
||||
# These will be used for trivial rejections.
|
||||
// Calculate and store clipping flags against CS.W.
|
||||
// These will be used for trivial rejections.
|
||||
vch v___, vcspos_i, vcspos_i.wwwwWWWW
|
||||
vcl v___, vcspos_f, vcspos_f.wwwwWWWW
|
||||
|
||||
cfc2 tmp, COP2_CTRL_VCC
|
||||
sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V0_OFFSET, vtx
|
||||
sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V0_OFFSET, vtx
|
||||
sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V1_OFFSET, vtx
|
||||
sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V1_OFFSET, vtx
|
||||
sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V0_OFFSET, vtx_ptr
|
||||
sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V0_OFFSET, vtx_ptr
|
||||
sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V1_OFFSET, vtx_ptr
|
||||
sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V1_OFFSET, vtx_ptr
|
||||
|
||||
// ########################
|
||||
// Vertex 2 and 3 transform
|
||||
@ -437,7 +430,7 @@ GPUCmd_DrawQuad:
|
||||
srl tmp, tmp, 4
|
||||
andi v1_cflags, tmp, XYZ_CLIP_FLAGS
|
||||
|
||||
# matrix multiply
|
||||
// matrix multiply
|
||||
vmudn v___, vmtx0_f, vpos.xxxxXXXX
|
||||
vmadh v___, vmtx0_i, vpos.xxxxXXXX
|
||||
vmadn v___, vmtx1_f, vpos.yyyyYYYY
|
||||
@ -453,31 +446,32 @@ GPUCmd_DrawQuad:
|
||||
llv vcol.e6, 56, src_ptr # Load v3 RGBA
|
||||
llv vtex.e6, 60, src_ptr # Load v3 U, V
|
||||
|
||||
# 32-bit right shift by 5, to keep the clip space coordinates unscaled
|
||||
// 32-bit right shift by 5, to keep the clip space coordinates unscaled
|
||||
vmudm vcspos_i, vcspos_i, K2048
|
||||
vmadl vcspos_f, vcspos_f, K2048
|
||||
|
||||
slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx
|
||||
slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx
|
||||
slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx
|
||||
slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx
|
||||
slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx_ptr
|
||||
slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx_ptr
|
||||
slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx_ptr
|
||||
slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx_ptr
|
||||
|
||||
# Calculate and store clipping flags against CS.W.
|
||||
# These will be used for trivial rejections.
|
||||
// Calculate and store clipping flags against CS.W.
|
||||
// These will be used for trivial rejections.
|
||||
vch v___, vcspos_i, vcspos_i.wwwwWWWW
|
||||
vcl v___, vcspos_f, vcspos_f.wwwwWWWW
|
||||
|
||||
cfc2 tmp, COP2_CTRL_VCC
|
||||
sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V2_OFFSET, vtx
|
||||
sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V2_OFFSET, vtx
|
||||
sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx
|
||||
sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx
|
||||
sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V2_OFFSET, vtx_ptr
|
||||
sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V2_OFFSET, vtx_ptr
|
||||
sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx_ptr
|
||||
sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx_ptr
|
||||
|
||||
andi v2_cflags, tmp, XYZ_CLIP_FLAGS
|
||||
srl tmp, tmp, 4
|
||||
andi v3_cflags, tmp, XYZ_CLIP_FLAGS
|
||||
|
||||
#undef src_ptr
|
||||
#undef vtx
|
||||
#undef vtx_ptr
|
||||
#undef v___
|
||||
|
||||
#undef vmtx0_i
|
||||
|
Loading…
x
Reference in New Issue
Block a user