Optimise vertex upload

This commit is contained in:
UnknownShadow200 2025-05-02 07:16:29 +10:00
parent b73d03b199
commit db9b359b8b
3 changed files with 50 additions and 53 deletions

View File

@ -34,7 +34,7 @@ enum {
GPU_CMD_SET_LONG = 0x3, GPU_CMD_SET_LONG = 0x3,
GPU_CMD_DRAW_QUAD = 0x4, GPU_CMD_DRAW_QUAD = 0x4,
GPU_CMD_UPLOAD_VTX = 0x5, GPU_CMD_UPLOAD_QUAD = 0x5,
GPU_CMD_MATRIX_LOAD = 0x6, GPU_CMD_MATRIX_LOAD = 0x6,
GPU_CMD_PUSH_RDP = 0x7, GPU_CMD_PUSH_RDP = 0x7,
@ -156,41 +156,40 @@ static inline void put_word(rspq_write_t* s, uint16_t v1, uint16_t v2)
rspq_write_arg(s, v2 | (v1 << 16)); rspq_write_arg(s, v2 | (v1 << 16));
} }
static void upload_vertex(uint32_t index, uint8_t cache_index) static void upload_vertex(rspq_write_t* s, uint32_t index)
{ {
rspq_write_t s = rspq_write_begin(gpup_id, GPU_CMD_UPLOAD_VTX, 5);
rspq_write_arg(&s, cache_index * PRIM_VTX_SIZE);
char* ptr = gpu_pointer + index * gpu_stride; char* ptr = gpu_pointer + index * gpu_stride;
float* vtx = (float*)(ptr + 0); float* vtx = (float*)(ptr + 0);
put_word(&s, vtx[0] * (1<<VTX_SHIFT), put_word(s, vtx[0] * (1<<VTX_SHIFT),
vtx[1] * (1<<VTX_SHIFT)); vtx[1] * (1<<VTX_SHIFT));
put_word(&s, vtx[2] * (1<<VTX_SHIFT), put_word(s, vtx[2] * (1<<VTX_SHIFT),
1.0f * (1<<VTX_SHIFT)); 1.0f * (1<<VTX_SHIFT));
uint32_t* col = (uint32_t*)(ptr + 12); // TODO put_byte ? uint32_t* col = (uint32_t*)(ptr + 12);
rspq_write_arg(&s, *col); rspq_write_arg(s, *col);
if (gpu_texturing) { if (gpu_texturing) {
float* tex = (float*)(ptr + 16); float* tex = (float*)(ptr + 16);
put_word(&s, tex[0] * (1<<TEX_SHIFT), put_word(s, tex[0] * (1<<TEX_SHIFT),
tex[1] * (1<<TEX_SHIFT)); tex[1] * (1<<TEX_SHIFT));
} else { } else {
put_word(&s, 0, put_word(s, 0,
0); 0);
} }
rspq_write_end(&s);
} }
static void gpuDrawArrays(uint32_t first, uint32_t count) static void gpuDrawArrays(uint32_t first, uint32_t count)
{ {
for (uint32_t i = 0; i < count; i++) for (uint32_t i = 0; i < count; i += 4)
{ {
uint8_t cache_index = i & 3; rspq_write_t s = rspq_write_begin(gpup_id, GPU_CMD_UPLOAD_QUAD, 17);
upload_vertex(first + i, cache_index); rspq_write_arg(&s, 0);
for (uint32_t j = 0; j < 4; j++)
// Last vertex of quad? {
if ((i & 3) != 3) continue; upload_vertex(&s, first + i + j);
}
rspq_write_end(&s);
// We pass -1 because the triangle can be clipped and split into multiple // We pass -1 because the triangle can be clipped and split into multiple
// triangles. // triangles.

View File

@ -10,7 +10,7 @@
RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x3 RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x3
RSPQ_DefineCommand GPUCmd_DrawQuad, 4 # 0x4 RSPQ_DefineCommand GPUCmd_DrawQuad, 4 # 0x4
RSPQ_DefineCommand GPUCmd_UploadVertex, 20 # 0x5 RSPQ_DefineCommand GPUCmd_UploadQuad, 68 # 0x5
RSPQ_DefineCommand GPUCmd_MatrixLoad, 68 # 0x6 RSPQ_DefineCommand GPUCmd_MatrixLoad, 68 # 0x6
RSPQ_DefineCommand GPUCmd_PushRDP, 12 # 0x7 RSPQ_DefineCommand GPUCmd_PushRDP, 12 # 0x7
@ -38,15 +38,9 @@ VERTEX_CACHE: .dcb.b PRIM_VTX_SIZE * VERTEX_CACHE_SIZE
RSPQ_EndSavedState RSPQ_EndSavedState
.align 4 .align 4
CACHE_OFFSETS: .half 2,4,6,8,10,12,14,16,18
CLIP_CODE_FACTORS: .half 1, 1, GUARD_BAND_FACTOR, GUARD_BAND_FACTOR CLIP_CODE_FACTORS: .half 1, 1, GUARD_BAND_FACTOR, GUARD_BAND_FACTOR
DRAW_TRI_RA: .word 0 DRAW_TRI_RA: .word 0
#define CLIPPING_PLANE_COUNT 6
#define CLIPPING_CACHE_SIZE 9
#define CLIPPING_PLANE_SIZE 8
#define SCREEN_VTX_CS_POSi 0 // X, Y, Z, W (all 32-bit) #define SCREEN_VTX_CS_POSi 0 // X, Y, Z, W (all 32-bit)
#define SCREEN_VTX_CS_POSf 8 // X, Y, Z, W (all 32-bit) #define SCREEN_VTX_CS_POSf 8 // X, Y, Z, W (all 32-bit)
#define SCREEN_VTX_X 16 #define SCREEN_VTX_X 16
@ -131,23 +125,13 @@ GPUCmd_MatrixLoad:
#undef dst #undef dst
.endfunc .endfunc
########################################
# GPUCmd_UploadVertex
#
# Arguments:
# * 0x00 (a0): offset within VERTEX_CACHE
# * 0x04 (a1): object space X, Y (16-bit)
# * 0x08 (a2): object space Z, W (16-bit)
# * 0x0C (a3): RGBA (8-bit each one)
# * 0x10: S, T (16-bit)
#
########################################
.align 3 .align 3
.func GPUCmd_UploadVertex .func GPUCmd_UploadQuad
GPUCmd_UploadVertex: GPUCmd_UploadQuad:
#define vtx a0 #define vtx a0
#define mtx_ptr s0 #define mtx_ptr s0
#define src_ptr s4 #define src_ptr s4
#define vcount s3
#define v___ $v01 #define v___ $v01
@ -171,17 +155,10 @@ GPUCmd_UploadVertex:
#define z e2 #define z e2
#define w e3 #define w e3
addi src_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 16 addi src_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64
li vtx, %lo(VERTEX_CACHE)
li vcount, 4
ldv vpos, 0, src_ptr # Load X, Y, Z, W
llv vcol, 8, src_ptr # Load RGBA
llv vtex, 12, src_ptr # Load U, V
addi vtx, %lo(VERTEX_CACHE)
slv vcol, SCREEN_VTX_RGBA, vtx
slv vtex, SCREEN_VTX_S_T, vtx
# == matrix multiply ==
li mtx_ptr, %lo(GL_MATRIX_MVP) li mtx_ptr, %lo(GL_MATRIX_MVP)
ldv vmtx0_i.e0, 0x00,mtx_ptr ldv vmtx0_i.e0, 0x00,mtx_ptr
ldv vmtx1_i.e0, 0x08,mtx_ptr ldv vmtx1_i.e0, 0x08,mtx_ptr
@ -192,6 +169,12 @@ GPUCmd_UploadVertex:
ldv vmtx2_f.e0, 0x30,mtx_ptr ldv vmtx2_f.e0, 0x30,mtx_ptr
ldv vmtx3_f.e0, 0x38,mtx_ptr ldv vmtx3_f.e0, 0x38,mtx_ptr
upload_vertex:
ldv vpos, 0, src_ptr # Load X, Y, Z, W
llv vcol, 8, src_ptr # Load RGBA
llv vtex, 12, src_ptr # Load U, V
# matrix multiply
vmudn v___, vmtx0_f, vpos.h0 vmudn v___, vmtx0_f, vpos.h0
vmadh v___, vmtx0_i, vpos.h0 vmadh v___, vmtx0_i, vpos.h0
vmadn v___, vmtx1_f, vpos.h1 vmadn v___, vmtx1_f, vpos.h1
@ -201,12 +184,17 @@ GPUCmd_UploadVertex:
vmadn v___, vmtx3_f, vpos.h3 vmadn v___, vmtx3_f, vpos.h3
vmadh vcspos_i, vmtx3_i, vpos.h3 vmadh vcspos_i, vmtx3_i, vpos.h3
vmadn vcspos_f, vzero, vzero vmadn vcspos_f, vzero, vzero
# == end matrix multiply ==
slv vcol, SCREEN_VTX_RGBA, vtx
slv vtex, SCREEN_VTX_S_T, vtx
# 32-bit right shift by 5, to keep the clip space coordinates unscaled # 32-bit right shift by 5, to keep the clip space coordinates unscaled
vmudm vcspos_i, vcspos_i, vshift8.e4 vmudm vcspos_i, vcspos_i, vshift8.e4
vmadl vcspos_f, vcspos_f, vshift8.e4 vmadl vcspos_f, vcspos_f, vshift8.e4
addi vcount, -1
addi src_ptr, 16
sdv vcspos_i, SCREEN_VTX_CS_POSi,vtx sdv vcspos_i, SCREEN_VTX_CS_POSi,vtx
sdv vcspos_f, SCREEN_VTX_CS_POSf,vtx sdv vcspos_f, SCREEN_VTX_CS_POSf,vtx
@ -221,9 +209,13 @@ GPUCmd_UploadVertex:
srl t1, t0, 5 srl t1, t0, 5
andi t0, 0x7 andi t0, 0x7
or t0, t1 or t0, t1
jr ra
sb t0, PRIM_VTX_TRCODE(vtx) sb t0, PRIM_VTX_TRCODE(vtx)
bnez vcount, upload_vertex
addi vtx, PRIM_VTX_SIZE
jr ra
nop
#undef src_ptr #undef src_ptr
#undef vtx #undef vtx

View File

@ -1,3 +1,6 @@
#define CLIPPING_PLANE_COUNT 6
#define CLIPPING_CACHE_SIZE 9
#define CLIPPING_PLANE_SIZE 8
.section .data.gl_clipping .section .data.gl_clipping
@ -10,6 +13,9 @@ CLIP_PLANES:
.half 0, 1, 0, -GUARD_BAND_FACTOR .half 0, 1, 0, -GUARD_BAND_FACTOR
.half 0, 0, 1, -1 .half 0, 0, 1, -1
.align 4
CACHE_OFFSETS: .half 2,4,6,8,10,12,14,16,18
.section .bss.gl_clipping .section .bss.gl_clipping
CLIP_CACHE: .dcb.b SCREEN_VTX_SIZE * CLIPPING_CACHE_SIZE CLIP_CACHE: .dcb.b SCREEN_VTX_SIZE * CLIPPING_CACHE_SIZE