From eba646cebbab06b8a35bbb71237fd6d1df4006da Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Sun, 27 Apr 2025 12:58:17 +1000 Subject: [PATCH] N64 optimised, stage 2 --- misc/n64/Makefile | 13 +- misc/n64/{ => files}/default.zip | Bin misc/n64/gl_constants.h | 49 +++ misc/n64/gpu.c | 398 +++++++++++++++++++++ misc/n64/rsp_gpu.S | 585 +++++++++++++++++++++++++++++++ misc/n64/rsp_gpu_clipping.inc | 374 ++++++++++++++++++++ src/Graphics_N64.c | 20 +- 7 files changed, 1429 insertions(+), 10 deletions(-) rename misc/n64/{ => files}/default.zip (100%) create mode 100644 misc/n64/gl_constants.h create mode 100644 misc/n64/gpu.c create mode 100644 misc/n64/rsp_gpu.S create mode 100644 misc/n64/rsp_gpu_clipping.inc diff --git a/misc/n64/Makefile b/misc/n64/Makefile index f8fcbfe9e..c243c55fd 100644 --- a/misc/n64/Makefile +++ b/misc/n64/Makefile @@ -1,23 +1,28 @@ BUILD_DIR = build-n64 -SOURCE_DIR = src +SOURCE_DIR = misc/n64 N64_ROM_TITLE = "ClassiCube" N64_ROM_RTC = true TARGET = ClassiCube-n64 -N64_MKDFS_ROOT = "misc/n64" +N64_MKDFS_ROOT = "misc/n64/files" CFILES := $(notdir $(wildcard src/*.c)) -OFILES := $(CFILES:.c=.o) +OFILES := $(CFILES:.c=.o) rsp_gpu.o OBJS := $(addprefix $(BUILD_DIR)/,$(OFILES)) CFLAGS := -Wno-error=missing-braces -Wno-error=strict-aliasing -Wno-error=incompatible-pointer-types default: $(TARGET).z64 +$(BUILD_DIR)/%.o: src/%.c + @mkdir -p $(dir $@) + @echo " [CC] $<" + $(CC) -c $(CFLAGS) -o $@ $< + include $(N64_INST)/include/n64.mk $(TARGET).z64: N64_ROM_TITLE = "ClassiCube" $(TARGET).z64: $(BUILD_DIR)/filesystem.dfs -$(BUILD_DIR)/filesystem.dfs: misc/n64/default.zip +$(BUILD_DIR)/filesystem.dfs: misc/n64/files/default.zip $(BUILD_DIR)/ClassiCube-n64.elf: $(OBJS) diff --git a/misc/n64/default.zip b/misc/n64/files/default.zip similarity index 100% rename from misc/n64/default.zip rename to misc/n64/files/default.zip diff --git a/misc/n64/gl_constants.h b/misc/n64/gl_constants.h new file mode 100644 index 000000000..cf6ea3ce6 --- /dev/null +++ 
b/misc/n64/gl_constants.h @@ -0,0 +1,49 @@ +#ifndef __GL_CONSTANTS +#define __GL_CONSTANTS + +#define VERTEX_CACHE_SIZE 16 + +#define MATRIX_SIZE 64 + +#define TEXTURE_BILINEAR_MASK 0x001 +#define TEXTURE_INTERPOLATE_MASK 0x002 +#define TEXTURE_MIPMAP_MASK 0x100 + +#define VTX_SHIFT 5 +#define TEX_SHIFT 8 + +#define FLAG_DEPTH_TEST (1 << 8) +#define FLAG_TEXTURE_ACTIVE (1 << 9) + +#define GUARD_BAND_FACTOR 2 + +#define ASSERT_INVALID_VTX_ID 0x2001 + +#define TEX_COORD_SHIFT 6 +#define HALF_TEXEL 0x0010 + +#define TEX_BILINEAR_SHIFT 13 +#define TEX_BILINEAR_OFFSET_SHIFT 4 + +#define BILINEAR_TEX_OFFSET_SHIFT 9 + +#define TRICMD_ATTR_MASK 0x300 + +#define PRIM_VTX_CS_POSi 0 // X, Y, Z, W (all 32-bit) +#define PRIM_VTX_CS_POSf 8 // X, Y, Z, W (all 32-bit) +#define PRIM_VTX_X 16 // Object space position (16-bit) +#define PRIM_VTX_Y 18 // Object space position (16-bit) +#define PRIM_VTX_Z 20 // Object space position (16-bit) +#define PRIM_VTX_W 22 // Object space position (16-bit) +#define PRIM_VTX_R 24 +#define PRIM_VTX_G 26 +#define PRIM_VTX_B 28 +#define PRIM_VTX_A 30 +#define PRIM_VTX_TEX_S 32 +#define PRIM_VTX_TEX_T 34 +#define PRIM_VTX_TEX_R 36 +#define PRIM_VTX_TEX_Q 38 +#define PRIM_VTX_TRCODE 40 // trivial-reject clipping flags (against -w/+w) +#define PRIM_VTX_SIZE 42 + +#endif diff --git a/misc/n64/gpu.c b/misc/n64/gpu.c new file mode 100644 index 000000000..d84943dab --- /dev/null +++ b/misc/n64/gpu.c @@ -0,0 +1,398 @@ +#include "GL/gl.h" +#include "rspq.h" +#include "rdpq.h" +#include "rdpq_rect.h" +#include "rdpq_mode.h" +#include "rdpq_debug.h" +#include "display.h" +#include "rdp.h" +#include +#include +#include +#include "gl_constants.h" + +// This is a severely cutdown version of libdragon's OpenGL implementation + +static uint32_t glp_id; +//DEFINE_RSP_UCODE(rsp_gpu); +extern uint8_t _binary_build_n64_rsp_gpu_text_bin_start[]; +extern uint8_t _binary_build_n64_rsp_gpu_data_bin_start[]; +extern uint8_t _binary_build_n64_rsp_gpu_meta_bin_start[]; 
+extern uint8_t _binary_build_n64_rsp_gpu_text_bin_end[0];
+extern uint8_t _binary_build_n64_rsp_gpu_data_bin_end[0];
+extern uint8_t _binary_build_n64_rsp_gpu_meta_bin_end[0];
+
+// Descriptor for the rsp_gpu microcode, built from the extern start/end
+// symbols of its text, data and meta sections.
+static rsp_ucode_t rsp_gpu = (rsp_ucode_t){
+    .code = _binary_build_n64_rsp_gpu_text_bin_start,
+    .code_end = _binary_build_n64_rsp_gpu_text_bin_end,
+    .data = _binary_build_n64_rsp_gpu_data_bin_start,
+    .data_end = _binary_build_n64_rsp_gpu_data_bin_end,
+    .meta = _binary_build_n64_rsp_gpu_meta_bin_start,
+    .meta_end = _binary_build_n64_rsp_gpu_meta_bin_end,
+    .name = "rsp_gpu"
+};
+
+// Command indices inside the rsp_gpu overlay. The numbering has to follow
+// the RSPQ_DefineCommand table in rsp_gpu.S (0x0 .. 0x8).
+enum {
+    GPU_CMD_SET_FLAG = 0x0,
+    GPU_CMD_SET_BYTE = 0x1,
+    GPU_CMD_SET_SHORT = 0x2,
+    GPU_CMD_SET_WORD = 0x3,
+    GPU_CMD_SET_LONG = 0x4,
+
+    GPU_CMD_DRAW_TRI = 0x5,
+    GPU_CMD_UPLOAD_VTX = 0x6,
+
+    GPU_CMD_MATRIX_LOAD = 0x7,
+    GPU_CMD_PRE_INIT_PIPE = 0x8,
+};
+
+// Indices into state_arrays[]
+enum {
+    ATTRIB_VERTEX,
+    ATTRIB_COLOR,
+    ATTRIB_TEXCOORD,
+    ATTRIB_COUNT
+};
+
+// CPU-side copy of the viewport transform (x, y, z scale and offset)
+typedef struct {
+    GLfloat scale[3];
+    GLfloat offset[3];
+} gl_viewport_t;
+
+// 4x4 matrix as uploaded to the RSP: 16 integer halves followed by
+// 16 fractional halves (see gl_matrix_write)
+typedef struct {
+    int16_t i[4][4];
+    uint16_t f[4][4];
+} gl_matrix_srv_t;
+_Static_assert(sizeof(gl_matrix_srv_t) == MATRIX_SIZE, "Matrix size does not match");
+
+// Staging buffer that packs bytes into 32-bit words before they are handed
+// to the rspq write cursor `w`. `buffer_head` is the next free byte in `bytes`.
+typedef struct {
+    rspq_write_t w;
+    union {
+        uint8_t bytes[4];
+        uint32_t word;
+    };
+    uint32_t buffer_head;
+} gl_cmd_stream_t;
+
+// Client-side vertex attribute array (base pointer + byte stride)
+typedef struct {
+    GLsizei stride;
+    const GLvoid *pointer;
+    bool enabled;
+} gl_array_t;
+
+// Field order follows the GL_STATE block declared in rsp_gpu.S; offsetof()
+// on this struct is what the GPU_CMD_SET_* commands receive as their offset.
+typedef struct {
+    gl_matrix_srv_t mvp_matrix;
+    int16_t viewport_scale[4];
+    int16_t viewport_offset[4];
+    uint32_t flags;
+    uint16_t tex_size[2];
+    uint16_t tex_offset[2];
+    uint16_t tri_cmd;
+    uint16_t tri_cull;
+} __attribute__((aligned(8), packed)) gl_server_state_t;
+
+// Returns the address of element `index` in attribute array `src`.
+// The offset is applied to a byte pointer: arithmetic directly on
+// `const void *` is a GNU extension rather than standard C.
+static inline const void *gl_get_attrib_element(const gl_array_t *src, uint32_t index)
+{
+    return (const uint8_t *)src->pointer + index * src->stride;
+}
+
+// Opens a command of `size` words and returns a byte stream for filling it.
+// NOTE(review): buffer_head starts at 2, presumably because the upper half
+// of the first word is reserved for the command header - confirm.
+static inline gl_cmd_stream_t gl_cmd_stream_begin(uint32_t ovl_id, uint32_t cmd_id, int size)
+{
+    return (gl_cmd_stream_t) {
+        .w = rspq_write_begin(ovl_id, cmd_id, size),
+        .buffer_head = 2,
+
};
+}
+
+// Flushes the staged 32-bit word to the command and resets the staging buffer
+static inline void gl_cmd_stream_commit(gl_cmd_stream_t *s)
+{
+    rspq_write_arg(&s->w, s->word);
+    s->buffer_head = 0;
+    s->word = 0;
+}
+
+// Appends a 16-bit value, high byte first, and flushes once four bytes
+// have been staged
+static inline void gl_cmd_stream_put_half(gl_cmd_stream_t *s, uint16_t v)
+{
+    s->bytes[s->buffer_head++] = v >> 8;
+    s->bytes[s->buffer_head++] = v & 0xFF;
+
+    if (s->buffer_head == sizeof(uint32_t)) {
+        gl_cmd_stream_commit(s);
+    }
+}
+
+// Flushes any partially filled word, then closes the command
+static inline void gl_cmd_stream_end(gl_cmd_stream_t *s)
+{
+    if (s->buffer_head > 0) {
+        gl_cmd_stream_commit(s);
+    }
+
+    rspq_write_end(&s->w);
+}
+
+// Sets or clears `flag` in the 32-bit word at `offset` of the RSP state.
+// Bit 0 of the first argument selects set (1) or clear (0); when clearing,
+// the inverted mask is sent so GLCmd_SetFlag can simply AND it in.
+__attribute__((always_inline))
+static inline void gl_set_flag_raw(uint32_t offset, uint32_t flag, bool value)
+{
+    rspq_write(glp_id, GPU_CMD_SET_FLAG, offset | value, value ? flag : ~flag);
+}
+
+// Sets or clears `flag` in gl_server_state_t.flags
+__attribute__((always_inline))
+static inline void gl_set_flag(uint32_t flag, bool value)
+{
+    gl_set_flag_raw(offsetof(gl_server_state_t, flags), flag, value);
+}
+
+// Stores an 8-bit value at byte `offset` of the RSP state
+__attribute__((always_inline))
+static inline void gl_set_byte(uint32_t offset, uint8_t value)
+{
+    rspq_write(glp_id, GPU_CMD_SET_BYTE, offset, value);
+}
+
+// Stores a 16-bit value at byte `offset` of the RSP state
+__attribute__((always_inline))
+static inline void gl_set_short(uint32_t offset, uint16_t value)
+{
+    rspq_write(glp_id, GPU_CMD_SET_SHORT, offset, value);
+}
+
+// Stores a 32-bit value at byte `offset` of the RSP state
+__attribute__((always_inline))
+static inline void gl_set_word(uint32_t offset, uint32_t value)
+{
+    rspq_write(glp_id, GPU_CMD_SET_WORD, offset, value);
+}
+
+// Stores a 64-bit value at byte `offset` of the RSP state, sent as two
+// 32-bit arguments (high word first)
+__attribute__((always_inline))
+static inline void gl_set_long(uint32_t offset, uint64_t value)
+{
+    rspq_write(glp_id, GPU_CMD_SET_LONG, offset, value >> 32, value & 0xFFFFFFFF);
+}
+
+// Queues a triangle made of three vertex cache slots; each index is turned
+// into a byte offset by multiplying with PRIM_VTX_SIZE
+static inline void glpipe_draw_triangle(int i0, int i1, int i2)
+{
+    // We pass -1 because the triangle can be clipped and split into multiple
+    // triangles.
+    rdpq_write(-1, glp_id, GPU_CMD_DRAW_TRI,
+        (i0*PRIM_VTX_SIZE),
+        ((i1*PRIM_VTX_SIZE)<<16) | (i2*PRIM_VTX_SIZE)
+    );
+}
+
+
+static gl_viewport_t state_viewport;
+static gl_array_t state_arrays[ATTRIB_COUNT];
+
+// Registers the rsp_gpu overlay and uploads the default depth range
+void gl_init()
+{
+    glp_id = rspq_overlay_register(&rsp_gpu);
+    glDepthRange(0, 1);
+}
+
+// Waits for the RSP queue to drain, then unregisters the overlay
+void gl_close()
+{
+    rspq_wait();
+    rspq_overlay_unregister(glp_id);
+}
+
+// Maps a GL capability onto its RSP state flag. Capabilities other than
+// GL_DEPTH_TEST and GL_TEXTURE_2D are ignored.
+void gl_set_flag2(GLenum target, bool value)
+{
+    switch (target) {
+    case GL_DEPTH_TEST:
+        gl_set_flag(FLAG_DEPTH_TEST, value);
+        break;
+    case GL_TEXTURE_2D:
+        gl_set_flag(FLAG_TEXTURE_ACTIVE, value);
+        break;
+    default:
+        // not tracked by this cut-down pipeline
+        break;
+    }
+}
+
+void glEnable(GLenum target)
+{
+    gl_set_flag2(target, true);
+}
+
+void glDisable(GLenum target)
+{
+    gl_set_flag2(target, false);
+}
+
+// Uploads the texture size as one word: width in the high half, height in
+// the low half.
+void glTexSizeN64(uint16_t width, uint16_t height)
+{
+    // Widen before shifting: `width` promotes to int, and shifting a value
+    // >= 0x8000 left by 16 would overflow a signed int (undefined behaviour).
+    gl_set_word(offsetof(gl_server_state_t, tex_size[0]), ((uint32_t)width << 16) | height);
+}
+
+
+// Writes `count` 16-bit values as count/2 words, two halves per word with
+// the earlier element in the high half. `count` must be even, because the
+// elements are consumed in pairs.
+static inline void write_shorts(rspq_write_t *w, const uint16_t *s, uint32_t count)
+{
+    for (uint32_t i = 0; i < count; i += 2)
+    {
+        uint32_t packed = ((uint32_t)s[i] << 16) | (uint32_t)s[i+1];
+        rspq_write_arg(w, packed);
+    }
+}
+
+// Converts a 16-element float matrix to 16.16 fixed point and writes the
+// 16 integer halves followed by the 16 fractional halves.
+// NOTE(review): elements whose magnitude does not fit in 16.16 are not
+// range checked before the float -> int32_t conversion.
+static inline void gl_matrix_write(rspq_write_t *w, const GLfloat *m)
+{
+    uint16_t integer[16];
+    uint16_t fraction[16];
+
+    for (uint32_t i = 0; i < 16; i++)
+    {
+        int32_t fixed = m[i] * (1<<16);
+        integer[i] = (uint16_t)((fixed & 0xFFFF0000) >> 16);
+        fraction[i] = (uint16_t)(fixed & 0x0000FFFF);
+    }
+
+    write_shorts(w, integer, 16);
+    write_shorts(w, fraction, 16);
+}
+
+// Replaces the MVP matrix on the RSP. The command is 17 words long: one
+// leading word plus 16 words of matrix data (68 bytes, matching the
+// GLCmd_MatrixLoad entry in rsp_gpu.S).
+void glLoadMatrixf(const GLfloat *m)
+{
+    rspq_write_t w = rspq_write_begin(glp_id, GPU_CMD_MATRIX_LOAD, 17);
+    rspq_write_arg(&w, false); // no multiply
+    gl_matrix_write(&w, m);
+    rspq_write_end(&w);
+}
+
+// Sends one vertex to slot `cache_index` of the RSP vertex cache
+static void upload_vertex(const gl_array_t *arrays, uint32_t index, uint8_t cache_index)
+{
+    gl_cmd_stream_t s = gl_cmd_stream_begin(glp_id, GPU_CMD_UPLOAD_VTX, 6);
+    gl_cmd_stream_put_half(&s, cache_index * PRIM_VTX_SIZE);
+
+    const float* vtx = gl_get_attrib_element(&arrays[ATTRIB_VERTEX],
index);
+    gl_cmd_stream_put_half(&s, vtx[0] * (1<stride = stride;
+    array->pointer = pointer;
+}
+
+// Records the texture coordinate array; `size` and `type` are not stored
+void glTexCoordPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *pointer)
+{
+    gl_array_t *array = &state_arrays[ATTRIB_TEXCOORD];
+    array->stride = stride;
+    array->pointer = pointer;
+}
+
+// Records the colour array; `size` and `type` are not stored
+void glColorPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *pointer)
+{
+    gl_array_t *array = &state_arrays[ATTRIB_COLOR];
+    array->stride = stride;
+    array->pointer = pointer;
+}
+
+// Marks one of the ATTRIB_* arrays as enabled or disabled
+void gl_set_array_enabled(int array_type, bool enabled)
+{
+    state_arrays[array_type].enabled = enabled;
+}
+
+void glEnableClientState(GLenum array)
+{
+    gl_set_array_enabled(gl_array_type_from_enum(array), true);
+}
+
+void glDisableClientState(GLenum array)
+{
+    gl_set_array_enabled(gl_array_type_from_enum(array), false);
+}
+
+// Draws `count` vertices starting at `first`. `mode` is not inspected here.
+void glDrawArrays(GLenum mode, GLint first, GLsizei count)
+{
+    rspq_write(glp_id, GPU_CMD_PRE_INIT_PIPE);
+    gl_rsp_draw_arrays(first, count);
+}
+
+// Stores the depth range and uploads the Z viewport scale/offset (element 2
+// of viewport_scale / viewport_offset).
+// NOTE(review): Z is scaled by 4 here but by SCREEN_Z_SCALE in glViewport,
+// which overwrites the same two slots - confirm which one is intended.
+void glDepthRange(GLclampd n, GLclampd f)
+{
+    state_viewport.scale[2] = (f - n) * 0.5f;
+    state_viewport.offset[2] = n + (f - n) * 0.5f;
+
+    gl_set_short(
+        offsetof(gl_server_state_t, viewport_scale) + sizeof(int16_t) * 2,
+        state_viewport.scale[2] * 4);
+    gl_set_short(
+        offsetof(gl_server_state_t, viewport_offset) + sizeof(int16_t) * 2,
+        state_viewport.offset[2] * 4);
+}
+
+// Stores the viewport transform and uploads it to the RSP as two packed
+// 64-bit values (scale and offset).
+void glViewport(GLint x, GLint y, GLsizei w, GLsizei h)
+{
+    state_viewport.scale[0] = w * 0.5f;
+    state_viewport.scale[1] = h * -0.5f;
+    state_viewport.offset[0] = x + w * 0.5f;
+    state_viewport.offset[1] = y + h * 0.5f;
+
+    // Screen coordinates are s13.2
+    #define SCREEN_XY_SCALE 4.0f
+    #define SCREEN_Z_SCALE 32767.0f
+
+    // * 2.0f to compensate for RSP reciprocal missing 1 bit
+    uint16_t scale_x = state_viewport.scale[0] * SCREEN_XY_SCALE * 2.0f;
+    // scale[1] is negative for a positive height. Converting a negative
+    // float directly to an unsigned type is undefined behaviour, so go
+    // through int16_t first to get the two's complement bit pattern.
+    uint16_t scale_y = (uint16_t)(int16_t)(state_viewport.scale[1] * SCREEN_XY_SCALE * 2.0f);
+    uint16_t scale_z = state_viewport.scale[2] * SCREEN_Z_SCALE * 2.0f;
+
+    uint16_t offset_x =
state_viewport.offset[0] * SCREEN_XY_SCALE; + uint16_t offset_y = state_viewport.offset[1] * SCREEN_XY_SCALE; + uint16_t offset_z = state_viewport.offset[2] * SCREEN_Z_SCALE; + + gl_set_long( + offsetof(gl_server_state_t, viewport_scale), + ((uint64_t)scale_x << 48) | ((uint64_t)scale_y << 32) | ((uint64_t)scale_z << 16)); + gl_set_long( + offsetof(gl_server_state_t, viewport_offset), + ((uint64_t)offset_x << 48) | ((uint64_t)offset_y << 32) | ((uint64_t)offset_z << 16)); +} + +void glCullFace(GLenum mode) +{ + // 1 = cull backfaces + // 2 = don't cull + gl_set_short(offsetof(gl_server_state_t, tri_cull), mode ? 1 : 2); +} diff --git a/misc/n64/rsp_gpu.S b/misc/n64/rsp_gpu.S new file mode 100644 index 000000000..f910bbec6 --- /dev/null +++ b/misc/n64/rsp_gpu.S @@ -0,0 +1,585 @@ +#include +#include +#include "gl_constants.h" + .data + + RSPQ_BeginOverlayHeader + RSPQ_DefineCommand GLCmd_SetFlag, 8 # 0x0 + RSPQ_DefineCommand GLCmd_SetByte, 8 # 0x1 + RSPQ_DefineCommand GLCmd_SetShort, 8 # 0x2 + RSPQ_DefineCommand GLCmd_SetWord, 8 # 0x3 + RSPQ_DefineCommand GLCmd_SetLong, 12 # 0x4 + + RSPQ_DefineCommand GLCmd_DrawTriangle, 8 # 0x5 + RSPQ_DefineCommand GLCmd_UploadVertex, 24 # 0x6 + + RSPQ_DefineCommand GLCmd_MatrixLoad, 68 # 0x7 + RSPQ_DefineCommand GLCmd_PreInitPipe, 4 # 0x8 + RSPQ_EndOverlayHeader + + .align 4 +BANNER0: .ascii " RSP OpenGL T&L " +BANNER1: .ascii "Rasky & Snacchus" + + RSPQ_BeginSavedState + +GL_STATE: + # This is the GL state that is also used by the pipeline. 
+ GL_MATRIX_MVP: .ds.b MATRIX_SIZE + GL_VIEWPORT_SCALE: .half 0,0,0,0 + GL_VIEWPORT_OFFSET: .half 0,0,0,0 + GL_STATE_FLAGS: .word 0 + GL_STATE_TEX_SIZE: .half 0,0 + GL_STATE_TEX_OFFSET: .half 0,0 + GL_TRI_CMD: .half 0 + GL_TRI_CULL: .half 0 + + .align 3 +VERTEX_CACHE: .dcb.b PRIM_VTX_SIZE * VERTEX_CACHE_SIZE + + RSPQ_EndSavedState + + .align 4 +CACHE_OFFSETS: .half 2,4,6,8,10,12,14,16,18 + +CLIP_CODE_FACTORS: .half 1, 1, GUARD_BAND_FACTOR, GUARD_BAND_FACTOR + +#define CLIPPING_PLANE_COUNT 6 +#define CLIPPING_CACHE_SIZE 9 +#define CLIPPING_PLANE_SIZE 8 + +#define SCREEN_VTX_CS_POSi 0 // X, Y, Z, W (all 32-bit) +#define SCREEN_VTX_CS_POSf 8 // X, Y, Z, W (all 32-bit) +#define SCREEN_VTX_X 16 +#define SCREEN_VTX_Y 18 +#define SCREEN_VTX_Z 20 +#define SCREEN_VTX_CLIP_CODE 22 +#define SCREEN_VTX_PADDING 23 +#define SCREEN_VTX_RGBA 24 +#define SCREEN_VTX_S 28 +#define SCREEN_VTX_T 30 +#define SCREEN_VTX_W 32 // FIXME: this is duplicated in CS_POS +#define SCREEN_VTX_INVW 36 // 32-bit +#define SCREEN_VTX_SIZE 40 + + .text + + ############################################################# + # GLCmd_SetFlag + # + # Sets or clears a flag + # + # ARGS: + # a0: Bit 31..24: Command id + # Bit 11..2: Offset of flag value in GL_STATE + # Bit 0: If 1, set the flag, otherwise clear it + # a1: flag mask (inverted if clearing) + ############################################################# + .func GLCmd_SetFlag +GLCmd_SetFlag: + li t0, ~0x3 + and t0, a0, t0 + andi t1, a0, 1 + lw t2, %lo(GL_STATE)(t0) + beqz t1, 1f + and t3, t2, a1 + or t3, t2, a1 + +1: + jr ra + sw t3, %lo(GL_STATE)(t0) + .endfunc + + .func GLCmd_SetByte +GLCmd_SetByte: + jr ra + sb a1, %lo(GL_STATE)(a0) + .endfunc + + .func GLCmd_SetShort +GLCmd_SetShort: + jr ra + sh a1, %lo(GL_STATE)(a0) + .endfunc + + .func GLCmd_SetWord +GLCmd_SetWord: + jr ra + sw a1, %lo(GL_STATE) + 0(a0) + .endfunc + + .func GLCmd_SetLong +GLCmd_SetLong: + sw a2, %lo(GL_STATE) + 4(a0) + jr ra + sw a1, %lo(GL_STATE) + 0(a0) + .endfunc + + + 
########################################
+    # GLCmd_UploadVertex
+    #
+    # Copies one vertex from the command into VERTEX_CACHE and transforms
+    # its position by GL_MATRIX_MVP.
+    #
+    # Arguments (byte offsets within the 24-byte command):
+    #   * 0x00 (a0): offset within VERTEX_CACHE
+    #   * 0x04 (a1): object space X, Y (16-bit)
+    #   * 0x08 (a2): object space Z, W (16-bit)
+    #   * 0x0C (a3): R, G (16-bit each)
+    #   * 0x10:      B, A (16-bit each)
+    #   * 0x14:      S, T (16-bit)
+    #
+    ########################################
+    .align 3
+    .func GLCmd_UploadVertex
+GLCmd_UploadVertex:
+    #define vtx a0
+    #define mtx_ptr s0
+    #define cmd_ptr s4
+
+    #define v___ $v01
+
+    #define vmtx0_i $v16 // m00 m01 m02 m03
+    #define vmtx0_f $v17
+    #define vmtx1_i $v18 // m10 m11 m12 m13
+    #define vmtx1_f $v19
+    #define vmtx2_i $v20 // m20 m21 m22 m23
+    #define vmtx2_f $v21
+    #define vmtx3_i $v22 // m30 m31 m32 m33
+    #define vmtx3_f $v23
+
+    #define vpos $v24
+    #define vcol $v25
+    #define vtex $v26
+    #define vcspos_i $v28
+    #define vcspos_f $v29
+
+    #define x e0
+    #define y e1
+    #define z e2
+    #define w e3
+
+    # cmd_ptr = buffer pointer - command size + 4
+    # (presumably the second word of this command - confirm against rspq)
+    addi cmd_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) + 4
+    sub cmd_ptr, rspq_cmd_size
+
+    ldv vpos, 0, cmd_ptr # Load X, Y, Z, W
+    ldv vcol, 8, cmd_ptr # Load R, G, B, A
+    llv vtex, 16, cmd_ptr # Load U, V
+
+    # Keep the untransformed attributes in the cache entry
+    addi vtx, %lo(VERTEX_CACHE)
+    sdv vpos, PRIM_VTX_X ,vtx
+    sdv vcol, PRIM_VTX_R ,vtx
+    sdv vtex, PRIM_VTX_TEX_S ,vtx
+
+# == matrix multiply ==
+    # Integer parts are at 0x00-0x1F of the matrix, fractional parts at 0x20-0x3F
+    li mtx_ptr, %lo(GL_MATRIX_MVP)
+    ldv vmtx0_i.e0, 0x00,mtx_ptr
+    ldv vmtx1_i.e0, 0x08,mtx_ptr
+    ldv vmtx2_i.e0, 0x10,mtx_ptr
+    ldv vmtx3_i.e0, 0x18,mtx_ptr
+    ldv vmtx0_f.e0, 0x20,mtx_ptr
+    ldv vmtx1_f.e0, 0x28,mtx_ptr
+    ldv vmtx2_f.e0, 0x30,mtx_ptr
+    ldv vmtx3_f.e0, 0x38,mtx_ptr
+
+    vmudn v___, vmtx0_f, vpos.h0
+    vmadh v___, vmtx0_i, vpos.h0
+    vmadn v___, vmtx1_f, vpos.h1
+    vmadh v___, vmtx1_i, vpos.h1
+    vmadn v___, vmtx2_f, vpos.h2
+    vmadh v___, vmtx2_i, vpos.h2
+    vmadn v___, vmtx3_f, vpos.h3
+    vmadh vcspos_i, vmtx3_i, vpos.h3
+    vmadn vcspos_f, vzero, vzero
+# == end matrix multiply ==
+
+    # 32-bit right shift by 5, to keep the clip space coordinates unscaled
+    vmudm
vcspos_i, vcspos_i, vshift8.e4 + vmadl vcspos_f, vcspos_f, vshift8.e4 + + sdv vcspos_i, PRIM_VTX_CS_POSi,vtx + sdv vcspos_f, PRIM_VTX_CS_POSf,vtx + + # Calculate and store clipping flags against CS.W. + # These will be used for trivial rejections. + vch v___, vcspos_i, vcspos_i.w + vcl v___, vcspos_f, vcspos_f.w + cfc2 t0, COP2_CTRL_VCC + andi t0, 0x707 # Isolate X/Y/Z flags + + # Compress flags to 8 bit + srl t1, t0, 5 + andi t0, 0x7 + or t0, t1 + jr ra + sb t0, PRIM_VTX_TRCODE(vtx) + + #undef cmd_ptr + #undef vtx + #undef in_xy + #undef in_zw + #undef in_rgba + #undef vtx_id + + #undef x + #undef y + #undef z + #undef w + + #undef v___ + + #undef vmtx0_i + #undef vmtx0_f + #undef vmtx1_i + #undef vmtx1_f + #undef vmtx2_i + #undef vmtx2_f + #undef vmtx3_i + #undef vmtx3_f + + #undef vpos + #undef vcspos_i + #undef vcspos_f + + .endfunc + + ################################################################ + # GL_CalcScreenSpace + # + # Args: + # s3 = Destination vertex address + # $v02 = Clip space position (fractional part) + # $v03 = Clip space position (integer part) + # + ################################################################ + .func GL_CalcScreenSpace +GL_CalcScreenSpace: + #define dst s3 + #define vcspos_f $v02 + #define vcspos_i $v03 + #define vinvw_f $v23 + #define vinvw_i $v24 + #define vviewscale $v25 + #define vviewoff $v26 + #define vscreenpos_i $v27 + #define vscreenpos_f $v28 + #define v___ $v29 + #define w e3 + + # Calculate 32-bit inverse W + # TODO: NR? 
+ vrcph vinvw_i.w, vcspos_i.w + vrcpl vinvw_f.w, vcspos_f.w + vrcph vinvw_i.w, vzero.e0 + + # Calculate screenspace coords + li t0, %lo(GL_VIEWPORT_SCALE) + ldv vviewscale, 0,t0 + ldv vviewoff, 8,t0 + + vmudl v___, vcspos_f, vinvw_f.w + vmadm v___, vcspos_i, vinvw_f.w + vmadn vscreenpos_f, vcspos_f, vinvw_i.w + vmadh vscreenpos_i, vcspos_i, vinvw_i.w + + vmudn vscreenpos_f, vscreenpos_f, vviewscale + vmadh vscreenpos_i, vscreenpos_i, vviewscale + vadd vscreenpos_i, vviewoff + + sdv vscreenpos_i, SCREEN_VTX_X ,dst + ssv vcspos_i.w, SCREEN_VTX_W+0 ,dst + ssv vcspos_f.w, SCREEN_VTX_W+2 ,dst + ssv vinvw_i.w, SCREEN_VTX_INVW+0,dst + ssv vinvw_f.w, SCREEN_VTX_INVW+2,dst + jr ra + sb zero, SCREEN_VTX_PADDING(dst) + + #undef dst + #undef vcspos_f + #undef vcspos_i + #undef vinvw_f + #undef vinvw_i + #undef vviewscale + #undef vviewoff + #undef vscreenpos_i + #undef vscreenpos_f + #undef v___ + #undef w + + .endfunc + + ################################################################ + # GL_CalcClipCodes + # + # Args: + # s3 = Destination vertex address + # $v02 = Clip space position (fractional part) + # $v03 = Clip space position (integer part) + # + ################################################################ + .func GL_CalcClipCodes +GL_CalcClipCodes: + #define dst s3 + #define vcspos_f $v02 + #define vcspos_i $v03 + #define vguard_f $v27 + #define vguard_i $v28 + #define v___ $v29 + #define w e3 + + li t0, %lo(CLIP_CODE_FACTORS) + ldv vguard_i, 0,t0 + + vmudn vguard_f, vcspos_f, vguard_i + vmadh vguard_i, vcspos_i, vguard_i + + vch v___, vguard_i, vguard_i.w + vcl v___, vguard_f, vguard_f.w + cfc2 t0, COP2_CTRL_VCC + andi t0, 0x707 + srl t1, t0, 5 + andi t0, 0x7 + or t0, t1 + jr ra + sb t0, SCREEN_VTX_CLIP_CODE(dst) + + #undef dst + #undef vcspos_i + #undef vcspos_f + #undef vguard_i + #undef vguard_f + #undef v___ + #undef w + + .endfunc + + ################################################################ + # GL_TnL + # + # Args: + # s3 = address of the vertex in 
DMEM (usually within VERTEX_CACHE) + # + ################################################################ + .func GL_TnL +GL_TnL: + #define tmp_ptr s2 + #define vtx s3 + #define s e0 + move ra2, ra + + #define v___ $v01 + #define vrgba $v04 + + ldv vrgba.e0, PRIM_VTX_R, vtx # R + G + B + A + ldv vrgba.e4, PRIM_VTX_R, vtx # R + G + B + A + + #define vtexsize $v06 + #define vtexoffset $v07 + #define vstrq $v08 + + ldv vstrq, PRIM_VTX_TEX_S,vtx # S + T + R + Q + suv vrgba, SCREEN_VTX_RGBA,vtx + + li s1, %lo(GL_STATE_TEX_SIZE) + llv vtexsize.s, 0,s1 + llv vtexoffset.s, 4,s1 + + #define vinvq_i $v26 + #define vinvq_f $v27 + #define vstrq_i $v28 + #define vstrq_f $v29 + #define q e3 + + # Scale texcoord by texsize and subtract offset (to correct for bilinear sampling if active) + #vmudn v___, vstrq, vtexsize + # vmadh vstrq, vtexoffset, K1 + + #vmudn v___, vstrq, vtexsize + #vmadh vstrq, vtexoffset, K1 + #vmudl vstrq, vstrq, vtexsize + + vmudh v___, vstrq, vtexsize + vsar vstrq_i, COP2_ACC_HI + vsar vstrq_f, COP2_ACC_MD + + vmudl vstrq_f, vstrq_f, K8192 + vmadm vstrq_i, vstrq_i, K8192 + vmadn vstrq, vzero, vzero + + #undef vinvq_i + #undef vinvq_f + #undef vstrq_i + #undef vstrq_f + #undef q + + lbu t0, PRIM_VTX_TRCODE(vtx) + + #define vcspos_f $v02 + #define vcspos_i $v03 + + ldv vcspos_f, PRIM_VTX_CS_POSf,vtx + ldv vcspos_i, PRIM_VTX_CS_POSi,vtx + + # Mark this vertex as having T&L applied + ori t0, 0x80 + sb t0, PRIM_VTX_TRCODE(vtx) + + jal GL_CalcScreenSpace + slv vstrq.s, SCREEN_VTX_S,vtx + + j GL_CalcClipCodes + move ra, ra2 + + #undef vcspos_f + #undef vcspos_i + #undef vtexsize + #undef vtexoffset + + #undef vtx + + #undef v___ + #undef vrgba + #undef vst + #undef s + + .endfunc + + + ################################################################ + # GLCmd_DrawTriangle + # + # Arguments: + # a0: Bit 31..24: Command id + # Bit 11..0: Offset into vertex cache of vtx1 + # a1: Bit 27..16: Offset into vertex cache of vtx2 + # Bit 11..0: Offset into vertex cache of 
vtx3 + # + ################################################################ + .func GLCmd_DrawTriangle +GLCmd_DrawTriangle: + #define vtx1 a1 + #define vtx2 a2 + #define vtx3 a3 + #define trcode1 t6 + #define trcode2 t7 + #define trcode3 t8 + + addi vtx3, a1, %lo(VERTEX_CACHE) + srl vtx2, a1, 16 + addi vtx2, %lo(VERTEX_CACHE) + addi vtx1, a0, %lo(VERTEX_CACHE) + + # Trivial reject: if all the vertices are out of the same plane (at least one), + # the triangle is out of the viewport. + # NOTE: This deliberately uses lb instead of lbu so the sign bit is extended. + # The MSB of each TR-code is a bit flag that is set if the vertex has already + # had T&L applied once. + lb trcode1, PRIM_VTX_TRCODE(vtx1) + lb trcode2, PRIM_VTX_TRCODE(vtx2) + lb trcode3, PRIM_VTX_TRCODE(vtx3) + and t0, trcode1, trcode2 + and t0, trcode3 + andi t0, 0x3F + bnez t0, JrRa + nop + + # Perform T&L for each vertex if we haven't already + bgezal trcode1, GL_TnL + move s3, vtx1 + + bgezal trcode2, GL_TnL + move s3, vtx2 + + bgezal trcode3, GL_TnL + move s3, vtx3 + + lbu t0, SCREEN_VTX_CLIP_CODE(vtx1) + lbu t1, SCREEN_VTX_CLIP_CODE(vtx2) + lbu t2, SCREEN_VTX_CLIP_CODE(vtx3) + or t5, t0, t1 + or t5, t2 + + move s1, zero + beqz t5, gl_draw_single_triangle + move s2, zero + + jal GL_ClipTriangle + nop + + beqz v1, gl_draw_triangle_end + addi s2, -6 + lhu s5, 0(s1) +gl_draw_clipped_triangles_loop: + move vtx1, s5 + lhu vtx2, 2(s1) + lhu vtx3, 4(s1) + +gl_draw_single_triangle: + addi vtx1, SCREEN_VTX_X + addi vtx2, SCREEN_VTX_X + addi vtx3, SCREEN_VTX_X + + lhu a0, %lo(GL_TRI_CMD) + lh v0, %lo(GL_TRI_CULL) + jal RDPQ_Triangle + li s3, %lo(RDPQ_CMD_STAGING) + + jal RDPQ_Send + li s4, %lo(RDPQ_CMD_STAGING) + + blt s1, s2, gl_draw_clipped_triangles_loop + addi s1, 2 + +gl_draw_triangle_end: + j RSPQ_Loop + nop + + #undef vtx1 + #undef vtx2 + #undef vtx3 + + .endfunc + + +GLCmd_MatrixLoad: + #define src s6 + #define dst s7 + + #define vrhs01_i $v02 + #define vrhs01_f $v03 + #define vrhs23_i $v04 + #define 
vrhs23_f $v05 + + addi src, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64 + addi dst, zero, %lo(GL_MATRIX_MVP) + + # Load the matrix from command parameters (misaligned) + lqv vrhs01_i, 0x00,src + lrv vrhs01_i, 0x10,src + lqv vrhs23_i, 0x10,src + lrv vrhs23_i, 0x20,src + lqv vrhs01_f, 0x20,src + lrv vrhs01_f, 0x30,src + lqv vrhs23_f, 0x30,src + lrv vrhs23_f, 0x40,src + + sqv vrhs01_i, 0x00,dst + sqv vrhs23_i, 0x10,dst + sqv vrhs01_f, 0x20,dst + jr ra + sqv vrhs23_f, 0x30,dst + + .func GLCmd_PreInitPipe +GLCmd_PreInitPipe: + #define state_flags k1 + #define tri_cmd t4 + + lw tri_cmd, %lo(GL_STATE_FLAGS) + ori tri_cmd, 0xCC00 + jr ra + sh tri_cmd, %lo(GL_TRI_CMD) + + #undef tri_cmd + #undef state_flags + .endfunc + +#include "rsp_gpu_clipping.inc" +#include diff --git a/misc/n64/rsp_gpu_clipping.inc b/misc/n64/rsp_gpu_clipping.inc new file mode 100644 index 000000000..90d753dad --- /dev/null +++ b/misc/n64/rsp_gpu_clipping.inc @@ -0,0 +1,374 @@ + + .section .data.gl_clipping + + .align 4 +CLIP_PLANES: + .half 1, 0, 0, GUARD_BAND_FACTOR + .half 0, 1, 0, GUARD_BAND_FACTOR + .half 0, 0, 1, 1 + .half 1, 0, 0, -GUARD_BAND_FACTOR + .half 0, 1, 0, -GUARD_BAND_FACTOR + .half 0, 0, 1, -1 + + .section .bss.gl_clipping + +CLIP_CACHE: .dcb.b SCREEN_VTX_SIZE * CLIPPING_CACHE_SIZE +CLIP_CACHE_END: + +CLIP_LISTS: + CLIP_LIST0: .dcb.w CLIPPING_CACHE_SIZE + CLIP_LIST1: .dcb.w CLIPPING_CACHE_SIZE + + + .section .text.gl_clipping + + ################################################################ + # GL_ClipTriangle + # Clip a triangle against the view-frustum by using the Sutherland-Hodgman algorithm + # https://en.wikipedia.org/wiki/Sutherland%E2%80%93Hodgman_algorithm + # Args: + # a1-a3 = Vertices + # t5 = OR'd clip flags of the triangle's vertices + # Returns: + # s1 = Pointer to list of output vertices + # s2 = Pointer to end of list + ################################################################ + .func GL_ClipTriangle +GL_ClipTriangle: + #define out_count v1 + #define 
clip_flags t5
+    #define plane_flag t6
+    #define in_count t7
+    #define in_end t8
+    #define in_list s0
+    #define out_list s1
+    #define plane s2
+    #define intersection s3
+    #define cur_ptr s4
+    #define prev_ptr s5
+    #define cur_vtx s6
+    #define prev_vtx s7
+    #define p0 k0
+    #define p1 k1
+    #define vtx1 a1
+    #define vtx2 a2
+    #define vtx3 a3
+
+    #define vplane $v01
+    #define vint_f $v02
+    #define vint_i $v03
+    #define vdot_i $v04
+    #define vdot_f $v05
+    #define vdiff_i $v06
+    #define vdiff_f $v07
+    #define va_i $v08
+    #define va_f $v09
+    #define vpos_i $v10
+    #define vpos_f $v11
+    #define vattr0 $v12
+    #define vattr1 $v13
+    #define voff0 $v14
+    #define voff1 $v15
+    #define vcache0 $v16
+    #define vcache1 $v17
+    #define v__ $v29
+
+    move ra2, ra
+
+    # Init in_list as empty
+    li in_list, %lo(CLIP_LIST0)
+    move in_count, zero
+
+    # Put three original vertices in the out_list
+    # (So after the initial swap they will be in the in_list)
+    # List entries are 16-bit vertex addresses, so the counts are kept in bytes
+    li out_list, %lo(CLIP_LIST1)
+    sh vtx1, 0(out_list)
+    sh vtx2, 2(out_list)
+    sh vtx3, 4(out_list)
+    li out_count, 3*2
+
+    li plane, %lo(CLIP_PLANES)
+    li plane_flag, 1
+
+    # Load cache offsets
+    li t0, %lo(CACHE_OFFSETS)
+    vxor voff1, voff1
+    lqv voff0, 0,t0
+    lsv voff1, 16,t0
+
+    # Temporarily use the RDP staging area as a map of which cache slots are used
+    # Init to zero
+    li t0, %lo(RDPQ_CMD_STAGING)
+    sqv vzero, 0,t0
+    sqv vzero, 16,t0
+
+    # Iterate over the 6 clipping planes
+gl_clip_plane_loop:
+    # Planes that no vertex of the triangle is outside of are skipped
+    and t0, clip_flags, plane_flag
+    beqz t0, gl_clip_plane_loop_end
+    move t1, in_list
+
+    # Swap in and out lists
+
+    # If the out list is empty from the last iteration,
+    # the triangle has no visible points and we are done
+    beqz out_count, gl_clip_return
+    move in_list, out_list
+    move out_list, t1
+    move in_count, out_count
+    move out_count, zero
+
+    # Iterate over the edges of the polygon in the input list
+    # The current edge is between cur_vtx and prev_vtx
+    move cur_ptr, in_list
+    add in_end, in_list, in_count
+    # Init the "previous"
vertex to the last in the list for the wrap-around + addi prev_ptr, in_end, -2 + +gl_clip_edge_loop: + #define cur_flag t3 + #define prev_flag t4 + + # Check which side of the plane the two vertices are on + lhu cur_vtx, 0(cur_ptr) + lhu prev_vtx, 0(prev_ptr) + lbu cur_flag, SCREEN_VTX_CLIP_CODE(cur_vtx) + lbu prev_flag, SCREEN_VTX_CLIP_CODE(prev_vtx) + and cur_flag, plane_flag + and prev_flag, plane_flag + + # If they are on opposite sides, there is an intersection + xor t0, cur_flag, prev_flag + beqz t0, gl_clip_no_intersection + move p0, cur_vtx + + # Swap the two points if necessary to make intersection calculation consistent + # This will make sure p0 is always inside and p1 is always outside + bnez prev_flag, gl_clip_no_swap + move p1, prev_vtx + xor p0, p0, p1 + xor p1, p0, p1 + xor p0, p0, p1 + + #undef prev_flag + +gl_clip_no_swap: + # Calculate intersection of the line segment and the plane + + li t0, %lo(RDPQ_CMD_STAGING) + lqv vcache0, 0,t0 + lqv vcache1, 16,t0 + + # Repeat plane coefficients twice + ldv vplane.e0, 0,plane + ldv vplane.e4, 0,plane + + # vpos: x0 y0 z0 w0 x1 y1 z1 w1 + ldv vpos_i.e0, SCREEN_VTX_CS_POSi,p0 + ldv vpos_f.e0, SCREEN_VTX_CS_POSf,p0 + ldv vpos_i.e4, SCREEN_VTX_CS_POSi,p1 + ldv vpos_f.e4, SCREEN_VTX_CS_POSf,p1 + + # vint: x1 y1 z1 w1 + ldv vint_i.e0, SCREEN_VTX_CS_POSi,p1 + ldv vint_f.e0, SCREEN_VTX_CS_POSf,p1 + + # vattr0: r0 g0 b0 a0 s0 t0 + luv vattr0.e0, SCREEN_VTX_RGBA ,p0 + llv vattr0.e4, SCREEN_VTX_S ,p0 + + # vattr1: r1 g1 b1 a1 s1 t1 + luv vattr1.e0, SCREEN_VTX_RGBA ,p1 + llv vattr1.e4, SCREEN_VTX_S ,p1 + + # Find first free slot in clip cache + + # Add the values from the "used slots map" to the cache offsets + # After this, each lane will contain the offset of its corresponding cache slot, + # but only if the slot is not used. If it is used, it will contain some large value. 
+ vaddc vcache0, voff0 + vaddc vcache1, voff1 + + # Look for the smallest value, which will end up in vcache.e0 + # Because used slots are marked as large values, they will never be found. + vlt vcache0, vcache0.q1 + vlt vcache0, vcache0.h2 + vlt vcache0, vcache0.e4 + vlt vcache0, vcache1.e0 + + mfc2 t0, vcache0.e0 + + # Mark slot as used by storing some large value (careful of overflows!) + li t1, 0xFF + sh t1, %lo(RDPQ_CMD_STAGING)-2(t0) + + # t0 is the index multiplied by 2 + # intersection = t0 * 20 = t0 * 16 + t0 * 4 + sll intersection, t0, 4 + sll t1, t0, 2 + add intersection, t1 + + # CAUTION: intersection might point to the same address as either p0 or p1, + # because one of them is the previous point, which could have been marked unused + # in the previous iteration. As long as we don't access p0 or p1 after writing to + # intersection, this is fine. + addi intersection, %lo(CLIP_CACHE) - SCREEN_VTX_SIZE + + # Store the cache offset in unused memory (used later when finding the cache slot to mark as unused) + sb t0, SCREEN_VTX_PADDING(intersection) + + # Compute dot products of both positions with the clip plane + # vdot.e0: d0 = dot(p0, plane) + # vdot.e4: d1 = dot(p1, plane) + vmudn vdot_f, vpos_f, vplane + vmadh vdot_i, vpos_i, vplane + vaddc vdot_f, vdot_f.q1 + vadd vdot_i, vdot_i.q1 + vaddc vdot_f, vdot_f.h2 + vadd vdot_i, vdot_i.h2 + + # d0 - d1 + vsubc vdiff_f, vdot_f, vdot_f.e4 + vsub vdiff_i, vdot_i, vdot_i.e4 + + # 1 / (d0 - d1) + vrcph v__.e0, vdiff_i.e0 + vrcpl va_f.e0, vdiff_f.e0 + vrcph va_i.e0, vzero.e0 + + # a = d0 / (d0 - d1) + vmudl v__, va_f, vdot_f.e0 + vmadm v__, va_i, vdot_f.e0 + vmadn va_f, va_f, vdot_i.e0 + + # Prepare 0x7FFF in va_i.e0 + vsubc va_i, vshift8, K1 + + # a = min(a, 1) + vge v__, va_f, vzero + vmrg va_f, va_f, va_i.e0 + + # Account for right shift introduced by vrcp + vmudn va_f, va_f, K2 + + # p1 - p0 + vsubc vint_f, vpos_f + vsub vint_i, vpos_i + # attr1 - attr0 + vsubc vattr1, vattr0 + + # Result of linear 
interpolation: + # p0 + a * (p1 - p0) + vmudl v__, vint_f, va_f.e0 + vmadm v__, vint_i, va_f.e0 + vmadn vint_f, vpos_f, K1 + vmadh vint_i, vpos_i, K1 + + # a * (attr1 - attr0) + vmudm vattr1, vattr1, va_f.e0 + + # attr0 + a * (attr1 - attr0) + vaddc vattr0, vattr1 + + # Store results + sdv vint_i.e0, SCREEN_VTX_CS_POSi,intersection + sdv vint_f.e0, SCREEN_VTX_CS_POSf,intersection + suv vattr0.e0, SCREEN_VTX_RGBA ,intersection + jal GL_CalcClipCodes + slv vattr0.e4, SCREEN_VTX_S ,intersection + + # Add intersection to the output list + add t0, out_list, out_count + sh intersection, 0(t0) + addi out_count, 2 + +gl_clip_no_intersection: + # If cur_vtx is inside, add it to the output list + bnez cur_flag, gl_clip_no_current + add t0, out_list, out_count + sh cur_vtx, 0(t0) + b gl_clip_edge_loop_end + addi out_count, 2 + + #undef cur_flag + +gl_clip_no_current: + # Check if the vertex is stored in the clip cache + lbu t0, SCREEN_VTX_PADDING(cur_vtx) + beqz t0, gl_clip_edge_loop_end + # Reset the padding field to zero, so the screen space values won't be recalculated below + sb zero, SCREEN_VTX_PADDING(cur_vtx) + # If so, mark it as unused + sh zero, %lo(RDPQ_CMD_STAGING)-2(t0) + +gl_clip_edge_loop_end: + # Advance to the next edge + addi cur_ptr, 2 + blt cur_ptr, in_end, gl_clip_edge_loop + addi prev_ptr, cur_ptr, -2 + +gl_clip_plane_loop_end: + # Advance to the next clipping plane + sll plane_flag, 1 + blt plane_flag, (1< -#include -#include #include +#include +#include "../misc/n64/gpu.c" typedef void (*GL_SetupVBFunc)(void); static GL_SetupVBFunc gfx_setupVBFunc; @@ -20,16 +20,21 @@ static surface_t zbuffer; static GfxResourceID white_square; void Gfx_Create(void) { + rspq_init(); + //rspq_profile_start(); rdpq_init(); + //rdpq_debug_start(); // TODO debug + //rdpq_debug_log(true); rdpq_set_mode_standard(); __rdpq_mode_change_som(SOM_TEXTURE_PERSP, SOM_TEXTURE_PERSP); __rdpq_mode_change_som(SOM_ZMODE_MASK, SOM_ZMODE_OPAQUE); rdpq_mode_dithering(DITHER_SQUARE_SQUARE); 
+ // Set alpha compare threshold + rdpq_set_blend_color(RGBA32(0,0,0, 127)); + gl_init(); - //rdpq_debug_start(); // TODO debug - //rdpq_debug_log(true); zbuffer = surface_alloc(FMT_RGBA16, display_get_width(), display_get_height()); Gfx.MaxTexWidth = 256; @@ -121,7 +126,10 @@ void Gfx_ClearColor(PackedCol color) { void Gfx_EndFrame(void) { Platform_LogConst("GFX ctx end"); rdpq_detach_show(); -//Platform_LogConst("GFX END"); + //Platform_LogConst("GFX END"); + + //rspq_profile_dump(); + //rspq_profile_next_frame(); } @@ -253,7 +261,7 @@ static void SetAlphaBlend(cc_bool enabled) { void Gfx_SetAlphaArgBlend(cc_bool enabled) { } static void SetAlphaTest(cc_bool enabled) { - rdpq_mode_alphacompare(enabled ? 127 : 0); + __rdpq_mode_change_som(SOM_ALPHACOMPARE_MASK, enabled ? SOM_ALPHACOMPARE_THRESHOLD : 0); } static void SetColorWrite(cc_bool r, cc_bool g, cc_bool b, cc_bool a) {