From 3ef1f91d9dfa9f3d326c6204b2d6e9df391ae834 Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Sun, 27 Apr 2025 08:34:04 +1000 Subject: [PATCH 01/14] N64 optimised, stage 1 --- src/Graphics_N64.c | 153 +++++++++++++++++++++++++-------------------- src/Server.c | 4 +- src/main.c | 4 +- 3 files changed, 90 insertions(+), 71 deletions(-) diff --git a/src/Graphics_N64.c b/src/Graphics_N64.c index dae6d6989..4a42427a5 100644 --- a/src/Graphics_N64.c +++ b/src/Graphics_N64.c @@ -17,8 +17,16 @@ static GL_SetupVBFunc gfx_setupVBFunc; *---------------------------------------------------------General---------------------------------------------------------* *#########################################################################################################################*/ static surface_t zbuffer; +static GfxResourceID white_square; void Gfx_Create(void) { + rdpq_init(); + + rdpq_set_mode_standard(); + __rdpq_mode_change_som(SOM_TEXTURE_PERSP, SOM_TEXTURE_PERSP); + __rdpq_mode_change_som(SOM_ZMODE_MASK, SOM_ZMODE_OPAQUE); + rdpq_mode_dithering(DITHER_SQUARE_SQUARE); + gl_init(); //rdpq_debug_start(); // TODO debug //rdpq_debug_log(true); @@ -36,6 +44,9 @@ void Gfx_Create(void) { Gfx.SupportsNonPowTwoTextures = true; Gfx_RestoreState(); + + Gfx_SetFaceCulling(false); + Gfx_SetViewport(0, 0, Game.Width, Game.Height); } cc_bool Gfx_TryRestoreContext(void) { @@ -73,21 +84,17 @@ void Gfx_SetVSync(cc_bool vsync) { void Gfx_OnWindowResize(void) { } void Gfx_SetViewport(int x, int y, int w, int h) { - glViewport(x, Game.Height - h - y, w, h); -} -void Gfx_SetScissor (int x, int y, int w, int h) { - cc_bool enabled = x != 0 || y != 0 || w != Game.Width || h != Game.Height; - if (enabled) { glEnable(GL_SCISSOR_TEST); } else { glDisable(GL_SCISSOR_TEST); } - - glScissor(x, Game.Height - h - y, w, h); + glViewport(x, y, w, h); } +void Gfx_SetScissor(int x, int y, int w, int h) { + rdpq_set_scissor(x, y, x + w, y + h); +} void Gfx_BeginFrame(void) { surface_t* disp = 
display_get(); rdpq_attach(disp, &zbuffer); - gl_context_begin(); Platform_LogConst("GFX ctx beg"); } @@ -113,7 +120,6 @@ void Gfx_ClearColor(PackedCol color) { void Gfx_EndFrame(void) { Platform_LogConst("GFX ctx end"); - gl_context_end(); rdpq_detach_show(); //Platform_LogConst("GFX END"); } @@ -124,14 +130,32 @@ void Gfx_EndFrame(void) { *#########################################################################################################################*/ typedef struct CCTexture { surface_t surface; - GLuint textureID; + rspq_block_t* upload_block; } CCTexture; +void Gfx_BindTexture(GfxResourceID texId) { + if (!texId) texId = white_square; + CCTexture* tex = (CCTexture*)texId; + + rspq_block_run(tex->upload_block); + glTexSizeN64(tex->surface.width, tex->surface.height); +} + #define ALIGNUP8(size) (((size) + 7) & ~0x07) // A8 B8 G8 R8 > A1 B5 G5 B5 #define To16BitPixel(src) \ - ((src & 0x80) >> 7) | ((src & 0xF800) >> 10) | ((src & 0xF80000) >> 13) | ((src & 0xF8000000) >> 16); + ((src & 0x80) >> 7) | ((src & 0xF800) >> 10) | ((src & 0xF80000) >> 13) | ((src & 0xF8000000) >> 16); + +static void UploadTexture(CCTexture* tex, rdpq_texparms_t* params) { + rspq_block_begin(); + + rdpq_tex_multi_begin(); + rdpq_tex_upload(TILE0, &tex->surface, params); + rdpq_tex_multi_end(); + + tex->upload_block = rspq_block_end(); +} GfxResourceID Gfx_AllocTexture(struct Bitmap* bmp, int rowWidth, cc_uint8 flags, cc_bool mipmaps) { cc_bool bit16 = flags & TEXTURE_FLAG_LOWRES; @@ -141,15 +165,8 @@ GfxResourceID Gfx_AllocTexture(struct Bitmap* bmp, int rowWidth, cc_uint8 flags, if (pitch * bmp->height > 4096) return 0; CCTexture* tex = Mem_Alloc(1, sizeof(CCTexture), "texture"); - - glGenTextures(1, &tex->textureID); - glBindTexture(GL_TEXTURE_2D, tex->textureID); - // NOTE: Enabling these fixes textures, but seems to break on cen64 - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, mipmaps ? 
GL_LINEAR : GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, mipmaps ? GL_LINEAR : GL_NEAREST); - - tex->surface = surface_alloc(bit16 ? FMT_RGBA16 : FMT_RGBA32, bmp->width, bmp->height); - surface_t* fb = &tex->surface; + tex->surface = surface_alloc(bit16 ? FMT_RGBA16 : FMT_RGBA32, bmp->width, bmp->height); + surface_t* fb = &tex->surface; if (bit16) { cc_uint32* src = (cc_uint32*)bmp->scan0; @@ -172,28 +189,15 @@ GfxResourceID Gfx_AllocTexture(struct Bitmap* bmp, int rowWidth, cc_uint8 flags, bmp, rowWidth * BITMAPCOLOR_SIZE); } - rdpq_texparms_t params = { .s.repeats = (flags & TEXTURE_FLAG_NONPOW2) ? 1 : REPEAT_INFINITE, .t.repeats = (flags & TEXTURE_FLAG_NONPOW2) ? 1 : REPEAT_INFINITE, }; - - // rdpq_tex_upload(TILE0, &tex->surface, &params); - glSurfaceTexImageN64(GL_TEXTURE_2D, 0, fb, &params); + UploadTexture(tex, &params); return tex; } -void Gfx_BindTexture(GfxResourceID texId) { - CCTexture* tex = (CCTexture*)texId; - GLuint glID = tex ? tex->textureID : 0; - //Platform_Log1("BIND: %i", &glID); - - //rdpq_debug_log(true); - glBindTexture(GL_TEXTURE_2D, glID); - // rdpq_debug_log(false); -} - void Gfx_UpdateTexture(GfxResourceID texId, int x, int y, struct Bitmap* part, int rowWidth, cc_bool mipmaps) { // TODO: Just memcpying doesn't actually work. maybe due to glSurfaceTexImageN64 caching the RSQ upload block? 
// TODO: Is there a more optimised approach than just calling glSurfaceTexImageN64 @@ -210,21 +214,22 @@ void Gfx_UpdateTexture(GfxResourceID texId, int x, int y, struct Bitmap* part, i part->width * 4); } - - glBindTexture(GL_TEXTURE_2D, tex->textureID); rdpq_texparms_t params = (rdpq_texparms_t){ .s.repeats = REPEAT_INFINITE, .t.repeats = REPEAT_INFINITE, }; - // rdpq_tex_upload(TILE0, &tex->surface, &params); - glSurfaceTexImageN64(GL_TEXTURE_2D, 0, fb, &params); + + rdpq_call_deferred((void (*)(void*))rspq_block_free, tex->upload_block); + UploadTexture(tex, &params); } void Gfx_DeleteTexture(GfxResourceID* texId) { CCTexture* tex = (CCTexture*)(*texId); if (!tex) return; - glDeleteTextures(1, &tex->textureID); + if (tex->upload_block) rdpq_call_deferred((void (*)(void*))rspq_block_free, tex->upload_block); + surface_free(&tex->surface); + Mem_Free(tex); *texId = NULL; } @@ -236,16 +241,34 @@ void Gfx_DisableMipmaps(void) { } /*########################################################################################################################* *-----------------------------------------------------State management----------------------------------------------------* *#########################################################################################################################*/ -void Gfx_SetFaceCulling(cc_bool enabled) { gl_Toggle(GL_CULL_FACE); } -static void SetAlphaBlend(cc_bool enabled) { gl_Toggle(GL_BLEND); } +void Gfx_SetFaceCulling(cc_bool enabled) { + glCullFace(enabled ? GL_BACK : 0); +} + +static void SetAlphaBlend(cc_bool enabled) { + rdpq_mode_blender(enabled ? RDPQ_BLENDER_MULTIPLY : 0); + __rdpq_mode_change_som(SOM_ZMODE_MASK, enabled ? SOM_ZMODE_TRANSPARENT : SOM_ZMODE_OPAQUE); +} + void Gfx_SetAlphaArgBlend(cc_bool enabled) { } +static void SetAlphaTest(cc_bool enabled) { + rdpq_mode_alphacompare(enabled ? 
127 : 0); +} + static void SetColorWrite(cc_bool r, cc_bool g, cc_bool b, cc_bool a) { //glColorMask(r, g, b, a); TODO } -void Gfx_SetDepthWrite(cc_bool enabled) { glDepthMask(enabled); } -void Gfx_SetDepthTest(cc_bool enabled) { gl_Toggle(GL_DEPTH_TEST); } +void Gfx_SetDepthWrite(cc_bool enabled) { + __rdpq_mode_change_som(SOM_Z_WRITE, enabled ? SOM_Z_WRITE : 0); +} + +void Gfx_SetDepthTest(cc_bool enabled) { + __rdpq_mode_change_som(SOM_Z_COMPARE, enabled ? SOM_Z_COMPARE : 0); + + gl_Toggle(GL_DEPTH_TEST); +} static void Gfx_FreeState(void) { FreeDefaultResources(); } static void Gfx_RestoreState(void) { @@ -253,12 +276,12 @@ static void Gfx_RestoreState(void) { glEnableClientState(GL_VERTEX_ARRAY); glEnableClientState(GL_COLOR_ARRAY); gfx_format = -1; - - glHint(GL_FOG_HINT, GL_NICEST); - glAlphaFunc(GL_GREATER, 0.5f); - glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); - glDepthFunc(GL_LESS); - //glEnable(GL_RDPQ_TEXTURING_N64); + + // 1x1 dummy white texture + struct Bitmap bmp; + BitmapCol pixels[1] = { BITMAPCOLOR_WHITE }; + Bitmap_Init(bmp, 1, 1, pixels); + white_square = Gfx_CreateTexture(&bmp, 0, false); } cc_bool Gfx_WarnIfNecessary(void) { return false; } @@ -435,10 +458,6 @@ void Gfx_SetFogEnd(float value) { void Gfx_SetFogMode(FogFunc func) { } -static void SetAlphaTest(cc_bool enabled) { - if (enabled) { glEnable(GL_ALPHA_TEST); } else { glDisable(GL_ALPHA_TEST); } -} - void Gfx_DepthOnlyRendering(cc_bool depthOnly) { depthOnlyRendering = depthOnly; // TODO: Better approach? maybe using glBlendFunc instead? 
cc_bool enabled = !depthOnly; @@ -451,32 +470,30 @@ void Gfx_DepthOnlyRendering(cc_bool depthOnly) { /*########################################################################################################################* *---------------------------------------------------------Matrices--------------------------------------------------------* *#########################################################################################################################*/ -static GLenum matrix_modes[3] = { GL_PROJECTION, GL_MODELVIEW, GL_TEXTURE }; -static int lastMatrix; +static struct Matrix _view, _proj; void Gfx_LoadMatrix(MatrixType type, const struct Matrix* matrix) { - if (type != lastMatrix) { lastMatrix = type; glMatrixMode(matrix_modes[type]); } + if (type == MATRIX_VIEW) _view = *matrix; + if (type == MATRIX_PROJ) _proj = *matrix; - if (matrix == &Matrix_Identity) { - glLoadIdentity(); - } else { - glLoadMatrixf((const float*)matrix); - } + struct Matrix mvp __attribute__((aligned(64))); + Matrix_Mul(&mvp, &_view, &_proj); + glLoadMatrixf((const float*)&mvp); } void Gfx_LoadMVP(const struct Matrix* view, const struct Matrix* proj, struct Matrix* mvp) { - Gfx_LoadMatrix(MATRIX_VIEW, view); - Gfx_LoadMatrix(MATRIX_PROJ, proj); + _proj = *proj; + _view = *view; + Matrix_Mul(mvp, view, proj); + glLoadMatrixf((const float*)mvp); } -static struct Matrix texMatrix = Matrix_IdentityValue; void Gfx_EnableTextureOffset(float x, float y) { - texMatrix.row4.x = x; texMatrix.row4.y = y; - Gfx_LoadMatrix(2, &texMatrix); + // TODO } -void Gfx_DisableTextureOffset(void) { Gfx_LoadMatrix(2, &Matrix_Identity); } +void Gfx_DisableTextureOffset(void) { } /*########################################################################################################################* @@ -503,11 +520,13 @@ void Gfx_SetVertexFormat(VertexFormat fmt) { glEnable(GL_TEXTURE_2D); gfx_setupVBFunc = GL_SetupVbTextured; + rdpq_mode_combiner(RDPQ_COMBINER_TEX_SHADE); } else { 
glDisableClientState(GL_TEXTURE_COORD_ARRAY); glDisable(GL_TEXTURE_2D); gfx_setupVBFunc = GL_SetupVbColoured; + rdpq_mode_combiner(RDPQ_COMBINER_SHADE); } } diff --git a/src/Server.c b/src/Server.c index f6fea3c60..deae93fb8 100644 --- a/src/Server.c +++ b/src/Server.c @@ -145,12 +145,12 @@ static void SPConnection_BeginConnect(void) { World_SetDimensions(horSize, verSize, horSize); #if defined CC_BUILD_N64 || defined CC_BUILD_NDS || defined CC_BUILD_PS1 || defined CC_BUILD_SATURN || defined CC_BUILD_32X || defined CC_BUILD_GBA - Gen_Active = &FlatgrassGen; + Gen_Active = &NotchyGen; #else Gen_Active = &NotchyGen; #endif - Gen_Seed = Random_Next(&rnd, Int32_MaxValue); + Gen_Seed = 400;//Random_Next(&rnd, Int32_MaxValue); Gen_Start(); GeneratingScreen_Show(); diff --git a/src/main.c b/src/main.c index 6abab2a5a..48b880dad 100644 --- a/src/main.c +++ b/src/main.c @@ -153,12 +153,12 @@ static int RunProgram(int argc, char** argv) { struct ResumeInfo r; cc_string host; -#ifdef _MSC_VER +//#ifdef _MSC_VER /* NOTE: Make sure to comment this out before pushing a commit */ //cc_string rawArgs = String_FromConst("UnknownShadow200 fffff 127.0.0.1 25565"); //cc_string rawArgs = String_FromConst("UnknownShadow200"); //argsCount = String_UNSAFE_Split(&rawArgs, ' ', args, 4); -#endif +//#endif if (argsCount == 0) { #ifdef CC_BUILD_WEB From eba646cebbab06b8a35bbb71237fd6d1df4006da Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Sun, 27 Apr 2025 12:58:17 +1000 Subject: [PATCH 02/14] N64 optimised, stage 2 --- misc/n64/Makefile | 13 +- misc/n64/{ => files}/default.zip | Bin misc/n64/gl_constants.h | 49 +++ misc/n64/gpu.c | 398 +++++++++++++++++++++ misc/n64/rsp_gpu.S | 585 +++++++++++++++++++++++++++++++ misc/n64/rsp_gpu_clipping.inc | 374 ++++++++++++++++++++ src/Graphics_N64.c | 20 +- 7 files changed, 1429 insertions(+), 10 deletions(-) rename misc/n64/{ => files}/default.zip (100%) create mode 100644 misc/n64/gl_constants.h create mode 100644 misc/n64/gpu.c create mode 
100644 misc/n64/rsp_gpu.S create mode 100644 misc/n64/rsp_gpu_clipping.inc diff --git a/misc/n64/Makefile b/misc/n64/Makefile index f8fcbfe9e..c243c55fd 100644 --- a/misc/n64/Makefile +++ b/misc/n64/Makefile @@ -1,23 +1,28 @@ BUILD_DIR = build-n64 -SOURCE_DIR = src +SOURCE_DIR = misc/n64 N64_ROM_TITLE = "ClassiCube" N64_ROM_RTC = true TARGET = ClassiCube-n64 -N64_MKDFS_ROOT = "misc/n64" +N64_MKDFS_ROOT = "misc/n64/files" CFILES := $(notdir $(wildcard src/*.c)) -OFILES := $(CFILES:.c=.o) +OFILES := $(CFILES:.c=.o) rsp_gpu.o OBJS := $(addprefix $(BUILD_DIR)/,$(OFILES)) CFLAGS := -Wno-error=missing-braces -Wno-error=strict-aliasing -Wno-error=incompatible-pointer-types default: $(TARGET).z64 +$(BUILD_DIR)/%.o: src/%.c + @mkdir -p $(dir $@) + @echo " [CC] $<" + $(CC) -c $(CFLAGS) -o $@ $< + include $(N64_INST)/include/n64.mk $(TARGET).z64: N64_ROM_TITLE = "ClassiCube" $(TARGET).z64: $(BUILD_DIR)/filesystem.dfs -$(BUILD_DIR)/filesystem.dfs: misc/n64/default.zip +$(BUILD_DIR)/filesystem.dfs: misc/n64/files/default.zip $(BUILD_DIR)/ClassiCube-n64.elf: $(OBJS) diff --git a/misc/n64/default.zip b/misc/n64/files/default.zip similarity index 100% rename from misc/n64/default.zip rename to misc/n64/files/default.zip diff --git a/misc/n64/gl_constants.h b/misc/n64/gl_constants.h new file mode 100644 index 000000000..cf6ea3ce6 --- /dev/null +++ b/misc/n64/gl_constants.h @@ -0,0 +1,49 @@ +#ifndef __GL_CONSTANTS +#define __GL_CONSTANTS + +#define VERTEX_CACHE_SIZE 16 + +#define MATRIX_SIZE 64 + +#define TEXTURE_BILINEAR_MASK 0x001 +#define TEXTURE_INTERPOLATE_MASK 0x002 +#define TEXTURE_MIPMAP_MASK 0x100 + +#define VTX_SHIFT 5 +#define TEX_SHIFT 8 + +#define FLAG_DEPTH_TEST (1 << 8) +#define FLAG_TEXTURE_ACTIVE (1 << 9) + +#define GUARD_BAND_FACTOR 2 + +#define ASSERT_INVALID_VTX_ID 0x2001 + +#define TEX_COORD_SHIFT 6 +#define HALF_TEXEL 0x0010 + +#define TEX_BILINEAR_SHIFT 13 +#define TEX_BILINEAR_OFFSET_SHIFT 4 + +#define BILINEAR_TEX_OFFSET_SHIFT 9 + +#define TRICMD_ATTR_MASK 
0x300 + +#define PRIM_VTX_CS_POSi 0 // X, Y, Z, W (all 32-bit) +#define PRIM_VTX_CS_POSf 8 // X, Y, Z, W (all 32-bit) +#define PRIM_VTX_X 16 // Object space position (16-bit) +#define PRIM_VTX_Y 18 // Object space position (16-bit) +#define PRIM_VTX_Z 20 // Object space position (16-bit) +#define PRIM_VTX_W 22 // Object space position (16-bit) +#define PRIM_VTX_R 24 +#define PRIM_VTX_G 26 +#define PRIM_VTX_B 28 +#define PRIM_VTX_A 30 +#define PRIM_VTX_TEX_S 32 +#define PRIM_VTX_TEX_T 34 +#define PRIM_VTX_TEX_R 36 +#define PRIM_VTX_TEX_Q 38 +#define PRIM_VTX_TRCODE 40 // trivial-reject clipping flags (against -w/+w) +#define PRIM_VTX_SIZE 42 + +#endif diff --git a/misc/n64/gpu.c b/misc/n64/gpu.c new file mode 100644 index 000000000..d84943dab --- /dev/null +++ b/misc/n64/gpu.c @@ -0,0 +1,398 @@ +#include "GL/gl.h" +#include "rspq.h" +#include "rdpq.h" +#include "rdpq_rect.h" +#include "rdpq_mode.h" +#include "rdpq_debug.h" +#include "display.h" +#include "rdp.h" +#include +#include +#include +#include "gl_constants.h" + +// This is a severely cutdown version of libdragon's OpenGL implementation + +static uint32_t glp_id; +//DEFINE_RSP_UCODE(rsp_gpu); +extern uint8_t _binary_build_n64_rsp_gpu_text_bin_start[]; +extern uint8_t _binary_build_n64_rsp_gpu_data_bin_start[]; +extern uint8_t _binary_build_n64_rsp_gpu_meta_bin_start[]; +extern uint8_t _binary_build_n64_rsp_gpu_text_bin_end[0]; +extern uint8_t _binary_build_n64_rsp_gpu_data_bin_end[0]; +extern uint8_t _binary_build_n64_rsp_gpu_meta_bin_end[0]; + +static rsp_ucode_t rsp_gpu = (rsp_ucode_t){ + .code = _binary_build_n64_rsp_gpu_text_bin_start, + .code_end = _binary_build_n64_rsp_gpu_text_bin_end, + .data = _binary_build_n64_rsp_gpu_data_bin_start, + .data_end = _binary_build_n64_rsp_gpu_data_bin_end, + .meta = _binary_build_n64_rsp_gpu_meta_bin_start, + .meta_end = _binary_build_n64_rsp_gpu_meta_bin_end, + .name = "rsp_gpu" +}; + +enum { + GPU_CMD_SET_FLAG = 0x0, + GPU_CMD_SET_BYTE = 0x1, + GPU_CMD_SET_SHORT = 
0x2, + GPU_CMD_SET_WORD = 0x3, + GPU_CMD_SET_LONG = 0x4, + + GPU_CMD_DRAW_TRI = 0x5, + GPU_CMD_UPLOAD_VTX = 0x6, + + GPU_CMD_MATRIX_LOAD = 0x7, + GPU_CMD_PRE_INIT_PIPE = 0x8, +}; + +enum { + ATTRIB_VERTEX, + ATTRIB_COLOR, + ATTRIB_TEXCOORD, + ATTRIB_COUNT +}; + +typedef struct { + GLfloat scale[3]; + GLfloat offset[3]; +} gl_viewport_t; + +typedef struct { + int16_t i[4][4]; + uint16_t f[4][4]; +} gl_matrix_srv_t; +_Static_assert(sizeof(gl_matrix_srv_t) == MATRIX_SIZE, "Matrix size does not match"); + +typedef struct { + rspq_write_t w; + union { + uint8_t bytes[4]; + uint32_t word; + }; + uint32_t buffer_head; +} gl_cmd_stream_t; + +typedef struct { + GLsizei stride; + const GLvoid *pointer; + bool enabled; +} gl_array_t; + +typedef struct { + gl_matrix_srv_t mvp_matrix; + int16_t viewport_scale[4]; + int16_t viewport_offset[4]; + uint32_t flags; + uint16_t tex_size[2]; + uint16_t tex_offset[2]; + uint16_t tri_cmd; + uint16_t tri_cull; +} __attribute__((aligned(8), packed)) gl_server_state_t; + +static inline const void *gl_get_attrib_element(const gl_array_t *src, uint32_t index) +{ + return src->pointer + index * src->stride; +} + +static inline gl_cmd_stream_t gl_cmd_stream_begin(uint32_t ovl_id, uint32_t cmd_id, int size) +{ + return (gl_cmd_stream_t) { + .w = rspq_write_begin(ovl_id, cmd_id, size), + .buffer_head = 2, + }; +} + +static inline void gl_cmd_stream_commit(gl_cmd_stream_t *s) +{ + rspq_write_arg(&s->w, s->word); + s->buffer_head = 0; + s->word = 0; +} + +static inline void gl_cmd_stream_put_half(gl_cmd_stream_t *s, uint16_t v) +{ + s->bytes[s->buffer_head++] = v >> 8; + s->bytes[s->buffer_head++] = v & 0xFF; + + if (s->buffer_head == sizeof(uint32_t)) { + gl_cmd_stream_commit(s); + } +} + +static inline void gl_cmd_stream_end(gl_cmd_stream_t *s) +{ + if (s->buffer_head > 0) { + gl_cmd_stream_commit(s); + } + + rspq_write_end(&s->w); +} + +__attribute__((always_inline)) +static inline void gl_set_flag_raw(uint32_t offset, uint32_t flag, bool value) 
+{ + rspq_write(glp_id, GPU_CMD_SET_FLAG, offset | value, value ? flag : ~flag); +} + +__attribute__((always_inline)) +static inline void gl_set_flag(uint32_t flag, bool value) +{ + gl_set_flag_raw(offsetof(gl_server_state_t, flags), flag, value); +} + +__attribute__((always_inline)) +static inline void gl_set_byte(uint32_t offset, uint8_t value) +{ + rspq_write(glp_id, GPU_CMD_SET_BYTE, offset, value); +} + +__attribute__((always_inline)) +static inline void gl_set_short(uint32_t offset, uint16_t value) +{ + rspq_write(glp_id, GPU_CMD_SET_SHORT, offset, value); +} + +__attribute__((always_inline)) +static inline void gl_set_word(uint32_t offset, uint32_t value) +{ + rspq_write(glp_id, GPU_CMD_SET_WORD, offset, value); +} + +__attribute__((always_inline)) +static inline void gl_set_long(uint32_t offset, uint64_t value) +{ + rspq_write(glp_id, GPU_CMD_SET_LONG, offset, value >> 32, value & 0xFFFFFFFF); +} + +static inline void glpipe_draw_triangle(int i0, int i1, int i2) +{ + // We pass -1 because the triangle can be clipped and split into multiple + // triangles. 
+ rdpq_write(-1, glp_id, GPU_CMD_DRAW_TRI, + (i0*PRIM_VTX_SIZE), + ((i1*PRIM_VTX_SIZE)<<16) | (i2*PRIM_VTX_SIZE) + ); +} + + +static gl_viewport_t state_viewport; +static gl_array_t state_arrays[ATTRIB_COUNT]; + +void gl_init() +{ + glp_id = rspq_overlay_register(&rsp_gpu); + glDepthRange(0, 1); +} + +void gl_close() +{ + rspq_wait(); + rspq_overlay_unregister(glp_id); +} + +void gl_set_flag2(GLenum target, bool value) +{ + switch (target) { + case GL_DEPTH_TEST: + gl_set_flag(FLAG_DEPTH_TEST, value); + break; + case GL_TEXTURE_2D: + gl_set_flag(FLAG_TEXTURE_ACTIVE, value); + break; + } +} + +void glEnable(GLenum target) +{ + gl_set_flag2(target, true); +} + +void glDisable(GLenum target) +{ + gl_set_flag2(target, false); +} + +void glTexSizeN64(uint16_t width, uint16_t height) +{ + gl_set_word(offsetof(gl_server_state_t, tex_size[0]), (width << 16) | height); +} + + +static inline void write_shorts(rspq_write_t *w, const uint16_t *s, uint32_t count) +{ + for (uint32_t i = 0; i < count; i += 2) + { + uint32_t packed = ((uint32_t)s[i] << 16) | (uint32_t)s[i+1]; + rspq_write_arg(w, packed); + } +} + +static inline void gl_matrix_write(rspq_write_t *w, const GLfloat *m) +{ + uint16_t integer[16]; + uint16_t fraction[16]; + + for (uint32_t i = 0; i < 16; i++) + { + int32_t fixed = m[i] * (1<<16); + integer[i] = (uint16_t)((fixed & 0xFFFF0000) >> 16); + fraction[i] = (uint16_t)(fixed & 0x0000FFFF); + } + + write_shorts(w, integer, 16); + write_shorts(w, fraction, 16); +} + +void glLoadMatrixf(const GLfloat *m) +{ + rspq_write_t w = rspq_write_begin(glp_id, GPU_CMD_MATRIX_LOAD, 17); + rspq_write_arg(&w, false); // no multiply + gl_matrix_write(&w, m); + rspq_write_end(&w); +} + +static void upload_vertex(const gl_array_t *arrays, uint32_t index, uint8_t cache_index) +{ + gl_cmd_stream_t s = gl_cmd_stream_begin(glp_id, GPU_CMD_UPLOAD_VTX, 6); + gl_cmd_stream_put_half(&s, cache_index * PRIM_VTX_SIZE); + + const float* vtx = gl_get_attrib_element(&arrays[ATTRIB_VERTEX], 
index); + gl_cmd_stream_put_half(&s, vtx[0] * (1<stride = stride; + array->pointer = pointer; +} + +void glTexCoordPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *pointer) +{ + gl_array_t *array = &state_arrays[ATTRIB_TEXCOORD]; + array->stride = stride; + array->pointer = pointer; +} + +void glColorPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *pointer) +{ + gl_array_t *array = &state_arrays[ATTRIB_COLOR]; + array->stride = stride; + array->pointer = pointer; +} + +void gl_set_array_enabled(int array_type, bool enabled) +{ + state_arrays[array_type].enabled = enabled; +} + +void glEnableClientState(GLenum array) +{ + gl_set_array_enabled(gl_array_type_from_enum(array), true); +} + +void glDisableClientState(GLenum array) +{ + gl_set_array_enabled(gl_array_type_from_enum(array), false); +} + +void glDrawArrays(GLenum mode, GLint first, GLsizei count) +{ + rspq_write(glp_id, GPU_CMD_PRE_INIT_PIPE); + gl_rsp_draw_arrays(first, count); +} + +void glDepthRange(GLclampd n, GLclampd f) +{ + state_viewport.scale[2] = (f - n) * 0.5f; + state_viewport.offset[2] = n + (f - n) * 0.5f; + + gl_set_short( + offsetof(gl_server_state_t, viewport_scale) + sizeof(int16_t) * 2, + state_viewport.scale[2] * 4); + gl_set_short( + offsetof(gl_server_state_t, viewport_offset) + sizeof(int16_t) * 2, + state_viewport.offset[2] * 4); +} + +void glViewport(GLint x, GLint y, GLsizei w, GLsizei h) +{ + state_viewport.scale[0] = w * 0.5f; + state_viewport.scale[1] = h * -0.5f; + state_viewport.offset[0] = x + w * 0.5f; + state_viewport.offset[1] = y + h * 0.5f; + + // Screen coordinates are s13.2 + #define SCREEN_XY_SCALE 4.0f + #define SCREEN_Z_SCALE 32767.0f + + // * 2.0f to compensate for RSP reciprocal missing 1 bit + uint16_t scale_x = state_viewport.scale[0] * SCREEN_XY_SCALE * 2.0f; + uint16_t scale_y = state_viewport.scale[1] * SCREEN_XY_SCALE * 2.0f; + uint16_t scale_z = state_viewport.scale[2] * SCREEN_Z_SCALE * 2.0f; + + uint16_t offset_x = 
state_viewport.offset[0] * SCREEN_XY_SCALE; + uint16_t offset_y = state_viewport.offset[1] * SCREEN_XY_SCALE; + uint16_t offset_z = state_viewport.offset[2] * SCREEN_Z_SCALE; + + gl_set_long( + offsetof(gl_server_state_t, viewport_scale), + ((uint64_t)scale_x << 48) | ((uint64_t)scale_y << 32) | ((uint64_t)scale_z << 16)); + gl_set_long( + offsetof(gl_server_state_t, viewport_offset), + ((uint64_t)offset_x << 48) | ((uint64_t)offset_y << 32) | ((uint64_t)offset_z << 16)); +} + +void glCullFace(GLenum mode) +{ + // 1 = cull backfaces + // 2 = don't cull + gl_set_short(offsetof(gl_server_state_t, tri_cull), mode ? 1 : 2); +} diff --git a/misc/n64/rsp_gpu.S b/misc/n64/rsp_gpu.S new file mode 100644 index 000000000..f910bbec6 --- /dev/null +++ b/misc/n64/rsp_gpu.S @@ -0,0 +1,585 @@ +#include +#include +#include "gl_constants.h" + .data + + RSPQ_BeginOverlayHeader + RSPQ_DefineCommand GLCmd_SetFlag, 8 # 0x0 + RSPQ_DefineCommand GLCmd_SetByte, 8 # 0x1 + RSPQ_DefineCommand GLCmd_SetShort, 8 # 0x2 + RSPQ_DefineCommand GLCmd_SetWord, 8 # 0x3 + RSPQ_DefineCommand GLCmd_SetLong, 12 # 0x4 + + RSPQ_DefineCommand GLCmd_DrawTriangle, 8 # 0x5 + RSPQ_DefineCommand GLCmd_UploadVertex, 24 # 0x6 + + RSPQ_DefineCommand GLCmd_MatrixLoad, 68 # 0x7 + RSPQ_DefineCommand GLCmd_PreInitPipe, 4 # 0x8 + RSPQ_EndOverlayHeader + + .align 4 +BANNER0: .ascii " RSP OpenGL T&L " +BANNER1: .ascii "Rasky & Snacchus" + + RSPQ_BeginSavedState + +GL_STATE: + # This is the GL state that is also used by the pipeline. 
+ GL_MATRIX_MVP: .ds.b MATRIX_SIZE + GL_VIEWPORT_SCALE: .half 0,0,0,0 + GL_VIEWPORT_OFFSET: .half 0,0,0,0 + GL_STATE_FLAGS: .word 0 + GL_STATE_TEX_SIZE: .half 0,0 + GL_STATE_TEX_OFFSET: .half 0,0 + GL_TRI_CMD: .half 0 + GL_TRI_CULL: .half 0 + + .align 3 +VERTEX_CACHE: .dcb.b PRIM_VTX_SIZE * VERTEX_CACHE_SIZE + + RSPQ_EndSavedState + + .align 4 +CACHE_OFFSETS: .half 2,4,6,8,10,12,14,16,18 + +CLIP_CODE_FACTORS: .half 1, 1, GUARD_BAND_FACTOR, GUARD_BAND_FACTOR + +#define CLIPPING_PLANE_COUNT 6 +#define CLIPPING_CACHE_SIZE 9 +#define CLIPPING_PLANE_SIZE 8 + +#define SCREEN_VTX_CS_POSi 0 // X, Y, Z, W (all 32-bit) +#define SCREEN_VTX_CS_POSf 8 // X, Y, Z, W (all 32-bit) +#define SCREEN_VTX_X 16 +#define SCREEN_VTX_Y 18 +#define SCREEN_VTX_Z 20 +#define SCREEN_VTX_CLIP_CODE 22 +#define SCREEN_VTX_PADDING 23 +#define SCREEN_VTX_RGBA 24 +#define SCREEN_VTX_S 28 +#define SCREEN_VTX_T 30 +#define SCREEN_VTX_W 32 // FIXME: this is duplicated in CS_POS +#define SCREEN_VTX_INVW 36 // 32-bit +#define SCREEN_VTX_SIZE 40 + + .text + + ############################################################# + # GLCmd_SetFlag + # + # Sets or clears a flag + # + # ARGS: + # a0: Bit 31..24: Command id + # Bit 11..2: Offset of flag value in GL_STATE + # Bit 0: If 1, set the flag, otherwise clear it + # a1: flag mask (inverted if clearing) + ############################################################# + .func GLCmd_SetFlag +GLCmd_SetFlag: + li t0, ~0x3 + and t0, a0, t0 + andi t1, a0, 1 + lw t2, %lo(GL_STATE)(t0) + beqz t1, 1f + and t3, t2, a1 + or t3, t2, a1 + +1: + jr ra + sw t3, %lo(GL_STATE)(t0) + .endfunc + + .func GLCmd_SetByte +GLCmd_SetByte: + jr ra + sb a1, %lo(GL_STATE)(a0) + .endfunc + + .func GLCmd_SetShort +GLCmd_SetShort: + jr ra + sh a1, %lo(GL_STATE)(a0) + .endfunc + + .func GLCmd_SetWord +GLCmd_SetWord: + jr ra + sw a1, %lo(GL_STATE) + 0(a0) + .endfunc + + .func GLCmd_SetLong +GLCmd_SetLong: + sw a2, %lo(GL_STATE) + 4(a0) + jr ra + sw a1, %lo(GL_STATE) + 0(a0) + .endfunc + + + 
######################################## + # GLCmd_UploadVertex + # + # Arguments: + # * 0x00 (a0): offset within VERTEX_CACHE + # * 0x04 (a1): object space X, Y (16-bit) + # * 0x08 (a2): object space Z, W (16-bit) + # * 0x0C (a3): RGBA (8-bit each one) + # * 0x10: S, T (16-bit) + # * 0x14: normal X, Y, Z (8-bit each one) (LSB must be 0) + # + ######################################## + .align 3 + .func GLCmd_UploadVertex +GLCmd_UploadVertex: + #define vtx a0 + #define mtx_ptr s0 + #define cmd_ptr s4 + + #define v___ $v01 + + #define vmtx0_i $v16 // m00 m01 m02 m03 + #define vmtx0_f $v17 + #define vmtx1_i $v18 // m10 m11 m12 m13 + #define vmtx1_f $v19 + #define vmtx2_i $v20 // m20 m21 m22 m23 + #define vmtx2_f $v21 + #define vmtx3_i $v22 // m30 m31 m32 m03 + #define vmtx3_f $v23 + + #define vpos $v24 + #define vcol $v25 + #define vtex $v26 + #define vcspos_i $v28 + #define vcspos_f $v29 + + #define x e0 + #define y e1 + #define z e2 + #define w e3 + + addi cmd_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) + 4 + sub cmd_ptr, rspq_cmd_size + + ldv vpos, 0, cmd_ptr # Load X, Y, Z, W + ldv vcol, 8, cmd_ptr # Load R, G, B, A + llv vtex, 16, cmd_ptr # Load U, V + + addi vtx, %lo(VERTEX_CACHE) + sdv vpos, PRIM_VTX_X ,vtx + sdv vcol, PRIM_VTX_R ,vtx + sdv vtex, PRIM_VTX_TEX_S ,vtx + +# == matrix multiply == + li mtx_ptr, %lo(GL_MATRIX_MVP) + ldv vmtx0_i.e0, 0x00,mtx_ptr + ldv vmtx1_i.e0, 0x08,mtx_ptr + ldv vmtx2_i.e0, 0x10,mtx_ptr + ldv vmtx3_i.e0, 0x18,mtx_ptr + ldv vmtx0_f.e0, 0x20,mtx_ptr + ldv vmtx1_f.e0, 0x28,mtx_ptr + ldv vmtx2_f.e0, 0x30,mtx_ptr + ldv vmtx3_f.e0, 0x38,mtx_ptr + + vmudn v___, vmtx0_f, vpos.h0 + vmadh v___, vmtx0_i, vpos.h0 + vmadn v___, vmtx1_f, vpos.h1 + vmadh v___, vmtx1_i, vpos.h1 + vmadn v___, vmtx2_f, vpos.h2 + vmadh v___, vmtx2_i, vpos.h2 + vmadn v___, vmtx3_f, vpos.h3 + vmadh vcspos_i, vmtx3_i, vpos.h3 + vmadn vcspos_f, vzero, vzero +# == end matrix multiply == + + # 32-bit right shift by 5, to keep the clip space coordinates unscaled + vmudm 
vcspos_i, vcspos_i, vshift8.e4 + vmadl vcspos_f, vcspos_f, vshift8.e4 + + sdv vcspos_i, PRIM_VTX_CS_POSi,vtx + sdv vcspos_f, PRIM_VTX_CS_POSf,vtx + + # Calculate and store clipping flags against CS.W. + # These will be used for trivial rejections. + vch v___, vcspos_i, vcspos_i.w + vcl v___, vcspos_f, vcspos_f.w + cfc2 t0, COP2_CTRL_VCC + andi t0, 0x707 # Isolate X/Y/Z flags + + # Compress flags to 8 bit + srl t1, t0, 5 + andi t0, 0x7 + or t0, t1 + jr ra + sb t0, PRIM_VTX_TRCODE(vtx) + + #undef cmd_ptr + #undef vtx + #undef in_xy + #undef in_zw + #undef in_rgba + #undef vtx_id + + #undef x + #undef y + #undef z + #undef w + + #undef v___ + + #undef vmtx0_i + #undef vmtx0_f + #undef vmtx1_i + #undef vmtx1_f + #undef vmtx2_i + #undef vmtx2_f + #undef vmtx3_i + #undef vmtx3_f + + #undef vpos + #undef vcspos_i + #undef vcspos_f + + .endfunc + + ################################################################ + # GL_CalcScreenSpace + # + # Args: + # s3 = Destination vertex address + # $v02 = Clip space position (fractional part) + # $v03 = Clip space position (integer part) + # + ################################################################ + .func GL_CalcScreenSpace +GL_CalcScreenSpace: + #define dst s3 + #define vcspos_f $v02 + #define vcspos_i $v03 + #define vinvw_f $v23 + #define vinvw_i $v24 + #define vviewscale $v25 + #define vviewoff $v26 + #define vscreenpos_i $v27 + #define vscreenpos_f $v28 + #define v___ $v29 + #define w e3 + + # Calculate 32-bit inverse W + # TODO: NR? 
+ vrcph vinvw_i.w, vcspos_i.w + vrcpl vinvw_f.w, vcspos_f.w + vrcph vinvw_i.w, vzero.e0 + + # Calculate screenspace coords + li t0, %lo(GL_VIEWPORT_SCALE) + ldv vviewscale, 0,t0 + ldv vviewoff, 8,t0 + + vmudl v___, vcspos_f, vinvw_f.w + vmadm v___, vcspos_i, vinvw_f.w + vmadn vscreenpos_f, vcspos_f, vinvw_i.w + vmadh vscreenpos_i, vcspos_i, vinvw_i.w + + vmudn vscreenpos_f, vscreenpos_f, vviewscale + vmadh vscreenpos_i, vscreenpos_i, vviewscale + vadd vscreenpos_i, vviewoff + + sdv vscreenpos_i, SCREEN_VTX_X ,dst + ssv vcspos_i.w, SCREEN_VTX_W+0 ,dst + ssv vcspos_f.w, SCREEN_VTX_W+2 ,dst + ssv vinvw_i.w, SCREEN_VTX_INVW+0,dst + ssv vinvw_f.w, SCREEN_VTX_INVW+2,dst + jr ra + sb zero, SCREEN_VTX_PADDING(dst) + + #undef dst + #undef vcspos_f + #undef vcspos_i + #undef vinvw_f + #undef vinvw_i + #undef vviewscale + #undef vviewoff + #undef vscreenpos_i + #undef vscreenpos_f + #undef v___ + #undef w + + .endfunc + + ################################################################ + # GL_CalcClipCodes + # + # Args: + # s3 = Destination vertex address + # $v02 = Clip space position (fractional part) + # $v03 = Clip space position (integer part) + # + ################################################################ + .func GL_CalcClipCodes +GL_CalcClipCodes: + #define dst s3 + #define vcspos_f $v02 + #define vcspos_i $v03 + #define vguard_f $v27 + #define vguard_i $v28 + #define v___ $v29 + #define w e3 + + li t0, %lo(CLIP_CODE_FACTORS) + ldv vguard_i, 0,t0 + + vmudn vguard_f, vcspos_f, vguard_i + vmadh vguard_i, vcspos_i, vguard_i + + vch v___, vguard_i, vguard_i.w + vcl v___, vguard_f, vguard_f.w + cfc2 t0, COP2_CTRL_VCC + andi t0, 0x707 + srl t1, t0, 5 + andi t0, 0x7 + or t0, t1 + jr ra + sb t0, SCREEN_VTX_CLIP_CODE(dst) + + #undef dst + #undef vcspos_i + #undef vcspos_f + #undef vguard_i + #undef vguard_f + #undef v___ + #undef w + + .endfunc + + ################################################################ + # GL_TnL + # + # Args: + # s3 = address of the vertex in 
DMEM (usually within VERTEX_CACHE) + # + ################################################################ + .func GL_TnL +GL_TnL: + #define tmp_ptr s2 + #define vtx s3 + #define s e0 + move ra2, ra + + #define v___ $v01 + #define vrgba $v04 + + ldv vrgba.e0, PRIM_VTX_R, vtx # R + G + B + A + ldv vrgba.e4, PRIM_VTX_R, vtx # R + G + B + A + + #define vtexsize $v06 + #define vtexoffset $v07 + #define vstrq $v08 + + ldv vstrq, PRIM_VTX_TEX_S,vtx # S + T + R + Q + suv vrgba, SCREEN_VTX_RGBA,vtx + + li s1, %lo(GL_STATE_TEX_SIZE) + llv vtexsize.s, 0,s1 + llv vtexoffset.s, 4,s1 + + #define vinvq_i $v26 + #define vinvq_f $v27 + #define vstrq_i $v28 + #define vstrq_f $v29 + #define q e3 + + # Scale texcoord by texsize and subtract offset (to correct for bilinear sampling if active) + #vmudn v___, vstrq, vtexsize + # vmadh vstrq, vtexoffset, K1 + + #vmudn v___, vstrq, vtexsize + #vmadh vstrq, vtexoffset, K1 + #vmudl vstrq, vstrq, vtexsize + + vmudh v___, vstrq, vtexsize + vsar vstrq_i, COP2_ACC_HI + vsar vstrq_f, COP2_ACC_MD + + vmudl vstrq_f, vstrq_f, K8192 + vmadm vstrq_i, vstrq_i, K8192 + vmadn vstrq, vzero, vzero + + #undef vinvq_i + #undef vinvq_f + #undef vstrq_i + #undef vstrq_f + #undef q + + lbu t0, PRIM_VTX_TRCODE(vtx) + + #define vcspos_f $v02 + #define vcspos_i $v03 + + ldv vcspos_f, PRIM_VTX_CS_POSf,vtx + ldv vcspos_i, PRIM_VTX_CS_POSi,vtx + + # Mark this vertex as having T&L applied + ori t0, 0x80 + sb t0, PRIM_VTX_TRCODE(vtx) + + jal GL_CalcScreenSpace + slv vstrq.s, SCREEN_VTX_S,vtx + + j GL_CalcClipCodes + move ra, ra2 + + #undef vcspos_f + #undef vcspos_i + #undef vtexsize + #undef vtexoffset + + #undef vtx + + #undef v___ + #undef vrgba + #undef vst + #undef s + + .endfunc + + + ################################################################ + # GLCmd_DrawTriangle + # + # Arguments: + # a0: Bit 31..24: Command id + # Bit 11..0: Offset into vertex cache of vtx1 + # a1: Bit 27..16: Offset into vertex cache of vtx2 + # Bit 11..0: Offset into vertex cache of 
vtx3 + # + ################################################################ + .func GLCmd_DrawTriangle +GLCmd_DrawTriangle: + #define vtx1 a1 + #define vtx2 a2 + #define vtx3 a3 + #define trcode1 t6 + #define trcode2 t7 + #define trcode3 t8 + + addi vtx3, a1, %lo(VERTEX_CACHE) + srl vtx2, a1, 16 + addi vtx2, %lo(VERTEX_CACHE) + addi vtx1, a0, %lo(VERTEX_CACHE) + + # Trivial reject: if all the vertices are out of the same plane (at least one), + # the triangle is out of the viewport. + # NOTE: This deliberately uses lb instead of lbu so the sign bit is extended. + # The MSB of each TR-code is a bit flag that is set if the vertex has already + # had T&L applied once. + lb trcode1, PRIM_VTX_TRCODE(vtx1) + lb trcode2, PRIM_VTX_TRCODE(vtx2) + lb trcode3, PRIM_VTX_TRCODE(vtx3) + and t0, trcode1, trcode2 + and t0, trcode3 + andi t0, 0x3F + bnez t0, JrRa + nop + + # Perform T&L for each vertex if we haven't already + bgezal trcode1, GL_TnL + move s3, vtx1 + + bgezal trcode2, GL_TnL + move s3, vtx2 + + bgezal trcode3, GL_TnL + move s3, vtx3 + + lbu t0, SCREEN_VTX_CLIP_CODE(vtx1) + lbu t1, SCREEN_VTX_CLIP_CODE(vtx2) + lbu t2, SCREEN_VTX_CLIP_CODE(vtx3) + or t5, t0, t1 + or t5, t2 + + move s1, zero + beqz t5, gl_draw_single_triangle + move s2, zero + + jal GL_ClipTriangle + nop + + beqz v1, gl_draw_triangle_end + addi s2, -6 + lhu s5, 0(s1) +gl_draw_clipped_triangles_loop: + move vtx1, s5 + lhu vtx2, 2(s1) + lhu vtx3, 4(s1) + +gl_draw_single_triangle: + addi vtx1, SCREEN_VTX_X + addi vtx2, SCREEN_VTX_X + addi vtx3, SCREEN_VTX_X + + lhu a0, %lo(GL_TRI_CMD) + lh v0, %lo(GL_TRI_CULL) + jal RDPQ_Triangle + li s3, %lo(RDPQ_CMD_STAGING) + + jal RDPQ_Send + li s4, %lo(RDPQ_CMD_STAGING) + + blt s1, s2, gl_draw_clipped_triangles_loop + addi s1, 2 + +gl_draw_triangle_end: + j RSPQ_Loop + nop + + #undef vtx1 + #undef vtx2 + #undef vtx3 + + .endfunc + + +GLCmd_MatrixLoad: + #define src s6 + #define dst s7 + + #define vrhs01_i $v02 + #define vrhs01_f $v03 + #define vrhs23_i $v04 + #define 
vrhs23_f $v05 + + addi src, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64 + addi dst, zero, %lo(GL_MATRIX_MVP) + + # Load the matrix from command parameters (misaligned) + lqv vrhs01_i, 0x00,src + lrv vrhs01_i, 0x10,src + lqv vrhs23_i, 0x10,src + lrv vrhs23_i, 0x20,src + lqv vrhs01_f, 0x20,src + lrv vrhs01_f, 0x30,src + lqv vrhs23_f, 0x30,src + lrv vrhs23_f, 0x40,src + + sqv vrhs01_i, 0x00,dst + sqv vrhs23_i, 0x10,dst + sqv vrhs01_f, 0x20,dst + jr ra + sqv vrhs23_f, 0x30,dst + + .func GLCmd_PreInitPipe +GLCmd_PreInitPipe: + #define state_flags k1 + #define tri_cmd t4 + + lw tri_cmd, %lo(GL_STATE_FLAGS) + ori tri_cmd, 0xCC00 + jr ra + sh tri_cmd, %lo(GL_TRI_CMD) + + #undef tri_cmd + #undef state_flags + .endfunc + +#include "rsp_gpu_clipping.inc" +#include diff --git a/misc/n64/rsp_gpu_clipping.inc b/misc/n64/rsp_gpu_clipping.inc new file mode 100644 index 000000000..90d753dad --- /dev/null +++ b/misc/n64/rsp_gpu_clipping.inc @@ -0,0 +1,374 @@ + + .section .data.gl_clipping + + .align 4 +CLIP_PLANES: + .half 1, 0, 0, GUARD_BAND_FACTOR + .half 0, 1, 0, GUARD_BAND_FACTOR + .half 0, 0, 1, 1 + .half 1, 0, 0, -GUARD_BAND_FACTOR + .half 0, 1, 0, -GUARD_BAND_FACTOR + .half 0, 0, 1, -1 + + .section .bss.gl_clipping + +CLIP_CACHE: .dcb.b SCREEN_VTX_SIZE * CLIPPING_CACHE_SIZE +CLIP_CACHE_END: + +CLIP_LISTS: + CLIP_LIST0: .dcb.w CLIPPING_CACHE_SIZE + CLIP_LIST1: .dcb.w CLIPPING_CACHE_SIZE + + + .section .text.gl_clipping + + ################################################################ + # GL_ClipTriangle + # Clip a triangle against the view-frustum by using the Sutherland-Hodgman algorithm + # https://en.wikipedia.org/wiki/Sutherland%E2%80%93Hodgman_algorithm + # Args: + # a1-a3 = Vertices + # t5 = OR'd clip flags of the triangle's vertices + # Returns: + # s1 = Pointer to list of output vertices + # s2 = Pointer to end of list + ################################################################ + .func GL_ClipTriangle +GL_ClipTriangle: + #define out_count v1 + #define 
clip_flags t5 + #define plane_flag t6 + #define in_count t7 + #define in_end t8 + #define in_list s0 + #define out_list s1 + #define plane s2 + #define intersection s3 + #define cur_ptr s4 + #define prev_ptr s5 + #define cur_vtx s6 + #define prev_vtx s7 + #define p0 k0 + #define p1 k1 + #define vtx1 a1 + #define vtx2 a2 + #define vtx3 a3 + + #define vplane $v01 + #define vint_f $v02 + #define vint_i $v03 + #define vdot_i $v04 + #define vdot_f $v05 + #define vdiff_i $v06 + #define vdiff_f $v07 + #define va_i $v08 + #define va_f $v09 + #define vpos_i $v10 + #define vpos_f $v11 + #define vattr0 $v12 + #define vattr1 $v13 + #define voff0 $v14 + #define voff1 $v15 + #define vcache0 $v16 + #define vcache1 $v17 + #define v__ $v29 + + move ra2, ra + + # Init in_list as empty + li in_list, %lo(CLIP_LIST0) + move in_count, zero + + # Put three original vertices in the out_list + # (So after the initial swap they will be in the in_list) + li out_list, %lo(CLIP_LIST1) + sh vtx1, 0(out_list) + sh vtx2, 2(out_list) + sh vtx3, 4(out_list) + li out_count, 3*2 + + li plane, %lo(CLIP_PLANES) + li plane_flag, 1 + + # Load cache offsets + li t0, %lo(CACHE_OFFSETS) + vxor voff1, voff1 + lqv voff0, 0,t0 + lsv voff1, 16,t0 + + # Temporarily use the RDP staging area as a map of which cache slots are used + # Init to zero + li t0, %lo(RDPQ_CMD_STAGING) + sqv vzero, 0,t0 + sqv vzero, 16,t0 + + # Iterate over the 6 clipping planes +gl_clip_plane_loop: + and t0, clip_flags, plane_flag + beqz t0, gl_clip_plane_loop_end + move t1, in_list + + # Swap in and out lists + + # If the out list is empty from the last iteration, + # the triangle has no visible points and we are done + beqz out_count, gl_clip_return + move in_list, out_list + move out_list, t1 + move in_count, out_count + move out_count, zero + + # Iterate over the edges of the polygon in the input list + # The current edge is between cur_vtx and prev_vtx + move cur_ptr, in_list + add in_end, in_list, in_count + # Init the "previous" 
vertex to the last in the list for the wrap-around + addi prev_ptr, in_end, -2 + +gl_clip_edge_loop: + #define cur_flag t3 + #define prev_flag t4 + + # Check which side of the plane the two vertices are on + lhu cur_vtx, 0(cur_ptr) + lhu prev_vtx, 0(prev_ptr) + lbu cur_flag, SCREEN_VTX_CLIP_CODE(cur_vtx) + lbu prev_flag, SCREEN_VTX_CLIP_CODE(prev_vtx) + and cur_flag, plane_flag + and prev_flag, plane_flag + + # If they are on opposite sides, there is an intersection + xor t0, cur_flag, prev_flag + beqz t0, gl_clip_no_intersection + move p0, cur_vtx + + # Swap the two points if necessary to make intersection calculation consistent + # This will make sure p0 is always inside and p1 is always outside + bnez prev_flag, gl_clip_no_swap + move p1, prev_vtx + xor p0, p0, p1 + xor p1, p0, p1 + xor p0, p0, p1 + + #undef prev_flag + +gl_clip_no_swap: + # Calculate intersection of the line segment and the plane + + li t0, %lo(RDPQ_CMD_STAGING) + lqv vcache0, 0,t0 + lqv vcache1, 16,t0 + + # Repeat plane coefficients twice + ldv vplane.e0, 0,plane + ldv vplane.e4, 0,plane + + # vpos: x0 y0 z0 w0 x1 y1 z1 w1 + ldv vpos_i.e0, SCREEN_VTX_CS_POSi,p0 + ldv vpos_f.e0, SCREEN_VTX_CS_POSf,p0 + ldv vpos_i.e4, SCREEN_VTX_CS_POSi,p1 + ldv vpos_f.e4, SCREEN_VTX_CS_POSf,p1 + + # vint: x1 y1 z1 w1 + ldv vint_i.e0, SCREEN_VTX_CS_POSi,p1 + ldv vint_f.e0, SCREEN_VTX_CS_POSf,p1 + + # vattr0: r0 g0 b0 a0 s0 t0 + luv vattr0.e0, SCREEN_VTX_RGBA ,p0 + llv vattr0.e4, SCREEN_VTX_S ,p0 + + # vattr1: r1 g1 b1 a1 s1 t1 + luv vattr1.e0, SCREEN_VTX_RGBA ,p1 + llv vattr1.e4, SCREEN_VTX_S ,p1 + + # Find first free slot in clip cache + + # Add the values from the "used slots map" to the cache offsets + # After this, each lane will contain the offset of its corresponding cache slot, + # but only if the slot is not used. If it is used, it will contain some large value. 
+ vaddc vcache0, voff0 + vaddc vcache1, voff1 + + # Look for the smallest value, which will end up in vcache.e0 + # Because used slots are marked as large values, they will never be found. + vlt vcache0, vcache0.q1 + vlt vcache0, vcache0.h2 + vlt vcache0, vcache0.e4 + vlt vcache0, vcache1.e0 + + mfc2 t0, vcache0.e0 + + # Mark slot as used by storing some large value (careful of overflows!) + li t1, 0xFF + sh t1, %lo(RDPQ_CMD_STAGING)-2(t0) + + # t0 is the index multiplied by 2 + # intersection = t0 * 20 = t0 * 16 + t0 * 4 + sll intersection, t0, 4 + sll t1, t0, 2 + add intersection, t1 + + # CAUTION: intersection might point to the same address as either p0 or p1, + # because one of them is the previous point, which could have been marked unused + # in the previous iteration. As long as we don't access p0 or p1 after writing to + # intersection, this is fine. + addi intersection, %lo(CLIP_CACHE) - SCREEN_VTX_SIZE + + # Store the cache offset in unused memory (used later when finding the cache slot to mark as unused) + sb t0, SCREEN_VTX_PADDING(intersection) + + # Compute dot products of both positions with the clip plane + # vdot.e0: d0 = dot(p0, plane) + # vdot.e4: d1 = dot(p1, plane) + vmudn vdot_f, vpos_f, vplane + vmadh vdot_i, vpos_i, vplane + vaddc vdot_f, vdot_f.q1 + vadd vdot_i, vdot_i.q1 + vaddc vdot_f, vdot_f.h2 + vadd vdot_i, vdot_i.h2 + + # d0 - d1 + vsubc vdiff_f, vdot_f, vdot_f.e4 + vsub vdiff_i, vdot_i, vdot_i.e4 + + # 1 / (d0 - d1) + vrcph v__.e0, vdiff_i.e0 + vrcpl va_f.e0, vdiff_f.e0 + vrcph va_i.e0, vzero.e0 + + # a = d0 / (d0 - d1) + vmudl v__, va_f, vdot_f.e0 + vmadm v__, va_i, vdot_f.e0 + vmadn va_f, va_f, vdot_i.e0 + + # Prepare 0x7FFF in va_i.e0 + vsubc va_i, vshift8, K1 + + # a = min(a, 1) + vge v__, va_f, vzero + vmrg va_f, va_f, va_i.e0 + + # Account for right shift introduced by vrcp + vmudn va_f, va_f, K2 + + # p1 - p0 + vsubc vint_f, vpos_f + vsub vint_i, vpos_i + # attr1 - attr0 + vsubc vattr1, vattr0 + + # Result of linear 
interpolation: + # p0 + a * (p1 - p0) + vmudl v__, vint_f, va_f.e0 + vmadm v__, vint_i, va_f.e0 + vmadn vint_f, vpos_f, K1 + vmadh vint_i, vpos_i, K1 + + # a * (attr1 - attr0) + vmudm vattr1, vattr1, va_f.e0 + + # attr0 + a * (attr1 - attr0) + vaddc vattr0, vattr1 + + # Store results + sdv vint_i.e0, SCREEN_VTX_CS_POSi,intersection + sdv vint_f.e0, SCREEN_VTX_CS_POSf,intersection + suv vattr0.e0, SCREEN_VTX_RGBA ,intersection + jal GL_CalcClipCodes + slv vattr0.e4, SCREEN_VTX_S ,intersection + + # Add intersection to the output list + add t0, out_list, out_count + sh intersection, 0(t0) + addi out_count, 2 + +gl_clip_no_intersection: + # If cur_vtx is inside, add it to the output list + bnez cur_flag, gl_clip_no_current + add t0, out_list, out_count + sh cur_vtx, 0(t0) + b gl_clip_edge_loop_end + addi out_count, 2 + + #undef cur_flag + +gl_clip_no_current: + # Check if the vertex is stored in the clip cache + lbu t0, SCREEN_VTX_PADDING(cur_vtx) + beqz t0, gl_clip_edge_loop_end + # Reset the padding field to zero, so the screen space values won't be recalculated below + sb zero, SCREEN_VTX_PADDING(cur_vtx) + # If so, mark it as unused + sh zero, %lo(RDPQ_CMD_STAGING)-2(t0) + +gl_clip_edge_loop_end: + # Advance to the next edge + addi cur_ptr, 2 + blt cur_ptr, in_end, gl_clip_edge_loop + addi prev_ptr, cur_ptr, -2 + +gl_clip_plane_loop_end: + # Advance to the next clipping plane + sll plane_flag, 1 + blt plane_flag, (1< -#include -#include #include +#include +#include "../misc/n64/gpu.c" typedef void (*GL_SetupVBFunc)(void); static GL_SetupVBFunc gfx_setupVBFunc; @@ -20,16 +20,21 @@ static surface_t zbuffer; static GfxResourceID white_square; void Gfx_Create(void) { + rspq_init(); + //rspq_profile_start(); rdpq_init(); + //rdpq_debug_start(); // TODO debug + //rdpq_debug_log(true); rdpq_set_mode_standard(); __rdpq_mode_change_som(SOM_TEXTURE_PERSP, SOM_TEXTURE_PERSP); __rdpq_mode_change_som(SOM_ZMODE_MASK, SOM_ZMODE_OPAQUE); rdpq_mode_dithering(DITHER_SQUARE_SQUARE); 
+ // Set alpha compare threshold + rdpq_set_blend_color(RGBA32(0,0,0, 127)); + gl_init(); - //rdpq_debug_start(); // TODO debug - //rdpq_debug_log(true); zbuffer = surface_alloc(FMT_RGBA16, display_get_width(), display_get_height()); Gfx.MaxTexWidth = 256; @@ -121,7 +126,10 @@ void Gfx_ClearColor(PackedCol color) { void Gfx_EndFrame(void) { Platform_LogConst("GFX ctx end"); rdpq_detach_show(); -//Platform_LogConst("GFX END"); + //Platform_LogConst("GFX END"); + + //rspq_profile_dump(); + //rspq_profile_next_frame(); } @@ -253,7 +261,7 @@ static void SetAlphaBlend(cc_bool enabled) { void Gfx_SetAlphaArgBlend(cc_bool enabled) { } static void SetAlphaTest(cc_bool enabled) { - rdpq_mode_alphacompare(enabled ? 127 : 0); + __rdpq_mode_change_som(SOM_ALPHACOMPARE_MASK, enabled ? SOM_ALPHACOMPARE_THRESHOLD : 0); } static void SetColorWrite(cc_bool r, cc_bool g, cc_bool b, cc_bool a) { From 1f395b9a6c346acab8abf39b9dcaf9e39961d56e Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Sun, 27 Apr 2025 13:29:04 +1000 Subject: [PATCH 03/14] N64 optimised, stage 3 --- misc/n64/gpu.c | 271 +++++++++++++++++---------------------------- src/Graphics_N64.c | 73 ++++-------- 2 files changed, 122 insertions(+), 222 deletions(-) diff --git a/misc/n64/gpu.c b/misc/n64/gpu.c index d84943dab..a496eb173 100644 --- a/misc/n64/gpu.c +++ b/misc/n64/gpu.c @@ -1,4 +1,3 @@ -#include "GL/gl.h" #include "rspq.h" #include "rdpq.h" #include "rdpq_rect.h" @@ -6,14 +5,11 @@ #include "rdpq_debug.h" #include "display.h" #include "rdp.h" -#include -#include -#include #include "gl_constants.h" // This is a severely cutdown version of libdragon's OpenGL implementation -static uint32_t glp_id; +static uint32_t gpup_id; //DEFINE_RSP_UCODE(rsp_gpu); extern uint8_t _binary_build_n64_rsp_gpu_text_bin_start[]; extern uint8_t _binary_build_n64_rsp_gpu_data_bin_start[]; @@ -46,23 +42,16 @@ enum { GPU_CMD_PRE_INIT_PIPE = 0x8, }; -enum { - ATTRIB_VERTEX, - ATTRIB_COLOR, - ATTRIB_TEXCOORD, - ATTRIB_COUNT -}; - 
typedef struct { - GLfloat scale[3]; - GLfloat offset[3]; -} gl_viewport_t; + float scale[3]; + float offset[3]; +} gpu_viewport_t; typedef struct { int16_t i[4][4]; uint16_t f[4][4]; -} gl_matrix_srv_t; -_Static_assert(sizeof(gl_matrix_srv_t) == MATRIX_SIZE, "Matrix size does not match"); +} gpu_matrix_srv_t; +_Static_assert(sizeof(gpu_matrix_srv_t) == MATRIX_SIZE, "Matrix size does not match"); typedef struct { rspq_write_t w; @@ -71,16 +60,10 @@ typedef struct { uint32_t word; }; uint32_t buffer_head; -} gl_cmd_stream_t; +} gpu_cmd_stream_t; typedef struct { - GLsizei stride; - const GLvoid *pointer; - bool enabled; -} gl_array_t; - -typedef struct { - gl_matrix_srv_t mvp_matrix; + gpu_matrix_srv_t mvp_matrix; int16_t viewport_scale[4]; int16_t viewport_offset[4]; uint32_t flags; @@ -88,134 +71,111 @@ typedef struct { uint16_t tex_offset[2]; uint16_t tri_cmd; uint16_t tri_cull; -} __attribute__((aligned(8), packed)) gl_server_state_t; +} __attribute__((aligned(8), packed)) gpu_state; -static inline const void *gl_get_attrib_element(const gl_array_t *src, uint32_t index) +static inline gpu_cmd_stream_t gpu_cmd_stream_begin(uint32_t ovl_id, uint32_t cmd_id, int size) { - return src->pointer + index * src->stride; -} - -static inline gl_cmd_stream_t gl_cmd_stream_begin(uint32_t ovl_id, uint32_t cmd_id, int size) -{ - return (gl_cmd_stream_t) { + return (gpu_cmd_stream_t) { .w = rspq_write_begin(ovl_id, cmd_id, size), .buffer_head = 2, }; } -static inline void gl_cmd_stream_commit(gl_cmd_stream_t *s) +static inline void gpu_cmd_stream_commit(gpu_cmd_stream_t *s) { rspq_write_arg(&s->w, s->word); s->buffer_head = 0; s->word = 0; } -static inline void gl_cmd_stream_put_half(gl_cmd_stream_t *s, uint16_t v) +static inline void gpu_cmd_stream_put_half(gpu_cmd_stream_t *s, uint16_t v) { s->bytes[s->buffer_head++] = v >> 8; s->bytes[s->buffer_head++] = v & 0xFF; if (s->buffer_head == sizeof(uint32_t)) { - gl_cmd_stream_commit(s); + gpu_cmd_stream_commit(s); } } -static 
inline void gl_cmd_stream_end(gl_cmd_stream_t *s) +static inline void gpu_cmd_stream_end(gpu_cmd_stream_t *s) { if (s->buffer_head > 0) { - gl_cmd_stream_commit(s); + gpu_cmd_stream_commit(s); } rspq_write_end(&s->w); } __attribute__((always_inline)) -static inline void gl_set_flag_raw(uint32_t offset, uint32_t flag, bool value) +static inline void gpu_set_flag_raw(uint32_t offset, uint32_t flag, bool value) { - rspq_write(glp_id, GPU_CMD_SET_FLAG, offset | value, value ? flag : ~flag); + rspq_write(gpup_id, GPU_CMD_SET_FLAG, offset | value, value ? flag : ~flag); } __attribute__((always_inline)) -static inline void gl_set_flag(uint32_t flag, bool value) +static inline void gpu_set_flag(uint32_t flag, bool value) { - gl_set_flag_raw(offsetof(gl_server_state_t, flags), flag, value); + gpu_set_flag_raw(offsetof(gpu_state, flags), flag, value); } __attribute__((always_inline)) -static inline void gl_set_byte(uint32_t offset, uint8_t value) +static inline void gpu_set_byte(uint32_t offset, uint8_t value) { - rspq_write(glp_id, GPU_CMD_SET_BYTE, offset, value); + rspq_write(gpup_id, GPU_CMD_SET_BYTE, offset, value); } __attribute__((always_inline)) -static inline void gl_set_short(uint32_t offset, uint16_t value) +static inline void gpu_set_short(uint32_t offset, uint16_t value) { - rspq_write(glp_id, GPU_CMD_SET_SHORT, offset, value); + rspq_write(gpup_id, GPU_CMD_SET_SHORT, offset, value); } __attribute__((always_inline)) -static inline void gl_set_word(uint32_t offset, uint32_t value) +static inline void gpu_set_word(uint32_t offset, uint32_t value) { - rspq_write(glp_id, GPU_CMD_SET_WORD, offset, value); + rspq_write(gpup_id, GPU_CMD_SET_WORD, offset, value); } __attribute__((always_inline)) -static inline void gl_set_long(uint32_t offset, uint64_t value) +static inline void gpu_set_long(uint32_t offset, uint64_t value) { - rspq_write(glp_id, GPU_CMD_SET_LONG, offset, value >> 32, value & 0xFFFFFFFF); + rspq_write(gpup_id, GPU_CMD_SET_LONG, offset, value >> 32, 
value & 0xFFFFFFFF); } -static inline void glpipe_draw_triangle(int i0, int i1, int i2) +static inline void gpupipe_draw_triangle(int i0, int i1, int i2) { // We pass -1 because the triangle can be clipped and split into multiple // triangles. - rdpq_write(-1, glp_id, GPU_CMD_DRAW_TRI, + rdpq_write(-1, gpup_id, GPU_CMD_DRAW_TRI, (i0*PRIM_VTX_SIZE), ((i1*PRIM_VTX_SIZE)<<16) | (i2*PRIM_VTX_SIZE) ); } -static gl_viewport_t state_viewport; -static gl_array_t state_arrays[ATTRIB_COUNT]; +static gpu_viewport_t state_viewport; +static bool gpu_texturing; +static void* gpu_pointer; +static int gpu_stride; -void gl_init() -{ - glp_id = rspq_overlay_register(&rsp_gpu); - glDepthRange(0, 1); -} - -void gl_close() -{ - rspq_wait(); - rspq_overlay_unregister(glp_id); -} - -void gl_set_flag2(GLenum target, bool value) +#define GPU_ATTR_Z 0 +#define GPU_ATTR_TEX 1 +static void gpuSetFlag(int target, bool value) { switch (target) { - case GL_DEPTH_TEST: - gl_set_flag(FLAG_DEPTH_TEST, value); + case GPU_ATTR_Z: + gpu_set_flag(FLAG_DEPTH_TEST, value); break; - case GL_TEXTURE_2D: - gl_set_flag(FLAG_TEXTURE_ACTIVE, value); + case GPU_ATTR_TEX: + gpu_set_flag(FLAG_TEXTURE_ACTIVE, value); break; } } -void glEnable(GLenum target) +static void gpuSetTexSize(uint16_t width, uint16_t height) { - gl_set_flag2(target, true); -} - -void glDisable(GLenum target) -{ - gl_set_flag2(target, false); -} - -void glTexSizeN64(uint16_t width, uint16_t height) -{ - gl_set_word(offsetof(gl_server_state_t, tex_size[0]), (width << 16) | height); + gpu_set_word(offsetof(gpu_state, tex_size[0]), (width << 16) | height); } @@ -228,7 +188,7 @@ static inline void write_shorts(rspq_write_t *w, const uint16_t *s, uint32_t cou } } -static inline void gl_matrix_write(rspq_write_t *w, const GLfloat *m) +static inline void gpu_matrix_write(rspq_write_t* w, const float* m) { uint16_t integer[16]; uint16_t fraction[16]; @@ -244,125 +204,80 @@ static inline void gl_matrix_write(rspq_write_t *w, const GLfloat *m) 
write_shorts(w, fraction, 16); } -void glLoadMatrixf(const GLfloat *m) +static void gpuLoadMatrix(const float* m) { - rspq_write_t w = rspq_write_begin(glp_id, GPU_CMD_MATRIX_LOAD, 17); + rspq_write_t w = rspq_write_begin(gpup_id, GPU_CMD_MATRIX_LOAD, 17); rspq_write_arg(&w, false); // no multiply - gl_matrix_write(&w, m); + gpu_matrix_write(&w, m); rspq_write_end(&w); } -static void upload_vertex(const gl_array_t *arrays, uint32_t index, uint8_t cache_index) +static void upload_vertex(uint32_t index, uint8_t cache_index) { - gl_cmd_stream_t s = gl_cmd_stream_begin(glp_id, GPU_CMD_UPLOAD_VTX, 6); - gl_cmd_stream_put_half(&s, cache_index * PRIM_VTX_SIZE); + gpu_cmd_stream_t s = gpu_cmd_stream_begin(gpup_id, GPU_CMD_UPLOAD_VTX, 6); + gpu_cmd_stream_put_half(&s, cache_index * PRIM_VTX_SIZE); + char* ptr = gpu_pointer + index * gpu_stride; - const float* vtx = gl_get_attrib_element(&arrays[ATTRIB_VERTEX], index); - gl_cmd_stream_put_half(&s, vtx[0] * (1<stride = stride; - array->pointer = pointer; -} - -void glTexCoordPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *pointer) -{ - gl_array_t *array = &state_arrays[ATTRIB_TEXCOORD]; - array->stride = stride; - array->pointer = pointer; -} - -void glColorPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *pointer) -{ - gl_array_t *array = &state_arrays[ATTRIB_COLOR]; - array->stride = stride; - array->pointer = pointer; -} - -void gl_set_array_enabled(int array_type, bool enabled) -{ - state_arrays[array_type].enabled = enabled; -} - -void glEnableClientState(GLenum array) -{ - gl_set_array_enabled(gl_array_type_from_enum(array), true); -} - -void glDisableClientState(GLenum array) -{ - gl_set_array_enabled(gl_array_type_from_enum(array), false); -} - -void glDrawArrays(GLenum mode, GLint first, GLsizei count) -{ - rspq_write(glp_id, GPU_CMD_PRE_INIT_PIPE); - gl_rsp_draw_arrays(first, count); -} - -void glDepthRange(GLclampd n, GLclampd f) +static void gpuDepthRange(float n, float f) { 
state_viewport.scale[2] = (f - n) * 0.5f; state_viewport.offset[2] = n + (f - n) * 0.5f; - gl_set_short( - offsetof(gl_server_state_t, viewport_scale) + sizeof(int16_t) * 2, + gpu_set_short( + offsetof(gpu_state, viewport_scale) + sizeof(int16_t) * 2, state_viewport.scale[2] * 4); - gl_set_short( - offsetof(gl_server_state_t, viewport_offset) + sizeof(int16_t) * 2, + gpu_set_short( + offsetof(gpu_state, viewport_offset) + sizeof(int16_t) * 2, state_viewport.offset[2] * 4); } -void glViewport(GLint x, GLint y, GLsizei w, GLsizei h) +static void gpuViewport(int x, int y, int w, int h) { state_viewport.scale[0] = w * 0.5f; state_viewport.scale[1] = h * -0.5f; @@ -382,17 +297,29 @@ void glViewport(GLint x, GLint y, GLsizei w, GLsizei h) uint16_t offset_y = state_viewport.offset[1] * SCREEN_XY_SCALE; uint16_t offset_z = state_viewport.offset[2] * SCREEN_Z_SCALE; - gl_set_long( - offsetof(gl_server_state_t, viewport_scale), + gpu_set_long( + offsetof(gpu_state, viewport_scale), ((uint64_t)scale_x << 48) | ((uint64_t)scale_y << 32) | ((uint64_t)scale_z << 16)); - gl_set_long( - offsetof(gl_server_state_t, viewport_offset), + gpu_set_long( + offsetof(gpu_state, viewport_offset), ((uint64_t)offset_x << 48) | ((uint64_t)offset_y << 32) | ((uint64_t)offset_z << 16)); } -void glCullFace(GLenum mode) +static void gpuSetCullFace(bool enabled) { // 1 = cull backfaces // 2 = don't cull - gl_set_short(offsetof(gl_server_state_t, tri_cull), mode ? 1 : 2); + gpu_set_short(offsetof(gpu_state, tri_cull), enabled ? 
1 : 2); +} + +void gpu_init() +{ + gpup_id = rspq_overlay_register(&rsp_gpu); + gpuDepthRange(0, 1); +} + +void gpu_close() +{ + rspq_wait(); + rspq_overlay_unregister(gpup_id); } diff --git a/src/Graphics_N64.c b/src/Graphics_N64.c index 2331f86ec..7c7c207ca 100644 --- a/src/Graphics_N64.c +++ b/src/Graphics_N64.c @@ -9,10 +9,6 @@ #include #include "../misc/n64/gpu.c" -typedef void (*GL_SetupVBFunc)(void); -static GL_SetupVBFunc gfx_setupVBFunc; - - /*########################################################################################################################* *---------------------------------------------------------General---------------------------------------------------------* *#########################################################################################################################*/ @@ -34,7 +30,7 @@ void Gfx_Create(void) { // Set alpha compare threshold rdpq_set_blend_color(RGBA32(0,0,0, 127)); - gl_init(); + gpu_init(); zbuffer = surface_alloc(FMT_RGBA16, display_get_width(), display_get_height()); Gfx.MaxTexWidth = 256; @@ -60,11 +56,9 @@ cc_bool Gfx_TryRestoreContext(void) { void Gfx_Free(void) { Gfx_FreeState(); - gl_close(); + gpu_close(); } -#define gl_Toggle(cap) if (enabled) { glEnable(cap); } else { glDisable(cap); } - /*########################################################################################################################* *-----------------------------------------------------------Misc----------------------------------------------------------* @@ -89,7 +83,7 @@ void Gfx_SetVSync(cc_bool vsync) { void Gfx_OnWindowResize(void) { } void Gfx_SetViewport(int x, int y, int w, int h) { - glViewport(x, y, w, h); + gpuViewport(x, y, w, h); } void Gfx_SetScissor(int x, int y, int w, int h) { @@ -146,7 +140,7 @@ void Gfx_BindTexture(GfxResourceID texId) { CCTexture* tex = (CCTexture*)texId; rspq_block_run(tex->upload_block); - glTexSizeN64(tex->surface.width, tex->surface.height); + 
gpuSetTexSize(tex->surface.width, tex->surface.height); } #define ALIGNUP8(size) (((size) + 7) & ~0x07) @@ -207,10 +201,7 @@ GfxResourceID Gfx_AllocTexture(struct Bitmap* bmp, int rowWidth, cc_uint8 flags, } void Gfx_UpdateTexture(GfxResourceID texId, int x, int y, struct Bitmap* part, int rowWidth, cc_bool mipmaps) { - // TODO: Just memcpying doesn't actually work. maybe due to glSurfaceTexImageN64 caching the RSQ upload block? - // TODO: Is there a more optimised approach than just calling glSurfaceTexImageN64 CCTexture* tex = (CCTexture*)texId; - surface_t* fb = &tex->surface; cc_uint32* src = (cc_uint32*)part->scan0 + x; cc_uint8* dst = (cc_uint8*)fb->buffer + (x * 4) + (y * fb->stride); @@ -250,7 +241,7 @@ void Gfx_DisableMipmaps(void) { } *-----------------------------------------------------State management----------------------------------------------------* *#########################################################################################################################*/ void Gfx_SetFaceCulling(cc_bool enabled) { - glCullFace(enabled ? GL_BACK : 0); + gpuSetCullFace(enabled); } static void SetAlphaBlend(cc_bool enabled) { @@ -265,7 +256,7 @@ static void SetAlphaTest(cc_bool enabled) { } static void SetColorWrite(cc_bool r, cc_bool g, cc_bool b, cc_bool a) { - //glColorMask(r, g, b, a); TODO + //gpuColorMask(r, g, b, a); TODO } void Gfx_SetDepthWrite(cc_bool enabled) { @@ -275,14 +266,12 @@ void Gfx_SetDepthWrite(cc_bool enabled) { void Gfx_SetDepthTest(cc_bool enabled) { __rdpq_mode_change_som(SOM_Z_COMPARE, enabled ? 
SOM_Z_COMPARE : 0); - gl_Toggle(GL_DEPTH_TEST); + gpuSetFlag(GPU_ATTR_Z, enabled); } static void Gfx_FreeState(void) { FreeDefaultResources(); } static void Gfx_RestoreState(void) { InitDefaultResources(); - glEnableClientState(GL_VERTEX_ARRAY); - glEnableClientState(GL_COLOR_ARRAY); gfx_format = -1; // 1x1 dummy white texture @@ -379,8 +368,8 @@ static rspq_block_t* VB_GetCached(struct VertexBuffer* vb, int offset, int count if (vb->cache.blocks[i]) continue; rspq_block_begin(); - gfx_setupVBFunc(); - glDrawArrays(GL_QUADS, offset, count); + gpu_pointer = gfx_vb->vertices; + gpuDrawArrays(offset, count); rspq_block_t* block = rspq_block_end(); vb->cache.blocks[i] = block; @@ -469,9 +458,10 @@ void Gfx_SetFogMode(FogFunc func) { void Gfx_DepthOnlyRendering(cc_bool depthOnly) { depthOnlyRendering = depthOnly; // TODO: Better approach? maybe using glBlendFunc instead? cc_bool enabled = !depthOnly; + //SetColorWrite(enabled & gfx_colorMask[0], enabled & gfx_colorMask[1], // enabled & gfx_colorMask[2], enabled & gfx_colorMask[3]); - if (enabled) { glEnable(GL_TEXTURE_2D); } else { glDisable(GL_TEXTURE_2D); } + gpuSetFlag(GPU_ATTR_TEX, enabled); } @@ -486,7 +476,7 @@ void Gfx_LoadMatrix(MatrixType type, const struct Matrix* matrix) { struct Matrix mvp __attribute__((aligned(64))); Matrix_Mul(&mvp, &_view, &_proj); - glLoadMatrixf((const float*)&mvp); + gpuLoadMatrix((const float*)&mvp); } void Gfx_LoadMVP(const struct Matrix* view, const struct Matrix* proj, struct Matrix* mvp) { @@ -494,7 +484,7 @@ void Gfx_LoadMVP(const struct Matrix* view, const struct Matrix* proj, struct Ma _view = *view; Matrix_Mul(mvp, view, proj); - glLoadMatrixf((const float*)mvp); + gpuLoadMatrix((const float*)mvp); } void Gfx_EnableTextureOffset(float x, float y) { @@ -507,35 +497,20 @@ void Gfx_DisableTextureOffset(void) { } /*########################################################################################################################* 
*--------------------------------------------------------Rendering--------------------------------------------------------* *#########################################################################################################################*/ -static void GL_SetupVbColoured(void) { - glVertexPointer(3, GL_FLOAT, SIZEOF_VERTEX_COLOURED, (void*)(gfx_vb->vertices + 0)); - glColorPointer(4, GL_UNSIGNED_BYTE, SIZEOF_VERTEX_COLOURED, (void*)(gfx_vb->vertices + 12)); -} - -static void GL_SetupVbTextured(void) { - glVertexPointer(3, GL_FLOAT, SIZEOF_VERTEX_TEXTURED, (void*)(gfx_vb->vertices + 0)); - glColorPointer(4, GL_UNSIGNED_BYTE, SIZEOF_VERTEX_TEXTURED, (void*)(gfx_vb->vertices + 12)); - glTexCoordPointer(2, GL_FLOAT, SIZEOF_VERTEX_TEXTURED, (void*)(gfx_vb->vertices + 16)); -} - void Gfx_SetVertexFormat(VertexFormat fmt) { if (fmt == gfx_format) return; gfx_format = fmt; gfx_stride = strideSizes[fmt]; + gpu_stride = gfx_stride; if (fmt == VERTEX_FORMAT_TEXTURED) { - glEnableClientState(GL_TEXTURE_COORD_ARRAY); - glEnable(GL_TEXTURE_2D); - - gfx_setupVBFunc = GL_SetupVbTextured; rdpq_mode_combiner(RDPQ_COMBINER_TEX_SHADE); } else { - glDisableClientState(GL_TEXTURE_COORD_ARRAY); - glDisable(GL_TEXTURE_2D); - - gfx_setupVBFunc = GL_SetupVbColoured; rdpq_mode_combiner(RDPQ_COMBINER_SHADE); } + + gpu_texturing = fmt == VERTEX_FORMAT_TEXTURED; + gpuSetFlag(GPU_ATTR_TEX, gpu_texturing); } void Gfx_DrawVb_Lines(int verticesCount) { @@ -547,8 +522,8 @@ void Gfx_DrawVb_IndexedTris_Range(int verticesCount, int startVertex, DrawHints if (block) { rspq_block_run(block); } else { - gfx_setupVBFunc(); - glDrawArrays(GL_QUADS, startVertex, verticesCount); + gpu_pointer = gfx_vb->vertices; + gpuDrawArrays(startVertex, verticesCount); } } @@ -558,8 +533,8 @@ void Gfx_DrawVb_IndexedTris(int verticesCount) { if (block) { rspq_block_run(block); } else { - gfx_setupVBFunc(); - glDrawArrays(GL_QUADS, 0, verticesCount); + gpu_pointer = gfx_vb->vertices; + gpuDrawArrays(0, 
verticesCount); } } @@ -570,10 +545,8 @@ void Gfx_DrawIndexedTris_T2fC4b(int verticesCount, int startVertex) { if (block) { rspq_block_run(block); } else { - glVertexPointer(3, GL_FLOAT, SIZEOF_VERTEX_TEXTURED, (void*)(gfx_vb->vertices)); - glColorPointer(4, GL_UNSIGNED_BYTE, SIZEOF_VERTEX_TEXTURED, (void*)(gfx_vb->vertices + 12)); - glTexCoordPointer(2, GL_FLOAT, SIZEOF_VERTEX_TEXTURED, (void*)(gfx_vb->vertices + 16)); - glDrawArrays(GL_QUADS, startVertex, verticesCount); + gpu_pointer = gfx_vb->vertices; + gpuDrawArrays(startVertex, verticesCount); } } #endif From 4dc90741d71a5c1e07c77eed763684884629c3e2 Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Sun, 27 Apr 2025 13:38:39 +1000 Subject: [PATCH 04/14] N64 optimised, stage 4 --- misc/n64/gpu.c | 103 +++++++++++++++---------------------------------- 1 file changed, 31 insertions(+), 72 deletions(-) diff --git a/misc/n64/gpu.c b/misc/n64/gpu.c index a496eb173..05d003cbf 100644 --- a/misc/n64/gpu.c +++ b/misc/n64/gpu.c @@ -73,40 +73,6 @@ typedef struct { uint16_t tri_cull; } __attribute__((aligned(8), packed)) gpu_state; -static inline gpu_cmd_stream_t gpu_cmd_stream_begin(uint32_t ovl_id, uint32_t cmd_id, int size) -{ - return (gpu_cmd_stream_t) { - .w = rspq_write_begin(ovl_id, cmd_id, size), - .buffer_head = 2, - }; -} - -static inline void gpu_cmd_stream_commit(gpu_cmd_stream_t *s) -{ - rspq_write_arg(&s->w, s->word); - s->buffer_head = 0; - s->word = 0; -} - -static inline void gpu_cmd_stream_put_half(gpu_cmd_stream_t *s, uint16_t v) -{ - s->bytes[s->buffer_head++] = v >> 8; - s->bytes[s->buffer_head++] = v & 0xFF; - - if (s->buffer_head == sizeof(uint32_t)) { - gpu_cmd_stream_commit(s); - } -} - -static inline void gpu_cmd_stream_end(gpu_cmd_stream_t *s) -{ - if (s->buffer_head > 0) { - gpu_cmd_stream_commit(s); - } - - rspq_write_end(&s->w); -} - __attribute__((always_inline)) static inline void gpu_set_flag_raw(uint32_t offset, uint32_t flag, bool value) { @@ -143,7 +109,7 @@ static inline void 
gpu_set_long(uint32_t offset, uint64_t value) rspq_write(gpup_id, GPU_CMD_SET_LONG, offset, value >> 32, value & 0xFFFFFFFF); } -static inline void gpupipe_draw_triangle(int i0, int i1, int i2) +static inline void gpu_draw_triangle(int i0, int i1, int i2) { // We pass -1 because the triangle can be clipped and split into multiple // triangles. @@ -212,37 +178,43 @@ static void gpuLoadMatrix(const float* m) rspq_write_end(&w); } +static inline void put_word(rspq_write_t* s, uint16_t v1, uint16_t v2) +{ + rspq_write_arg(s, v2 | (v1 << 16)); +} + static void upload_vertex(uint32_t index, uint8_t cache_index) { - gpu_cmd_stream_t s = gpu_cmd_stream_begin(gpup_id, GPU_CMD_UPLOAD_VTX, 6); - gpu_cmd_stream_put_half(&s, cache_index * PRIM_VTX_SIZE); + rspq_write_t s = rspq_write_begin(gpup_id, GPU_CMD_UPLOAD_VTX, 6); + rspq_write_arg(&s, cache_index * PRIM_VTX_SIZE); char* ptr = gpu_pointer + index * gpu_stride; float* vtx = (float*)(ptr + 0); - gpu_cmd_stream_put_half(&s, vtx[0] * (1< Date: Mon, 28 Apr 2025 20:18:53 +1000 Subject: [PATCH 05/14] N64 optimised, stage 5 --- misc/n64/gpu.c | 54 +++++++++++++++++++------------------------------- 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/misc/n64/gpu.c b/misc/n64/gpu.c index 05d003cbf..86b1c32b3 100644 --- a/misc/n64/gpu.c +++ b/misc/n64/gpu.c @@ -4,7 +4,6 @@ #include "rdpq_mode.h" #include "rdpq_debug.h" #include "display.h" -#include "rdp.h" #include "gl_constants.h" // This is a severely cutdown version of libdragon's OpenGL implementation @@ -42,30 +41,16 @@ enum { GPU_CMD_PRE_INIT_PIPE = 0x8, }; -typedef struct { - float scale[3]; - float offset[3]; -} gpu_viewport_t; - typedef struct { int16_t i[4][4]; uint16_t f[4][4]; } gpu_matrix_srv_t; _Static_assert(sizeof(gpu_matrix_srv_t) == MATRIX_SIZE, "Matrix size does not match"); -typedef struct { - rspq_write_t w; - union { - uint8_t bytes[4]; - uint32_t word; - }; - uint32_t buffer_head; -} gpu_cmd_stream_t; - typedef struct { gpu_matrix_srv_t 
mvp_matrix; - int16_t viewport_scale[4]; - int16_t viewport_offset[4]; + int16_t vp_scale[4]; + int16_t vp_offset[4]; uint32_t flags; uint16_t tex_size[2]; uint16_t tex_offset[2]; @@ -120,7 +105,8 @@ static inline void gpu_draw_triangle(int i0, int i1, int i2) } -static gpu_viewport_t state_viewport; +static float gpu_vp_scale[3]; +static float gpu_vp_offset[3]; static bool gpu_texturing; static void* gpu_pointer; static int gpu_stride; @@ -232,38 +218,38 @@ static void gpuDrawArrays(uint32_t first, uint32_t count) static void gpuDepthRange(float n, float f) { - state_viewport.scale[2] = (f - n) * 0.5f; - state_viewport.offset[2] = n + (f - n) * 0.5f; + gpu_vp_scale[2] = (f - n) * 0.5f; + gpu_vp_offset[2] = n + (f - n) * 0.5f; - gpu_set_short(offsetof(gpu_state, viewport_scale[2]), state_viewport.scale[2] * 4); - gpu_set_short(offsetof(gpu_state, viewport_offset[2]), state_viewport.offset[2] * 4); + gpu_set_short(offsetof(gpu_state, vp_scale[2]), gpu_vp_scale[2] * 4); + gpu_set_short(offsetof(gpu_state, vp_offset[2]), gpu_vp_offset[2] * 4); } static void gpuViewport(int x, int y, int w, int h) { - state_viewport.scale[0] = w * 0.5f; - state_viewport.scale[1] = h * -0.5f; - state_viewport.offset[0] = x + w * 0.5f; - state_viewport.offset[1] = y + h * 0.5f; + gpu_vp_scale[0] = w * 0.5f; + gpu_vp_scale[1] = h * -0.5f; + gpu_vp_offset[0] = x + w * 0.5f; + gpu_vp_offset[1] = y + h * 0.5f; // Screen coordinates are s13.2 #define SCREEN_XY_SCALE 4.0f #define SCREEN_Z_SCALE 32767.0f // * 2.0f to compensate for RSP reciprocal missing 1 bit - uint16_t scale_x = state_viewport.scale[0] * SCREEN_XY_SCALE * 2.0f; - uint16_t scale_y = state_viewport.scale[1] * SCREEN_XY_SCALE * 2.0f; - uint16_t scale_z = state_viewport.scale[2] * SCREEN_Z_SCALE * 2.0f; + uint16_t scale_x = gpu_vp_scale[0] * SCREEN_XY_SCALE * 2.0f; + uint16_t scale_y = gpu_vp_scale[1] * SCREEN_XY_SCALE * 2.0f; + uint16_t scale_z = gpu_vp_scale[2] * SCREEN_Z_SCALE * 2.0f; - uint16_t offset_x = 
state_viewport.offset[0] * SCREEN_XY_SCALE; - uint16_t offset_y = state_viewport.offset[1] * SCREEN_XY_SCALE; - uint16_t offset_z = state_viewport.offset[2] * SCREEN_Z_SCALE; + uint16_t offset_x = gpu_vp_offset[0] * SCREEN_XY_SCALE; + uint16_t offset_y = gpu_vp_offset[1] * SCREEN_XY_SCALE; + uint16_t offset_z = gpu_vp_offset[2] * SCREEN_Z_SCALE; gpu_set_long( - offsetof(gpu_state, viewport_scale), + offsetof(gpu_state, vp_scale), ((uint64_t)scale_x << 48) | ((uint64_t)scale_y << 32) | ((uint64_t)scale_z << 16)); gpu_set_long( - offsetof(gpu_state, viewport_offset), + offsetof(gpu_state, vp_offset), ((uint64_t)offset_x << 48) | ((uint64_t)offset_y << 32) | ((uint64_t)offset_z << 16)); } From a44f760f311f17bfe08f3aa22f8ad310735d2269 Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Tue, 29 Apr 2025 06:12:15 +1000 Subject: [PATCH 06/14] Undo debug changes --- src/Server.c | 4 ++-- src/main.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Server.c b/src/Server.c index deae93fb8..f6fea3c60 100644 --- a/src/Server.c +++ b/src/Server.c @@ -145,12 +145,12 @@ static void SPConnection_BeginConnect(void) { World_SetDimensions(horSize, verSize, horSize); #if defined CC_BUILD_N64 || defined CC_BUILD_NDS || defined CC_BUILD_PS1 || defined CC_BUILD_SATURN || defined CC_BUILD_32X || defined CC_BUILD_GBA - Gen_Active = &NotchyGen; + Gen_Active = &FlatgrassGen; #else Gen_Active = &NotchyGen; #endif - Gen_Seed = 400;//Random_Next(&rnd, Int32_MaxValue); + Gen_Seed = Random_Next(&rnd, Int32_MaxValue); Gen_Start(); GeneratingScreen_Show(); diff --git a/src/main.c b/src/main.c index 48b880dad..6abab2a5a 100644 --- a/src/main.c +++ b/src/main.c @@ -153,12 +153,12 @@ static int RunProgram(int argc, char** argv) { struct ResumeInfo r; cc_string host; -//#ifdef _MSC_VER +#ifdef _MSC_VER /* NOTE: Make sure to comment this out before pushing a commit */ //cc_string rawArgs = String_FromConst("UnknownShadow200 fffff 127.0.0.1 25565"); //cc_string rawArgs = 
String_FromConst("UnknownShadow200"); //argsCount = String_UNSAFE_Split(&rawArgs, ' ', args, 4); -//#endif +#endif if (argsCount == 0) { #ifdef CC_BUILD_WEB From 3dc7b93efff2f112cb3dd89853902ac99aeb0214 Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Wed, 30 Apr 2025 19:38:27 +1000 Subject: [PATCH 07/14] Simplify triangle command calculation --- misc/n64/gl_constants.h | 3 -- misc/n64/gpu.c | 55 ++++++++++---------------- misc/n64/rsp_gpu.S | 88 +++++++++++------------------------------ src/Graphics_N64.c | 9 +++-- 4 files changed, 48 insertions(+), 107 deletions(-) diff --git a/misc/n64/gl_constants.h b/misc/n64/gl_constants.h index cf6ea3ce6..6ad83fdeb 100644 --- a/misc/n64/gl_constants.h +++ b/misc/n64/gl_constants.h @@ -12,9 +12,6 @@ #define VTX_SHIFT 5 #define TEX_SHIFT 8 -#define FLAG_DEPTH_TEST (1 << 8) -#define FLAG_TEXTURE_ACTIVE (1 << 9) - #define GUARD_BAND_FACTOR 2 #define ASSERT_INVALID_VTX_ID 0x2001 diff --git a/misc/n64/gpu.c b/misc/n64/gpu.c index 86b1c32b3..b80e3b04f 100644 --- a/misc/n64/gpu.c +++ b/misc/n64/gpu.c @@ -28,17 +28,14 @@ static rsp_ucode_t rsp_gpu = (rsp_ucode_t){ }; enum { - GPU_CMD_SET_FLAG = 0x0, - GPU_CMD_SET_BYTE = 0x1, - GPU_CMD_SET_SHORT = 0x2, - GPU_CMD_SET_WORD = 0x3, - GPU_CMD_SET_LONG = 0x4, + GPU_CMD_SET_BYTE = 0x0, + GPU_CMD_SET_SHORT = 0x1, + GPU_CMD_SET_WORD = 0x2, + GPU_CMD_SET_LONG = 0x3, - GPU_CMD_DRAW_TRI = 0x5, - GPU_CMD_UPLOAD_VTX = 0x6, - - GPU_CMD_MATRIX_LOAD = 0x7, - GPU_CMD_PRE_INIT_PIPE = 0x8, + GPU_CMD_DRAW_TRI = 0x4, + GPU_CMD_UPLOAD_VTX = 0x5, + GPU_CMD_MATRIX_LOAD = 0x6, }; typedef struct { @@ -51,25 +48,12 @@ typedef struct { gpu_matrix_srv_t mvp_matrix; int16_t vp_scale[4]; int16_t vp_offset[4]; - uint32_t flags; uint16_t tex_size[2]; uint16_t tex_offset[2]; uint16_t tri_cmd; uint16_t tri_cull; } __attribute__((aligned(8), packed)) gpu_state; -__attribute__((always_inline)) -static inline void gpu_set_flag_raw(uint32_t offset, uint32_t flag, bool value) -{ - rspq_write(gpup_id, GPU_CMD_SET_FLAG, 
offset | value, value ? flag : ~flag); -} - -__attribute__((always_inline)) -static inline void gpu_set_flag(uint32_t flag, bool value) -{ - gpu_set_flag_raw(offsetof(gpu_state, flags), flag, value); -} - __attribute__((always_inline)) static inline void gpu_set_byte(uint32_t offset, uint8_t value) { @@ -111,18 +95,20 @@ static bool gpu_texturing; static void* gpu_pointer; static int gpu_stride; -#define GPU_ATTR_Z 0 -#define GPU_ATTR_TEX 1 -static void gpuSetFlag(int target, bool value) +#define GPU_ATTR_Z (1 << 8) +#define GPU_ATTR_TEX (1 << 9) +#define GPU_ATTR_SHADE (1 << 10) +#define GPU_ATTR_EDGE (1 << 11) +static bool gpu_attr_z, gpu_attr_tex; + +static void gpuUpdateFormat(void) { - switch (target) { - case GPU_ATTR_Z: - gpu_set_flag(FLAG_DEPTH_TEST, value); - break; - case GPU_ATTR_TEX: - gpu_set_flag(FLAG_TEXTURE_ACTIVE, value); - break; - } + uint16_t cmd = 0xC000 | GPU_ATTR_SHADE | GPU_ATTR_EDGE; + + if (gpu_attr_z) cmd |= GPU_ATTR_Z; + if (gpu_attr_tex) cmd |= GPU_ATTR_TEX; + + gpu_set_short(offsetof(gpu_state, tri_cmd), cmd); } static void gpuSetTexSize(uint16_t width, uint16_t height) @@ -200,7 +186,6 @@ static void upload_vertex(uint32_t index, uint8_t cache_index) static void gpuDrawArrays(uint32_t first, uint32_t count) { - rspq_write(gpup_id, GPU_CMD_PRE_INIT_PIPE); for (uint32_t i = 0; i < count; i++) { uint8_t cache_index = i % VERTEX_CACHE_SIZE; diff --git a/misc/n64/rsp_gpu.S b/misc/n64/rsp_gpu.S index f910bbec6..71060ac68 100644 --- a/misc/n64/rsp_gpu.S +++ b/misc/n64/rsp_gpu.S @@ -4,17 +4,14 @@ .data RSPQ_BeginOverlayHeader - RSPQ_DefineCommand GLCmd_SetFlag, 8 # 0x0 - RSPQ_DefineCommand GLCmd_SetByte, 8 # 0x1 - RSPQ_DefineCommand GLCmd_SetShort, 8 # 0x2 - RSPQ_DefineCommand GLCmd_SetWord, 8 # 0x3 - RSPQ_DefineCommand GLCmd_SetLong, 12 # 0x4 + RSPQ_DefineCommand GPUCmd_SetByte, 8 # 0x0 + RSPQ_DefineCommand GPUCmd_SetShort, 8 # 0x1 + RSPQ_DefineCommand GPUCmd_SetWord, 8 # 0x2 + RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x3 - RSPQ_DefineCommand 
GLCmd_DrawTriangle, 8 # 0x5 - RSPQ_DefineCommand GLCmd_UploadVertex, 24 # 0x6 - - RSPQ_DefineCommand GLCmd_MatrixLoad, 68 # 0x7 - RSPQ_DefineCommand GLCmd_PreInitPipe, 4 # 0x8 + RSPQ_DefineCommand GPUCmd_DrawTriangle, 8 # 0x4 + RSPQ_DefineCommand GPUCmd_UploadVertex, 24 # 0x5 + RSPQ_DefineCommand GPUCmd_MatrixLoad, 68 # 0x6 RSPQ_EndOverlayHeader .align 4 @@ -28,7 +25,6 @@ GL_STATE: GL_MATRIX_MVP: .ds.b MATRIX_SIZE GL_VIEWPORT_SCALE: .half 0,0,0,0 GL_VIEWPORT_OFFSET: .half 0,0,0,0 - GL_STATE_FLAGS: .word 0 GL_STATE_TEX_SIZE: .half 0,0 GL_STATE_TEX_OFFSET: .half 0,0 GL_TRI_CMD: .half 0 @@ -64,52 +60,26 @@ CLIP_CODE_FACTORS: .half 1, 1, GUARD_BAND_FACTOR, GUARD_BAND_FACTOR .text - ############################################################# - # GLCmd_SetFlag - # - # Sets or clears a flag - # - # ARGS: - # a0: Bit 31..24: Command id - # Bit 11..2: Offset of flag value in GL_STATE - # Bit 0: If 1, set the flag, otherwise clear it - # a1: flag mask (inverted if clearing) - ############################################################# - .func GLCmd_SetFlag -GLCmd_SetFlag: - li t0, ~0x3 - and t0, a0, t0 - andi t1, a0, 1 - lw t2, %lo(GL_STATE)(t0) - beqz t1, 1f - and t3, t2, a1 - or t3, t2, a1 - -1: - jr ra - sw t3, %lo(GL_STATE)(t0) - .endfunc - - .func GLCmd_SetByte -GLCmd_SetByte: + .func GPUCmd_SetByte +GPUCmd_SetByte: jr ra sb a1, %lo(GL_STATE)(a0) .endfunc - .func GLCmd_SetShort -GLCmd_SetShort: + .func GPUCmd_SetShort +GPUCmd_SetShort: jr ra sh a1, %lo(GL_STATE)(a0) .endfunc - .func GLCmd_SetWord -GLCmd_SetWord: + .func GPUCmd_SetWord +GPUCmd_SetWord: jr ra sw a1, %lo(GL_STATE) + 0(a0) .endfunc - .func GLCmd_SetLong -GLCmd_SetLong: + .func GPUCmd_SetLong +GPUCmd_SetLong: sw a2, %lo(GL_STATE) + 4(a0) jr ra sw a1, %lo(GL_STATE) + 0(a0) @@ -117,7 +87,7 @@ GLCmd_SetLong: ######################################## - # GLCmd_UploadVertex + # GPUCmd_UploadVertex # # Arguments: # * 0x00 (a0): offset within VERTEX_CACHE @@ -129,8 +99,8 @@ GLCmd_SetLong: # 
######################################## .align 3 - .func GLCmd_UploadVertex -GLCmd_UploadVertex: + .func GPUCmd_UploadVertex +GPUCmd_UploadVertex: #define vtx a0 #define mtx_ptr s0 #define cmd_ptr s4 @@ -444,7 +414,7 @@ GL_TnL: ################################################################ - # GLCmd_DrawTriangle + # GPUCmd_DrawTriangle # # Arguments: # a0: Bit 31..24: Command id @@ -453,8 +423,8 @@ GL_TnL: # Bit 11..0: Offset into vertex cache of vtx3 # ################################################################ - .func GLCmd_DrawTriangle -GLCmd_DrawTriangle: + .func GPUCmd_DrawTriangle +GPUCmd_DrawTriangle: #define vtx1 a1 #define vtx2 a2 #define vtx3 a3 @@ -539,7 +509,7 @@ gl_draw_triangle_end: .endfunc -GLCmd_MatrixLoad: +GPUCmd_MatrixLoad: #define src s6 #define dst s7 @@ -567,19 +537,5 @@ GLCmd_MatrixLoad: jr ra sqv vrhs23_f, 0x30,dst - .func GLCmd_PreInitPipe -GLCmd_PreInitPipe: - #define state_flags k1 - #define tri_cmd t4 - - lw tri_cmd, %lo(GL_STATE_FLAGS) - ori tri_cmd, 0xCC00 - jr ra - sh tri_cmd, %lo(GL_TRI_CMD) - - #undef tri_cmd - #undef state_flags - .endfunc - #include "rsp_gpu_clipping.inc" #include diff --git a/src/Graphics_N64.c b/src/Graphics_N64.c index 7c7c207ca..761db5fd1 100644 --- a/src/Graphics_N64.c +++ b/src/Graphics_N64.c @@ -266,7 +266,8 @@ void Gfx_SetDepthWrite(cc_bool enabled) { void Gfx_SetDepthTest(cc_bool enabled) { __rdpq_mode_change_som(SOM_Z_COMPARE, enabled ? 
SOM_Z_COMPARE : 0); - gpuSetFlag(GPU_ATTR_Z, enabled); + gpu_attr_z = enabled; + gpuUpdateFormat(); } static void Gfx_FreeState(void) { FreeDefaultResources(); } @@ -461,7 +462,8 @@ void Gfx_DepthOnlyRendering(cc_bool depthOnly) { //SetColorWrite(enabled & gfx_colorMask[0], enabled & gfx_colorMask[1], // enabled & gfx_colorMask[2], enabled & gfx_colorMask[3]); - gpuSetFlag(GPU_ATTR_TEX, enabled); + gpu_attr_tex = enabled; + gpuUpdateFormat(); } @@ -510,7 +512,8 @@ void Gfx_SetVertexFormat(VertexFormat fmt) { } gpu_texturing = fmt == VERTEX_FORMAT_TEXTURED; - gpuSetFlag(GPU_ATTR_TEX, gpu_texturing); + gpu_attr_tex = gpu_texturing; + gpuUpdateFormat(); } void Gfx_DrawVb_Lines(int verticesCount) { From 34474b32aa061dc08b74bff208c8d6e565fc9f25 Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Wed, 30 Apr 2025 20:54:35 +1000 Subject: [PATCH 08/14] Slightly simplify ST calculation --- misc/n64/gl_constants.h | 1 + misc/n64/rsp_gpu.S | 43 ++++++++++++++++++----------------------- 2 files changed, 20 insertions(+), 24 deletions(-) diff --git a/misc/n64/gl_constants.h b/misc/n64/gl_constants.h index 6ad83fdeb..09671a81e 100644 --- a/misc/n64/gl_constants.h +++ b/misc/n64/gl_constants.h @@ -43,4 +43,5 @@ #define PRIM_VTX_TRCODE 40 // trivial-reject clipping flags (against -w/+w) #define PRIM_VTX_SIZE 42 + #endif diff --git a/misc/n64/rsp_gpu.S b/misc/n64/rsp_gpu.S index 71060ac68..95e5cbf48 100644 --- a/misc/n64/rsp_gpu.S +++ b/misc/n64/rsp_gpu.S @@ -137,7 +137,7 @@ GPUCmd_UploadVertex: addi vtx, %lo(VERTEX_CACHE) sdv vpos, PRIM_VTX_X ,vtx sdv vcol, PRIM_VTX_R ,vtx - sdv vtex, PRIM_VTX_TEX_S ,vtx + slv vtex, PRIM_VTX_TEX_S ,vtx # == matrix multiply == li mtx_ptr, %lo(GL_MATRIX_MVP) @@ -343,41 +343,36 @@ GL_TnL: #define vtexsize $v06 #define vtexoffset $v07 - #define vstrq $v08 + #define vst $v08 - ldv vstrq, PRIM_VTX_TEX_S,vtx # S + T + R + Q + llv vst, PRIM_VTX_TEX_S,vtx # S + T suv vrgba, SCREEN_VTX_RGBA,vtx li s1, %lo(GL_STATE_TEX_SIZE) llv vtexsize.s, 0,s1 llv 
vtexoffset.s, 4,s1 - #define vinvq_i $v26 - #define vinvq_f $v27 - #define vstrq_i $v28 - #define vstrq_f $v29 - #define q e3 + #define vst_i $v28 + #define vst_f $v29 # Scale texcoord by texsize and subtract offset (to correct for bilinear sampling if active) - #vmudn v___, vstrq, vtexsize - # vmadh vstrq, vtexoffset, K1 + #vmudn v___, vst, vtexsize + # vmadh vst, vtexoffset, K1 - #vmudn v___, vstrq, vtexsize - #vmadh vstrq, vtexoffset, K1 - #vmudl vstrq, vstrq, vtexsize + #vmudn v___, vst, vtexsize + #vmadh vst, vtexoffset, K1 + #vmudl vst, vst, vtexsize - vmudh v___, vstrq, vtexsize - vsar vstrq_i, COP2_ACC_HI - vsar vstrq_f, COP2_ACC_MD + vmudh v___, vst, vtexsize + vsar vst_i, COP2_ACC_HI + vsar vst_f, COP2_ACC_MD - vmudl vstrq_f, vstrq_f, K8192 - vmadm vstrq_i, vstrq_i, K8192 - vmadn vstrq, vzero, vzero + vmudl vst_f, vst_f, K8192 + vmadm vst_i, vst_i, K8192 + vmadn vst, vzero, vzero - #undef vinvq_i - #undef vinvq_f - #undef vstrq_i - #undef vstrq_f + #undef vst_i + #undef vst_f #undef q lbu t0, PRIM_VTX_TRCODE(vtx) @@ -393,7 +388,7 @@ GL_TnL: sb t0, PRIM_VTX_TRCODE(vtx) jal GL_CalcScreenSpace - slv vstrq.s, SCREEN_VTX_S,vtx + slv vst.s, SCREEN_VTX_S,vtx j GL_CalcClipCodes move ra, ra2 From 06e09c30190000c4b0a0ae11a3f87d29e9406c6e Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Wed, 30 Apr 2025 21:10:29 +1000 Subject: [PATCH 09/14] Simplify RGBA calculation, saves 4 bytes per vertex upload command --- misc/n64/gl_constants.h | 9 +++------ misc/n64/gpu.c | 9 +++------ misc/n64/rsp_gpu.S | 25 +++++++++++-------------- 3 files changed, 17 insertions(+), 26 deletions(-) diff --git a/misc/n64/gl_constants.h b/misc/n64/gl_constants.h index 09671a81e..37b8ddb2b 100644 --- a/misc/n64/gl_constants.h +++ b/misc/n64/gl_constants.h @@ -32,14 +32,11 @@ #define PRIM_VTX_Y 18 // Object space position (16-bit) #define PRIM_VTX_Z 20 // Object space position (16-bit) #define PRIM_VTX_W 22 // Object space position (16-bit) -#define PRIM_VTX_R 24 -#define PRIM_VTX_G 26 
-#define PRIM_VTX_B 28 -#define PRIM_VTX_A 30 +#define PRIM_VTX_RGBA 24 +// 28,29,30,31 pad #define PRIM_VTX_TEX_S 32 #define PRIM_VTX_TEX_T 34 -#define PRIM_VTX_TEX_R 36 -#define PRIM_VTX_TEX_Q 38 +//36,37,38,39 pad #define PRIM_VTX_TRCODE 40 // trivial-reject clipping flags (against -w/+w) #define PRIM_VTX_SIZE 42 diff --git a/misc/n64/gpu.c b/misc/n64/gpu.c index b80e3b04f..f34769d6e 100644 --- a/misc/n64/gpu.c +++ b/misc/n64/gpu.c @@ -157,7 +157,7 @@ static inline void put_word(rspq_write_t* s, uint16_t v1, uint16_t v2) static void upload_vertex(uint32_t index, uint8_t cache_index) { - rspq_write_t s = rspq_write_begin(gpup_id, GPU_CMD_UPLOAD_VTX, 6); + rspq_write_t s = rspq_write_begin(gpup_id, GPU_CMD_UPLOAD_VTX, 5); rspq_write_arg(&s, cache_index * PRIM_VTX_SIZE); char* ptr = gpu_pointer + index * gpu_stride; @@ -167,11 +167,8 @@ static void upload_vertex(uint32_t index, uint8_t cache_index) put_word(&s, vtx[2] * (1< Date: Thu, 1 May 2025 06:34:07 +1000 Subject: [PATCH 10/14] Minorly optimise T&L --- misc/n64/gl_constants.h | 14 +---------- misc/n64/rsp_gpu.S | 44 ++++++++++++++--------------------- misc/n64/rsp_gpu_clipping.inc | 6 ++--- 3 files changed, 21 insertions(+), 43 deletions(-) diff --git a/misc/n64/gl_constants.h b/misc/n64/gl_constants.h index 37b8ddb2b..054ffaca7 100644 --- a/misc/n64/gl_constants.h +++ b/misc/n64/gl_constants.h @@ -24,19 +24,7 @@ #define BILINEAR_TEX_OFFSET_SHIFT 9 -#define TRICMD_ATTR_MASK 0x300 - -#define PRIM_VTX_CS_POSi 0 // X, Y, Z, W (all 32-bit) -#define PRIM_VTX_CS_POSf 8 // X, Y, Z, W (all 32-bit) -#define PRIM_VTX_X 16 // Object space position (16-bit) -#define PRIM_VTX_Y 18 // Object space position (16-bit) -#define PRIM_VTX_Z 20 // Object space position (16-bit) -#define PRIM_VTX_W 22 // Object space position (16-bit) -#define PRIM_VTX_RGBA 24 -// 28,29,30,31 pad -#define PRIM_VTX_TEX_S 32 -#define PRIM_VTX_TEX_T 34 -//36,37,38,39 pad +//0-39 same as screenvtx #define PRIM_VTX_TRCODE 40 // trivial-reject clipping 
flags (against -w/+w) #define PRIM_VTX_SIZE 42 diff --git a/misc/n64/rsp_gpu.S b/misc/n64/rsp_gpu.S index 0d0c339e3..f3ccc2730 100644 --- a/misc/n64/rsp_gpu.S +++ b/misc/n64/rsp_gpu.S @@ -52,8 +52,7 @@ CLIP_CODE_FACTORS: .half 1, 1, GUARD_BAND_FACTOR, GUARD_BAND_FACTOR #define SCREEN_VTX_CLIP_CODE 22 #define SCREEN_VTX_PADDING 23 #define SCREEN_VTX_RGBA 24 -#define SCREEN_VTX_S 28 -#define SCREEN_VTX_T 30 +#define SCREEN_VTX_S_T 28 // 28 S, 30 T #define SCREEN_VTX_W 32 // FIXME: this is duplicated in CS_POS #define SCREEN_VTX_INVW 36 // 32-bit #define SCREEN_VTX_SIZE 40 @@ -135,9 +134,8 @@ GPUCmd_UploadVertex: llv vtex, 12, cmd_ptr # Load U, V addi vtx, %lo(VERTEX_CACHE) - sdv vpos, PRIM_VTX_X ,vtx - slv vcol, PRIM_VTX_RGBA ,vtx - slv vtex, PRIM_VTX_TEX_S ,vtx + slv vcol, SCREEN_VTX_RGBA, vtx + slv vtex, SCREEN_VTX_S_T, vtx # == matrix multiply == li mtx_ptr, %lo(GL_MATRIX_MVP) @@ -165,8 +163,8 @@ GPUCmd_UploadVertex: vmudm vcspos_i, vcspos_i, vshift8.e4 vmadl vcspos_f, vcspos_f, vshift8.e4 - sdv vcspos_i, PRIM_VTX_CS_POSi,vtx - sdv vcspos_f, PRIM_VTX_CS_POSf,vtx + sdv vcspos_i, SCREEN_VTX_CS_POSi,vtx + sdv vcspos_f, SCREEN_VTX_CS_POSf,vtx # Calculate and store clipping flags against CS.W. # These will be used for trivial rejections. 
@@ -330,26 +328,23 @@ GL_CalcClipCodes: ################################################################ .func GL_TnL GL_TnL: - #define tmp_ptr s2 #define vtx s3 - #define s e0 #define v___ $v01 - #define vrgba $v04 + #define vcspos_f $v02 + #define vcspos_i $v03 #define vtexsize $v06 #define vtexoffset $v07 #define vst $v08 + #define vst_i $v28 + #define vst_f $v29 move ra2, ra - llv vrgba, PRIM_VTX_RGBA, vtx # RGBA - llv vst, PRIM_VTX_TEX_S,vtx # S + T + llv vst, SCREEN_VTX_S_T, vtx # S + T - li s1, %lo(GL_STATE_TEX_SIZE) - llv vtexsize.s, 0,s1 - llv vtexoffset.s, 4,s1 - - #define vst_i $v28 - #define vst_f $v29 + li t0, %lo(GL_STATE_TEX_SIZE) + llv vtexsize, 0,t0 + llv vtexoffset, 4,t0 # Scale texcoord by texsize and subtract offset (to correct for bilinear sampling if active) #vmudn v___, vst, vtexsize @@ -369,17 +364,12 @@ GL_TnL: #undef vst_i #undef vst_f - #undef q lbu t0, PRIM_VTX_TRCODE(vtx) - slv vrgba, SCREEN_VTX_RGBA,vtx - slv vst, SCREEN_VTX_S,vtx + slv vst, SCREEN_VTX_S_T, vtx - #define vcspos_f $v02 - #define vcspos_i $v03 - - ldv vcspos_f, PRIM_VTX_CS_POSf,vtx - ldv vcspos_i, PRIM_VTX_CS_POSi,vtx + ldv vcspos_f, SCREEN_VTX_CS_POSf,vtx + ldv vcspos_i, SCREEN_VTX_CS_POSi,vtx # Mark this vertex as having T&L applied ori t0, 0x80 @@ -426,7 +416,7 @@ GPUCmd_DrawTriangle: addi vtx3, a1, %lo(VERTEX_CACHE) srl vtx2, a1, 16 - addi vtx2, %lo(VERTEX_CACHE) + addi vtx2, %lo(VERTEX_CACHE) addi vtx1, a0, %lo(VERTEX_CACHE) # Trivial reject: if all the vertices are out of the same plane (at least one), diff --git a/misc/n64/rsp_gpu_clipping.inc b/misc/n64/rsp_gpu_clipping.inc index 90d753dad..1fa9eb60f 100644 --- a/misc/n64/rsp_gpu_clipping.inc +++ b/misc/n64/rsp_gpu_clipping.inc @@ -175,11 +175,11 @@ gl_clip_no_swap: # vattr0: r0 g0 b0 a0 s0 t0 luv vattr0.e0, SCREEN_VTX_RGBA ,p0 - llv vattr0.e4, SCREEN_VTX_S ,p0 + llv vattr0.e4, SCREEN_VTX_S_T ,p0 # vattr1: r1 g1 b1 a1 s1 t1 luv vattr1.e0, SCREEN_VTX_RGBA ,p1 - llv vattr1.e4, SCREEN_VTX_S ,p1 + llv vattr1.e4, 
SCREEN_VTX_S_T ,p1 # Find first free slot in clip cache @@ -275,7 +275,7 @@ gl_clip_no_swap: sdv vint_f.e0, SCREEN_VTX_CS_POSf,intersection suv vattr0.e0, SCREEN_VTX_RGBA ,intersection jal GL_CalcClipCodes - slv vattr0.e4, SCREEN_VTX_S ,intersection + slv vattr0.e4, SCREEN_VTX_S_T ,intersection # Add intersection to the output list add t0, out_list, out_count From 243af150a868a8f5eea66fdcf5354f880961343f Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Thu, 1 May 2025 22:13:58 +1000 Subject: [PATCH 11/14] WIP on setting RDP state via own RSP overlay --- misc/n64/gpu.c | 11 +++++++++++ misc/n64/rsp_gpu.S | 30 ++++++++++++++++++------------ src/Graphics_N64.c | 8 +++++--- 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/misc/n64/gpu.c b/misc/n64/gpu.c index f34769d6e..793878818 100644 --- a/misc/n64/gpu.c +++ b/misc/n64/gpu.c @@ -36,6 +36,8 @@ enum { GPU_CMD_DRAW_TRI = 0x4, GPU_CMD_UPLOAD_VTX = 0x5, GPU_CMD_MATRIX_LOAD = 0x6, + + GPU_CMD_PUSH_RDP = 0x7, }; typedef struct { @@ -88,6 +90,15 @@ static inline void gpu_draw_triangle(int i0, int i1, int i2) ); } +#define RDP_CMD_SYNC_PIPE 0xE7000000 +#define RDP_CMD_SET_BLEND_COLOR 0xF9000000 + +__attribute__((always_inline)) +static inline void gpu_push_rdp(uint32_t a1, uint64_t a2) +{ + rdpq_write(2, gpup_id, GPU_CMD_PUSH_RDP, 0, a1, a2); +} + static float gpu_vp_scale[3]; static float gpu_vp_offset[3]; diff --git a/misc/n64/rsp_gpu.S b/misc/n64/rsp_gpu.S index f3ccc2730..87033509d 100644 --- a/misc/n64/rsp_gpu.S +++ b/misc/n64/rsp_gpu.S @@ -12,6 +12,8 @@ RSPQ_DefineCommand GPUCmd_DrawTriangle, 8 # 0x4 RSPQ_DefineCommand GPUCmd_UploadVertex, 20 # 0x5 RSPQ_DefineCommand GPUCmd_MatrixLoad, 68 # 0x6 + + RSPQ_DefineCommand GPUCmd_PushRDP, 12 # 0x7 RSPQ_EndOverlayHeader .align 4 @@ -85,6 +87,16 @@ GPUCmd_SetLong: .endfunc + .func GPUCmd_PushRDP +GPUCmd_PushRDP: + # RDP command is expected in a0 and a1 + move a0, a1 + move a1, a2 + + jal_and_j RDPQ_Write8, RDPQ_Finalize + .endfunc + + 
######################################## # GPUCmd_UploadVertex # @@ -94,7 +106,6 @@ GPUCmd_SetLong: # * 0x08 (a2): object space Z, W (16-bit) # * 0x0C (a3): RGBA (8-bit each one) # * 0x10: S, T (16-bit) - # * 0x14: normal X, Y, Z (8-bit each one) (LSB must be 0) # ######################################## .align 3 @@ -102,7 +113,7 @@ GPUCmd_SetLong: GPUCmd_UploadVertex: #define vtx a0 #define mtx_ptr s0 - #define cmd_ptr s4 + #define src_ptr s4 #define v___ $v01 @@ -126,12 +137,11 @@ GPUCmd_UploadVertex: #define z e2 #define w e3 - addi cmd_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) + 4 - sub cmd_ptr, rspq_cmd_size + addi src_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 16 - ldv vpos, 0, cmd_ptr # Load X, Y, Z, W - llv vcol, 8, cmd_ptr # Load RGBA - llv vtex, 12, cmd_ptr # Load U, V + ldv vpos, 0, src_ptr # Load X, Y, Z, W + llv vcol, 8, src_ptr # Load RGBA + llv vtex, 12, src_ptr # Load U, V addi vtx, %lo(VERTEX_CACHE) slv vcol, SCREEN_VTX_RGBA, vtx @@ -180,12 +190,8 @@ GPUCmd_UploadVertex: jr ra sb t0, PRIM_VTX_TRCODE(vtx) - #undef cmd_ptr + #undef src_ptr #undef vtx - #undef in_xy - #undef in_zw - #undef in_rgba - #undef vtx_id #undef x #undef y diff --git a/src/Graphics_N64.c b/src/Graphics_N64.c index 761db5fd1..dea7d1f2f 100644 --- a/src/Graphics_N64.c +++ b/src/Graphics_N64.c @@ -27,10 +27,12 @@ void Gfx_Create(void) { __rdpq_mode_change_som(SOM_ZMODE_MASK, SOM_ZMODE_OPAQUE); rdpq_mode_dithering(DITHER_SQUARE_SQUARE); - // Set alpha compare threshold - rdpq_set_blend_color(RGBA32(0,0,0, 127)); - gpu_init(); + + // Set alpha compare threshold + gpu_push_rdp(RDP_CMD_SYNC_PIPE, 0); + gpu_push_rdp(RDP_CMD_SET_BLEND_COLOR, (0 << 24) | (0 << 16) | (0 << 8) | 127); + zbuffer = surface_alloc(FMT_RGBA16, display_get_width(), display_get_height()); Gfx.MaxTexWidth = 256; From b73d03b1998b1b6335277b258ea2355c1595567a Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Thu, 1 May 2025 22:54:00 +1000 Subject: [PATCH 12/14] Simplify Draw command --- 
misc/n64/gl_constants.h | 2 +- misc/n64/gpu.c | 21 +++------ misc/n64/rsp_gpu.S | 94 ++++++++++++++++++++++------------------- 3 files changed, 57 insertions(+), 60 deletions(-) diff --git a/misc/n64/gl_constants.h b/misc/n64/gl_constants.h index 054ffaca7..a2ca68e8f 100644 --- a/misc/n64/gl_constants.h +++ b/misc/n64/gl_constants.h @@ -1,7 +1,7 @@ #ifndef __GL_CONSTANTS #define __GL_CONSTANTS -#define VERTEX_CACHE_SIZE 16 +#define VERTEX_CACHE_SIZE 4 #define MATRIX_SIZE 64 diff --git a/misc/n64/gpu.c b/misc/n64/gpu.c index 793878818..b3d4399af 100644 --- a/misc/n64/gpu.c +++ b/misc/n64/gpu.c @@ -33,7 +33,7 @@ enum { GPU_CMD_SET_WORD = 0x2, GPU_CMD_SET_LONG = 0x3, - GPU_CMD_DRAW_TRI = 0x4, + GPU_CMD_DRAW_QUAD = 0x4, GPU_CMD_UPLOAD_VTX = 0x5, GPU_CMD_MATRIX_LOAD = 0x6, @@ -80,16 +80,6 @@ static inline void gpu_set_long(uint32_t offset, uint64_t value) rspq_write(gpup_id, GPU_CMD_SET_LONG, offset, value >> 32, value & 0xFFFFFFFF); } -static inline void gpu_draw_triangle(int i0, int i1, int i2) -{ - // We pass -1 because the triangle can be clipped and split into multiple - // triangles. - rdpq_write(-1, gpup_id, GPU_CMD_DRAW_TRI, - (i0*PRIM_VTX_SIZE), - ((i1*PRIM_VTX_SIZE)<<16) | (i2*PRIM_VTX_SIZE) - ); -} - #define RDP_CMD_SYNC_PIPE 0xE7000000 #define RDP_CMD_SET_BLEND_COLOR 0xF9000000 @@ -196,16 +186,15 @@ static void gpuDrawArrays(uint32_t first, uint32_t count) { for (uint32_t i = 0; i < count; i++) { - uint8_t cache_index = i % VERTEX_CACHE_SIZE; + uint8_t cache_index = i & 3; upload_vertex(first + i, cache_index); // Last vertex of quad? if ((i & 3) != 3) continue; - // Add two triangles - uint8_t idx = cache_index - 3; - gpu_draw_triangle(idx + 0, idx + 1, idx + 2); - gpu_draw_triangle(idx + 0, idx + 2, idx + 3); + // We pass -1 because the triangle can be clipped and split into multiple + // triangles. 
+ rdpq_write(-1, gpup_id, GPU_CMD_DRAW_QUAD); } } diff --git a/misc/n64/rsp_gpu.S b/misc/n64/rsp_gpu.S index 87033509d..9b3a57739 100644 --- a/misc/n64/rsp_gpu.S +++ b/misc/n64/rsp_gpu.S @@ -9,7 +9,7 @@ RSPQ_DefineCommand GPUCmd_SetWord, 8 # 0x2 RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x3 - RSPQ_DefineCommand GPUCmd_DrawTriangle, 8 # 0x4 + RSPQ_DefineCommand GPUCmd_DrawQuad, 4 # 0x4 RSPQ_DefineCommand GPUCmd_UploadVertex, 20 # 0x5 RSPQ_DefineCommand GPUCmd_MatrixLoad, 68 # 0x6 @@ -41,6 +41,7 @@ VERTEX_CACHE: .dcb.b PRIM_VTX_SIZE * VERTEX_CACHE_SIZE CACHE_OFFSETS: .half 2,4,6,8,10,12,14,16,18 CLIP_CODE_FACTORS: .half 1, 1, GUARD_BAND_FACTOR, GUARD_BAND_FACTOR +DRAW_TRI_RA: .word 0 #define CLIPPING_PLANE_COUNT 6 #define CLIPPING_CACHE_SIZE 9 @@ -97,6 +98,39 @@ GPUCmd_PushRDP: .endfunc + .func GPUCmd_MatrixLoad +GPUCmd_MatrixLoad: + #define src s6 + #define dst s7 + + #define vrhs01_i $v02 + #define vrhs01_f $v03 + #define vrhs23_i $v04 + #define vrhs23_f $v05 + + addi src, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64 + addi dst, zero, %lo(GL_MATRIX_MVP) + + # Load the matrix from command parameters (misaligned) + lqv vrhs01_i, 0x00,src + lrv vrhs01_i, 0x10,src + lqv vrhs23_i, 0x10,src + lrv vrhs23_i, 0x20,src + lqv vrhs01_f, 0x20,src + lrv vrhs01_f, 0x30,src + lqv vrhs23_f, 0x30,src + lrv vrhs23_f, 0x40,src + + sqv vrhs01_i, 0x00,dst + sqv vrhs23_i, 0x10,dst + sqv vrhs01_f, 0x20,dst + jr ra + sqv vrhs23_f, 0x30,dst + +#undef src +#undef dst + .endfunc + ######################################## # GPUCmd_UploadVertex # @@ -401,16 +435,6 @@ GL_TnL: .endfunc - ################################################################ - # GPUCmd_DrawTriangle - # - # Arguments: - # a0: Bit 31..24: Command id - # Bit 11..0: Offset into vertex cache of vtx1 - # a1: Bit 27..16: Offset into vertex cache of vtx2 - # Bit 11..0: Offset into vertex cache of vtx3 - # - ################################################################ .func GPUCmd_DrawTriangle GPUCmd_DrawTriangle: #define 
vtx1 a1 @@ -419,11 +443,7 @@ GPUCmd_DrawTriangle: #define trcode1 t6 #define trcode2 t7 #define trcode3 t8 - - addi vtx3, a1, %lo(VERTEX_CACHE) - srl vtx2, a1, 16 - addi vtx2, %lo(VERTEX_CACHE) - addi vtx1, a0, %lo(VERTEX_CACHE) + sw ra, %lo(DRAW_TRI_RA) # TODO find a register for this # Trivial reject: if all the vertices are out of the same plane (at least one), # the triangle is out of the viewport. @@ -487,43 +507,31 @@ gl_draw_single_triangle: addi s1, 2 gl_draw_triangle_end: - j RSPQ_Loop + lw ra, %lo(DRAW_TRI_RA) + jr ra nop #undef vtx1 #undef vtx2 #undef vtx3 - .endfunc -GPUCmd_MatrixLoad: - #define src s6 - #define dst s7 + .func GPUCmd_DrawQuad +GPUCmd_DrawQuad: + li a1, %lo(VERTEX_CACHE) + 0*PRIM_VTX_SIZE + li a2, %lo(VERTEX_CACHE) + 1*PRIM_VTX_SIZE + jal GPUCmd_DrawTriangle + li a3, %lo(VERTEX_CACHE) + 2*PRIM_VTX_SIZE - #define vrhs01_i $v02 - #define vrhs01_f $v03 - #define vrhs23_i $v04 - #define vrhs23_f $v05 + li a1, %lo(VERTEX_CACHE) + 0*PRIM_VTX_SIZE + li a2, %lo(VERTEX_CACHE) + 2*PRIM_VTX_SIZE + jal GPUCmd_DrawTriangle + li a3, %lo(VERTEX_CACHE) + 3*PRIM_VTX_SIZE - addi src, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64 - addi dst, zero, %lo(GL_MATRIX_MVP) - - # Load the matrix from command parameters (misaligned) - lqv vrhs01_i, 0x00,src - lrv vrhs01_i, 0x10,src - lqv vrhs23_i, 0x10,src - lrv vrhs23_i, 0x20,src - lqv vrhs01_f, 0x20,src - lrv vrhs01_f, 0x30,src - lqv vrhs23_f, 0x30,src - lrv vrhs23_f, 0x40,src - - sqv vrhs01_i, 0x00,dst - sqv vrhs23_i, 0x10,dst - sqv vrhs01_f, 0x20,dst - jr ra - sqv vrhs23_f, 0x30,dst + j RSPQ_Loop + nop + .endfunc #include "rsp_gpu_clipping.inc" #include From db9b359b8b2dd0cb47bfef7319d214c7c326774e Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Fri, 2 May 2025 07:16:29 +1000 Subject: [PATCH 13/14] Optimise vertex upload --- misc/n64/gpu.c | 41 +++++++++++++------------ misc/n64/rsp_gpu.S | 56 +++++++++++++++-------------------- misc/n64/rsp_gpu_clipping.inc | 6 ++++ 3 files changed, 50 insertions(+), 53 
deletions(-) diff --git a/misc/n64/gpu.c b/misc/n64/gpu.c index b3d4399af..92a7fde13 100644 --- a/misc/n64/gpu.c +++ b/misc/n64/gpu.c @@ -34,7 +34,7 @@ enum { GPU_CMD_SET_LONG = 0x3, GPU_CMD_DRAW_QUAD = 0x4, - GPU_CMD_UPLOAD_VTX = 0x5, + GPU_CMD_UPLOAD_QUAD = 0x5, GPU_CMD_MATRIX_LOAD = 0x6, GPU_CMD_PUSH_RDP = 0x7, @@ -156,41 +156,40 @@ static inline void put_word(rspq_write_t* s, uint16_t v1, uint16_t v2) rspq_write_arg(s, v2 | (v1 << 16)); } -static void upload_vertex(uint32_t index, uint8_t cache_index) +static void upload_vertex(rspq_write_t* s, uint32_t index) { - rspq_write_t s = rspq_write_begin(gpup_id, GPU_CMD_UPLOAD_VTX, 5); - rspq_write_arg(&s, cache_index * PRIM_VTX_SIZE); char* ptr = gpu_pointer + index * gpu_stride; float* vtx = (float*)(ptr + 0); - put_word(&s, vtx[0] * (1< Date: Fri, 2 May 2025 20:27:31 +1000 Subject: [PATCH 14/14] Combine upload/draw --- misc/n64/gl_constants.h | 32 ---------------------- misc/n64/gpu.c | 27 +++++++------------ misc/n64/rsp_gpu.S | 60 +++++++++++++++++++++-------------------- 3 files changed, 40 insertions(+), 79 deletions(-) delete mode 100644 misc/n64/gl_constants.h diff --git a/misc/n64/gl_constants.h b/misc/n64/gl_constants.h deleted file mode 100644 index a2ca68e8f..000000000 --- a/misc/n64/gl_constants.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef __GL_CONSTANTS -#define __GL_CONSTANTS - -#define VERTEX_CACHE_SIZE 4 - -#define MATRIX_SIZE 64 - -#define TEXTURE_BILINEAR_MASK 0x001 -#define TEXTURE_INTERPOLATE_MASK 0x002 -#define TEXTURE_MIPMAP_MASK 0x100 - -#define VTX_SHIFT 5 -#define TEX_SHIFT 8 - -#define GUARD_BAND_FACTOR 2 - -#define ASSERT_INVALID_VTX_ID 0x2001 - -#define TEX_COORD_SHIFT 6 -#define HALF_TEXEL 0x0010 - -#define TEX_BILINEAR_SHIFT 13 -#define TEX_BILINEAR_OFFSET_SHIFT 4 - -#define BILINEAR_TEX_OFFSET_SHIFT 9 - -//0-39 same as screenvtx -#define PRIM_VTX_TRCODE 40 // trivial-reject clipping flags (against -w/+w) -#define PRIM_VTX_SIZE 42 - - -#endif diff --git a/misc/n64/gpu.c b/misc/n64/gpu.c 
index 92a7fde13..b65fd63e1 100644 --- a/misc/n64/gpu.c +++ b/misc/n64/gpu.c @@ -4,9 +4,10 @@ #include "rdpq_mode.h" #include "rdpq_debug.h" #include "display.h" -#include "gl_constants.h" // This is a severely cutdown version of libdragon's OpenGL implementation +#define VTX_SHIFT 5 +#define TEX_SHIFT 8 static uint32_t gpup_id; //DEFINE_RSP_UCODE(rsp_gpu); @@ -34,20 +35,14 @@ enum { GPU_CMD_SET_LONG = 0x3, GPU_CMD_DRAW_QUAD = 0x4, - GPU_CMD_UPLOAD_QUAD = 0x5, - GPU_CMD_MATRIX_LOAD = 0x6, + GPU_CMD_MATRIX_LOAD = 0x5, - GPU_CMD_PUSH_RDP = 0x7, + GPU_CMD_PUSH_RDP = 0x6, }; typedef struct { - int16_t i[4][4]; - uint16_t f[4][4]; -} gpu_matrix_srv_t; -_Static_assert(sizeof(gpu_matrix_srv_t) == MATRIX_SIZE, "Matrix size does not match"); - -typedef struct { - gpu_matrix_srv_t mvp_matrix; + int16_t mvp_matrix_i[4][4]; + uint16_t mvp_matrix_f[4][4]; int16_t vp_scale[4]; int16_t vp_offset[4]; uint16_t tex_size[2]; @@ -146,7 +141,7 @@ static inline void gpu_matrix_write(rspq_write_t* w, const float* m) static void gpuLoadMatrix(const float* m) { rspq_write_t w = rspq_write_begin(gpup_id, GPU_CMD_MATRIX_LOAD, 17); - rspq_write_arg(&w, false); // no multiply + rspq_write_arg(&w, 0); // padding gpu_matrix_write(&w, m); rspq_write_end(&w); } @@ -183,17 +178,13 @@ static void gpuDrawArrays(uint32_t first, uint32_t count) { for (uint32_t i = 0; i < count; i += 4) { - rspq_write_t s = rspq_write_begin(gpup_id, GPU_CMD_UPLOAD_QUAD, 17); - rspq_write_arg(&s, 0); + rspq_write_t s = rspq_write_begin(gpup_id, GPU_CMD_DRAW_QUAD, 17); + rspq_write_arg(&s, 0); // padding for (uint32_t j = 0; j < 4; j++) { upload_vertex(&s, first + i + j); } rspq_write_end(&s); - - // We pass -1 because the triangle can be clipped and split into multiple - // triangles. 
- rdpq_write(-1, gpup_id, GPU_CMD_DRAW_QUAD); } } diff --git a/misc/n64/rsp_gpu.S b/misc/n64/rsp_gpu.S index 732dbbe0a..08f41b5d4 100644 --- a/misc/n64/rsp_gpu.S +++ b/misc/n64/rsp_gpu.S @@ -1,6 +1,8 @@ #include #include -#include "gl_constants.h" +#define MATRIX_SIZE 64 +#define GUARD_BAND_FACTOR 2 + .data RSPQ_BeginOverlayHeader @@ -9,11 +11,10 @@ RSPQ_DefineCommand GPUCmd_SetWord, 8 # 0x2 RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x3 - RSPQ_DefineCommand GPUCmd_DrawQuad, 4 # 0x4 - RSPQ_DefineCommand GPUCmd_UploadQuad, 68 # 0x5 - RSPQ_DefineCommand GPUCmd_MatrixLoad, 68 # 0x6 + RSPQ_DefineCommand GPUCmd_DrawQuad, 68 # 0x4 + RSPQ_DefineCommand GPUCmd_MatrixLoad, 68 # 0x5 - RSPQ_DefineCommand GPUCmd_PushRDP, 12 # 0x7 + RSPQ_DefineCommand GPUCmd_PushRDP, 12 # 0x6 RSPQ_EndOverlayHeader .align 4 @@ -32,9 +33,6 @@ GL_STATE: GL_TRI_CMD: .half 0 GL_TRI_CULL: .half 0 - .align 3 -VERTEX_CACHE: .dcb.b PRIM_VTX_SIZE * VERTEX_CACHE_SIZE - RSPQ_EndSavedState .align 4 @@ -54,6 +52,15 @@ DRAW_TRI_RA: .word 0 #define SCREEN_VTX_INVW 36 // 32-bit #define SCREEN_VTX_SIZE 40 + .bss + .align 3 +#define VERTEX_CACHE_SIZE 4 +//0-39 same as screenvtx +#define PRIM_VTX_TRCODE 40 // trivial-reject clipping flags (against -w/+w) +#define PRIM_VTX_SIZE 42 + +VERTEX_CACHE: .dcb.b PRIM_VTX_SIZE * VERTEX_CACHE_SIZE + .text .func GPUCmd_SetByte @@ -126,8 +133,8 @@ GPUCmd_MatrixLoad: .endfunc .align 3 - .func GPUCmd_UploadQuad -GPUCmd_UploadQuad: + .func GPUCmd_DrawQuad +GPUCmd_DrawQuad: #define vtx a0 #define mtx_ptr s0 #define src_ptr s4 @@ -214,8 +221,20 @@ upload_vertex: bnez vcount, upload_vertex addi vtx, PRIM_VTX_SIZE - jr ra - nop + + # now do the actual drawing + li a1, %lo(VERTEX_CACHE) + 0*PRIM_VTX_SIZE + li a2, %lo(VERTEX_CACHE) + 1*PRIM_VTX_SIZE + jal GPUCmd_DrawTriangle + li a3, %lo(VERTEX_CACHE) + 2*PRIM_VTX_SIZE + + li a1, %lo(VERTEX_CACHE) + 0*PRIM_VTX_SIZE + li a2, %lo(VERTEX_CACHE) + 2*PRIM_VTX_SIZE + jal GPUCmd_DrawTriangle + li a3, %lo(VERTEX_CACHE) + 3*PRIM_VTX_SIZE + + j 
RSPQ_Loop + nop #undef src_ptr #undef vtx @@ -508,22 +527,5 @@ gl_draw_triangle_end: #undef vtx3 .endfunc - - .func GPUCmd_DrawQuad -GPUCmd_DrawQuad: - li a1, %lo(VERTEX_CACHE) + 0*PRIM_VTX_SIZE - li a2, %lo(VERTEX_CACHE) + 1*PRIM_VTX_SIZE - jal GPUCmd_DrawTriangle - li a3, %lo(VERTEX_CACHE) + 2*PRIM_VTX_SIZE - - li a1, %lo(VERTEX_CACHE) + 0*PRIM_VTX_SIZE - li a2, %lo(VERTEX_CACHE) + 2*PRIM_VTX_SIZE - jal GPUCmd_DrawTriangle - li a3, %lo(VERTEX_CACHE) + 3*PRIM_VTX_SIZE - - j RSPQ_Loop - nop - .endfunc - #include "rsp_gpu_clipping.inc" #include