N64 optimised, stage 2

This commit is contained in:
UnknownShadow200 2025-04-27 12:58:17 +10:00
parent 3ef1f91d9d
commit eba646cebb
7 changed files with 1429 additions and 10 deletions

View File

@ -1,23 +1,28 @@
# Directory that receives the object files and the filesystem image built below
BUILD_DIR = build-n64
SOURCE_DIR = src
SOURCE_DIR = misc/n64
N64_ROM_TITLE = "ClassiCube"
N64_ROM_RTC = true
TARGET = ClassiCube-n64
# NOTE(review): presumably the directory packed into filesystem.dfs by n64.mk - confirm
N64_MKDFS_ROOT = "misc/n64"
N64_MKDFS_ROOT = "misc/n64/files"
# One object file per C source found directly in src/
CFILES := $(notdir $(wildcard src/*.c))
OFILES := $(CFILES:.c=.o)
# rsp_gpu.o is listed in addition to the C objects
# NOTE(review): presumably built from misc/n64/rsp_gpu.S by rules in n64.mk - confirm
OFILES := $(CFILES:.c=.o) rsp_gpu.o
OBJS := $(addprefix $(BUILD_DIR)/,$(OFILES))
# Downgrade these diagnostics from errors back to warnings
CFLAGS := -Wno-error=missing-braces -Wno-error=strict-aliasing -Wno-error=incompatible-pointer-types
default: $(TARGET).z64
# Compile rule for the C sources in src/
$(BUILD_DIR)/%.o: src/%.c
@mkdir -p $(dir $@)
@echo " [CC] $<"
$(CC) -c $(CFLAGS) -o $@ $<
include $(N64_INST)/include/n64.mk
# The ROM depends on the filesystem image, which in turn depends on default.zip
$(TARGET).z64: N64_ROM_TITLE = "ClassiCube"
$(TARGET).z64: $(BUILD_DIR)/filesystem.dfs
$(BUILD_DIR)/filesystem.dfs: misc/n64/default.zip
$(BUILD_DIR)/filesystem.dfs: misc/n64/files/default.zip
$(BUILD_DIR)/ClassiCube-n64.elf: $(OBJS)

49
misc/n64/gl_constants.h Normal file
View File

@ -0,0 +1,49 @@
// Constants shared between the CPU side (gpu.c) and the RSP microcode (rsp_gpu.S).
// The file is also included from assembly, so it may only contain preprocessor definitions.
#ifndef __GL_CONSTANTS
#define __GL_CONSTANTS
// Number of vertex slots reserved in the microcode's VERTEX_CACHE
#define VERTEX_CACHE_SIZE 16
// Size in bytes of one matrix in RSP format (checked against gl_matrix_srv_t in gpu.c)
#define MATRIX_SIZE 64
#define TEXTURE_BILINEAR_MASK 0x001
#define TEXTURE_INTERPOLATE_MASK 0x002
#define TEXTURE_MIPMAP_MASK 0x100
// Fixed point shifts applied by gpu.c when converting float positions / texture coordinates to 16-bit
#define VTX_SHIFT 5
#define TEX_SHIFT 8
// Bits of the 'flags' word in the RSP side GL state
#define FLAG_DEPTH_TEST (1 << 8)
#define FLAG_TEXTURE_ACTIVE (1 << 9)
// Factor applied to W when computing guard band clip codes and clip planes
#define GUARD_BAND_FACTOR 2
#define ASSERT_INVALID_VTX_ID 0x2001
#define TEX_COORD_SHIFT 6
#define HALF_TEXEL 0x0010
#define TEX_BILINEAR_SHIFT 13
#define TEX_BILINEAR_OFFSET_SHIFT 4
#define BILINEAR_TEX_OFFSET_SHIFT 9
#define TRICMD_ATTR_MASK 0x300
// Byte offsets of the fields of one vertex cache entry (PRIM_VTX_SIZE bytes each)
#define PRIM_VTX_CS_POSi 0 // Clip space X, Y, Z, W: integer parts (16-bit each)
#define PRIM_VTX_CS_POSf 8 // Clip space X, Y, Z, W: fractional parts (16-bit each)
#define PRIM_VTX_X 16 // Object space position (16-bit)
#define PRIM_VTX_Y 18 // Object space position (16-bit)
#define PRIM_VTX_Z 20 // Object space position (16-bit)
#define PRIM_VTX_W 22 // Object space position (16-bit)
#define PRIM_VTX_R 24 // Colour components (16-bit each)
#define PRIM_VTX_G 26
#define PRIM_VTX_B 28
#define PRIM_VTX_A 30
#define PRIM_VTX_TEX_S 32 // Texture coordinates (16-bit each)
#define PRIM_VTX_TEX_T 34
#define PRIM_VTX_TEX_R 36
#define PRIM_VTX_TEX_Q 38
#define PRIM_VTX_TRCODE 40 // trivial-reject clipping flags (against -w/+w)
#define PRIM_VTX_SIZE 42
#endif

398
misc/n64/gpu.c Normal file
View File

@ -0,0 +1,398 @@
#include "GL/gl.h"
#include "rspq.h"
#include "rdpq.h"
#include "rdpq_rect.h"
#include "rdpq_mode.h"
#include "rdpq_debug.h"
#include "display.h"
#include "rdp.h"
#include <string.h>
#include <math.h>
#include <malloc.h>
#include "gl_constants.h"
// This is a severely cutdown version of libdragon's OpenGL implementation
// Overlay id returned by rspq_overlay_register in gl_init; every command below is sent to this overlay
static uint32_t glp_id;
//DEFINE_RSP_UCODE(rsp_gpu);
// Start/end symbols of the microcode's text, data and meta sections
// NOTE(review): presumably emitted by the build when embedding build-n64/rsp_gpu.* - confirm
extern uint8_t _binary_build_n64_rsp_gpu_text_bin_start[];
extern uint8_t _binary_build_n64_rsp_gpu_data_bin_start[];
extern uint8_t _binary_build_n64_rsp_gpu_meta_bin_start[];
extern uint8_t _binary_build_n64_rsp_gpu_text_bin_end[0];
extern uint8_t _binary_build_n64_rsp_gpu_data_bin_end[0];
extern uint8_t _binary_build_n64_rsp_gpu_meta_bin_end[0];
// Hand written microcode descriptor, used in place of the commented out DEFINE_RSP_UCODE above
static rsp_ucode_t rsp_gpu = (rsp_ucode_t){
.code = _binary_build_n64_rsp_gpu_text_bin_start,
.code_end = _binary_build_n64_rsp_gpu_text_bin_end,
.data = _binary_build_n64_rsp_gpu_data_bin_start,
.data_end = _binary_build_n64_rsp_gpu_data_bin_end,
.meta = _binary_build_n64_rsp_gpu_meta_bin_start,
.meta_end = _binary_build_n64_rsp_gpu_meta_bin_end,
.name = "rsp_gpu"
};
// Command ids understood by the rsp_gpu overlay.
// The values must match the order of the RSPQ_DefineCommand entries in rsp_gpu.S
enum {
GPU_CMD_SET_FLAG = 0x0,
GPU_CMD_SET_BYTE = 0x1,
GPU_CMD_SET_SHORT = 0x2,
GPU_CMD_SET_WORD = 0x3,
GPU_CMD_SET_LONG = 0x4,
GPU_CMD_DRAW_TRI = 0x5,
GPU_CMD_UPLOAD_VTX = 0x6,
GPU_CMD_MATRIX_LOAD = 0x7,
GPU_CMD_PRE_INIT_PIPE = 0x8,
};
// Indices into state_arrays, one per supported client-side vertex attribute
enum {
ATTRIB_VERTEX,
ATTRIB_COLOR,
ATTRIB_TEXCOORD,
ATTRIB_COUNT // number of attributes, used to size state_arrays
};
// CPU side copy of the viewport transform, filled in by glViewport (x, y) and glDepthRange (z)
typedef struct {
GLfloat scale[3];
GLfloat offset[3];
} gl_viewport_t;
// 4x4 matrix in the layout stored by the microcode:
// 16 integer halves followed by 16 fractional halves (see gl_matrix_write)
typedef struct {
int16_t i[4][4];
uint16_t f[4][4];
} gl_matrix_srv_t;
_Static_assert(sizeof(gl_matrix_srv_t) == MATRIX_SIZE, "Matrix size does not match");
// Helper for writing a command made of 16-bit values:
// bytes are accumulated in 'word' and flushed to 'w' one 32-bit argument at a time
typedef struct {
rspq_write_t w;
union {
uint8_t bytes[4];
uint32_t word;
};
// Index in 'bytes' of the next byte to fill
uint32_t buffer_head;
} gl_cmd_stream_t;
// Description of one client-side vertex attribute array
typedef struct {
GLsizei stride; // distance in bytes between consecutive elements
const GLvoid *pointer; // address of the first element
bool enabled; // set through glEnableClientState / glDisableClientState
} gl_array_t;
// Mirror of the GL_STATE block in rsp_gpu.S.
// Only used with offsetof to compute the offsets sent with the GPU_CMD_SET_* commands,
// so the field order and sizes must stay in sync with the microcode
typedef struct {
gl_matrix_srv_t mvp_matrix;
int16_t viewport_scale[4];
int16_t viewport_offset[4];
uint32_t flags;
uint16_t tex_size[2];
uint16_t tex_offset[2];
uint16_t tri_cmd;
uint16_t tri_cull;
} __attribute__((aligned(8), packed)) gl_server_state_t;
// Returns the address of element 'index' of a client-side attribute array,
// i.e. 'index' strides past the array's base pointer
static inline const void *gl_get_attrib_element(const gl_array_t *src, uint32_t index)
{
    // Arithmetic on void pointers is a GNU extension, so step through the array as bytes
    const uint8_t *base = (const uint8_t *)src->pointer;
    return base + index * src->stride;
}
static inline gl_cmd_stream_t gl_cmd_stream_begin(uint32_t ovl_id, uint32_t cmd_id, int size)
{
return (gl_cmd_stream_t) {
.w = rspq_write_begin(ovl_id, cmd_id, size),
.buffer_head = 2,
};
}
// Writes the accumulated word as the next command argument and empties the accumulator
static inline void gl_cmd_stream_commit(gl_cmd_stream_t *s)
{
    const uint32_t pending = s->word;

    s->word        = 0;
    s->buffer_head = 0;
    rspq_write_arg(&s->w, pending);
}
// Appends a 16-bit value (most significant byte first) to the stream,
// flushing the accumulator once it holds a full 32-bit word
static inline void gl_cmd_stream_put_half(gl_cmd_stream_t *s, uint16_t v)
{
    uint8_t *dst = &s->bytes[s->buffer_head];

    dst[0] = v >> 8;
    dst[1] = v & 0xFF;
    s->buffer_head += 2;

    if (s->buffer_head != sizeof(uint32_t)) return;
    gl_cmd_stream_commit(s);
}
// Flushes any partially filled word and finishes the command
static inline void gl_cmd_stream_end(gl_cmd_stream_t *s)
{
    const bool has_pending = s->buffer_head != 0;

    if (has_pending) gl_cmd_stream_commit(s);
    rspq_write_end(&s->w);
}
// Sets or clears the bits of 'flag' in the 32-bit RSP state word at 'offset'.
// Bit 0 of the first argument selects set (1) or clear (0);
// when clearing, the mask is sent inverted so the microcode can simply AND it in
__attribute__((always_inline))
static inline void gl_set_flag_raw(uint32_t offset, uint32_t flag, bool value)
{
    if (value) {
        rspq_write(glp_id, GPU_CMD_SET_FLAG, offset | 1, flag);
    } else {
        rspq_write(glp_id, GPU_CMD_SET_FLAG, offset, ~flag);
    }
}
// Sets or clears bits in the 'flags' word of the RSP state
__attribute__((always_inline))
static inline void gl_set_flag(uint32_t flag, bool value)
{
    const uint32_t flags_offset = offsetof(gl_server_state_t, flags);
    gl_set_flag_raw(flags_offset, flag, value);
}
// Writes an 8-bit value into the RSP state at the given byte offset
__attribute__((always_inline))
static inline void gl_set_byte(uint32_t offset, uint8_t value)
{
rspq_write(glp_id, GPU_CMD_SET_BYTE, offset, value);
}
// Writes a 16-bit value into the RSP state at the given byte offset
__attribute__((always_inline))
static inline void gl_set_short(uint32_t offset, uint16_t value)
{
rspq_write(glp_id, GPU_CMD_SET_SHORT, offset, value);
}
// Writes a 32-bit value into the RSP state at the given byte offset
__attribute__((always_inline))
static inline void gl_set_word(uint32_t offset, uint32_t value)
{
rspq_write(glp_id, GPU_CMD_SET_WORD, offset, value);
}
// Writes a 64-bit value into the RSP state at the given byte offset,
// sent as two 32-bit arguments (upper half first)
__attribute__((always_inline))
static inline void gl_set_long(uint32_t offset, uint64_t value)
{
    const uint32_t upper = (uint32_t)(value >> 32);
    const uint32_t lower = (uint32_t)(value & 0xFFFFFFFF);

    rspq_write(glp_id, GPU_CMD_SET_LONG, offset, upper, lower);
}
// Queues a triangle made of three vertex cache slots.
// Slot indices are converted to byte offsets into the vertex cache before being sent
static inline void glpipe_draw_triangle(int i0, int i1, int i2)
{
    const int off0 = i0 * PRIM_VTX_SIZE;
    const int off1 = i1 * PRIM_VTX_SIZE;
    const int off2 = i2 * PRIM_VTX_SIZE;

    // We pass -1 because the triangle can be clipped and split into multiple
    // triangles.
    rdpq_write(-1, glp_id, GPU_CMD_DRAW_TRI,
        off0,
        (off1 << 16) | off2
    );
}
// Last viewport transform set through glViewport / glDepthRange
static gl_viewport_t state_viewport;
// Client-side attribute arrays, indexed by ATTRIB_*
static gl_array_t state_arrays[ATTRIB_COUNT];
// Registers the rsp_gpu overlay and uploads the default depth range
void gl_init()
{
// Must come first: glDepthRange below already sends commands to glp_id
glp_id = rspq_overlay_register(&rsp_gpu);
glDepthRange(0, 1);
}
// Waits for the RSP queue and then unregisters the overlay
void gl_close()
{
rspq_wait();
rspq_overlay_unregister(glp_id);
}
// Maps a GL capability onto the matching RSP state flag and sets/clears it.
// Capabilities other than GL_DEPTH_TEST and GL_TEXTURE_2D are silently ignored
void gl_set_flag2(GLenum target, bool value)
{
    if (target == GL_DEPTH_TEST) {
        gl_set_flag(FLAG_DEPTH_TEST, value);
    } else if (target == GL_TEXTURE_2D) {
        gl_set_flag(FLAG_TEXTURE_ACTIVE, value);
    }
}
// Enables a capability (only those handled by gl_set_flag2 have any effect)
void glEnable(GLenum target)
{
gl_set_flag2(target, true);
}
// Disables a capability (only those handled by gl_set_flag2 have any effect)
void glDisable(GLenum target)
{
gl_set_flag2(target, false);
}
// Uploads the size of the current texture to the RSP state.
// Width and height are packed into a single word: width in the upper half,
// height in the lower half, matching the two consecutive tex_size entries
void glTexSizeN64(uint16_t width, uint16_t height)
{
    // Widen before shifting: 'width' would otherwise be promoted to a signed int,
    // and shifting a value >= 0x8000 left by 16 overflows it
    uint32_t packed = ((uint32_t)width << 16) | height;
    gl_set_word(offsetof(gl_server_state_t, tex_size[0]), packed);
}
// Writes 'count' 16-bit values as command arguments, two values per 32-bit word
// (the earlier value goes in the upper half). 'count' is expected to be even
static inline void write_shorts(rspq_write_t *w, const uint16_t *s, uint32_t count)
{
    uint32_t pos = 0;

    while (pos < count) {
        const uint32_t upper = s[pos];
        const uint32_t lower = s[pos + 1];

        rspq_write_arg(w, (upper << 16) | lower);
        pos += 2;
    }
}
// Converts a 4x4 float matrix to 16.16 fixed point and writes it as command arguments:
// first the 16 integer halves, then the 16 fractional halves
static inline void gl_matrix_write(rspq_write_t *w, const GLfloat *m)
{
    uint16_t int_part[16];
    uint16_t frac_part[16];

    for (uint32_t idx = 0; idx < 16; idx++) {
        const int32_t fixed = m[idx] * (1<<16);
        const uint32_t bits = (uint32_t)fixed;

        int_part[idx]  = (uint16_t)(bits >> 16);
        frac_part[idx] = (uint16_t)(bits & 0x0000FFFF);
    }

    write_shorts(w, int_part, 16);
    write_shorts(w, frac_part, 16);
}
// Replaces the matrix used by the microcode to transform vertices.
// The command is 17 words: one header word followed by the 16 words of the matrix
void glLoadMatrixf(const GLfloat *m)
{
    rspq_write_t cmd = rspq_write_begin(glp_id, GPU_CMD_MATRIX_LOAD, 17);

    rspq_write_arg(&cmd, false); // no multiply
    gl_matrix_write(&cmd, m);

    rspq_write_end(&cmd);
}
// Sends one vertex to the given slot of the RSP vertex cache.
//
// arrays:      client-side attribute arrays (indexed by ATTRIB_*)
// index:       index of the vertex within those arrays
// cache_index: destination slot in the vertex cache
//
// The command is 6 words: the cache offset, then position (X, Y, Z, W),
// colour (R, G, B, A) and texture coordinates (S, T), all as 16-bit values
static void upload_vertex(const gl_array_t *arrays, uint32_t index, uint8_t cache_index)
{
    gl_cmd_stream_t s = gl_cmd_stream_begin(glp_id, GPU_CMD_UPLOAD_VTX, 6);
    gl_cmd_stream_put_half(&s, cache_index * PRIM_VTX_SIZE);

    // Coordinates can be negative, and converting a negative float directly to an
    // unsigned type is undefined behaviour - so convert to a signed integer first,
    // then let the well defined integer conversion produce the 16-bit pattern
    const float* vtx = gl_get_attrib_element(&arrays[ATTRIB_VERTEX], index);
    gl_cmd_stream_put_half(&s, (uint16_t)(int32_t)(vtx[0] * (1<<VTX_SHIFT)));
    gl_cmd_stream_put_half(&s, (uint16_t)(int32_t)(vtx[1] * (1<<VTX_SHIFT)));
    gl_cmd_stream_put_half(&s, (uint16_t)(int32_t)(vtx[2] * (1<<VTX_SHIFT)));
    gl_cmd_stream_put_half(&s, (uint16_t)(int32_t)(1.0f   * (1<<VTX_SHIFT)));

    // 8-bit colour components are widened to 16-bit values
    const uint8_t* col = gl_get_attrib_element(&arrays[ATTRIB_COLOR], index);
    gl_cmd_stream_put_half(&s, col[0] << 7); // TODO put_byte ?
    gl_cmd_stream_put_half(&s, col[1] << 7); // TODO put_byte ?
    gl_cmd_stream_put_half(&s, col[2] << 7); // TODO put_byte ?
    gl_cmd_stream_put_half(&s, col[3] << 7); // TODO put_byte ?

    if (arrays[ATTRIB_TEXCOORD].enabled) {
        const float* tex = gl_get_attrib_element(&arrays[ATTRIB_TEXCOORD], index);
        gl_cmd_stream_put_half(&s, (uint16_t)(int32_t)(tex[0] * (1<<TEX_SHIFT)));
        gl_cmd_stream_put_half(&s, (uint16_t)(int32_t)(tex[1] * (1<<TEX_SHIFT)));
    } else {
        gl_cmd_stream_put_half(&s, 0);
        gl_cmd_stream_put_half(&s, 0);
    }

    gl_cmd_stream_end(&s);
}
// Draws 'count' vertices starting at 'first' as a list of quads.
// Vertices are uploaded into consecutive vertex cache slots (wrapping around),
// and every fourth vertex completes a quad that is drawn as two triangles.
// Trailing vertices that do not complete a quad are uploaded but not drawn
static void gl_rsp_draw_arrays(uint32_t first, uint32_t count)
{
    uint32_t src_index = first;

    for (uint32_t n = 0; n < count; n++, src_index++) {
        const uint8_t slot = n % VERTEX_CACHE_SIZE;
        upload_vertex(state_arrays, src_index, slot);

        // Only the last vertex of a quad triggers drawing
        if ((n & 3) == 3) {
            const uint8_t base = slot - 3;
            glpipe_draw_triangle(base + 0, base + 1, base + 2);
            glpipe_draw_triangle(base + 0, base + 2, base + 3);
        }
    }
}
// Translates a GL client array enum into its ATTRIB_* index.
// Returns -1 for arrays that are not supported
int gl_array_type_from_enum(GLenum array)
{
    if (array == GL_VERTEX_ARRAY)        return ATTRIB_VERTEX;
    if (array == GL_TEXTURE_COORD_ARRAY) return ATTRIB_TEXCOORD;
    if (array == GL_COLOR_ARRAY)         return ATTRIB_COLOR;

    return -1;
}
// Sets the source of vertex positions.
// Only 'stride' and 'pointer' are recorded; 'size' and 'type' are ignored
// (upload_vertex always reads three floats per position)
void glVertexPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *pointer)
{
    state_arrays[ATTRIB_VERTEX].stride  = stride;
    state_arrays[ATTRIB_VERTEX].pointer = pointer;
}

// Sets the source of texture coordinates.
// Only 'stride' and 'pointer' are recorded; 'size' and 'type' are ignored
// (upload_vertex always reads two floats per texture coordinate)
void glTexCoordPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *pointer)
{
    state_arrays[ATTRIB_TEXCOORD].stride  = stride;
    state_arrays[ATTRIB_TEXCOORD].pointer = pointer;
}

// Sets the source of vertex colours.
// Only 'stride' and 'pointer' are recorded; 'size' and 'type' are ignored
// (upload_vertex always reads four bytes per colour)
void glColorPointer(GLint size, GLenum type, GLsizei stride, const GLvoid *pointer)
{
    state_arrays[ATTRIB_COLOR].stride  = stride;
    state_arrays[ATTRIB_COLOR].pointer = pointer;
}
// Enables or disables a client-side attribute array.
//
// array_type: ATTRIB_* index, or -1 as returned by gl_array_type_from_enum
//             for unsupported arrays. Out of range indices are ignored
//             instead of writing outside of state_arrays
void gl_set_array_enabled(int array_type, bool enabled)
{
    const int array_count = (int)(sizeof(state_arrays) / sizeof(state_arrays[0]));

    if (array_type < 0 || array_type >= array_count) return;
    state_arrays[array_type].enabled = enabled;
}
// Enables the client-side array identified by the given GL enum
void glEnableClientState(GLenum array)
{
gl_set_array_enabled(gl_array_type_from_enum(array), true);
}
// Disables the client-side array identified by the given GL enum
void glDisableClientState(GLenum array)
{
gl_set_array_enabled(gl_array_type_from_enum(array), false);
}
// Draws 'count' vertices from the client-side arrays, starting at 'first'.
// 'mode' is not inspected: vertices are always treated as a list of quads
void glDrawArrays(GLenum mode, GLint first, GLsizei count)
{
    const uint32_t start = first;
    const uint32_t total = count;

    // Lets the microcode prepare its triangle command from the current state flags
    rspq_write(glp_id, GPU_CMD_PRE_INIT_PIPE);
    gl_rsp_draw_arrays(start, total);
}
// Sets the depth part of the viewport transform and uploads it to the RSP state.
//
// n, f: near and far values of the depth range
//
// NOTE(review): the Z values uploaded here use a factor of 4, while glViewport
// uploads the same fields scaled by SCREEN_Z_SCALE - confirm which is intended
void glDepthRange(GLclampd n, GLclampd f)
{
    state_viewport.scale[2]  = (f - n) * 0.5f;
    state_viewport.offset[2] = n + (f - n) * 0.5f;

    // Convert through a signed integer: the values are negative for a reversed
    // range, and converting a negative float directly to an unsigned type is
    // undefined behaviour
    uint16_t scale_z  = (uint16_t)(int32_t)(state_viewport.scale[2]  * 4);
    uint16_t offset_z = (uint16_t)(int32_t)(state_viewport.offset[2] * 4);

    // Z is the third 16-bit entry of viewport_scale / viewport_offset
    gl_set_short(
        offsetof(gl_server_state_t, viewport_scale) + sizeof(int16_t) * 2,
        scale_z);
    gl_set_short(
        offsetof(gl_server_state_t, viewport_offset) + sizeof(int16_t) * 2,
        offset_z);
}
// Sets the viewport transform and uploads scale and offset (X, Y and Z) to the RSP state.
//
// x, y: origin of the viewport in pixels
// w, h: size of the viewport in pixels
//
// The Z values come from the last glDepthRange call
void glViewport(GLint x, GLint y, GLsizei w, GLsizei h)
{
    state_viewport.scale[0]  = w * 0.5f;
    state_viewport.scale[1]  = h * -0.5f;
    state_viewport.offset[0] = x + w * 0.5f;
    state_viewport.offset[1] = y + h * 0.5f;

    // Screen coordinates are s13.2
    #define SCREEN_XY_SCALE 4.0f
    #define SCREEN_Z_SCALE 32767.0f

    // The Y scale is negative, and converting a negative float directly to an
    // unsigned type is undefined behaviour - so every value is converted to a
    // signed integer first, which then converts to its 16-bit pattern

    // * 2.0f to compensate for RSP reciprocal missing 1 bit
    uint16_t scale_x = (uint16_t)(int32_t)(state_viewport.scale[0] * SCREEN_XY_SCALE * 2.0f);
    uint16_t scale_y = (uint16_t)(int32_t)(state_viewport.scale[1] * SCREEN_XY_SCALE * 2.0f);
    uint16_t scale_z = (uint16_t)(int32_t)(state_viewport.scale[2] * SCREEN_Z_SCALE * 2.0f);

    uint16_t offset_x = (uint16_t)(int32_t)(state_viewport.offset[0] * SCREEN_XY_SCALE);
    uint16_t offset_y = (uint16_t)(int32_t)(state_viewport.offset[1] * SCREEN_XY_SCALE);
    uint16_t offset_z = (uint16_t)(int32_t)(state_viewport.offset[2] * SCREEN_Z_SCALE);

    // Each group of four 16-bit entries is uploaded as one 64-bit value (fourth entry is zero)
    gl_set_long(
        offsetof(gl_server_state_t, viewport_scale),
        ((uint64_t)scale_x << 48) | ((uint64_t)scale_y << 32) | ((uint64_t)scale_z << 16));
    gl_set_long(
        offsetof(gl_server_state_t, viewport_offset),
        ((uint64_t)offset_x << 48) | ((uint64_t)offset_y << 32) | ((uint64_t)offset_z << 16));
}
// Selects the culling behaviour used by the microcode.
// Any non-zero mode culls backfaces, zero disables culling
void glCullFace(GLenum mode)
{
    // 1 = cull backfaces
    // 2 = don't cull
    uint16_t cull;

    if (mode) {
        cull = 1;
    } else {
        cull = 2;
    }
    gl_set_short(offsetof(gl_server_state_t, tri_cull), cull);
}

585
misc/n64/rsp_gpu.S Normal file
View File

@ -0,0 +1,585 @@
#include <rsp_queue.inc>
#include <rdpq_macros.h>
#include "gl_constants.h"
.data
# Command table of this overlay. The position of each entry is its command id
# (matching the GPU_CMD_* enum in gpu.c), the number is the command size in bytes.
RSPQ_BeginOverlayHeader
RSPQ_DefineCommand GLCmd_SetFlag, 8 # 0x0
RSPQ_DefineCommand GLCmd_SetByte, 8 # 0x1
RSPQ_DefineCommand GLCmd_SetShort, 8 # 0x2
RSPQ_DefineCommand GLCmd_SetWord, 8 # 0x3
RSPQ_DefineCommand GLCmd_SetLong, 12 # 0x4
RSPQ_DefineCommand GLCmd_DrawTriangle, 8 # 0x5
RSPQ_DefineCommand GLCmd_UploadVertex, 24 # 0x6
RSPQ_DefineCommand GLCmd_MatrixLoad, 68 # 0x7
RSPQ_DefineCommand GLCmd_PreInitPipe, 4 # 0x8
RSPQ_EndOverlayHeader
.align 4
BANNER0: .ascii " RSP OpenGL T&L "
BANNER1: .ascii "Rasky & Snacchus"
RSPQ_BeginSavedState
GL_STATE:
# This is the GL state that is also used by the pipeline.
# The CPU addresses these fields by offset (see gl_server_state_t in gpu.c),
# so their order and sizes must match that struct.
GL_MATRIX_MVP: .ds.b MATRIX_SIZE
GL_VIEWPORT_SCALE: .half 0,0,0,0
GL_VIEWPORT_OFFSET: .half 0,0,0,0
GL_STATE_FLAGS: .word 0
GL_STATE_TEX_SIZE: .half 0,0
GL_STATE_TEX_OFFSET: .half 0,0
GL_TRI_CMD: .half 0
GL_TRI_CULL: .half 0
.align 3
VERTEX_CACHE: .dcb.b PRIM_VTX_SIZE * VERTEX_CACHE_SIZE
RSPQ_EndSavedState
.align 4
# Per-slot values used by GL_ClipTriangle to find a free clip cache slot
CACHE_OFFSETS: .half 2,4,6,8,10,12,14,16,18
# Per-component factors (X, Y, Z, W) applied in GL_CalcClipCodes before comparing against W
CLIP_CODE_FACTORS: .half 1, 1, GUARD_BAND_FACTOR, GUARD_BAND_FACTOR
#define CLIPPING_PLANE_COUNT 6
#define CLIPPING_CACHE_SIZE 9
#define CLIPPING_PLANE_SIZE 8
# Byte offsets of the fields of a transformed vertex. Transformed vertices live
# in the same memory as the PRIM_VTX_* layout (GL_TnL converts an entry in place).
#define SCREEN_VTX_CS_POSi 0 // X, Y, Z, W (all 32-bit)
#define SCREEN_VTX_CS_POSf 8 // X, Y, Z, W (all 32-bit)
#define SCREEN_VTX_X 16
#define SCREEN_VTX_Y 18
#define SCREEN_VTX_Z 20
#define SCREEN_VTX_CLIP_CODE 22
#define SCREEN_VTX_PADDING 23
#define SCREEN_VTX_RGBA 24
#define SCREEN_VTX_S 28
#define SCREEN_VTX_T 30
#define SCREEN_VTX_W 32 // FIXME: this is duplicated in CS_POS
#define SCREEN_VTX_INVW 36 // 32-bit
#define SCREEN_VTX_SIZE 40
.text
#############################################################
# GLCmd_SetFlag
#
# Sets or clears a flag
#
# ARGS:
# a0: Bit 31..24: Command id
# Bit 11..2: Offset of flag value in GL_STATE
# Bit 0: If 1, set the flag, otherwise clear it
# a1: flag mask (inverted if clearing)
#############################################################
.func GLCmd_SetFlag
GLCmd_SetFlag:
# t0 = a0 with the two lowest bits cleared (word aligned offset)
li t0, ~0x3
and t0, a0, t0
# t1 = set/clear selector
andi t1, a0, 1
lw t2, %lo(GL_STATE)(t0)
beqz t1, 1f
# Delay slot, always executed: result for the clear case (mask is already inverted)
and t3, t2, a1
# Only reached when setting: overrides the result with the OR of the mask
or t3, t2, a1
1:
jr ra
sw t3, %lo(GL_STATE)(t0)
.endfunc
# GLCmd_SetByte / SetShort / SetWord / SetLong
# Store a value of the given width at offset a0 within GL_STATE.
# In each case the store sits in the delay slot of the return jump.
# NOTE(review): a0 still carries the command id in its upper bits; presumably
# harmless because DMEM addresses wrap around - confirm
.func GLCmd_SetByte
GLCmd_SetByte:
jr ra
sb a1, %lo(GL_STATE)(a0)
.endfunc
.func GLCmd_SetShort
GLCmd_SetShort:
jr ra
sh a1, %lo(GL_STATE)(a0)
.endfunc
.func GLCmd_SetWord
GLCmd_SetWord:
jr ra
sw a1, %lo(GL_STATE) + 0(a0)
.endfunc
.func GLCmd_SetLong
GLCmd_SetLong:
# a1 = upper word, a2 = lower word
sw a2, %lo(GL_STATE) + 4(a0)
jr ra
sw a1, %lo(GL_STATE) + 0(a0)
.endfunc
########################################
# GLCmd_UploadVertex
#
# Stores a vertex in VERTEX_CACHE, transforms its position by the MVP matrix
# and computes its trivial-reject code.
#
# Arguments (24 byte command, as packed by upload_vertex in gpu.c):
# * 0x00 (a0): offset within VERTEX_CACHE
# * 0x04: object space X, Y, Z, W (16-bit each)
# * 0x0C: R, G, B, A (16-bit each)
# * 0x14: S, T (16-bit each)
#
########################################
.align 3
.func GLCmd_UploadVertex
GLCmd_UploadVertex:
#define vtx a0
#define mtx_ptr s0
#define cmd_ptr s4
#define v___ $v01
#define vmtx0_i $v16 // m00 m01 m02 m03
#define vmtx0_f $v17
#define vmtx1_i $v18 // m10 m11 m12 m13
#define vmtx1_f $v19
#define vmtx2_i $v20 // m20 m21 m22 m23
#define vmtx2_f $v21
#define vmtx3_i $v22 // m30 m31 m32 m33
#define vmtx3_f $v23
#define vpos $v24
#define vcol $v25
#define vtex $v26
#define vcspos_i $v28
#define vcspos_f $v29
#define x e0
#define y e1
#define z e2
#define w e3
# cmd_ptr = address of the second word of this command
# NOTE(review): assumes rspq_dmem_buf_ptr points just past the current command - confirm
addi cmd_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) + 4
sub cmd_ptr, rspq_cmd_size
ldv vpos, 0, cmd_ptr # Load X, Y, Z, W
ldv vcol, 8, cmd_ptr # Load R, G, B, A
llv vtex, 16, cmd_ptr # Load U, V
# Store the untransformed attributes in the vertex cache entry
addi vtx, %lo(VERTEX_CACHE)
sdv vpos, PRIM_VTX_X ,vtx
sdv vcol, PRIM_VTX_R ,vtx
sdv vtex, PRIM_VTX_TEX_S ,vtx
# == matrix multiply ==
# Integer parts are in the first 32 bytes of the matrix, fractional parts in the last 32
li mtx_ptr, %lo(GL_MATRIX_MVP)
ldv vmtx0_i.e0, 0x00,mtx_ptr
ldv vmtx1_i.e0, 0x08,mtx_ptr
ldv vmtx2_i.e0, 0x10,mtx_ptr
ldv vmtx3_i.e0, 0x18,mtx_ptr
ldv vmtx0_f.e0, 0x20,mtx_ptr
ldv vmtx1_f.e0, 0x28,mtx_ptr
ldv vmtx2_f.e0, 0x30,mtx_ptr
ldv vmtx3_f.e0, 0x38,mtx_ptr
vmudn v___, vmtx0_f, vpos.h0
vmadh v___, vmtx0_i, vpos.h0
vmadn v___, vmtx1_f, vpos.h1
vmadh v___, vmtx1_i, vpos.h1
vmadn v___, vmtx2_f, vpos.h2
vmadh v___, vmtx2_i, vpos.h2
vmadn v___, vmtx3_f, vpos.h3
vmadh vcspos_i, vmtx3_i, vpos.h3
vmadn vcspos_f, vzero, vzero
# == end matrix multiply ==
# 32-bit right shift by 5, to keep the clip space coordinates unscaled
# NOTE(review): the shift amount presumably undoes VTX_SHIFT applied by the CPU - confirm
vmudm vcspos_i, vcspos_i, vshift8.e4
vmadl vcspos_f, vcspos_f, vshift8.e4
sdv vcspos_i, PRIM_VTX_CS_POSi,vtx
sdv vcspos_f, PRIM_VTX_CS_POSf,vtx
# Calculate and store clipping flags against CS.W.
# These will be used for trivial rejections.
vch v___, vcspos_i, vcspos_i.w
vcl v___, vcspos_f, vcspos_f.w
cfc2 t0, COP2_CTRL_VCC
andi t0, 0x707 # Isolate X/Y/Z flags
# Compress flags to 8 bit
srl t1, t0, 5
andi t0, 0x7
or t0, t1
# The store below also leaves the top bit of the code clear,
# which GLCmd_DrawTriangle uses to tell that T&L was not applied yet
jr ra
sb t0, PRIM_VTX_TRCODE(vtx)
#undef cmd_ptr
#undef vtx
#undef in_xy
#undef in_zw
#undef in_rgba
#undef vtx_id
#undef x
#undef y
#undef z
#undef w
#undef v___
#undef vmtx0_i
#undef vmtx0_f
#undef vmtx1_i
#undef vmtx1_f
#undef vmtx2_i
#undef vmtx2_f
#undef vmtx3_i
#undef vmtx3_f
#undef vpos
#undef vcspos_i
#undef vcspos_f
.endfunc
################################################################
# GL_CalcScreenSpace
#
# Computes the screen space position and 1/W of a vertex from its clip
# space position, and stores them (together with W) in the vertex.
#
# Args:
# s3 = Destination vertex address
# $v02 = Clip space position (fractional part)
# $v03 = Clip space position (integer part)
#
################################################################
.func GL_CalcScreenSpace
GL_CalcScreenSpace:
#define dst s3
#define vcspos_f $v02
#define vcspos_i $v03
#define vinvw_f $v23
#define vinvw_i $v24
#define vviewscale $v25
#define vviewoff $v26
#define vscreenpos_i $v27
#define vscreenpos_f $v28
#define v___ $v29
#define w e3
# Calculate 32-bit inverse W
# TODO: NR?
vrcph vinvw_i.w, vcspos_i.w
vrcpl vinvw_f.w, vcspos_f.w
vrcph vinvw_i.w, vzero.e0
# Calculate screenspace coords
# GL_VIEWPORT_OFFSET directly follows GL_VIEWPORT_SCALE, hence the offset of 8
li t0, %lo(GL_VIEWPORT_SCALE)
ldv vviewscale, 0,t0
ldv vviewoff, 8,t0
# Perspective divide: clip space position * 1/W (32-bit by 32-bit multiply)
vmudl v___, vcspos_f, vinvw_f.w
vmadm v___, vcspos_i, vinvw_f.w
vmadn vscreenpos_f, vcspos_f, vinvw_i.w
vmadh vscreenpos_i, vcspos_i, vinvw_i.w
# Viewport transform: scale, then add the offset
vmudn vscreenpos_f, vscreenpos_f, vviewscale
vmadh vscreenpos_i, vscreenpos_i, vviewscale
vadd vscreenpos_i, vviewoff
sdv vscreenpos_i, SCREEN_VTX_X ,dst
ssv vcspos_i.w, SCREEN_VTX_W+0 ,dst
ssv vcspos_f.w, SCREEN_VTX_W+2 ,dst
ssv vinvw_i.w, SCREEN_VTX_INVW+0,dst
ssv vinvw_f.w, SCREEN_VTX_INVW+2,dst
# The padding byte is cleared in the delay slot; GL_ClipTriangle uses a
# non-zero padding byte to mark clip cache vertices that still need this step
jr ra
sb zero, SCREEN_VTX_PADDING(dst)
#undef dst
#undef vcspos_f
#undef vcspos_i
#undef vinvw_f
#undef vinvw_i
#undef vviewscale
#undef vviewoff
#undef vscreenpos_i
#undef vscreenpos_f
#undef v___
#undef w
.endfunc
################################################################
# GL_CalcClipCodes
#
# Computes the clip code of a vertex (one byte, same bit layout as the
# trivial-reject code) and stores it in SCREEN_VTX_CLIP_CODE.
#
# Args:
# s3 = Destination vertex address
# $v02 = Clip space position (fractional part)
# $v03 = Clip space position (integer part)
#
################################################################
.func GL_CalcClipCodes
GL_CalcClipCodes:
#define dst s3
#define vcspos_f $v02
#define vcspos_i $v03
#define vguard_f $v27
#define vguard_i $v28
#define v___ $v29
#define w e3
# Scale the position by CLIP_CODE_FACTORS, so that X and Y end up being
# compared against W * GUARD_BAND_FACTOR, while Z is still compared against W
li t0, %lo(CLIP_CODE_FACTORS)
ldv vguard_i, 0,t0
vmudn vguard_f, vcspos_f, vguard_i
vmadh vguard_i, vcspos_i, vguard_i
vch v___, vguard_i, vguard_i.w
vcl v___, vguard_f, vguard_f.w
cfc2 t0, COP2_CTRL_VCC
# Isolate the X/Y/Z flags and compress them to 8 bit
andi t0, 0x707
srl t1, t0, 5
andi t0, 0x7
or t0, t1
jr ra
sb t0, SCREEN_VTX_CLIP_CODE(dst)
#undef dst
#undef vcspos_i
#undef vcspos_f
#undef vguard_i
#undef vguard_f
#undef v___
#undef w
.endfunc
################################################################
# GL_TnL
#
# Converts a vertex cache entry in place from the PRIM_VTX_* layout to the
# SCREEN_VTX_* layout: packs the colour, scales the texture coordinates and
# calculates screen space position and clip codes.
#
# Args:
# s3 = address of the vertex in DMEM (usually within VERTEX_CACHE)
#
################################################################
.func GL_TnL
GL_TnL:
#define tmp_ptr s2
#define vtx s3
#define s e0
# Keep the return address, since GL_CalcScreenSpace is called with jal below
move ra2, ra
#define v___ $v01
#define vrgba $v04
ldv vrgba.e0, PRIM_VTX_R, vtx # R + G + B + A
ldv vrgba.e4, PRIM_VTX_R, vtx # R + G + B + A
#define vtexsize $v06
#define vtexoffset $v07
#define vstrq $v08
# The texture coordinates are read before anything is stored, because the
# SCREEN_VTX_* fields written later overlap the PRIM_VTX_* input fields
ldv vstrq, PRIM_VTX_TEX_S,vtx # S + T + R + Q
suv vrgba, SCREEN_VTX_RGBA,vtx
li s1, %lo(GL_STATE_TEX_SIZE)
llv vtexsize.s, 0,s1
llv vtexoffset.s, 4,s1
#define vinvq_i $v26
#define vinvq_f $v27
#define vstrq_i $v28
#define vstrq_f $v29
#define q e3
# Scale texcoord by texsize and subtract offset (to correct for bilinear sampling if active)
#vmudn v___, vstrq, vtexsize
# vmadh vstrq, vtexoffset, K1
#vmudn v___, vstrq, vtexsize
#vmadh vstrq, vtexoffset, K1
#vmudl vstrq, vstrq, vtexsize
# Active version: only scales by texsize, vtexoffset is loaded above but not applied
vmudh v___, vstrq, vtexsize
vsar vstrq_i, COP2_ACC_HI
vsar vstrq_f, COP2_ACC_MD
vmudl vstrq_f, vstrq_f, K8192
vmadm vstrq_i, vstrq_i, K8192
vmadn vstrq, vzero, vzero
#undef vinvq_i
#undef vinvq_f
#undef vstrq_i
#undef vstrq_f
#undef q
lbu t0, PRIM_VTX_TRCODE(vtx)
#define vcspos_f $v02
#define vcspos_i $v03
ldv vcspos_f, PRIM_VTX_CS_POSf,vtx
ldv vcspos_i, PRIM_VTX_CS_POSi,vtx
# Mark this vertex as having T&L applied
ori t0, 0x80
sb t0, PRIM_VTX_TRCODE(vtx)
# The texture coordinates are stored in the delay slot of the call
jal GL_CalcScreenSpace
slv vstrq.s, SCREEN_VTX_S,vtx
# Tail call: ra is restored in the delay slot, so GL_CalcClipCodes
# returns directly to the caller of GL_TnL
j GL_CalcClipCodes
move ra, ra2
#undef vcspos_f
#undef vcspos_i
#undef vtexsize
#undef vtexoffset
#undef vtx
#undef v___
#undef vrgba
#undef vst
#undef s
.endfunc
################################################################
# GLCmd_DrawTriangle
#
# Draws a triangle from three vertex cache entries, applying T&L to the
# vertices on demand and clipping the triangle if necessary.
#
# Arguments:
# a0: Bit 31..24: Command id
# Bit 11..0: Offset into vertex cache of vtx1
# a1: Bit 27..16: Offset into vertex cache of vtx2
# Bit 11..0: Offset into vertex cache of vtx3
#
################################################################
.func GLCmd_DrawTriangle
GLCmd_DrawTriangle:
#define vtx1 a1
#define vtx2 a2
#define vtx3 a3
#define trcode1 t6
#define trcode2 t7
#define trcode3 t8
# vtx1 is an alias of a1, so vtx3 and vtx2 must be extracted from a1
# before vtx1 is written
addi vtx3, a1, %lo(VERTEX_CACHE)
srl vtx2, a1, 16
addi vtx2, %lo(VERTEX_CACHE)
addi vtx1, a0, %lo(VERTEX_CACHE)
# Trivial reject: if all the vertices are out of the same plane (at least one),
# the triangle is out of the viewport.
# NOTE: This deliberately uses lb instead of lbu so the sign bit is extended.
# The MSB of each TR-code is a bit flag that is set if the vertex has already
# had T&L applied once.
lb trcode1, PRIM_VTX_TRCODE(vtx1)
lb trcode2, PRIM_VTX_TRCODE(vtx2)
lb trcode3, PRIM_VTX_TRCODE(vtx3)
and t0, trcode1, trcode2
and t0, trcode3
andi t0, 0x3F
bnez t0, JrRa
nop
# Perform T&L for each vertex if we haven't already
# (a non-negative TR-code means the T&L bit is clear; the vertex address
# is passed in s3, which is set in the delay slot of each call)
bgezal trcode1, GL_TnL
move s3, vtx1
bgezal trcode2, GL_TnL
move s3, vtx2
bgezal trcode3, GL_TnL
move s3, vtx3
# Clipping is only required if at least one vertex has a clip flag set
lbu t0, SCREEN_VTX_CLIP_CODE(vtx1)
lbu t1, SCREEN_VTX_CLIP_CODE(vtx2)
lbu t2, SCREEN_VTX_CLIP_CODE(vtx3)
or t5, t0, t1
or t5, t2
# s1 = s2 = 0 makes the loop condition below fail after the single triangle
move s1, zero
beqz t5, gl_draw_single_triangle
move s2, zero
jal GL_ClipTriangle
nop
# v1 = size of the clipped vertex list; nothing is drawn if it is empty.
# The clipped polygon is drawn as a fan: s5 holds its first vertex, and
# s1 walks the list until the last pair of vertices was used (s2 = end - 6)
beqz v1, gl_draw_triangle_end
addi s2, -6
lhu s5, 0(s1)
gl_draw_clipped_triangles_loop:
move vtx1, s5
lhu vtx2, 2(s1)
lhu vtx3, 4(s1)
gl_draw_single_triangle:
# RDPQ_Triangle is given the address of the screen space position of each vertex
addi vtx1, SCREEN_VTX_X
addi vtx2, SCREEN_VTX_X
addi vtx3, SCREEN_VTX_X
lhu a0, %lo(GL_TRI_CMD)
lh v0, %lo(GL_TRI_CULL)
jal RDPQ_Triangle
li s3, %lo(RDPQ_CMD_STAGING)
jal RDPQ_Send
li s4, %lo(RDPQ_CMD_STAGING)
blt s1, s2, gl_draw_clipped_triangles_loop
addi s1, 2
gl_draw_triangle_end:
j RSPQ_Loop
nop
#undef vtx1
#undef vtx2
#undef vtx3
.endfunc
# GLCmd_MatrixLoad
# Copies the 64 byte matrix carried by the command into GL_MATRIX_MVP.
# The first word of the command is not inspected.
# NOTE(review): unlike the other commands, this one has no .func/.endfunc
# pair and does not #undef its register aliases - confirm this is intended
GLCmd_MatrixLoad:
#define src s6
#define dst s7
#define vrhs01_i $v02
#define vrhs01_f $v03
#define vrhs23_i $v04
#define vrhs23_f $v05
# src = start of the matrix, i.e. the last 64 bytes of the command
# NOTE(review): assumes rspq_dmem_buf_ptr points just past the current command - confirm
addi src, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64
addi dst, zero, %lo(GL_MATRIX_MVP)
# Load the matrix from command parameters (misaligned)
lqv vrhs01_i, 0x00,src
lrv vrhs01_i, 0x10,src
lqv vrhs23_i, 0x10,src
lrv vrhs23_i, 0x20,src
lqv vrhs01_f, 0x20,src
lrv vrhs01_f, 0x30,src
lqv vrhs23_f, 0x30,src
lrv vrhs23_f, 0x40,src
sqv vrhs01_i, 0x00,dst
sqv vrhs23_i, 0x10,dst
sqv vrhs01_f, 0x20,dst
jr ra
sqv vrhs23_f, 0x30,dst
# GLCmd_PreInitPipe
# Prepares GL_TRI_CMD (read by GLCmd_DrawTriangle) from the current state flags:
# the lower 16 bits of GL_STATE_FLAGS combined with 0xCC00.
# NOTE(review): presumably the flag bits select the depth/texture variants
# of the triangle command - confirm
.func GLCmd_PreInitPipe
GLCmd_PreInitPipe:
#define state_flags k1
#define tri_cmd t4
lw tri_cmd, %lo(GL_STATE_FLAGS)
ori tri_cmd, 0xCC00
jr ra
sh tri_cmd, %lo(GL_TRI_CMD)
#undef tri_cmd
#undef state_flags
.endfunc
#include "rsp_gpu_clipping.inc"
#include <rsp_rdpq.inc>

View File

@ -0,0 +1,374 @@
.section .data.gl_clipping
.align 4
# Coefficients (X, Y, Z, W) of the six clipping planes, CLIPPING_PLANE_SIZE bytes each.
# The X and Y planes are pushed out by GUARD_BAND_FACTOR.
CLIP_PLANES:
.half 1, 0, 0, GUARD_BAND_FACTOR
.half 0, 1, 0, GUARD_BAND_FACTOR
.half 0, 0, 1, 1
.half 1, 0, 0, -GUARD_BAND_FACTOR
.half 0, 1, 0, -GUARD_BAND_FACTOR
.half 0, 0, 1, -1
.section .bss.gl_clipping
# Storage for the vertices created by clipping
CLIP_CACHE: .dcb.b SCREEN_VTX_SIZE * CLIPPING_CACHE_SIZE
CLIP_CACHE_END:
# Two vertex address lists, used alternately as input and output of each clipping pass
CLIP_LISTS:
CLIP_LIST0: .dcb.w CLIPPING_CACHE_SIZE
CLIP_LIST1: .dcb.w CLIPPING_CACHE_SIZE
.section .text.gl_clipping
################################################################
# GL_ClipTriangle
# Clip a triangle against the view-frustum by using the Sutherland-Hodgman algorithm
# https://en.wikipedia.org/wiki/Sutherland%E2%80%93Hodgman_algorithm
# Args:
# a1-a3 = Vertices
# t5 = OR'd clip flags of the triangle's vertices
# Returns:
# s1 = Pointer to list of output vertices
# s2 = Pointer to end of list
# v1 = Size of the output list in bytes (2 bytes per vertex), zero if nothing is visible
################################################################
.func GL_ClipTriangle
GL_ClipTriangle:
#define out_count v1
#define clip_flags t5
#define plane_flag t6
#define in_count t7
#define in_end t8
#define in_list s0
#define out_list s1
#define plane s2
#define intersection s3
#define cur_ptr s4
#define prev_ptr s5
#define cur_vtx s6
#define prev_vtx s7
#define p0 k0
#define p1 k1
#define vtx1 a1
#define vtx2 a2
#define vtx3 a3
#define vplane $v01
#define vint_f $v02
#define vint_i $v03
#define vdot_i $v04
#define vdot_f $v05
#define vdiff_i $v06
#define vdiff_f $v07
#define va_i $v08
#define va_f $v09
#define vpos_i $v10
#define vpos_f $v11
#define vattr0 $v12
#define vattr1 $v13
#define voff0 $v14
#define voff1 $v15
#define vcache0 $v16
#define vcache1 $v17
#define v__ $v29
# Keep the return address, since other functions are called with jal below
move ra2, ra
# Init in_list as empty
li in_list, %lo(CLIP_LIST0)
move in_count, zero
# Put three original vertices in the out_list
# (So after the initial swap they will be in the in_list)
li out_list, %lo(CLIP_LIST1)
sh vtx1, 0(out_list)
sh vtx2, 2(out_list)
sh vtx3, 4(out_list)
li out_count, 3*2
li plane, %lo(CLIP_PLANES)
li plane_flag, 1
# Load cache offsets
# (9 values: the first 8 fill voff0, the last one goes in the first lane of voff1)
li t0, %lo(CACHE_OFFSETS)
vxor voff1, voff1
lqv voff0, 0,t0
lsv voff1, 16,t0
# Temporarily use the RDP staging area as a map of which cache slots are used
# Init to zero
li t0, %lo(RDPQ_CMD_STAGING)
sqv vzero, 0,t0
sqv vzero, 16,t0
# Iterate over the 6 clipping planes
gl_clip_plane_loop:
# Planes that no vertex of the triangle is outside of are skipped
and t0, clip_flags, plane_flag
beqz t0, gl_clip_plane_loop_end
move t1, in_list
# Swap in and out lists
# If the out list is empty from the last iteration,
# the triangle has no visible points and we are done
beqz out_count, gl_clip_return
move in_list, out_list
move out_list, t1
move in_count, out_count
move out_count, zero
# Iterate over the edges of the polygon in the input list
# The current edge is between cur_vtx and prev_vtx
move cur_ptr, in_list
add in_end, in_list, in_count
# Init the "previous" vertex to the last in the list for the wrap-around
addi prev_ptr, in_end, -2
gl_clip_edge_loop:
#define cur_flag t3
#define prev_flag t4
# Check which side of the plane the two vertices are on
lhu cur_vtx, 0(cur_ptr)
lhu prev_vtx, 0(prev_ptr)
lbu cur_flag, SCREEN_VTX_CLIP_CODE(cur_vtx)
lbu prev_flag, SCREEN_VTX_CLIP_CODE(prev_vtx)
and cur_flag, plane_flag
and prev_flag, plane_flag
# If they are on opposite sides, there is an intersection
xor t0, cur_flag, prev_flag
beqz t0, gl_clip_no_intersection
move p0, cur_vtx
# Swap the two points if necessary to make intersection calculation consistent
# This will make sure p0 is always inside and p1 is always outside
bnez prev_flag, gl_clip_no_swap
move p1, prev_vtx
xor p0, p0, p1
xor p1, p0, p1
xor p0, p0, p1
#undef prev_flag
gl_clip_no_swap:
# Calculate intersection of the line segment and the plane
li t0, %lo(RDPQ_CMD_STAGING)
lqv vcache0, 0,t0
lqv vcache1, 16,t0
# Repeat plane coefficients twice
ldv vplane.e0, 0,plane
ldv vplane.e4, 0,plane
# vpos: x0 y0 z0 w0 x1 y1 z1 w1
ldv vpos_i.e0, SCREEN_VTX_CS_POSi,p0
ldv vpos_f.e0, SCREEN_VTX_CS_POSf,p0
ldv vpos_i.e4, SCREEN_VTX_CS_POSi,p1
ldv vpos_f.e4, SCREEN_VTX_CS_POSf,p1
# vint: x1 y1 z1 w1
ldv vint_i.e0, SCREEN_VTX_CS_POSi,p1
ldv vint_f.e0, SCREEN_VTX_CS_POSf,p1
# vattr0: r0 g0 b0 a0 s0 t0
luv vattr0.e0, SCREEN_VTX_RGBA ,p0
llv vattr0.e4, SCREEN_VTX_S ,p0
# vattr1: r1 g1 b1 a1 s1 t1
luv vattr1.e0, SCREEN_VTX_RGBA ,p1
llv vattr1.e4, SCREEN_VTX_S ,p1
# Find first free slot in clip cache
# Add the values from the "used slots map" to the cache offsets
# After this, each lane will contain the offset of its corresponding cache slot,
# but only if the slot is not used. If it is used, it will contain some large value.
vaddc vcache0, voff0
vaddc vcache1, voff1
# Look for the smallest value, which will end up in vcache.e0
# Because used slots are marked as large values, they will never be found.
vlt vcache0, vcache0.q1
vlt vcache0, vcache0.h2
vlt vcache0, vcache0.e4
vlt vcache0, vcache1.e0
mfc2 t0, vcache0.e0
# Mark slot as used by storing some large value (careful of overflows!)
li t1, 0xFF
sh t1, %lo(RDPQ_CMD_STAGING)-2(t0)
# t0 is the index multiplied by 2
# intersection = t0 * 20 = t0 * 16 + t0 * 4
sll intersection, t0, 4
sll t1, t0, 2
add intersection, t1
# CAUTION: intersection might point to the same address as either p0 or p1,
# because one of them is the previous point, which could have been marked unused
# in the previous iteration. As long as we don't access p0 or p1 after writing to
# intersection, this is fine.
addi intersection, %lo(CLIP_CACHE) - SCREEN_VTX_SIZE
# Store the cache offset in unused memory (used later when finding the cache slot to mark as unused)
sb t0, SCREEN_VTX_PADDING(intersection)
# Compute dot products of both positions with the clip plane
# vdot.e0: d0 = dot(p0, plane)
# vdot.e4: d1 = dot(p1, plane)
vmudn vdot_f, vpos_f, vplane
vmadh vdot_i, vpos_i, vplane
vaddc vdot_f, vdot_f.q1
vadd vdot_i, vdot_i.q1
vaddc vdot_f, vdot_f.h2
vadd vdot_i, vdot_i.h2
# d0 - d1
vsubc vdiff_f, vdot_f, vdot_f.e4
vsub vdiff_i, vdot_i, vdot_i.e4
# 1 / (d0 - d1)
vrcph v__.e0, vdiff_i.e0
vrcpl va_f.e0, vdiff_f.e0
vrcph va_i.e0, vzero.e0
# a = d0 / (d0 - d1)
vmudl v__, va_f, vdot_f.e0
vmadm v__, va_i, vdot_f.e0
vmadn va_f, va_f, vdot_i.e0
# Prepare 0x7FFF in va_i.e0
vsubc va_i, vshift8, K1
# a = min(a, 1)
vge v__, va_f, vzero
vmrg va_f, va_f, va_i.e0
# Account for right shift introduced by vrcp
vmudn va_f, va_f, K2
# p1 - p0
vsubc vint_f, vpos_f
vsub vint_i, vpos_i
# attr1 - attr0
vsubc vattr1, vattr0
# Result of linear interpolation:
# p0 + a * (p1 - p0)
vmudl v__, vint_f, va_f.e0
vmadm v__, vint_i, va_f.e0
vmadn vint_f, vpos_f, K1
vmadh vint_i, vpos_i, K1
# a * (attr1 - attr0)
vmudm vattr1, vattr1, va_f.e0
# attr0 + a * (attr1 - attr0)
vaddc vattr0, vattr1
# Store results
sdv vint_i.e0, SCREEN_VTX_CS_POSi,intersection
sdv vint_f.e0, SCREEN_VTX_CS_POSf,intersection
suv vattr0.e0, SCREEN_VTX_RGBA ,intersection
# GL_CalcClipCodes reads the position from the registers aliased by vint_f/vint_i
# and the destination from the register aliased by intersection.
# The texture coordinates are stored in the delay slot of the call
jal GL_CalcClipCodes
slv vattr0.e4, SCREEN_VTX_S ,intersection
# Add intersection to the output list
add t0, out_list, out_count
sh intersection, 0(t0)
addi out_count, 2
gl_clip_no_intersection:
# If cur_vtx is inside, add it to the output list
bnez cur_flag, gl_clip_no_current
add t0, out_list, out_count
sh cur_vtx, 0(t0)
b gl_clip_edge_loop_end
addi out_count, 2
#undef cur_flag
gl_clip_no_current:
# Check if the vertex is stored in the clip cache
lbu t0, SCREEN_VTX_PADDING(cur_vtx)
beqz t0, gl_clip_edge_loop_end
# Reset the padding field to zero, so the screen space values won't be recalculated below
sb zero, SCREEN_VTX_PADDING(cur_vtx)
# If so, mark it as unused
sh zero, %lo(RDPQ_CMD_STAGING)-2(t0)
gl_clip_edge_loop_end:
# Advance to the next edge
addi cur_ptr, 2
blt cur_ptr, in_end, gl_clip_edge_loop
addi prev_ptr, cur_ptr, -2
gl_clip_plane_loop_end:
# Advance to the next clipping plane
sll plane_flag, 1
blt plane_flag, (1<<CLIPPING_PLANE_COUNT), gl_clip_plane_loop
addi plane, CLIPPING_PLANE_SIZE
#define cache_vtx s3
#define cache_end s5
# Calculate screen space values for new vertices (in the clip cache)
# TODO: maybe iterate over out_list instead
li cache_vtx, %lo(CLIP_CACHE)
li cache_end, %lo(CLIP_CACHE_END) - SCREEN_VTX_SIZE
gl_clip_finalize_loop:
# A non-zero padding byte marks a used slot; negating it makes t0 negative,
# so the conditional call below is only taken for used slots
lbu t0, SCREEN_VTX_PADDING(cache_vtx)
neg t0
# Only calculate screen space values if the vertex is actually used
ldv vint_i, SCREEN_VTX_CS_POSi,cache_vtx
bltzal t0, GL_CalcScreenSpace
ldv vint_f, SCREEN_VTX_CS_POSf,cache_vtx
blt cache_vtx, cache_end, gl_clip_finalize_loop
addi cache_vtx, SCREEN_VTX_SIZE
gl_clip_return:
# Done!
# (s2 is set to the end of the output list in the delay slot)
jr ra2
add s2, out_list, out_count
#undef cache_vtx
#undef cache_end
#undef clip_flags
#undef plane_flag
#undef in_count
#undef out_count
#undef in_end
#undef intersection
#undef in_list
#undef out_list
#undef plane
#undef cur_ptr
#undef prev_ptr
#undef cur_vtx
#undef prev_vtx
#undef p0
#undef p1
#undef vtx1
#undef vtx2
#undef vtx3
#undef vplane
#undef vpos_i
#undef vpos_f
#undef vdot_i
#undef vdot_f
#undef vdiff_i
#undef vdiff_f
#undef va_f
#undef vint_i
#undef vint_f
#undef vattr0
#undef vattr1
#undef v__
.endfunc

View File

@ -5,9 +5,9 @@
#include "Logger.h"
#include "Window.h"
#include <libdragon.h>
#include <GL/gl.h>
#include <GL/gl_integration.h>
#include <malloc.h>
#include <rspq_profile.h>
#include "../misc/n64/gpu.c"
typedef void (*GL_SetupVBFunc)(void);
static GL_SetupVBFunc gfx_setupVBFunc;
@ -20,16 +20,21 @@ static surface_t zbuffer;
static GfxResourceID white_square;
void Gfx_Create(void) {
rspq_init();
//rspq_profile_start();
rdpq_init();
//rdpq_debug_start(); // TODO debug
//rdpq_debug_log(true);
rdpq_set_mode_standard();
__rdpq_mode_change_som(SOM_TEXTURE_PERSP, SOM_TEXTURE_PERSP);
__rdpq_mode_change_som(SOM_ZMODE_MASK, SOM_ZMODE_OPAQUE);
rdpq_mode_dithering(DITHER_SQUARE_SQUARE);
// Set alpha compare threshold
rdpq_set_blend_color(RGBA32(0,0,0, 127));
gl_init();
//rdpq_debug_start(); // TODO debug
//rdpq_debug_log(true);
zbuffer = surface_alloc(FMT_RGBA16, display_get_width(), display_get_height());
Gfx.MaxTexWidth = 256;
@ -121,7 +126,10 @@ void Gfx_ClearColor(PackedCol color) {
void Gfx_EndFrame(void) {
Platform_LogConst("GFX ctx end");
rdpq_detach_show();
//Platform_LogConst("GFX END");
//Platform_LogConst("GFX END");
//rspq_profile_dump();
//rspq_profile_next_frame();
}
@ -253,7 +261,7 @@ static void SetAlphaBlend(cc_bool enabled) {
void Gfx_SetAlphaArgBlend(cc_bool enabled) { }
static void SetAlphaTest(cc_bool enabled) {
rdpq_mode_alphacompare(enabled ? 127 : 0);
__rdpq_mode_change_som(SOM_ALPHACOMPARE_MASK, enabled ? SOM_ALPHACOMPARE_THRESHOLD : 0);
}
static void SetColorWrite(cc_bool r, cc_bool g, cc_bool b, cc_bool a) {