diff --git a/misc/dreamcast/VertexTransform.S b/misc/dreamcast/VertexTransform.S index 3db5c06d8..908a05eab 100644 --- a/misc/dreamcast/VertexTransform.S +++ b/misc/dreamcast/VertexTransform.S @@ -1,77 +1,44 @@ -! ========================================================= -! ======================== PROCESSOR INFO ================= -! ========================================================= -! The SH4 can dual issue (i.e. parallel execution) two instructions -! as long as the groups of the two instructions are different: -! * LS - most ALU and FPU register load/stores -! * EX - most ALU arithmetic instructions -! * MT - TST, CMP, NOP, MOV Rm,Rn -! * FE - most FPU arithmetic instructions -! * CO - other instructions (NOTE: Cannot be exeucted in parallel) +#define FLG r0 // clip flags +#define TMP r1 // temp +#define VTX r2 // PVR_CMD_VERTEX +#define EOS r3 // PVR_CMD_VERTEX_EOL +#define SRC r4 // src pointer ARG +#define DST r5 // dst pointer ARG +#define CNT r6 // quads count ARG +#define PFT r7 // prefetch address -! Thee following general aspects of instructions are important to note per the SH4 manual: -! * Issue rate: Interval between the issue of an instruction and that of the next instruction -! * Latency: Interval between the issue of an instruction and the generation of its result (completion) -! * Latency is also the interval between the execution of two instructions with an interdependent relationship. -! (although different cases may either increase or decrease Latency) +#define ZERO fr0 // 0.0 +#define F_U fr1 // vertex.u +#define F_V fr2 // vertex.v +#define F_C fr3 // vertex.colour +#define F_X fr4 // vertex.x +#define F_Y fr5 // vertex.y +#define F_Z fr6 // vertex.z +#define F_W fr7 // vertex.w - -! ========================================================= -! ======================== REGISTER USAGES ================ -! ========================================================= -! SH4 C ABI: -! - R0 to R3 are return values (can be overwritten) -! 
- R4 to R7 are input arguments (can be overwritten) -! - R8 to R13 are non-volatile (must be restored at end) -! - R14 is the frame pointer (must be restored at end) -! - R15 is the stack pointer (must be restored at end) -! - FR0 to FR3 are return values (can be overwritten) -! - FR4 to FR11 are input arguments (can be overwritten) -! - FR12 to FR13 are non-volatile (must be restored at end) - -!r0 = clip flags -!r1 = GPU command -!r2 = temp -!r3 = prefetch address -!r4 = src pointer ARG -!r5 = dst pointer ARG -!r6 = quads count ARG -!r7 = PVR_CMD_VERTEX -!r11 = PVR_CMD_VERTEX_EOL - -!fr0 = temp -!fr1 = u -!fr2 = v -!fr3 = c -!fr4 = x -!fr5 = y -!fr6 = z -!fr7 = w -!fv4 = XYZW +#define XYZW fv4 // vertex.xyzw ! ========================================================= ! ========================= TRANSFORM SETUP =============== ! ========================================================= .macro TransformSetup - mov r4,r3 ! MT, r3 = src - add #-32, r5 ! EX, r5 -= sizeof(VERTEX) - mov.l r11, @-r15 ! LS, push(r11) - mov #0xE0, r7 ! EX, r7 = 0x00 00 00 E0 - pref @r3 ! LS, PREFETCH r3 (first vertex) - shll16 r7 ! EX, r7 = 0x00 E0 00 00 - shll8 r7 ! EX, r7 = 0xE0 00 00 00 (PVR_CMD_VERTEX) - mov #0xF0, r11 ! EX, r11 = 0x00 00 00 F0 - shll16 r11 ! EX, r11 = 0x00 F0 00 00 - shll8 r11 ! EX, r11 = 0xF0 00 00 00 (PVR_CMD_VERTEX_EOL) + mov SRC, PFT ! MT, pft = src + add #-32, DST ! EX, dst -= sizeof(VERTEX) + mov #0xE0, VTX ! EX, VTX = 0x00 00 00 E0 + pref @PFT ! LS, PREFETCH pft (first vertex) + shll16 VTX ! EX, VTX = 0x00 E0 00 00 + shll8 VTX ! EX, VTX = 0xE0 00 00 00 (PVR_CMD_VERTEX) + mov #0xF0, EOS ! EX, EOS = 0x00 00 00 F0 + shll16 EOS ! EX, EOS = 0x00 F0 00 00 + shll8 EOS ! EX, EOS = 0xF0 00 00 00 (PVR_CMD_VERTEX_EOL) + fldi0 ZERO ! LS, fr0 = 0.0 .endm .macro TransformEnd - mov.l @r15+, r11 ! LS, pop(r11) - - add #32, r5 ! EX, r5 += sizeof(VERTEX) - rts ! CO, return after executing instruction in delay slot - mov r5,r0 ! MT, r0 = r5 + add #32, DST ! 
EX, DST += sizeof(VERTEX) + rts ! CO, return after executing instruction in delay slot + mov DST, r0 ! MT, r0 = DST .endm @@ -80,36 +47,36 @@ ! ========================================================= .macro LoadColouredVertex ! LOAD XYZ - fmov @r4+, fr4 ! LS, X = src->x - fmov @r4+, fr5 ! LS, Y = src->y - fmov @r4+, fr6 ! LS, Z = src->z - fldi1 fr7 ! LS, W = 1.0 + fmov @SRC+, F_X ! LS, X = src->x + fmov @SRC+, F_Y ! LS, Y = src->y + fmov @SRC+, F_Z ! LS, Z = src->z + fldi1 F_W ! LS, W = 1.0 ! PREPARE NEXT VERTEX - add #16, r3 ! EX, r3 += VERTEX_STRIDE - pref @r3 ! LS, PREFETCH r3 (next vertex) - add #64, r5 ! EX, r5 += 2 * sizeof(VERTEX) + add #16, PFT ! EX, pft += VERTEX_STRIDE + pref @PFT ! LS, PREFETCH pft (next vertex) + add #64, DST ! EX, dst += 2 * sizeof(VERTEX) ! TRANSFORM VERTEX - ftrv xmtrx, fv4 ! FE, TRANSFORM(XYZW) + ftrv xmtrx, XYZW ! FE, TRANSFORM(XYZW) ! LOAD ATTRIBUTES - fmov @r4+,fr3 ! LS, C = src->color + fmov @SRC+, F_C ! LS, C = src->color .endm .macro LoadTexturedVertex ! LOAD XYZ - fmov @r4+, fr4 ! LS, X = src->x - fmov @r4+, fr5 ! LS, Y = src->y - fmov @r4+, fr6 ! LS, Z = src->z - fldi1 fr7 ! LS, W = 1.0 + fmov @SRC+, F_X ! LS, X = src->x + fmov @SRC+, F_Y ! LS, Y = src->y + fmov @SRC+, F_Z ! LS, Z = src->z + fldi1 F_W ! LS, W = 1.0 ! PREPARE NEXT VERTEX - add #24, r3 ! EX, r3 += VERTEX_STRIDE - pref @r3 ! LS, PREFETCH r3 (next vertex) - add #64, r5 ! EX, r5 += 2 * sizeof(VERTEX) + add #24, PFT ! EX, pft += VERTEX_STRIDE + pref @PFT ! LS, PREFETCH pft (next vertex) + add #64, DST ! EX, dst += 2 * sizeof(VERTEX) ! TRANSFORM VERTEX - ftrv xmtrx, fv4 ! FE, TRANSFORM(XYZW) + ftrv xmtrx, XYZW ! FE, TRANSFORM(XYZW) ! LOAD ATTRIBUTES - fmov @r4+,fr3 ! LS, C = src->color - fmov @r4+,fr1 ! LS, U = src->u - fmov @r4+,fr2 ! LS, V = src->v + fmov @SRC+, F_C ! LS, C = src->color + fmov @SRC+, F_U ! LS, U = src->u + fmov @SRC+, F_V ! LS, V = src->v .endm ! ========================================================= @@ -118,67 +85,63 @@ ! 
To take advantage of SH4 dual instruction processing, ! clipflag calculation and vertex output are interleaved .macro ProcessVertex1 - fmov.s fr7,@-r5 ! LS, dst->w = W - fmov.s fr3,@-r5 ! LS, dst->c = C - fldi0 fr0 ! LS, fr0 = 0.0 - fmov.s fr2,@-r5 ! LS, dst->v = V - fcmp/gt fr0,fr6 ! FE, T = Z > 0 - fmov.s fr1,@-r5 ! LS, dst->u = U - movt r0 ! EX, CLIPFLAGS = T - fmov.s fr6,@-r5 ! LS, dst->z = Z - fmov.s fr5,@-r5 ! LS, dst->y = Y - fmov.s fr4,@-r5 ! LS, dst->x = X - mov.l r7,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX + fmov.s F_W,@-DST ! LS, dst->w = W + fmov.s F_C,@-DST ! LS, dst->c = C + fmov.s F_V,@-DST ! LS, dst->v = V + fcmp/gt ZERO, F_Z ! FE, T = Z > 0 + fmov.s F_U,@-DST ! LS, dst->u = U + movt FLG ! EX, CLIPFLAGS = T + fmov.s F_Z,@-DST ! LS, dst->z = Z + fmov.s F_Y,@-DST ! LS, dst->y = Y + fmov.s F_X,@-DST ! LS, dst->x = X + mov.l VTX,@-DST ! LS, dst->flags = PVR_CMD_VERTEX .endm .macro ProcessVertex2 - fmov.s fr7,@-r5 ! LS, dst->w = W - fmov.s fr3,@-r5 ! LS, dst->c = C - fldi0 fr0 ! LS, fr0 = 0.0 - fmov.s fr2,@-r5 ! LS, dst->v = V - fcmp/gt fr0,fr6 ! FE, T = Z > 0 - fmov.s fr1,@-r5 ! LS, dst->u = U - movt r2 ! EX, tmp = T - fmov.s fr6,@-r5 ! LS, dst->z = Z - add r2,r2 ! EX, tmp = tmp + tmp - fmov.s fr5,@-r5 ! LS, dst->y = Y - or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 1) - fmov.s fr4,@-r5 ! LS, dst->x = X - mov.l r7,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX + fmov.s F_W,@-DST ! LS, dst->w = W + fmov.s F_C,@-DST ! LS, dst->c = C + fmov.s F_V,@-DST ! LS, dst->v = V + fcmp/gt ZERO,F_Z ! FE, T = Z > 0 + fmov.s F_U,@-DST ! LS, dst->u = U + movt TMP ! EX, tmp = T + fmov.s F_Z,@-DST ! LS, dst->z = Z + add TMP,TMP ! EX, tmp = tmp + tmp + fmov.s F_Y,@-DST ! LS, dst->y = Y + or TMP,FLG ! EX, CLIPFLAGS |= tmp (T << 1) + fmov.s F_X,@-DST ! LS, dst->x = X + mov.l VTX,@-DST ! LS, dst->flags = PVR_CMD_VERTEX .endm .macro ProcessVertex3 - fmov.s fr7,@-r5 ! LS, dst->w = W - fmov.s fr3,@-r5 ! LS, dst->c = C - fldi0 fr0 ! LS, fr0 = 0.0 - fmov.s fr2,@-r5 ! 
LS, dst->v = V - fcmp/gt fr0,fr6 ! FE, T = Z > 0 - fmov.s fr1,@-r5 ! LS, dst->u = U - movt r2 ! EX, tmp = T - fmov.s fr6,@-r5 ! LS, dst->z = Z - fmov.s fr5,@-r5 ! LS, dst->y = Y - shll2 r2 ! EX, tmp = tmp << 2 - fmov.s fr4,@-r5 ! LS, dst->x = X - or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 2) - mov.l r7,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX + fmov.s F_W,@-DST ! LS, dst->w = W + fmov.s F_C,@-DST ! LS, dst->c = C + fmov.s F_V,@-DST ! LS, dst->v = V + fcmp/gt ZERO, F_Z ! FE, T = Z > 0 + fmov.s F_U,@-DST ! LS, dst->u = U + movt TMP ! EX, tmp = T + fmov.s F_Z,@-DST ! LS, dst->z = Z + fmov.s F_Y,@-DST ! LS, dst->y = Y + shll2 TMP ! EX, tmp = tmp << 2 + fmov.s F_X,@-DST ! LS, dst->x = X + or TMP,FLG ! EX, CLIPFLAGS |= tmp (T << 2) + mov.l VTX,@-DST ! LS, dst->flags = PVR_CMD_VERTEX .endm .macro ProcessVertex4 - fmov.s fr7,@-r5 ! LS, dst->w = W - or r11,r0 ! EX, CLIPFLAGS |= PVR_CMD_VERTEX_EOL - fmov.s fr3,@-r5 ! LS, dst->c = C - fldi0 fr0 ! LS, fr0 = 0.0 - fmov.s fr2,@-r5 ! LS, dst->v = V - fcmp/gt fr0,fr6 ! FE, T = Z > 0 - fmov.s fr1,@-r5 ! LS, dst->u = U - movt r2 ! EX, tmp = T - fmov.s fr6,@-r5 ! LS, dst->z = Z - shll2 r2 ! EX, tmp = tmp << 2 - fmov.s fr5,@-r5 ! LS, dst->y = Y - add r2,r2 ! EX, tmp = (tmp << 2) + (tmp << 2) (T << 3) - fmov.s fr4,@-r5 ! LS, dst->x = X - or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 3) - mov.l r0,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX_EOL | CLIPFLAGS + fmov.s F_W,@-DST ! LS, dst->w = W + or EOS,FLG ! EX, CLIPFLAGS |= PVR_CMD_VERTEX_EOL + fmov.s F_C,@-DST ! LS, dst->c = C + fmov.s F_V,@-DST ! LS, dst->v = V + fcmp/gt ZERO, F_Z ! FE, T = Z > 0 + fmov.s F_U,@-DST ! LS, dst->u = U + movt TMP ! EX, tmp = T + fmov.s F_Z,@-DST ! LS, dst->z = Z + shll2 TMP ! EX, tmp = tmp << 2 + fmov.s F_Y,@-DST ! LS, dst->y = Y + add TMP,TMP ! EX, tmp = (tmp << 2) + (tmp << 2) (T << 3) + fmov.s F_X,@-DST ! LS, dst->x = X + or TMP,FLG ! EX, CLIPFLAGS |= tmp (T << 3) + mov.l FLG,@-DST ! 
LS, dst->flags = PVR_CMD_VERTEX_EOL | CLIPFLAGS .endm @@ -206,21 +169,21 @@ _DrawTexturedQuads: ProcessVertex4 ! CLIPFLAGS TESTING - and #15,r0 - cmp/eq #0,r0 ! T = r0 == 0 (all points invisible) - bt/s .T_NO_POINTS_VISIBLE ! if T goto NO_POINTS_VISIBLE + and #15,FLG + cmp/eq #0,FLG ! T = CLIPFLAGS == 0 (all points invisible) + bt/s .T_NONE_VISIBLE ! if T goto NONE_VISIBLE nop - bra .T_SOME_POINTS_VISIBLE + bra .T_SOME_VISIBLE nop -.T_NO_POINTS_VISIBLE: +.T_NONE_VISIBLE: bra .T_LOOP_END ! jump to loop end after executing instruction in delay slot - add #-128, r5 ! r5 -= 4 * sizeof(VERTEX), move back to prior quad, so that this invisible quad gets overwritten in next iteration + add #-128, DST ! DST -= 4 * sizeof(VERTEX), move back to prior quad, so that this invisible quad gets overwritten in next iteration -.T_SOME_POINTS_VISIBLE: +.T_SOME_VISIBLE: .T_LOOP_END: - dt r6 ! r6--; T = r6 == 0 + dt CNT ! count--; T = count == 0 bf .T_TRANSFORM_QUAD ! if !T then goto T_TRANSFORM_QUAD nop @@ -236,8 +199,8 @@ _DrawTexturedQuads: _DrawColouredQuads: ! Setup - fldi0 fr1 ! U = 0 - fldi0 fr2 ! V = 0 + fldi0 F_U ! U = 0 + fldi0 F_V ! V = 0 TransformSetup .C_TRANSFORM_QUAD: @@ -254,21 +217,21 @@ _DrawColouredQuads: ProcessVertex4 ! CLIPFLAGS TESTING - and #15,r0 - cmp/eq #0,r0 ! T = r0 == 0 (all points invisible) - bt/s .C_NO_POINTS_VISIBLE ! if T goto NO_POINTS_VISIBLE + and #15,FLG + cmp/eq #0,FLG ! T = CLIPFLAGS == 0 (all points invisible) + bt/s .C_NONE_VISIBLE ! if T goto NONE_VISIBLE nop - bra .C_SOME_POINTS_VISIBLE + bra .C_SOME_VISIBLE nop -.C_NO_POINTS_VISIBLE: +.C_NONE_VISIBLE: bra .C_LOOP_END ! jump to loop end after executing instruction in delay slot - add #-128, r5 ! r5 -= 4 * sizeof(VERTEX), move back to 1 vertex before start of quad + add #-128, DST ! dst -= 4 * sizeof(VERTEX), move back to 1 vertex before start of quad -.C_SOME_POINTS_VISIBLE: +.C_SOME_VISIBLE: .C_LOOP_END: - dt r6 ! r6--; T = r6 == 0 + dt CNT ! count--; T = count == 0 bf .C_TRANSFORM_QUAD ! 
if !T then goto TRANSFORM_QUAD nop diff --git a/misc/dreamcast/sh4_notes.txt b/misc/dreamcast/sh4_notes.txt new file mode 100644 index 000000000..cadc79a41 --- /dev/null +++ b/misc/dreamcast/sh4_notes.txt @@ -0,0 +1,31 @@ +========================================================= +======================== PROCESSOR INFO ================= +========================================================= +The SH4 can dual issue (i.e. parallel execution) two instructions +as long as the groups of the two instructions are different: +* LS - most ALU and FPU register load/stores +* EX - most ALU arithmetic instructions +* MT - TST, CMP, NOP, MOV Rm,Rn (NOTE: Can execute in parallel with other MT) +* FE - most FPU arithmetic instructions +* CO - other instructions (NOTE: Can never execute in parallel) + +The following general aspects of instructions are important to note per the SH4 manual: +* Issue rate: Interval between the issue of an instruction and that of the next instruction +* Latency: Interval between the issue of an instruction and the generation of its result (completion) +* Latency is also the interval between the execution of two instructions with an interdependent relationship. 
+ (although different cases may either increase or decrease Latency) + + +========================================================= +======================== REGISTER USAGES ================ +========================================================= +SH4 C ABI: +- R0 to R3 are return values (can be overwritten) +- R4 to R7 are input arguments (can be overwritten) +- R8 to R13 are non-volatile (must be restored at end) +- R14 is the frame pointer (must be restored at end) +- R15 is the stack pointer (must be restored at end) +- FR0 to FR3 are return values (can be overwritten) +- FR4 to FR11 are input arguments (can be overwritten) +- FR12 to FR13 are non-volatile (must be restored at end) + diff --git a/src/ExtMath.c b/src/ExtMath.c index 9a3ca2ea8..12531f8fa 100644 --- a/src/ExtMath.c +++ b/src/ExtMath.c @@ -37,6 +37,14 @@ float sqrtf(float x) { } #elif defined __GNUC__ /* Defined in .h using builtins */ +#elif defined __TINYC__ + /* Older versions of TinyC don't support fabsf or sqrtf */ + /* Those can be used though if compiling with newer TinyC */ + /* versions for a very small performance improvement */ + #include <math.h> + + float Math_AbsF(float x) { return fabs(x); } + float Math_SqrtF(float x) { return sqrt(x); } #else #include