diff --git a/.github/workflows/build_mac32.yml b/.github/workflows/build_mac32.yml index e0193b199..38e3ffe6d 100644 --- a/.github/workflows/build_mac32.yml +++ b/.github/workflows/build_mac32.yml @@ -13,7 +13,7 @@ concurrency: jobs: build: - if: ${{ inputs.WEBHOOK_URL != '' }} + if: ${{ secrets.GHCR_ACCESS_KEY != '' }} runs-on: ubuntu-latest container: image: ghcr.io/classicube/minimal-osxcross:latest diff --git a/misc/dreamcast/DrawColouredQuads.S b/misc/dreamcast/DrawColouredQuads.S index a68f1127c..ba962a533 100644 --- a/misc/dreamcast/DrawColouredQuads.S +++ b/misc/dreamcast/DrawColouredQuads.S @@ -1,27 +1,3 @@ -!r0 = clip flags -!r1 = GPU command -!r2 = temp -!r3 = prefetch address -!r4 = src pointer ARG -!r5 = dst pointer ARG -!r6 = quads count ARG -!r7 = ? - -!fr0 = temp -!fr1 = u (0.0) -!fr2 = v (0.0) -!fr3 = c -!fr4 = x -!fr5 = y -!fr6 = z -!fr7 = w -!fr8 = VIEWPORT_HWIDTH -!fr9 = VIEWPORT_HHEIGHT -!fr10 = VIEWPORT_X_PLUS_HWIDTH -!fr11 = VIEWPORT_Y_PLUS_HHEIGHT - -!fv4 = XYZW - #include "ViewportTransform.S" .global _DrawColouredQuads .align 4 @@ -90,4 +66,4 @@ _VP_COL_X_PLUS_HWIDTH: .long 0 .global _VP_COL_Y_PLUS_HHEIGHT .type _VP_COL_Y_PLUS_HHEIGHT,%object -_VP_COL_Y_PLUS_HHEIGHT: .long 0 \ No newline at end of file +_VP_COL_Y_PLUS_HHEIGHT: .long 0 diff --git a/misc/dreamcast/DrawTexturedQuads.S b/misc/dreamcast/DrawTexturedQuads.S index 9ef80f9fe..2771ac4ad 100644 --- a/misc/dreamcast/DrawTexturedQuads.S +++ b/misc/dreamcast/DrawTexturedQuads.S @@ -1,27 +1,3 @@ -!r0 = clip flags -!r1 = GPU command -!r2 = temp -!r3 = prefetch address -!r4 = src pointer ARG -!r5 = dst pointer ARG -!r6 = quads count ARG -!r7 = ? 
- -!fr0 = temp -!fr1 = u -!fr2 = v -!fr3 = c -!fr4 = x -!fr5 = y -!fr6 = z -!fr7 = w -!fr8 = VIEWPORT_HWIDTH -!fr9 = VIEWPORT_HHEIGHT -!fr10 = VIEWPORT_X_PLUS_HWIDTH -!fr11 = VIEWPORT_Y_PLUS_HHEIGHT - -!fv4 = XYZW - #include "ViewportTransform.S" .global _DrawTexturedQuads .align 4 @@ -88,4 +64,4 @@ _VP_TEX_X_PLUS_HWIDTH: .long 0 .global _VP_TEX_Y_PLUS_HHEIGHT .type _VP_TEX_Y_PLUS_HHEIGHT,%object -_VP_TEX_Y_PLUS_HHEIGHT: .long 0 \ No newline at end of file +_VP_TEX_Y_PLUS_HHEIGHT: .long 0 diff --git a/misc/dreamcast/ViewportTransform.S b/misc/dreamcast/ViewportTransform.S index fee37ce0c..3ceb5bdb1 100644 --- a/misc/dreamcast/ViewportTransform.S +++ b/misc/dreamcast/ViewportTransform.S @@ -1,108 +1,170 @@ +! ========================================================= +! ======================== PROCESSOR INFO ================= +! ========================================================= +! The SH4 can dual issue (i.e. parallel execution) instructions +! as long as the groups of the two instructions are different: +! * LS - most APU and FPU register load/stores +! * EX - most APU arithmetic instructions +! * MT - TST, CMP, NOP, MOV Rm,Rn +! * FE - most FPU arithmetic instructions +! * CO - other instructions + +! The following general aspects of instructions are important to note per the SH4 manual: +! * Issue rate: Interval between the issue of an instruction and that of the next instruction +! * Latency: Interval between the issue of an instruction and the generation of its result (completion) +! * Latency is also the interval between the execution of two instructions with an interdependent relationship. +! (although different cases may either increase or decrease Latency) +! +! The instructions have the following latencies +! * FADD.S/FMUL.S/FSUB.S/FMAC.S - 3/4 +! * FTRV - 5/8 +! * SHL/SHR - 1 +! * ADD/SUB/OR/XOR - 1 + + +! ========================================================= +! ======================== REGISTER USAGES ================ +! 
========================================================= +! SH4 C ABI: +! - R0 to R3 are return values (can be overwritten) +! - R4 to R7 are input arguments (can be overwritten) +! - R8 to R13 are non-volatile (must be restored at end) +! - R14 is the frame pointer (must be restored at end) +! - R15 is the stack pointer (must be restored at end) +! - FR0 to FR3 are return values (can be overwritten) +! - FR4 to FR11 are input arguments (can be overwritten) +! - FR12 to FR15 are non-volatile (must be restored at end) + +!r0 = clip flags +!r1 = GPU command +!r2 = temp +!r3 = prefetch address +!r4 = src pointer ARG +!r5 = dst pointer ARG +!r6 = quads count ARG +!r7 = ? + +!fr0 = temp +!fr1 = u +!fr2 = v +!fr3 = c +!fr4 = x +!fr5 = y +!fr6 = z +!fr7 = w +!fr8 = VIEWPORT_HWIDTH +!fr9 = VIEWPORT_HHEIGHT +!fr10 = VIEWPORT_X_PLUS_HWIDTH +!fr11 = VIEWPORT_Y_PLUS_HHEIGHT + +!fv4 = XYZW + + ! ========================================================= ! ========================= VERTEX LOADING ================ ! ========================================================= .macro LoadColouredVertex ! PREPARE NEXT VERTEX - add #16, r3 ! r3 += VERTEX_STRIDE - pref @r3 ! PREFETCH r3 (next vertex) - add #64, r5 ! r5 += 2 * sizeof(VERTEX) + add #16, r3 ! EX, r3 += VERTEX_STRIDE + pref @r3 ! LS, PREFETCH r3 (next vertex) + add #64, r5 ! EX, r5 += 2 * sizeof(VERTEX) ! LOAD XYZ - fmov @r4+, fr4 ! X = src->x - fmov @r4+, fr5 ! Y = src->y - fmov @r4+, fr6 ! Z = src->z - fldi1 fr7 ! W = 1.0 + fmov @r4+, fr4 ! LS, X = src->x + fmov @r4+, fr5 ! LS, Y = src->y + fmov @r4+, fr6 ! LS, Z = src->z + fldi1 fr7 ! LS, W = 1.0 ! TRANSFORM VERTEX - ftrv xmtrx, fv4 ! TRANSFORM(XYZW) + ftrv xmtrx, fv4 ! FE, TRANSFORM(XYZW) ! LOAD ATTRIBUTES - fmov @r4+,fr3 ! C = src->color + fmov @r4+,fr3 ! LS, C = src->color .endm .macro LoadTexturedVertex ! PREPARE NEXT VERTEX - add #24, r3 ! r3 += VERTEX_STRIDE - pref @r3 ! PREFETCH r3 (next vertex) - add #64, r5 ! r5 += 2 * sizeof(VERTEX) + add #24, r3 ! 
EX, r3 += VERTEX_STRIDE + pref @r3 ! LS, PREFETCH r3 (next vertex) + add #64, r5 ! EX, r5 += 2 * sizeof(VERTEX) ! LOAD XYZ - fmov @r4+, fr4 ! X = src->x - fmov @r4+, fr5 ! Y = src->y - fmov @r4+, fr6 ! Z = src->z - fldi1 fr7 ! W = 1.0 + fmov @r4+, fr4 ! LS, X = src->x + fmov @r4+, fr5 ! LS, Y = src->y + fmov @r4+, fr6 ! LS, Z = src->z + fldi1 fr7 ! LS, W = 1.0 ! TRANSFORM VERTEX - ftrv xmtrx, fv4 ! TRANSFORM(XYZW) + ftrv xmtrx, fv4 ! FE, TRANSFORM(XYZW) ! LOAD ATTRIBUTES - fmov @r4+,fr3 ! C = src->color - fmov @r4+,fr1 ! U = src->u - fmov @r4+,fr2 ! V = src->v + fmov @r4+,fr3 ! LS, C = src->color + fmov @r4+,fr1 ! LS, U = src->u + fmov @r4+,fr2 ! LS, V = src->v .endm ! ========================================================= ! ========================= VERTEX OUTPUT ================= ! ========================================================= -! To take advantage of SH4 dual instruction processing, interleave -! the clipflag calculation and vertex output instructions +! To take advantage of SH4 dual instruction processing, +! clipflag calculation and vertex output are interleaved .macro ProcessVertex1 - fmov.s fr7,@-r5 ! dst->w = W - fmov.s fr3,@-r5 ! dst->c = C - fneg fr7 ! W = -W - fmov.s fr2,@-r5 ! dst->v = V - fcmp/gt fr7,fr6 ! T = Z > W (i.e. Z > -W) - fmov.s fr1,@-r5 ! dst->u = U - movt r0 ! CLIPFLAGS = T - fmov.s fr6,@-r5 ! dst->z = Z - fmov.s fr5,@-r5 ! dst->y = Y - fmov.s fr4,@-r5 ! dst->x = X - mov.l r1,@-r5 ! dst->flags = CMD_VERT + fmov.s fr7,@-r5 ! LS, dst->w = W + fmov.s fr3,@-r5 ! LS, dst->c = C + fneg fr7 ! LS, W = -W + fmov.s fr2,@-r5 ! LS, dst->v = V + fcmp/gt fr7,fr6 ! FE, T = Z > W (i.e. Z > -W) + fmov.s fr1,@-r5 ! LS, dst->u = U + movt r0 ! EX, CLIPFLAGS = T + fmov.s fr6,@-r5 ! LS, dst->z = Z + fmov.s fr5,@-r5 ! LS, dst->y = Y + fmov.s fr4,@-r5 ! LS, dst->x = X + mov.l r1,@-r5 ! LS, dst->flags = CMD_VERT .endm .macro ProcessVertex2 - fmov.s fr7,@-r5 ! dst->w = W - fmov.s fr3,@-r5 ! dst->c = C - fneg fr7 ! W = -W - fmov.s fr2,@-r5 ! 
dst->v = V - fcmp/gt fr7,fr6 ! T = Z > W (i.e. Z > -W) - fmov.s fr1,@-r5 ! dst->u = U - movt r2 ! tmp = T - fmov.s fr6,@-r5 ! dst->z = Z - add r2,r2 ! tmp = tmp + tmp - fmov.s fr5,@-r5 ! dst->y = Y - or r2,r0 ! CLIPFLAGS |= tmp (T << 1) - fmov.s fr4,@-r5 ! dst->x = X - mov.l r1,@-r5 ! dst->flags = CMD_VERT + fmov.s fr7,@-r5 ! LS, dst->w = W + fmov.s fr3,@-r5 ! LS, dst->c = C + fneg fr7 ! LS, W = -W + fmov.s fr2,@-r5 ! LS, dst->v = V + fcmp/gt fr7,fr6 ! FE, T = Z > W (i.e. Z > -W) + fmov.s fr1,@-r5 ! LS, dst->u = U + movt r2 ! EX, tmp = T + fmov.s fr6,@-r5 ! LS, dst->z = Z + add r2,r2 ! EX, tmp = tmp + tmp + fmov.s fr5,@-r5 ! LS, dst->y = Y + or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 1) + fmov.s fr4,@-r5 ! LS, dst->x = X + mov.l r1,@-r5 ! LS, dst->flags = CMD_VERT .endm .macro ProcessVertex3 - fmov.s fr7,@-r5 ! dst->w = W - fmov.s fr3,@-r5 ! dst->c = C - fneg fr7 ! W = -W - fmov.s fr2,@-r5 ! dst->v = V - fcmp/gt fr7,fr6 ! T = Z > W (i.e. Z > -W) - fmov.s fr1,@-r5 ! dst->u = U - movt r2 ! tmp = T - fmov.s fr6,@-r5 ! dst->z = Z - fmov.s fr5,@-r5 ! dst->y = Y - shll2 r2 ! tmp = tmp << 2 - fmov.s fr4,@-r5 ! dst->x = X - or r2,r0 ! CLIPFLAGS |= tmp (T << 2) - mov.l r1,@-r5 ! dst->flags = CMD_VERT + fmov.s fr7,@-r5 ! LS, dst->w = W + fmov.s fr3,@-r5 ! LS, dst->c = C + fneg fr7 ! LS, W = -W + fmov.s fr2,@-r5 ! LS, dst->v = V + fcmp/gt fr7,fr6 ! FE, T = Z > W (i.e. Z > -W) + fmov.s fr1,@-r5 ! LS, dst->u = U + movt r2 ! EX, tmp = T + fmov.s fr6,@-r5 ! LS, dst->z = Z + fmov.s fr5,@-r5 ! LS, dst->y = Y + shll2 r2 ! EX, tmp = tmp << 2 + fmov.s fr4,@-r5 ! LS, dst->x = X + or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 2) + mov.l r1,@-r5 ! LS, dst->flags = CMD_VERT .endm .macro ProcessVertex4 eos_addr - fmov.s fr7,@-r5 ! dst->w = W - fmov.s fr3,@-r5 ! dst->c = C - fneg fr7 ! W = -W - fmov.s fr2,@-r5 ! dst->v = V - fcmp/gt fr7,fr6 ! T = Z > W (i.e. Z > -W) - fmov.s fr1,@-r5 ! dst->u = U - movt r2 ! tmp = T - fmov.s fr6,@-r5 ! dst->z = Z - shll2 r2 ! tmp = tmp << 2 - fmov.s fr5,@-r5 ! 
dst->y = Y - add r2,r2 ! tmp = (tmp << 2) + (tmp << 2) - fmov.s fr4,@-r5 ! dst->x = X - mov.l \eos_addr, r1 ! r1 = GPU EOS command - or r2,r0 ! CLIPFLAGS |= tmp (T << 3) - or r0,r1 ! r1 |= CLIPFLAGS - mov.l r1,@-r5 ! dst->flags = GPU EOS | CLIPFLAGS + fmov.s fr7,@-r5 ! LS, dst->w = W + fmov.s fr3,@-r5 ! LS, dst->c = C + fneg fr7 ! LS, W = -W + fmov.s fr2,@-r5 ! LS, dst->v = V + fcmp/gt fr7,fr6 ! FE, T = Z > W (i.e. Z > -W) + fmov.s fr1,@-r5 ! LS, dst->u = U + movt r2 ! EX, tmp = T + fmov.s fr6,@-r5 ! LS, dst->z = Z + shll2 r2 ! EX, tmp = tmp << 2 + fmov.s fr5,@-r5 ! LS, dst->y = Y + add r2,r2 ! EX, tmp = (tmp << 2) + (tmp << 2) + fmov.s fr4,@-r5 ! LS, dst->x = X + mov.l \eos_addr, r1 ! LS, r1 = GPU EOS command + or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 3) + or r0,r1 ! EX, r1 |= CLIPFLAGS + mov.l r1,@-r5 ! LS, dst->flags = GPU EOS | CLIPFLAGS .endm @@ -123,39 +185,39 @@ !fr11 = VIEWPORT_Y_PLUS_HHEIGHT .macro ViewportTransformSetup viewport_addr - mova \viewport_addr, r0 - fmov.s @r0+,fr8 ! fr8 = VIEWPORT_HWIDTH - fmov.s @r0+,fr9 ! fr9 = VIEWPORT_HHEIGHT - fmov.s @r0+,fr10 ! fr10 = VIEWPORT_X_PLUS_HWIDTH - fmov.s @r0+,fr11 ! fr11 = VIEWPORT_Y_PLUS_HHEIGHT - nop ! align to even instructions + mova \viewport_addr, r0 ! EX, r0 = &VIEWPORT + fmov.s @r0+,fr8 ! LS, fr8 = VIEWPORT_HWIDTH + fmov.s @r0+,fr9 ! LS, fr9 = VIEWPORT_HHEIGHT + fmov.s @r0+,fr10 ! LS, fr10 = VIEWPORT_X_PLUS_HWIDTH + fmov.s @r0+,fr11 ! LS, fr11 = VIEWPORT_Y_PLUS_HHEIGHT + nop ! MT, align to even instructions .endm .macro ViewportTransformVertex ! INVERSE W CALCULATION - add #28, r5 ! r5 = &vertex->w - fmov.s @r5,fr0 ! fr0 = vertex->w - fmul fr0,fr0 ! fr0 = fr0 * fr0 - add #-24, r5 ! r5 = &vertex->x - fsrra fr0 ! fr0 = 1 / sqrt(fr0) -> 1 / vertex->w + add #28, r5 ! EX, r5 = &vertex->w + fmov.s @r5,fr0 ! LS, fr0 = vertex->w + fmul fr0,fr0 ! FE, fr0 = fr0 * fr0 + add #-24, r5 ! EX, r5 = &vertex->x + fsrra fr0 ! FE, fr0 = 1 / sqrt(fr0) -> 1 / vertex->w ! TRANSFORM X - fmov.s @r5,fr4 ! 
fr4 = vertex->x - fmov fr10,fr5 ! fr5 = VIEWPORT_X_PLUS_HWIDTH - fmul fr8,fr4 ! fr4 = VIEWPORT_HWIDTH * vertex->x - fmac fr0,fr4,fr5 ! fr5 = fr0 * fr4 + fr5 -- (X * F * hwidth) + x_plus_hwidth - fmov.s fr5,@r5 ! vertex->x = fr5 - add #4, r5 ! r5 = &vertex->y + fmov.s @r5,fr4 ! LS, fr4 = vertex->x + fmov fr10,fr5 ! LS, fr5 = VIEWPORT_X_PLUS_HWIDTH + fmul fr8,fr4 ! FE, fr4 = VIEWPORT_HWIDTH * vertex->x + fmac fr0,fr4,fr5 ! FE, fr5 = fr0 * fr4 + fr5 -- (X * (1/W) * hwidth) + x_plus_hwidth + fmov.s fr5,@r5 ! LS, vertex->x = fr5 + add #4, r5 ! EX, r5 = &vertex->y ! TRANSFORM Y - fmov.s @r5,fr4 ! fr4 = vertex->y - fmov fr11,fr5 ! fr5 = VIEWPORT_Y_PLUS_HHEIGHT - fmul fr9,fr4 ! fr4 = VIEWPORT_HHEIGHT * vertex->y - fmac fr0,fr4,fr5 ! fr5 = fr0 * fr4 + fr5 -- (Y * F * hheight) + y_plus_hheight - fmov.s fr5,@r5 ! vertex->y = fr5 - add #4, r5 ! r5 = &vertex->z + fmov.s @r5,fr4 ! LS, fr4 = vertex->y + fmov fr11,fr5 ! LS, fr5 = VIEWPORT_Y_PLUS_HHEIGHT + fmul fr9,fr4 ! FE, fr4 = VIEWPORT_HHEIGHT * vertex->y + fmac fr0,fr4,fr5 ! FE, fr5 = fr0 * fr4 + fr5 -- (Y * (1/W) * hheight) + y_plus_hheight + fmov.s fr5,@r5 ! LS, vertex->y = fr5 + add #4, r5 ! EX, r5 = &vertex->z ! ASSIGN Z - fmov.s fr0,@r5 ! vertex->z = fr0 - add #20, r5 ! r5 += 20 (points to start of next vertex) -.endm \ No newline at end of file + fmov.s fr0,@r5 ! LS, vertex->z = fr0 (1/W) + add #20, r5 ! EX, r5 += 20 (points to start of next vertex) +.endm