From 9a1072d45aa4a4e384fad92ffe43911b1f23a494 Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Tue, 26 Sep 2023 21:13:23 +1000 Subject: [PATCH] Tidy up readme --- readme.md | 115 +- src/Graphics_Dreamcast.c | 3 +- third_party/gldc/LICENSE | 25 + third_party/gldc/Makefile | 28 + third_party/gldc/README.md | 66 + third_party/gldc/include/gldc.h | 348 ++++ third_party/gldc/src/aligned_vector.c | 55 + third_party/gldc/src/aligned_vector.h | 217 ++ third_party/gldc/src/draw.c | 374 ++++ third_party/gldc/src/error.c | 32 + third_party/gldc/src/flush.c | 103 + third_party/gldc/src/gl_assert.h | 20 + third_party/gldc/src/matrix.c | 27 + third_party/gldc/src/named_array.c | 90 + third_party/gldc/src/named_array.h | 38 + third_party/gldc/src/platform.h | 433 ++++ third_party/gldc/src/private.h | 323 +++ third_party/gldc/src/sh4.c | 470 +++++ third_party/gldc/src/sh4.h | 119 ++ third_party/gldc/src/sh4_math.h | 1820 +++++++++++++++++ third_party/gldc/src/state.c | 499 +++++ third_party/gldc/src/texture.c | 280 +++ third_party/gldc/src/types.h | 16 + third_party/gldc/src/yalloc/LICENSE | 21 + third_party/gldc/src/yalloc/README.md | 158 ++ third_party/gldc/src/yalloc/yalloc.c | 803 ++++++++ third_party/gldc/src/yalloc/yalloc.h | 176 ++ third_party/gldc/src/yalloc/yalloc_dump.c | 39 + .../gldc/src/yalloc/yalloc_internals.h | 63 + 29 files changed, 6705 insertions(+), 56 deletions(-) create mode 100644 third_party/gldc/LICENSE create mode 100644 third_party/gldc/Makefile create mode 100644 third_party/gldc/README.md create mode 100644 third_party/gldc/include/gldc.h create mode 100644 third_party/gldc/src/aligned_vector.c create mode 100644 third_party/gldc/src/aligned_vector.h create mode 100644 third_party/gldc/src/draw.c create mode 100644 third_party/gldc/src/error.c create mode 100644 third_party/gldc/src/flush.c create mode 100644 third_party/gldc/src/gl_assert.h create mode 100644 third_party/gldc/src/matrix.c create mode 100644 third_party/gldc/src/named_array.c create mode 100644 third_party/gldc/src/named_array.h create mode 100644 third_party/gldc/src/platform.h create mode 100644 third_party/gldc/src/private.h create mode 100644 third_party/gldc/src/sh4.c create mode 100644 third_party/gldc/src/sh4.h create mode 100644 third_party/gldc/src/sh4_math.h create mode 100644 third_party/gldc/src/state.c create mode 100644 third_party/gldc/src/texture.c create mode 100644 third_party/gldc/src/types.h create mode 100644 third_party/gldc/src/yalloc/LICENSE create mode 100644 third_party/gldc/src/yalloc/README.md create mode 100644 third_party/gldc/src/yalloc/yalloc.c create mode 100644 third_party/gldc/src/yalloc/yalloc.h create mode 100644 third_party/gldc/src/yalloc/yalloc_dump.c create mode 100644 third_party/gldc/src/yalloc/yalloc_internals.h diff --git a/readme.md b/readme.md index 8994d7884..97cf71246 100644 --- a/readme.md +++ b/readme.md @@ -3,7 +3,59 @@ ClassiCube is a custom Minecraft Classic compatible client written in C from scr ![screenshot_n](http://i.imgur.com/FCiwl27.png) -### Supported systems +# Information + +ClassiCube aims to replicate the 2009 Minecraft Classic client while also: +* offering **optional** additions to improve gameplay +* providing much better performance for a large range of hardware +* running on many different systems (desktop, web, mobile, consoles) + +ClassiCube is not trying to replicate modern Minecraft versions - and therefore it doesn't (and won't) support: +* survival mode +* logging in with Microsoft or Mojang accounts +* connecting to Minecraft 
Java Edition or Bedrock Edition servers
+
+You can download ClassiCube [from here](https://www.classicube.net/download/) and the very latest builds [from here](https://www.classicube.net/nightlies/).
+
+### We need your help
+
+ClassiCube strives to replicate the original Minecraft Classic experience by **strictly following a [clean room](https://en.wikipedia.org/wiki/Clean_room_design) reverse engineering approach**.
+
+If you're interested in documenting or verifying the behaviour of the original Minecraft Classic, please get in contact with me. (`unknownshadow200` on Discord)
+
+### What ClassiCube is
+* A complete re-implementation of Minecraft Classic 0.30, with **optional** additions
+* Partially supports some features of Minecraft Classic versions before 0.30
+* Lightweight, minimal memory usage compared to original Minecraft Classic
+* Much better performance than original Minecraft Classic
+* Works with effectively all graphics cards that support OpenGL or Direct3D 9
+* Runs on Windows, macOS, Linux, Android, iOS, and in a web browser
+* Also runs on OpenBSD, FreeBSD, NetBSD, Solaris, Haiku, IRIX, SerenityOS
+* Although still a work in progress, also runs on various consoles
+
+### Instructions
+Initially, you will need to run ClassiCube.exe to download the required assets from minecraft.net and classicube.net.
+Just click 'OK' on the dialog that appears when you start the launcher.
+
+**Singleplayer**
+Run ClassiCube.exe, then click Singleplayer at the main menu.
+
+**Multiplayer**
+Run ClassiCube.exe. You can connect to LAN/locally hosted servers, and classicube.net servers if you have a [ClassiCube account](https://www.classicube.net/).
+
+
+**Note:** When running from within VirtualBox, disable Mouse Integration; otherwise the camera will not work properly.
+
+#### *Stuck on OpenGL 1.1?*
+The most common reason for being stuck on OpenGL 1.1 is non-working GPU drivers - so if possible, you should try either installing or updating the drivers for your GPU.
+
+Otherwise:
+* On Windows, you can still run the OpenGL build of ClassiCube anyways. (You can try downloading and using the MESA software renderer from [here](http://download.qt.io/development_releases/prebuilt/llvmpipe/windows/) for slightly better performance though)
+* On other operating systems, you will have to [compile the game yourself](#Compiling). Don't forget to add `-DCC_BUILD_GL11` to the compilation command line so that the compiled game supports OpenGL 1.1.
+
+## Supported systems

ClassiCube runs on:
* Windows - 95 and later
* BeOS - untested on actual hardware
* IRIX - needs curl and openal packages
* SerenityOS - needs SDL2
-* 3DS - unfinished, but usable
-* Wii - unfinished, but usable
-* GameCube - unfinished, but usable
-* PSP - unfinished, rendering issues
-* Dreamcast - unfinished, but renders
-* PS Vita - majorly unfinished
-* Xbox - majorly unfinished
-
-
-You can download ClassiCube [from here](https://www.classicube.net/download/) and the very latest builds [from here](https://www.classicube.net/nightlies/).
-
-### We need your help
-
-ClassiCube strives to support providing an experience identical to the original Minecraft Classic by **strictly following a [clean room](https://en.wikipedia.org/wiki/Clean_room_design) reverse engineering approach**.
-
-If you're interested in documenting or verifying the behaviour of the original Minecraft Classic, please get in contact with me. (`unknownshadow200` on Discord)
-
-## Information
-
-#### What ClassiCube is
-* A complete re-implementation of Minecraft Classic 0.30, with **optional** additions
-* Partially supports some features of Minecraft Classic versions before 0.30
-* Lightweight, minimal memory usage compared to original Minecraft Classic
-* Much better performance than original Minecraft Classic
-* Works with effectively all graphics cards that support OpenGL or Direct3D 9
-* Runs on Windows, macOS, Linux, Android, iOS, and in a web browser
-* Also runs on OpenBSD, FreeBSD, NetBSD, Solaris, Haiku, IRIX, SerenityOS
-* Although still work in progresses, also runs on various consoles
-
-#### What ClassiCube isn't
-* It does not work with Minecraft Java or Bedrock edition servers
-* It does not have a survival mode (nor will such a mode be added)
-* It does not support logging in with Mojang/Minecraft accounts
-
-**Note:** When running from within VirtualBox, disable Mouse Integration, otherwise the camera will not work properly
-
-#### Instructions
-Initially, you will need to run ClassiCube.exe to download the required assets from minecraft.net and classicube.net.
-Just click 'OK' to the dialog menu that appears when you start the launcher.
-
-**Singleplayer**
-Run ClassiCube.exe, then click Singleplayer at the main menu.
-
-**Multiplayer**
-Run ClassiCube.exe. You can connect to LAN/locally hosted servers, and classicube.net servers if you have a [ClassiCube account](https://www.classicube.net/).
-
-##### *Stuck on OpenGL 1.1?*
-The most common reason for being stuck on OpenGL 1.1 is non-working GPU drivers - so if possible, you should try either installing or updating the drivers for your GPU.
-
-Otherwise:
-* On Windows, you can still run the OpenGL build of ClassiCube anyways. (You can try downloading and using the MESA software renderer from [here](http://download.qt.io/development_releases/prebuilt/llvmpipe/windows/) for slightly better performance though)
-* On other operating systems, you will have to [compile the game yourself](#Compiling). Don't forget to add `-DCC_BUILD_GL11` to the compilation command line so that the compiled game supports OpenGL 1.1.
+
+* 3DS - unfinished, but usable
+* Wii - unfinished, but usable
+* GameCube - unfinished, but usable
+* Dreamcast - unfinished, but renders
+* PSP - unfinished, rendering issues
+* PS Vita - unfinished, rendering issues
+* Xbox - unfinished, major rendering issues
+* PS3 - doesn't render properly yet

# Compiling

diff --git a/src/Graphics_Dreamcast.c b/src/Graphics_Dreamcast.c
index c6b9a3c2a..cc1465a5c 100644
--- a/src/Graphics_Dreamcast.c
+++ b/src/Graphics_Dreamcast.c
@@ -4,8 +4,7 @@
 #include "Errors.h"
 #include "Logger.h"
 #include "Window.h"
-#include "GL/gl.h"
-#include "GL/glkos.h"
+#include "../third_party/gldc/include/gldc.h"
 #include 
 #include 
 #include 
diff --git a/third_party/gldc/LICENSE b/third_party/gldc/LICENSE
new file mode 100644
index 000000000..e5f645cc6
--- /dev/null
+++ b/third_party/gldc/LICENSE
@@ -0,0 +1,25 @@
+BSD 2-Clause License
+
+Copyright (c) 2018, Luke Benstead
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/gldc/Makefile b/third_party/gldc/Makefile
new file mode 100644
index 000000000..10dcad132
--- /dev/null
+++ b/third_party/gldc/Makefile
@@ -0,0 +1,28 @@
+SOURCE_DIRS := src src/yalloc
+
+C_FILES := $(foreach dir,$(SOURCE_DIRS),$(wildcard $(dir)/*.c))
+OBJS := $(notdir $(C_FILES:%.c=%.o))
+
+C_FLAGS = -O3 -DNDEBUG -mfsrra -mfsca -ffp-contract=fast -ffast-math -mpretend-cmove -fexpensive-optimizations -fomit-frame-pointer -finline-functions -flto -fno-fat-lto-objects -ml -m4-single-only -ffunction-sections -fdata-sections -std=gnu99
+
+C_DEFINES = -DDREAMCAST -DNDEBUG -D__DREAMCAST__ -D__arch_dreamcast -D_arch_dreamcast -D_arch_sub_pristine
+
+C_INCLUDES = -I/opt/toolchains/dc/kos/include -I/opt/toolchains/dc/kos/kernel/arch/dreamcast/include -I/opt/toolchains/dc/kos/addons/include
+
+TARGET := libGLdc.a
+
+ifeq ($(strip $(KOS_BASE)),)
+$(error "Please set KOS variables in your environment.")
+endif
+
+default: $(TARGET)
+
+%.o: src/%.c
+	kos-cc $(C_DEFINES) $(C_INCLUDES) $(C_FLAGS) -c $< -o $@
+
+%.o: src/yalloc/%.c
+	kos-cc $(C_DEFINES) $(C_INCLUDES) $(C_FLAGS) -c $< -o $@
+
+$(TARGET): $(OBJS)
+	kos-ar cr $@ $^
+	kos-ranlib $@
diff --git a/third_party/gldc/README.md b/third_party/gldc/README.md
new file mode 100644
index 000000000..24f03033f
--- /dev/null
+++ b/third_party/gldc/README.md
@@ -0,0 +1,66 @@
+This is a fork of GLdc optimised for the Dreamcast port of ClassiCube, and unfortunately is essentially useless for any other project.
+
+---
+
+# GLdc
+
+**Development of GLdc has moved to [Gitlab](https://gitlab.com/simulant/GLdc)**
+
+This is a partial implementation of OpenGL 1.2 for the SEGA Dreamcast for use
+with the KallistiOS SDK.
+
+It began as a fork of libGL by Josh Pearson but has undergone a large refactor
+which is essentially a rewrite.
+
+The aim is to implement as much of OpenGL 1.2 as possible, and to add additional
+features via extensions.
+
+Things left to (re)implement:
+
+ - Spotlights (Trivial)
+ - Framebuffer extension (Trivial)
+ - Texture Matrix (Trivial)
+
+Things I'd like to do:
+
+ - Use a clean "gl.h"
+ - Define an extension for modifier volumes
+ - Add support for point sprites
+ - Optimise, add unit tests for correctness
+
+# Compiling
+
+GLdc uses CMake for its build system; it currently ships with two "backends":
+
+ - kospvr - This is the hardware-accelerated Dreamcast backend
+ - software - This is a stub software rasterizer used for testing and debugging
+
+To compile a Dreamcast debug build, you'll want to do something like the following:
+
+```
+mkdir dcbuild
+cd dcbuild
+cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/Dreamcast.cmake -G "Unix Makefiles" ..
+make
+```
+
+For a release build, replace the cmake line with the following:
+```
+cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/Dreamcast.cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release ..
+```
+
+You will need KallistiOS compiled and configured (e.g. the KOS_BASE environment
+variable must be set).
+
+To compile for PC:
+
+```
+mkdir pcbuild
+cd pcbuild
+cmake -G "Unix Makefiles" ..
+make
+```
+
+# Special Thanks!
+
+ - Massive shout out to Hayden Kowalchuk for diagnosing and fixing a large number of bugs while porting GL Quake to the Dreamcast. Absolute hero!
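For orientation, here is a minimal usage sketch of this fork's API. It is based only on the declarations in `include/gldc.h` below and on what `src/draw.c` reads per vertex; the 24-byte interleaved layout (position floats at byte 0, packed BGRA colour at byte 12, UVs at byte 16) and the quads-only submission path are inferred from `generateQuads()` rather than documented, `QuadVertex` and `draw_loop` are illustrative names, and matrix state is managed internally by the engine so it is omitted here.

```c
#include "gldc.h"

/* Interleaved layout assumed from src/draw.c: position at byte 0,
   packed BGRA colour at byte 12, UVs at byte 16 (24-byte stride). */
typedef struct {
    float   x, y, z;
    GLubyte bgra[4];
    float   u, v;
} QuadVertex;

/* One white, untextured quad; this fork submits everything as quads */
static QuadVertex quad[4] = {
    { -0.5f, -0.5f, 1.0f, {255, 255, 255, 255}, 0.0f, 1.0f },
    {  0.5f, -0.5f, 1.0f, {255, 255, 255, 255}, 1.0f, 1.0f },
    {  0.5f,  0.5f, 1.0f, {255, 255, 255, 255}, 1.0f, 0.0f },
    { -0.5f,  0.5f, 1.0f, {255, 255, 255, 255}, 0.0f, 0.0f },
};

static void draw_loop(void) {
    glKosInit();  /* brings up the PVR with default list capacities */

    glEnableClientState(GL_VERTEX_ARRAY);
    glEnableClientState(GL_COLOR_ARRAY);
    glEnableClientState(GL_TEXTURE_COORD_ARRAY);

    /* Registers the base pointer and stride used by glDrawArrays */
    glVertexPointer(3, GL_FLOAT, sizeof(QuadVertex), quad);

    for(;;) {
        /* Vertex count must be a multiple of 4; the mode argument is
           effectively treated as GL_QUADS by this fork's draw.c */
        glDrawArrays(GL_QUADS, 0, 4);
        glKosSwapBuffers();
    }
}
```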
diff --git a/third_party/gldc/include/gldc.h b/third_party/gldc/include/gldc.h new file mode 100644 index 000000000..832f22850 --- /dev/null +++ b/third_party/gldc/include/gldc.h @@ -0,0 +1,348 @@ +/* KallistiGL for KallistiOS ##version## + + libgl/gl.h + Copyright (C) 2013-2014 Josh "PH3NOM" Pearson + Copyright (C) 2014, 2016 Lawrence Sebald + + Some functionality adapted from the original KOS libgl: + Copyright (C) 2001 Dan Potter + Copyright (C) 2002 Benoit Miller + + This API implements much but not all of the OpenGL 1.1 for KallistiOS. +*/ + +#ifndef __GL_GL_H +#define __GL_GL_H + +#include +__BEGIN_DECLS + +#include + +/* Primitive Types taken from GL for compatability */ +/* Not all types are implemented in Open GL DC V.1.0 */ +#define GL_POINTS 0x0000 +#define GL_LINES 0x0001 +#define GL_TRIANGLES 0x0004 +#define GL_TRIANGLE_STRIP 0x0005 +#define GL_QUADS 0x0007 + +/* FrontFaceDirection */ +#define GL_CW 0x0900 +#define GL_CCW 0x0901 + +#define GL_NONE 0 +#define GL_FRONT_LEFT 0x0400 +#define GL_FRONT_RIGHT 0x0401 +#define GL_BACK_LEFT 0x0402 +#define GL_BACK_RIGHT 0x0403 +#define GL_FRONT 0x0404 +#define GL_BACK 0x0405 +#define GL_LEFT 0x0406 +#define GL_RIGHT 0x0407 +#define GL_FRONT_AND_BACK 0x0408 +#define GL_CULL_FACE 0x0B44 +#define GL_CULL_FACE_MODE 0x0B45 +#define GL_FRONT_FACE 0x0B46 + +/* Scissor box */ +#define GL_SCISSOR_TEST 0x0008 /* capability bit */ +#define GL_SCISSOR_BOX 0x0C10 + +/* Depth buffer */ +#define GL_NEVER 0x0200 +#define GL_LESS 0x0201 +#define GL_EQUAL 0x0202 +#define GL_LEQUAL 0x0203 +#define GL_GREATER 0x0204 +#define GL_NOTEQUAL 0x0205 +#define GL_GEQUAL 0x0206 +#define GL_ALWAYS 0x0207 +#define GL_DEPTH_TEST 0x0B71 +#define GL_DEPTH_BITS 0x0D56 +#define GL_DEPTH_FUNC 0x0B74 +#define GL_DEPTH_WRITEMASK 0x0B72 +#define GL_DEPTH_COMPONENT 0x1902 + +/* Blending: Simply Need to Map GL constants to PVR constants */ +#define GL_BLEND_DST 0x0BE0 +#define GL_BLEND_SRC 0x0BE1 +#define GL_BLEND 0x0BE2 /* capability bit */ + +#define GL_ZERO 0x0 +#define GL_ONE 0x1 +#define GL_SRC_COLOR 0x0300 +#define GL_ONE_MINUS_SRC_COLOR 0x0301 +#define GL_SRC_ALPHA 0x0302 +#define GL_ONE_MINUS_SRC_ALPHA 0x0303 +#define GL_DST_ALPHA 0x0304 +#define GL_ONE_MINUS_DST_ALPHA 0x0305 +#define GL_DST_COLOR 0x0306 +#define GL_ONE_MINUS_DST_COLOR 0x0307 +#define GL_SRC_ALPHA_SATURATE 0x0308 + +/* Misc texture constants */ +#define GL_TEXTURE_2D 0x0001 /* capability bit */ + +/* Texture Environment */ +#define GL_TEXTURE_ENV_MODE 0x2200 +#define GL_REPLACE 0x1E01 +#define GL_MODULATE 0x2100 +#define GL_DECAL 0x2101 + +/* TextureMagFilter */ +#define GL_NEAREST 0x2600 +#define GL_LINEAR 0x2601 + +/* Fog */ +#define GL_FOG 0x0004 /* capability bit */ + +/* Client state caps */ +#define GL_VERTEX_ARRAY 0x8074 +#define GL_COLOR_ARRAY 0x8076 +#define GL_TEXTURE_COORD_ARRAY 0x8078 + +#define GL_FRONT_AND_BACK 0x0408 +#define GL_FRONT 0x0404 +#define GL_BACK 0x0405 + +#define GL_SHADE_MODEL 0x0b54 +#define GL_FLAT 0x1d00 +#define GL_SMOOTH 0x1d01 + +/* Data types */ +#define GL_BYTE 0x1400 +#define GL_UNSIGNED_BYTE 0x1401 +#define GL_SHORT 0x1402 +#define GL_UNSIGNED_SHORT 0x1403 +#define GL_INT 0x1404 +#define GL_UNSIGNED_INT 0x1405 +#define GL_FLOAT 0x1406 + +/* ErrorCode */ +#define GL_NO_ERROR 0 +#define GL_INVALID_ENUM 0x0500 +#define GL_INVALID_VALUE 0x0501 +#define GL_INVALID_OPERATION 0x0502 +#define GL_OUT_OF_MEMORY 0x0505 + +/* GetPName */ +#define GL_MAX_TEXTURE_SIZE 0x0D33 + +/* StringName */ +#define GL_VENDOR 0x1F00 +#define GL_RENDERER 0x1F01 + +#define 
GL_UNSIGNED_SHORT_4_4_4_4 0x8033 +#define GL_UNSIGNED_SHORT_5_5_5_1 0x8034 +#define GL_UNSIGNED_SHORT_5_6_5 0x8363 +#define GL_UNSIGNED_SHORT_5_6_5_REV 0x8364 +#define GL_UNSIGNED_SHORT_4_4_4_4_REV 0x8365 +#define GL_UNSIGNED_SHORT_1_5_5_5_REV 0x8366 + +#define GL_RGBA 0x1908 + +/* Polygons */ +#define GL_POINT 0x1B00 +#define GL_LINE 0x1B01 +#define GL_FILL 0x1B02 +#define GL_CW 0x0900 +#define GL_CCW 0x0901 +#define GL_FRONT 0x0404 +#define GL_BACK 0x0405 +#define GL_POLYGON_MODE 0x0B40 +#define GL_POLYGON_SMOOTH 0x0B41 +#define GL_POLYGON_STIPPLE 0x0B42 +#define GL_EDGE_FLAG 0x0B43 +#define GL_CULL_FACE 0x0B44 +#define GL_CULL_FACE_MODE 0x0B45 +#define GL_FRONT_FACE 0x0B46 + +#define GLbyte char +#define GLshort short +#define GLint int +#define GLfloat float +#define GLvoid void +#define GLushort unsigned short +#define GLuint unsigned int +#define GLenum unsigned int +#define GLsizei unsigned int +#define GLclampf float +#define GLclampd float +#define GLubyte unsigned char +#define GLboolean unsigned char +#define GL_FALSE 0 +#define GL_TRUE 1 + +/* Stubs for portability */ +#define GL_ALPHA_TEST 0x0BC0 + +#define GLAPI extern +#define APIENTRY + +/* Start Submission of Primitive Data */ +/* Currently Supported Primitive Types: + -GL_POINTS ( does NOT work with glDrawArrays )( ZClipping NOT supported ) + -GL_TRIANGLES ( works with glDrawArrays )( ZClipping supported ) + -GL_TRIANLGLE_STRIP ( works with glDrawArrays )( ZClipping supported ) + -GL_QUADS ( works with glDrawArrays )( ZClipping supported ) +**/ + +/* Enable / Disable Capability */ +/* Currently Supported Capabilities: + GL_TEXTURE_2D + GL_BLEND + GL_DEPTH_TEST + GL_LIGHTING + GL_SCISSOR_TEST + GL_FOG + GL_CULL_FACE + GL_KOS_NEARZ_CLIPPING + GL_KOS_TEXTURE_MATRIX +*/ +GLAPI void glEnable(GLenum cap); +GLAPI void glDisable(GLenum cap); + +/* Clear Caps */ +GLAPI void glClear(GLuint mode); +GLAPI void glClearColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a); + +/* Depth Testing */ +GLAPI void glClearDepth(GLfloat depth); +GLAPI void glClearDepthf(GLfloat depth); +GLAPI void glDepthMask(GLboolean flag); +GLAPI void glDepthFunc(GLenum func); +GLAPI void glDepthRange(GLclampf n, GLclampf f); +GLAPI void glDepthRangef(GLclampf n, GLclampf f); + +/* Culling */ +GLAPI void glFrontFace(GLenum mode); +GLAPI void glCullFace(GLenum mode); + +/* Shading - Flat or Goraud */ +GLAPI void glShadeModel(GLenum mode); + +/* Blending */ +GLAPI void glBlendFunc(GLenum sfactor, GLenum dfactor); + +GLAPI GLuint gldcGenTexture(void); +GLAPI void gldcDeleteTexture(GLuint texture); +GLAPI void gldcBindTexture(GLuint texture); + +/* Loads texture from SH4 RAM into PVR VRAM applying color conversion if needed */ +/* internalformat must be one of the following constants: + GL_RGB + GL_RGBA + + format must be the same as internalformat + + if internal format is GL_RGBA, type must be one of the following constants: + GL_BYTE + GL_UNSIGNED_BYTE + GL_SHORT + GL_UNSIGNED_SHORT + GL_FLOAT + GL_UNSIGNED_SHORT_4_4_4_4 + GL_UNSIGNED_SHORT_4_4_4_4_TWID + GL_UNSIGNED_SHORT_1_5_5_5 + GL_UNSIGNED_SHORT_1_5_5_5_TWID + */ +GLAPI void gldcAllocTexture(GLsizei w, GLsizei h, GLenum format, GLenum type); +GLAPI void gldcGetTexture(GLvoid** data, GLsizei* width, GLsizei* height); + + +/* GL Array API - Only GL_TRIANGLES, GL_TRIANGLE_STRIP, and GL_QUADS are supported */ +GLAPI void glVertexPointer(GLint size, GLenum type, + GLsizei stride, const GLvoid *pointer); + +/* Array Data Submission */ +GLAPI void glDrawArrays(GLenum mode, GLint first, GLsizei count); +GLAPI void 
glDrawElements(GLenum mode, GLsizei count, GLenum type, const GLvoid *indices); + +GLAPI void glEnableClientState(GLenum cap); +GLAPI void glDisableClientState(GLenum cap); + +/* Transformation / Matrix Functions */ + +GLAPI void glViewport(GLint x, GLint y, GLsizei width, GLsizei height); + +GLAPI void glScissor(GLint x, GLint y, GLsizei width, GLsizei height); + +/* glGet Functions */ +GLAPI void glGetIntegerv(GLenum pname, GLint *params); +GLAPI const GLubyte* glGetString(GLenum name); + +/* Error handling */ +GLAPI GLenum glGetError(void); + + +GLAPI void glAlphaFunc(GLenum func, GLclampf ref); +/* Non Operational Stubs for portability */ +GLAPI void glPolygonOffset(GLfloat factor, GLfloat units); +GLAPI void glColorMask(GLboolean red, GLboolean green, GLboolean blue, GLboolean alpha); + + +/* ATI_meminfo extension */ +#define GL_VBO_FREE_MEMORY_ATI 0x87FB +#define GL_TEXTURE_FREE_MEMORY_ATI 0x87FC +#define GL_RENDERBUFFER_FREE_MEMORY_ATI 0x87FD + + + + +/* + * Dreamcast specific compressed + twiddled formats. + * We use constants from the range 0xEEE0 onwards + * to avoid trampling any real GL constants (this is in the middle of the + * any_vendor_future_use range defined in the GL enum.spec file. +*/ +#define GL_UNSIGNED_SHORT_5_6_5_TWID_KOS 0xEEE0 +#define GL_UNSIGNED_SHORT_1_5_5_5_REV_TWID_KOS 0xEEE2 +#define GL_UNSIGNED_SHORT_4_4_4_4_REV_TWID_KOS 0xEEE3 + +#define GL_NEARZ_CLIPPING_KOS 0xEEFA + + +/* Initialize the GL pipeline. GL will initialize the PVR. */ +GLAPI void glKosInit(); + +typedef struct { + /* If GL_TRUE, enables pvr autosorting, this *will* break glDepthFunc/glDepthTest */ + GLboolean autosort_enabled; + + /* If GL_TRUE, enables the PVR FSAA */ + GLboolean fsaa_enabled; + + /* Initial capacity of each of the OP, TR and PT lists in vertices */ + GLuint initial_op_capacity; + GLuint initial_tr_capacity; + GLuint initial_pt_capacity; + +} GLdcConfig; + + +GLAPI void glKosInitConfig(GLdcConfig* config); + +/* Usage: + * + * GLdcConfig config; + * glKosInitConfig(&config); + * + * config.autosort_enabled = GL_TRUE; + * + * glKosInitEx(&config); + */ +GLAPI void glKosInitEx(GLdcConfig* config); +GLAPI void glKosSwapBuffers(); +\ + +/* Memory allocation extension (GL_KOS_texture_memory_management) */ +GLAPI GLvoid glDefragmentTextureMemory_KOS(void); + +#define GL_FREE_TEXTURE_MEMORY_KOS 0xEF3D +#define GL_USED_TEXTURE_MEMORY_KOS 0xEF3E +#define GL_FREE_CONTIGUOUS_TEXTURE_MEMORY_KOS 0xEF3F + +__END_DECLS + +#endif /* !__GL_GL_H */ diff --git a/third_party/gldc/src/aligned_vector.c b/third_party/gldc/src/aligned_vector.c new file mode 100644 index 000000000..5ac969d3a --- /dev/null +++ b/third_party/gldc/src/aligned_vector.c @@ -0,0 +1,55 @@ +#include +#include +#include +#include +#include + +#include "private.h" +#include "aligned_vector.h" + +extern inline void* aligned_vector_resize(AlignedVector* vector, const uint32_t element_count); +extern inline void* aligned_vector_extend(AlignedVector* vector, const uint32_t additional_count); +extern inline void* aligned_vector_reserve(AlignedVector* vector, uint32_t element_count); +extern inline void* aligned_vector_push_back(AlignedVector* vector, const void* objs, uint32_t count); + +void aligned_vector_init(AlignedVector* vector, uint32_t element_size) { + /* Now initialize the header*/ + AlignedVectorHeader* const hdr = &vector->hdr; + hdr->size = 0; + hdr->capacity = ALIGNED_VECTOR_CHUNK_SIZE; + hdr->element_size = element_size; + vector->data = NULL; + + /* Reserve some initial capacity. 
This will do the allocation but not set up the header */ + void* ptr = aligned_vector_reserve(vector, ALIGNED_VECTOR_CHUNK_SIZE); + assert(ptr); + (void) ptr; +} + +void aligned_vector_shrink_to_fit(AlignedVector* vector) { + AlignedVectorHeader* const hdr = &vector->hdr; + if(hdr->size == 0) { + uint32_t element_size = hdr->element_size; + free(vector->data); + + /* Reallocate the header */ + vector->data = NULL; + hdr->size = hdr->capacity = 0; + hdr->element_size = element_size; + } else { + uint32_t new_byte_size = (hdr->size * hdr->element_size); + uint8_t* original_data = vector->data; + vector->data = (unsigned char*) memalign(0x20, new_byte_size); + + if(original_data) { + FASTCPY(vector->data, original_data, new_byte_size); + free(original_data); + } + hdr->capacity = hdr->size; + } +} + +void aligned_vector_cleanup(AlignedVector* vector) { + aligned_vector_clear(vector); + aligned_vector_shrink_to_fit(vector); +} diff --git a/third_party/gldc/src/aligned_vector.h b/third_party/gldc/src/aligned_vector.h new file mode 100644 index 000000000..11dfdbdc2 --- /dev/null +++ b/third_party/gldc/src/aligned_vector.h @@ -0,0 +1,217 @@ +#pragma once + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__APPLE__) || defined(__WIN32__) +/* Linux + Kos define this, OSX does not, so just use malloc there */ +static inline void* memalign(size_t alignment, size_t size) { + (void) alignment; + return malloc(size); +} +#else + #include +#endif + +#ifdef __cplusplus +#define AV_FORCE_INLINE static inline +#else +#define AV_NO_INSTRUMENT inline __attribute__((no_instrument_function)) +#define AV_INLINE_DEBUG AV_NO_INSTRUMENT __attribute__((always_inline)) +#define AV_FORCE_INLINE static AV_INLINE_DEBUG +#endif + + +#ifdef __DREAMCAST__ +#include + +AV_FORCE_INLINE void *AV_MEMCPY4(void *dest, const void *src, size_t len) +{ + if(!len) + { + return dest; + } + + const uint8_t *s = (uint8_t *)src; + uint8_t *d = (uint8_t *)dest; + + uint32_t diff = (uint32_t)d - (uint32_t)(s + 1); // extra offset because input gets incremented before output is calculated + // Underflow would be like adding a negative offset + + // Can use 'd' as a scratch reg now + asm volatile ( + "clrs\n" // Align for parallelism (CO) - SH4a use "stc SR, Rn" instead with a dummy Rn + ".align 2\n" + "0:\n\t" + "dt %[size]\n\t" // (--len) ? 
0 -> T : 1 -> T (EX 1) + "mov.b @%[in]+, %[scratch]\n\t" // scratch = *(s++) (LS 1/2) + "bf.s 0b\n\t" // while(s != nexts) aka while(!T) (BR 1/2) + " mov.b %[scratch], @(%[offset], %[in])\n" // *(datatype_of_s*) ((char*)s + diff) = scratch, where src + diff = dest (LS 1) + : [in] "+&r" ((uint32_t)s), [scratch] "=&r" ((uint32_t)d), [size] "+&r" (len) // outputs + : [offset] "z" (diff) // inputs + : "t", "memory" // clobbers + ); + + return dest; +} + +#else +#define AV_MEMCPY4 memcpy +#endif + +typedef struct { + uint32_t size; + uint32_t capacity; + uint32_t element_size; +} __attribute__((aligned(32))) AlignedVectorHeader; + +typedef struct { + AlignedVectorHeader hdr; + uint8_t* data; +} AlignedVector; + +#define ALIGNED_VECTOR_CHUNK_SIZE 256u + + +#define ROUND_TO_CHUNK_SIZE(v) \ + ((((v) + ALIGNED_VECTOR_CHUNK_SIZE - 1) / ALIGNED_VECTOR_CHUNK_SIZE) * ALIGNED_VECTOR_CHUNK_SIZE) + + +void aligned_vector_init(AlignedVector* vector, uint32_t element_size); + +AV_FORCE_INLINE void* aligned_vector_at(const AlignedVector* vector, const uint32_t index) { + const AlignedVectorHeader* hdr = &vector->hdr; + assert(index < hdr->size); + return vector->data + (index * hdr->element_size); +} + +AV_FORCE_INLINE void* aligned_vector_reserve(AlignedVector* vector, uint32_t element_count) { + AlignedVectorHeader* hdr = &vector->hdr; + + if(element_count < hdr->capacity) { + return aligned_vector_at(vector, element_count); + } + + uint32_t original_byte_size = (hdr->size * hdr->element_size); + + /* We overallocate so that we don't make small allocations during push backs */ + element_count = ROUND_TO_CHUNK_SIZE(element_count); + + uint32_t new_byte_size = (element_count * hdr->element_size); + uint8_t* original_data = vector->data; + + vector->data = (uint8_t*) memalign(0x20, new_byte_size); + assert(vector->data); + + AV_MEMCPY4(vector->data, original_data, original_byte_size); + free(original_data); + + hdr->capacity = element_count; + return vector->data + original_byte_size; +} + +AV_FORCE_INLINE AlignedVectorHeader* aligned_vector_header(const AlignedVector* vector) { + return (AlignedVectorHeader*) &vector->hdr; +} + +AV_FORCE_INLINE uint32_t aligned_vector_size(const AlignedVector* vector) { + const AlignedVectorHeader* hdr = &vector->hdr; + return hdr->size; +} + +AV_FORCE_INLINE uint32_t aligned_vector_capacity(const AlignedVector* vector) { + const AlignedVectorHeader* hdr = &vector->hdr; + return hdr->capacity; +} + +AV_FORCE_INLINE void* aligned_vector_front(const AlignedVector* vector) { + return vector->data; +} + +#define av_assert(x) \ + do {\ + if(!(x)) {\ + fprintf(stderr, "Assertion failed at %s:%d\n", __FILE__, __LINE__);\ + exit(1);\ + }\ + } while(0); \ + +/* Resizes the array and returns a pointer to the first new element (if upsizing) or NULL (if downsizing) */ +AV_FORCE_INLINE void* aligned_vector_resize(AlignedVector* vector, const uint32_t element_count) { + void* ret = NULL; + + AlignedVectorHeader* hdr = &vector->hdr; + uint32_t previous_count = hdr->size; + if(hdr->capacity <= element_count) { + /* If we didn't have capacity, increase capacity (slow) */ + + aligned_vector_reserve(vector, element_count); + hdr->size = element_count; + + ret = aligned_vector_at(vector, previous_count); + + av_assert(hdr->size == element_count); + av_assert(hdr->size <= hdr->capacity); + } else if(previous_count < element_count) { + /* So we grew, but had the capacity, just get a pointer to + * where we were */ + hdr->size = element_count; + av_assert(hdr->size < hdr->capacity); + ret = 
aligned_vector_at(vector, previous_count); + } else if(hdr->size != element_count) { + hdr->size = element_count; + av_assert(hdr->size < hdr->capacity); + } + + return ret; +} + +AV_FORCE_INLINE void* aligned_vector_push_back(AlignedVector* vector, const void* objs, uint32_t count) { + /* Resize enough room */ + AlignedVectorHeader* hdr = &vector->hdr; + + assert(count); + assert(hdr->element_size); + +#ifndef NDEBUG + uint32_t element_size = hdr->element_size; + uint32_t initial_size = hdr->size; +#endif + + uint8_t* dest = (uint8_t*) aligned_vector_resize(vector, hdr->size + count); + assert(dest); + + /* Copy the objects in */ + AV_MEMCPY4(dest, objs, hdr->element_size * count); + + assert(hdr->element_size == element_size); + assert(hdr->size == initial_size + count); + return dest; +} + + +AV_FORCE_INLINE void* aligned_vector_extend(AlignedVector* vector, const uint32_t additional_count) { + AlignedVectorHeader* hdr = &vector->hdr; + void* ret = aligned_vector_resize(vector, hdr->size + additional_count); + assert(ret); // Should always return something + return ret; +} + +AV_FORCE_INLINE void aligned_vector_clear(AlignedVector* vector){ + AlignedVectorHeader* hdr = &vector->hdr; + hdr->size = 0; +} + +void aligned_vector_shrink_to_fit(AlignedVector* vector); +void aligned_vector_cleanup(AlignedVector* vector); + +#ifdef __cplusplus +} +#endif diff --git a/third_party/gldc/src/draw.c b/third_party/gldc/src/draw.c new file mode 100644 index 000000000..d72f072a5 --- /dev/null +++ b/third_party/gldc/src/draw.c @@ -0,0 +1,374 @@ +#include +#include +#include +#include +#include + +#include "private.h" +#include "platform.h" + + +static void* VERTEX_PTR; +static GLsizei VERTEX_STRIDE; +static GLuint ENABLED_VERTEX_ATTRIBUTES; + +extern GLboolean AUTOSORT_ENABLED; + +#define ITERATE(count) \ + GLuint i = count; \ + while(i--) + + +void _glInitAttributePointers() { + TRACE(); + VERTEX_PTR = NULL; + VERTEX_STRIDE = 0; +} + + +GL_FORCE_INLINE PolyHeader *_glSubmissionTargetHeader(SubmissionTarget* target) { + gl_assert(target->header_offset < aligned_vector_size(&target->output->vector)); + return aligned_vector_at(&target->output->vector, target->header_offset); +} + +GL_INLINE_DEBUG Vertex* _glSubmissionTargetStart(SubmissionTarget* target) { + gl_assert(target->start_offset < aligned_vector_size(&target->output->vector)); + return aligned_vector_at(&target->output->vector, target->start_offset); +} + +Vertex* _glSubmissionTargetEnd(SubmissionTarget* target) { + return _glSubmissionTargetStart(target) + target->count; +} + +typedef struct { + float u, v; +} Float2; + +static const Float2 F2ZERO = {0.0f, 0.0f}; + +static void generateQuads(SubmissionTarget* target, const GLsizei first, const GLuint count) { + /* Read from the client buffers and generate an array of ClipVertices */ + TRACE(); + + GLuint numQuads = count / 4; + GLuint idx = first; + + Vertex* start = _glSubmissionTargetStart(target); + + const GLuint stride = VERTEX_STRIDE; + + /* Copy the pos, uv and color directly in one go */ + const GLubyte* pos = VERTEX_PTR; + const GLubyte* uv = (ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG) ? VERTEX_PTR : NULL; + const GLubyte* col = VERTEX_PTR; + + Vertex* dst = start; + const float w = 1.0f; + + // TODO: optimise + for (GLuint i = 0; i < numQuads; ++i) { + // 4 vertices per quad + Vertex* it = dst; + PREFETCH(it); // TODO: more prefetching? 
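+        /* The loop below reads each source vertex straight out of the
+           client array: position floats at byte 0 (run through
+           TransformVertex), the packed BGRA colour at byte 12, and the
+           UVs at byte 16 when GL_TEXTURE_COORD_ARRAY is enabled. */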
+ + for(GLuint j = 0; j < 4; ++j) { + pos = (GLubyte*) VERTEX_PTR + (idx * stride); + PREFETCH(pos); + TransformVertex((const float*) pos, &w, it->xyz, &it->w); + + col = pos + 12; + *((uint32_t*) it->bgra) = *((uint32_t*) col); + + if(uv) { + uv = pos + 16; + MEMCPY4(it->uv, uv, sizeof(float) * 2); + } else { + *((Float2*) it->uv) = F2ZERO; + } + + it++; + idx++; + } + + // Quads [0, 1, 2, 3] -> Triangles [{0, 1, 2} {2, 3, 0}] + PREFETCH(dst); // TODO: more prefetching? + memcpy_vertex(dst + 5, dst + 0); dst[5].flags = GPU_CMD_VERTEX_EOL; + memcpy_vertex(dst + 4, dst + 3); dst[4].flags = GPU_CMD_VERTEX; + memcpy_vertex(dst + 3, dst + 2); dst[3].flags = GPU_CMD_VERTEX; + + dst[2].flags = GPU_CMD_VERTEX_EOL; + dst[1].flags = GPU_CMD_VERTEX; + dst[0].flags = GPU_CMD_VERTEX; + // TODO copy straight to dst?? + + + dst += 6; + } +} + +GL_FORCE_INLINE void divide(SubmissionTarget* target) { + TRACE(); + + /* Perform perspective divide on each vertex */ + Vertex* vertex = _glSubmissionTargetStart(target); + + const float h = GetVideoMode()->height; + + ITERATE(target->count) { + const float f = MATH_Fast_Invert(vertex->w); + + /* Convert to NDC and apply viewport */ + vertex->xyz[0] = MATH_fmac( + VIEWPORT.hwidth, vertex->xyz[0] * f, VIEWPORT.x_plus_hwidth + ); + vertex->xyz[1] = h - MATH_fmac( + VIEWPORT.hheight, vertex->xyz[1] * f, VIEWPORT.y_plus_hheight + ); + + /* Apply depth range */ + vertex->xyz[2] = MAX( + 1.0f - MATH_fmac(vertex->xyz[2] * f, 0.5f, 0.5f), + PVR_MIN_Z + ); + + ++vertex; + } +} + +GL_FORCE_INLINE int _calc_pvr_face_culling() { + if(!_glIsCullingEnabled()) { + return GPU_CULLING_SMALL; + } else { + if(_glGetCullFace() == GL_BACK) { + return (_glGetFrontFace() == GL_CW) ? GPU_CULLING_CCW : GPU_CULLING_CW; + } else { + return (_glGetFrontFace() == GL_CCW) ? 
GPU_CULLING_CCW : GPU_CULLING_CW; + } + } +} + +GL_FORCE_INLINE int _calc_pvr_depth_test() { + if(!_glIsDepthTestEnabled()) { + return GPU_DEPTHCMP_ALWAYS; + } + + switch(_glGetDepthFunc()) { + case GL_NEVER: + return GPU_DEPTHCMP_NEVER; + case GL_LESS: + return GPU_DEPTHCMP_GREATER; + case GL_EQUAL: + return GPU_DEPTHCMP_EQUAL; + case GL_LEQUAL: + return GPU_DEPTHCMP_GEQUAL; + case GL_GREATER: + return GPU_DEPTHCMP_LESS; + case GL_NOTEQUAL: + return GPU_DEPTHCMP_NOTEQUAL; + case GL_GEQUAL: + return GPU_DEPTHCMP_LEQUAL; + break; + case GL_ALWAYS: + default: + return GPU_DEPTHCMP_ALWAYS; + } +} + +GL_FORCE_INLINE int _calcPVRBlendFactor(GLenum factor) { + switch(factor) { + case GL_ZERO: + return GPU_BLEND_ZERO; + case GL_SRC_ALPHA: + return GPU_BLEND_SRCALPHA; + case GL_DST_COLOR: + return GPU_BLEND_DESTCOLOR; + case GL_DST_ALPHA: + return GPU_BLEND_DESTALPHA; + case GL_ONE_MINUS_DST_COLOR: + return GPU_BLEND_INVDESTCOLOR; + case GL_ONE_MINUS_SRC_ALPHA: + return GPU_BLEND_INVSRCALPHA; + case GL_ONE_MINUS_DST_ALPHA: + return GPU_BLEND_INVDESTALPHA; + case GL_ONE: + return GPU_BLEND_ONE; + default: + fprintf(stderr, "Invalid blend mode: %u\n", (unsigned int) factor); + return GPU_BLEND_ONE; + } +} + + +GL_FORCE_INLINE void _updatePVRBlend(PolyContext* context) { + if(_glIsBlendingEnabled() || _glIsAlphaTestEnabled()) { + context->gen.alpha = GPU_ALPHA_ENABLE; + } else { + context->gen.alpha = GPU_ALPHA_DISABLE; + } + + context->blend.src = _calcPVRBlendFactor(_glGetBlendSourceFactor()); + context->blend.dst = _calcPVRBlendFactor(_glGetBlendDestFactor()); +} + +GL_FORCE_INLINE void apply_poly_header(PolyHeader* header, PolyList* activePolyList) { + TRACE(); + + // Compile the header + PolyContext ctx; + memset(&ctx, 0, sizeof(PolyContext)); + + ctx.list_type = activePolyList->list_type; + ctx.fmt.color = GPU_CLRFMT_ARGBPACKED; + ctx.fmt.uv = GPU_UVFMT_32BIT; + ctx.gen.color_clamp = GPU_CLRCLAMP_DISABLE; + + ctx.gen.culling = _calc_pvr_face_culling(); + ctx.depth.comparison = _calc_pvr_depth_test(); + ctx.depth.write = _glIsDepthWriteEnabled() ? GPU_DEPTHWRITE_ENABLE : GPU_DEPTHWRITE_DISABLE; + + ctx.gen.shading = (_glGetShadeModel() == GL_SMOOTH) ? GPU_SHADE_GOURAUD : GPU_SHADE_FLAT; + + if(_glIsScissorTestEnabled()) { + ctx.gen.clip_mode = GPU_USERCLIP_INSIDE; + } else { + ctx.gen.clip_mode = GPU_USERCLIP_DISABLE; + } + + if(_glIsFogEnabled()) { + ctx.gen.fog_type = GPU_FOG_TABLE; + } else { + ctx.gen.fog_type = GPU_FOG_DISABLE; + } + + _updatePVRBlend(&ctx); + + if(ctx.list_type == GPU_LIST_OP_POLY) { + /* Opaque polys are always one/zero */ + ctx.blend.src = GPU_BLEND_ONE; + ctx.blend.dst = GPU_BLEND_ZERO; + } else if(ctx.list_type == GPU_LIST_PT_POLY) { + /* Punch-through polys require fixed blending and depth modes */ + ctx.blend.src = GPU_BLEND_SRCALPHA; + ctx.blend.dst = GPU_BLEND_INVSRCALPHA; + ctx.depth.comparison = GPU_DEPTHCMP_LEQUAL; + } else if(ctx.list_type == GPU_LIST_TR_POLY && AUTOSORT_ENABLED) { + /* Autosort mode requires this mode for transparent polys */ + ctx.depth.comparison = GPU_DEPTHCMP_GEQUAL; + } + + _glUpdatePVRTextureContext(&ctx, 0); + + CompilePolyHeader(header, &ctx); + + /* Force bits 18 and 19 on to switch to 6 triangle strips */ + header->cmd |= 0xC0000; + + /* Post-process the vertex list */ + /* + * This is currently unnecessary. aligned_vector memsets the allocated objects + * to zero, and we don't touch oargb, also, we don't *enable* oargb yet in the + * pvr header so it should be ignored anyway. If this ever becomes a problem, + * uncomment this. 
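+     * (Note: if this loop is ever re-enabled, it must also advance the
+     *  cursor each iteration, e.g. (vout++)->oargb = 0, or it will never
+     *  terminate.)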
+ ClipVertex* vout = output; + const ClipVertex* end = output + count; + while(vout < end) { + vout->oargb = 0; + } + */ +} + +#define DEBUG_CLIPPING 0 + + +static SubmissionTarget SUBMISSION_TARGET; + + +void _glInitSubmissionTarget() { + SubmissionTarget* target = &SUBMISSION_TARGET; + + target->count = 0; + target->output = NULL; + target->header_offset = target->start_offset = 0; +} + + +GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count) { + + SubmissionTarget* const target = &SUBMISSION_TARGET; + + TRACE(); + + /* No vertices? Do nothing */ + if(!count) return; + + target->output = _glActivePolyList(); + gl_assert(target->output); + + uint32_t vector_size = aligned_vector_size(&target->output->vector); + + GLboolean header_required = (vector_size == 0) || _glGPUStateIsDirty(); + + target->count = count * 6 / 4; // quads -> triangles + target->header_offset = vector_size; + target->start_offset = target->header_offset + (header_required ? 1 : 0); + + gl_assert(target->header_offset >= 0); + gl_assert(target->start_offset >= target->header_offset); + gl_assert(target->count); + + /* Make room for the vertices and header */ + aligned_vector_extend(&target->output->vector, target->count + (header_required)); + + if(header_required) { + apply_poly_header(_glSubmissionTargetHeader(target), target->output); + _glGPUStateMarkClean(); + } + + generateQuads(target, first, count); +} + +void APIENTRY glDrawArrays(GLenum mode, GLint first, GLsizei count) { + TRACE(); + submitVertices(mode, first, count); +} + +void APIENTRY glEnableClientState(GLenum cap) { + TRACE(); + + switch(cap) { + case GL_VERTEX_ARRAY: + ENABLED_VERTEX_ATTRIBUTES |= VERTEX_ENABLED_FLAG; + break; + case GL_COLOR_ARRAY: + ENABLED_VERTEX_ATTRIBUTES |= DIFFUSE_ENABLED_FLAG; + break; + case GL_TEXTURE_COORD_ARRAY: + ENABLED_VERTEX_ATTRIBUTES |= UV_ENABLED_FLAG; + break; + default: + _glKosThrowError(GL_INVALID_ENUM, __func__); + } +} + +void APIENTRY glDisableClientState(GLenum cap) { + TRACE(); + + switch(cap) { + case GL_VERTEX_ARRAY: + ENABLED_VERTEX_ATTRIBUTES &= ~VERTEX_ENABLED_FLAG; + break; + case GL_COLOR_ARRAY: + ENABLED_VERTEX_ATTRIBUTES &= ~DIFFUSE_ENABLED_FLAG; + break; + case GL_TEXTURE_COORD_ARRAY: + ENABLED_VERTEX_ATTRIBUTES &= ~UV_ENABLED_FLAG; + break; + default: + _glKosThrowError(GL_INVALID_ENUM, __func__); + } +} + +void APIENTRY glVertexPointer(GLint size, GLenum type, GLsizei stride, const GLvoid * pointer) { + VERTEX_PTR = pointer; + VERTEX_STRIDE = stride; +} diff --git a/third_party/gldc/src/error.c b/third_party/gldc/src/error.c new file mode 100644 index 000000000..25c5b2929 --- /dev/null +++ b/third_party/gldc/src/error.c @@ -0,0 +1,32 @@ +/* KallistiGL for KallistiOS ##version## + + libgl/gl-error.c + Copyright (C) 2014 Josh Pearson + Copyright (C) 2016 Lawrence Sebald + * Copyright (C) 2017 Luke Benstead + + KOS Open GL State Machine Error Code Implementation. +*/ + +#include + +#include "private.h" + +GLenum LAST_ERROR = GL_NO_ERROR; +char ERROR_FUNCTION[64] = { '\0' }; + +/* Quoth the GL Spec: + When an error occurs, the error flag is set to the appropriate error code + value. No other errors are recorded until glGetError is called, the error + code is returned, and the flag is reset to GL_NO_ERROR. + + So, we only record an error here if the error code is currently unset. + Nothing in the spec requires recording multiple error flags, although it is + allowed by the spec. We take the easy way out for now. 
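+
+   For example, two invalid calls in a row leave only the first error
+   code behind until the application reads it:
+
+       glEnableClientState(0xFFFF);   records GL_INVALID_ENUM
+       glEnableClientState(0xFFFF);   flag already set, so it is kept
+       glGetError();                  returns GL_INVALID_ENUM and resets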
*/ + + +GLenum glGetError(void) { + GLenum rv = LAST_ERROR; + _glKosResetError(); + return rv; +} diff --git a/third_party/gldc/src/flush.c b/third_party/gldc/src/flush.c new file mode 100644 index 000000000..bc17f1859 --- /dev/null +++ b/third_party/gldc/src/flush.c @@ -0,0 +1,103 @@ +#include "aligned_vector.h" +#include "private.h" + +PolyList OP_LIST; +PolyList PT_LIST; +PolyList TR_LIST; + +/** + * FAST_MODE will use invW for all Z coordinates sent to the + * GPU. + * + * This will break orthographic mode so default is FALSE + **/ + +#define FAST_MODE GL_FALSE + +GLboolean AUTOSORT_ENABLED = GL_FALSE; + +PolyList* _glOpaquePolyList() { + return &OP_LIST; +} + +PolyList* _glPunchThruPolyList() { + return &PT_LIST; +} + +PolyList *_glTransparentPolyList() { + return &TR_LIST; +} + + +void APIENTRY glKosInitConfig(GLdcConfig* config) { + config->autosort_enabled = GL_FALSE; + config->fsaa_enabled = GL_FALSE; + + config->initial_op_capacity = 1024 * 3; + config->initial_pt_capacity = 512 * 3; + config->initial_tr_capacity = 1024 * 3; +} + +void APIENTRY glKosInitEx(GLdcConfig* config) { + TRACE(); + + puts("\nWelcome to GLdc!\n"); + + InitGPU(config->autosort_enabled, config->fsaa_enabled); + + AUTOSORT_ENABLED = config->autosort_enabled; + + _glInitSubmissionTarget(); + _glInitMatrices(); + _glInitAttributePointers(); + _glInitContext(); + _glInitTextures(); + + OP_LIST.list_type = GPU_LIST_OP_POLY; + PT_LIST.list_type = GPU_LIST_PT_POLY; + TR_LIST.list_type = GPU_LIST_TR_POLY; + + aligned_vector_init(&OP_LIST.vector, sizeof(Vertex)); + aligned_vector_init(&PT_LIST.vector, sizeof(Vertex)); + aligned_vector_init(&TR_LIST.vector, sizeof(Vertex)); + + aligned_vector_reserve(&OP_LIST.vector, config->initial_op_capacity); + aligned_vector_reserve(&PT_LIST.vector, config->initial_pt_capacity); + aligned_vector_reserve(&TR_LIST.vector, config->initial_tr_capacity); +} + +void APIENTRY glKosInit() { + GLdcConfig config; + glKosInitConfig(&config); + glKosInitEx(&config); +} + +void APIENTRY glKosSwapBuffers() { + TRACE(); + + SceneBegin(); + if(aligned_vector_header(&OP_LIST.vector)->size > 2) { + SceneListBegin(GPU_LIST_OP_POLY); + SceneListSubmit((Vertex*) aligned_vector_front(&OP_LIST.vector), aligned_vector_size(&OP_LIST.vector)); + SceneListFinish(); + } + + if(aligned_vector_header(&PT_LIST.vector)->size > 2) { + SceneListBegin(GPU_LIST_PT_POLY); + SceneListSubmit((Vertex*) aligned_vector_front(&PT_LIST.vector), aligned_vector_size(&PT_LIST.vector)); + SceneListFinish(); + } + + if(aligned_vector_header(&TR_LIST.vector)->size > 2) { + SceneListBegin(GPU_LIST_TR_POLY); + SceneListSubmit((Vertex*) aligned_vector_front(&TR_LIST.vector), aligned_vector_size(&TR_LIST.vector)); + SceneListFinish(); + } + SceneFinish(); + + aligned_vector_clear(&OP_LIST.vector); + aligned_vector_clear(&PT_LIST.vector); + aligned_vector_clear(&TR_LIST.vector); + + _glApplyScissor(true); +} diff --git a/third_party/gldc/src/gl_assert.h b/third_party/gldc/src/gl_assert.h new file mode 100644 index 000000000..ef5788f8a --- /dev/null +++ b/third_party/gldc/src/gl_assert.h @@ -0,0 +1,20 @@ + +#ifndef NDEBUG +/* We're debugging, use normal assert */ +#include +#define gl_assert assert +#else +/* Release mode, use our custom assert */ +#include +#include + +#define gl_assert(x) \ + do {\ + if(!(x)) {\ + fprintf(stderr, "Assertion failed at %s:%d\n", __FILE__, __LINE__);\ + exit(1);\ + }\ + } while(0); \ + +#endif + diff --git a/third_party/gldc/src/matrix.c b/third_party/gldc/src/matrix.c new file mode 100644 index 
000000000..d0771fb17 --- /dev/null +++ b/third_party/gldc/src/matrix.c @@ -0,0 +1,27 @@ +#include +#include +#include + +#include "private.h" + +Viewport VIEWPORT = { + 0, 0, 640, 480, 320.0f, 240.0f, 320.0f, 240.0f +}; + +void _glInitMatrices() { + const VideoMode* vid_mode = GetVideoMode(); + + glViewport(0, 0, vid_mode->width, vid_mode->height); +} + +/* Set the GL viewport */ +void APIENTRY glViewport(GLint x, GLint y, GLsizei width, GLsizei height) { + VIEWPORT.x = x; + VIEWPORT.y = y; + VIEWPORT.width = width; + VIEWPORT.height = height; + VIEWPORT.hwidth = ((GLfloat) VIEWPORT.width) * 0.5f; + VIEWPORT.hheight = ((GLfloat) VIEWPORT.height) * 0.5f; + VIEWPORT.x_plus_hwidth = VIEWPORT.x + VIEWPORT.hwidth; + VIEWPORT.y_plus_hheight = VIEWPORT.y + VIEWPORT.hheight; +} diff --git a/third_party/gldc/src/named_array.c b/third_party/gldc/src/named_array.c new file mode 100644 index 000000000..4f6fad34c --- /dev/null +++ b/third_party/gldc/src/named_array.c @@ -0,0 +1,90 @@ +#include +#include +#include +#include +#include + +#ifndef __APPLE__ +#include +#else +/* Linux + Kos define this, OSX does not, so just use malloc there */ +#define memalign(x, size) malloc((size)) +#endif + +#include "named_array.h" + +void named_array_init(NamedArray* array, unsigned int element_size, unsigned int max_elements) { + array->element_size = element_size; + array->max_element_count = max_elements; + + array->marker_count = (unsigned char)((max_elements+8-1)/8); + +#ifdef _arch_dreamcast + // Use 32-bit aligned memory on the Dreamcast + array->elements = (unsigned char*) memalign(0x20, element_size * max_elements); + array->used_markers = (unsigned char*) memalign(0x20, array->marker_count); +#else + array->elements = (unsigned char*) malloc(element_size * max_elements); + array->used_markers = (unsigned char*) malloc(array->marker_count); +#endif + memset(array->used_markers, 0, sizeof(unsigned char) * array->marker_count); +} + +void* named_array_alloc(NamedArray* array, unsigned int* new_id) { + unsigned int i = 0, j = 0; + for(i = 0; i < array->marker_count; ++i) { + for(j = 0; j < 8; ++j) { + unsigned int id = (i * 8) + j; + if(!named_array_used(array, id)) { + array->used_markers[i] |= (unsigned char) 1 << j; + *new_id = id; + unsigned char* ptr = &array->elements[id * array->element_size]; + memset(ptr, 0, array->element_size); + return ptr; + } + } + } + + return NULL; +} + +void* named_array_reserve(NamedArray* array, unsigned int id) { + if(!named_array_used(array, id)) { + unsigned int j = (id % 8); + unsigned int i = id / 8; + + assert(!named_array_used(array, id)); + array->used_markers[i] |= (unsigned char) 1 << j; + assert(named_array_used(array, id)); + + unsigned char* ptr = &array->elements[id * array->element_size]; + memset(ptr, 0, array->element_size); + return ptr; + } + + return named_array_get(array, id); +} + +void named_array_release(NamedArray* array, unsigned int new_id) { + unsigned int i = new_id / 8; + unsigned int j = new_id % 8; + array->used_markers[i] &= (unsigned char) ~(1 << j); +} + +void* named_array_get(NamedArray* array, unsigned int id) { + if(!named_array_used(array, id)) { + return NULL; + } + + return &array->elements[id * array->element_size]; +} + +void named_array_cleanup(NamedArray* array) { + free(array->elements); + free(array->used_markers); + array->elements = NULL; + array->used_markers = NULL; + array->element_size = array->max_element_count = 0; + array->marker_count = 0; +} + diff --git a/third_party/gldc/src/named_array.h 
b/third_party/gldc/src/named_array.h new file mode 100644 index 000000000..d6be27632 --- /dev/null +++ b/third_party/gldc/src/named_array.h @@ -0,0 +1,38 @@ +#pragma once + +#ifndef NAMED_ARRAY_H +#define NAMED_ARRAY_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + unsigned int element_size; + unsigned int max_element_count; + unsigned char* elements; + unsigned char* used_markers; + unsigned char marker_count; +} NamedArray; + +void named_array_init(NamedArray* array, unsigned int element_size, unsigned int max_elements); +static inline char named_array_used(NamedArray* array, unsigned int id) { + const unsigned int i = id / 8; + const unsigned int j = id % 8; + + unsigned char v = array->used_markers[i] & (unsigned char) (1 << j); + return !!(v); +} + +void* named_array_alloc(NamedArray* array, unsigned int* new_id); +void* named_array_reserve(NamedArray* array, unsigned int id); + +void named_array_release(NamedArray* array, unsigned int new_id); +void* named_array_get(NamedArray* array, unsigned int id); +void named_array_cleanup(NamedArray* array); + +#ifdef __cplusplus +} +#endif + +#endif // NAMED_ARRAY_H diff --git a/third_party/gldc/src/platform.h b/third_party/gldc/src/platform.h new file mode 100644 index 000000000..c8d9a0e84 --- /dev/null +++ b/third_party/gldc/src/platform.h @@ -0,0 +1,433 @@ +#pragma once + +#include +#include +#include + +#include "gl_assert.h" +#include "types.h" + +#define MEMSET(dst, v, size) memset((dst), (v), (size)) + +typedef enum GPUAlpha { + GPU_ALPHA_DISABLE = 0, + GPU_ALPHA_ENABLE = 1 +} GPUAlpha; + +typedef enum GPUTexture { + GPU_TEXTURE_DISABLE = 0, + GPU_TEXTURE_ENABLE = 1 +} GPUTexture; + +typedef enum GPUTextureAlpha { + GPU_TXRALPHA_DISABLE = 1, + GPU_TXRALPHA_ENABLE = 0 +} GPUTextureAlpha; + +typedef enum GPUList { + GPU_LIST_OP_POLY = 0, + GPU_LIST_OP_MOD = 1, + GPU_LIST_TR_POLY = 2, + GPU_LIST_TR_MOD = 3, + GPU_LIST_PT_POLY = 4 +} GPUList; + +typedef enum GPUBlend { + GPU_BLEND_ZERO = 0, + GPU_BLEND_ONE = 1, + GPU_BLEND_DESTCOLOR = 2, + GPU_BLEND_INVDESTCOLOR = 3, + GPU_BLEND_SRCALPHA = 4, + GPU_BLEND_INVSRCALPHA = 5, + GPU_BLEND_DESTALPHA = 6, + GPU_BLEND_INVDESTALPHA = 7 +} GPUBlend; + +typedef enum GPUDepthCompare { + GPU_DEPTHCMP_NEVER = 0, + GPU_DEPTHCMP_LESS = 1, + GPU_DEPTHCMP_EQUAL = 2, + GPU_DEPTHCMP_LEQUAL = 3, + GPU_DEPTHCMP_GREATER = 4, + GPU_DEPTHCMP_NOTEQUAL = 5, + GPU_DEPTHCMP_GEQUAL = 6, + GPU_DEPTHCMP_ALWAYS = 7 +} GPUDepthCompare; + +typedef enum GPUTextureFormat { + GPU_TXRFMT_NONE, + GPU_TXRFMT_VQ_DISABLE = (0 << 30), + GPU_TXRFMT_VQ_ENABLE = (1 << 30), + GPU_TXRFMT_ARGB1555 = (0 << 27), + GPU_TXRFMT_RGB565 = (1 << 27), + GPU_TXRFMT_ARGB4444 = (2 << 27), + GPU_TXRFMT_YUV422 = (3 << 27), + GPU_TXRFMT_BUMP = (4 << 27), + GPU_TXRFMT_PAL4BPP = (5 << 27), + GPU_TXRFMT_PAL8BPP = (6 << 27), + GPU_TXRFMT_TWIDDLED = (0 << 26), + GPU_TXRFMT_NONTWIDDLED = (1 << 26), + GPU_TXRFMT_NOSTRIDE = (0 << 21), + GPU_TXRFMT_STRIDE = (1 << 21) +} GPUTextureFormat; + +typedef enum GPUCulling { + GPU_CULLING_NONE = 0, + GPU_CULLING_SMALL = 1, + GPU_CULLING_CCW = 2, + GPU_CULLING_CW = 3 +} GPUCulling; + +typedef enum GPUUVFlip { + GPU_UVFLIP_NONE = 0, + GPU_UVFLIP_V = 1, + GPU_UVFLIP_U = 2, + GPU_UVFLIP_UV = 3 +} GPUUVFlip; + +typedef enum GPUUVClamp { + GPU_UVCLAMP_NONE = 0, + GPU_UVCLAMP_V = 1, + GPU_UVCLAMP_U = 2, + GPU_UVCLAMP_UV = 3 +} GPUUVClamp; + +typedef enum GPUColorClamp { + GPU_CLRCLAMP_DISABLE = 0, + GPU_CLRCLAMP_ENABLE = 1 +} GPUColorClamp; + +typedef enum GPUFilter { + GPU_FILTER_NEAREST = 0, + 
GPU_FILTER_BILINEAR = 2, + GPU_FILTER_TRILINEAR1 = 4, + GPU_FILTER_TRILINEAR2 = 6 +} GPUFilter; + +typedef enum GPUDepthWrite { + GPU_DEPTHWRITE_ENABLE = 0, + GPU_DEPTHWRITE_DISABLE = 1 +} GPUDepthWrite; + +typedef enum GPUUserClip { + GPU_USERCLIP_DISABLE = 0, + GPU_USERCLIP_INSIDE = 2, + GPU_USERCLIP_OUTSIDE = 3 +} GPUUserClip; + +typedef enum GPUColorFormat { + GPU_CLRFMT_ARGBPACKED = 0, + GPU_CLRFMT_4FLOATS = 1, + GPU_CLRFMT_INTENSITY = 2, + GPU_CLRFMT_INTENSITY_PREV = 3 +} GPUColorFormat; + +typedef enum GPUUVFormat { + GPU_UVFMT_32BIT = 0, + GPU_UVFMT_16BIT = 1 +} GPUUVFormat; + +typedef enum GPUFog { + GPU_FOG_TABLE = 0, + GPU_FOG_VERTEX = 1, + GPU_FOG_DISABLE = 2, + GPU_FOG_TABLE2 = 3 +} GPUFog; + +typedef enum GPUShade { + GPU_SHADE_FLAT = 0, + GPU_SHADE_GOURAUD = 1 +} GPUShade; + +typedef enum GPUTextureEnv { + GPU_TXRENV_REPLACE = 0, + GPU_TXRENV_MODULATE = 1, + GPU_TXRENV_DECAL = 2, + GPU_TXRENV_MODULATEALPHA = 3 +} GPUTextureEnv; + +typedef struct VideoMode { + uint16_t width; + uint16_t height; +} VideoMode; + +const VideoMode* GetVideoMode(); + +/* Duplication of pvr_poly_cxt_t from KOS so that we can + * compile on non-KOS platforms for testing */ + +typedef struct { + GPUList list_type; + + struct { + int alpha; + int shading; + int fog_type; + int culling; + int color_clamp; + int clip_mode; + int modifier_mode; + int specular; + int alpha2; + int fog_type2; + int color_clamp2; + } gen; + struct { + int src; + int dst; + int src_enable; + int dst_enable; + int src2; + int dst2; + int src_enable2; + int dst_enable2; + } blend; + struct { + int color; + int uv; + int modifier; + } fmt; + struct { + int comparison; + int write; + } depth; + struct { + int enable; + int filter; + int mipmap; + int mipmap_bias; + int uv_flip; + int uv_clamp; + int alpha; + int env; + int width; + int height; + int format; + void* base; + } txr; + struct { + int enable; + int filter; + int mipmap; + int mipmap_bias; + int uv_flip; + int uv_clamp; + int alpha; + int env; + int width; + int height; + int format; + void* base; + } txr2; +} PolyContext; + +typedef struct { + uint32_t cmd; + uint32_t mode1; + uint32_t mode2; + uint32_t mode3; + uint32_t d1; + uint32_t d2; + uint32_t d3; + uint32_t d4; +} PolyHeader; + +enum GPUCommand { + GPU_CMD_POLYHDR = 0x80840000, + GPU_CMD_VERTEX = 0xe0000000, + GPU_CMD_VERTEX_EOL = 0xf0000000, + GPU_CMD_USERCLIP = 0x20000000, + GPU_CMD_MODIFIER = 0x80000000, + GPU_CMD_SPRITE = 0xA0000000 +}; + +typedef float Matrix4x4[16]; + +void SceneBegin(); + +void SceneListBegin(GPUList list); +void SceneListSubmit(Vertex* v2, int n); +void SceneListFinish(); + +void SceneFinish(); + +#define GPU_TA_CMD_TYPE_SHIFT 24 +#define GPU_TA_CMD_TYPE_MASK (7 << GPU_TA_CMD_TYPE_SHIFT) + +#define GPU_TA_CMD_USERCLIP_SHIFT 16 +#define GPU_TA_CMD_USERCLIP_MASK (3 << GPU_TA_CMD_USERCLIP_SHIFT) + +#define GPU_TA_CMD_CLRFMT_SHIFT 4 +#define GPU_TA_CMD_CLRFMT_MASK (7 << GPU_TA_CMD_CLRFMT_SHIFT) + +#define GPU_TA_CMD_SPECULAR_SHIFT 2 +#define GPU_TA_CMD_SPECULAR_MASK (1 << GPU_TA_CMD_SPECULAR_SHIFT) + +#define GPU_TA_CMD_SHADE_SHIFT 1 +#define GPU_TA_CMD_SHADE_MASK (1 << GPU_TA_CMD_SHADE_SHIFT) + +#define GPU_TA_CMD_UVFMT_SHIFT 0 +#define GPU_TA_CMD_UVFMT_MASK (1 << GPU_TA_CMD_UVFMT_SHIFT) + +#define GPU_TA_CMD_MODIFIER_SHIFT 7 +#define GPU_TA_CMD_MODIFIER_MASK (1 << GPU_TA_CMD_MODIFIER_SHIFT) + +#define GPU_TA_CMD_MODIFIERMODE_SHIFT 6 +#define GPU_TA_CMD_MODIFIERMODE_MASK (1 << GPU_TA_CMD_MODIFIERMODE_SHIFT) + +#define GPU_TA_PM1_DEPTHCMP_SHIFT 29 +#define GPU_TA_PM1_DEPTHCMP_MASK (7 << 
GPU_TA_PM1_DEPTHCMP_SHIFT) + +#define GPU_TA_PM1_CULLING_SHIFT 27 +#define GPU_TA_PM1_CULLING_MASK (3 << GPU_TA_PM1_CULLING_SHIFT) + +#define GPU_TA_PM1_DEPTHWRITE_SHIFT 26 +#define GPU_TA_PM1_DEPTHWRITE_MASK (1 << GPU_TA_PM1_DEPTHWRITE_SHIFT) + +#define GPU_TA_PM1_TXRENABLE_SHIFT 25 +#define GPU_TA_PM1_TXRENABLE_MASK (1 << GPU_TA_PM1_TXRENABLE_SHIFT) + +#define GPU_TA_PM1_MODIFIERINST_SHIFT 29 +#define GPU_TA_PM1_MODIFIERINST_MASK (3 << GPU_TA_PM1_MODIFIERINST_SHIFT) + +#define GPU_TA_PM2_SRCBLEND_SHIFT 29 +#define GPU_TA_PM2_SRCBLEND_MASK (7 << GPU_TA_PM2_SRCBLEND_SHIFT) + +#define GPU_TA_PM2_DSTBLEND_SHIFT 26 +#define GPU_TA_PM2_DSTBLEND_MASK (7 << GPU_TA_PM2_DSTBLEND_SHIFT) + +#define GPU_TA_PM2_SRCENABLE_SHIFT 25 +#define GPU_TA_PM2_SRCENABLE_MASK (1 << GPU_TA_PM2_SRCENABLE_SHIFT) + +#define GPU_TA_PM2_DSTENABLE_SHIFT 24 +#define GPU_TA_PM2_DSTENABLE_MASK (1 << GPU_TA_PM2_DSTENABLE_SHIFT) + +#define GPU_TA_PM2_FOG_SHIFT 22 +#define GPU_TA_PM2_FOG_MASK (3 << GPU_TA_PM2_FOG_SHIFT) + +#define GPU_TA_PM2_CLAMP_SHIFT 21 +#define GPU_TA_PM2_CLAMP_MASK (1 << GPU_TA_PM2_CLAMP_SHIFT) + +#define GPU_TA_PM2_ALPHA_SHIFT 20 +#define GPU_TA_PM2_ALPHA_MASK (1 << GPU_TA_PM2_ALPHA_SHIFT) + +#define GPU_TA_PM2_TXRALPHA_SHIFT 19 +#define GPU_TA_PM2_TXRALPHA_MASK (1 << GPU_TA_PM2_TXRALPHA_SHIFT) + +#define GPU_TA_PM2_UVFLIP_SHIFT 17 +#define GPU_TA_PM2_UVFLIP_MASK (3 << GPU_TA_PM2_UVFLIP_SHIFT) + +#define GPU_TA_PM2_UVCLAMP_SHIFT 15 +#define GPU_TA_PM2_UVCLAMP_MASK (3 << GPU_TA_PM2_UVCLAMP_SHIFT) + +#define GPU_TA_PM2_FILTER_SHIFT 12 +#define GPU_TA_PM2_FILTER_MASK (7 << GPU_TA_PM2_FILTER_SHIFT) + +#define GPU_TA_PM2_MIPBIAS_SHIFT 8 +#define GPU_TA_PM2_MIPBIAS_MASK (15 << GPU_TA_PM2_MIPBIAS_SHIFT) + +#define GPU_TA_PM2_TXRENV_SHIFT 6 +#define GPU_TA_PM2_TXRENV_MASK (3 << GPU_TA_PM2_TXRENV_SHIFT) + +#define GPU_TA_PM2_USIZE_SHIFT 3 +#define GPU_TA_PM2_USIZE_MASK (7 << GPU_TA_PM2_USIZE_SHIFT) + +#define GPU_TA_PM2_VSIZE_SHIFT 0 +#define GPU_TA_PM2_VSIZE_MASK (7 << GPU_TA_PM2_VSIZE_SHIFT) + +#define GPU_TA_PM3_MIPMAP_SHIFT 31 +#define GPU_TA_PM3_MIPMAP_MASK (1 << GPU_TA_PM3_MIPMAP_SHIFT) + +#define GPU_TA_PM3_TXRFMT_SHIFT 0 +#define GPU_TA_PM3_TXRFMT_MASK 0xffffffff + +static inline int DimensionFlag(const int w) { + switch(w) { + case 16: return 1; + case 32: return 2; + case 64: return 3; + case 128: return 4; + case 256: return 5; + case 512: return 6; + case 1024: return 7; + case 8: + default: + return 0; + } +} + +/* Compile a polygon context into a polygon header */ +static inline void CompilePolyHeader(PolyHeader *dst, const PolyContext *src) { + uint32_t txr_base; + + /* Basically we just take each parameter, clip it, shift it + into place, and OR it into the final result. 
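+       For example, GPU_LIST_TR_POLY (2) shifted left by GPU_TA_CMD_TYPE_SHIFT
+       (24) contributes 0x02000000 to cmd, and the AND with GPU_TA_CMD_TYPE_MASK
+       discards anything that would spill outside the 3-bit field.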
*/ + + /* The base values for CMD */ + dst->cmd = GPU_CMD_POLYHDR; + + dst->cmd |= src->txr.enable << 3; + + /* Or in the list type, shading type, color and UV formats */ + dst->cmd |= (src->list_type << GPU_TA_CMD_TYPE_SHIFT) & GPU_TA_CMD_TYPE_MASK; + dst->cmd |= (src->fmt.color << GPU_TA_CMD_CLRFMT_SHIFT) & GPU_TA_CMD_CLRFMT_MASK; + dst->cmd |= (src->gen.shading << GPU_TA_CMD_SHADE_SHIFT) & GPU_TA_CMD_SHADE_MASK; + dst->cmd |= (src->fmt.uv << GPU_TA_CMD_UVFMT_SHIFT) & GPU_TA_CMD_UVFMT_MASK; + dst->cmd |= (src->gen.clip_mode << GPU_TA_CMD_USERCLIP_SHIFT) & GPU_TA_CMD_USERCLIP_MASK; + dst->cmd |= (src->fmt.modifier << GPU_TA_CMD_MODIFIER_SHIFT) & GPU_TA_CMD_MODIFIER_MASK; + dst->cmd |= (src->gen.modifier_mode << GPU_TA_CMD_MODIFIERMODE_SHIFT) & GPU_TA_CMD_MODIFIERMODE_MASK; + dst->cmd |= (src->gen.specular << GPU_TA_CMD_SPECULAR_SHIFT) & GPU_TA_CMD_SPECULAR_MASK; + + /* Polygon mode 1 */ + dst->mode1 = (src->depth.comparison << GPU_TA_PM1_DEPTHCMP_SHIFT) & GPU_TA_PM1_DEPTHCMP_MASK; + dst->mode1 |= (src->gen.culling << GPU_TA_PM1_CULLING_SHIFT) & GPU_TA_PM1_CULLING_MASK; + dst->mode1 |= (src->depth.write << GPU_TA_PM1_DEPTHWRITE_SHIFT) & GPU_TA_PM1_DEPTHWRITE_MASK; + dst->mode1 |= (src->txr.enable << GPU_TA_PM1_TXRENABLE_SHIFT) & GPU_TA_PM1_TXRENABLE_MASK; + + /* Polygon mode 2 */ + dst->mode2 = (src->blend.src << GPU_TA_PM2_SRCBLEND_SHIFT) & GPU_TA_PM2_SRCBLEND_MASK; + dst->mode2 |= (src->blend.dst << GPU_TA_PM2_DSTBLEND_SHIFT) & GPU_TA_PM2_DSTBLEND_MASK; + dst->mode2 |= (src->blend.src_enable << GPU_TA_PM2_SRCENABLE_SHIFT) & GPU_TA_PM2_SRCENABLE_MASK; + dst->mode2 |= (src->blend.dst_enable << GPU_TA_PM2_DSTENABLE_SHIFT) & GPU_TA_PM2_DSTENABLE_MASK; + dst->mode2 |= (src->gen.fog_type << GPU_TA_PM2_FOG_SHIFT) & GPU_TA_PM2_FOG_MASK; + dst->mode2 |= (src->gen.color_clamp << GPU_TA_PM2_CLAMP_SHIFT) & GPU_TA_PM2_CLAMP_MASK; + dst->mode2 |= (src->gen.alpha << GPU_TA_PM2_ALPHA_SHIFT) & GPU_TA_PM2_ALPHA_MASK; + + if(src->txr.enable == GPU_TEXTURE_DISABLE) { + dst->mode3 = 0; + } + else { + dst->mode2 |= (src->txr.alpha << GPU_TA_PM2_TXRALPHA_SHIFT) & GPU_TA_PM2_TXRALPHA_MASK; + dst->mode2 |= (src->txr.uv_flip << GPU_TA_PM2_UVFLIP_SHIFT) & GPU_TA_PM2_UVFLIP_MASK; + dst->mode2 |= (src->txr.uv_clamp << GPU_TA_PM2_UVCLAMP_SHIFT) & GPU_TA_PM2_UVCLAMP_MASK; + dst->mode2 |= (src->txr.filter << GPU_TA_PM2_FILTER_SHIFT) & GPU_TA_PM2_FILTER_MASK; + dst->mode2 |= (src->txr.mipmap_bias << GPU_TA_PM2_MIPBIAS_SHIFT) & GPU_TA_PM2_MIPBIAS_MASK; + dst->mode2 |= (src->txr.env << GPU_TA_PM2_TXRENV_SHIFT) & GPU_TA_PM2_TXRENV_MASK; + + dst->mode2 |= (DimensionFlag(src->txr.width) << GPU_TA_PM2_USIZE_SHIFT) & GPU_TA_PM2_USIZE_MASK; + dst->mode2 |= (DimensionFlag(src->txr.height) << GPU_TA_PM2_VSIZE_SHIFT) & GPU_TA_PM2_VSIZE_MASK; + + /* Polygon mode 3 */ + dst->mode3 = (src->txr.mipmap << GPU_TA_PM3_MIPMAP_SHIFT) & GPU_TA_PM3_MIPMAP_MASK; + dst->mode3 |= (src->txr.format << GPU_TA_PM3_TXRFMT_SHIFT) & GPU_TA_PM3_TXRFMT_MASK; + + /* Convert the texture address */ + txr_base = (uint32_t) src->txr.base; + txr_base = (txr_base & 0x00fffff8) >> 3; + dst->mode3 |= txr_base; + } + + if(src->fmt.modifier && src->gen.modifier_mode) { + /* If we're affected by a modifier volume, silently promote the header + to the one that is affected by a modifier volume. 
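+       (Headers for modifier-affected geometry carry a second set of mode
+       words, which is presumably why mode2/mode3 are duplicated into d1/d2
+       here.)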
*/ + dst->d1 = dst->mode2; + dst->d2 = dst->mode3; + } + else { + dst->d1 = dst->d2 = 0xffffffff; + } + + dst->d3 = dst->d4 = 0xffffffff; +} + +#include "sh4.h" diff --git a/third_party/gldc/src/private.h b/third_party/gldc/src/private.h new file mode 100644 index 000000000..9ebba2332 --- /dev/null +++ b/third_party/gldc/src/private.h @@ -0,0 +1,323 @@ +#ifndef PRIVATE_H +#define PRIVATE_H + +#include +#include + +#include "gl_assert.h" +#include "platform.h" +#include "types.h" + +#include "../include/gldc.h" + +#include "aligned_vector.h" +#include "named_array.h" + +/* This figure is derived from the needs of Quake 1 */ +#define MAX_TEXTURE_COUNT 1088 + + +extern void* memcpy4 (void *dest, const void *src, size_t count); + +#define GL_NO_INSTRUMENT inline __attribute__((no_instrument_function)) +#define GL_INLINE_DEBUG GL_NO_INSTRUMENT __attribute__((always_inline)) +#define GL_FORCE_INLINE static GL_INLINE_DEBUG +#define _GL_UNUSED(x) (void)(x) + +#define _PACK4(v) ((v * 0xF) / 0xFF) +#define PACK_ARGB4444(a,r,g,b) (_PACK4(a) << 12) | (_PACK4(r) << 8) | (_PACK4(g) << 4) | (_PACK4(b)) +#define PACK_ARGB8888(a,r,g,b) ( ((a & 0xFF) << 24) | ((r & 0xFF) << 16) | ((g & 0xFF) << 8) | (b & 0xFF) ) +#define PACK_ARGB1555(a,r,g,b) \ + (((GLushort)(a > 0) << 15) | (((GLushort) r >> 3) << 10) | (((GLushort)g >> 3) << 5) | ((GLushort)b >> 3)) + +#define PACK_RGB565(r,g,b) \ + ((((GLushort)r & 0xf8) << 8) | (((GLushort) g & 0xfc) << 3) | ((GLushort) b >> 3)) + +#define TRACE_ENABLED 0 +#define TRACE() if(TRACE_ENABLED) {fprintf(stderr, "%s\n", __func__);} (void) 0 + +#define VERTEX_ENABLED_FLAG (1 << 0) +#define UV_ENABLED_FLAG (1 << 1) +#define ST_ENABLED_FLAG (1 << 2) +#define DIFFUSE_ENABLED_FLAG (1 << 3) + +#define MAX_TEXTURE_SIZE 1024 + +typedef struct { + unsigned int flags; /* Constant PVR_CMD_USERCLIP */ + unsigned int d1, d2, d3; /* Ignored for this type */ + unsigned int sx, /* Start x */ + sy, /* Start y */ + ex, /* End x */ + ey; /* End y */ +} PVRTileClipCommand; /* Tile Clip command for the pvr */ + +typedef struct { + unsigned int list_type; + AlignedVector vector; +} PolyList; + +typedef struct { + GLint x; + GLint y; + GLint width; + GLint height; + + float x_plus_hwidth; + float y_plus_hheight; + float hwidth; /* width * 0.5f */ + float hheight; /* height * 0.5f */ +} Viewport; + +extern Viewport VIEWPORT; + +typedef struct { + //0 + GLuint index; + GLuint color; /* This is the PVR texture format */ + //8 + GLenum minFilter; + GLenum magFilter; + //16 + GLvoid *data; + //20 + GLushort width; + GLushort height; + //24 + GLushort mipmap; /* Bitmask of supplied mipmap levels */ + // 26 + GLushort _pad0; + // 28 + GLubyte mipmap_bias; + GLubyte env; + GLubyte _pad1; + GLubyte _pad2; + //32 + GLubyte padding[32]; // Pad to 64-bytes +} __attribute__((aligned(32))) TextureObject; + + +#define argbcpy(dst, src) \ + *((GLuint*) dst) = *((const GLuint*) src) \ + + +typedef struct { + float xy[2]; +} _glvec2; + +typedef struct { + float xyz[3]; +} _glvec3; + +typedef struct { + float xyzw[4]; +} _glvec4; + +#define vec2cpy(dst, src) \ + *((_glvec2*) dst) = *((_glvec2*) src) + +#define vec3cpy(dst, src) \ + *((_glvec3*) dst) = *((_glvec3*) src) + +#define vec4cpy(dst, src) \ + *((_glvec4*) dst) = *((_glvec4*) src) + +GL_FORCE_INLINE float clamp(float d, float min, float max) { + return (d < min) ? min : (d > max) ? 
max : d; +} + +GL_FORCE_INLINE void memcpy_vertex(Vertex *dest, const Vertex *src) { +#ifdef __DREAMCAST__ + _Complex float double_scratch; + + asm volatile ( + "fschg\n\t" + "clrs\n\t" + ".align 2\n\t" + "fmov.d @%[in]+, %[scratch]\n\t" + "fmov.d %[scratch], @%[out]\n\t" + "fmov.d @%[in]+, %[scratch]\n\t" + "add #8, %[out]\n\t" + "fmov.d %[scratch], @%[out]\n\t" + "fmov.d @%[in]+, %[scratch]\n\t" + "add #8, %[out]\n\t" + "fmov.d %[scratch], @%[out]\n\t" + "fmov.d @%[in], %[scratch]\n\t" + "add #8, %[out]\n\t" + "fmov.d %[scratch], @%[out]\n\t" + "fschg\n" + : [in] "+&r" ((uint32_t) src), [scratch] "=&d" (double_scratch), [out] "+&r" ((uint32_t) dest) + : + : "t", "memory" // clobbers + ); +#else + *dest = *src; +#endif +} + +#define swapVertex(a, b) \ +do { \ + Vertex __attribute__((aligned(32))) c; \ + memcpy_vertex(&c, a); \ + memcpy_vertex(a, b); \ + memcpy_vertex(b, &c); \ +} while(0) + +/* Generating PVR vertices from the user-submitted data gets complicated, particularly + * when a realloc could invalidate pointers. This structure holds all the information + * we need on the target vertex array to allow passing around to the various stages (e.g. generate/clip etc.) + */ +typedef struct __attribute__((aligned(32))) { + PolyList* output; + uint32_t header_offset; // The offset of the header in the output list + uint32_t start_offset; // The offset into the output list + uint32_t count; // The number of vertices in this output +} SubmissionTarget; + +Vertex* _glSubmissionTargetStart(SubmissionTarget* target); +Vertex* _glSubmissionTargetEnd(SubmissionTarget* target); + +typedef enum { + CLIP_RESULT_ALL_IN_FRONT, + CLIP_RESULT_ALL_BEHIND, + CLIP_RESULT_ALL_ON_PLANE, + CLIP_RESULT_FRONT_TO_BACK, + CLIP_RESULT_BACK_TO_FRONT +} ClipResult; + + +#define A8IDX 3 +#define R8IDX 2 +#define G8IDX 1 +#define B8IDX 0 + +struct SubmissionTarget; + +PolyList* _glOpaquePolyList(); +PolyList* _glPunchThruPolyList(); +PolyList *_glTransparentPolyList(); + +void _glInitAttributePointers(); +void _glInitContext(); +void _glInitMatrices(); +void _glInitSubmissionTarget(); + +void _glMatrixLoadModelViewProjection(); + +GLubyte _glInitTextures(); + +void _glUpdatePVRTextureContext(PolyContext* context, GLshort textureUnit); + +typedef struct { + const void* ptr; // 4 + GLenum type; // 4 + GLsizei stride; // 4 + GLint size; // 4 +} AttribPointer; + +typedef struct { + AttribPointer vertex; // 16 + AttribPointer colour; // 32 + AttribPointer uv; // 48 + AttribPointer st; // 64 + AttribPointer normal; // 80 + AttribPointer padding; // 96 +} AttribPointerList; + +GLboolean _glCheckValidEnum(GLint param, GLint* values, const char* func); + +GLenum _glGetShadeModel(); + +extern TextureObject* TEXTURE_ACTIVE; +extern GLboolean TEXTURES_ENABLED; + +GLboolean _glIsBlendingEnabled(); +GLboolean _glIsAlphaTestEnabled(); +GLboolean _glIsCullingEnabled(); +GLboolean _glIsDepthTestEnabled(); +GLboolean _glIsDepthWriteEnabled(); +GLboolean _glIsScissorTestEnabled(); +GLboolean _glIsFogEnabled(); +GLenum _glGetDepthFunc(); +GLenum _glGetCullFace(); +GLenum _glGetFrontFace(); +GLenum _glGetBlendSourceFactor(); +GLenum _glGetBlendDestFactor(); + +extern PolyList OP_LIST; +extern PolyList PT_LIST; +extern PolyList TR_LIST; + +GL_FORCE_INLINE PolyList* _glActivePolyList() { + if(_glIsBlendingEnabled()) { + return &TR_LIST; + } else if(_glIsAlphaTestEnabled()) { + return &PT_LIST; + } else { + return &OP_LIST; + } +} + +extern GLenum LAST_ERROR; +extern char ERROR_FUNCTION[64]; + +GL_FORCE_INLINE const char* 
_glErrorEnumAsString(GLenum error) { + switch(error) { + case GL_INVALID_ENUM: return "GL_INVALID_ENUM"; + case GL_OUT_OF_MEMORY: return "GL_OUT_OF_MEMORY"; + case GL_INVALID_OPERATION: return "GL_INVALID_OPERATION"; + case GL_INVALID_VALUE: return "GL_INVALID_VALUE"; + default: + return "GL_UNKNOWN_ERROR"; + } +} + +GL_FORCE_INLINE void _glKosThrowError(GLenum error, const char *function) { + if(LAST_ERROR == GL_NO_ERROR) { + LAST_ERROR = error; + sprintf(ERROR_FUNCTION, "%s\n", function); + fprintf(stderr, "GL ERROR: %s when calling %s\n", _glErrorEnumAsString(LAST_ERROR), ERROR_FUNCTION); + } +} + +GL_FORCE_INLINE GLubyte _glKosHasError() { + return (LAST_ERROR != GL_NO_ERROR) ? GL_TRUE : GL_FALSE; +} + +GL_FORCE_INLINE void _glKosResetError() { + LAST_ERROR = GL_NO_ERROR; + sprintf(ERROR_FUNCTION, "\n"); +} + +typedef struct { + float n[3]; // 12 bytes + float finalColour[4]; //28 bytes + uint32_t padding; // 32 bytes +} EyeSpaceData; + +unsigned char _glIsClippingEnabled(); +void _glEnableClipping(unsigned char v); + +GLuint _glFreeTextureMemory(); +GLuint _glUsedTextureMemory(); +GLuint _glFreeContiguousTextureMemory(); + +void _glApplyScissor(bool force); + +GLboolean _glNearZClippingEnabled(); + +GLboolean _glGPUStateIsDirty(); +void _glGPUStateMarkClean(); +void _glGPUStateMarkDirty(); + +#define MAX_GLDC_TEXTURE_UNITS 2 + + +/* This is from KOS pvr_buffers.c */ +#define PVR_MIN_Z 0.0001f + +#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MAX(a,b) (((a)>(b))?(a):(b)) +#define CLAMP( X, _MIN, _MAX ) ( (X)<(_MIN) ? (_MIN) : ((X)>(_MAX) ? (_MAX) : (X)) ) + +#endif // PRIVATE_H diff --git a/third_party/gldc/src/sh4.c b/third_party/gldc/src/sh4.c new file mode 100644 index 000000000..76d23b441 --- /dev/null +++ b/third_party/gldc/src/sh4.c @@ -0,0 +1,470 @@ +#include "platform.h" +#include "sh4.h" + + +#define CLIP_DEBUG 0 + +#define PVR_VERTEX_BUF_SIZE 2560 * 256 + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define SQ_BASE_ADDRESS (void*) 0xe0000000 + + +GL_FORCE_INLINE bool glIsVertex(const float flags) { + return flags == GPU_CMD_VERTEX_EOL || flags == GPU_CMD_VERTEX; +} + +GL_FORCE_INLINE bool glIsLastVertex(const float flags) { + return flags == GPU_CMD_VERTEX_EOL; +} + +void InitGPU(_Bool autosort, _Bool fsaa) { + pvr_init_params_t params = { + /* Enable opaque and translucent polygons with size 32 and 32 */ + {PVR_BINSIZE_32, PVR_BINSIZE_0, PVR_BINSIZE_32, PVR_BINSIZE_0, PVR_BINSIZE_32}, + PVR_VERTEX_BUF_SIZE, /* Vertex buffer size */ + 0, /* No DMA */ + fsaa, /* No FSAA */ + (autosort) ? 0 : 1 /* Disable translucent auto-sorting to match traditional GL */ + }; + + pvr_init(¶ms); + + /* If we're PAL and we're NOT VGA, then use 50hz by default. This is the safest + thing to do. If someone wants to force 60hz then they can call vid_set_mode later and hopefully + that'll work... 
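+       (DM_640x480_PAL_IL is the interlaced 50hz PAL mode; VGA and NTSC
+       setups are left untouched at 60hz.)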
*/
+
+    int cable = vid_check_cable();
+    int region = flashrom_get_region();
+
+    if(region == FLASHROM_REGION_EUROPE && cable != CT_VGA) {
+        printf("PAL region without VGA - enabling 50hz\n");
+        vid_set_mode(DM_640x480_PAL_IL, PM_RGB565);
+    }
+}
+
+void SceneBegin() {
+    pvr_wait_ready();
+    pvr_scene_begin();
+}
+
+void SceneListBegin(GPUList list) {
+    pvr_list_begin(list);
+}
+
+GL_FORCE_INLINE float _glFastInvert(float x) {
+    return (1.f / __builtin_sqrtf(x * x));
+}
+
+GL_FORCE_INLINE void _glPerspectiveDivideVertex(Vertex* vertex, const float h) {
+    TRACE();
+
+    const float f = _glFastInvert(vertex->w);
+
+    /* Convert to NDC and apply viewport */
+    vertex->xyz[0] = (vertex->xyz[0] * f * 320) + 320;
+    vertex->xyz[1] = (vertex->xyz[1] * f * -240) + 240;
+
+    /* Orthographic projections need to use invZ otherwise we lose
+    the depth information. As w == 1, and clip-space range is -w to +w
+    we add 1.0 to the Z to bring it into range. We add a little extra to
+    avoid a divide by zero.
+    */
+    if(vertex->w == 1.0f) {
+        vertex->xyz[2] = _glFastInvert(1.0001f + vertex->xyz[2]);
+    } else {
+        vertex->xyz[2] = f;
+    }
+}
+
+
+volatile uint32_t *sq = SQ_BASE_ADDRESS;
+
+static inline void _glFlushBuffer() {
+    TRACE();
+
+    /* Wait for both store queues to complete */
+    sq = (uint32_t*) 0xe0000000;
+    sq[0] = sq[8] = 0;
+}
+
+static inline void _glPushHeaderOrVertex(Vertex* v) {
+    TRACE();
+
+    uint32_t* s = (uint32_t*) v;
+    sq[0] = *(s++);
+    sq[1] = *(s++);
+    sq[2] = *(s++);
+    sq[3] = *(s++);
+    sq[4] = *(s++);
+    sq[5] = *(s++);
+    sq[6] = *(s++);
+    sq[7] = *(s++);
+    __asm__("pref @%0" : : "r"(sq));
+    sq += 8;
+}
+
+static inline void _glClipEdge(const Vertex* const v1, const Vertex* const v2, Vertex* vout) {
+    static const float o = 0.003921569f; // 1 / 255
+    const float d0 = v1->w + v1->xyz[2];
+    const float d1 = v2->w + v2->xyz[2];
+    const float t = (fabs(d0) * (1.0f / sqrtf((d1 - d0) * (d1 - d0)))) + 0.000001f;
+    const float invt = 1.0f - t;
+
+    vout->xyz[0] = invt * v1->xyz[0] + t * v2->xyz[0];
+    vout->xyz[1] = invt * v1->xyz[1] + t * v2->xyz[1];
+    vout->xyz[2] = invt * v1->xyz[2] + t * v2->xyz[2];
+
+    vout->uv[0] = invt * v1->uv[0] + t * v2->uv[0];
+    vout->uv[1] = invt * v1->uv[1] + t * v2->uv[1];
+
+    vout->w = invt * v1->w + t * v2->w;
+
+    const float m = 255 * t;
+    const float n = 255 - m;
+
+    vout->bgra[0] = (v1->bgra[0] * n + v2->bgra[0] * m) * o;
+    vout->bgra[1] = (v1->bgra[1] * n + v2->bgra[1] * m) * o;
+    vout->bgra[2] = (v1->bgra[2] * n + v2->bgra[2] * m) * o;
+    vout->bgra[3] = (v1->bgra[3] * n + v2->bgra[3] * m) * o;
+}
+
+#define SPAN_SORT_CFG 0x005F8030
+static volatile uint32_t* PVR_LMMODE0 = (uint32_t*) 0xA05F6884;
+static volatile uint32_t *PVR_LMMODE1 = (uint32_t*) 0xA05F6888;
+static volatile uint32_t *QACR = (uint32_t*) 0xFF000038;
+
+void SceneListSubmit(Vertex* v2, int n) {
+    TRACE();
+
+    /* You need at least a header, and 3 vertices to render anything */
+    if(n < 4) {
+        return;
+    }
+
+    const float h = GetVideoMode()->height; /* note: the perspective divide above currently hardcodes 640x480 */
+
+    PVR_SET(SPAN_SORT_CFG, 0x0);
+
+    //Set PVR DMA registers
+    *PVR_LMMODE0 = 0;
+    *PVR_LMMODE1 = 0;
+
+    //Set QACR registers
+    QACR[1] = QACR[0] = 0x11;
+
+#if CLIP_DEBUG
+    Vertex* vertex = (Vertex*) v2;
+    for(int i = 0; i < n; ++i) {
+        fprintf(stderr, "{%f, %f, %f, %f}, // %x (%x)\n", vertex[i].xyz[0], vertex[i].xyz[1], vertex[i].xyz[2], vertex[i].w, vertex[i].flags, &vertex[i]);
+    }
+
+    fprintf(stderr, "----\n");
+#endif
+    uint8_t visible_mask = 0;
+    uint8_t counter = 0;
+
+    sq = SQ_BASE_ADDRESS;
+
+    for(int i = 0; i < n; ++i, ++v2) {
+        PREFETCH(v2 + 1);
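+
+        /* 'counter' tracks how many vertices of the current triangle are
+           buffered: headers reset it, GPU_CMD_VERTEX grows it, and once a
+           full triangle (v2 - 2, v2 - 1, v2) is available it is clipped
+           against the near plane and submitted below. */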
switch(v2->flags) { + case GPU_CMD_VERTEX_EOL: + if(counter < 2) { + continue; + } + counter = 0; + break; + case GPU_CMD_VERTEX: + ++counter; + if(counter < 3) { + continue; + } + break; + default: + _glPushHeaderOrVertex(v2); + counter = 0; + continue; + }; + + Vertex* const v0 = v2 - 2; + Vertex* const v1 = v2 - 1; + + visible_mask = ( + (v0->xyz[2] > -v0->w) << 0 | + (v1->xyz[2] > -v1->w) << 1 | + (v2->xyz[2] > -v2->w) << 2 | + (counter == 0) << 3 + ); + + switch(visible_mask) { + case 15: /* All visible, but final vertex in strip */ + { + _glPerspectiveDivideVertex(v0, h); + _glPushHeaderOrVertex(v0); + + _glPerspectiveDivideVertex(v1, h); + _glPushHeaderOrVertex(v1); + + _glPerspectiveDivideVertex(v2, h); + _glPushHeaderOrVertex(v2); + } + break; + case 7: + /* All visible, push the first vertex and move on */ + _glPerspectiveDivideVertex(v0, h); + _glPushHeaderOrVertex(v0); + break; + case 9: + /* First vertex was visible, last in strip */ + { + Vertex __attribute__((aligned(32))) scratch[2]; + Vertex* a = &scratch[0]; + Vertex* b = &scratch[1]; + + _glClipEdge(v0, v1, a); + a->flags = GPU_CMD_VERTEX; + + _glClipEdge(v2, v0, b); + b->flags = GPU_CMD_VERTEX_EOL; + + _glPerspectiveDivideVertex(v0, h); + _glPushHeaderOrVertex(v0); + + _glPerspectiveDivideVertex(a, h); + _glPushHeaderOrVertex(a); + + _glPerspectiveDivideVertex(b, h); + _glPushHeaderOrVertex(b); + } + break; + case 1: + /* First vertex was visible, but not last in strip */ + { + Vertex __attribute__((aligned(32))) scratch[2]; + Vertex* a = &scratch[0]; + Vertex* b = &scratch[1]; + + _glClipEdge(v0, v1, a); + a->flags = GPU_CMD_VERTEX; + + _glClipEdge(v2, v0, b); + b->flags = GPU_CMD_VERTEX; + + _glPerspectiveDivideVertex(v0, h); + _glPushHeaderOrVertex(v0); + + _glPerspectiveDivideVertex(a, h); + _glPushHeaderOrVertex(a); + + _glPerspectiveDivideVertex(b, h); + _glPushHeaderOrVertex(b); + _glPushHeaderOrVertex(b); + } + break; + case 10: + case 2: + /* Second vertex was visible. In self case we need to create a triangle and produce + two new vertices: 1-2, and 2-3. */ + { + Vertex __attribute__((aligned(32))) scratch[3]; + Vertex* a = &scratch[0]; + Vertex* b = &scratch[1]; + Vertex* c = &scratch[2]; + + memcpy_vertex(c, v1); + + _glClipEdge(v0, v1, a); + a->flags = GPU_CMD_VERTEX; + + _glClipEdge(v1, v2, b); + b->flags = v2->flags; + + _glPerspectiveDivideVertex(a, h); + _glPushHeaderOrVertex(a); + + _glPerspectiveDivideVertex(c, h); + _glPushHeaderOrVertex(c); + + _glPerspectiveDivideVertex(b, h); + _glPushHeaderOrVertex(b); + } + break; + case 11: + case 3: /* First and second vertex were visible */ + { + Vertex __attribute__((aligned(32))) scratch[3]; + Vertex* a = &scratch[0]; + Vertex* b = &scratch[1]; + Vertex* c = &scratch[2]; + + memcpy_vertex(c, v1); + + _glClipEdge(v2, v0, b); + b->flags = GPU_CMD_VERTEX; + + _glPerspectiveDivideVertex(v0, h); + _glPushHeaderOrVertex(v0); + + _glClipEdge(v1, v2, a); + a->flags = v2->flags; + + _glPerspectiveDivideVertex(c, h); + _glPushHeaderOrVertex(c); + + _glPerspectiveDivideVertex(b, h); + _glPushHeaderOrVertex(b); + + _glPerspectiveDivideVertex(a, h); + _glPushHeaderOrVertex(c); + _glPushHeaderOrVertex(a); + } + break; + case 12: + case 4: + /* Third vertex was visible. 
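+               Two near-plane intersection points are generated ahead of it;
+               when 'counter' is odd, 'a' is pushed twice so the duplicated
+               vertex keeps the strip's winding order intact.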
*/ + { + Vertex __attribute__((aligned(32))) scratch[3]; + Vertex* a = &scratch[0]; + Vertex* b = &scratch[1]; + Vertex* c = &scratch[2]; + + memcpy_vertex(c, v2); + + _glClipEdge(v2, v0, a); + a->flags = GPU_CMD_VERTEX; + + _glClipEdge(v1, v2, b); + b->flags = GPU_CMD_VERTEX; + + _glPerspectiveDivideVertex(a, h); + _glPushHeaderOrVertex(a); + + if(counter % 2 == 1) { + _glPushHeaderOrVertex(a); + } + + _glPerspectiveDivideVertex(b, h); + _glPushHeaderOrVertex(b); + + _glPerspectiveDivideVertex(c, h); + _glPushHeaderOrVertex(c); + } + break; + case 13: + { + Vertex __attribute__((aligned(32))) scratch[3]; + Vertex* a = &scratch[0]; + Vertex* b = &scratch[1]; + Vertex* c = &scratch[2]; + + memcpy_vertex(c, v2); + c->flags = GPU_CMD_VERTEX; + + _glClipEdge(v0, v1, a); + a->flags = GPU_CMD_VERTEX; + + _glClipEdge(v1, v2, b); + b->flags = GPU_CMD_VERTEX; + + _glPerspectiveDivideVertex(v0, h); + _glPushHeaderOrVertex(v0); + + _glPerspectiveDivideVertex(a, h); + _glPushHeaderOrVertex(a); + + _glPerspectiveDivideVertex(c, h); + _glPushHeaderOrVertex(c); + _glPerspectiveDivideVertex(b, h); + _glPushHeaderOrVertex(b); + + c->flags = GPU_CMD_VERTEX_EOL; + _glPushHeaderOrVertex(c); + } + break; + case 5: /* First and third vertex were visible */ + { + Vertex __attribute__((aligned(32))) scratch[3]; + Vertex* a = &scratch[0]; + Vertex* b = &scratch[1]; + Vertex* c = &scratch[2]; + + memcpy_vertex(c, v2); + c->flags = GPU_CMD_VERTEX; + + _glClipEdge(v0, v1, a); + a->flags = GPU_CMD_VERTEX; + + _glClipEdge(v1, v2, b); + b->flags = GPU_CMD_VERTEX; + + _glPerspectiveDivideVertex(v0, h); + _glPushHeaderOrVertex(v0); + + _glPerspectiveDivideVertex(a, h); + _glPushHeaderOrVertex(a); + + _glPerspectiveDivideVertex(c, h); + _glPushHeaderOrVertex(c); + _glPerspectiveDivideVertex(b, h); + _glPushHeaderOrVertex(b); + _glPushHeaderOrVertex(c); + } + break; + case 14: + case 6: /* Second and third vertex were visible */ + { + Vertex __attribute__((aligned(32))) scratch[4]; + Vertex* a = &scratch[0]; + Vertex* b = &scratch[1]; + Vertex* c = &scratch[2]; + Vertex* d = &scratch[3]; + + memcpy_vertex(c, v1); + memcpy_vertex(d, v2); + + _glClipEdge(v0, v1, a); + a->flags = GPU_CMD_VERTEX; + + _glClipEdge(v2, v0, b); + b->flags = GPU_CMD_VERTEX; + + _glPerspectiveDivideVertex(a, h); + _glPushHeaderOrVertex(a); + + _glPerspectiveDivideVertex(c, h); + _glPushHeaderOrVertex(c); + + _glPerspectiveDivideVertex(b, h); + _glPushHeaderOrVertex(b); + _glPushHeaderOrVertex(c); + + _glPerspectiveDivideVertex(d, h); + _glPushHeaderOrVertex(d); + } + break; + case 8: + default: + break; + } + } + + _glFlushBuffer(); +} + +void SceneListFinish() { + pvr_list_finish(); +} + +void SceneFinish() { + pvr_scene_finish(); +} + +const VideoMode* GetVideoMode() { + static VideoMode mode; + mode.width = vid_mode->width; + mode.height = vid_mode->height; + return &mode; +} diff --git a/third_party/gldc/src/sh4.h b/third_party/gldc/src/sh4.h new file mode 100644 index 000000000..b2eaa4c4e --- /dev/null +++ b/third_party/gldc/src/sh4.h @@ -0,0 +1,119 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "private.h" + +#include "sh4_math.h" + +#ifndef NDEBUG +#define PERF_WARNING(msg) printf("[PERF] %s\n", msg) +#else +#define PERF_WARNING(msg) (void) 0 +#endif + +#ifndef GL_FORCE_INLINE +#define GL_NO_INSTRUMENT inline __attribute__((no_instrument_function)) +#define GL_INLINE_DEBUG GL_NO_INSTRUMENT __attribute__((always_inline)) +#define GL_FORCE_INLINE static GL_INLINE_DEBUG +#endif + 
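+
+/* Illustrative only - not part of the GLdc API. The Scene* functions declared
+   in platform.h and defined in sh4.c are expected to be driven once per frame,
+   roughly like this ('quad' here is a hypothetical prebuilt array whose first
+   element is a polygon header):
+
+       extern Vertex quad[1 + 4];
+
+       void RenderFrame(void) {
+           SceneBegin();
+           SceneListBegin(GPU_LIST_OP_POLY);
+           SceneListSubmit(quad, 1 + 4);   // n counts the header too
+           SceneListFinish();
+           SceneFinish();
+       }
+*/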
+#define PREFETCH(addr) __builtin_prefetch((addr)) + +GL_FORCE_INLINE void* memcpy_fast(void *dest, const void *src, size_t len) { + if(!len) { + return dest; + } + + const uint8_t *s = (uint8_t *)src; + uint8_t *d = (uint8_t *)dest; + + uint32_t diff = (uint32_t)d - (uint32_t)(s + 1); // extra offset because input gets incremented before output is calculated + // Underflow would be like adding a negative offset + + // Can use 'd' as a scratch reg now + asm volatile ( + "clrs\n" // Align for parallelism (CO) - SH4a use "stc SR, Rn" instead with a dummy Rn + ".align 2\n" + "0:\n\t" + "dt %[size]\n\t" // (--len) ? 0 -> T : 1 -> T (EX 1) + "mov.b @%[in]+, %[scratch]\n\t" // scratch = *(s++) (LS 1/2) + "bf.s 0b\n\t" // while(s != nexts) aka while(!T) (BR 1/2) + " mov.b %[scratch], @(%[offset], %[in])\n" // *(datatype_of_s*) ((char*)s + diff) = scratch, where src + diff = dest (LS 1) + : [in] "+&r" ((uint32_t)s), [scratch] "=&r" ((uint32_t)d), [size] "+&r" (len) // outputs + : [offset] "z" (diff) // inputs + : "t", "memory" // clobbers + ); + + return dest; +} + +/* We use sq_cpy if the src and size is properly aligned. We control that the + * destination is properly aligned so we assert that. */ +#define FASTCPY(dst, src, bytes) \ + do { \ + if(bytes % 32 == 0 && ((uintptr_t) src % 4) == 0) { \ + gl_assert(((uintptr_t) dst) % 32 == 0); \ + sq_cpy(dst, src, bytes); \ + } else { \ + memcpy_fast(dst, src, bytes); \ + } \ + } while(0) + + +#define MEMCPY4(dst, src, bytes) memcpy_fast(dst, src, bytes) + +#define MEMSET4(dst, v, size) memset4((dst), (v), (size)) + +#define VEC3_NORMALIZE(x, y, z) vec3f_normalize((x), (y), (z)) +#define VEC3_LENGTH(x, y, z, l) vec3f_length((x), (y), (z), (l)) +#define VEC3_DOT(x1, y1, z1, x2, y2, z2, d) vec3f_dot((x1), (y1), (z1), (x2), (y2), (z2), (d)) + +GL_FORCE_INLINE void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow) { + register float __x __asm__("fr12") = (xyz[0]); + register float __y __asm__("fr13") = (xyz[1]); + register float __z __asm__("fr14") = (xyz[2]); + register float __w __asm__("fr15") = (*w); + + __asm__ __volatile__( + "fldi1 fr15\n" + "ftrv xmtrx,fv12\n" + : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) + : "0" (__x), "1" (__y), "2" (__z), "3" (__w) + ); + + oxyz[0] = __x; + oxyz[1] = __y; + oxyz[2] = __z; + *ow = __w; +} + +void InitGPU(_Bool autosort, _Bool fsaa); + +static inline size_t GPUMemoryAvailable() { + return pvr_mem_available(); +} + +static inline void* GPUMemoryAlloc(size_t size) { + return pvr_mem_malloc(size); +} + +static inline void GPUSetBackgroundColour(float r, float g, float b) { + pvr_set_bg_color(r, g, b); +} + +#define PT_ALPHA_REF 0x011c + +static inline void GPUSetAlphaCutOff(uint8_t val) { + PVR_SET(PT_ALPHA_REF, val); +} + +static inline void GPUSetClearDepth(float v) { + pvr_set_zclip(v); +} diff --git a/third_party/gldc/src/sh4_math.h b/third_party/gldc/src/sh4_math.h new file mode 100644 index 000000000..c12c30d99 --- /dev/null +++ b/third_party/gldc/src/sh4_math.h @@ -0,0 +1,1820 @@ +// ---- sh4_math.h - SH7091 Math Module ---- +// +// Version 1.1.3 +// +// This file is part of the DreamHAL project, a hardware abstraction library +// primarily intended for use on the SH7091 found in hardware such as the SEGA +// Dreamcast game console. +// +// This math module is hereby released into the public domain in the hope that it +// may prove useful. Now go hit 60 fps! 
:) +// +// --Moopthehedgehog +// + +// Notes: +// - GCC 4 users have a different return type for the fsca functions due to an +// internal compiler error regarding complex numbers; no issue under GCC 9.2.0 +// - Using -m4 instead of -m4-single-only completely breaks the matrix and +// vector operations +// - Function inlining must be enabled and not blocked by compiler options such +// as -ffunction-sections, as blocking inlining will result in significant +// performance degradation for the vector and matrix functions employing a +// RETURN_VECTOR_STRUCT return type. I have added compiler hints and attributes +// "static inline __attribute__((always_inline))" to mitigate this, so in most +// cases the functions should be inlined regardless. If in doubt, check the +// compiler asm output! +// + +#ifndef __SH4_MATH_H_ +#define __SH4_MATH_H_ + +#define GNUC_FSCA_ERROR_VERSION 4 + +// +// Fast SH4 hardware math functions +// +// +// High-accuracy users beware, the fsrra functions have an error of +/- 2^-21 +// per http://www.shared-ptr.com/sh_insns.html +// + +//============================================================================== +// Definitions +//============================================================================== +// +// Structures, useful definitions, and reference comments +// + +// Front matrix format: +// +// FV0 FV4 FV8 FV12 +// --- --- --- ---- +// [ fr0 fr4 fr8 fr12 ] +// [ fr1 fr5 fr9 fr13 ] +// [ fr2 fr6 fr10 fr14 ] +// [ fr3 fr7 fr11 fr15 ] +// +// Back matrix, XMTRX, is similar, although it has no FVn vector groups: +// +// [ xf0 xf4 xf8 xf12 ] +// [ xf1 xf5 xf9 xf13 ] +// [ xf2 xf6 xf10 xf14 ] +// [ xf3 xf7 xf11 xf15 ] +// + +typedef struct __attribute__((aligned(32))) { + float fr0; + float fr1; + float fr2; + float fr3; + float fr4; + float fr5; + float fr6; + float fr7; + float fr8; + float fr9; + float fr10; + float fr11; + float fr12; + float fr13; + float fr14; + float fr15; +} ALL_FLOATS_STRUCT; + +// Return structs should be defined locally so that GCC optimizes them into +// register usage instead of memory accesses. +typedef struct { + float z1; + float z2; + float z3; + float z4; +} RETURN_VECTOR_STRUCT; + +#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION +typedef struct { + float sine; + float cosine; +} RETURN_FSCA_STRUCT; +#endif + +// Identity Matrix +// +// FV0 FV4 FV8 FV12 +// --- --- --- ---- +// [ 1 0 0 0 ] +// [ 0 1 0 0 ] +// [ 0 0 1 0 ] +// [ 0 0 0 1 ] +// + +static const ALL_FLOATS_STRUCT MATH_identity_matrix = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f}; + +// Constants +#define MATH_pi 3.14159265358979323846264338327950288419716939937510f +#define MATH_e 2.71828182845904523536028747135266249775724709369995f +#define MATH_phi 1.61803398874989484820458683436563811772030917980576f + +//============================================================================== +// Basic math functions +//============================================================================== +// +// The following functions are available. +// Please see their definitions for other usage info, otherwise they may not +// work for you. 
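+//
+// (One handy composition of the functions below, assuming some float 'v':
+// clamping v to [0, 1] without touching libm is
+// MATH_Fast_Fminf(MATH_Fast_Fmaxf(v, 0.0f), 1.0f) - just keep NaNs out.)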
+// +/* + // |x| + float MATH_fabs(float x) + + // sqrt(x) + float MATH_fsqrt(float x) + + // a*b+c + float MATH_fmac(float a, float b, float c) + + // a*b-c + float MATH_fmac_Dec(float a, float b, float c) + + // fminf() - return the min of two floats + // This doesn't check for NaN + float MATH_Fast_Fminf(float a, float b) + + // fmaxf() - return the max of two floats + // This doesn't check for NaN + float MATH_Fast_Fmaxf(float a, float b) + + // Fast floorf() - return the nearest integer <= x as a float + // This doesn't check for NaN + float MATH_Fast_Floorf(float x) + + // Fast ceilf() - return the nearest integer >= x as a float + // This doesn't check for NaN + float MATH_Fast_Ceilf(float x) + + // Very fast floorf() - return the nearest integer <= x as a float + // Inspired by a cool trick I came across here: + // https://www.codeproject.com/Tips/700780/Fast-floor-ceiling-functions + // This doesn't check for NaN + float MATH_Very_Fast_Floorf(float x) + + // Very fast ceilf() - return the nearest integer >= x as a float + // Inspired by a cool trick I came across here: + // https://www.codeproject.com/Tips/700780/Fast-floor-ceiling-functions + // This doesn't check for NaN + float MATH_Very_Fast_Ceilf(float x) +*/ + +// |x| +// This one works on ARM and x86, too! +static inline __attribute__((always_inline)) float MATH_fabs(float x) +{ + asm volatile ("fabs %[floatx]\n" + : [floatx] "+f" (x) // outputs, "+" means r/w + : // no inputs + : // no clobbers + ); + + return x; +} + +// sqrt(x) +// This one works on ARM and x86, too! +// NOTE: There is a much faster version (MATH_Fast_Sqrt()) in the fsrra section of +// this file. Chances are you probably want that one. +static inline __attribute__((always_inline)) float MATH_fsqrt(float x) +{ + asm volatile ("fsqrt %[floatx]\n" + : [floatx] "+f" (x) // outputs, "+" means r/w + : // no inputs + : // no clobbers + ); + + return x; +} + +// a*b+c +static inline __attribute__((always_inline)) float MATH_fmac(float a, float b, float c) +{ + asm volatile ("fmac fr0, %[floatb], %[floatc]\n" + : [floatc] "+f" (c) // outputs, "+" means r/w + : "w" (a), [floatb] "f" (b) // inputs + : // no clobbers + ); + + return c; +} + +// a*b-c +static inline __attribute__((always_inline)) float MATH_fmac_Dec(float a, float b, float c) +{ + asm volatile ("fneg %[floatc]\n\t" + "fmac fr0, %[floatb], %[floatc]\n" + : [floatc] "+&f" (c) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed + : "w" (a), [floatb] "f" (b) // inputs + : // no clobbers + ); + + return c; +} + +// Fast fminf() - return the min of two floats +// This doesn't check for NaN +static inline __attribute__((always_inline)) float MATH_Fast_Fminf(float a, float b) +{ + float output_float; + + asm volatile ( + "fcmp/gt %[floata], %[floatb]\n\t" // b > a (NaN evaluates to !GT; 0 -> T) + "bt.s 1f\n\t" // yes, a is smaller + " fmov %[floata], %[float_out]\n\t" // so return a + "fmov %[floatb], %[float_out]\n" // no, either b is smaller or they're equal and it doesn't matter + "1:\n" + : [float_out] "=&f" (output_float) // outputs + : [floata] "f" (a), [floatb] "f" (b) // inputs + : "t" // clobbers + ); + + return output_float; +} + +// Fast fmaxf() - return the max of two floats +// This doesn't check for NaN +static inline __attribute__((always_inline)) float MATH_Fast_Fmaxf(float a, float b) +{ + float output_float; + + asm volatile ( + "fcmp/gt %[floata], %[floatb]\n\t" // b > a (NaN evaluates to !GT; 0 -> T) + "bt.s 1f\n\t" // yes, a is smaller + " fmov %[floatb], 
%[float_out]\n\t" // so return b + "fmov %[floata], %[float_out]\n" // no, either a is bigger or they're equal and it doesn't matter + "1:\n" + : [float_out] "=&f" (output_float) // outputs + : [floata] "f" (a), [floatb] "f" (b) // inputs + : "t" // clobbers + ); + + return output_float; +} + +// Fast floorf() - return the nearest integer <= x as a float +// This doesn't check for NaN +static inline __attribute__((always_inline)) float MATH_Fast_Floorf(float x) +{ + float output_float; + + // To hold -1.0f + float minus_one; + + asm volatile ( + "fldi1 %[minus_1]\n\t" + "fneg %[minus_1]\n\t" + "fcmp/gt %[minus_1], %[floatx]\n\t" // x >= 0 + "ftrc %[floatx], fpul\n\t" // convert float to int + "bt.s 1f\n\t" + " float fpul, %[float_out]\n\t" // convert int to float + "fadd %[minus_1], %[float_out]\n" // if input x < 0, subtract 1.0 + "1:\n" + : [minus_1] "=&f" (minus_one), [float_out] "=f" (output_float) + : [floatx] "f" (x) + : "fpul", "t" + ); + + return output_float; +} + +// Fast ceilf() - return the nearest integer >= x as a float +// This doesn't check for NaN +static inline __attribute__((always_inline)) float MATH_Fast_Ceilf(float x) +{ + float output_float; + + // To hold 0.0f and 1.0f + float zero_one; + + asm volatile ( + "fldi0 %[zero_1]\n\t" + "fcmp/gt %[zero_1], %[floatx]\n\t" // x > 0 + "ftrc %[floatx], fpul\n\t" // convert float to int + "bf.s 1f\n\t" + " float fpul, %[float_out]\n\t" // convert int to float + "fldi1 %[zero_1]\n\t" + "fadd %[zero_1], %[float_out]\n" // if input x > 0, add 1.0 + "1:\n" + : [zero_1] "=&f" (zero_one), [float_out] "=f" (output_float) + : [floatx] "f" (x) + : "fpul", "t" + ); + + return output_float; +} + +// Very fast floorf() - return the nearest integer <= x as a float +// Inspired by a cool trick I came across here: +// https://www.codeproject.com/Tips/700780/Fast-floor-ceiling-functions +// This doesn't check for NaN +static inline __attribute__((always_inline)) float MATH_Very_Fast_Floorf(float x) +{ + float output_float; + unsigned int scratch_reg; + unsigned int scratch_reg2; + + // 0x4f000000 == 2^31 in float -- 0x4f << 24 is INT_MAX + 1.0f + // 0x80000000 == -2^31 == INT_MIN == -(INT_MAX + 1.0f) + + // floor = (float)( (int)(x + (float)2^31) - 2^31) + + asm volatile ( + "mov #0x4f, %[scratch]\n\t" // Build float INT_MAX + 1 as a float using only regs (EX) + "shll16 %[scratch]\n\t" // (EX) + "shll8 %[scratch]\n\t" // (EX) + "lds %[scratch], fpul\n\t" // move float INT_MAX + 1 to float regs (LS) + "mov #1, %[scratch2]\n\t" // Build INT_MIN from scratch in parallel (EX) + "fsts fpul, %[float_out]\n\t" // (LS) + "fadd %[floatx], %[float_out]\n\t" // float-add float INT_MAX + 1 to x (FE) + "rotr %[scratch2]\n\t" // rotate the 1 in bit 0 from LSB to MSB for INT_MIN, clobber T (EX) + "ftrc %[float_out], fpul\n\t" // convert float to int (FE) -- ftrc -> sts is special combo + "sts fpul, %[scratch]\n\t" // move back to int regs (LS) + "add %[scratch2], %[scratch]\n\t" // Add INT_MIN to int (EX) + "lds %[scratch], fpul\n\t" // (LS) -- lds -> float is a special combo + "float fpul, %[float_out]\n" // convert back to float (FE) + : [scratch] "=&r" (scratch_reg), [scratch2] "=&r" (scratch_reg2), [float_out] "=&f" (output_float) + : [floatx] "f" (x) + : "fpul", "t" + ); + + return output_float; +} + +// Very fast ceilf() - return the nearest integer >= x as a float +// Inspired by a cool trick I came across here: +// https://www.codeproject.com/Tips/700780/Fast-floor-ceiling-functions +// This doesn't check for NaN +static inline 
__attribute__((always_inline)) float MATH_Very_Fast_Ceilf(float x)
+{
+  float output_float;
+  unsigned int scratch_reg;
+  unsigned int scratch_reg2;
+
+  // 0x4f000000 == 2^31 in float -- 0x4f << 24 is INT_MAX + 1.0f
+  // 0x80000000 == -2^31 == INT_MIN == -(INT_MAX + 1.0f)
+
+  // Ceiling is the inverse of floor such that f^-1(x) = -f(-x)
+  // To make very fast ceiling have as wide a range as very fast floor,
+  // use this property to subtract x from INT_MAX + 1 and get the negative of the
+  // ceiling, and then negate the final output. This allows ceiling to use
+  // -2^31 and have the same range as very fast floor.
+
+  // Given:
+  // floor = (float)( (int)(x + (float)2^31) - 2^31 )
+  // We can do:
+  // ceiling = -( (float)( (int)((float)2^31 - x) - 2^31 ) )
+  // or (slower on SH4 since 'fneg' is faster than 'neg'):
+  // ceiling = (float) -( (int)((float)2^31 - x) - 2^31 )
+  // Since mathematically these functions are related by f^-1(x) = -f(-x).
+
+  asm volatile (
+    "mov #0x4f, %[scratch]\n\t" // Build float INT_MAX + 1 as a float using only regs (EX)
+    "shll16 %[scratch]\n\t" // (EX)
+    "shll8 %[scratch]\n\t" // (EX)
+    "lds %[scratch], fpul\n\t" // move float INT_MAX + 1 to float regs (LS)
+    "mov #1, %[scratch2]\n\t" // Build INT_MIN from scratch in parallel (EX)
+    "fsts fpul, %[float_out]\n\t" // (LS)
+    "fsub %[floatx], %[float_out]\n\t" // float-sub x from float INT_MAX + 1 (FE)
+    "rotr %[scratch2]\n\t" // rotate the 1 in bit 0 from LSB to MSB for INT_MIN, clobber T (EX)
+    "ftrc %[float_out], fpul\n\t" // convert float to int (FE) -- ftrc -> sts is special combo
+    "sts fpul, %[scratch]\n\t" // move back to int regs (LS)
+    "add %[scratch2], %[scratch]\n\t" // Add INT_MIN to int (EX)
+    "lds %[scratch], fpul\n\t" // (LS) -- lds -> float is a special combo
+    "float fpul, %[float_out]\n\t" // convert back to float (FE)
+    "fneg %[float_out]\n"
+    : [scratch] "=&r" (scratch_reg), [scratch2] "=&r" (scratch_reg2), [float_out] "=&f" (output_float)
+    : [floatx] "f" (x)
+    : "fpul", "t"
+  );
+
+  return output_float;
+}
+
+//==============================================================================
+// Fun with fsrra, which does 1/sqrt(x) in one cycle
+//==============================================================================
+//
+// Error of 'fsrra' is +/- 2^-21 per http://www.shared-ptr.com/sh_insns.html
+//
+// The following functions are available.
+// Please see their definitions for other usage info, otherwise they may not
+// work for you.
+//
+/*
+  // 1/sqrt(x)
+  float MATH_fsrra(float x)
+
+  // 1/x
+  float MATH_Fast_Invert(float x)
+
+  // A faster divide than the 'fdiv' instruction
+  float MATH_Fast_Divide(float numerator, float denominator)
+
+  // A faster square root than the 'fsqrt' instruction
+  float MATH_Fast_Sqrt(float x)
+
+  // Standard, accurate, and slow float divide. Use this if MATH_Fast_Divide() gives you issues.
+  float MATH_Slow_Divide(float numerator, float denominator)
+*/
+
+// 1/sqrt(x)
+static inline __attribute__((always_inline)) float MATH_fsrra(float x)
+{
+  asm volatile ("fsrra %[one_div_sqrt]\n"
+  : [one_div_sqrt] "+f" (x) // outputs, "+" means r/w
+  : // no inputs
+  : // no clobbers
+  );
+
+  return x;
+}
+
+// 1/x
+// 1.0f / sqrt(x^2)
+static inline __attribute__((always_inline)) float MATH_Fast_Invert(float x)
+{
+  int neg = 0;
+
+  if(x < 0.0f)
+  {
+    neg = 1;
+  }
+
+  x = MATH_fsrra(x*x); // 1.0f / sqrt(x^2)
+
+  if(neg)
+  {
+    return -x;
+  }
+  else
+  {
+    return x;
+  }
+}
+
+// It's faster to do this than to use 'fdiv'.
+// Only fdiv can do doubles, however.
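+// Note that everything built on fsrra inherits its ~2^-21 relative error and
+// ignores IEEE corner cases (NaN, infinities, signed zero), so don't lean on
+// those semantics here.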
+// Of course, not having to divide at all is generally the best way to go. :P +static inline __attribute__((always_inline)) float MATH_Fast_Divide(float numerator, float denominator) +{ + denominator = MATH_Fast_Invert(denominator); + return numerator * denominator; +} + +// fast sqrt(x) +// Crazy thing: invert(fsrra(x)) is actually about 3x faster than fsqrt. +static inline __attribute__((always_inline)) float MATH_Fast_Sqrt(float x) +{ + return MATH_Fast_Invert(MATH_fsrra(x)); +} + +// Standard, accurate, and slow float divide. Use this if MATH_Fast_Divide() gives you issues. +// This DOES work on negatives. +static inline __attribute__((always_inline)) float MATH_Slow_Divide(float numerator, float denominator) +{ + asm volatile ("fdiv %[div_denom], %[div_numer]\n" + : [div_numer] "+f" (numerator) // outputs, "+" means r/w + : [div_denom] "f" (denominator) // inputs + : // clobbers + ); + + return numerator; +} + +//============================================================================== +// Fun with fsca, which does simultaneous sine and cosine in 3 cycles +//============================================================================== +// +// NOTE: GCC 4.7 has a bug that prevents it from working with fsca and complex +// numbers in m4-single-only mode, so GCC 4 users will get a RETURN_FSCA_STRUCT +// instead of a _Complex float. This may be much slower in some instances. +// +// VERY IMPORTANT USAGE INFORMATION (sine and cosine functions): +// +// Due to the nature in which the fsca instruction behaves, you MUST do the +// following in your code to get sine and cosine from these functions: +// +// _Complex float sine_cosine = [Call the fsca function here] +// float sine_value = __real__ sine_cosine; +// float cosine_value = __imag__ sine_cosine; +// Your output is now in sine_value and cosine_value. +// +// This is necessary because fsca outputs both sine and cosine simultaneously +// and uses a double register to do so. The fsca functions do not actually +// return a double--they return two floats--and using a complex float here is +// just a bit of hacking the C language to make GCC do something that's legal in +// assembly according to the SH4 calling convention (i.e. multiple return values +// stored in floating point registers FR0-FR3). This is better than using a +// struct of floats for optimization purposes--this will operate at peak +// performance even at -O0, whereas a struct will not be fast at low +// optimization levels due to memory accesses. +// +// Technically you may be able to use the complex return values as a complex +// number if you wanted to, but that's probably not what you're after and they'd +// be flipped anyways (in mathematical convention, sine is the imaginary part). +// + +// Notes: +// - From http://www.shared-ptr.com/sh_insns.html: +// The input angle is specified as a signed fraction in twos complement. +// The result of sin and cos is a single-precision floating-point number. 
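+//   (i.e. the input unit is 1/65536 of a full turn, so 0x4000 is 90 degrees):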
+// 0x7FFFFFFF to 0x00000001: 360×2^15−360/2^16 to 360/2^16 degrees +// 0x00000000: 0 degree +// 0xFFFFFFFF to 0x80000000: −360/2^16 to −360×2^15 degrees +// - fsca format is 2^16 is 360 degrees, so a value of 1 is actually +// 1/182.044444444 of a degree or 1/10430.3783505 of a radian +// - fsca does a %360 automatically for values over 360 degrees +// +// Also: +// In order to make the best use of fsca units, a program must expect them from +// the outset and not "make them" by dividing radians or degrees to get them, +// otherwise it's just giving the 'fsca' instruction radians or degrees! +// + +// The following functions are available. +// Please see their definitions for other usage info, otherwise they may not +// work for you. +// +/* + // For integer input in native fsca units (fastest) + _Complex float MATH_fsca_Int(unsigned int input_int) + + // For integer input in degrees + _Complex float MATH_fsca_Int_Deg(unsigned int input_int) + + // For integer input in radians + _Complex float MATH_fsca_Int_Rad(unsigned int input_int) + + // For float input in native fsca units + _Complex float MATH_fsca_Float(float input_float) + + // For float input in degrees + _Complex float MATH_fsca_Float_Deg(float input_float) + + // For float input in radians + _Complex float MATH_fsca_Float_Rad(float input_float) +*/ + +//------------------------------------------------------------------------------ +#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION +//------------------------------------------------------------------------------ +// +// This set of fsca functions is specifically for old versions of GCC. +// See later for functions for newer versions of GCC. +// + +// +// Integer input (faster) +// + +// For int input, input_int is in native fsca units (fastest) +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Int(unsigned int input_int) +{ + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "r" (input_int) // inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +// For int input, input_int is in degrees +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Int_Deg(unsigned int input_int) +{ + // normalize whole number input degrees to fsca format + input_int = ((1527099483ULL * input_int) >> 23); + + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "r" (input_int) // inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +// For int input, input_int is in radians +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Int_Rad(unsigned int input_int) +{ + // normalize whole number input rads to fsca format + input_int = ((2734261102ULL * input_int) >> 18); + + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "r" (input_int) // 
inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +// +// Float input (slower) +// + +// For float input, input_float is in native fsca units +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Float(float input_float) +{ + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. takes 3 cycles + "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +// For float input, input_float is in degrees +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Float_Deg(float input_float) +{ + input_float *= 182.044444444f; + + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. takes 3 cycles + "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +// For float input, input_float is in radians +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Float_Rad(float input_float) +{ + input_float *= 10430.3783505f; + + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. takes 3 cycles + "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +//------------------------------------------------------------------------------ +#else +//------------------------------------------------------------------------------ +// +// This set of fsca functions is specifically for newer versions of GCC. They +// work fine under GCC 9.2.0. 
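+//
+// A minimal usage sketch (illustrative):
+//
+//   _Complex float sc = MATH_fsca_Int_Deg(90);
+//   float s = __real__ sc; // ~1.0f
+//   float c = __imag__ sc; // ~0.0f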
+// + +// +// Integer input (faster) +// + +// For int input, input_int is in native fsca units (fastest) +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Int(unsigned int input_int) +{ + _Complex float output; + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "r" (input_int) // inputs + : "fpul" // clobbers + ); + + return output; +} + +// For int input, input_int is in degrees +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Int_Deg(unsigned int input_int) +{ + // normalize whole number input degrees to fsca format + input_int = ((1527099483ULL * input_int) >> 23); + + _Complex float output; + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "r" (input_int) // inputs + : "fpul" // clobbers + ); + + return output; +} + +// For int input, input_int is in radians +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Int_Rad(unsigned int input_int) +{ + // normalize whole number input rads to fsca format + input_int = ((2734261102ULL * input_int) >> 18); + + _Complex float output; + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "r" (input_int) // inputs + : "fpul" // clobbers + ); + + return output; +} + +// +// Float input (slower) +// + +// For float input, input_float is in native fsca units +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Float(float input_float) +{ + _Complex float output; + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. takes 3 cycles + "fsca FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + return output; +} + +// For float input, input_float is in degrees +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Float_Deg(float input_float) +{ + input_float *= 182.044444444f; + + _Complex float output; + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. takes 3 cycles + "fsca FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + return output; +} + +// For float input, input_float is in radians +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Float_Rad(float input_float) +{ + input_float *= 10430.3783505f; + + _Complex float output; + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. 
takes 3 cycles + "fsca FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + return output; +} + +//------------------------------------------------------------------------------ +#endif +//------------------------------------------------------------------------------ + +//============================================================================== +// Hardware vector and matrix operations +//============================================================================== +// +// These functions each have very specific usage instructions. Please be sure to +// read them before use or else they won't seem to work right! +// +// The following functions are available. +// Please see their definitions for important usage info, otherwise they may not +// work for you. +// +/* + + //------------------------------------------------------------------------------ + // Vector and matrix math operations + //------------------------------------------------------------------------------ + + // Inner/dot product (4x1 vec . 4x1 vec = scalar) + float MATH_fipr(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4) + + // Sum of Squares (w^2 + x^2 + y^2 + z^2) + float MATH_Sum_of_Squares(float w, float x, float y, float z) + + // Cross product with bonus multiply (vec X vec = orthogonal vec, with an extra a*b=c) + RETURN_VECTOR_STRUCT MATH_Cross_Product_with_Mult(float x1, float x2, float x3, float y1, float y2, float y3, float a, float b) + + // Cross product (vec X vec = orthogonal vec) + RETURN_VECTOR_STRUCT MATH_Cross_Product(float x1, float x2, float x3, float y1, float y2, float y3) + + // Outer product (vec (X) vec = 4x4 matrix) + void MATH_Outer_Product(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4) + + // Matrix transform (4x4 matrix * 4x1 vec = 4x1 vec) + RETURN_VECTOR_STRUCT MATH_Matrix_Transform(float x1, float x2, float x3, float x4) + + // 4x4 Matrix transpose (XMTRX^T) + void MATH_Matrix_Transpose(void) + + // 4x4 Matrix product (XMTRX and one from memory) + void MATH_Matrix_Product(ALL_FLOATS_STRUCT * front_matrix) + + // 4x4 Matrix product (two from memory) + void MATH_Load_Matrix_Product(ALL_FLOATS_STRUCT * matrix1, ALL_FLOATS_STRUCT * matrix2) + + //------------------------------------------------------------------------------ + // Matrix load and store operations + //------------------------------------------------------------------------------ + + // Load 4x4 XMTRX from memory + void MATH_Load_XMTRX(ALL_FLOATS_STRUCT * back_matrix) + + // Store 4x4 XMTRX to memory + ALL_FLOATS_STRUCT * MATH_Store_XMTRX(ALL_FLOATS_STRUCT * destination) +*/ + +//------------------------------------------------------------------------------ +// Vector and matrix math operations +//------------------------------------------------------------------------------ + +// Inner/dot product: vec . vec = scalar +// _ _ +// | y1 | +// [ x1 x2 x3 x4 ] . | y2 | = scalar +// | y3 | +// |_ y4 _| +// +// SH4 calling convention states we get 8 float arguments. Perfect! +static inline __attribute__((always_inline)) float MATH_fipr(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4) +{ + // FR4-FR11 are the regs that are passed in, aka vectors FV4 and FV8. + // Just need to make sure GCC doesn't modify anything, and these register vars do that job. 
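+  // (Worked example: MATH_fipr(1,2,3,4, 5,6,7,8) = 1*5 + 2*6 + 3*7 + 4*8 = 70,
+  //  computed by a single fipr FV4, FV8.)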
+ + // Temporary variables are necessary per GCC to avoid clobbering: + // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables + + float tx1 = x1; + float tx2 = x2; + float tx3 = x3; + float tx4 = x4; + + float ty1 = y1; + float ty2 = y2; + float ty3 = y3; + float ty4 = y4; + + // vector FV4 + register float __x1 __asm__("fr4") = tx1; + register float __x2 __asm__("fr5") = tx2; + register float __x3 __asm__("fr6") = tx3; + register float __x4 __asm__("fr7") = tx4; + + // vector FV8 + register float __y1 __asm__("fr8") = ty1; + register float __y2 __asm__("fr9") = ty2; + register float __y3 __asm__("fr10") = ty3; + register float __y4 __asm__("fr11") = ty4; + + // take care of all the floats in one fell swoop + asm volatile ("fipr FV4, FV8\n" + : "+f" (__y4) // output (gets written to FR11) + : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__x4), "f" (__y1), "f" (__y2), "f" (__y3) // inputs + : // clobbers + ); + + return __y4; +} + +// Sum of Squares +// _ _ +// | w | +// [ w x y z ] . | x | = w^2 + x^2 + y^2 + z^2 = scalar +// | y | +// |_ z _| +// +static inline __attribute__((always_inline)) float MATH_Sum_of_Squares(float w, float x, float y, float z) +{ + // FR4-FR7 are the regs that are passed in, aka vector FV4. + // Just need to make sure GCC doesn't modify anything, and these register vars do that job. + + // Temporary variables are necessary per GCC to avoid clobbering: + // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables + + float tw = w; + float tx = x; + float ty = y; + float tz = z; + + // vector FV4 + register float __w __asm__("fr4") = tw; + register float __x __asm__("fr5") = tx; + register float __y __asm__("fr6") = ty; + register float __z __asm__("fr7") = tz; + + // take care of all the floats in one fell swoop + asm volatile ("fipr FV4, FV4\n" + : "+f" (__z) // output (gets written to FR7) + : "f" (__w), "f" (__x), "f" (__y) // inputs + : // clobbers + ); + + return __z; +} + +// Cross product: vec X vec = orthogonal vec +// _ _ _ _ _ _ +// | x1 | | y1 | | z1 | +// | x2 | X | y2 | = | z2 | +// |_ x3 _| |_ y3 _| |_ z3 _| +// +// With bonus multiply: +// +// a * b = c +// +// IMPORTANT USAGE INFORMATION (cross product): +// +// Return vector struct maps as below to the above diagram: +// +// typedef struct { +// float z1; +// float z2; +// float z3; +// float z4; // c is stored in z4, and c = a*b if using 'with mult' version (else c = 0) +// } RETURN_VECTOR_STRUCT; +// +// For people familiar with the unit vector notation, z1 == 'i', z2 == 'j', +// and z3 == 'k'. +// +// The cross product matrix will also be stored in XMTRX after this, so calling +// MATH_Matrix_Transform() on a vector after using this function will do a cross +// product with the same x1-x3 values and a multiply with the same 'a' value +// as used in this function. In this a situation, 'a' will be multiplied with +// the x4 parameter of MATH_Matrix_Transform(). a = 0 if not using the 'with mult' +// version of the cross product function. +// +// For reference, XMTRX will look like this: +// +// [ 0 -x3 x2 0 ] +// [ x3 0 -x1 0 ] +// [ -x2 x1 0 0 ] +// [ 0 0 0 a ] (<-- a = 0 if not using 'with mult') +// +// Similarly to how the sine and cosine functions use fsca and return 2 floats, +// the cross product functions actually return 4 floats. The first 3 are the +// cross product output, and the 4th is a*b. 
The SH4 only multiplies 4x4 +// matrices with 4x1 vectors, which is why the output is like that--but it means +// we also get a bonus float multiplication while we do our cross product! +// + +// Please do not call this function directly (notice the weird syntax); call +// MATH_Cross_Product() or MATH_Cross_Product_with_Mult() instead. +static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT xMATH_do_Cross_Product_with_Mult(float x3, float a, float y3, float b, float x2, float x1, float y1, float y2) +{ + // FR4-FR11 are the regs that are passed in, in that order. + // Just need to make sure GCC doesn't modify anything, and these register vars do that job. + + // Temporary variables are necessary per GCC to avoid clobbering: + // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables + + float tx1 = x1; + float tx2 = x2; + float tx3 = x3; + float ta = a; + + float ty1 = y1; + float ty2 = y2; + float ty3 = y3; + float tb = b; + + register float __x1 __asm__("fr9") = tx1; // need to negate (need to move to fr6, then negate fr9) + register float __x2 __asm__("fr8") = tx2; // in place for matrix (need to move to fr2 then negate fr2) + register float __x3 __asm__("fr4") = tx3; // need to negate (move to fr1 first, then negate fr4) + register float __a __asm__("fr5") = ta; + + register float __y1 __asm__("fr10") = ty1; + register float __y2 __asm__("fr11") = ty2; + register float __y3 __asm__("fr6") = ty3; + register float __b __asm__("fr7") = tb; + + register float __z1 __asm__("fr0") = 0.0f; // z1 + register float __z2 __asm__("fr1") = 0.0f; // z2 (not moving x3 here yet since a double 0 is needed) + register float __z3 __asm__("fr2") = tx2; // z3 (this handles putting x2 in fr2) + register float __c __asm__("fr3") = 0.0f; // c + + // This actually does a matrix transform to do the cross product. + // It's this: + // _ _ _ _ + // [ 0 -x3 x2 0 ] | y1 | | -x3y2 + x2y3 | + // [ x3 0 -x1 0 ] | y2 | = | x3y1 - x1y3 | + // [ -x2 x1 0 0 ] | y3 | | -x2y1 + x1y2 | + // [ 0 0 0 a ] |_ b _| |_ c _| + // + + asm volatile ( + // set up back bank's FV0 + "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) + + // Save FR12-FR15, which are supposed to be preserved across functions calls. + // This stops them from getting clobbered and saves 4 stack pushes (memory accesses). 
+ "fmov DR12, XD12\n\t" + "fmov DR14, XD14\n\t" + + "fmov DR10, XD0\n\t" // fmov 'y1' and 'y2' from FR10, FR11 into position (XF0, XF1) + "fmov DR6, XD2\n\t" // fmov 'y3' and 'b' from FR6, FR7 into position (XF2, XF3) + + // pair move zeros for some speed in setting up front bank for matrix + "fmov DR0, DR10\n\t" // clear FR10, FR11 + "fmov DR0, DR12\n\t" // clear FR12, FR13 + "fschg\n\t" // switch back to single moves + // prepare front bank for XMTRX + "fmov FR5, FR15\n\t" // fmov 'a' into position + "fmov FR0, FR14\n\t" // clear out FR14 + "fmov FR0, FR7\n\t" // clear out FR7 + "fmov FR0, FR5\n\t" // clear out FR5 + + "fneg FR2\n\t" // set up 'x2' + "fmov FR9, FR6\n\t" // set up 'x1' + "fneg FR9\n\t" + "fmov FR4, FR1\n\t" // set up 'x3' + "fneg FR4\n\t" + // flip banks and matrix multiply + "frchg\n\t" + "ftrv XMTRX, FV0\n" + : "+&w" (__z1), "+&f" (__z2), "+&f" (__z3), "+&f" (__c) // output (using FV0) + : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__y1), "f" (__y2), "f" (__y3), "f" (__a), "f" (__b) // inputs + : // clobbers (all of the float regs get clobbered, except for FR12-FR15 which were specially preserved) + ); + + RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __c}; + return output; +} + +// Please do not call this function directly (notice the weird syntax); call +// MATH_Cross_Product() or MATH_Cross_Product_with_Mult() instead. +static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT xMATH_do_Cross_Product(float x3, float zero, float x1, float y3, float x2, float x1_2, float y1, float y2) +{ + // FR4-FR11 are the regs that are passed in, in that order. + // Just need to make sure GCC doesn't modify anything, and these register vars do that job. + + // Temporary variables are necessary per GCC to avoid clobbering: + // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables + + float tx1 = x1; + float tx2 = x2; + float tx3 = x3; + float tx1_2 = x1_2; + + float ty1 = y1; + float ty2 = y2; + float ty3 = y3; + float tzero = zero; + + register float __x1 __asm__("fr6") = tx1; // in place + register float __x2 __asm__("fr8") = tx2; // in place (fmov to fr2, negate fr2) + register float __x3 __asm__("fr4") = tx3; // need to negate (fmov to fr1, negate fr4) + + register float __zero __asm__("fr5") = tzero; // in place + register float __x1_2 __asm__("fr9") = tx1_2; // need to negate + + register float __y1 __asm__("fr10") = ty1; + register float __y2 __asm__("fr11") = ty2; + // no __y3 needed in this function + + register float __z1 __asm__("fr0") = tzero; // z1 + register float __z2 __asm__("fr1") = tzero; // z2 + register float __z3 __asm__("fr2") = ty3; // z3 + register float __c __asm__("fr3") = tzero; // c + + // This actually does a matrix transform to do the cross product. + // It's this: + // _ _ _ _ + // [ 0 -x3 x2 0 ] | y1 | | -x3y2 + x2y3 | + // [ x3 0 -x1 0 ] | y2 | = | x3y1 - x1y3 | + // [ -x2 x1 0 0 ] | y3 | | -x2y1 + x1y2 | + // [ 0 0 0 0 ] |_ 0 _| |_ 0 _| + // + + asm volatile ( + // zero out FR7. For some reason, if this is done in C after __z3 is set: + // register float __y3 __asm__("fr7") = tzero; + // then GCC will emit a spurious stack push (pushing FR12). So just zero it here. + "fmov FR5, FR7\n\t" + // set up back bank's FV0 + "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) + + // Save FR12-FR15, which are supposed to be preserved across functions calls. + // This stops them from getting clobbered and saves 4 stack pushes (memory accesses). 
+ "fmov DR12, XD12\n\t" + "fmov DR14, XD14\n\t" + + "fmov DR10, XD0\n\t" // fmov 'y1' and 'y2' from FR10, FR11 into position (XF0, XF1) + "fmov DR2, XD2\n\t" // fmov 'y3' and '0' from FR2, FR3 into position (XF2, XF3) + + // pair move zeros for some speed in setting up front bank for matrix + "fmov DR0, DR10\n\t" // clear FR10, FR11 + "fmov DR0, DR12\n\t" // clear FR12, FR13 + "fmov DR0, DR14\n\t" // clear FR14, FR15 + "fschg\n\t" // switch back to single moves + // prepare front bank for XMTRX + "fneg FR9\n\t" // set up 'x1' + "fmov FR8, FR2\n\t" // set up 'x2' + "fneg FR2\n\t" + "fmov FR4, FR1\n\t" // set up 'x3' + "fneg FR4\n\t" + // flip banks and matrix multiply + "frchg\n\t" + "ftrv XMTRX, FV0\n" + : "+&w" (__z1), "+&f" (__z2), "+&f" (__z3), "+&f" (__c) // output (using FV0) + : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__y1), "f" (__y2), "f" (__zero), "f" (__x1_2) // inputs + : "fr7" // clobbers (all of the float regs get clobbered, except for FR12-FR15 which were specially preserved) + ); + + RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __c}; + return output; +} + +//------------------------------------------------------------------------------ +// Functions that wrap the xMATH_do_Cross_Product[_with_Mult]() functions to make +// it easier to organize parameters +//------------------------------------------------------------------------------ + +// Cross product with a bonus float multiply (c = a * b) +static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Cross_Product_with_Mult(float x1, float x2, float x3, float y1, float y2, float y3, float a, float b) +{ + return xMATH_do_Cross_Product_with_Mult(x3, a, y3, b, x2, x1, y1, y2); +} + +// Plain cross product; does not use the bonus float multiply (c = 0 and a in the cross product matrix will be 0) +// This is a tiny bit faster than 'with_mult' (about 2 cycles faster) +static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Cross_Product(float x1, float x2, float x3, float y1, float y2, float y3) +{ + return xMATH_do_Cross_Product(x3, 0.0f, x1, y3, x2, x1, y1, y2); +} + +// Outer product: vec (X) vec = matrix +// _ _ +// | x1 | +// | x2 | (X) [ y1 y2 y3 y4 ] = 4x4 matrix +// | x3 | +// |_ x4 _| +// +// This returns the floats in the back bank (XF0-15), which are inaccessible +// outside of using frchg or paired-move fmov. It's meant to set up a matrix for +// use with other matrix functions. GCC also does not touch the XFn bank. +// This will also wipe out anything stored in the float registers, as it uses the +// whole FPU register file (all 32 of the float registers). +static inline __attribute__((always_inline)) void MATH_Outer_Product(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4) +{ + // FR4-FR11 are the regs that are passed in, in that order. + // Just need to make sure GCC doesn't modify anything, and these register vars do that job. 
+ + // Temporary variables are necessary per GCC to avoid clobbering: + // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables + + float tx1 = x1; + float tx2 = x2; + float tx3 = x3; + float tx4 = x4; + + float ty1 = y1; + float ty2 = y2; + float ty3 = y3; + float ty4 = y4; + + // vector FV4 + register float __x1 __asm__("fr4") = tx1; + register float __x2 __asm__("fr5") = tx2; + register float __x3 __asm__("fr6") = tx3; + register float __x4 __asm__("fr7") = tx4; + + // vector FV8 + register float __y1 __asm__("fr8") = ty1; + register float __y2 __asm__("fr9") = ty2; + register float __y3 __asm__("fr10") = ty3; // in place already + register float __y4 __asm__("fr11") = ty4; + + // This actually does a 4x4 matrix multiply to do the outer product. + // It's this: + // + // [ x1 x1 x1 x1 ] [ y1 0 0 0 ] [ x1y1 x1y2 x1y3 x1y4 ] + // [ x2 x2 x2 x2 ] [ 0 y2 0 0 ] = [ x2y1 x2y2 x2y3 x2y4 ] + // [ x3 x3 x3 x3 ] [ 0 0 y3 0 ] [ x3y1 x3y2 x3y3 x3y4 ] + // [ x4 x4 x4 x4 ] [ 0 0 0 y4 ] [ x4y1 x4y2 x4y3 x4y4 ] + // + + asm volatile ( + // zero out unoccupied front floats to make a double 0 in DR2 + "fldi0 FR2\n\t" + "fmov FR2, FR3\n\t" + "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) + // fmov 'x1' and 'x2' from FR4, FR5 into position (XF0,4,8,12, XF1,5,9,13) + "fmov DR4, XD0\n\t" + "fmov DR4, XD4\n\t" + "fmov DR4, XD8\n\t" + "fmov DR4, XD12\n\t" + // fmov 'x3' and 'x4' from FR6, FR7 into position (XF2,6,10,14, XF3,7,11,15) + "fmov DR6, XD2\n\t" + "fmov DR6, XD6\n\t" + "fmov DR6, XD10\n\t" + "fmov DR6, XD14\n\t" + // set up front floats (y1-y4) + "fmov DR8, DR0\n\t" + "fmov DR8, DR4\n\t" + "fmov DR10, DR14\n\t" + // finish zeroing out front floats + "fmov DR2, DR6\n\t" + "fmov DR2, DR8\n\t" + "fmov DR2, DR12\n\t" + "fschg\n\t" // switch back to single-move mode + // zero out remaining values and matrix multiply 4x4 + "fmov FR2, FR1\n\t" + "ftrv XMTRX, FV0\n\t" + + "fmov FR6, FR4\n\t" + "ftrv XMTRX, FV4\n\t" + + "fmov FR8, FR11\n\t" + "ftrv XMTRX, FV8\n\t" + + "fmov FR12, FR14\n\t" + "ftrv XMTRX, FV12\n\t" + // Save output in XF regs + "frchg\n" + : // no outputs + : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__x4), "f" (__y1), "f" (__y2), "f" (__y3), "f" (__y4) // inputs + : "fr0", "fr1", "fr2", "fr3", "fr12", "fr13", "fr14", "fr15" // clobbers, can't avoid it + ); + // GCC will restore FR12-FR15 from the stack after this, so we really can't keep the output in the front bank. +} + +// Matrix transform: matrix * vector = vector +// _ _ _ _ +// [ ----------- ] | x1 | | z1 | +// [ ---XMTRX--- ] | x2 | = | z2 | +// [ ----------- ] | x3 | | z3 | +// [ ----------- ] |_ x4 _| |_ z4 _| +// +// IMPORTANT USAGE INFORMATION (matrix transform): +// +// Return vector struct maps 1:1 to the above diagram: +// +// typedef struct { +// float z1; +// float z2; +// float z3; +// float z4; +// } RETURN_VECTOR_STRUCT; +// +// Similarly to how the sine and cosine functions use fsca and return 2 floats, +// the matrix transform function actually returns 4 floats. The SH4 only multiplies +// 4x4 matrices with 4x1 vectors, which is why the output is like that. +// +// Multiply a matrix stored in the back bank (XMTRX) with an input vector +static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Matrix_Transform(float x1, float x2, float x3, float x4) +{ + // The floats comprising FV4 are the regs that are passed in. + // Just need to make sure GCC doesn't modify anything, and these register vars do that job. 
+ + // Temporary variables are necessary per GCC to avoid clobbering: + // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables + + float tx1 = x1; + float tx2 = x2; + float tx3 = x3; + float tx4 = x4; + + // output vector FV0 + register float __z1 __asm__("fr0") = tx1; + register float __z2 __asm__("fr1") = tx2; + register float __z3 __asm__("fr2") = tx3; + register float __z4 __asm__("fr3") = tx4; + + asm volatile ("ftrv XMTRX, FV0\n\t" + // have to do this to obey SH4 calling convention--output returned in FV0 + : "+w" (__z1), "+f" (__z2), "+f" (__z3), "+f" (__z4) // outputs, "+" means r/w + : // no inputs + : // no clobbers + ); + + RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __z4}; + return output; +} + +// Matrix Transpose +// +// This does a matrix transpose on the matrix in XMTRX, which swaps rows with +// columns as follows (math notation is [XMTRX]^T): +// +// [ a b c d ] T [ a e i m ] +// [ e f g h ] = [ b f j n ] +// [ i j k l ] [ c g k o ] +// [ m n o p ] [ d h l p ] +// +// PLEASE NOTE: It is faster to avoid the need for a transpose altogether by +// structuring matrices and vectors accordingly. +static inline __attribute__((always_inline)) void MATH_Matrix_Transpose(void) +{ + asm volatile ( + "frchg\n\t" // fmov for singles only works on front bank + // FR0, FR5, FR10, and FR15 are already in place + // swap FR1 and FR4 + "flds FR1, FPUL\n\t" + "fmov FR4, FR1\n\t" + "fsts FPUL, FR4\n\t" + // swap FR2 and FR8 + "flds FR2, FPUL\n\t" + "fmov FR8, FR2\n\t" + "fsts FPUL, FR8\n\t" + // swap FR3 and FR12 + "flds FR3, FPUL\n\t" + "fmov FR12, FR3\n\t" + "fsts FPUL, FR12\n\t" + // swap FR6 and FR9 + "flds FR6, FPUL\n\t" + "fmov FR9, FR6\n\t" + "fsts FPUL, FR9\n\t" + // swap FR7 and FR13 + "flds FR7, FPUL\n\t" + "fmov FR13, FR7\n\t" + "fsts FPUL, FR13\n\t" + // swap FR11 and FR14 + "flds FR11, FPUL\n\t" + "fmov FR14, FR11\n\t" + "fsts FPUL, FR14\n\t" + // restore XMTRX to back bank + "frchg\n" + : // no outputs + : // no inputs + : "fpul" // clobbers + ); +} + +// Matrix product: matrix * matrix = matrix +// +// These use the whole dang floating point unit. +// +// [ ----------- ] [ ----------- ] [ ----------- ] +// [ ---Back---- ] [ ---Front--- ] = [ ---XMTRX--- ] +// [ ---Matrix-- ] [ ---Matrix-- ] [ ----------- ] +// [ --(XMTRX)-- ] [ ----------- ] [ ----------- ] +// +// Multiply a matrix stored in the back bank with a matrix loaded from memory +// Output is stored in the back bank (XMTRX) +static inline __attribute__((always_inline)) void MATH_Matrix_Product(ALL_FLOATS_STRUCT * front_matrix) +{ + /* + // This prefetching should help a bit if placed suitably far enough in advance (not here) + // Possibly right before this function call. Change the "front_matrix" variable appropriately. + // SH4 does not support r/w or temporal prefetch hints, so we only need to pass in an address. 
+ __builtin_prefetch(front_matrix); + */ + + unsigned int prefetch_scratch; + + asm volatile ( + "mov %[fmtrx], %[pref_scratch]\n\t" // parallel-load address for prefetching (MT) + "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle) + "fschg\n\t" // switch fmov to paired moves (FE) + "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS) + // interleave loads and matrix multiply 4x4 + "fmov.d @%[fmtrx]+, DR0\n\t" // (LS) + "fmov.d @%[fmtrx]+, DR2\n\t" + "fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec + "ftrv XMTRX, FV0\n\t" // (FE) + + "fmov.d @%[fmtrx]+, DR6\n\t" + "fmov.d @%[fmtrx]+, DR8\n\t" // prefetch should work for here + "ftrv XMTRX, FV4\n\t" + + "fmov.d @%[fmtrx]+, DR10\n\t" + "fmov.d @%[fmtrx]+, DR12\n\t" + "ftrv XMTRX, FV8\n\t" + + "fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles) + "fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE) + "ftrv XMTRX, FV12\n\t" // (FE) + // Save output in XF regs + "frchg\n" + : [fmtrx] "+r" ((unsigned int)front_matrix), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w + : // no inputs + : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered) + ); +} + +// Load two 4x4 matrices and multiply them, storing the output into the back bank (XMTRX) +// +// MATH_Load_Matrix_Product() is slightly faster than doing this: +// MATH_Load_XMTRX(matrix1) +// MATH_Matrix_Product(matrix2) +// as it saves having to do 2 extraneous 'fschg' instructions. +// +static inline __attribute__((always_inline)) void MATH_Load_Matrix_Product(ALL_FLOATS_STRUCT * matrix1, ALL_FLOATS_STRUCT * matrix2) +{ + /* + // This prefetching should help a bit if placed suitably far enough in advance (not here) + // Possibly right before this function call. Change the "matrix1" variable appropriately. + // SH4 does not support r/w or temporal prefetch hints, so we only need to pass in an address. 
+ __builtin_prefetch(matrix1); + */ + + unsigned int prefetch_scratch; + + asm volatile ( + "mov %[bmtrx], %[pref_scratch]\n\t" // (MT) + "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle) + "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE) + "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS) + // back matrix + "fmov.d @%[bmtrx]+, XD0\n\t" // (LS) + "fmov.d @%[bmtrx]+, XD2\n\t" + "fmov.d @%[bmtrx]+, XD4\n\t" + "fmov.d @%[bmtrx]+, XD6\n\t" + "pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS) + "fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here + "fmov.d @%[bmtrx]+, XD10\n\t" + "fmov.d @%[bmtrx]+, XD12\n\t" + "mov %[fmtrx], %[pref_scratch]\n\t" // (MT) + "add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle) + "fmov.d @%[bmtrx], XD14\n\t" + "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS) + // front matrix + // interleave loads and matrix multiply 4x4 + "fmov.d @%[fmtrx]+, DR0\n\t" + "fmov.d @%[fmtrx]+, DR2\n\t" + "fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec + "ftrv XMTRX, FV0\n\t" // (FE) + + "fmov.d @%[fmtrx]+, DR6\n\t" + "fmov.d @%[fmtrx]+, DR8\n\t" + "ftrv XMTRX, FV4\n\t" + + "fmov.d @%[fmtrx]+, DR10\n\t" + "fmov.d @%[fmtrx]+, DR12\n\t" + "ftrv XMTRX, FV8\n\t" + + "fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles) + "fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE) + "ftrv XMTRX, FV12\n\t" // (FE) + // Save output in XF regs + "frchg\n" + : [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed + : // no inputs + : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered) + ); +} + +//------------------------------------------------------------------------------ +// Matrix load and store operations +//------------------------------------------------------------------------------ + +// Load a matrix from memory into the back bank (XMTRX) +static inline __attribute__((always_inline)) void MATH_Load_XMTRX(ALL_FLOATS_STRUCT * back_matrix) +{ + /* + // This prefetching should help a bit if placed suitably far enough in advance (not here) + // Possibly right before this function call. Change the "back_matrix" variable appropriately. + // SH4 does not support r/w or temporal prefetch hints, so we only need to pass in an address. 
+ __builtin_prefetch(back_matrix); + */ + + unsigned int prefetch_scratch; + + asm volatile ( + "mov %[bmtrx], %[pref_scratch]\n\t" // (MT) + "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle) + "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE) + "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS) + "fmov.d @%[bmtrx]+, XD0\n\t" + "fmov.d @%[bmtrx]+, XD2\n\t" + "fmov.d @%[bmtrx]+, XD4\n\t" + "fmov.d @%[bmtrx]+, XD6\n\t" + "fmov.d @%[bmtrx]+, XD8\n\t" + "fmov.d @%[bmtrx]+, XD10\n\t" + "fmov.d @%[bmtrx]+, XD12\n\t" + "fmov.d @%[bmtrx], XD14\n\t" + "fschg\n" // switch back to single moves + : [bmtrx] "+r" ((unsigned int)back_matrix), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w + : // no inputs + : // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered) + ); +} + +// Store XMTRX to memory +static inline __attribute__((always_inline)) ALL_FLOATS_STRUCT * MATH_Store_XMTRX(ALL_FLOATS_STRUCT * destination) +{ + /* + // This prefetching should help a bit if placed suitably far enough in advance (not here) + // Possibly right before this function call. Change the "destination" variable appropriately. + // SH4 does not support r/w or temporal prefetch hints, so we only need to pass in an address. + __builtin_prefetch( (ALL_FLOATS_STRUCT*)((unsigned char*)destination + 32) ); // Store works backwards, so note the '+32' here + */ + + char * output = ((char*)destination) + sizeof(ALL_FLOATS_STRUCT) + 8; // ALL_FLOATS_STRUCT should be 64 bytes + + asm volatile ( + "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE) + "pref @%[dest_base]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS) + "fmov.d XD0, @-%[out_mtrx]\n\t" // These do *(--output) = XDn (LS) + "fmov.d XD2, @-%[out_mtrx]\n\t" + "fmov.d XD4, @-%[out_mtrx]\n\t" + "fmov.d XD6, @-%[out_mtrx]\n\t" + "fmov.d XD8, @-%[out_mtrx]\n\t" + "fmov.d XD10, @-%[out_mtrx]\n\t" + "fmov.d XD12, @-%[out_mtrx]\n\t" + "fmov.d XD14, @-%[out_mtrx]\n\t" + "fschg\n" // switch back to single moves + : [out_mtrx] "+&r" ((unsigned int)output) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed + : [dest_base] "r" ((unsigned int)destination) // inputs + : "memory" // clobbers + ); + + return destination; +} + + +// In general, writing the entire required math routine in one asm function is +// the best way to go for performance reasons anyways, and in that situation one +// can just throw calling convention to the wind until returning back to C. + +//============================================================================== +// Miscellaneous Functions +//============================================================================== +// +// The following functions are provided as examples of ways in which these math +// functions can be used. +// +// Reminder: 1 fsca unit = 1/182.044444444 of a degree or 1/10430.3783505 of a radian +// In order to make the best use of fsca units, a program must expect them from +// the outset and not "make them" by dividing radians or degrees to get them, +// otherwise it's just giving the 'fsca' instruction radians or degrees! 
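+//
+// For example (an illustrative sketch): the sine and cosine of 90 degrees,
+// which is 65536/4 = 16384 fsca units:
+//
+//   _Complex float sc = MATH_fsca_Int(16384); // 90 degrees, in fsca units
+//   float sine = __real__ sc;   // 1.0f
+//   float cosine = __imag__ sc; // 0.0f
+//
+// (On GCC versions where __GNUC__ <= GNUC_FSCA_ERROR_VERSION, the fsca functions
+// return a RETURN_FSCA_STRUCT instead; read its .sine and .cosine members, as
+// MATH_Slerp() further below does.)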
+//
+/*
+
+   //------------------------------------------------------------------------------
+   // Commonly useful functions
+   //------------------------------------------------------------------------------
+
+   // Returns 1 if point 'pt' is inside triangle with vertices 'v0', 'v1', and 'v2', and 0 if not
+   int MATH_Is_Point_In_Triangle(float v0x, float v0y, float v1x, float v1y, float v2x, float v2y, float ptx, float pty)
+
+   //------------------------------------------------------------------------------
+   // Interpolation
+   //------------------------------------------------------------------------------
+
+   // Linear interpolation
+   float MATH_Lerp(float a, float b, float t)
+
+   // Spherical interpolation ('theta' in fsca units)
+   float MATH_Slerp(float a, float b, float t, float theta)
+
+   //------------------------------------------------------------------------------
+   // Fast Sinc functions (unnormalized, sin(x)/x version)
+   //------------------------------------------------------------------------------
+   // Just pass in MATH_pi * x for normalized versions :)
+
+   // Sinc function (fsca units)
+   float MATH_Fast_Sincf(float x)
+
+   // Sinc function (degrees)
+   float MATH_Fast_Sincf_Deg(float x)
+
+   // Sinc function (rads)
+   float MATH_Fast_Sincf_Rad(float x)
+
+*/
+
+//------------------------------------------------------------------------------
+// Commonly useful functions
+//------------------------------------------------------------------------------
+
+// Returns 1 if point 'pt' is inside triangle with vertices 'v0', 'v1', and 'v2', and 0 if not
+// Uses a barycentric coordinate transformation
+// Adapted from: https://stackoverflow.com/questions/2049582/how-to-determine-if-a-point-is-in-a-2d-triangle
+// Specifically the answer by user 'adreasdr' in addition to the comment by user 'urraka' on the answer from user 'Andreas Brinck'
+//
+// The notation here assumes v0x is the x-component of v0, v0y is the y-component of v0, etc.
+//
+static inline __attribute__((always_inline)) int MATH_Is_Point_In_Triangle(float v0x, float v0y, float v1x, float v1y, float v2x, float v2y, float ptx, float pty)
+{
+    float sdot = MATH_fipr(v0y, -v0x, v2y - v0y, v0x - v2x, v2x, v2y, ptx, pty);
+    float tdot = MATH_fipr(v0x, -v0y, v0y - v1y, v1x - v0x, v1y, v1x, ptx, pty);
+
+    float areadot = MATH_fipr(-v1y, v0y, v0x, v1x, v2x, -v1x + v2x, v1y - v2y, v2y);
+
+    // 'areadot' could be negative depending on the winding of the triangle
+    if(areadot < 0.0f)
+    {
+        sdot *= -1.0f;
+        tdot *= -1.0f;
+        areadot *= -1.0f;
+    }
+
+    if( (sdot > 0.0f) && (tdot > 0.0f) && (areadot > (sdot + tdot)) )
+    {
+        return 1;
+    }
+
+    return 0;
+}
+
+//------------------------------------------------------------------------------
+// Interpolation
+//------------------------------------------------------------------------------
+
+// Linear interpolation
+static inline __attribute__((always_inline)) float MATH_Lerp(float a, float b, float t)
+{
+    return MATH_fmac(t, (b-a), a);
+}
+
+// Spherical interpolation ('theta' in fsca units)
+static inline __attribute__((always_inline)) float MATH_Slerp(float a, float b, float t, float theta)
+{
+    // a is an element of v0, b is an element of v1
+    // v = ( v0 * sin(theta - t * theta) + v1 * sin(t * theta) ) / sin(theta)
+    // by using sine/cosine identities and properties, this can be optimized to:
+    // v = v0 * cos(-t * theta) + ( v0 * ( cos(theta) * sin(-t * theta) ) - sin(-t * theta) * v1 ) / sin(theta)
+    // which only requires two calls to fsca.
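+    // To see this, expand sin(theta - t*theta) = sin(theta)cos(-t*theta) + cos(theta)sin(-t*theta)
+    // in the numerator, divide through by sin(theta), and group the sin(-t*theta) terms.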
+ // Specifically, sin(a + b) = sin(a)cos(b) + cos(a)sin(b) & sin(-a) = -sin(a) + + // MATH_fsca_* functions return reverse-ordered complex numbers for speed reasons (i.e. normally sine is the imaginary part) + // This could be made even faster by using MATH_fsca_Int() with 'theta' and 't' as unsigned ints + +#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION + + RETURN_FSCA_STRUCT sine_cosine = MATH_fsca_Float(theta); + float sine_value_theta = sine_cosine.sine; + float cosine_value_theta = sine_cosine.cosine; + + RETURN_FSCA_STRUCT sine_cosine2 = MATH_fsca_Float(-t * theta); + float sine_value_minus_t_theta = sine_cosine2.sine; + float cosine_value_minus_t_theta = sine_cosine2.cosine; + +#else + + _Complex float sine_cosine = MATH_fsca_Float(theta); + float sine_value_theta = __real__ sine_cosine; + float cosine_value_theta = __imag__ sine_cosine; + + _Complex float sine_cosine2 = MATH_fsca_Float(-t * theta); + float sine_value_minus_t_theta = __real__ sine_cosine2; + float cosine_value_minus_t_theta = __imag__ sine_cosine2; + +#endif + + float numer = a * cosine_value_theta * sine_value_minus_t_theta - sine_value_minus_t_theta * b; + float output_float = a * cosine_value_minus_t_theta + MATH_Fast_Divide(numer, sine_value_theta); + + return output_float; +} + +//------------------------------------------------------------------------------ +// Fast Sinc (unnormalized, sin(x)/x version) +//------------------------------------------------------------------------------ +// +// Just pass in MATH_pi * x for normalized versions :) +// + +// Sinc function (fsca units) +static inline __attribute__((always_inline)) float MATH_Fast_Sincf(float x) +{ + if(x == 0.0f) + { + return 1.0f; + } + +#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION + + RETURN_FSCA_STRUCT sine_cosine = MATH_fsca_Float(x); + float sine_value = sine_cosine.sine; + +#else + + _Complex float sine_cosine = MATH_fsca_Float(x); + float sine_value = __real__ sine_cosine; + +#endif + + return MATH_Fast_Divide(sine_value, x); +} + +// Sinc function (degrees) +static inline __attribute__((always_inline)) float MATH_Fast_Sincf_Deg(float x) +{ + if(x == 0.0f) + { + return 1.0f; + } + +#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION + + RETURN_FSCA_STRUCT sine_cosine = MATH_fsca_Float_Deg(x); + float sine_value = sine_cosine.sine; + +#else + + _Complex float sine_cosine = MATH_fsca_Float_Deg(x); + float sine_value = __real__ sine_cosine; + +#endif + + return MATH_Fast_Divide(sine_value, x); +} + +// Sinc function (rads) +static inline __attribute__((always_inline)) float MATH_Fast_Sincf_Rad(float x) +{ + if(x == 0.0f) + { + return 1.0f; + } + +#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION + + RETURN_FSCA_STRUCT sine_cosine = MATH_fsca_Float_Rad(x); + float sine_value = sine_cosine.sine; + +#else + + _Complex float sine_cosine = MATH_fsca_Float_Rad(x); + float sine_value = __real__ sine_cosine; + +#endif + + return MATH_Fast_Divide(sine_value, x); +} + +//============================================================================== +// Miscellaneous Snippets +//============================================================================== +// +// The following snippets are best implemented manually in user code (they can't +// be put into their own functions without incurring performance penalties). +// +// They also serve as examples of how one might use the functions in this header. 
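+//
+// One more snippet in the same vein (an illustrative sketch, using
+// MATH_Sum_of_Squares(), MATH_fsrra(), and MATH_Fast_Invert() from this header):
+// the distance between points (x1, y1, z1) and (x2, y2, z2), where all
+// variables are assumed already existing floats.
+//
+/* -- start --
+
+    float dx = x2 - x1;
+    float dy = y2 - y1;
+    float dz = z2 - z1;
+
+    // Don't need an 'else' with this (if the points coincide, dist = 0)
+    float dist = 0;
+
+    if(__builtin_expect(dx || dy || dz, 1))
+    {
+        float temp = MATH_Sum_of_Squares(dx, dy, dz, 0); // temp = dx^2 + dy^2 + dz^2 + 0^2
+        dist = MATH_Fast_Invert(MATH_fsrra(temp)); // sqrt(temp) == 1/(1/sqrt(temp))
+    }
+
+-- end -- */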
+// +/* + Normalize a vector (x, y, z) and get its pre-normalized magnitude (length) +*/ + +// +// Normalize a vector (x, y, z) and get its pre-normalized magnitude (length) +// +// magnitude = sqrt(x^2 + y^2 + z^2) +// (x, y, z) = 1/magnitude * (x, y, z) +// +// x, y, z, and magnitude are assumed already existing floats +// + +/* -- start -- + + // Don't need an 'else' with this (if length is 0, x = y = z = 0) + magnitude = 0; + + if(__builtin_expect(x || y || z, 1)) + { + temp = MATH_Sum_of_Squares(x, y, z, 0); // temp = x^2 + y^2 + z^2 + 0^2 + float normalizer = MATH_fsrra(temp); // 1/sqrt(temp) + x = normalizer * x; + y = normalizer * y; + z = normalizer * z; + magnitude = MATH_Fast_Invert(normalizer); + } + +-- end -- */ + + +#endif /* __SH4_MATH_H_ */ + diff --git a/third_party/gldc/src/state.c b/third_party/gldc/src/state.c new file mode 100644 index 000000000..0c06ba4fa --- /dev/null +++ b/third_party/gldc/src/state.c @@ -0,0 +1,499 @@ +#include +#include +#include + +#include "private.h" + + +static struct { + GLboolean is_dirty; + +/* We can't just use the GL_CONTEXT for this state as the two + * GL states are combined, so we store them separately and then + * calculate the appropriate PVR state from them. */ + GLenum depth_func; + GLboolean depth_test_enabled; + GLenum cull_face; + GLenum front_face; + GLboolean culling_enabled; + GLboolean znear_clipping_enabled; + GLboolean alpha_test_enabled; + GLboolean scissor_test_enabled; + GLboolean fog_enabled; + GLboolean depth_mask_enabled; + + struct { + GLint x; + GLint y; + GLsizei width; + GLsizei height; + GLboolean applied; + } scissor_rect; + + GLenum blend_sfactor; + GLenum blend_dfactor; + GLboolean blend_enabled; + + GLenum shade_model; +} GPUState = { + .is_dirty = GL_TRUE, + .depth_func = GL_LESS, + .depth_test_enabled = GL_FALSE, + .cull_face = GL_BACK, + .front_face = GL_CCW, + .culling_enabled = GL_FALSE, + .znear_clipping_enabled = GL_TRUE, + .alpha_test_enabled = GL_FALSE, + .scissor_test_enabled = GL_FALSE, + .fog_enabled = GL_FALSE, + .depth_mask_enabled = GL_FALSE, + .scissor_rect = {0, 0, 640, 480, false}, + .blend_sfactor = GL_ONE, + .blend_dfactor = GL_ZERO, + .blend_enabled = GL_FALSE, + .shade_model = GL_SMOOTH +}; + +void _glGPUStateMarkClean() { + GPUState.is_dirty = GL_FALSE; +} + +void _glGPUStateMarkDirty() { + GPUState.is_dirty = GL_TRUE; +} + +GLboolean _glGPUStateIsDirty() { + return GPUState.is_dirty; +} + +GLboolean _glIsDepthTestEnabled() { + return GPUState.depth_test_enabled; +} + +GLenum _glGetDepthFunc() { + return GPUState.depth_func; +} + +GLboolean _glIsDepthWriteEnabled() { + return GPUState.depth_mask_enabled; +} + +GLenum _glGetShadeModel() { + return GPUState.shade_model; +} + +GLboolean _glIsBlendingEnabled() { + return GPUState.blend_enabled; +} + +GLboolean _glIsAlphaTestEnabled() { + return GPUState.alpha_test_enabled; +} + +GLboolean _glIsCullingEnabled() { + return GPUState.culling_enabled; +} + +GLenum _glGetCullFace() { + return GPUState.cull_face; +} + +GLenum _glGetFrontFace() { + return GPUState.front_face; +} + +GLboolean _glIsFogEnabled() { + return GPUState.fog_enabled; +} + +GLboolean _glIsScissorTestEnabled() { + return GPUState.scissor_test_enabled; +} + +GLboolean _glNearZClippingEnabled() { + return GPUState.znear_clipping_enabled; +} + +void _glApplyScissor(bool force); + +GLenum _glGetBlendSourceFactor() { + return GPUState.blend_sfactor; +} + +GLenum _glGetBlendDestFactor() { + return GPUState.blend_dfactor; +} + + +GLboolean _glCheckValidEnum(GLint param, GLint* 
values, const char* func) { + GLubyte found = 0; + while(*values != 0) { + if(*values == param) { + found++; + break; + } + values++; + } + + if(!found) { + _glKosThrowError(GL_INVALID_ENUM, func); + return GL_TRUE; + } + + return GL_FALSE; +} + +GLboolean TEXTURES_ENABLED = GL_FALSE; + +void _glUpdatePVRTextureContext(PolyContext *context, GLshort textureUnit) { + const TextureObject *tx1 = TEXTURE_ACTIVE; + + /* Disable all texturing to start with */ + context->txr.enable = GPU_TEXTURE_DISABLE; + context->txr2.enable = GPU_TEXTURE_DISABLE; + context->txr2.alpha = GPU_TXRALPHA_DISABLE; + + if(!TEXTURES_ENABLED || !tx1 || !tx1->data) { + context->txr.base = NULL; + return; + } + + context->txr.alpha = (GPUState.blend_enabled || GPUState.alpha_test_enabled) ? GPU_TXRALPHA_ENABLE : GPU_TXRALPHA_DISABLE; + + GLuint filter = GPU_FILTER_NEAREST; + + if(tx1->minFilter == GL_LINEAR && tx1->magFilter == GL_LINEAR) { + filter = GPU_FILTER_BILINEAR; + } + + if(tx1->data) { + context->txr.enable = GPU_TEXTURE_ENABLE; + context->txr.filter = filter; + context->txr.width = tx1->width; + context->txr.height = tx1->height; + context->txr.mipmap = GL_FALSE; + context->txr.mipmap_bias = tx1->mipmap_bias; + + context->txr.base = tx1->data; + context->txr.format = tx1->color; + context->txr.env = tx1->env; + context->txr.uv_flip = GPU_UVFLIP_NONE; + context->txr.uv_clamp = GPU_UVCLAMP_NONE; + } +} + +void _glInitContext() { + const VideoMode* mode = GetVideoMode(); + + GPUState.scissor_rect.x = 0; + GPUState.scissor_rect.y = 0; + GPUState.scissor_rect.width = mode->width; + GPUState.scissor_rect.height = mode->height; + + glClearDepth(1.0f); + glDepthFunc(GL_LESS); + glDepthMask(GL_TRUE); + glFrontFace(GL_CCW); + glCullFace(GL_BACK); + glShadeModel(GL_SMOOTH); + + glDisable(GL_ALPHA_TEST); + glDisable(GL_CULL_FACE); + glDisable(GL_BLEND); + glDisable(GL_DEPTH_TEST); + glDisable(GL_TEXTURE_2D); + glDisable(GL_FOG); +} + +GLAPI void APIENTRY glEnable(GLenum cap) { + switch(cap) { + case GL_TEXTURE_2D: + if(TEXTURES_ENABLED != GL_TRUE) { + TEXTURES_ENABLED = GL_TRUE; + GPUState.is_dirty = GL_TRUE; + } + break; + case GL_CULL_FACE: { + if(GPUState.culling_enabled != GL_TRUE) { + GPUState.culling_enabled = GL_TRUE; + GPUState.is_dirty = GL_TRUE; + } + + } break; + case GL_DEPTH_TEST: { + if(GPUState.depth_test_enabled != GL_TRUE) { + GPUState.depth_test_enabled = GL_TRUE; + GPUState.is_dirty = GL_TRUE; + } + } break; + case GL_BLEND: { + if(GPUState.blend_enabled != GL_TRUE) { + GPUState.blend_enabled = GL_TRUE; + GPUState.is_dirty = GL_TRUE; + } + } break; + case GL_SCISSOR_TEST: { + if(GPUState.scissor_test_enabled != GL_TRUE) { + GPUState.scissor_test_enabled = GL_TRUE; + GPUState.is_dirty = GL_TRUE; + } + } break; + case GL_FOG: + if(GPUState.fog_enabled != GL_TRUE) { + GPUState.fog_enabled = GL_TRUE; + GPUState.is_dirty = GL_TRUE; + } + break; + case GL_ALPHA_TEST: { + if(GPUState.alpha_test_enabled != GL_TRUE) { + GPUState.alpha_test_enabled = GL_TRUE; + GPUState.is_dirty = GL_TRUE; + } + } break; + case GL_NEARZ_CLIPPING_KOS: + if(GPUState.znear_clipping_enabled != GL_TRUE) { + GPUState.znear_clipping_enabled = GL_TRUE; + GPUState.is_dirty = GL_TRUE; + } + break; + default: + break; + } +} + +GLAPI void APIENTRY glDisable(GLenum cap) { + switch(cap) { + case GL_TEXTURE_2D: + if(TEXTURES_ENABLED != GL_FALSE) { + TEXTURES_ENABLED = GL_FALSE; + GPUState.is_dirty = GL_TRUE; + } + break; + case GL_CULL_FACE: { + if(GPUState.culling_enabled != GL_FALSE) { + GPUState.culling_enabled = GL_FALSE; + 
GPUState.is_dirty = GL_TRUE;
+            }
+
+        } break;
+        case GL_DEPTH_TEST: {
+            if(GPUState.depth_test_enabled != GL_FALSE) {
+                GPUState.depth_test_enabled = GL_FALSE;
+                GPUState.is_dirty = GL_TRUE;
+            }
+        } break;
+        case GL_BLEND: {
+            if(GPUState.blend_enabled != GL_FALSE) {
+                GPUState.blend_enabled = GL_FALSE;
+                GPUState.is_dirty = GL_TRUE;
+            }
+        } break;
+        case GL_SCISSOR_TEST: {
+            if(GPUState.scissor_test_enabled != GL_FALSE) {
+                GPUState.scissor_test_enabled = GL_FALSE;
+                GPUState.is_dirty = GL_TRUE;
+            }
+        } break;
+        case GL_FOG:
+            if(GPUState.fog_enabled != GL_FALSE) {
+                GPUState.fog_enabled = GL_FALSE;
+                GPUState.is_dirty = GL_TRUE;
+            }
+        break;
+        case GL_ALPHA_TEST: {
+            if(GPUState.alpha_test_enabled != GL_FALSE) {
+                GPUState.alpha_test_enabled = GL_FALSE;
+                GPUState.is_dirty = GL_TRUE;
+            }
+        } break;
+        case GL_NEARZ_CLIPPING_KOS:
+            if(GPUState.znear_clipping_enabled != GL_FALSE) {
+                GPUState.znear_clipping_enabled = GL_FALSE;
+                GPUState.is_dirty = GL_TRUE;
+            }
+        break;
+        default:
+            break;
+    }
+}
+
+/* Depth Testing */
+GLAPI void APIENTRY glClearDepthf(GLfloat depth) {
+    glClearDepth(depth);
+}
+
+GLAPI void APIENTRY glClearDepth(GLfloat depth) {
+    /* We reverse because using invW means that farther Z == lower number */
+    GPUSetClearDepth(MIN(1.0f - depth, PVR_MIN_Z));
+}
+
+GLAPI void APIENTRY glDepthMask(GLboolean flag) {
+    if(GPUState.depth_mask_enabled != flag) {
+        GPUState.depth_mask_enabled = flag;
+        GPUState.is_dirty = GL_TRUE;
+    }
+}
+
+GLAPI void APIENTRY glDepthFunc(GLenum func) {
+    if(GPUState.depth_func != func) {
+        GPUState.depth_func = func;
+        GPUState.is_dirty = GL_TRUE;
+    }
+}
+
+/* Culling */
+GLAPI void APIENTRY glFrontFace(GLenum mode) {
+    if(GPUState.front_face != mode) {
+        GPUState.front_face = mode;
+        GPUState.is_dirty = GL_TRUE;
+    }
+}
+
+GLAPI void APIENTRY glCullFace(GLenum mode) {
+    if(GPUState.cull_face != mode) {
+        GPUState.cull_face = mode;
+        GPUState.is_dirty = GL_TRUE;
+    }
+}
+
+/* Shading - Flat or Gouraud */
+GLAPI void APIENTRY glShadeModel(GLenum mode) {
+    if(GPUState.shade_model != mode) {
+        GPUState.shade_model = mode;
+        GPUState.is_dirty = GL_TRUE;
+    }
+}
+
+/* Blending */
+GLAPI void APIENTRY glBlendFunc(GLenum sfactor, GLenum dfactor) {
+    if(GPUState.blend_dfactor != dfactor || GPUState.blend_sfactor != sfactor) {
+        GPUState.blend_sfactor = sfactor;
+        GPUState.blend_dfactor = dfactor;
+        GPUState.is_dirty = GL_TRUE;
+    }
+}
+
+
+GLAPI void APIENTRY glAlphaFunc(GLenum func, GLclampf ref) {
+    GLubyte val = (GLubyte)(ref * 255.0f);
+    GPUSetAlphaCutOff(val);
+}
+
+void APIENTRY glScissor(GLint x, GLint y, GLsizei width, GLsizei height) {
+
+    if(GPUState.scissor_rect.x == x &&
+       GPUState.scissor_rect.y == y &&
+       GPUState.scissor_rect.width == width &&
+       GPUState.scissor_rect.height == height) {
+        return;
+    }
+
+    GPUState.scissor_rect.x = x;
+    GPUState.scissor_rect.y = y;
+    GPUState.scissor_rect.width = width;
+    GPUState.scissor_rect.height = height;
+    GPUState.scissor_rect.applied = false;
+    GPUState.is_dirty = GL_TRUE; // FIXME: do we need this?
+
+    _glApplyScissor(false);
+}
+
+/* Set up the hardware user clip rectangle.
+
+   The minimum clip rectangle is a 32x32 area which is dependent on the tile
+   size used by the tile accelerator. The PVR switches off rendering to tiles
+   outside or inside the defined rectangle depending upon the 'clipmode'
+   bits in the polygon header.
+
+   Clip rectangles therefore must have a size that is some multiple of 32.
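+
+   For example, with the default 640x480 video mode (20x15 tiles), the
+   arithmetic in _glApplyScissor() below maps glScissor(0, 0, 64, 64) to tile
+   columns 0-1 and tile rows 13-14: the 2x2 block of tiles in the lower left
+   corner, since tile row 0 is at the top of the screen.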
+
+   glScissor(0, 0, 32, 32) allows only the 'tile' in the lower left
+   hand corner of the screen to be modified and glScissor(0, 0, 0, 0)
+   disallows modification to all 'tiles' on the screen.
+
+   We call this in the following situations:
+
+   - glEnable(GL_SCISSOR_TEST) is called
+   - glScissor() is called
+   - After glKosSwapBuffers()
+
+   This ensures that a clip command is added to every vertex list
+   at the right place, either when enabling the scissor test, or
+   when the scissor test changes.
+*/
+void _glApplyScissor(bool force) {
+    /* Don't do anything if clipping is disabled */
+    if(!GPUState.scissor_test_enabled) {
+        return;
+    }
+
+    /* Don't apply if we already applied - nothing changed */
+    if(GPUState.scissor_rect.applied && !force) {
+        return;
+    }
+
+    PVRTileClipCommand c;
+
+    GLint miny, maxx, maxy;
+
+    const VideoMode* vid_mode = GetVideoMode();
+
+    GLsizei scissor_width = MAX(MIN(GPUState.scissor_rect.width, vid_mode->width), 0);
+    GLsizei scissor_height = MAX(MIN(GPUState.scissor_rect.height, vid_mode->height), 0);
+
+    /* force the origin to the lower left-hand corner of the screen */
+    miny = (vid_mode->height - scissor_height) - GPUState.scissor_rect.y;
+    maxx = (scissor_width + GPUState.scissor_rect.x);
+    maxy = (scissor_height + miny);
+
+    /* load command structure while mapping screen coords to TA tiles */
+    c.flags = GPU_CMD_USERCLIP;
+    c.d1 = c.d2 = c.d3 = 0;
+
+    uint16_t vw = vid_mode->width >> 5;
+    uint16_t vh = vid_mode->height >> 5;
+
+    c.sx = CLAMP(GPUState.scissor_rect.x >> 5, 0, vw);
+    c.sy = CLAMP(miny >> 5, 0, vh);
+    c.ex = CLAMP((maxx >> 5) - 1, 0, vw);
+    c.ey = CLAMP((maxy >> 5) - 1, 0, vh);
+
+    aligned_vector_push_back(&_glOpaquePolyList()->vector, &c, 1);
+    aligned_vector_push_back(&_glPunchThruPolyList()->vector, &c, 1);
+    aligned_vector_push_back(&_glTransparentPolyList()->vector, &c, 1);
+
+    GPUState.scissor_rect.applied = true;
+}
+
+void APIENTRY glGetIntegerv(GLenum pname, GLint *params) {
+    switch(pname) {
+        case GL_MAX_TEXTURE_SIZE:
+            *params = MAX_TEXTURE_SIZE;
+        break;
+        case GL_TEXTURE_FREE_MEMORY_ATI:
+        case GL_FREE_TEXTURE_MEMORY_KOS:
+            *params = _glFreeTextureMemory();
+        break;
+        case GL_USED_TEXTURE_MEMORY_KOS:
+            *params = _glUsedTextureMemory();
+        break;
+        case GL_FREE_CONTIGUOUS_TEXTURE_MEMORY_KOS:
+            *params = _glFreeContiguousTextureMemory();
+        break;
+        default:
+            _glKosThrowError(GL_INVALID_ENUM, __func__);
+        break;
+    }
+}
+
+const GLubyte *glGetString(GLenum name) {
+    switch(name) {
+        case GL_VENDOR:
+            return (const GLubyte*) "KallistiOS / Kazade";
+
+        case GL_RENDERER:
+            return (const GLubyte*) "PowerVR2 CLX2 100MHz";
+    }
+
+    return (const GLubyte*) "GL_KOS_ERROR: ENUM Unsupported\n";
+}
diff --git a/third_party/gldc/src/texture.c b/third_party/gldc/src/texture.c
new file mode 100644
index 000000000..c9b6fa2df
--- /dev/null
+++ b/third_party/gldc/src/texture.c
@@ -0,0 +1,280 @@
+#include "private.h"
+
+#include
+#include
+#include
+#include
+
+#include "platform.h"
+
+#include "yalloc/yalloc.h"
+
+/* We always leave this amount of VRAM unallocated to prevent
+ * issues with the allocator */
+#define PVR_MEM_BUFFER_SIZE (64 * 1024)
+
+TextureObject* TEXTURE_ACTIVE = NULL;
+static NamedArray TEXTURE_OBJECTS;
+
+static void* YALLOC_BASE = NULL;
+static size_t YALLOC_SIZE = 0;
+
+static void* yalloc_alloc_and_defrag(size_t size) {
+    void* ret = yalloc_alloc(YALLOC_BASE, size);
+
+    if(!ret) {
+        /* Tried to allocate, but out of room, let's try defragging
+         * and repeating the alloc */
+        fprintf(stderr, "Ran out of memory, defragmenting\n");
+        
glDefragmentTextureMemory_KOS(); + ret = yalloc_alloc(YALLOC_BASE, size); + } + + gl_assert(ret && "Out of PVR memory!"); + + return ret; +} + +#define GL_KOS_INTERNAL_DEFAULT_MIPMAP_LOD_BIAS 4 +static void _glInitializeTextureObject(TextureObject* txr, unsigned int id) { + txr->index = id; + txr->width = txr->height = 0; + txr->mipmap = 0; + txr->env = GPU_TXRENV_MODULATEALPHA; + txr->data = NULL; + txr->minFilter = GL_NEAREST; + txr->magFilter = GL_NEAREST; + txr->mipmap_bias = GL_KOS_INTERNAL_DEFAULT_MIPMAP_LOD_BIAS; +} + +GLubyte _glInitTextures() { + named_array_init(&TEXTURE_OBJECTS, sizeof(TextureObject), MAX_TEXTURE_COUNT); + + // Reserve zero so that it is never given to anyone as an ID! + named_array_reserve(&TEXTURE_OBJECTS, 0); + + // Initialize zero as an actual texture object though because apparently it is! + TextureObject* default_tex = (TextureObject*) named_array_get(&TEXTURE_OBJECTS, 0); + _glInitializeTextureObject(default_tex, 0); + TEXTURE_ACTIVE = default_tex; + + size_t vram_free = GPUMemoryAvailable(); + YALLOC_SIZE = vram_free - PVR_MEM_BUFFER_SIZE; /* Take all but 64kb VRAM */ + YALLOC_BASE = GPUMemoryAlloc(YALLOC_SIZE); + +#ifdef __DREAMCAST__ + /* Ensure memory is aligned */ + gl_assert((uintptr_t) YALLOC_BASE % 32 == 0); +#endif + + yalloc_init(YALLOC_BASE, YALLOC_SIZE); + + gl_assert(TEXTURE_OBJECTS.element_size > 0); + return 1; +} + +GLuint APIENTRY gldcGenTexture(void) { + TRACE(); + + gl_assert(TEXTURE_OBJECTS.element_size > 0); + + GLuint id = 0; + TextureObject* txr = (TextureObject*) named_array_alloc(&TEXTURE_OBJECTS, &id); + gl_assert(txr); + gl_assert(id); // Generated IDs must never be zero + + _glInitializeTextureObject(txr, id); + + gl_assert(txr->index == id); + gl_assert(TEXTURE_OBJECTS.element_size > 0); + + return id; +} + +void APIENTRY gldcDeleteTexture(GLuint id) { + TRACE(); + + gl_assert(TEXTURE_OBJECTS.element_size > 0); + + if(id == 0) { + /* Zero is the "default texture" and we never allow deletion of it */ + return; + } + + TextureObject* txr = (TextureObject*) named_array_get(&TEXTURE_OBJECTS, id); + + if(txr) { + gl_assert(txr->index == id); + + if(txr == TEXTURE_ACTIVE) { + // Reset to the default texture + TEXTURE_ACTIVE = (TextureObject*) named_array_get(&TEXTURE_OBJECTS, 0); + } + + if(txr->data) { + yalloc_free(YALLOC_BASE, txr->data); + txr->data = NULL; + } + + named_array_release(&TEXTURE_OBJECTS, id); + } + + gl_assert(TEXTURE_OBJECTS.element_size > 0); +} + +void APIENTRY gldcBindTexture(GLuint id) { + TRACE(); + + TextureObject* txr = (TextureObject*) named_array_get(&TEXTURE_OBJECTS, id); + + TEXTURE_ACTIVE = txr; + gl_assert(TEXTURE_ACTIVE->index == id); + + gl_assert(TEXTURE_OBJECTS.element_size > 0); + + _glGPUStateMarkDirty(); +} + +static GLuint _determinePVRFormat(GLint internalFormat, GLenum type) { + /* Given a cleaned internalFormat, return the Dreamcast format + * that can hold it + */ + switch(internalFormat) { + case GL_RGBA: + /* OK so if we have something that requires alpha, we return 4444 unless + * the type was already 1555 (1-bit alpha) in which case we return that + */ + if(type == GL_UNSIGNED_SHORT_1_5_5_5_REV) { + return GPU_TXRFMT_ARGB1555 | GPU_TXRFMT_NONTWIDDLED; + } else if(type == GL_UNSIGNED_SHORT_1_5_5_5_REV_TWID_KOS) { + return GPU_TXRFMT_ARGB1555 | GPU_TXRFMT_TWIDDLED; + } else if(type == GL_UNSIGNED_SHORT_4_4_4_4_REV_TWID_KOS) { + return GPU_TXRFMT_ARGB4444 | GPU_TXRFMT_TWIDDLED; + } else { + return GPU_TXRFMT_ARGB4444 | GPU_TXRFMT_NONTWIDDLED; + } + /* Compressed and twiddled versions */ 
+ case GL_UNSIGNED_SHORT_5_6_5_TWID_KOS: + return GPU_TXRFMT_RGB565 | GPU_TXRFMT_TWIDDLED; + case GL_UNSIGNED_SHORT_4_4_4_4_REV_TWID_KOS: + return GPU_TXRFMT_ARGB4444 | GPU_TXRFMT_TWIDDLED; + case GL_UNSIGNED_SHORT_1_5_5_5_REV_TWID_KOS: + return GPU_TXRFMT_ARGB1555 | GPU_TXRFMT_TWIDDLED; + default: + return 0; + } +} + + +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) +#define INFO_MSG(x) fprintf(stderr, "%s:%s > %s\n", __FILE__, TOSTRING(__LINE__), x) + +void APIENTRY gldcAllocTexture(GLsizei w, GLsizei h, GLenum format, GLenum type) { + TRACE(); + + if (w > 1024 || h > 1024){ + INFO_MSG("Invalid texture size"); + _glKosThrowError(GL_INVALID_VALUE, __func__); + return; + } + + if((w < 8 || (w & -w) != w)) { + /* Width is not a power of two. Must be!*/ + INFO_MSG(""); + _glKosThrowError(GL_INVALID_VALUE, __func__); + return; + } + + if((h < 8 || (h & -h) != h)) { + /* height is not a power of two. Must be!*/ + INFO_MSG(""); + _glKosThrowError(GL_INVALID_VALUE, __func__); + return; + } + + TextureObject* active = TEXTURE_ACTIVE; + /* Calculate the format that we need to convert the data to */ + GLuint pvr_format = _determinePVRFormat(format, type); + + if(active->data) { + /* pre-existing texture - check if changed */ + if(active->width != w || active->height != h) { + /* changed - free old texture memory */ + yalloc_free(YALLOC_BASE, active->data); + active->data = NULL; + active->mipmap = 0; + } + } + + /* All colour formats are represented as shorts internally. */ + GLuint bytes = w * h * 2; + + if(!active->data) { + /* need texture memory */ + active->width = w; + active->height = h; + active->color = pvr_format; + + active->data = yalloc_alloc_and_defrag(bytes); + } + + gl_assert(active->data); + + /* If we run out of PVR memory just return */ + if(!active->data) { + _glKosThrowError(GL_OUT_OF_MEMORY, __func__); + return; + } + + /* Mark level 0 as set in the mipmap bitmask */ + active->mipmap |= (1 << 0); + + /* We make sure we remove nontwiddled and add twiddled. 
We could always + * make it twiddled when determining the format but I worry that would make the + * code less flexible to change in the future */ + active->color &= ~(1 << 26); + + _glGPUStateMarkDirty(); +} + +GLAPI void APIENTRY gldcGetTexture(GLvoid** data, GLsizei* width, GLsizei* height) { + TextureObject* active = TEXTURE_ACTIVE; + *data = active->data; + *width = active->width; + *height = active->height; +} + +GLuint _glMaxTextureMemory() { + return YALLOC_SIZE; +} + +GLuint _glFreeTextureMemory() { + return yalloc_count_free(YALLOC_BASE); +} + +GLuint _glUsedTextureMemory() { + return YALLOC_SIZE - _glFreeTextureMemory(); +} + +GLuint _glFreeContiguousTextureMemory() { + return yalloc_count_continuous(YALLOC_BASE); +} + +GLAPI GLvoid APIENTRY glDefragmentTextureMemory_KOS(void) { + yalloc_defrag_start(YALLOC_BASE); + + GLuint id; + + /* Replace all texture pointers */ + for(id = 0; id < MAX_TEXTURE_COUNT; id++){ + TextureObject* txr = (TextureObject*) named_array_get(&TEXTURE_OBJECTS, id); + if(txr){ + gl_assert(txr->index == id); + txr->data = yalloc_defrag_address(YALLOC_BASE, txr->data); + } + } + + yalloc_defrag_commit(YALLOC_BASE); +} + diff --git a/third_party/gldc/src/types.h b/third_party/gldc/src/types.h new file mode 100644 index 000000000..85df8bad9 --- /dev/null +++ b/third_party/gldc/src/types.h @@ -0,0 +1,16 @@ +#pragma once + +#include + +typedef struct { + /* Same 32 byte layout as pvr_vertex_t */ + uint32_t flags; + float xyz[3]; + float uv[2]; + uint8_t bgra[4]; + + /* In the pvr_vertex_t structure, this next 4 bytes is oargb + * but we're not using that for now, so having W here makes the code + * simpler */ + float w; +} __attribute__ ((aligned (32))) Vertex; diff --git a/third_party/gldc/src/yalloc/LICENSE b/third_party/gldc/src/yalloc/LICENSE new file mode 100644 index 000000000..8aa26455d --- /dev/null +++ b/third_party/gldc/src/yalloc/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) [year] [fullname] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/third_party/gldc/src/yalloc/README.md b/third_party/gldc/src/yalloc/README.md new file mode 100644 index 000000000..ca23ec200 --- /dev/null +++ b/third_party/gldc/src/yalloc/README.md @@ -0,0 +1,158 @@ +# Summary + +yalloc is a memory efficient allocator which is intended for embedded +applications that only have a low amount of RAM and want to maximize its +utilization. 
+
+Properties of the allocator:
+
+ - pools can be up to 128k
+ - user data is 32bit aligned
+ - 4 bytes overhead per allocation
+ - supports defragmentation
+ - uses a free list for a first-fit allocation strategy (most recently freed
+   blocks are used first)
+ - extensively tested (see section below)
+ - MIT license
+
+# Defragmentation
+
+This feature was the initial motivation for this implementation. Especially
+when dealing with highly memory-constrained environments, fragmented memory
+pools can be annoying. For this reason this implementation supports
+defragmentation, which moves all allocated blocks into a contiguous range at the
+beginning of the pool, leaving a maximized free range at the end.
+
+As there is no garbage collector or other runtime system involved that updates
+the references, the application must do so. This is done in three steps:
+
+ 1. yalloc_defrag_start() is called. This calculates the new
+    post-defragmentation addresses for all allocations, but otherwise leaves
+    the allocations untouched.
+
+ 2. yalloc_defrag_address() is called by the application for every pointer that
+    points to an allocation. It returns the post-defragmentation address for
+    the allocation. The application must update all its relevant pointers this
+    way. Care must be taken to not dereference the moved pointers yet. If the
+    application works with hierarchical data then this can easily be done by
+    updating the pointers bottom-up (first the leaves, then their parents).
+
+ 3. yalloc_defrag_commit() is called to finally perform the defragmentation.
+    All allocated blocks are moved to their post-defragmentation address and
+    the application can continue using the pool the normal way.
+
+It is up to the application when (and if) it performs defragmentation. One
+strategy would be to delay it until an allocation fails. Another approach
+would be to perform the defragmentation regularly when there is nothing else to
+do. (A compact sketch of these three calls appears in the Implementation
+Details section below.)
+
+# Configurable Defines
+
+INTERNAL_VALIDATE
+
+If this is not defined on the compiler command line it will be defined as 0 if
+NDEBUG is defined and otherwise as 1. If you want to disable internal
+validation when NDEBUG is not defined, then define INTERNAL_VALIDATE as 0 on
+the compiler command line.
+
+If it is nonzero the heap will be validated via a bunch of assert() calls at
+the end of every function that modifies the heap. This has roughly O(N*M)
+overhead, where N is the number of allocated blocks and M the number of free
+blocks in a heap. For applications with enough live allocations this overhead
+gets significant.
+
+YALLOC_VALGRIND
+
+If this is defined in yalloc.c and NVALGRIND is not defined, then
+valgrind/memcheck.h is included and the allocator functions tell valgrind
+about the pool and the allocations, and make the block headers inaccessible
+outside of yalloc functions. This allows valgrind to detect a lot of the
+accidents that can happen when dealing with dynamic memory. This also adds some
+overhead for every yalloc call, because most of them will "unprotect" the
+internal structure on entry and "protect" it again (marking it as inaccessible
+for valgrind) before returning.
+
+# Tests
+
+The tests rely on internal validation of the pool (see INTERNAL_VALIDATE) to
+check that no assumptions about the internal structure of the pool are
+violated. They additionally check for correctness of observations that can be
+made by using the public functions of the allocator (like checking if user data
+stays unmodified).
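+
+For example, forcing validation on for a test build could look like this
+(an illustrative compiler invocation, not one of the scripts below):
+
+    cc -DINTERNAL_VALIDATE=1 -g -c yalloc.c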
+
+There are a few different scripts that run tests:
+
+ - run_coverage.sh runs a bunch of test functions that are carefully crafted to
+   cover all code paths. Coverage data is generated by clang and a summary is
+   shown at the end of the test.
+
+ - run_valgrind.sh tests if the valgrind integration is working as expected,
+   then runs the functions from the coverage test and some randomly generated
+   test cases under valgrind.
+
+ - run_libfuzzer.sh uses libFuzzer from clang to generate interesting test
+   cases and runs them in multiple jobs in parallel for 10 seconds. It also
+   generates coverage data at the end (it always reached 100% coverage in my
+   test runs).
+
+All tests exit with 0 and print "All fine!" at the end if there were no
+errors. Coverage deficits are not counted as errors, so you have to look at the
+summary (it should show 100% coverage!).
+
+
+# Implementation Details
+
+The headers and the user data are 32bit aligned. Headers have two 16bit fields
+where the high 15 bits represent offsets (relative to the pool's address) to
+the previous/next block. The macros HDR_PTR() and HDR_OFFSET() are used to
+translate an offset to an address and back. The 32bit alignment is exploited to
+allow pools of up to 128k with those 15 significant bits.
+
+A pool is always occupied by non-overlapping blocks that link to their
+previous/next block in address order via the prev/next fields of Header.
+
+Free blocks are always joined: No two free blocks will ever be neighbors.
+
+Free blocks have an additional header of the same structure. This additional
+header is used to build a list of free blocks (independent of their address
+order).
+
+yalloc_free() will insert the freed block at the front of the free list.
+yalloc_alloc() searches that list front to back and takes the first block that
+is big enough to satisfy the allocation.
+
+There is always a Header at the front and at the end of the pool. The Header at
+the end is degenerate: It is marked as "used" but has no next block (which is
+what is usually used to determine the size of a block).
+
+The prev-field of the very first block in the pool has special meaning: It
+points to the first free block in the pool. Or, if the pool is currently
+defragmenting (after yalloc_defrag_start() and before yalloc_defrag_commit()),
+it points to the last header of the pool. This state can be recognized by
+checking whether it points to a free block (normal pool state) or a used block
+(defragmentation in progress). This logic can be seen in
+yalloc_defrag_in_progress().
+
+The lowest bit of next/prev has special meaning:
+
+ - the low bit of prev is set for free blocks
+
+ - the low bit of next is set for blocks with 32bit padding after the user
+   data. This is needed when a block is allocated from a free block that leaves
+   only 4 free bytes after the user data... which is not enough to insert a
+   free-header (which needs 8 bytes). The padding will be reclaimed when
+   that block is freed or when the pool is defragmented. The predicate
+   isPadded() can be used to test if a block is padded. Free blocks are never
+   padded.
+
+The predicate isNil() can be used to test if an offset points nowhere (it tests
+if all 15 high bits of an offset are 1). The constant NIL has all but the
+lowest bit set. It is used to set offsets to point to nowhere, and in some
+places it is used to mask out the actual address bits of an offset. This should
+be kept in mind when modifying the code and updating prev/next: Think carefully
+about whether you have to preserve the low bit when updating an offset!
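+
+The three-step application protocol from the Defragmentation section, as a
+hedged sketch (the function name is hypothetical, and only a single tracked
+pointer is shown; a real application must update every pointer it holds into
+the pool):
+
+    /* Assumes yalloc.h is included; pool is initialized and p is a live
+       allocation from it. */
+    void * defragment_one_pointer(void * pool, void * p)
+    {
+        yalloc_defrag_start(pool);           /* compute post-defrag addresses    */
+        p = yalloc_defrag_address(pool, p);  /* re-point; do not dereference yet */
+        yalloc_defrag_commit(pool);          /* blocks move; pool is normal again */
+        return p;                            /* now safe to use                  */
+    }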
+
+Defragmentation is done in two phases: First the user calls
+yalloc_defrag_start(). This will put the pool in a special state where no
+alloc/free calls are allowed. In this state the prev-fields of the used blocks
+have a special meaning: They store the offset that the block will have after
+defragmentation has finished. This information is used by yalloc_defrag_address(),
+which can be called by the application to query the new addresses for its
+allocations. After the application has updated all its pointers it must call
+yalloc_defrag_commit(), which moves all used blocks into contiguous space at the
+beginning of the pool, leaving one maximized free block at the end.
diff --git a/third_party/gldc/src/yalloc/yalloc.c b/third_party/gldc/src/yalloc/yalloc.c
new file mode 100644
index 000000000..6dcf0e592
--- /dev/null
+++ b/third_party/gldc/src/yalloc/yalloc.c
@@ -0,0 +1,803 @@
+#include "yalloc.h"
+#include "yalloc_internals.h"
+
+#include <assert.h>
+#include <string.h>
+
+#define ALIGN(num, align) (((num) + ((align) - 1)) & ~((align) - 1))
+
+#if defined(YALLOC_VALGRIND) && !defined(NVALGRIND)
+# define USE_VALGRIND 1
+#else
+# define USE_VALGRIND 0
+#endif
+
+#if USE_VALGRIND
+# include <valgrind/memcheck.h>
+#else
+# define VALGRIND_MAKE_MEM_UNDEFINED(p, s) ((void)0)
+# define VALGRIND_MAKE_MEM_DEFINED(p, s) ((void)0)
+# define VALGRIND_MAKE_MEM_NOACCESS(p, s) ((void)0)
+# define VALGRIND_CREATE_MEMPOOL(pool, rz, z) ((void)0)
+# define VALGRIND_MEMPOOL_ALLOC(pool, p, s) ((void)0)
+# define VALGRIND_MEMPOOL_FREE(pool, p) ((void)0)
+# define VALGRIND_MEMPOOL_CHANGE(pool, a, b, s) ((void)0)
+#endif
+
+#define MARK_NEW_FREE_HDR(p) VALGRIND_MAKE_MEM_UNDEFINED(p, sizeof(Header) * 2)
+#define MARK_NEW_HDR(p) VALGRIND_MAKE_MEM_UNDEFINED(p, sizeof(Header))
+#define PROTECT_HDR(p) VALGRIND_MAKE_MEM_NOACCESS(p, sizeof(Header))
+#define PROTECT_FREE_HDR(p) VALGRIND_MAKE_MEM_NOACCESS(p, sizeof(Header) * 2)
+#define UNPROTECT_HDR(p) VALGRIND_MAKE_MEM_DEFINED(p, sizeof(Header))
+#define UNPROTECT_FREE_HDR(p) VALGRIND_MAKE_MEM_DEFINED(p, sizeof(Header) * 2)
+
+
+#if USE_VALGRIND
+static void _unprotect_pool(void * pool)
+{
+  Header * cur = (Header*)pool;
+  for (;;)
+  {
+    UNPROTECT_HDR(cur);
+    if (isFree(cur))
+      UNPROTECT_HDR(cur + 1);
+
+    if (isNil(cur->next))
+      break;
+
+    cur = HDR_PTR(cur->next);
+  }
+}
+
+static void _protect_pool(void * pool)
+{
+  Header * cur = (Header*)pool;
+  while (cur)
+  {
+    Header * next = isNil(cur->next) ? NULL : HDR_PTR(cur->next);
+
+    if (isFree(cur))
+      VALGRIND_MAKE_MEM_NOACCESS(cur, (char*)next - (char*)cur);
+    else
+      PROTECT_HDR(cur);
+
+    cur = next;
+  }
+}
+#define assert_is_pool(pool) assert(VALGRIND_MEMPOOL_EXISTS(pool));
+
+#else
+
+static void _unprotect_pool(void * pool){(void)pool;}
+static void _protect_pool(void * pool){(void)pool;}
+#define assert_is_pool(pool) ((void)0)
+#endif
+
+// internal version that does not unprotect/protect the pool
+static int _yalloc_defrag_in_progress(void * pool)
+{
+  // defragmentation in progress is indicated by the pool's free-list pointer referencing the last block of the pool, which has its "free"-bit cleared.
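+  // (A hedged restatement of that rule: in the normal state the pool's first
+  // "prev" is either NIL - empty free list - or points at a free block whose
+  // own prev has the low "free" bit set; while defragmenting it points at the
+  // terminating used header, whose bit is clear.)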
+ Header * p = (Header*)pool; + if (isNil(p->prev)) + return 0; + + return !(HDR_PTR(p->prev)->prev & 1); +} + +int yalloc_defrag_in_progress(void * pool) +{ + _unprotect_pool(pool); + int ret = _yalloc_defrag_in_progress(pool); + _protect_pool(pool); + return ret; +} + +#if YALLOC_INTERNAL_VALIDATE + +static size_t _count_free_list_occurences(Header * pool, Header * blk) +{ + int n = 0; + if (!isNil(pool->prev)) + { + Header * cur = HDR_PTR(pool->prev); + for (;;) + { + if (cur == blk) + ++n; + + if (isNil(cur[1].next)) + break; + + cur = HDR_PTR(cur[1].next); + } + } + return n; +} + +static size_t _count_addr_list_occurences(Header * pool, Header * blk) +{ + size_t n = 0; + Header * cur = pool; + for (;;) + { + if (cur == blk) + ++n; + + if (isNil(cur->next)) + break; + + cur = HDR_PTR(cur->next); + } + return n; +} + +static void _validate_user_ptr(void * pool, void * p) +{ + Header * hdr = (Header*)p - 1; + size_t n = _count_addr_list_occurences((Header*)pool, hdr); + assert(n == 1 && !isFree(hdr)); +} + +/** +Validates if all the invariants of a pool are intact. + +This is very expensive when there are enough blocks in the heap (quadratic complexity!). +*/ +static void _yalloc_validate(void * pool_) +{ + Header * pool = (Header*)pool_; + Header * cur = pool; + + assert(!isNil(pool->next)); // there must always be at least two blocks: a free/used one and the final block at the end + + if (_yalloc_defrag_in_progress(pool)) + { + Header * prevUsed = NULL; + while (!isNil(cur->next)) + { + if (!isFree(cur)) + { // it is a used block + Header * newAddr = cur == pool ? pool : HDR_PTR(cur->prev); + assert(newAddr <= cur); + assert(newAddr >= pool); + + if (prevUsed) + { + Header * prevNewAddr = prevUsed == pool ? pool : HDR_PTR(prevUsed->prev); + size_t prevBruttoSize = (char*)HDR_PTR(prevUsed->next) - (char*)prevUsed; + if (isPadded(prevUsed)) + prevBruttoSize -= 4; // remove padding + assert((char*)newAddr == (char*)prevNewAddr + prevBruttoSize); + } + else + { + assert(newAddr == pool); + } + + prevUsed = cur; + } + + cur = HDR_PTR(cur->next); + } + + assert(cur == HDR_PTR(pool->prev)); // the free-list should point to the last block + assert(!isFree(cur)); // the last block must not be free + } + else + { + Header * prev = NULL; + + // iterate blocks in address order + for (;;) + { + if (prev) + { + Header * x = HDR_PTR(cur->prev); + assert(x == prev); + } + + int n = _count_free_list_occurences(pool, cur); + if (isFree(cur)) + { // it is a free block + assert(n == 1); + assert(!isPadded(cur)); // free blocks must have a zero padding-bit + + if (prev) + { + assert(!isFree(prev)); // free blocks must not be direct neighbours + } + } + else + { + assert(n == 0); + } + + if (isNil(cur->next)) + break; + + Header * next = HDR_PTR(cur->next); + assert((char*)next >= (char*)cur + sizeof(Header) * 2); + prev = cur; + cur = next; + } + + assert(isNil(cur->next)); + + if (!isNil(pool->prev)) + { + // iterate free-list + Header * f = HDR_PTR(pool->prev); + assert(isNil(f[1].prev)); + for (;;) + { + assert(isFree(f)); // must be free + + int n = _count_addr_list_occurences(pool, f); + assert(n == 1); + + if (isNil(f[1].next)) + break; + + f = HDR_PTR(f[1].next); + } + } + } +} + +#else +static void _yalloc_validate(void * pool){(void)pool;} +static void _validate_user_ptr(void * pool, void * p){(void)pool; (void)p;} +#endif + +int yalloc_init(void * pool, size_t size) +{ + if (size > MAX_POOL_SIZE) + return -1; + + // TODO: Error when pool is not properly aligned + + // TODO: Error when size is 
not a multiple of the alignment?
+  while (size % sizeof(Header))
+    --size;
+
+  if (size < sizeof(Header) * 3)
+    return -1;
+
+  VALGRIND_CREATE_MEMPOOL(pool, 0, 0);
+
+  Header * first = (Header*)pool;
+  Header * last = (Header*)((char*)pool + size) - 1;
+
+  MARK_NEW_FREE_HDR(first);
+  MARK_NEW_HDR(first);
+
+  first->prev = HDR_OFFSET(first) | 1;
+  first->next = HDR_OFFSET(last);
+  first[1].prev = NIL;
+  first[1].next = NIL;
+
+  last->prev = HDR_OFFSET(first);
+  last->next = NIL;
+
+  _unprotect_pool(pool);
+  _yalloc_validate(pool);
+  _protect_pool(pool);
+  return 0;
+}
+
+void yalloc_deinit(void * pool)
+{
+#if USE_VALGRIND
+  VALGRIND_DESTROY_MEMPOOL(pool);
+
+  Header * last = (Header*)pool;
+  UNPROTECT_HDR(last);
+  while (!isNil(last->next))
+  {
+    Header * next = HDR_PTR(last->next);
+    UNPROTECT_HDR(next);
+    last = next;
+  }
+
+  VALGRIND_MAKE_MEM_UNDEFINED(pool, (char*)(last + 1) - (char*)pool);
+#else
+  (void)pool;
+#endif
+}
+
+
+void * yalloc_alloc(void * pool, size_t size)
+{
+  assert_is_pool(pool);
+  _unprotect_pool(pool);
+  assert(!_yalloc_defrag_in_progress(pool));
+  _yalloc_validate(pool);
+  if (!size)
+  {
+    _protect_pool(pool);
+    return NULL;
+  }
+
+  Header * root = (Header*)pool;
+  if (isNil(root->prev))
+  {
+    _protect_pool(pool);
+    return NULL; /* no free block, so no chance to allocate anything */
+  }
+
+  /* round up to alignment */
+  size = ALIGN(size, 32);
+
+  size_t bruttoSize = size + sizeof(Header);
+  Header * prev = NULL;
+  Header * cur = HDR_PTR(root->prev);
+  for (;;)
+  {
+    size_t curSize = (char*)HDR_PTR(cur->next) - (char*)cur; /* size of the block, including its header */
+
+    if (curSize >= bruttoSize) // it is big enough
+    {
+      // take action for unused space in the free block
+      if (curSize >= bruttoSize + sizeof(Header) * 2)
+      { // the leftover space is big enough to make it a free block
+        // Build a free block from the unused space and insert it into the list of free blocks after the current free block
+        Header * tail = (Header*)((char*)cur + bruttoSize);
+        MARK_NEW_FREE_HDR(tail);
+
+        // update address-order-list
+        tail->next = cur->next;
+        tail->prev = HDR_OFFSET(cur) | 1;
+        HDR_PTR(cur->next)->prev = HDR_OFFSET(tail); // NOTE: We know the next block is used because free blocks are never neighbours. So we don't have to care about the lower bit, which would be set for the prev of a free block.
+        cur->next = HDR_OFFSET(tail);
+
+        // update list of free blocks
+        tail[1].next = cur[1].next;
+        // NOTE: tail[1].prev is updated in the common path below (assignment to "HDR_PTR(cur[1].next)[1].prev")
+
+        if (!isNil(cur[1].next))
+          HDR_PTR(cur[1].next)[1].prev = HDR_OFFSET(tail);
+        cur[1].next = HDR_OFFSET(tail);
+      }
+      else if (curSize > bruttoSize)
+      { // there will be unused space, but not enough to insert a free header
+        internal_assert(curSize - bruttoSize == sizeof(Header)); // unused space must be enough to build a free block or it should be exactly the size of a Header
+        cur->next |= 1; // set marker for "has unused trailing space"
+      }
+      else
+      {
+        internal_assert(curSize == bruttoSize);
+      }
+
+      cur->prev &= NIL; // clear marker for "is a free block"
+
+      // remove from linked list of free blocks
+      if (prev)
+        prev[1].next = cur[1].next;
+      else
+      {
+        uint32_t freeBit = isFree(root);
+        root->prev = (cur[1].next & NIL) | freeBit;
+      }
+
+      if (!isNil(cur[1].next))
+        HDR_PTR(cur[1].next)[1].prev = prev ? HDR_OFFSET(prev) : NIL;
+
+      _yalloc_validate(pool);
+      VALGRIND_MEMPOOL_ALLOC(pool, cur + 1, size);
+      _protect_pool(pool);
+      return cur + 1; // return the address after the header
+    }
+
+    if (isNil(cur[1].next))
+      break;
+
+    prev = cur;
+    cur = HDR_PTR(cur[1].next);
+  }
+
+  _yalloc_validate(pool);
+  _protect_pool(pool);
+  return NULL;
+}
+
+// Removes a block from the free list and moves the pool's first-free-block pointer to its successor if it pointed to that block.
+static void unlink_from_free_list(Header * pool, Header * blk)
+{
+  // update the pool's pointer to the first block in the free list if necessary
+  if (isNil(blk[1].prev))
+  { // the block is the first in the free list
+    // make the pool's first-free pointer point to the next block in the free list
+    uint32_t freeBit = isFree(pool);
+    pool->prev = (blk[1].next & NIL) | freeBit;
+  }
+  else
+    HDR_PTR(blk[1].prev)[1].next = blk[1].next;
+
+  if (!isNil(blk[1].next))
+    HDR_PTR(blk[1].next)[1].prev = blk[1].prev;
+}
+
+size_t yalloc_block_size(void * pool, void * p)
+{
+  Header * a = (Header*)p - 1;
+  UNPROTECT_HDR(a);
+  Header * b = HDR_PTR(a->next);
+  size_t payloadSize = (char*)b - (char*)p;
+  if (isPadded(a))
+    payloadSize -= sizeof(Header);
+  PROTECT_HDR(a);
+  return payloadSize;
+}
+
+void yalloc_free(void * pool_, void * p)
+{
+  assert_is_pool(pool_);
+  assert(!yalloc_defrag_in_progress(pool_));
+  if (!p)
+    return;
+
+  _unprotect_pool(pool_);
+
+  Header * pool = (Header*)pool_;
+  Header * cur = (Header*)p - 1;
+
+  // get pointers to the previous/next block in address order
+  Header * prev = cur == pool || isNil(cur->prev) ? NULL : HDR_PTR(cur->prev);
+  Header * next = isNil(cur->next) ? NULL : HDR_PTR(cur->next);
+
+  int prevFree = prev && isFree(prev);
+  int nextFree = next && isFree(next);
+
+#if USE_VALGRIND
+  {
+    unsigned errs = VALGRIND_COUNT_ERRORS;
+    VALGRIND_MEMPOOL_FREE(pool, p);
+    if (VALGRIND_COUNT_ERRORS > errs)
+    { // early exit if the free was invalid (so we get a valgrind error and don't mess up the pool, which is helpful for testing if invalid frees are detected by valgrind)
+      _protect_pool(pool_);
+      return;
+    }
+  }
+#endif
+
+  _validate_user_ptr(pool_, p);
+
+  if (prevFree && nextFree)
+  { // the freed block has two free neighbors
+    unlink_from_free_list(pool, prev);
+    unlink_from_free_list(pool, next);
+
+    // join prev, cur and next
+    prev->next = next->next;
+    HDR_PTR(next->next)->prev = cur->prev;
+
+    // prev is now the block we want to push onto the free list
+    cur = prev;
+  }
+  else if (prevFree)
+  {
+    unlink_from_free_list(pool, prev);
+
+    // join prev and cur
+    prev->next = cur->next;
+    HDR_PTR(cur->next)->prev = cur->prev;
+
+    // prev is now the block we want to push onto the free list
+    cur = prev;
+  }
+  else if (nextFree)
+  {
+    unlink_from_free_list(pool, next);
+
+    // join cur and next
+    cur->next = next->next;
+    HDR_PTR(next->next)->prev = next->prev & NIL;
+  }
+
+  // if there is a previous block and that block has padding then we want to grow the new free block into that padding
+  if (cur != pool && !isNil(cur->prev))
+  { // there is a previous block
+    Header * left = HDR_PTR(cur->prev);
+    if (isPadded(left))
+    { // the previous block has padding, so extend the new free block to consume the padding
+      Header * grown = cur - 1;
+      MARK_NEW_HDR(grown);
+      grown->next = cur->next;
+      grown->prev = cur->prev;
+      left->next = HDR_OFFSET(grown);
+      if (!isNil(cur->next))
+        HDR_PTR(cur->next)->prev = HDR_OFFSET(grown);
+
+      cur = grown;
+    }
+  }
+
+  cur->prev |= 1; // it becomes a free block
+
cur->next &= NIL; // reset padding-bit + UNPROTECT_HDR(cur + 1); + cur[1].prev = NIL; // it will be the first free block in the free list, so it has no prevFree + + if (!isNil(pool->prev)) + { // the free-list was already non-empty + HDR_PTR(pool->prev)[1].prev = HDR_OFFSET(cur); // make the first entry in the free list point back to the new free block (it will become the first one) + cur[1].next = pool->prev; // the next free block is the first of the old free-list + } + else + cur[1].next = NIL; // free-list was empty, so there is no successor + + VALGRIND_MAKE_MEM_NOACCESS(cur + 2, (char*)HDR_PTR(cur->next) - (char*)(cur + 2)); + + // now the freed block is the first in the free-list + + // update the offset to the first element of the free list + uint32_t freeBit = isFree(pool); // remember the free-bit of the offset + pool->prev = HDR_OFFSET(cur) | freeBit; // update the offset and restore the free-bit + _yalloc_validate(pool); + _protect_pool(pool); +} + +size_t yalloc_count_free(void * pool_) +{ + assert_is_pool(pool_); + _unprotect_pool(pool_); + assert(!_yalloc_defrag_in_progress(pool_)); + Header * pool = (Header*)pool_; + size_t bruttoFree = 0; + Header * cur = pool; + + _yalloc_validate(pool); + + for (;;) + { + if (isFree(cur)) + { // it is a free block + bruttoFree += (char*)HDR_PTR(cur->next) - (char*)cur; + } + else + { // it is a used block + if (isPadded(cur)) + { // the used block is padded + bruttoFree += sizeof(Header); + } + } + + if (isNil(cur->next)) + break; + + cur = HDR_PTR(cur->next); + } + + _protect_pool(pool); + + if (bruttoFree < sizeof(Header)) + { + internal_assert(!bruttoFree); // free space should always be a multiple of sizeof(Header) + return 0; + } + + return bruttoFree - sizeof(Header); +} + +size_t yalloc_count_continuous(void * pool_) +{ + assert_is_pool(pool_); + _unprotect_pool(pool_); + assert(!_yalloc_defrag_in_progress(pool_)); + Header * pool = (Header*)pool_; + size_t largestFree = 0; + Header * cur = pool; + + _yalloc_validate(pool); + + for (;;) + { + if (isFree(cur)) + { // it is a free block + size_t temp = (uintptr_t)HDR_PTR(cur->next) - (uintptr_t)cur; + if(temp > largestFree) + largestFree = temp; + } + + if (isNil(cur->next)) + break; + + cur = HDR_PTR(cur->next); + } + + _protect_pool(pool); + + if (largestFree < sizeof(Header)) + { + internal_assert(!largestFree); // free space should always be a multiple of sizeof(Header) + return 0; + } + + return largestFree - sizeof(Header); +} + +void * yalloc_first_used(void * pool) +{ + assert_is_pool(pool); + _unprotect_pool(pool); + Header * blk = (Header*)pool; + while (!isNil(blk->next)) + { + if (!isFree(blk)) + { + _protect_pool(pool); + return blk + 1; + } + + blk = HDR_PTR(blk->next); + } + + _protect_pool(pool); + return NULL; +} + +void * yalloc_next_used(void * pool, void * p) +{ + assert_is_pool(pool); + _unprotect_pool(pool); + _validate_user_ptr(pool, p); + Header * prev = (Header*)p - 1; + assert(!isNil(prev->next)); // the last block should never end up as input to this function (because it is not user-visible) + + Header * blk = HDR_PTR(prev->next); + while (!isNil(blk->next)) + { + if (!isFree(blk)) + { + _protect_pool(pool); + return blk + 1; + } + + blk = HDR_PTR(blk->next); + } + + _protect_pool(pool); + return NULL; +} + +void yalloc_defrag_start(void * pool_) +{ + assert_is_pool(pool_); + _unprotect_pool(pool_); + assert(!_yalloc_defrag_in_progress(pool_)); + Header * pool = (Header*)pool_; + + // iterate over all blocks in address order and store the post-defragment 
address of used blocks in their "prev" field
+  size_t end = 0; // offset for the next used block
+  Header * blk = (Header*)pool;
+  for (; !isNil(blk->next); blk = HDR_PTR(blk->next))
+  {
+    if (!isFree(blk))
+    { // it is a used block
+      blk->prev = end >> 1;
+      internal_assert((char*)HDR_PTR(blk->prev) == (char*)pool + end);
+
+      size_t bruttoSize = (char*)HDR_PTR(blk->next) - (char*)blk;
+
+      if (isPadded(blk))
+      { // the block is padded
+        bruttoSize -= sizeof(Header);
+      }
+
+      end += bruttoSize;
+      internal_assert(end % sizeof(Header) == 0);
+    }
+  }
+
+  // blk is now the last block (the dummy "used" block at the end of the pool)
+  internal_assert(isNil(blk->next));
+  internal_assert(!isFree(blk));
+
+  // mark the pool as "defragmentation in progress"
+  uint32_t freeBit = isFree(pool);
+  pool->prev = (HDR_OFFSET(blk) & NIL) | freeBit;
+
+  _yalloc_validate(pool);
+  internal_assert(yalloc_defrag_in_progress(pool));
+  _protect_pool(pool);
+}
+
+void * yalloc_defrag_address(void * pool_, void * p)
+{
+  assert_is_pool(pool_);
+  assert(yalloc_defrag_in_progress(pool_));
+  if (!p)
+    return NULL;
+
+  Header * pool = (Header*)pool_;
+
+  _unprotect_pool(pool);
+  _validate_user_ptr(pool_, p);
+
+  if (pool + 1 == p)
+  { // the very first block never moves ("prev" of the first block is repurposed to mark the pool as "defragmentation in progress")
+    _protect_pool(pool);
+    return pool + 1;
+  }
+
+  Header * blk = (Header*)p - 1;
+
+  void * defragP = HDR_PTR(blk->prev) + 1;
+
+  _protect_pool(pool);
+  return defragP;
+}
+
+void yalloc_defrag_commit(void * pool_)
+{
+  assert_is_pool(pool_);
+  _unprotect_pool(pool_);
+  assert(_yalloc_defrag_in_progress(pool_));
+  Header * pool = (Header*)pool_;
+
+  // iterate over all blocks in address order and move them
+  size_t end = 0; // offset for the next used block
+  Header * blk = pool;
+  Header * lastUsed = NULL;
+  while (!isNil(blk->next))
+  {
+    if (!isFree(blk))
+    { // it is a used block
+      size_t bruttoSize = (char*)HDR_PTR(blk->next) - (char*)blk;
+
+      if (isPadded(blk))
+      { // the block is padded
+        bruttoSize -= sizeof(Header);
+      }
+
+      Header * next = HDR_PTR(blk->next);
+
+      blk->prev = lastUsed ? HDR_OFFSET(lastUsed) : NIL;
+      blk->next = (end + bruttoSize) >> 1;
+
+      lastUsed = (Header*)((char*)pool + end);
+      VALGRIND_MAKE_MEM_UNDEFINED(lastUsed, (char*)blk - (char*)lastUsed);
+      memmove(lastUsed, blk, bruttoSize);
+      VALGRIND_MEMPOOL_CHANGE(pool, blk + 1, lastUsed + 1, bruttoSize - sizeof(Header));
+
+      end += bruttoSize;
+      blk = next;
+    }
+    else
+      blk = HDR_PTR(blk->next);
+  }
+
+  // blk is now the last block (the dummy "used" block at the end of the pool)
+  internal_assert(isNil(blk->next));
+  internal_assert(!isFree(blk));
+
+  if (lastUsed)
+  {
+    Header * gap = HDR_PTR(lastUsed->next);
+    if (gap == blk)
+    { // there is no gap
+      pool->prev = NIL; // the free list is empty
+      blk->prev = HDR_OFFSET(lastUsed);
+    }
+    else if (blk - gap > 1)
+    { // the gap is big enough for a free Header
+
+      // set up a free list that contains the gap as its only element
+      gap->prev = HDR_OFFSET(lastUsed) | 1;
+      gap->next = HDR_OFFSET(blk);
+      gap[1].prev = NIL;
+      gap[1].next = NIL;
+      pool->prev = blk->prev = HDR_OFFSET(gap);
+    }
+    else
+    { // there is a gap, but it is too small to be used as a free-list node, so just make it padding of the last used block
+      lastUsed->next = HDR_OFFSET(blk) | 1;
+      pool->prev = NIL;
+      blk->prev = HDR_OFFSET(lastUsed);
+    }
+  }
+  else
+  { // the pool is empty
+    pool->prev = 1;
+  }
+
+  internal_assert(!_yalloc_defrag_in_progress(pool));
+  _yalloc_validate(pool);
+  _protect_pool(pool);
+}
diff --git a/third_party/gldc/src/yalloc/yalloc.h b/third_party/gldc/src/yalloc/yalloc.h
new file mode 100644
index 000000000..4349eb96b
--- /dev/null
+++ b/third_party/gldc/src/yalloc/yalloc.h
@@ -0,0 +1,176 @@
+/**
+@file
+
+API of the yalloc allocator.
+*/
+
+#ifndef YALLOC_H
+#define YALLOC_H
+
+#include <stddef.h>
+
+/**
+Maximum supported pool size. yalloc_init() will fail for larger pools.
+*/
+#define MAX_POOL_SIZE ((2 << 24) - 4)
+
+/**
+Creates a pool inside a given buffer.
+
+Pools must be deinitialized with yalloc_deinit() when they are no longer needed.
+
+@param pool The starting address of the pool. It must have at least 16bit
+alignment (the internal structure uses 16bit integers). Allocations are placed
+at 32bit boundaries starting from this address, so if the user data should be
+32bit aligned then this address has to be 32bit aligned. Typically an address
+of static memory, or an array on the stack, is used if the pool is only used
+temporarily.
+@param size Size of the pool.
+@return 0 on success, nonzero if the size is not supported.
+ */
+int yalloc_init(void * pool, size_t size);
+
+/**
+Deinitializes the buffer that is used by the pool and makes it available for other use.
+
+The content of the buffer is undefined after this.
+
+@param pool The starting address of an initialized pool.
+*/
+void yalloc_deinit(void * pool);
+
+/**
+Allocates a block of memory from a pool.
+
+This function mimics malloc().
+
+The pool must not be in the "defragmenting" state when this function is called.
+
+@param pool The starting address of an initialized pool.
+@param size Number of bytes to allocate.
+@return Allocated buffer or \c NULL if there was no free range that could serve
+the allocation. See @ref yalloc_defrag_start() for a way to remove
+fragmentation, which may cause allocations to fail even when there is enough
+space in total.
+*/
+void * yalloc_alloc(void * pool, size_t size);
+
+/**
+Returns an allocation to a pool.
+
+This function mimics free().
+
+The pool must not be in the "defragmenting" state when this function is called.
+
+@param pool The starting address of the initialized pool the allocation comes from.
+@param p An address that was returned from yalloc_alloc() of the same pool.
+*/
+void yalloc_free(void * pool, void * p);
+
+/**
+Returns the maximum size of a successful allocation (assuming a completely unfragmented heap).
+
+After defragmentation the first allocation with the returned size is guaranteed to succeed.
+
+@param pool The starting address of an initialized pool.
+@return Number of bytes that can be allocated (assuming the pool is defragmented).
+*/
+size_t yalloc_count_free(void * pool);
+
+/**
+Returns the size of the largest continuous free area.
+
+@param pool The starting address of an initialized pool.
+@return Number of free bytes that exist continuously.
+*/
+size_t yalloc_count_continuous(void * pool);
+
+/**
+Queries the usable size of an allocated block.
+
+@param pool The starting address of the initialized pool the allocation comes from.
+@param p An address that was returned from yalloc_alloc() of the same pool.
+@return Size of the memory block. This is the size passed to @ref yalloc_alloc() rounded up to 4.
+*/
+size_t yalloc_block_size(void * pool, void * p);
+
+/**
+Finds the first (in address order) allocation of a pool.
+
+@param pool The starting address of an initialized pool.
+@return Address of the allocation with the lowest address inside the pool (this
+is what @ref yalloc_alloc() returned), or \c NULL if there is no used block.
+*/
+void * yalloc_first_used(void * pool);
+
+/**
+Given a pointer to an allocation, finds the next (in address order) used block of a pool.
+
+@param pool The starting address of the initialized pool the allocation comes from.
+@param p Pointer to an allocation in that pool, typically from a previous
+call to @ref yalloc_first_used().
+*/
+void * yalloc_next_used(void * pool, void * p);
+
+/**
+Starts defragmentation for a pool.
+
+Allocations will stay where they are. But the pool is put into the "defragmenting"
+state (see @ref yalloc_defrag_in_progress()).
+
+The pool must not be in the "defragmenting" state when this function is called.
+The pool is put into the "defragmenting" state by this function.
+
+@param pool The starting address of an initialized pool.
+*/
+void yalloc_defrag_start(void * pool);
+
+/**
+Returns the address that an allocation will have after @ref yalloc_defrag_commit() is called.
+
+The pool must be in the "defragmenting" state when this function is called.
+
+@param pool The starting address of the initialized pool the allocation comes from.
+@param p Pointer to an allocation in that pool.
+@return The address the allocation will have after @ref yalloc_defrag_commit() is called.
+*/
+void * yalloc_defrag_address(void * pool, void * p);
+
+/**
+Finishes the defragmentation.
+
+The content of all allocations in the pool will be moved to the address that
+was reported by @ref yalloc_defrag_address(). The pool will then have only one
+free block. This means that a yalloc_alloc(pool, yalloc_count_free(pool))
+will succeed.
+
+The pool must be in the "defragmenting" state when this function is called. The
+pool is put back into the normal state by this function.
+
+@param pool The starting address of an initialized pool.
+*/
+void yalloc_defrag_commit(void * pool);
+
+/**
+Tells if the pool is in the "defragmenting" state (after a @ref yalloc_defrag_start() and before a @ref yalloc_defrag_commit()).
+
+@param pool The starting address of an initialized pool.
+@return Nonzero if the pool is currently in the "defragmenting" state.
+*/
+int yalloc_defrag_in_progress(void * pool);
+
+
+/**
+Helper function that dumps the state of the pool to stdout.
+
+This function is only available if built with yalloc_dump.c. This
+function only exists for debugging purposes and can be ignored by normal users
+that are not interested in the internal structure of the implementation.
+
+@param pool The starting address of an initialized pool.
+@param name A string that is used as "Title" for the output.
+*/
+void yalloc_dump(void * pool, char * name);
+
+
+#endif // YALLOC_H
diff --git a/third_party/gldc/src/yalloc/yalloc_dump.c b/third_party/gldc/src/yalloc/yalloc_dump.c
new file mode 100644
index 000000000..f0dfdcbcd
--- /dev/null
+++ b/third_party/gldc/src/yalloc/yalloc_dump.c
@@ -0,0 +1,39 @@
+#include "yalloc_internals.h"
+
+#include <stdio.h>
+
+static void printOffset(void * pool, char * name, uint32_t offset)
+{
+  if (isNil(offset))
+    printf("  %s: nil\n", name);
+  else
+    printf("  %s: %td\n", name, (char*)HDR_PTR(offset) - (char*)pool);
+}
+
+void yalloc_dump(void * pool, char * name)
+{
+  printf("---- %s ----\n", name);
+  Header * cur = (Header*)pool;
+  for (;;)
+  {
+    printf(isFree(cur) ? "%td: free @%p\n" : "%td: used @%p\n", (char*)cur - (char*)pool, cur);
+    printOffset(pool, cur == pool ? "first free" : "prev", cur->prev);
+    printOffset(pool, "next", cur->next);
+    if (isFree(cur))
+    {
+      printOffset(pool, "prevFree", cur[1].prev);
+      printOffset(pool, "nextFree", cur[1].next);
+    }
+    else
+      printf("  payload includes padding: %i\n", isPadded(cur));
+
+    if (isNil(cur->next))
+      break;
+
+    printf("  %td bytes payload\n", (char*)HDR_PTR(cur->next) - (char*)cur - sizeof(Header));
+
+    cur = HDR_PTR(cur->next);
+  }
+
+  fflush(stdout);
+}
diff --git a/third_party/gldc/src/yalloc/yalloc_internals.h b/third_party/gldc/src/yalloc/yalloc_internals.h
new file mode 100644
index 000000000..ffb70cb9e
--- /dev/null
+++ b/third_party/gldc/src/yalloc/yalloc_internals.h
@@ -0,0 +1,63 @@
+#ifndef YALLOC_INTERNALS_H
+#define YALLOC_INTERNALS_H
+
+#include <stdint.h>
+
+typedef struct
+{
+  uint32_t prev; // low bit set if free
+  uint32_t next; // for used blocks: low bit set if unused header at the end
+
+  /* We need user data to be 32-byte aligned, so the header needs
+   * to be 32 bytes in size (as user data follows the header) */
+  uint8_t padding[32 - (sizeof(uint32_t) * 2)];
+} Header;
+
+// NOTE: Data is 32bit aligned and offsets are stored in 32bit fields whose lowest bit is used as a flag. So we remove the low bit and shift left by 1 to turn an offset back into a byte distance.
+
+#define NIL 0xFFFFFFFEu
+
+// return Header-address for a prev/next
+#define HDR_PTR(offset) ((Header*)((char*)pool + (((offset) & NIL)<<1)))
+
+// return a prev/next for a Header-address
+#define HDR_OFFSET(blockPtr) ((uint32_t)(((char*)blockPtr - (char*)pool) >> 1))
+
+#ifndef YALLOC_INTERNAL_VALIDATE
+# ifdef NDEBUG
+#  define YALLOC_INTERNAL_VALIDATE 0
+# else
+#  define YALLOC_INTERNAL_VALIDATE 1
+# endif
+#endif
+
+
+/*
+internal_assert() is used in some places to check internal expectations.
+Activate this if you modify the code to detect problems as early as possible.
+In other cases it should stay deactivated.
+*/ +#if 0 +#define internal_assert assert +#else +#define internal_assert(condition)((void) 0) +#endif + +// detects offsets that point nowhere +static inline int isNil(uint32_t offset) +{ + return (offset | 1) == 0xFFFFFFFF; +} + +static inline int isFree(Header * hdr) +{ + return hdr->prev & 1; +} + +static inline int isPadded(Header * hdr) +{ + return hdr->next & 1; +} + + +#endif // YALLOC_INTERNALS_H
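+
+// A final illustration of the invariant the README calls out ("think carefully
+// if you have to preserve the low bit when updating an offset"). A hedged,
+// hypothetical fragment - blk and newPrev are placeholder Header pointers, and
+// HDR_PTR()/HDR_OFFSET() expect a variable named pool in scope - mirroring the
+// flag-preserving update idiom used throughout yalloc.c:
+//
+//   uint32_t freeBit = blk->prev & 1;          // same test as isFree(blk)
+//   blk->prev = HDR_OFFSET(newPrev) | freeBit; // offset bits plus restored flag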