From aa955b8ae28314ae061613f0ddf13183a98aca03 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Sat, 23 Dec 2023 20:38:41 +1000 Subject: [PATCH] GPU/SW: Split out rasterizer, add dynamic selection --- src/core/CMakeLists.txt | 2 + src/core/core.vcxproj | 11 + src/core/core.vcxproj.filters | 6 + src/core/gpu_backend.cpp | 6 +- src/core/gpu_backend.h | 4 +- src/core/gpu_sw.cpp | 1 + src/core/gpu_sw_backend.cpp | 750 +--------------- src/core/gpu_sw_backend.h | 142 +-- src/core/gpu_sw_rasterizer.cpp | 100 +++ src/core/gpu_sw_rasterizer.h | 89 ++ src/core/gpu_sw_rasterizer.inl | 1250 +++++++++++++++++++++++++++ src/core/gpu_sw_rasterizer_avx2.cpp | 12 + src/core/gpu_types.h | 1 + 13 files changed, 1493 insertions(+), 881 deletions(-) create mode 100644 src/core/gpu_sw_rasterizer.cpp create mode 100644 src/core/gpu_sw_rasterizer.h create mode 100644 src/core/gpu_sw_rasterizer.inl create mode 100644 src/core/gpu_sw_rasterizer_avx2.cpp diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 2297591e8..a0a9662e7 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -57,6 +57,8 @@ add_library(core gpu_sw.h gpu_sw_backend.cpp gpu_sw_backend.h + gpu_sw_rasterizer.cpp + gpu_sw_rasterizer.h gpu_types.h guncon.cpp guncon.h diff --git a/src/core/core.vcxproj b/src/core/core.vcxproj index 7c11148c2..2144c2873 100644 --- a/src/core/core.vcxproj +++ b/src/core/core.vcxproj @@ -50,6 +50,13 @@ + + + AdvancedVectorExtensions2 + %(AdditionalOptions) -mavx2 + true + NotUsing + @@ -127,6 +134,7 @@ + @@ -195,6 +203,9 @@ {57f6206d-f264-4b07-baf8-11b9bbe1f455} + + + {868B98C8-65A1-494B-8346-250A73A48C0A} diff --git a/src/core/core.vcxproj.filters b/src/core/core.vcxproj.filters index b089e83f1..f623ed9f2 100644 --- a/src/core/core.vcxproj.filters +++ b/src/core/core.vcxproj.filters @@ -67,6 +67,8 @@ + + @@ -140,5 +142,9 @@ + + + + \ No newline at end of file diff --git a/src/core/gpu_backend.cpp b/src/core/gpu_backend.cpp index 367236e05..57508ff78 100644 --- a/src/core/gpu_backend.cpp +++ b/src/core/gpu_backend.cpp @@ -26,7 +26,7 @@ bool GPUBackend::Initialize(bool force_thread) void GPUBackend::Reset() { Sync(true); - m_drawing_area = {}; + DrawingAreaChanged(GPUDrawingArea{0, 0, 0, 0}, GSVector4i::zero()); } void GPUBackend::UpdateSettings() @@ -310,8 +310,8 @@ void GPUBackend::HandleCommand(const GPUBackendCommand* cmd) case GPUBackendCommandType::SetDrawingArea: { FlushRender(); - m_drawing_area = static_cast(cmd)->new_area; - DrawingAreaChanged(); + const GPUBackendSetDrawingAreaCommand* ccmd = static_cast(cmd); + DrawingAreaChanged(ccmd->new_area, GSVector4i::load(ccmd->new_clamped_area)); } break; diff --git a/src/core/gpu_backend.h b/src/core/gpu_backend.h index 67854dc5a..b58df358c 100644 --- a/src/core/gpu_backend.h +++ b/src/core/gpu_backend.h @@ -62,13 +62,11 @@ protected: virtual void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd) = 0; virtual void DrawLine(const GPUBackendDrawLineCommand* cmd) = 0; virtual void FlushRender() = 0; - virtual void DrawingAreaChanged() = 0; + virtual void DrawingAreaChanged(const GPUDrawingArea& new_drawing_area, const GSVector4i clamped_drawing_area) = 0; virtual void UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit) = 0; void HandleCommand(const GPUBackendCommand* cmd); - GPUDrawingArea m_drawing_area = {}; - Threading::KernelSemaphore m_sync_semaphore; std::atomic_bool m_gpu_thread_sleeping{false}; std::atomic_bool m_gpu_loop_done{false}; diff --git a/src/core/gpu_sw.cpp b/src/core/gpu_sw.cpp index e2f9110a8..56a88e779 100644 --- 
a/src/core/gpu_sw.cpp +++ b/src/core/gpu_sw.cpp @@ -501,6 +501,7 @@ void GPU_SW::DispatchRenderCommand() { GPUBackendSetDrawingAreaCommand* cmd = m_backend.NewSetDrawingAreaCommand(); cmd->new_area = m_drawing_area; + GSVector4i::store(cmd->new_clamped_area, m_clamped_drawing_area); m_backend.PushCommand(cmd); m_drawing_area_changed = false; } diff --git a/src/core/gpu_sw_backend.cpp b/src/core/gpu_sw_backend.cpp index 0f4def66d..cbf547d88 100644 --- a/src/core/gpu_sw_backend.cpp +++ b/src/core/gpu_sw_backend.cpp @@ -3,6 +3,7 @@ #include "gpu_sw_backend.h" #include "gpu.h" +#include "gpu_sw_rasterizer.h" #include "system.h" #include "util/gpu_device.h" @@ -15,6 +16,8 @@ GPU_SW_Backend::~GPU_SW_Backend() = default; bool GPU_SW_Backend::Initialize(bool force_thread) { + GPU_SW_Rasterizer::SelectImplementation(); + return GPUBackend::Initialize(force_thread); } @@ -28,688 +31,31 @@ void GPU_SW_Backend::DrawPolygon(const GPUBackendDrawPolygonCommand* cmd) const GPURenderCommand rc{cmd->rc.bits}; const bool dithering_enable = rc.IsDitheringEnabled() && cmd->draw_mode.dither_enable; - const DrawTriangleFunction DrawFunction = GetDrawTriangleFunction( + const GPU_SW_Rasterizer::DrawTriangleFunction DrawFunction = GPU_SW_Rasterizer::GetDrawTriangleFunction( rc.shading_enable, rc.texture_enable, rc.raw_texture_enable, rc.transparency_enable, dithering_enable); - (this->*DrawFunction)(cmd, &cmd->vertices[0], &cmd->vertices[1], &cmd->vertices[2]); + DrawFunction(cmd, &cmd->vertices[0], &cmd->vertices[1], &cmd->vertices[2]); if (rc.quad_polygon) - (this->*DrawFunction)(cmd, &cmd->vertices[2], &cmd->vertices[1], &cmd->vertices[3]); + DrawFunction(cmd, &cmd->vertices[2], &cmd->vertices[1], &cmd->vertices[3]); } void GPU_SW_Backend::DrawRectangle(const GPUBackendDrawRectangleCommand* cmd) { const GPURenderCommand rc{cmd->rc.bits}; - const DrawRectangleFunction DrawFunction = - GetDrawRectangleFunction(rc.texture_enable, rc.raw_texture_enable, rc.transparency_enable); + const GPU_SW_Rasterizer::DrawRectangleFunction DrawFunction = + GPU_SW_Rasterizer::GetDrawRectangleFunction(rc.texture_enable, rc.raw_texture_enable, rc.transparency_enable); - (this->*DrawFunction)(cmd); + DrawFunction(cmd); } void GPU_SW_Backend::DrawLine(const GPUBackendDrawLineCommand* cmd) { - const DrawLineFunction DrawFunction = - GetDrawLineFunction(cmd->rc.shading_enable, cmd->rc.transparency_enable, cmd->IsDitheringEnabled()); + const GPU_SW_Rasterizer::DrawLineFunction DrawFunction = GPU_SW_Rasterizer::GetDrawLineFunction( + cmd->rc.shading_enable, cmd->rc.transparency_enable, cmd->IsDitheringEnabled()); for (u16 i = 1; i < cmd->num_vertices; i++) - (this->*DrawFunction)(cmd, &cmd->vertices[i - 1], &cmd->vertices[i]); -} - -constexpr GPU_SW_Backend::DitherLUT GPU_SW_Backend::ComputeDitherLUT() -{ - DitherLUT lut = {}; - for (u32 i = 0; i < DITHER_MATRIX_SIZE; i++) - { - for (u32 j = 0; j < DITHER_MATRIX_SIZE; j++) - { - for (u32 value = 0; value < DITHER_LUT_SIZE; value++) - { - const s32 dithered_value = (static_cast(value) + DITHER_MATRIX[i][j]) >> 3; - lut[i][j][value] = static_cast((dithered_value < 0) ? 0 : ((dithered_value > 31) ? 
31 : dithered_value)); - } - } - } - return lut; -} - -static constexpr GPU_SW_Backend::DitherLUT s_dither_lut = GPU_SW_Backend::ComputeDitherLUT(); - -template -void ALWAYS_INLINE_RELEASE GPU_SW_Backend::ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y, u8 color_r, - u8 color_g, u8 color_b, u8 texcoord_x, u8 texcoord_y) -{ - VRAMPixel color; - if constexpr (texture_enable) - { - // Apply texture window - texcoord_x = (texcoord_x & cmd->window.and_x) | cmd->window.or_x; - texcoord_y = (texcoord_y & cmd->window.and_y) | cmd->window.or_y; - - VRAMPixel texture_color; - switch (cmd->draw_mode.texture_mode) - { - case GPUTextureMode::Palette4Bit: - { - const u16 palette_value = - GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x / 4)) % VRAM_WIDTH, - (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT); - const size_t palette_index = (palette_value >> ((texcoord_x % 4) * 4)) & 0x0Fu; - texture_color.bits = g_gpu_clut[palette_index]; - } - break; - - case GPUTextureMode::Palette8Bit: - { - const u16 palette_value = - GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x / 2)) % VRAM_WIDTH, - (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT); - const size_t palette_index = (palette_value >> ((texcoord_x % 2) * 8)) & 0xFFu; - texture_color.bits = g_gpu_clut[palette_index]; - } - break; - - default: - { - texture_color.bits = GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x)) % VRAM_WIDTH, - (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT); - } - break; - } - - if (texture_color.bits == 0) - return; - - if constexpr (raw_texture_enable) - { - color.bits = texture_color.bits; - } - else - { - const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u; - const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u; - - color.bits = (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.r) * u16(color_r)) >> 4]) << 0) | - (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.g) * u16(color_g)) >> 4]) << 5) | - (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.b) * u16(color_b)) >> 4]) << 10) | - (texture_color.bits & 0x8000u); - } - } - else - { - const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u; - const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u; - - // Non-textured transparent polygons don't set bit 15, but are treated as transparent. - color.bits = (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_r]) << 0) | - (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_g]) << 5) | - (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_b]) << 10) | (transparency_enable ? 0x8000u : 0); - } - - const VRAMPixel bg_color{GetPixel(static_cast(x), static_cast(y))}; - if constexpr (transparency_enable) - { - if (color.bits & 0x8000u || !texture_enable) - { - // Based on blargg's efficient 15bpp pixel math. 
- u32 bg_bits = ZeroExtend32(bg_color.bits); - u32 fg_bits = ZeroExtend32(color.bits); - switch (cmd->draw_mode.transparency_mode) - { - case GPUTransparencyMode::HalfBackgroundPlusHalfForeground: - { - bg_bits |= 0x8000u; - color.bits = Truncate16(((fg_bits + bg_bits) - ((fg_bits ^ bg_bits) & 0x0421u)) >> 1); - } - break; - - case GPUTransparencyMode::BackgroundPlusForeground: - { - bg_bits &= ~0x8000u; - - const u32 sum = fg_bits + bg_bits; - const u32 carry = (sum - ((fg_bits ^ bg_bits) & 0x8421u)) & 0x8420u; - - color.bits = Truncate16((sum - carry) | (carry - (carry >> 5))); - } - break; - - case GPUTransparencyMode::BackgroundMinusForeground: - { - bg_bits |= 0x8000u; - fg_bits &= ~0x8000u; - - const u32 diff = bg_bits - fg_bits + 0x108420u; - const u32 borrow = (diff - ((bg_bits ^ fg_bits) & 0x108420u)) & 0x108420u; - - color.bits = Truncate16((diff - borrow) & (borrow - (borrow >> 5))); - } - break; - - case GPUTransparencyMode::BackgroundPlusQuarterForeground: - { - bg_bits &= ~0x8000u; - fg_bits = ((fg_bits >> 2) & 0x1CE7u) | 0x8000u; - - const u32 sum = fg_bits + bg_bits; - const u32 carry = (sum - ((fg_bits ^ bg_bits) & 0x8421u)) & 0x8420u; - - color.bits = Truncate16((sum - carry) | (carry - (carry >> 5))); - } - break; - - default: - break; - } - - // See above. - if constexpr (!texture_enable) - color.bits &= ~0x8000u; - } - } - - const u16 mask_and = cmd->params.GetMaskAND(); - if ((bg_color.bits & mask_and) != 0) - return; - - DebugAssert(static_cast(x) < VRAM_WIDTH && static_cast(y) < VRAM_HEIGHT); - SetPixel(static_cast(x), static_cast(y), color.bits | cmd->params.GetMaskOR()); -} - -template -void GPU_SW_Backend::DrawRectangle(const GPUBackendDrawRectangleCommand* cmd) -{ - const s32 origin_x = cmd->x; - const s32 origin_y = cmd->y; - const auto [r, g, b] = UnpackColorRGB24(cmd->color); - const auto [origin_texcoord_x, origin_texcoord_y] = UnpackTexcoord(cmd->texcoord); - - for (u32 offset_y = 0; offset_y < cmd->height; offset_y++) - { - const s32 y = origin_y + static_cast(offset_y); - if (y < static_cast(m_drawing_area.top) || y > static_cast(m_drawing_area.bottom) || - (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast(y)) & 1u))) - { - continue; - } - - const u32 draw_y = static_cast(y) & VRAM_HEIGHT_MASK; - const u8 texcoord_y = Truncate8(ZeroExtend32(origin_texcoord_y) + offset_y); - - for (u32 offset_x = 0; offset_x < cmd->width; offset_x++) - { - const s32 x = origin_x + static_cast(offset_x); - if (x < static_cast(m_drawing_area.left) || x > static_cast(m_drawing_area.right)) - continue; - - const u8 texcoord_x = Truncate8(ZeroExtend32(origin_texcoord_x) + offset_x); - - ShadePixel(cmd, static_cast(x), draw_y, r, g, - b, texcoord_x, texcoord_y); - } - } -} - -////////////////////////////////////////////////////////////////////////// -// Polygon and line rasterization ported from Mednafen -////////////////////////////////////////////////////////////////////////// - -#define COORD_FBS 12 -#define COORD_MF_INT(n) ((n) << COORD_FBS) -#define COORD_POST_PADDING 12 - -static ALWAYS_INLINE_RELEASE s64 MakePolyXFP(s32 x) -{ - return ((u64)x << 32) + ((1ULL << 32) - (1 << 11)); -} - -static ALWAYS_INLINE_RELEASE s64 MakePolyXFPStep(s32 dx, s32 dy) -{ - s64 ret; - s64 dx_ex = (u64)dx << 32; - - if (dx_ex < 0) - dx_ex -= dy - 1; - - if (dx_ex > 0) - dx_ex += dy - 1; - - ret = dx_ex / dy; - - return (ret); -} - -static ALWAYS_INLINE_RELEASE s32 GetPolyXFP_Int(s64 xfp) -{ - return (xfp >> 32); -} - -template -bool 
ALWAYS_INLINE_RELEASE GPU_SW_Backend::CalcIDeltas(i_deltas& idl, const GPUBackendDrawPolygonCommand::Vertex* A, - const GPUBackendDrawPolygonCommand::Vertex* B, - const GPUBackendDrawPolygonCommand::Vertex* C) -{ -#define CALCIS(x, y) (((B->x - A->x) * (C->y - B->y)) - ((C->x - B->x) * (B->y - A->y))) - - s32 denom = CALCIS(x, y); - - if (!denom) - return false; - - if constexpr (shading_enable) - { - idl.dr_dx = (u32)(CALCIS(r, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING; - idl.dr_dy = (u32)(CALCIS(x, r) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING; - - idl.dg_dx = (u32)(CALCIS(g, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING; - idl.dg_dy = (u32)(CALCIS(x, g) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING; - - idl.db_dx = (u32)(CALCIS(b, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING; - idl.db_dy = (u32)(CALCIS(x, b) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING; - } - - if constexpr (texture_enable) - { - idl.du_dx = (u32)(CALCIS(u, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING; - idl.du_dy = (u32)(CALCIS(x, u) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING; - - idl.dv_dx = (u32)(CALCIS(v, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING; - idl.dv_dy = (u32)(CALCIS(x, v) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING; - } - - return true; - -#undef CALCIS -} - -template -void ALWAYS_INLINE_RELEASE GPU_SW_Backend::AddIDeltas_DX(i_group& ig, const i_deltas& idl, u32 count /*= 1*/) -{ - if constexpr (shading_enable) - { - ig.r += idl.dr_dx * count; - ig.g += idl.dg_dx * count; - ig.b += idl.db_dx * count; - } - - if constexpr (texture_enable) - { - ig.u += idl.du_dx * count; - ig.v += idl.dv_dx * count; - } -} - -template -void ALWAYS_INLINE_RELEASE GPU_SW_Backend::AddIDeltas_DY(i_group& ig, const i_deltas& idl, u32 count /*= 1*/) -{ - if constexpr (shading_enable) - { - ig.r += idl.dr_dy * count; - ig.g += idl.dg_dy * count; - ig.b += idl.db_dy * count; - } - - if constexpr (texture_enable) - { - ig.u += idl.du_dy * count; - ig.v += idl.dv_dy * count; - } -} - -template -void GPU_SW_Backend::DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, i_group ig, - const i_deltas& idl) -{ - if (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast(y)) & 1u)) - return; - - s32 x_ig_adjust = x_start; - s32 w = x_bound - x_start; - s32 x = TruncateGPUVertexPosition(x_start); - - if (x < static_cast(m_drawing_area.left)) - { - s32 delta = static_cast(m_drawing_area.left) - x; - x_ig_adjust += delta; - x += delta; - w -= delta; - } - - if ((x + w) > (static_cast(m_drawing_area.right) + 1)) - w = static_cast(m_drawing_area.right) + 1 - x; - - if (w <= 0) - return; - - AddIDeltas_DX(ig, idl, x_ig_adjust); - AddIDeltas_DY(ig, idl, y); - - do - { - const u32 r = ig.r >> (COORD_FBS + COORD_POST_PADDING); - const u32 g = ig.g >> (COORD_FBS + COORD_POST_PADDING); - const u32 b = ig.b >> (COORD_FBS + COORD_POST_PADDING); - const u32 u = ig.u >> (COORD_FBS + COORD_POST_PADDING); - const u32 v = ig.v >> (COORD_FBS + COORD_POST_PADDING); - - ShadePixel( - cmd, static_cast(x), static_cast(y), Truncate8(r), Truncate8(g), Truncate8(b), Truncate8(u), - Truncate8(v)); - - x++; - AddIDeltas_DX(ig, idl); - } while (--w > 0); -} - -template -void GPU_SW_Backend::DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, - const GPUBackendDrawPolygonCommand::Vertex* v0, - const GPUBackendDrawPolygonCommand::Vertex* v1, - const GPUBackendDrawPolygonCommand::Vertex* v2) -{ - u32 core_vertex; - { - u32 cvtemp = 0; 
- - if (v1->x <= v0->x) - { - if (v2->x <= v1->x) - cvtemp = (1 << 2); - else - cvtemp = (1 << 1); - } - else if (v2->x < v0->x) - cvtemp = (1 << 2); - else - cvtemp = (1 << 0); - - if (v2->y < v1->y) - { - std::swap(v2, v1); - cvtemp = ((cvtemp >> 1) & 0x2) | ((cvtemp << 1) & 0x4) | (cvtemp & 0x1); - } - - if (v1->y < v0->y) - { - std::swap(v1, v0); - cvtemp = ((cvtemp >> 1) & 0x1) | ((cvtemp << 1) & 0x2) | (cvtemp & 0x4); - } - - if (v2->y < v1->y) - { - std::swap(v2, v1); - cvtemp = ((cvtemp >> 1) & 0x2) | ((cvtemp << 1) & 0x4) | (cvtemp & 0x1); - } - - core_vertex = cvtemp >> 1; - } - - if (v0->y == v2->y) - return; - - if (static_cast(std::abs(v2->x - v0->x)) >= MAX_PRIMITIVE_WIDTH || - static_cast(std::abs(v2->x - v1->x)) >= MAX_PRIMITIVE_WIDTH || - static_cast(std::abs(v1->x - v0->x)) >= MAX_PRIMITIVE_WIDTH || - static_cast(v2->y - v0->y) >= MAX_PRIMITIVE_HEIGHT) - { - return; - } - - s64 base_coord = MakePolyXFP(v0->x); - s64 base_step = MakePolyXFPStep((v2->x - v0->x), (v2->y - v0->y)); - s64 bound_coord_us; - s64 bound_coord_ls; - bool right_facing; - - if (v1->y == v0->y) - { - bound_coord_us = 0; - right_facing = (bool)(v1->x > v0->x); - } - else - { - bound_coord_us = MakePolyXFPStep((v1->x - v0->x), (v1->y - v0->y)); - right_facing = (bool)(bound_coord_us > base_step); - } - - if (v2->y == v1->y) - bound_coord_ls = 0; - else - bound_coord_ls = MakePolyXFPStep((v2->x - v1->x), (v2->y - v1->y)); - - i_deltas idl; - if (!CalcIDeltas(idl, v0, v1, v2)) - return; - - const GPUBackendDrawPolygonCommand::Vertex* vertices[3] = {v0, v1, v2}; - - i_group ig; - if constexpr (texture_enable) - { - ig.u = (COORD_MF_INT(vertices[core_vertex]->u) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING; - ig.v = (COORD_MF_INT(vertices[core_vertex]->v) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING; - } - - ig.r = (COORD_MF_INT(vertices[core_vertex]->r) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING; - ig.g = (COORD_MF_INT(vertices[core_vertex]->g) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING; - ig.b = (COORD_MF_INT(vertices[core_vertex]->b) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING; - - AddIDeltas_DX(ig, idl, -vertices[core_vertex]->x); - AddIDeltas_DY(ig, idl, -vertices[core_vertex]->y); - - struct TriangleHalf - { - u64 x_coord[2]; - u64 x_step[2]; - - s32 y_coord; - s32 y_bound; - - bool dec_mode; - } tripart[2]; - - u32 vo = 0; - u32 vp = 0; - if (core_vertex != 0) - vo = 1; - if (core_vertex == 2) - vp = 3; - - { - TriangleHalf* tp = &tripart[vo]; - tp->y_coord = vertices[0 ^ vo]->y; - tp->y_bound = vertices[1 ^ vo]->y; - tp->x_coord[right_facing] = MakePolyXFP(vertices[0 ^ vo]->x); - tp->x_step[right_facing] = bound_coord_us; - tp->x_coord[!right_facing] = base_coord + ((vertices[vo]->y - vertices[0]->y) * base_step); - tp->x_step[!right_facing] = base_step; - tp->dec_mode = vo; - } - - { - TriangleHalf* tp = &tripart[vo ^ 1]; - tp->y_coord = vertices[1 ^ vp]->y; - tp->y_bound = vertices[2 ^ vp]->y; - tp->x_coord[right_facing] = MakePolyXFP(vertices[1 ^ vp]->x); - tp->x_step[right_facing] = bound_coord_ls; - tp->x_coord[!right_facing] = - base_coord + ((vertices[1 ^ vp]->y - vertices[0]->y) * - base_step); // base_coord + ((vertices[1].y - vertices[0].y) * base_step); - tp->x_step[!right_facing] = base_step; - tp->dec_mode = vp; - } - - for (u32 i = 0; i < 2; i++) - { - s32 yi = tripart[i].y_coord; - s32 yb = tripart[i].y_bound; - - u64 lc = tripart[i].x_coord[0]; - u64 ls = tripart[i].x_step[0]; - - u64 rc = tripart[i].x_coord[1]; - u64 rs = tripart[i].x_step[1]; - - if 
(tripart[i].dec_mode) - { - while (yi > yb) - { - yi--; - lc -= ls; - rc -= rs; - - s32 y = TruncateGPUVertexPosition(yi); - - if (y < static_cast(m_drawing_area.top)) - break; - - if (y > static_cast(m_drawing_area.bottom)) - continue; - - DrawSpan( - cmd, y & VRAM_HEIGHT_MASK, GetPolyXFP_Int(lc), GetPolyXFP_Int(rc), ig, idl); - } - } - else - { - while (yi < yb) - { - s32 y = TruncateGPUVertexPosition(yi); - - if (y > static_cast(m_drawing_area.bottom)) - break; - - if (y >= static_cast(m_drawing_area.top)) - { - DrawSpan( - cmd, y & VRAM_HEIGHT_MASK, GetPolyXFP_Int(lc), GetPolyXFP_Int(rc), ig, idl); - } - - yi++; - lc += ls; - rc += rs; - } - } - } -} - -enum -{ - Line_XY_FractBits = 32 -}; -enum -{ - Line_RGB_FractBits = 12 -}; - -struct line_fxp_coord -{ - u64 x, y; - u32 r, g, b; -}; - -struct line_fxp_step -{ - s64 dx_dk, dy_dk; - s32 dr_dk, dg_dk, db_dk; -}; - -static ALWAYS_INLINE_RELEASE s64 LineDivide(s64 delta, s32 dk) -{ - delta = (u64)delta << Line_XY_FractBits; - - if (delta < 0) - delta -= dk - 1; - if (delta > 0) - delta += dk - 1; - - return (delta / dk); -} - -template -void GPU_SW_Backend::DrawLine(const GPUBackendDrawLineCommand* cmd, const GPUBackendDrawLineCommand::Vertex* p0, - const GPUBackendDrawLineCommand::Vertex* p1) -{ - const s32 i_dx = std::abs(p1->x - p0->x); - const s32 i_dy = std::abs(p1->y - p0->y); - const s32 k = (i_dx > i_dy) ? i_dx : i_dy; - if (i_dx >= MAX_PRIMITIVE_WIDTH || i_dy >= MAX_PRIMITIVE_HEIGHT) - return; - - if (p0->x >= p1->x && k > 0) - std::swap(p0, p1); - - line_fxp_step step; - if (k == 0) - { - step.dx_dk = 0; - step.dy_dk = 0; - - if constexpr (shading_enable) - { - step.dr_dk = 0; - step.dg_dk = 0; - step.db_dk = 0; - } - } - else - { - step.dx_dk = LineDivide(p1->x - p0->x, k); - step.dy_dk = LineDivide(p1->y - p0->y, k); - - if constexpr (shading_enable) - { - step.dr_dk = (s32)((u32)(p1->r - p0->r) << Line_RGB_FractBits) / k; - step.dg_dk = (s32)((u32)(p1->g - p0->g) << Line_RGB_FractBits) / k; - step.db_dk = (s32)((u32)(p1->b - p0->b) << Line_RGB_FractBits) / k; - } - } - - line_fxp_coord cur_point; - cur_point.x = ((u64)p0->x << Line_XY_FractBits) | (1ULL << (Line_XY_FractBits - 1)); - cur_point.y = ((u64)p0->y << Line_XY_FractBits) | (1ULL << (Line_XY_FractBits - 1)); - - cur_point.x -= 1024; - - if (step.dy_dk < 0) - cur_point.y -= 1024; - - if constexpr (shading_enable) - { - cur_point.r = (p0->r << Line_RGB_FractBits) | (1 << (Line_RGB_FractBits - 1)); - cur_point.g = (p0->g << Line_RGB_FractBits) | (1 << (Line_RGB_FractBits - 1)); - cur_point.b = (p0->b << Line_RGB_FractBits) | (1 << (Line_RGB_FractBits - 1)); - } - - for (s32 i = 0; i <= k; i++) - { - // Sign extension is not necessary here for x and y, due to the maximum values that ClipX1 and ClipY1 can contain. - const s32 x = (cur_point.x >> Line_XY_FractBits) & 2047; - const s32 y = (cur_point.y >> Line_XY_FractBits) & 2047; - - if ((!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (Truncate8(static_cast(y)) & 1u)) && - x >= static_cast(m_drawing_area.left) && x <= static_cast(m_drawing_area.right) && - y >= static_cast(m_drawing_area.top) && y <= static_cast(m_drawing_area.bottom)) - { - const u8 r = shading_enable ? static_cast(cur_point.r >> Line_RGB_FractBits) : p0->r; - const u8 g = shading_enable ? static_cast(cur_point.g >> Line_RGB_FractBits) : p0->g; - const u8 b = shading_enable ? 
static_cast(cur_point.b >> Line_RGB_FractBits) : p0->b; - - ShadePixel( - cmd, static_cast(x), static_cast(y) & VRAM_HEIGHT_MASK, r, g, b, 0, 0); - } - - cur_point.x += step.dx_dk; - cur_point.y += step.dy_dk; - - if constexpr (shading_enable) - { - cur_point.r += step.dr_dk; - cur_point.g += step.dg_dk; - cur_point.b += step.db_dk; - } - } + DrawFunction(cmd, &cmd->vertices[i - 1], &cmd->vertices[i]); } void GPU_SW_Backend::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params) @@ -896,82 +242,16 @@ void GPU_SW_Backend::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 wi } } -void GPU_SW_Backend::FlushRender() -{ -} - -void GPU_SW_Backend::DrawingAreaChanged() -{ -} - void GPU_SW_Backend::UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit) { GPU::ReadCLUT(g_gpu_clut, reg, clut_is_8bit); } -GPU_SW_Backend::DrawLineFunction GPU_SW_Backend::GetDrawLineFunction(bool shading_enable, bool transparency_enable, - bool dithering_enable) +void GPU_SW_Backend::DrawingAreaChanged(const GPUDrawingArea& new_drawing_area, const GSVector4i clamped_drawing_area) { - static constexpr DrawLineFunction funcs[2][2][2] = { - {{&GPU_SW_Backend::DrawLine, &GPU_SW_Backend::DrawLine}, - {&GPU_SW_Backend::DrawLine, &GPU_SW_Backend::DrawLine}}, - {{&GPU_SW_Backend::DrawLine, &GPU_SW_Backend::DrawLine}, - {&GPU_SW_Backend::DrawLine, &GPU_SW_Backend::DrawLine}}}; - - return funcs[u8(shading_enable)][u8(transparency_enable)][u8(dithering_enable)]; + GPU_SW_Rasterizer::g_drawing_area = new_drawing_area; } -GPU_SW_Backend::DrawRectangleFunction -GPU_SW_Backend::GetDrawRectangleFunction(bool texture_enable, bool raw_texture_enable, bool transparency_enable) +void GPU_SW_Backend::FlushRender() { - static constexpr DrawRectangleFunction funcs[2][2][2] = { - {{&GPU_SW_Backend::DrawRectangle, &GPU_SW_Backend::DrawRectangle}, - {&GPU_SW_Backend::DrawRectangle, &GPU_SW_Backend::DrawRectangle}}, - {{&GPU_SW_Backend::DrawRectangle, &GPU_SW_Backend::DrawRectangle}, - {&GPU_SW_Backend::DrawRectangle, &GPU_SW_Backend::DrawRectangle}}}; - - return funcs[u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)]; } - -GPU_SW_Backend::DrawTriangleFunction GPU_SW_Backend::GetDrawTriangleFunction(bool shading_enable, bool texture_enable, - bool raw_texture_enable, - bool transparency_enable, - bool dithering_enable) -{ - static constexpr DrawTriangleFunction funcs[2][2][2][2][2] = { - {{{{&GPU_SW_Backend::DrawTriangle, - &GPU_SW_Backend::DrawTriangle}, - {&GPU_SW_Backend::DrawTriangle, - &GPU_SW_Backend::DrawTriangle}}, - {{&GPU_SW_Backend::DrawTriangle, - &GPU_SW_Backend::DrawTriangle}, - {&GPU_SW_Backend::DrawTriangle, - &GPU_SW_Backend::DrawTriangle}}}, - {{{&GPU_SW_Backend::DrawTriangle, - &GPU_SW_Backend::DrawTriangle}, - {&GPU_SW_Backend::DrawTriangle, - &GPU_SW_Backend::DrawTriangle}}, - {{&GPU_SW_Backend::DrawTriangle, - &GPU_SW_Backend::DrawTriangle}, - {&GPU_SW_Backend::DrawTriangle, - &GPU_SW_Backend::DrawTriangle}}}}, - {{{{&GPU_SW_Backend::DrawTriangle, - &GPU_SW_Backend::DrawTriangle}, - {&GPU_SW_Backend::DrawTriangle, - &GPU_SW_Backend::DrawTriangle}}, - {{&GPU_SW_Backend::DrawTriangle, - &GPU_SW_Backend::DrawTriangle}, - {&GPU_SW_Backend::DrawTriangle, - &GPU_SW_Backend::DrawTriangle}}}, - {{{&GPU_SW_Backend::DrawTriangle, - &GPU_SW_Backend::DrawTriangle}, - {&GPU_SW_Backend::DrawTriangle, - &GPU_SW_Backend::DrawTriangle}}, - {{&GPU_SW_Backend::DrawTriangle, - &GPU_SW_Backend::DrawTriangle}, - {&GPU_SW_Backend::DrawTriangle, - &GPU_SW_Backend::DrawTriangle}}}}}; 
- - return funcs[u8(shading_enable)][u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)] - [u8(dithering_enable)]; -} \ No newline at end of file diff --git a/src/core/gpu_sw_backend.h b/src/core/gpu_sw_backend.h index 8d21793fa..2c0394ebc 100644 --- a/src/core/gpu_sw_backend.h +++ b/src/core/gpu_sw_backend.h @@ -17,77 +17,7 @@ public: bool Initialize(bool force_thread) override; void Reset() override; - ALWAYS_INLINE_RELEASE u16 GetPixel(const u32 x, const u32 y) const { return g_vram[VRAM_WIDTH * y + x]; } - ALWAYS_INLINE_RELEASE const u16* GetPixelPtr(const u32 x, const u32 y) const { return &g_vram[VRAM_WIDTH * y + x]; } - ALWAYS_INLINE_RELEASE u16* GetPixelPtr(const u32 x, const u32 y) { return &g_vram[VRAM_WIDTH * y + x]; } - ALWAYS_INLINE_RELEASE void SetPixel(const u32 x, const u32 y, const u16 value) { g_vram[VRAM_WIDTH * y + x] = value; } - - // this is actually (31 * 255) >> 4) == 494, but to simplify addressing we use the next power of two (512) - static constexpr u32 DITHER_LUT_SIZE = 512; - using DitherLUT = std::array, DITHER_MATRIX_SIZE>, DITHER_MATRIX_SIZE>; - static constexpr DitherLUT ComputeDitherLUT(); - protected: - union VRAMPixel - { - u16 bits; - - BitField r; - BitField g; - BitField b; - BitField c; - - void Set(u8 r_, u8 g_, u8 b_, bool c_ = false) - { - bits = (ZeroExtend16(r_)) | (ZeroExtend16(g_) << 5) | (ZeroExtend16(b_) << 10) | (static_cast(c_) << 15); - } - - void ClampAndSet(u8 r_, u8 g_, u8 b_, bool c_ = false) - { - Set(std::min(r_, 0x1F), std::min(g_, 0x1F), std::min(b_, 0x1F), c_); - } - - void SetRGB24(u32 rgb24, bool c_ = false) - { - bits = Truncate16(((rgb24 >> 3) & 0x1F) | (((rgb24 >> 11) & 0x1F) << 5) | (((rgb24 >> 19) & 0x1F) << 10)) | - (static_cast(c_) << 15); - } - - void SetRGB24(u8 r8, u8 g8, u8 b8, bool c_ = false) - { - bits = (ZeroExtend16(r8 >> 3)) | (ZeroExtend16(g8 >> 3) << 5) | (ZeroExtend16(b8 >> 3) << 10) | - (static_cast(c_) << 15); - } - - void SetRGB24Dithered(u32 x, u32 y, u8 r8, u8 g8, u8 b8, bool c_ = false) - { - const s32 offset = DITHER_MATRIX[y & 3][x & 3]; - r8 = static_cast(std::clamp(static_cast(ZeroExtend32(r8)) + offset, 0, 255)); - g8 = static_cast(std::clamp(static_cast(ZeroExtend32(g8)) + offset, 0, 255)); - b8 = static_cast(std::clamp(static_cast(ZeroExtend32(b8)) + offset, 0, 255)); - SetRGB24(r8, g8, b8, c_); - } - - u32 ToRGB24() const - { - const u32 r_ = ZeroExtend32(r.GetValue()); - const u32 g_ = ZeroExtend32(g.GetValue()); - const u32 b_ = ZeroExtend32(b.GetValue()); - - return ((r_ << 3) | (r_ & 7)) | (((g_ << 3) | (g_ & 7)) << 8) | (((b_ << 3) | (b_ & 7)) << 16); - } - }; - - static constexpr std::tuple UnpackTexcoord(u16 texcoord) - { - return std::make_tuple(static_cast(texcoord), static_cast(texcoord >> 8)); - } - - static constexpr std::tuple UnpackColorRGB24(u32 rgb24) - { - return std::make_tuple(static_cast(rgb24), static_cast(rgb24 >> 8), static_cast(rgb24 >> 16)); - } - void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params) override; void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, GPUBackendCommandParameters params) override; void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height, @@ -96,75 +26,7 @@ protected: void DrawPolygon(const GPUBackendDrawPolygonCommand* cmd) override; void DrawLine(const GPUBackendDrawLineCommand* cmd) override; void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd) override; - void FlushRender() override; - void DrawingAreaChanged() override; + void 
DrawingAreaChanged(const GPUDrawingArea& new_drawing_area, const GSVector4i clamped_drawing_area) override; void UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit) override; - - ////////////////////////////////////////////////////////////////////////// - // Rasterization - ////////////////////////////////////////////////////////////////////////// - template - void ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y, u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x, - u8 texcoord_y); - - template - void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd); - - using DrawRectangleFunction = void (GPU_SW_Backend::*)(const GPUBackendDrawRectangleCommand* cmd); - DrawRectangleFunction GetDrawRectangleFunction(bool texture_enable, bool raw_texture_enable, - bool transparency_enable); - - ////////////////////////////////////////////////////////////////////////// - // Polygon and line rasterization ported from Mednafen - ////////////////////////////////////////////////////////////////////////// - struct i_deltas - { - u32 du_dx, dv_dx; - u32 dr_dx, dg_dx, db_dx; - - u32 du_dy, dv_dy; - u32 dr_dy, dg_dy, db_dy; - }; - - struct i_group - { - u32 u, v; - u32 r, g, b; - }; - - template - bool CalcIDeltas(i_deltas& idl, const GPUBackendDrawPolygonCommand::Vertex* A, - const GPUBackendDrawPolygonCommand::Vertex* B, const GPUBackendDrawPolygonCommand::Vertex* C); - - template - void AddIDeltas_DX(i_group& ig, const i_deltas& idl, u32 count = 1); - - template - void AddIDeltas_DY(i_group& ig, const i_deltas& idl, u32 count = 1); - - template - void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, i_group ig, - const i_deltas& idl); - - template - void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBackendDrawPolygonCommand::Vertex* v0, - const GPUBackendDrawPolygonCommand::Vertex* v1, const GPUBackendDrawPolygonCommand::Vertex* v2); - - using DrawTriangleFunction = void (GPU_SW_Backend::*)(const GPUBackendDrawPolygonCommand* cmd, - const GPUBackendDrawPolygonCommand::Vertex* v0, - const GPUBackendDrawPolygonCommand::Vertex* v1, - const GPUBackendDrawPolygonCommand::Vertex* v2); - DrawTriangleFunction GetDrawTriangleFunction(bool shading_enable, bool texture_enable, bool raw_texture_enable, - bool transparency_enable, bool dithering_enable); - - template - void DrawLine(const GPUBackendDrawLineCommand* cmd, const GPUBackendDrawLineCommand::Vertex* p0, - const GPUBackendDrawLineCommand::Vertex* p1); - - using DrawLineFunction = void (GPU_SW_Backend::*)(const GPUBackendDrawLineCommand* cmd, - const GPUBackendDrawLineCommand::Vertex* p0, - const GPUBackendDrawLineCommand::Vertex* p1); - DrawLineFunction GetDrawLineFunction(bool shading_enable, bool transparency_enable, bool dithering_enable); + void FlushRender() override; }; diff --git a/src/core/gpu_sw_rasterizer.cpp b/src/core/gpu_sw_rasterizer.cpp new file mode 100644 index 000000000..0df0c16c2 --- /dev/null +++ b/src/core/gpu_sw_rasterizer.cpp @@ -0,0 +1,100 @@ +// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin +// SPDX-License-Identifier: CC-BY-NC-ND-4.0 + +#include "gpu_sw_rasterizer.h" +#include "gpu.h" + +#include "cpuinfo.h" + +#include "common/log.h" +#include "common/string_util.h" + +Log_SetChannel(GPU_SW_Rasterizer); + +namespace GPU_SW_Rasterizer { +// Default implementation, compatible with all ISAs. 
+extern const DrawRectangleFunctionTable DrawRectangleFunctions; +extern const DrawTriangleFunctionTable DrawTriangleFunctions; +extern const DrawLineFunctionTable DrawLineFunctions; + +constinit const DitherLUT g_dither_lut = []() constexpr { + DitherLUT lut = {}; + for (u32 i = 0; i < DITHER_MATRIX_SIZE; i++) + { + for (u32 j = 0; j < DITHER_MATRIX_SIZE; j++) + { + for (u32 value = 0; value < DITHER_LUT_SIZE; value++) + { + const s32 dithered_value = (static_cast(value) + DITHER_MATRIX[i][j]) >> 3; + lut[i][j][value] = static_cast((dithered_value < 0) ? 0 : ((dithered_value > 31) ? 31 : dithered_value)); + } + } + } + return lut; +}(); + +GPUDrawingArea g_drawing_area = {}; +} // namespace GPU_SW_Rasterizer + +// Default implementation definitions. +namespace GPU_SW_Rasterizer { +#include "gpu_sw_rasterizer.inl" +} + +// Default vector implementation definitions. +#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON) +namespace GPU_SW_Rasterizer::SIMD { +#include "gpu_sw_rasterizer.inl" +} +#endif + +// Initialize with default implementation. +namespace GPU_SW_Rasterizer { +const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions = &DrawRectangleFunctions; +const DrawTriangleFunctionTable* SelectedDrawTriangleFunctions = &DrawTriangleFunctions; +const DrawLineFunctionTable* SelectedDrawLineFunctions = &DrawLineFunctions; +} // namespace GPU_SW_Rasterizer + +// Declare alternative implementations. +void GPU_SW_Rasterizer::SelectImplementation() +{ + static bool selected = false; + if (selected) + return; + + selected = true; + +#define SELECT_ALTERNATIVE_RASTERIZER(isa) \ + do \ + { \ + INFO_LOG("Using " #isa " software rasterizer implementation."); \ + SelectedDrawRectangleFunctions = &isa::DrawRectangleFunctions; \ + SelectedDrawTriangleFunctions = &isa::DrawTriangleFunctions; \ + SelectedDrawLineFunctions = &isa::DrawLineFunctions; \ + } while (0) + +#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON) + const char* use_isa = std::getenv("SW_USE_ISA"); + + // Default to scalar for now, until vector is finished. + use_isa = use_isa ? 
use_isa : "Scalar"; + +#if defined(CPU_ARCH_SSE) && defined(_MSC_VER) + if (cpuinfo_has_x86_avx2() && (!use_isa || StringUtil::Strcasecmp(use_isa, "AVX2") == 0)) + { + SELECT_ALTERNATIVE_RASTERIZER(AVX2); + return; + } +#endif + + if (!use_isa || StringUtil::Strcasecmp(use_isa, "SIMD") == 0) + { + SELECT_ALTERNATIVE_RASTERIZER(SIMD); + return; + } +#endif + + INFO_LOG("Using scalar software rasterizer implementation."); + +#undef SELECT_ALTERNATIVE_RASTERIZER +} diff --git a/src/core/gpu_sw_rasterizer.h b/src/core/gpu_sw_rasterizer.h new file mode 100644 index 000000000..d6f3adace --- /dev/null +++ b/src/core/gpu_sw_rasterizer.h @@ -0,0 +1,89 @@ +// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin +// SPDX-License-Identifier: CC-BY-NC-ND-4.0 + +#pragma once + +#include "gpu.h" +#include "gpu_types.h" + +#include "common/intrin.h" +#include "common/types.h" + +#include +#include + +namespace GPU_SW_Rasterizer { + +// this is actually (31 * 255) >> 4) == 494, but to simplify addressing we use the next power of two (512) +static constexpr u32 DITHER_LUT_SIZE = 512; +using DitherLUT = std::array, DITHER_MATRIX_SIZE>, DITHER_MATRIX_SIZE>; +extern const DitherLUT g_dither_lut; + +extern GPUDrawingArea g_drawing_area; + +using DrawRectangleFunction = void (*)(const GPUBackendDrawRectangleCommand* cmd); +typedef const DrawRectangleFunction DrawRectangleFunctionTable[2][2][2]; + +using DrawTriangleFunction = void (*)(const GPUBackendDrawPolygonCommand* cmd, + const GPUBackendDrawPolygonCommand::Vertex* v0, + const GPUBackendDrawPolygonCommand::Vertex* v1, + const GPUBackendDrawPolygonCommand::Vertex* v2); +typedef const DrawTriangleFunction DrawTriangleFunctionTable[2][2][2][2][2]; + +using DrawLineFunction = void (*)(const GPUBackendDrawLineCommand* cmd, const GPUBackendDrawLineCommand::Vertex* p0, + const GPUBackendDrawLineCommand::Vertex* p1); +typedef const DrawLineFunction DrawLineFunctionTable[2][2][2]; + +// Default implementation, compatible with all ISAs. +extern const DrawRectangleFunctionTable DrawRectangleFunctions; +extern const DrawTriangleFunctionTable DrawTriangleFunctions; +extern const DrawLineFunctionTable DrawLineFunctions; + +// Current implementation, selected at runtime. 
+extern const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions; +extern const DrawTriangleFunctionTable* SelectedDrawTriangleFunctions; +extern const DrawLineFunctionTable* SelectedDrawLineFunctions; + +extern void SelectImplementation(); + +ALWAYS_INLINE static DrawLineFunction GetDrawLineFunction(bool shading_enable, bool transparency_enable, + bool dithering_enable) +{ + return (*SelectedDrawLineFunctions)[u8(shading_enable)][u8(transparency_enable)][u8(dithering_enable)]; +} + +ALWAYS_INLINE static DrawRectangleFunction GetDrawRectangleFunction(bool texture_enable, bool raw_texture_enable, + bool transparency_enable) +{ + return (*SelectedDrawRectangleFunctions)[u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)]; +} + +ALWAYS_INLINE static DrawTriangleFunction GetDrawTriangleFunction(bool shading_enable, bool texture_enable, + bool raw_texture_enable, bool transparency_enable, + bool dithering_enable) +{ + return (*SelectedDrawTriangleFunctions)[u8(shading_enable)][u8(texture_enable)][u8(raw_texture_enable)] + [u8(transparency_enable)][u8(dithering_enable)]; +} + +#define DECLARE_ALTERNATIVE_RASTERIZER(isa) \ + namespace isa { \ + extern const DrawRectangleFunctionTable DrawRectangleFunctions; \ + extern const DrawTriangleFunctionTable DrawTriangleFunctions; \ + extern const DrawLineFunctionTable DrawLineFunctions; \ + } + +// Have to define the symbols globally, because clang won't include them otherwise. +#if defined(CPU_ARCH_SSE) && defined(_MSC_VER) +#define ALTERNATIVE_RASTERIZER_LIST() DECLARE_ALTERNATIVE_RASTERIZER(AVX2) +#else +#define ALTERNATIVE_RASTERIZER_LIST() +#endif + +ALTERNATIVE_RASTERIZER_LIST() + +#undef DECLARE_ALTERNATIVE_RASTERIZER + +} // namespace GPU_SW_Rasterizer + +// static u32 s_bad_counter = 0; diff --git a/src/core/gpu_sw_rasterizer.inl b/src/core/gpu_sw_rasterizer.inl new file mode 100644 index 000000000..d434b49b9 --- /dev/null +++ b/src/core/gpu_sw_rasterizer.inl @@ -0,0 +1,1250 @@ +// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin +// SPDX-License-Identifier: CC-BY-NC-ND-4.0 + +#ifdef __INTELLISENSE__ + +#include "common/gsvector.h" +#include "gpu.h" +#include + +#define USE_VECTOR 1 +#define GSVECTOR_HAS_SRLV 1 + +extern GPU_SW_Rasterizer::DitherLUT g_dither_lut; + +namespace GPU_SW_Rasterizer { + +#endif + +// TODO: UpdateVRAM, FillVRAM, etc. 
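+
+// This file is included rather than compiled standalone so the same rasterizer source can be
+// instantiated several times: gpu_sw_rasterizer.cpp pulls it into the plain GPU_SW_Rasterizer
+// namespace as the scalar fallback and, on SSE/NEON hosts, into GPU_SW_Rasterizer::SIMD, while
+// gpu_sw_rasterizer_avx2.cpp is built with AVX2 codegen flags (see the vcxproj change above).
+// That file's contents are not shown in this patch, but it is presumably just a thin wrapper
+// along the lines of:
+//
+//   #include "gpu_sw_rasterizer.h"
+//   namespace GPU_SW_Rasterizer::AVX2 {
+//   #include "gpu_sw_rasterizer.inl"
+//   }
+//
+// SelectImplementation() later repoints SelectedDraw{Rectangle,Triangle,Line}Functions at
+// whichever namespace's tables the host CPU (or the SW_USE_ISA override) calls for, and
+// GPU_SW_Backend dispatches each draw through GetDrawTriangleFunction() and friends.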
+ +#ifdef USE_VECTOR +#if 0 +static u16 s_vram_backup[VRAM_WIDTH * VRAM_HEIGHT]; +static u16 s_new_vram[VRAM_WIDTH * VRAM_HEIGHT]; +#define BACKUP_VRAM() \ + do \ + { \ + std::memcpy(s_vram_backup, g_vram, sizeof(g_vram)); \ + s_bad_counter++; \ + } while (0) +#define CHECK_VRAM(drawer) \ + do \ + { \ + std::memcpy(s_new_vram, g_vram, sizeof(g_vram)); \ + std::memcpy(g_vram, s_vram_backup, sizeof(g_vram)); \ + \ + drawer; \ + for (u32 vidx = 0; vidx < (VRAM_WIDTH * VRAM_HEIGHT); vidx++) \ + { \ + if (s_new_vram[vidx] != g_vram[vidx]) \ + { \ + fprintf(stderr, "[%u] Mismatch at %d,%d, expected %04x got %04x\n", s_bad_counter, (vidx % VRAM_WIDTH), \ + (vidx / VRAM_WIDTH), g_vram[vidx], s_new_vram[vidx]); \ + AssertMsg(false, "Mismatch"); \ + } \ + } \ + /*Assert(std::memcmp(g_vram, s_new_vram, sizeof(g_vram)) == 0)*/ \ + } while (0) +#else +#define BACKUP_VRAM() +#define CHECK_VRAM(drawer) +#endif +#endif + +[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16 GetPixel(const u32 x, const u32 y) +{ + return g_vram[VRAM_WIDTH * y + x]; +} +[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16* GetPixelPtr(const u32 x, const u32 y) +{ + return &g_vram[VRAM_WIDTH * y + x]; +} +[[maybe_unused]] ALWAYS_INLINE_RELEASE static void SetPixel(const u32 x, const u32 y, const u16 value) +{ + g_vram[VRAM_WIDTH * y + x] = value; +} + +[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple UnpackTexcoord(u16 texcoord) +{ + return std::make_tuple(static_cast(texcoord), static_cast(texcoord >> 8)); +} + +[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple UnpackColorRGB24(u32 rgb24) +{ + return std::make_tuple(static_cast(rgb24), static_cast(rgb24 >> 8), static_cast(rgb24 >> 16)); +} + +template +[[maybe_unused]] ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y, + u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x, + u8 texcoord_y) +{ + u16 color; + if constexpr (texture_enable) + { + // Apply texture window + texcoord_x = (texcoord_x & cmd->window.and_x) | cmd->window.or_x; + texcoord_y = (texcoord_y & cmd->window.and_y) | cmd->window.or_y; + + u16 texture_color; + switch (cmd->draw_mode.texture_mode) + { + case GPUTextureMode::Palette4Bit: + { + const u16 palette_value = + GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x / 4)) % VRAM_WIDTH, + (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT); + const size_t palette_index = (palette_value >> ((texcoord_x % 4) * 4)) & 0x0Fu; + texture_color = g_gpu_clut[palette_index]; + } + break; + + case GPUTextureMode::Palette8Bit: + { + const u16 palette_value = + GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x / 2)) % VRAM_WIDTH, + (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT); + const size_t palette_index = (palette_value >> ((texcoord_x % 2) * 8)) & 0xFFu; + texture_color = g_gpu_clut[palette_index]; + } + break; + + default: + { + texture_color = GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x)) % VRAM_WIDTH, + (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT); + } + break; + } + + if (texture_color == 0) + return; + + if constexpr (raw_texture_enable) + { + color = texture_color; + } + else + { + const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u; + const u32 dither_x = (dithering_enable) ? 
(x & 3u) : 3u; + + color = + (ZeroExtend16(g_dither_lut[dither_y][dither_x][(u16(texture_color & 0x1Fu) * u16(color_r)) >> 4]) << 0) | + (ZeroExtend16(g_dither_lut[dither_y][dither_x][(u16((texture_color >> 5) & 0x1Fu) * u16(color_g)) >> 4]) << 5) | + (ZeroExtend16(g_dither_lut[dither_y][dither_x][(u16((texture_color >> 10) & 0x1Fu) * u16(color_b)) >> 4]) + << 10) | + (texture_color & 0x8000u); + } + } + else + { + const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u; + const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u; + + // Non-textured transparent polygons don't set bit 15, but are treated as transparent. + color = (ZeroExtend16(g_dither_lut[dither_y][dither_x][color_r]) << 0) | + (ZeroExtend16(g_dither_lut[dither_y][dither_x][color_g]) << 5) | + (ZeroExtend16(g_dither_lut[dither_y][dither_x][color_b]) << 10) | (transparency_enable ? 0x8000u : 0); + } + + const u16 bg_color = GetPixel(static_cast(x), static_cast(y)); + if constexpr (transparency_enable) + { + if (color & 0x8000u || !texture_enable) + { + // Based on blargg's efficient 15bpp pixel math. + u32 bg_bits = ZeroExtend32(bg_color); + u32 fg_bits = ZeroExtend32(color); + switch (cmd->draw_mode.transparency_mode) + { + case GPUTransparencyMode::HalfBackgroundPlusHalfForeground: + { + bg_bits |= 0x8000u; + color = Truncate16(((fg_bits + bg_bits) - ((fg_bits ^ bg_bits) & 0x0421u)) >> 1); + } + break; + + case GPUTransparencyMode::BackgroundPlusForeground: + { + bg_bits &= ~0x8000u; + + const u32 sum = fg_bits + bg_bits; + const u32 carry = (sum - ((fg_bits ^ bg_bits) & 0x8421u)) & 0x8420u; + + color = Truncate16((sum - carry) | (carry - (carry >> 5))); + } + break; + + case GPUTransparencyMode::BackgroundMinusForeground: + { + bg_bits |= 0x8000u; + fg_bits &= ~0x8000u; + + const u32 diff = bg_bits - fg_bits + 0x108420u; + const u32 borrow = (diff - ((bg_bits ^ fg_bits) & 0x108420u)) & 0x108420u; + + color = Truncate16((diff - borrow) & (borrow - (borrow >> 5))); + } + break; + + case GPUTransparencyMode::BackgroundPlusQuarterForeground: + { + bg_bits &= ~0x8000u; + fg_bits = ((fg_bits >> 2) & 0x1CE7u) | 0x8000u; + + const u32 sum = fg_bits + bg_bits; + const u32 carry = (sum - ((fg_bits ^ bg_bits) & 0x8421u)) & 0x8420u; + + color = Truncate16((sum - carry) | (carry - (carry >> 5))); + } + break; + + default: + break; + } + + // See above. 
+ if constexpr (!texture_enable) + color &= ~0x8000u; + } + } + + const u16 mask_and = cmd->params.GetMaskAND(); + if ((bg_color & mask_and) != 0) + return; + + DebugAssert(static_cast(x) < VRAM_WIDTH && static_cast(y) < VRAM_HEIGHT); + SetPixel(static_cast(x), static_cast(y), color | cmd->params.GetMaskOR()); +} + +#ifndef USE_VECTOR + +template +static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd) +{ + const s32 origin_x = cmd->x; + const s32 origin_y = cmd->y; + const auto [r, g, b] = UnpackColorRGB24(cmd->color); + const auto [origin_texcoord_x, origin_texcoord_y] = UnpackTexcoord(cmd->texcoord); + + for (u32 offset_y = 0; offset_y < cmd->height; offset_y++) + { + const s32 y = origin_y + static_cast(offset_y); + if (y < static_cast(g_drawing_area.top) || y > static_cast(g_drawing_area.bottom) || + (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast(y)) & 1u))) + { + continue; + } + + const u32 draw_y = static_cast(y) & VRAM_HEIGHT_MASK; + const u8 texcoord_y = Truncate8(ZeroExtend32(origin_texcoord_y) + offset_y); + + for (u32 offset_x = 0; offset_x < cmd->width; offset_x++) + { + const s32 x = origin_x + static_cast(offset_x); + if (x < static_cast(g_drawing_area.left) || x > static_cast(g_drawing_area.right)) + continue; + + const u8 texcoord_x = Truncate8(ZeroExtend32(origin_texcoord_x) + offset_x); + + ShadePixel(cmd, static_cast(x), draw_y, r, g, + b, texcoord_x, texcoord_y); + } + } +} + +#else // USE_VECTOR + +ALWAYS_INLINE_RELEASE static GSVector4i GatherVector(GSVector4i coord_x, GSVector4i coord_y) +{ + GSVector4i offsets = coord_y.sll32<11>(); // y * 2048 (1024 * sizeof(pixel)) + offsets = offsets.add32(coord_x.sll32<1>()); // x * 2 (x * sizeof(pixel)) + + const u32 o0 = offsets.extract32<0>(); + const u32 o1 = offsets.extract32<1>(); + const u32 o2 = offsets.extract32<2>(); + const u32 o3 = offsets.extract32<3>(); + + // TODO: split in two, merge, maybe could be zx loaded instead.. + u16 p0, p1, p2, p3; + std::memcpy(&p0, reinterpret_cast(g_vram) + o0, sizeof(p0)); + std::memcpy(&p1, reinterpret_cast(g_vram) + o1, sizeof(p1)); + std::memcpy(&p2, reinterpret_cast(g_vram) + o2, sizeof(p2)); + std::memcpy(&p3, reinterpret_cast(g_vram) + o3, sizeof(p3)); + GSVector4i pixels = GSVector4i::load(p0); + pixels = pixels.insert16<2>(p1); + pixels = pixels.insert16<4>(p2); + pixels = pixels.insert16<6>(p3); + + return pixels; +} + +ALWAYS_INLINE_RELEASE static GSVector4i GatherCLUTVector(GSVector4i indices) +{ + const GSVector4i offsets = indices.sll32<1>(); // x * 2 (x * sizeof(pixel)) + const u32 o0 = offsets.extract32<0>(); + const u32 o1 = offsets.extract32<1>(); + const u32 o2 = offsets.extract32<2>(); + const u32 o3 = offsets.extract32<3>(); + + // TODO: split in two, merge, maybe could be zx loaded instead.. 
+ u16 p0, p1, p2, p3; + std::memcpy(&p0, reinterpret_cast(g_gpu_clut) + o0, sizeof(p0)); + std::memcpy(&p1, reinterpret_cast(g_gpu_clut) + o1, sizeof(p1)); + std::memcpy(&p2, reinterpret_cast(g_gpu_clut) + o2, sizeof(p2)); + std::memcpy(&p3, reinterpret_cast(g_gpu_clut) + o3, sizeof(p3)); + GSVector4i pixels = GSVector4i::load(p0); + pixels = pixels.insert16<2>(p1); + pixels = pixels.insert16<4>(p2); + pixels = pixels.insert16<6>(p3); + + return pixels; +} + +ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y) +{ + if (x <= (VRAM_WIDTH - 4)) + { + return GSVector4i::loadl(&g_vram[y * VRAM_WIDTH + x]).u16to32(); + } + else + { + const u16* line = &g_vram[y * VRAM_WIDTH]; + GSVector4i pixels = GSVector4i(line[(x++) & VRAM_WIDTH_MASK]); + pixels = pixels.insert16<2>(line[(x++) & VRAM_WIDTH_MASK]); + pixels = pixels.insert16<4>(line[(x++) & VRAM_WIDTH_MASK]); + pixels = pixels.insert16<6>(line[x & VRAM_WIDTH_MASK]); + return pixels; + } +} + +ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector4i color) +{ + if (x <= (VRAM_WIDTH - 4)) + { + GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], color); + } + else + { + u16* line = &g_vram[y * VRAM_WIDTH]; + line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<0>()); + line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<1>()); + line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<2>()); + line[x & VRAM_WIDTH_MASK] = Truncate16(color.extract16<3>()); + } +} + +ALWAYS_INLINE_RELEASE static void RGB5A1ToRG_BA(GSVector4i rgb5a1, GSVector4i& rg, GSVector4i& ba) +{ + rg = rgb5a1 & GSVector4i::cxpr(0x1F); // R | R | R | R + rg = rg | (rgb5a1 & GSVector4i::cxpr(0x3E0)).sll32<11>(); // R0G0 | R0G0 | R0G0 | R0G0 + ba = rgb5a1.srl32<10>() & GSVector4i::cxpr(0x1F); // B | B | B | B + ba = ba | (rgb5a1 & GSVector4i::cxpr(0x8000)).sll32<1>(); // B0A0 | B0A0 | B0A0 | B0A0 +} + +ALWAYS_INLINE_RELEASE static GSVector4i RG_BAToRGB5A1(GSVector4i rg, GSVector4i ba) +{ + GSVector4i res; + + res = rg & GSVector4i::cxpr(0x1F); // R | R | R | R + res = res | (rg.srl32<11>() & GSVector4i::cxpr(0x3E0)); // RG | RG | RG | RG + res = res | ((ba & GSVector4i::cxpr(0x1F)).sll32<10>()); // RGB | RGB | RGB | RGB + res = res | ba.srl32<16>().sll32<15>(); // RGBA | RGBA | RGBA | RGBA + + return res; +} + +// Color repeated twice for RG packing, then duplicated to we can load based on the X offset. 
+static constexpr s16 VECTOR_DITHER_MATRIX[4][16] = { +#define P(m, n) static_cast(DITHER_MATRIX[m][n]), static_cast(DITHER_MATRIX[m][n]) +#define R(m) P(m, 0), P(m, 1), P(m, 2), P(m, 3), P(m, 0), P(m, 1), P(m, 2), P(m, 3) + + {R(0)}, {R(1)}, {R(2)}, {R(3)} + +#undef R +#undef P +}; + +template +ALWAYS_INLINE_RELEASE static void +ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vertex_color_rg, GSVector4i vertex_color_ba, + GSVector4i texcoord_x, GSVector4i texcoord_y, GSVector4i preserve_mask, GSVector4i dither) +{ + static constinit GSVector4i coord_mask_x = GSVector4i::cxpr(VRAM_WIDTH_MASK); + static constinit GSVector4i coord_mask_y = GSVector4i::cxpr(VRAM_HEIGHT_MASK); + + GSVector4i color; + + if constexpr (texture_enable) + { + // Apply texture window + texcoord_x = (texcoord_x & GSVector4i(cmd->window.and_x)) | GSVector4i(cmd->window.or_x); + texcoord_y = (texcoord_y & GSVector4i(cmd->window.and_y)) | GSVector4i(cmd->window.or_y); + + const GSVector4i base_x = GSVector4i(cmd->draw_mode.GetTexturePageBaseX()); + const GSVector4i base_y = GSVector4i(cmd->draw_mode.GetTexturePageBaseY()); + + texcoord_y = base_y.add32(texcoord_y) & coord_mask_y; + + GSVector4i texture_color; + switch (cmd->draw_mode.texture_mode) + { + case GPUTextureMode::Palette4Bit: + { + GSVector4i load_texcoord_x = texcoord_x.srl32<2>(); + load_texcoord_x = base_x.add32(load_texcoord_x); + load_texcoord_x = load_texcoord_x & coord_mask_x; + + // todo: sse4 path + GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(3)).sll32<2>(); + GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y); +#ifdef GSVECTOR_HAS_SRLV + palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0x0F); +#else + Assert(false && "Fixme"); +#endif + + texture_color = GatherCLUTVector(palette_indices); + } + break; + + case GPUTextureMode::Palette8Bit: + { + GSVector4i load_texcoord_x = texcoord_x.srl32<1>(); + load_texcoord_x = base_x.add32(load_texcoord_x); + load_texcoord_x = load_texcoord_x & coord_mask_x; + + GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(1)).sll32<3>(); + GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y); +#ifdef GSVECTOR_HAS_SRLV + palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0xFF); +#else + Assert(false && "Fixme"); +#endif + + texture_color = GatherCLUTVector(palette_indices); + } + break; + + default: + { + texcoord_x = base_x.add32(texcoord_x); + texcoord_x = texcoord_x & coord_mask_x; + texture_color = GatherVector(texcoord_x, texcoord_y); + } + break; + } + + // check for zero texture colour across the 4 pixels, early out if so + const GSVector4i texture_transparent_mask = texture_color.eq32(GSVector4i::zero()); + if (texture_transparent_mask.alltrue()) + return; + + preserve_mask = preserve_mask | texture_transparent_mask; + + if constexpr (raw_texture_enable) + { + color = texture_color; + } + else + { + GSVector4i trg, tba; + RGB5A1ToRG_BA(texture_color, trg, tba); + + // now we have both the texture and vertex color in RG/GA pairs, for 4 pixels, which we can multiply + GSVector4i rg = trg.mul16l(vertex_color_rg); + GSVector4i ba = tba.mul16l(vertex_color_ba); + + // TODO: Dither + // Convert to 5bit. 
+ if constexpr (dithering_enable) + { + rg = rg.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>(); + ba = ba.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>(); + } + else + { + rg = rg.sra16<7>(); + ba = ba.sra16<7>(); + } + + // Bit15 gets passed through as-is. + ba = ba.blend16<0xaa>(tba); + + // Clamp to 5bit. + static constexpr GSVector4i colclamp = GSVector4i::cxpr16(0x1F); + rg = rg.min_u16(colclamp); + ba = ba.min_u16(colclamp); + + // And interleave back to 16bpp. + color = RG_BAToRGB5A1(rg, ba); + } + } + else + { + // Non-textured transparent polygons don't set bit 15, but are treated as transparent. + if constexpr (dithering_enable) + { + GSVector4i rg = vertex_color_rg.add16(dither).max_i16(GSVector4i::zero()).sra16<3>(); + GSVector4i ba = vertex_color_ba.add16(dither).max_i16(GSVector4i::zero()).sra16<3>(); + + // Clamp to 5bit. We use 32bit for BA to set a to zero. + rg = rg.min_u16(GSVector4i::cxpr16(0x1F)); + ba = ba.min_u16(GSVector4i::cxpr(0x1F)); + + // And interleave back to 16bpp. + color = RG_BAToRGB5A1(rg, ba); + } + else + { + // Note that bit15 is set to 0 here, which the shift will do. + const GSVector4i rg = vertex_color_rg.srl16<3>(); + const GSVector4i ba = vertex_color_ba.srl16<3>(); + color = RG_BAToRGB5A1(rg, ba); + } + } + + GSVector4i bg_color = LoadVector(start_x, y); + + if constexpr (transparency_enable) + { + [[maybe_unused]] GSVector4i transparent_mask; + if constexpr (texture_enable) + { + // Compute transparent_mask, ffff per lane if transparent otherwise 0000 + transparent_mask = color.sra16<15>(); + } + + // TODO: We don't need to OR color here with 0x8000 for textures. + // 0x8000 is added to match serial path. + + GSVector4i blended_color; + switch (cmd->draw_mode.transparency_mode) + { + case GPUTransparencyMode::HalfBackgroundPlusHalfForeground: + { + const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u); + const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u); + const GSVector4i res = fg_bits.add32(bg_bits).sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x0421u)).srl32<1>(); + blended_color = res & GSVector4i::cxpr(0xffff); + } + break; + + case GPUTransparencyMode::BackgroundPlusForeground: + { + const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u); + const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu); + const GSVector4i sum = fg_bits.add32(bg_bits); + const GSVector4i carry = + (sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u))) & GSVector4i::cxpr(0x8420u); + const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>()); + blended_color = res & GSVector4i::cxpr(0xffff); + } + break; + + case GPUTransparencyMode::BackgroundMinusForeground: + { + const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u); + const GSVector4i fg_bits = color & GSVector4i::cxpr(0x7FFFu); + const GSVector4i diff = bg_bits.sub32(fg_bits).add32(GSVector4i::cxpr(0x108420u)); + const GSVector4i borrow = + diff.sub32((bg_bits ^ fg_bits) & GSVector4i::cxpr(0x108420u)) & GSVector4i::cxpr(0x108420u); + const GSVector4i res = diff.sub32(borrow) & borrow.sub32(borrow.srl32<5>()); + blended_color = res & GSVector4i::cxpr(0xffff); + } + break; + + case GPUTransparencyMode::BackgroundPlusQuarterForeground: + default: + { + const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu); + const GSVector4i fg_bits = + ((color | GSVector4i::cxpr(0x8000)).srl32<2>() & GSVector4i::cxpr(0x1CE7u)) | GSVector4i::cxpr(0x8000u); + const GSVector4i sum = fg_bits.add32(bg_bits); + const GSVector4i 
carry = sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u)) & GSVector4i::cxpr(0x8420u); + const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>()); + blended_color = res & GSVector4i::cxpr(0xffff); + } + break; + } + + // select blended pixels for transparent pixels, otherwise consider opaque + // TODO: SSE2 + if constexpr (texture_enable) + color = color.blend8(blended_color, transparent_mask); + else + color = blended_color & GSVector4i::cxpr(0x7fff); + } + + // TODO: lift out to parent? + const GSVector4i mask_and = GSVector4i(cmd->params.GetMaskAND()); + const GSVector4i mask_or = GSVector4i(cmd->params.GetMaskOR()); + + GSVector4i mask_bits_set = bg_color & mask_and; // 8000 if masked else 0000 + mask_bits_set = mask_bits_set.sra16<15>(); // ffff if masked else 0000 + preserve_mask = preserve_mask | mask_bits_set; // ffff if preserved else 0000 + + bg_color = bg_color & preserve_mask; + color = (color | mask_or).andnot(preserve_mask); + color = color | bg_color; + + const GSVector4i packed_color = color.pu32(); + StoreVector(start_x, y, packed_color); +} + +template +static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd) +{ + const s32 origin_x = cmd->x; + const s32 origin_y = cmd->y; + + const GSVector4i rgba = GSVector4i(cmd->color); // RGBA | RGBA | RGBA | RGBA + GSVector4i rg = rgba.xxxxl(); // RGRG | RGRG | RGRG | RGRG + GSVector4i ba = rgba.yyyyl(); // BABA | BABA | BABA | BABA + rg = rg.u8to16(); // R0G0 | R0G0 | R0G0 | R0G0 + ba = ba.u8to16(); // B0A0 | B0A0 | B0A0 | B0A0 + + const GSVector4i texcoord_x = GSVector4i(cmd->texcoord & 0xFF).add32(GSVector4i::cxpr(0, 1, 2, 3)); + GSVector4i texcoord_y = GSVector4i(cmd->texcoord >> 8); + + const GSVector4i clip_left = GSVector4i(g_drawing_area.left); + const GSVector4i clip_right = GSVector4i(g_drawing_area.right); + const u32 width = cmd->width; + + BACKUP_VRAM(); + + for (u32 offset_y = 0; offset_y < cmd->height; offset_y++) + { + const s32 y = origin_y + static_cast(offset_y); + if (y < static_cast(g_drawing_area.top) || y > static_cast(g_drawing_area.bottom) || + (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast(y)) & 1u))) + { + continue; + } + + GSVector4i row_texcoord_x = texcoord_x; + GSVector4i xvec = GSVector4i(origin_x).add32(GSVector4i::cxpr(0, 1, 2, 3)); + GSVector4i wvec = GSVector4i(width).sub32(GSVector4i::cxpr(1, 2, 3, 4)); + + for (u32 offset_x = 0; offset_x < width; offset_x += 4) + { + const s32 x = origin_x + static_cast(offset_x); + + // width test + GSVector4i preserve_mask = wvec.lt32(GSVector4i::zero()); + + // clip test, if all pixels are outside, skip + preserve_mask = preserve_mask | xvec.lt32(clip_left); + preserve_mask = preserve_mask | xvec.gt32(clip_right); + if (!preserve_mask.alltrue()) + { + ShadePixel( + cmd, x, y, rg, ba, row_texcoord_x, texcoord_y, preserve_mask, GSVector4i::zero()); + } + + xvec = xvec.add32(GSVector4i::cxpr(4)); + wvec = wvec.sub32(GSVector4i::cxpr(4)); + + if constexpr (texture_enable) + row_texcoord_x = row_texcoord_x.add32(GSVector4i::cxpr(4)) & GSVector4i::cxpr(0xFF); + } + + if constexpr (texture_enable) + texcoord_y = texcoord_y.add32(GSVector4i::cxpr(1)) & GSVector4i::cxpr(0xFF); + } + + CHECK_VRAM(GPU_SW_Rasterizer::DrawRectangleFunctions[texture_enable][raw_texture_enable][transparency_enable](cmd)); +} + +#endif // USE_VECTOR + +// TODO: Vectorize line draw. 
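The transparency path above mixes foreground and background entirely in packed RGB5A1 form. Below is a minimal scalar sketch of the HalfBackgroundPlusHalfForeground case, for illustration only (the helper name and test values are made up): 0x0421 holds the least-significant bit of each 5-bit field, so subtracting (fg ^ bg) & 0x0421 makes every field even before the halving shift and no bits bleed between channels. The additive and subtractive modes follow the same idea, using the 0x8421/0x8420 and 0x108420 masks to detect per-field carries or borrows and saturate the field instead of letting it wrap.

#include <cassert>
#include <cstdint>

// Scalar model of the B/2 + F/2 blend performed in ShadePixel above; names are illustrative.
static uint16_t AverageRGB5A1(uint16_t fg, uint16_t bg)
{
  const uint32_t f = fg | 0x8000u;
  const uint32_t b = bg | 0x8000u;
  return static_cast<uint16_t>((f + b - ((f ^ b) & 0x0421u)) >> 1);
}

int main()
{
  const uint16_t fg = 0x7C1F; // R=31, G=0,  B=31
  const uint16_t bg = 0x03E0; // R=0,  G=31, B=0
  const uint16_t avg = AverageRGB5A1(fg, bg);
  // Matches the per-channel averages (15, 15, 15), with bit 15 set.
  assert((avg & 0x1F) == 15 && ((avg >> 5) & 0x1F) == 15 && ((avg >> 10) & 0x1F) == 15 && (avg & 0x8000));
  return 0;
}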
+template +static void DrawLine(const GPUBackendDrawLineCommand* cmd, const GPUBackendDrawLineCommand::Vertex* p0, + const GPUBackendDrawLineCommand::Vertex* p1) +{ + static constexpr u32 XY_SHIFT = 32; + static constexpr u32 RGB_SHIFT = 12; + static constexpr auto makefp_xy = [](s32 x) { return (static_cast(x) << XY_SHIFT) | (1LL << (XY_SHIFT - 1)); }; + static constexpr auto unfp_xy = [](s64 x) { return static_cast(x >> XY_SHIFT) & 2047; }; + static constexpr auto div_xy = [](s64 delta, s32 dk) { + return ((delta << XY_SHIFT) - ((delta < 0) ? (dk - 1) : 0) + ((delta > 0) ? (dk - 1) : 0)) / dk; + }; + static constexpr auto makefp_rgb = [](u32 c) { return (static_cast(c) << RGB_SHIFT) | (1 << (RGB_SHIFT - 1)); }; + static constexpr auto unfp_rgb = [](s32 c) { return static_cast(c >> RGB_SHIFT); }; + static constexpr auto div_rgb = [](u32 c1, u32 c0, s32 dk) { + return ((static_cast(c1) - static_cast(c0)) << RGB_SHIFT) / dk; + }; + + const s32 i_dx = std::abs(p1->x - p0->x); + const s32 i_dy = std::abs(p1->y - p0->y); + const s32 k = (i_dx > i_dy) ? i_dx : i_dy; + if (i_dx >= MAX_PRIMITIVE_WIDTH || i_dy >= MAX_PRIMITIVE_HEIGHT) [[unlikely]] + return; + + if (p0->x >= p1->x && k > 0) + std::swap(p0, p1); + + s64 dxdk = 0, dydk = 0; + [[maybe_unused]] s32 drdk = 0, dgdk = 0, dbdk = 0; + if (k != 0) [[likely]] + { + dxdk = div_xy(p1->x - p0->x, k); + dydk = div_xy(p1->y - p0->y, k); + if constexpr (shading_enable) + { + drdk = div_rgb(p1->r, p0->r, k); + dgdk = div_rgb(p1->g, p0->g, k); + dbdk = div_rgb(p1->b, p0->b, k); + } + } + + s64 curx = makefp_xy(p0->x) - 1024; + s64 cury = makefp_xy(p0->y) - ((dydk < 0) ? 1024 : 0); + [[maybe_unused]] s32 curr, curg, curb; + if constexpr (shading_enable) + { + curr = makefp_rgb(p0->r); + curg = makefp_rgb(p0->g); + curb = makefp_rgb(p0->b); + } + + for (s32 i = 0; i <= k; i++) + { + const s32 x = unfp_xy(curx); + const s32 y = unfp_xy(cury); + + if ((!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (Truncate8(static_cast(y)) & 1u)) && + x >= static_cast(g_drawing_area.left) && x <= static_cast(g_drawing_area.right) && + y >= static_cast(g_drawing_area.top) && y <= static_cast(g_drawing_area.bottom)) + { + const u8 r = shading_enable ? unfp_rgb(curr) : p0->r; + const u8 g = shading_enable ? unfp_rgb(curg) : p0->g; + const u8 b = shading_enable ? unfp_rgb(curb) : p0->b; + + ShadePixel( + cmd, static_cast(x), static_cast(y) & VRAM_HEIGHT_MASK, r, g, b, 0, 0); + } + + curx += dxdk; + cury += dydk; + + if constexpr (shading_enable) + { + curr += drdk; + curg += dgdk; + curb += dbdk; + } + } +} + +// DDA triangle rasterization algorithm originally from Mednafen, rewritten and vectorized for DuckStation. 
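Both the line routine above and the triangle steppers that follow interpolate attributes in fixed point: the start value is biased by half a unit so plain truncation behaves like rounding to nearest, and each step adds a precomputed signed delta. A minimal standalone sketch of that scheme (the shift width mirrors RGB_SHIFT above; the 0..255 ramp over ten steps is just an example):

#include <cstdint>
#include <cstdio>

int main()
{
  constexpr int SHIFT = 12;                        // matches RGB_SHIFT above
  const int32_t c0 = 0, c1 = 255, k = 10;          // interpolate 0..255 over 10 steps
  int32_t cur = (c0 << SHIFT) | (1 << (SHIFT - 1)); // start value plus half-unit bias
  const int32_t dcdk = ((c1 - c0) << SHIFT) / k;    // per-step delta
  for (int32_t i = 0; i <= k; i++, cur += dcdk)
  {
    // prints 0, 26, 51, ..., 255
    std::printf("step %d -> %d\n", static_cast<int>(i), static_cast<int>(cur >> SHIFT));
  }
  return 0;
}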
+namespace { +static constexpr u32 ATTRIB_SHIFT = 12; +static constexpr u32 ATTRIB_POST_SHIFT = 12; + +struct UVSteps +{ + u32 dudx; + u32 dvdx; + u32 dudy; + u32 dvdy; +}; + +struct UVStepper +{ + u32 u; + u32 v; + + ALWAYS_INLINE u8 GetU() const { return Truncate8(u >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); } + ALWAYS_INLINE u8 GetV() const { return Truncate8(v >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); } + + ALWAYS_INLINE void SetStart(u32 ustart, u32 vstart) + { + u = (((ustart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT); + v = (((vstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT); + } + + ALWAYS_INLINE void StepX(const UVSteps& steps) + { + u = u + steps.dudx; + v = v + steps.dvdx; + } + ALWAYS_INLINE void StepXY(const UVSteps& steps, s32 x_count, s32 y_count) + { + u = u + (steps.dudx * static_cast(x_count)) + (steps.dudy * static_cast(y_count)); + v = v + (steps.dvdx * static_cast(x_count)) + (steps.dvdy * static_cast(y_count)); + } +}; + +struct RGBSteps +{ + u32 drdx; + u32 dgdx; + u32 dbdx; + + u32 drdy; + u32 dgdy; + u32 dbdy; +}; + +struct RGBStepper +{ + u32 r; + u32 g; + u32 b; + + ALWAYS_INLINE u8 GetR() const { return Truncate8(r >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); } + ALWAYS_INLINE u8 GetG() const { return Truncate8(g >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); } + ALWAYS_INLINE u8 GetB() const { return Truncate8(b >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); } + + ALWAYS_INLINE void SetStart(u32 rstart, u32 gstart, u32 bstart) + { + r = (((rstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT); + g = (((gstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT); + b = (((bstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT); + } + + ALWAYS_INLINE void StepX(const RGBSteps& steps) + { + r = r + steps.drdx; + g = g + steps.dgdx; + b = b + steps.dbdx; + } + ALWAYS_INLINE void StepXY(const RGBSteps& steps, s32 x_count, s32 y_count) + { + r = r + (steps.drdx * static_cast(x_count)) + (steps.drdy * static_cast(y_count)); + g = g + (steps.dgdx * static_cast(x_count)) + (steps.dgdy * static_cast(y_count)); + b = b + (steps.dbdx * static_cast(x_count)) + (steps.dbdy * static_cast(y_count)); + } +}; + +struct TrianglePart +{ + // left/right edges + u64 start_x[2]; + u64 step_x[2]; + + s32 start_y; + s32 end_y; + + bool fill_upside_down; +}; +} // namespace + +#ifndef USE_VECTOR + +template +static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, UVStepper uv, + const UVSteps& uvstep, RGBStepper rgb, const RGBSteps& rgbstep) +{ + if (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast(y)) & 1u)) + return; + + s32 width = x_bound - x_start; + s32 current_x = TruncateGPUVertexPosition(x_start); + + // Skip pixels outside of the scissor rectangle. 
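+  // Clipping against g_drawing_area only moves x_start forward; the UV and RGB steppers are then seeded
+  // from the adjusted x_start via StepXY below, so interpolated attributes stay aligned with the pixels
+  // that were skipped.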
+ if (current_x < static_cast(g_drawing_area.left)) + { + const s32 delta = static_cast(g_drawing_area.left) - current_x; + x_start += delta; + current_x += delta; + width -= delta; + } + + if ((current_x + width) > (static_cast(g_drawing_area.right) + 1)) + width = static_cast(g_drawing_area.right) + 1 - current_x; + + if (width <= 0) + return; + + if constexpr (texture_enable) + uv.StepXY(uvstep, x_start, y); + if constexpr (shading_enable) + rgb.StepXY(rgbstep, x_start, y); + + do + { + ShadePixel( + cmd, static_cast(current_x), static_cast(y), rgb.GetR(), rgb.GetG(), rgb.GetB(), uv.GetU(), uv.GetV()); + + current_x++; + if constexpr (texture_enable) + uv.StepX(uvstep); + if constexpr (shading_enable) + rgb.StepX(rgbstep); + } while (--width > 0); +} + +#else // USE_VECTOR + +template +static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, UVStepper uv, + const UVSteps& uvstep, RGBStepper rgb, const RGBSteps& rgbstep) +{ + if (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast(y)) & 1u)) + return; + + s32 w = x_bound - x_start; + s32 x = TruncateGPUVertexPosition(x_start); + + if (x < static_cast(g_drawing_area.left)) + { + const s32 delta = static_cast(g_drawing_area.left) - x; + x_start += delta; + x += delta; + w -= delta; + } + + if ((x + w) > (static_cast(g_drawing_area.right) + 1)) + w = static_cast(g_drawing_area.right) + 1 - x; + + if (w <= 0) + return; + + // TODO: Precompute. + + const auto clip_left = GSVector4i(g_drawing_area.left); + const auto clip_right = GSVector4i(g_drawing_area.right); + + const GSVector4i dr_dx = GSVector4i(rgbstep.drdx * 4); + const GSVector4i dg_dx = GSVector4i(rgbstep.dgdx * 4); + const GSVector4i db_dx = GSVector4i(rgbstep.dbdx * 4); + const GSVector4i du_dx = GSVector4i(uvstep.dudx * 4); + const GSVector4i dv_dx = GSVector4i(uvstep.dvdx * 4); + + // TODO: vectorize + const GSVector4i dr_dx_offset = GSVector4i(0, rgbstep.drdx, rgbstep.drdx * 2, rgbstep.drdx * 3); + const GSVector4i dg_dx_offset = GSVector4i(0, rgbstep.dgdx, rgbstep.dgdx * 2, rgbstep.dgdx * 3); + const GSVector4i db_dx_offset = GSVector4i(0, rgbstep.dbdx, rgbstep.dbdx * 2, rgbstep.dbdx * 3); + const GSVector4i du_dx_offset = GSVector4i(0, uvstep.dudx, uvstep.dudx * 2, uvstep.dudx * 3); + const GSVector4i dv_dx_offset = GSVector4i(0, uvstep.dvdx, uvstep.dvdx * 2, uvstep.dvdx * 3); + + GSVector4i dr, dg, db; + if constexpr (shading_enable) + { + dr = GSVector4i(rgb.r + rgbstep.drdx * x_start).add32(dr_dx_offset); + dg = GSVector4i(rgb.g + rgbstep.dgdx * x_start).add32(dg_dx_offset); + db = GSVector4i(rgb.b + rgbstep.dbdx * x_start).add32(db_dx_offset); + } + else + { + // precompute for flat shading + dr = GSVector4i(rgb.r >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); + dg = GSVector4i((rgb.g >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)) << 16); + db = GSVector4i(rgb.b >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); + } + + GSVector4i du = GSVector4i(uv.u + uvstep.dudx * x_start).add32(du_dx_offset); + GSVector4i dv = GSVector4i(uv.v + uvstep.dvdx * x_start).add32(dv_dx_offset); + + // TODO: Move to caller. + if constexpr (shading_enable) + { + // TODO: vectorize multiply? 
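+    // Fold the row-dependent (d/dy * y) part of each gradient in once per span; the pixel loop below
+    // only ever adds the x-direction steps.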
+ dr = dr.add32(GSVector4i(rgbstep.drdy * y)); + dg = dg.add32(GSVector4i(rgbstep.dgdy * y)); + db = db.add32(GSVector4i(rgbstep.dbdy * y)); + } + + if constexpr (texture_enable) + { + du = du.add32(GSVector4i(uvstep.dudy * y)); + dv = dv.add32(GSVector4i(uvstep.dvdy * y)); + } + + const GSVector4i dither = + GSVector4i::load(&VECTOR_DITHER_MATRIX[static_cast(y) & 3][(static_cast(x) & 3) * 2]); + + GSVector4i xvec = GSVector4i(x).add32(GSVector4i::cxpr(0, 1, 2, 3)); + GSVector4i wvec = GSVector4i(w).sub32(GSVector4i::cxpr(1, 2, 3, 4)); + + for (s32 count = (w + 3) / 4; count > 0; --count) + { + // R000 | R000 | R000 | R000 + // R0G0 | R0G0 | R0G0 | R0G0 + const GSVector4i r = shading_enable ? dr.srl32() : dr; + const GSVector4i g = + shading_enable ? dg.srl32().sll32<16>() : dg; // get G into the correct position + const GSVector4i b = shading_enable ? db.srl32() : db; + const GSVector4i u = du.srl32(); + const GSVector4i v = dv.srl32(); + + const GSVector4i rg = r.blend16<0xAA>(g); + + // mask based on what's outside the span + auto preserve_mask = wvec.lt32(GSVector4i::zero()); + + // clip test, if all pixels are outside, skip + preserve_mask = preserve_mask | xvec.lt32(clip_left); + preserve_mask = preserve_mask | xvec.gt32(clip_right); + if (!preserve_mask.alltrue()) + { + ShadePixel( + cmd, static_cast(x), static_cast(y), rg, b, u, v, preserve_mask, dither); + } + + x += 4; + + xvec = xvec.add32(GSVector4i::cxpr(4)); + wvec = wvec.sub32(GSVector4i::cxpr(4)); + + if constexpr (shading_enable) + { + dr = dr.add32(dr_dx); + dg = dg.add32(dg_dx); + db = db.add32(db_dx); + } + + if constexpr (texture_enable) + { + du = du.add32(du_dx); + dv = dv.add32(dv_dx); + } + } +} + +#endif // USE_VECTOR + +template +ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCommand* cmd, const TrianglePart& tp, + const UVStepper& uv, const UVSteps& uvstep, const RGBStepper& rgb, + const RGBSteps& rgbstep) +{ + static constexpr auto unfp_xy = [](s64 xfp) -> s32 { return static_cast(static_cast(xfp) >> 32); }; + + const u64 left_x_step = tp.step_x[0]; + const u64 right_x_step = tp.step_x[1]; + const s32 end_y = tp.end_y; + u64 left_x = tp.start_x[0]; + u64 right_x = tp.start_x[1]; + s32 current_y = tp.start_y; + + if (tp.fill_upside_down) + { + while (current_y > end_y) + { + current_y--; + left_x -= left_x_step; + right_x -= right_x_step; + + const s32 y = TruncateGPUVertexPosition(current_y); + if (y < static_cast(g_drawing_area.top)) + break; + else if (y > static_cast(g_drawing_area.bottom)) + continue; + + DrawSpan( + cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), uv, uvstep, rgb, rgbstep); + } + } + else + { + while (current_y < end_y) + { + const s32 y = TruncateGPUVertexPosition(current_y); + + if (y > static_cast(g_drawing_area.bottom)) + { + break; + } + else if (y >= static_cast(g_drawing_area.top)) + { + DrawSpan( + cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), uv, uvstep, rgb, rgbstep); + } + + current_y++; + left_x += left_x_step; + right_x += right_x_step; + } + } +} + +template +static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBackendDrawPolygonCommand::Vertex* v0, + const GPUBackendDrawPolygonCommand::Vertex* v1, const GPUBackendDrawPolygonCommand::Vertex* v2) +{ +#if 0 + const GPUBackendDrawPolygonCommand::Vertex* orig_v0 = v0; + const GPUBackendDrawPolygonCommand::Vertex* orig_v1 = v1; + const GPUBackendDrawPolygonCommand::Vertex* orig_v2 = v2; +#endif + + // Sort vertices so that v0 is the top vertex, v1 is 
the bottom vertex, and v2 is the side vertex. + u32 vc = 0; + if (v1->x <= v0->x) + vc = (v2->x <= v1->x) ? 4 : 2; + else if (v2->x < v0->x) + vc = 4; + else + vc = 1; + if (v2->y < v1->y) + { + std::swap(v2, v1); + vc = ((vc >> 1) & 0x2) | ((vc << 1) & 0x4) | (vc & 0x1); + } + if (v1->y < v0->y) + { + std::swap(v1, v0); + vc = ((vc >> 1) & 0x1) | ((vc << 1) & 0x2) | (vc & 0x4); + } + if (v2->y < v1->y) + { + std::swap(v2, v1); + vc = ((vc >> 1) & 0x2) | ((vc << 1) & 0x4) | (vc & 0x1); + } + + const GPUBackendDrawPolygonCommand::Vertex* vertices[3] = {v0, v1, v2}; + vc = vc >> 1; + + // Invalid size early culling. + if (static_cast(std::abs(v2->x - v0->x)) >= MAX_PRIMITIVE_WIDTH || + static_cast(std::abs(v2->x - v1->x)) >= MAX_PRIMITIVE_WIDTH || + static_cast(std::abs(v1->x - v0->x)) >= MAX_PRIMITIVE_WIDTH || + static_cast(v2->y - v0->y) >= MAX_PRIMITIVE_HEIGHT || v0->y == v2->y) + { + return; + } + + // Same as line rasterization, use higher precision for position. + static constexpr auto makefp_xy = [](s32 x) { return (static_cast(x) << 32) + ((1LL << 32) - (1 << 11)); }; + static constexpr auto makestep_xy = [](s32 dx, s32 dy) -> s64 { + return (((static_cast(dx) << 32) + ((dx < 0) ? -(dy - 1) : ((dx > 0) ? (dy - 1) : 0))) / dy); + }; + const s64 base_coord = makefp_xy(v0->x); + const s64 base_step = makestep_xy(v2->x - v0->x, v2->y - v0->y); + const s64 bound_coord_us = (v1->y == v0->y) ? 0 : makestep_xy(v1->x - v0->x, v1->y - v0->y); + const s64 bound_coord_ls = (v2->y == v1->y) ? 0 : makestep_xy(v2->x - v1->x, v2->y - v1->y); + const u32 vo = (vc != 0) ? 1 : 0; + const u32 vp = (vc == 2) ? 3 : 0; + const bool right_facing = (v1->y == v0->y) ? (v1->x > v0->x) : (bound_coord_us > base_step); + const u32 rfi = BoolToUInt32(right_facing); + const u32 ofi = BoolToUInt32(!right_facing); + + TrianglePart triparts[2]; + TrianglePart& tpo = triparts[vo]; + TrianglePart& tpp = triparts[vo ^ 1]; + tpo.start_y = vertices[0 ^ vo]->y; + tpo.end_y = vertices[1 ^ vo]->y; + tpp.start_y = vertices[1 ^ vp]->y; + tpp.end_y = vertices[2 ^ vp]->y; + tpo.start_x[rfi] = makefp_xy(vertices[0 ^ vo]->x); + tpo.step_x[rfi] = bound_coord_us; + tpo.start_x[ofi] = base_coord + ((vertices[vo]->y - vertices[0]->y) * base_step); + tpo.step_x[ofi] = base_step; + tpo.fill_upside_down = ConvertToBoolUnchecked(vo); + tpp.start_x[rfi] = makefp_xy(vertices[1 ^ vp]->x); + tpp.step_x[rfi] = bound_coord_ls; + tpp.start_x[ofi] = base_coord + ((vertices[1 ^ vp]->y - vertices[0]->y) * base_step); + tpp.step_x[ofi] = base_step; + tpp.fill_upside_down = (vp != 0); + +#define ATTRIB_DETERMINANT(x, y) (((v1->x - v0->x) * (v2->y - v1->y)) - ((v2->x - v1->x) * (v1->y - v0->y))) +#define ATTRIB_STEP(x, y) (static_cast(ATTRIB_DETERMINANT(x, y) * (1 << ATTRIB_SHIFT) / det) << ATTRIB_POST_SHIFT) + + // Check edges. + const s32 det = ATTRIB_DETERMINANT(x, y); + if (det == 0) [[unlikely]] + return; + + // Compute step values. 
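+  // ATTRIB_STEP reuses the determinant form with one coordinate substituted by the attribute:
+  // d(attr)/dx = det(attr, y) / det(x, y) and d(attr)/dy = det(x, attr) / det(x, y), scaled into the
+  // ATTRIB_SHIFT / ATTRIB_POST_SHIFT fixed-point format consumed by the steppers.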
+ UVSteps uvstep; + RGBSteps rgbstep; + if constexpr (texture_enable) + { + uvstep.dudx = ATTRIB_STEP(u, y); + uvstep.dvdx = ATTRIB_STEP(v, y); + uvstep.dudy = ATTRIB_STEP(x, u); + uvstep.dvdy = ATTRIB_STEP(x, v); + } + + if constexpr (shading_enable) + { + rgbstep.drdx = ATTRIB_STEP(r, y); + rgbstep.dgdx = ATTRIB_STEP(g, y); + rgbstep.dbdx = ATTRIB_STEP(b, y); + rgbstep.drdy = ATTRIB_STEP(x, r); + rgbstep.dgdy = ATTRIB_STEP(x, g); + rgbstep.dbdy = ATTRIB_STEP(x, b); + } + +#undef ATTRIB_STEP +#undef ATTRIB_DETERMINANT + + // Undo the start of the vertex, so that when we add the offset for each line, it starts at the beginning value. + UVStepper uv; + RGBStepper rgb; + const GPUBackendDrawPolygonCommand::Vertex* core_vertex = vertices[vc]; + if constexpr (texture_enable) + { + uv.SetStart(core_vertex->u, core_vertex->v); + uv.StepXY(uvstep, -core_vertex->x, -core_vertex->y); + } + else + { + // Not actually used, but shut up the compiler. Should get optimized out. + uv = {}; + } + + rgb.SetStart(core_vertex->r, core_vertex->g, core_vertex->b); + if constexpr (shading_enable) + rgb.StepXY(rgbstep, -core_vertex->x, -core_vertex->y); + +#ifdef USE_VECTOR + BACKUP_VRAM(); +#endif + + for (u32 i = 0; i < 2; i++) + { + DrawTrianglePart( + cmd, triparts[i], uv, uvstep, rgb, rgbstep); + } + +#ifdef USE_VECTOR + CHECK_VRAM( + GPU_SW_Rasterizer::DrawTriangleFunctions[shading_enable][texture_enable][raw_texture_enable][transparency_enable] + [dithering_enable](cmd, orig_v0, orig_v1, orig_v2)); +#endif +} + +constinit const DrawRectangleFunctionTable DrawRectangleFunctions = { + {{&DrawRectangle, &DrawRectangle}, + {&DrawRectangle, &DrawRectangle}}, + {{&DrawRectangle, &DrawRectangle}, + {&DrawRectangle, &DrawRectangle}}}; + +constinit const DrawLineFunctionTable DrawLineFunctions = { + {{&DrawLine, &DrawLine}, + {&DrawLine, &DrawLine}}, + {{&DrawLine, &DrawLine}, + {&DrawLine, &DrawLine}}}; + +constinit const DrawTriangleFunctionTable DrawTriangleFunctions = { + {{{{&DrawTriangle, &DrawTriangle}, + {&DrawTriangle, &DrawTriangle}}, + {{&DrawTriangle, &DrawTriangle}, + {&DrawTriangle, &DrawTriangle}}}, + {{{&DrawTriangle, &DrawTriangle}, + {&DrawTriangle, &DrawTriangle}}, + {{&DrawTriangle, &DrawTriangle}, + {&DrawTriangle, &DrawTriangle}}}}, + {{{{&DrawTriangle, &DrawTriangle}, + {&DrawTriangle, &DrawTriangle}}, + {{&DrawTriangle, &DrawTriangle}, + {&DrawTriangle, &DrawTriangle}}}, + {{{&DrawTriangle, &DrawTriangle}, + {&DrawTriangle, &DrawTriangle}}, + {{&DrawTriangle, &DrawTriangle}, + {&DrawTriangle, &DrawTriangle}}}}}; + +#ifdef __INTELLISENSE__ +} +#endif diff --git a/src/core/gpu_sw_rasterizer_avx2.cpp b/src/core/gpu_sw_rasterizer_avx2.cpp new file mode 100644 index 000000000..67dc938e6 --- /dev/null +++ b/src/core/gpu_sw_rasterizer_avx2.cpp @@ -0,0 +1,12 @@ +// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin +// SPDX-License-Identifier: CC-BY-NC-ND-4.0 + +#include "gpu_sw_rasterizer.h" + +#include "common/assert.h" +#include "common/gsvector.h" + +namespace GPU_SW_Rasterizer::AVX2 { +#define USE_VECTOR 1 +#include "gpu_sw_rasterizer.inl" +} diff --git a/src/core/gpu_types.h b/src/core/gpu_types.h index 2a01679e8..7faadd71e 100644 --- a/src/core/gpu_types.h +++ b/src/core/gpu_types.h @@ -333,6 +333,7 @@ struct GPUBackendCopyVRAMCommand : public GPUBackendCommand struct GPUBackendSetDrawingAreaCommand : public GPUBackendCommand { GPUDrawingArea new_area; + s32 new_clamped_area[4]; }; struct GPUBackendUpdateCLUTCommand : public GPUBackendCommand
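Taken together, the per-variant tables above are what makes the dynamic selection in this patch work: every combination of shading/texturing/raw-texture/transparency/dithering flags gets its own instantiation, and a caller picks one with plain array indexing. A hypothetical caller sketch follows, assuming the triangle table is reachable as GPU_SW_Rasterizer::DrawTriangleFunctions the same way the CHECK_VRAM path above uses it; DispatchTriangle itself is not part of the patch.

#include "gpu_sw_rasterizer.h"

// Illustrative only: picks the triangle rasterizer variant matching a draw's flags.
static void DispatchTriangle(const GPUBackendDrawPolygonCommand* cmd,
                             const GPUBackendDrawPolygonCommand::Vertex* v0,
                             const GPUBackendDrawPolygonCommand::Vertex* v1,
                             const GPUBackendDrawPolygonCommand::Vertex* v2,
                             bool shading, bool textured, bool raw_texture, bool transparent, bool dithering)
{
  const auto& table = GPU_SW_Rasterizer::DrawTriangleFunctions;
  table[shading][textured][raw_texture][transparent][dithering](cmd, v0, v1, v2);
}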