diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 2297591e8..a0a9662e7 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -57,6 +57,8 @@ add_library(core
gpu_sw.h
gpu_sw_backend.cpp
gpu_sw_backend.h
+ gpu_sw_rasterizer.cpp
+ gpu_sw_rasterizer.h
gpu_types.h
guncon.cpp
guncon.h
diff --git a/src/core/core.vcxproj b/src/core/core.vcxproj
index 7c11148c2..2144c2873 100644
--- a/src/core/core.vcxproj
+++ b/src/core/core.vcxproj
@@ -50,6 +50,13 @@
+
+
+ AdvancedVectorExtensions2
+ %(AdditionalOptions) -mavx2
+ true
+ NotUsing
+
@@ -127,6 +134,7 @@
+
@@ -195,6 +203,9 @@
{57f6206d-f264-4b07-baf8-11b9bbe1f455}
+
+
+
{868B98C8-65A1-494B-8346-250A73A48C0A}
diff --git a/src/core/core.vcxproj.filters b/src/core/core.vcxproj.filters
index b089e83f1..f623ed9f2 100644
--- a/src/core/core.vcxproj.filters
+++ b/src/core/core.vcxproj.filters
@@ -67,6 +67,8 @@
+
+
@@ -140,5 +142,9 @@
+
+
+
+
\ No newline at end of file
diff --git a/src/core/gpu_backend.cpp b/src/core/gpu_backend.cpp
index 367236e05..57508ff78 100644
--- a/src/core/gpu_backend.cpp
+++ b/src/core/gpu_backend.cpp
@@ -26,7 +26,7 @@ bool GPUBackend::Initialize(bool force_thread)
void GPUBackend::Reset()
{
Sync(true);
- m_drawing_area = {};
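+ // Propagate the cleared drawing area to the backend via DrawingAreaChanged(), since the backend no longer caches it.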
+ DrawingAreaChanged(GPUDrawingArea{0, 0, 0, 0}, GSVector4i::zero());
}
void GPUBackend::UpdateSettings()
@@ -310,8 +310,8 @@ void GPUBackend::HandleCommand(const GPUBackendCommand* cmd)
case GPUBackendCommandType::SetDrawingArea:
{
FlushRender();
- m_drawing_area = static_cast<const GPUBackendSetDrawingAreaCommand*>(cmd)->new_area;
- DrawingAreaChanged();
+ const GPUBackendSetDrawingAreaCommand* ccmd = static_cast<const GPUBackendSetDrawingAreaCommand*>(cmd);
+ DrawingAreaChanged(ccmd->new_area, GSVector4i::load<false>(ccmd->new_clamped_area));
}
break;
diff --git a/src/core/gpu_backend.h b/src/core/gpu_backend.h
index 67854dc5a..b58df358c 100644
--- a/src/core/gpu_backend.h
+++ b/src/core/gpu_backend.h
@@ -62,13 +62,11 @@ protected:
virtual void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd) = 0;
virtual void DrawLine(const GPUBackendDrawLineCommand* cmd) = 0;
virtual void FlushRender() = 0;
- virtual void DrawingAreaChanged() = 0;
+ virtual void DrawingAreaChanged(const GPUDrawingArea& new_drawing_area, const GSVector4i clamped_drawing_area) = 0;
virtual void UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit) = 0;
void HandleCommand(const GPUBackendCommand* cmd);
- GPUDrawingArea m_drawing_area = {};
-
Threading::KernelSemaphore m_sync_semaphore;
std::atomic_bool m_gpu_thread_sleeping{false};
std::atomic_bool m_gpu_loop_done{false};
diff --git a/src/core/gpu_sw.cpp b/src/core/gpu_sw.cpp
index e2f9110a8..56a88e779 100644
--- a/src/core/gpu_sw.cpp
+++ b/src/core/gpu_sw.cpp
@@ -501,6 +501,7 @@ void GPU_SW::DispatchRenderCommand()
{
GPUBackendSetDrawingAreaCommand* cmd = m_backend.NewSetDrawingAreaCommand();
cmd->new_area = m_drawing_area;
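+ // Forward the pre-clamped drawing area alongside the raw one so the backend receives both.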
+ GSVector4i::store<false>(cmd->new_clamped_area, m_clamped_drawing_area);
m_backend.PushCommand(cmd);
m_drawing_area_changed = false;
}
diff --git a/src/core/gpu_sw_backend.cpp b/src/core/gpu_sw_backend.cpp
index 0f4def66d..cbf547d88 100644
--- a/src/core/gpu_sw_backend.cpp
+++ b/src/core/gpu_sw_backend.cpp
@@ -3,6 +3,7 @@
#include "gpu_sw_backend.h"
#include "gpu.h"
+#include "gpu_sw_rasterizer.h"
#include "system.h"
#include "util/gpu_device.h"
@@ -15,6 +16,8 @@ GPU_SW_Backend::~GPU_SW_Backend() = default;
bool GPU_SW_Backend::Initialize(bool force_thread)
{
+ GPU_SW_Rasterizer::SelectImplementation();
+
return GPUBackend::Initialize(force_thread);
}
@@ -28,688 +31,31 @@ void GPU_SW_Backend::DrawPolygon(const GPUBackendDrawPolygonCommand* cmd)
const GPURenderCommand rc{cmd->rc.bits};
const bool dithering_enable = rc.IsDitheringEnabled() && cmd->draw_mode.dither_enable;
- const DrawTriangleFunction DrawFunction = GetDrawTriangleFunction(
+ const GPU_SW_Rasterizer::DrawTriangleFunction DrawFunction = GPU_SW_Rasterizer::GetDrawTriangleFunction(
rc.shading_enable, rc.texture_enable, rc.raw_texture_enable, rc.transparency_enable, dithering_enable);
- (this->*DrawFunction)(cmd, &cmd->vertices[0], &cmd->vertices[1], &cmd->vertices[2]);
+ DrawFunction(cmd, &cmd->vertices[0], &cmd->vertices[1], &cmd->vertices[2]);
if (rc.quad_polygon)
- (this->*DrawFunction)(cmd, &cmd->vertices[2], &cmd->vertices[1], &cmd->vertices[3]);
+ DrawFunction(cmd, &cmd->vertices[2], &cmd->vertices[1], &cmd->vertices[3]);
}
void GPU_SW_Backend::DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
{
const GPURenderCommand rc{cmd->rc.bits};
- const DrawRectangleFunction DrawFunction =
- GetDrawRectangleFunction(rc.texture_enable, rc.raw_texture_enable, rc.transparency_enable);
+ const GPU_SW_Rasterizer::DrawRectangleFunction DrawFunction =
+ GPU_SW_Rasterizer::GetDrawRectangleFunction(rc.texture_enable, rc.raw_texture_enable, rc.transparency_enable);
- (this->*DrawFunction)(cmd);
+ DrawFunction(cmd);
}
void GPU_SW_Backend::DrawLine(const GPUBackendDrawLineCommand* cmd)
{
- const DrawLineFunction DrawFunction =
- GetDrawLineFunction(cmd->rc.shading_enable, cmd->rc.transparency_enable, cmd->IsDitheringEnabled());
+ const GPU_SW_Rasterizer::DrawLineFunction DrawFunction = GPU_SW_Rasterizer::GetDrawLineFunction(
+ cmd->rc.shading_enable, cmd->rc.transparency_enable, cmd->IsDitheringEnabled());
for (u16 i = 1; i < cmd->num_vertices; i++)
- (this->*DrawFunction)(cmd, &cmd->vertices[i - 1], &cmd->vertices[i]);
-}
-
-constexpr GPU_SW_Backend::DitherLUT GPU_SW_Backend::ComputeDitherLUT()
-{
- DitherLUT lut = {};
- for (u32 i = 0; i < DITHER_MATRIX_SIZE; i++)
- {
- for (u32 j = 0; j < DITHER_MATRIX_SIZE; j++)
- {
- for (u32 value = 0; value < DITHER_LUT_SIZE; value++)
- {
- const s32 dithered_value = (static_cast<s32>(value) + DITHER_MATRIX[i][j]) >> 3;
- lut[i][j][value] = static_cast<u8>((dithered_value < 0) ? 0 : ((dithered_value > 31) ? 31 : dithered_value));
- }
- }
- }
- return lut;
-}
-
-static constexpr GPU_SW_Backend::DitherLUT s_dither_lut = GPU_SW_Backend::ComputeDitherLUT();
-
-template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
-void ALWAYS_INLINE_RELEASE GPU_SW_Backend::ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y, u8 color_r,
- u8 color_g, u8 color_b, u8 texcoord_x, u8 texcoord_y)
-{
- VRAMPixel color;
- if constexpr (texture_enable)
- {
- // Apply texture window
- texcoord_x = (texcoord_x & cmd->window.and_x) | cmd->window.or_x;
- texcoord_y = (texcoord_y & cmd->window.and_y) | cmd->window.or_y;
-
- VRAMPixel texture_color;
- switch (cmd->draw_mode.texture_mode)
- {
- case GPUTextureMode::Palette4Bit:
- {
- const u16 palette_value =
- GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x / 4)) % VRAM_WIDTH,
- (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
- const size_t palette_index = (palette_value >> ((texcoord_x % 4) * 4)) & 0x0Fu;
- texture_color.bits = g_gpu_clut[palette_index];
- }
- break;
-
- case GPUTextureMode::Palette8Bit:
- {
- const u16 palette_value =
- GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x / 2)) % VRAM_WIDTH,
- (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
- const size_t palette_index = (palette_value >> ((texcoord_x % 2) * 8)) & 0xFFu;
- texture_color.bits = g_gpu_clut[palette_index];
- }
- break;
-
- default:
- {
- texture_color.bits = GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x)) % VRAM_WIDTH,
- (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
- }
- break;
- }
-
- if (texture_color.bits == 0)
- return;
-
- if constexpr (raw_texture_enable)
- {
- color.bits = texture_color.bits;
- }
- else
- {
- const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u;
- const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u;
-
- color.bits = (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.r) * u16(color_r)) >> 4]) << 0) |
- (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.g) * u16(color_g)) >> 4]) << 5) |
- (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.b) * u16(color_b)) >> 4]) << 10) |
- (texture_color.bits & 0x8000u);
- }
- }
- else
- {
- const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u;
- const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u;
-
- // Non-textured transparent polygons don't set bit 15, but are treated as transparent.
- color.bits = (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_r]) << 0) |
- (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_g]) << 5) |
- (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_b]) << 10) | (transparency_enable ? 0x8000u : 0);
- }
-
- const VRAMPixel bg_color{GetPixel(static_cast<u32>(x), static_cast<u32>(y))};
- if constexpr (transparency_enable)
- {
- if (color.bits & 0x8000u || !texture_enable)
- {
- // Based on blargg's efficient 15bpp pixel math.
- u32 bg_bits = ZeroExtend32(bg_color.bits);
- u32 fg_bits = ZeroExtend32(color.bits);
- switch (cmd->draw_mode.transparency_mode)
- {
- case GPUTransparencyMode::HalfBackgroundPlusHalfForeground:
- {
- bg_bits |= 0x8000u;
- color.bits = Truncate16(((fg_bits + bg_bits) - ((fg_bits ^ bg_bits) & 0x0421u)) >> 1);
- }
- break;
-
- case GPUTransparencyMode::BackgroundPlusForeground:
- {
- bg_bits &= ~0x8000u;
-
- const u32 sum = fg_bits + bg_bits;
- const u32 carry = (sum - ((fg_bits ^ bg_bits) & 0x8421u)) & 0x8420u;
-
- color.bits = Truncate16((sum - carry) | (carry - (carry >> 5)));
- }
- break;
-
- case GPUTransparencyMode::BackgroundMinusForeground:
- {
- bg_bits |= 0x8000u;
- fg_bits &= ~0x8000u;
-
- const u32 diff = bg_bits - fg_bits + 0x108420u;
- const u32 borrow = (diff - ((bg_bits ^ fg_bits) & 0x108420u)) & 0x108420u;
-
- color.bits = Truncate16((diff - borrow) & (borrow - (borrow >> 5)));
- }
- break;
-
- case GPUTransparencyMode::BackgroundPlusQuarterForeground:
- {
- bg_bits &= ~0x8000u;
- fg_bits = ((fg_bits >> 2) & 0x1CE7u) | 0x8000u;
-
- const u32 sum = fg_bits + bg_bits;
- const u32 carry = (sum - ((fg_bits ^ bg_bits) & 0x8421u)) & 0x8420u;
-
- color.bits = Truncate16((sum - carry) | (carry - (carry >> 5)));
- }
- break;
-
- default:
- break;
- }
-
- // See above.
- if constexpr (!texture_enable)
- color.bits &= ~0x8000u;
- }
- }
-
- const u16 mask_and = cmd->params.GetMaskAND();
- if ((bg_color.bits & mask_and) != 0)
- return;
-
- DebugAssert(static_cast<u32>(x) < VRAM_WIDTH && static_cast<u32>(y) < VRAM_HEIGHT);
- SetPixel(static_cast<u32>(x), static_cast<u32>(y), color.bits | cmd->params.GetMaskOR());
-}
-
-template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
-void GPU_SW_Backend::DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
-{
- const s32 origin_x = cmd->x;
- const s32 origin_y = cmd->y;
- const auto [r, g, b] = UnpackColorRGB24(cmd->color);
- const auto [origin_texcoord_x, origin_texcoord_y] = UnpackTexcoord(cmd->texcoord);
-
- for (u32 offset_y = 0; offset_y < cmd->height; offset_y++)
- {
- const s32 y = origin_y + static_cast<s32>(offset_y);
- if (y < static_cast<s32>(m_drawing_area.top) || y > static_cast<s32>(m_drawing_area.bottom) ||
- (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u)))
- {
- continue;
- }
-
- const u32 draw_y = static_cast<u32>(y) & VRAM_HEIGHT_MASK;
- const u8 texcoord_y = Truncate8(ZeroExtend32(origin_texcoord_y) + offset_y);
-
- for (u32 offset_x = 0; offset_x < cmd->width; offset_x++)
- {
- const s32 x = origin_x + static_cast<s32>(offset_x);
- if (x < static_cast<s32>(m_drawing_area.left) || x > static_cast<s32>(m_drawing_area.right))
- continue;
-
- const u8 texcoord_x = Truncate8(ZeroExtend32(origin_texcoord_x) + offset_x);
-
- ShadePixel<texture_enable, raw_texture_enable, transparency_enable, false>(cmd, static_cast<u32>(x), draw_y, r, g,
- b, texcoord_x, texcoord_y);
- }
- }
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Polygon and line rasterization ported from Mednafen
-//////////////////////////////////////////////////////////////////////////
-
-#define COORD_FBS 12
-#define COORD_MF_INT(n) ((n) << COORD_FBS)
-#define COORD_POST_PADDING 12
-
-static ALWAYS_INLINE_RELEASE s64 MakePolyXFP(s32 x)
-{
- return ((u64)x << 32) + ((1ULL << 32) - (1 << 11));
-}
-
-static ALWAYS_INLINE_RELEASE s64 MakePolyXFPStep(s32 dx, s32 dy)
-{
- s64 ret;
- s64 dx_ex = (u64)dx << 32;
-
- if (dx_ex < 0)
- dx_ex -= dy - 1;
-
- if (dx_ex > 0)
- dx_ex += dy - 1;
-
- ret = dx_ex / dy;
-
- return (ret);
-}
-
-static ALWAYS_INLINE_RELEASE s32 GetPolyXFP_Int(s64 xfp)
-{
- return (xfp >> 32);
-}
-
-template<bool shading_enable, bool texture_enable>
-bool ALWAYS_INLINE_RELEASE GPU_SW_Backend::CalcIDeltas(i_deltas& idl, const GPUBackendDrawPolygonCommand::Vertex* A,
- const GPUBackendDrawPolygonCommand::Vertex* B,
- const GPUBackendDrawPolygonCommand::Vertex* C)
-{
-#define CALCIS(x, y) (((B->x - A->x) * (C->y - B->y)) - ((C->x - B->x) * (B->y - A->y)))
-
- s32 denom = CALCIS(x, y);
-
- if (!denom)
- return false;
-
- if constexpr (shading_enable)
- {
- idl.dr_dx = (u32)(CALCIS(r, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
- idl.dr_dy = (u32)(CALCIS(x, r) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
-
- idl.dg_dx = (u32)(CALCIS(g, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
- idl.dg_dy = (u32)(CALCIS(x, g) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
-
- idl.db_dx = (u32)(CALCIS(b, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
- idl.db_dy = (u32)(CALCIS(x, b) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
- }
-
- if constexpr (texture_enable)
- {
- idl.du_dx = (u32)(CALCIS(u, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
- idl.du_dy = (u32)(CALCIS(x, u) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
-
- idl.dv_dx = (u32)(CALCIS(v, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
- idl.dv_dy = (u32)(CALCIS(x, v) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
- }
-
- return true;
-
-#undef CALCIS
-}
-
-template<bool shading_enable, bool texture_enable>
-void ALWAYS_INLINE_RELEASE GPU_SW_Backend::AddIDeltas_DX(i_group& ig, const i_deltas& idl, u32 count /*= 1*/)
-{
- if constexpr (shading_enable)
- {
- ig.r += idl.dr_dx * count;
- ig.g += idl.dg_dx * count;
- ig.b += idl.db_dx * count;
- }
-
- if constexpr (texture_enable)
- {
- ig.u += idl.du_dx * count;
- ig.v += idl.dv_dx * count;
- }
-}
-
-template<bool shading_enable, bool texture_enable>
-void ALWAYS_INLINE_RELEASE GPU_SW_Backend::AddIDeltas_DY(i_group& ig, const i_deltas& idl, u32 count /*= 1*/)
-{
- if constexpr (shading_enable)
- {
- ig.r += idl.dr_dy * count;
- ig.g += idl.dg_dy * count;
- ig.b += idl.db_dy * count;
- }
-
- if constexpr (texture_enable)
- {
- ig.u += idl.du_dy * count;
- ig.v += idl.dv_dy * count;
- }
-}
-
-template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
-void GPU_SW_Backend::DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, i_group ig,
- const i_deltas& idl)
-{
- if (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u))
- return;
-
- s32 x_ig_adjust = x_start;
- s32 w = x_bound - x_start;
- s32 x = TruncateGPUVertexPosition(x_start);
-
- if (x < static_cast<s32>(m_drawing_area.left))
- {
- s32 delta = static_cast<s32>(m_drawing_area.left) - x;
- x_ig_adjust += delta;
- x += delta;
- w -= delta;
- }
-
- if ((x + w) > (static_cast<s32>(m_drawing_area.right) + 1))
- w = static_cast<s32>(m_drawing_area.right) + 1 - x;
-
- if (w <= 0)
- return;
-
- AddIDeltas_DX<shading_enable, texture_enable>(ig, idl, x_ig_adjust);
- AddIDeltas_DY<shading_enable, texture_enable>(ig, idl, y);
-
- do
- {
- const u32 r = ig.r >> (COORD_FBS + COORD_POST_PADDING);
- const u32 g = ig.g >> (COORD_FBS + COORD_POST_PADDING);
- const u32 b = ig.b >> (COORD_FBS + COORD_POST_PADDING);
- const u32 u = ig.u >> (COORD_FBS + COORD_POST_PADDING);
- const u32 v = ig.v >> (COORD_FBS + COORD_POST_PADDING);
-
- ShadePixel<texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
- cmd, static_cast<u32>(x), static_cast<u32>(y), Truncate8(r), Truncate8(g), Truncate8(b), Truncate8(u),
- Truncate8(v));
-
- x++;
- AddIDeltas_DX<shading_enable, texture_enable>(ig, idl);
- } while (--w > 0);
-}
-
-template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
-void GPU_SW_Backend::DrawTriangle(const GPUBackendDrawPolygonCommand* cmd,
- const GPUBackendDrawPolygonCommand::Vertex* v0,
- const GPUBackendDrawPolygonCommand::Vertex* v1,
- const GPUBackendDrawPolygonCommand::Vertex* v2)
-{
- u32 core_vertex;
- {
- u32 cvtemp = 0;
-
- if (v1->x <= v0->x)
- {
- if (v2->x <= v1->x)
- cvtemp = (1 << 2);
- else
- cvtemp = (1 << 1);
- }
- else if (v2->x < v0->x)
- cvtemp = (1 << 2);
- else
- cvtemp = (1 << 0);
-
- if (v2->y < v1->y)
- {
- std::swap(v2, v1);
- cvtemp = ((cvtemp >> 1) & 0x2) | ((cvtemp << 1) & 0x4) | (cvtemp & 0x1);
- }
-
- if (v1->y < v0->y)
- {
- std::swap(v1, v0);
- cvtemp = ((cvtemp >> 1) & 0x1) | ((cvtemp << 1) & 0x2) | (cvtemp & 0x4);
- }
-
- if (v2->y < v1->y)
- {
- std::swap(v2, v1);
- cvtemp = ((cvtemp >> 1) & 0x2) | ((cvtemp << 1) & 0x4) | (cvtemp & 0x1);
- }
-
- core_vertex = cvtemp >> 1;
- }
-
- if (v0->y == v2->y)
- return;
-
- if (static_cast<u32>(std::abs(v2->x - v0->x)) >= MAX_PRIMITIVE_WIDTH ||
- static_cast<u32>(std::abs(v2->x - v1->x)) >= MAX_PRIMITIVE_WIDTH ||
- static_cast<u32>(std::abs(v1->x - v0->x)) >= MAX_PRIMITIVE_WIDTH ||
- static_cast<u32>(v2->y - v0->y) >= MAX_PRIMITIVE_HEIGHT)
- {
- return;
- }
-
- s64 base_coord = MakePolyXFP(v0->x);
- s64 base_step = MakePolyXFPStep((v2->x - v0->x), (v2->y - v0->y));
- s64 bound_coord_us;
- s64 bound_coord_ls;
- bool right_facing;
-
- if (v1->y == v0->y)
- {
- bound_coord_us = 0;
- right_facing = (bool)(v1->x > v0->x);
- }
- else
- {
- bound_coord_us = MakePolyXFPStep((v1->x - v0->x), (v1->y - v0->y));
- right_facing = (bool)(bound_coord_us > base_step);
- }
-
- if (v2->y == v1->y)
- bound_coord_ls = 0;
- else
- bound_coord_ls = MakePolyXFPStep((v2->x - v1->x), (v2->y - v1->y));
-
- i_deltas idl;
- if (!CalcIDeltas<shading_enable, texture_enable>(idl, v0, v1, v2))
- return;
-
- const GPUBackendDrawPolygonCommand::Vertex* vertices[3] = {v0, v1, v2};
-
- i_group ig;
- if constexpr (texture_enable)
- {
- ig.u = (COORD_MF_INT(vertices[core_vertex]->u) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
- ig.v = (COORD_MF_INT(vertices[core_vertex]->v) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
- }
-
- ig.r = (COORD_MF_INT(vertices[core_vertex]->r) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
- ig.g = (COORD_MF_INT(vertices[core_vertex]->g) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
- ig.b = (COORD_MF_INT(vertices[core_vertex]->b) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
-
- AddIDeltas_DX<shading_enable, texture_enable>(ig, idl, -vertices[core_vertex]->x);
- AddIDeltas_DY<shading_enable, texture_enable>(ig, idl, -vertices[core_vertex]->y);
-
- struct TriangleHalf
- {
- u64 x_coord[2];
- u64 x_step[2];
-
- s32 y_coord;
- s32 y_bound;
-
- bool dec_mode;
- } tripart[2];
-
- u32 vo = 0;
- u32 vp = 0;
- if (core_vertex != 0)
- vo = 1;
- if (core_vertex == 2)
- vp = 3;
-
- {
- TriangleHalf* tp = &tripart[vo];
- tp->y_coord = vertices[0 ^ vo]->y;
- tp->y_bound = vertices[1 ^ vo]->y;
- tp->x_coord[right_facing] = MakePolyXFP(vertices[0 ^ vo]->x);
- tp->x_step[right_facing] = bound_coord_us;
- tp->x_coord[!right_facing] = base_coord + ((vertices[vo]->y - vertices[0]->y) * base_step);
- tp->x_step[!right_facing] = base_step;
- tp->dec_mode = vo;
- }
-
- {
- TriangleHalf* tp = &tripart[vo ^ 1];
- tp->y_coord = vertices[1 ^ vp]->y;
- tp->y_bound = vertices[2 ^ vp]->y;
- tp->x_coord[right_facing] = MakePolyXFP(vertices[1 ^ vp]->x);
- tp->x_step[right_facing] = bound_coord_ls;
- tp->x_coord[!right_facing] =
- base_coord + ((vertices[1 ^ vp]->y - vertices[0]->y) *
- base_step); // base_coord + ((vertices[1].y - vertices[0].y) * base_step);
- tp->x_step[!right_facing] = base_step;
- tp->dec_mode = vp;
- }
-
- for (u32 i = 0; i < 2; i++)
- {
- s32 yi = tripart[i].y_coord;
- s32 yb = tripart[i].y_bound;
-
- u64 lc = tripart[i].x_coord[0];
- u64 ls = tripart[i].x_step[0];
-
- u64 rc = tripart[i].x_coord[1];
- u64 rs = tripart[i].x_step[1];
-
- if (tripart[i].dec_mode)
- {
- while (yi > yb)
- {
- yi--;
- lc -= ls;
- rc -= rs;
-
- s32 y = TruncateGPUVertexPosition(yi);
-
- if (y < static_cast<s32>(m_drawing_area.top))
- break;
-
- if (y > static_cast<s32>(m_drawing_area.bottom))
- continue;
-
- DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
- cmd, y & VRAM_HEIGHT_MASK, GetPolyXFP_Int(lc), GetPolyXFP_Int(rc), ig, idl);
- }
- }
- else
- {
- while (yi < yb)
- {
- s32 y = TruncateGPUVertexPosition(yi);
-
- if (y > static_cast<s32>(m_drawing_area.bottom))
- break;
-
- if (y >= static_cast<s32>(m_drawing_area.top))
- {
- DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
- cmd, y & VRAM_HEIGHT_MASK, GetPolyXFP_Int(lc), GetPolyXFP_Int(rc), ig, idl);
- }
-
- yi++;
- lc += ls;
- rc += rs;
- }
- }
- }
-}
-
-enum
-{
- Line_XY_FractBits = 32
-};
-enum
-{
- Line_RGB_FractBits = 12
-};
-
-struct line_fxp_coord
-{
- u64 x, y;
- u32 r, g, b;
-};
-
-struct line_fxp_step
-{
- s64 dx_dk, dy_dk;
- s32 dr_dk, dg_dk, db_dk;
-};
-
-static ALWAYS_INLINE_RELEASE s64 LineDivide(s64 delta, s32 dk)
-{
- delta = (u64)delta << Line_XY_FractBits;
-
- if (delta < 0)
- delta -= dk - 1;
- if (delta > 0)
- delta += dk - 1;
-
- return (delta / dk);
-}
-
-template<bool shading_enable, bool transparency_enable, bool dithering_enable>
-void GPU_SW_Backend::DrawLine(const GPUBackendDrawLineCommand* cmd, const GPUBackendDrawLineCommand::Vertex* p0,
- const GPUBackendDrawLineCommand::Vertex* p1)
-{
- const s32 i_dx = std::abs(p1->x - p0->x);
- const s32 i_dy = std::abs(p1->y - p0->y);
- const s32 k = (i_dx > i_dy) ? i_dx : i_dy;
- if (i_dx >= MAX_PRIMITIVE_WIDTH || i_dy >= MAX_PRIMITIVE_HEIGHT)
- return;
-
- if (p0->x >= p1->x && k > 0)
- std::swap(p0, p1);
-
- line_fxp_step step;
- if (k == 0)
- {
- step.dx_dk = 0;
- step.dy_dk = 0;
-
- if constexpr (shading_enable)
- {
- step.dr_dk = 0;
- step.dg_dk = 0;
- step.db_dk = 0;
- }
- }
- else
- {
- step.dx_dk = LineDivide(p1->x - p0->x, k);
- step.dy_dk = LineDivide(p1->y - p0->y, k);
-
- if constexpr (shading_enable)
- {
- step.dr_dk = (s32)((u32)(p1->r - p0->r) << Line_RGB_FractBits) / k;
- step.dg_dk = (s32)((u32)(p1->g - p0->g) << Line_RGB_FractBits) / k;
- step.db_dk = (s32)((u32)(p1->b - p0->b) << Line_RGB_FractBits) / k;
- }
- }
-
- line_fxp_coord cur_point;
- cur_point.x = ((u64)p0->x << Line_XY_FractBits) | (1ULL << (Line_XY_FractBits - 1));
- cur_point.y = ((u64)p0->y << Line_XY_FractBits) | (1ULL << (Line_XY_FractBits - 1));
-
- cur_point.x -= 1024;
-
- if (step.dy_dk < 0)
- cur_point.y -= 1024;
-
- if constexpr (shading_enable)
- {
- cur_point.r = (p0->r << Line_RGB_FractBits) | (1 << (Line_RGB_FractBits - 1));
- cur_point.g = (p0->g << Line_RGB_FractBits) | (1 << (Line_RGB_FractBits - 1));
- cur_point.b = (p0->b << Line_RGB_FractBits) | (1 << (Line_RGB_FractBits - 1));
- }
-
- for (s32 i = 0; i <= k; i++)
- {
- // Sign extension is not necessary here for x and y, due to the maximum values that ClipX1 and ClipY1 can contain.
- const s32 x = (cur_point.x >> Line_XY_FractBits) & 2047;
- const s32 y = (cur_point.y >> Line_XY_FractBits) & 2047;
-
- if ((!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (Truncate8(static_cast<u32>(y)) & 1u)) &&
- x >= static_cast<s32>(m_drawing_area.left) && x <= static_cast<s32>(m_drawing_area.right) &&
- y >= static_cast<s32>(m_drawing_area.top) && y <= static_cast<s32>(m_drawing_area.bottom))
- {
- const u8 r = shading_enable ? static_cast<u8>(cur_point.r >> Line_RGB_FractBits) : p0->r;
- const u8 g = shading_enable ? static_cast<u8>(cur_point.g >> Line_RGB_FractBits) : p0->g;
- const u8 b = shading_enable ? static_cast<u8>(cur_point.b >> Line_RGB_FractBits) : p0->b;
-
- ShadePixel<false, false, transparency_enable, dithering_enable>(
- cmd, static_cast<u32>(x), static_cast<u32>(y) & VRAM_HEIGHT_MASK, r, g, b, 0, 0);
- }
-
- cur_point.x += step.dx_dk;
- cur_point.y += step.dy_dk;
-
- if constexpr (shading_enable)
- {
- cur_point.r += step.dr_dk;
- cur_point.g += step.dg_dk;
- cur_point.b += step.db_dk;
- }
- }
+ DrawFunction(cmd, &cmd->vertices[i - 1], &cmd->vertices[i]);
}
void GPU_SW_Backend::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params)
@@ -896,82 +242,16 @@ void GPU_SW_Backend::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 wi
}
}
-void GPU_SW_Backend::FlushRender()
-{
-}
-
-void GPU_SW_Backend::DrawingAreaChanged()
-{
-}
-
void GPU_SW_Backend::UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit)
{
GPU::ReadCLUT(g_gpu_clut, reg, clut_is_8bit);
}
-GPU_SW_Backend::DrawLineFunction GPU_SW_Backend::GetDrawLineFunction(bool shading_enable, bool transparency_enable,
- bool dithering_enable)
+void GPU_SW_Backend::DrawingAreaChanged(const GPUDrawingArea& new_drawing_area, const GSVector4i clamped_drawing_area)
{
- static constexpr DrawLineFunction funcs[2][2][2] = {
- {{&GPU_SW_Backend::DrawLine<false, false, false>, &GPU_SW_Backend::DrawLine<false, false, true>},
- {&GPU_SW_Backend::DrawLine<false, true, false>, &GPU_SW_Backend::DrawLine<false, true, true>}},
- {{&GPU_SW_Backend::DrawLine<true, false, false>, &GPU_SW_Backend::DrawLine<true, false, true>},
- {&GPU_SW_Backend::DrawLine<true, true, false>, &GPU_SW_Backend::DrawLine<true, true, true>}}};
-
- return funcs[u8(shading_enable)][u8(transparency_enable)][u8(dithering_enable)];
+ GPU_SW_Rasterizer::g_drawing_area = new_drawing_area;
}
-GPU_SW_Backend::DrawRectangleFunction
-GPU_SW_Backend::GetDrawRectangleFunction(bool texture_enable, bool raw_texture_enable, bool transparency_enable)
+void GPU_SW_Backend::FlushRender()
{
- static constexpr DrawRectangleFunction funcs[2][2][2] = {
- {{&GPU_SW_Backend::DrawRectangle<false, false, false>, &GPU_SW_Backend::DrawRectangle<false, false, true>},
- {&GPU_SW_Backend::DrawRectangle<false, true, false>, &GPU_SW_Backend::DrawRectangle<false, true, true>}},
- {{&GPU_SW_Backend::DrawRectangle<true, false, false>, &GPU_SW_Backend::DrawRectangle<true, false, true>},
- {&GPU_SW_Backend::DrawRectangle<true, true, false>, &GPU_SW_Backend::DrawRectangle<true, true, true>}}};
-
- return funcs[u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)];
}
-
-GPU_SW_Backend::DrawTriangleFunction GPU_SW_Backend::GetDrawTriangleFunction(bool shading_enable, bool texture_enable,
- bool raw_texture_enable,
- bool transparency_enable,
- bool dithering_enable)
-{
- static constexpr DrawTriangleFunction funcs[2][2][2][2][2] = {
- {{{{&GPU_SW_Backend::DrawTriangle<false, false, false, false, false>,
- &GPU_SW_Backend::DrawTriangle<false, false, false, false, true>},
- {&GPU_SW_Backend::DrawTriangle<false, false, false, true, false>,
- &GPU_SW_Backend::DrawTriangle<false, false, false, true, true>}},
- {{&GPU_SW_Backend::DrawTriangle<false, false, true, false, false>,
- &GPU_SW_Backend::DrawTriangle<false, false, true, false, true>},
- {&GPU_SW_Backend::DrawTriangle<false, false, true, true, false>,
- &GPU_SW_Backend::DrawTriangle<false, false, true, true, true>}}},
- {{{&GPU_SW_Backend::DrawTriangle<false, true, false, false, false>,
- &GPU_SW_Backend::DrawTriangle<false, true, false, false, true>},
- {&GPU_SW_Backend::DrawTriangle<false, true, false, true, false>,
- &GPU_SW_Backend::DrawTriangle<false, true, false, true, true>}},
- {{&GPU_SW_Backend::DrawTriangle<false, true, true, false, false>,
- &GPU_SW_Backend::DrawTriangle<false, true, true, false, true>},
- {&GPU_SW_Backend::DrawTriangle<false, true, true, true, false>,
- &GPU_SW_Backend::DrawTriangle<false, true, true, true, true>}}}},
- {{{{&GPU_SW_Backend::DrawTriangle<true, false, false, false, false>,
- &GPU_SW_Backend::DrawTriangle<true, false, false, false, true>},
- {&GPU_SW_Backend::DrawTriangle<true, false, false, true, false>,
- &GPU_SW_Backend::DrawTriangle<true, false, false, true, true>}},
- {{&GPU_SW_Backend::DrawTriangle<true, false, true, false, false>,
- &GPU_SW_Backend::DrawTriangle<true, false, true, false, true>},
- {&GPU_SW_Backend::DrawTriangle<true, false, true, true, false>,
- &GPU_SW_Backend::DrawTriangle<true, false, true, true, true>}}},
- {{{&GPU_SW_Backend::DrawTriangle<true, true, false, false, false>,
- &GPU_SW_Backend::DrawTriangle<true, true, false, false, true>},
- {&GPU_SW_Backend::DrawTriangle<true, true, false, true, false>,
- &GPU_SW_Backend::DrawTriangle<true, true, false, true, true>}},
- {{&GPU_SW_Backend::DrawTriangle<true, true, true, false, false>,
- &GPU_SW_Backend::DrawTriangle<true, true, true, false, true>},
- {&GPU_SW_Backend::DrawTriangle<true, true, true, true, false>,
- &GPU_SW_Backend::DrawTriangle<true, true, true, true, true>}}}}};
-
- return funcs[u8(shading_enable)][u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)]
- [u8(dithering_enable)];
-}
\ No newline at end of file
diff --git a/src/core/gpu_sw_backend.h b/src/core/gpu_sw_backend.h
index 8d21793fa..2c0394ebc 100644
--- a/src/core/gpu_sw_backend.h
+++ b/src/core/gpu_sw_backend.h
@@ -17,77 +17,7 @@ public:
bool Initialize(bool force_thread) override;
void Reset() override;
- ALWAYS_INLINE_RELEASE u16 GetPixel(const u32 x, const u32 y) const { return g_vram[VRAM_WIDTH * y + x]; }
- ALWAYS_INLINE_RELEASE const u16* GetPixelPtr(const u32 x, const u32 y) const { return &g_vram[VRAM_WIDTH * y + x]; }
- ALWAYS_INLINE_RELEASE u16* GetPixelPtr(const u32 x, const u32 y) { return &g_vram[VRAM_WIDTH * y + x]; }
- ALWAYS_INLINE_RELEASE void SetPixel(const u32 x, const u32 y, const u16 value) { g_vram[VRAM_WIDTH * y + x] = value; }
-
- // this is actually (31 * 255) >> 4) == 494, but to simplify addressing we use the next power of two (512)
- static constexpr u32 DITHER_LUT_SIZE = 512;
- using DitherLUT = std::array<std::array<std::array<u8, DITHER_LUT_SIZE>, DITHER_MATRIX_SIZE>, DITHER_MATRIX_SIZE>;
- static constexpr DitherLUT ComputeDitherLUT();
-
protected:
- union VRAMPixel
- {
- u16 bits;
-
- BitField<u16, u8, 0, 5> r;
- BitField<u16, u8, 5, 5> g;
- BitField<u16, u8, 10, 5> b;
- BitField<u16, bool, 15, 1> c;
-
- void Set(u8 r_, u8 g_, u8 b_, bool c_ = false)
- {
- bits = (ZeroExtend16(r_)) | (ZeroExtend16(g_) << 5) | (ZeroExtend16(b_) << 10) | (static_cast<u16>(c_) << 15);
- }
-
- void ClampAndSet(u8 r_, u8 g_, u8 b_, bool c_ = false)
- {
- Set(std::min<u8>(r_, 0x1F), std::min<u8>(g_, 0x1F), std::min<u8>(b_, 0x1F), c_);
- }
-
- void SetRGB24(u32 rgb24, bool c_ = false)
- {
- bits = Truncate16(((rgb24 >> 3) & 0x1F) | (((rgb24 >> 11) & 0x1F) << 5) | (((rgb24 >> 19) & 0x1F) << 10)) |
- (static_cast<u16>(c_) << 15);
- }
-
- void SetRGB24(u8 r8, u8 g8, u8 b8, bool c_ = false)
- {
- bits = (ZeroExtend16(r8 >> 3)) | (ZeroExtend16(g8 >> 3) << 5) | (ZeroExtend16(b8 >> 3) << 10) |
- (static_cast<u16>(c_) << 15);
- }
-
- void SetRGB24Dithered(u32 x, u32 y, u8 r8, u8 g8, u8 b8, bool c_ = false)
- {
- const s32 offset = DITHER_MATRIX[y & 3][x & 3];
- r8 = static_cast<u8>(std::clamp(static_cast<s32>(ZeroExtend32(r8)) + offset, 0, 255));
- g8 = static_cast<u8>(std::clamp(static_cast<s32>(ZeroExtend32(g8)) + offset, 0, 255));
- b8 = static_cast<u8>(std::clamp(static_cast<s32>(ZeroExtend32(b8)) + offset, 0, 255));
- SetRGB24(r8, g8, b8, c_);
- }
-
- u32 ToRGB24() const
- {
- const u32 r_ = ZeroExtend32(r.GetValue());
- const u32 g_ = ZeroExtend32(g.GetValue());
- const u32 b_ = ZeroExtend32(b.GetValue());
-
- return ((r_ << 3) | (r_ & 7)) | (((g_ << 3) | (g_ & 7)) << 8) | (((b_ << 3) | (b_ & 7)) << 16);
- }
- };
-
- static constexpr std::tuple<u8, u8> UnpackTexcoord(u16 texcoord)
- {
- return std::make_tuple(static_cast<u8>(texcoord), static_cast<u8>(texcoord >> 8));
- }
-
- static constexpr std::tuple<u8, u8, u8> UnpackColorRGB24(u32 rgb24)
- {
- return std::make_tuple(static_cast<u8>(rgb24), static_cast<u8>(rgb24 >> 8), static_cast<u8>(rgb24 >> 16));
- }
-
void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params) override;
void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, GPUBackendCommandParameters params) override;
void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height,
@@ -96,75 +26,7 @@ protected:
void DrawPolygon(const GPUBackendDrawPolygonCommand* cmd) override;
void DrawLine(const GPUBackendDrawLineCommand* cmd) override;
void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd) override;
- void FlushRender() override;
- void DrawingAreaChanged() override;
+ void DrawingAreaChanged(const GPUDrawingArea& new_drawing_area, const GSVector4i clamped_drawing_area) override;
void UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit) override;
-
- //////////////////////////////////////////////////////////////////////////
- // Rasterization
- //////////////////////////////////////////////////////////////////////////
- template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
- void ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y, u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x,
- u8 texcoord_y);
-
- template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
- void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd);
-
- using DrawRectangleFunction = void (GPU_SW_Backend::*)(const GPUBackendDrawRectangleCommand* cmd);
- DrawRectangleFunction GetDrawRectangleFunction(bool texture_enable, bool raw_texture_enable,
- bool transparency_enable);
-
- //////////////////////////////////////////////////////////////////////////
- // Polygon and line rasterization ported from Mednafen
- //////////////////////////////////////////////////////////////////////////
- struct i_deltas
- {
- u32 du_dx, dv_dx;
- u32 dr_dx, dg_dx, db_dx;
-
- u32 du_dy, dv_dy;
- u32 dr_dy, dg_dy, db_dy;
- };
-
- struct i_group
- {
- u32 u, v;
- u32 r, g, b;
- };
-
- template<bool shading_enable, bool texture_enable>
- bool CalcIDeltas(i_deltas& idl, const GPUBackendDrawPolygonCommand::Vertex* A,
- const GPUBackendDrawPolygonCommand::Vertex* B, const GPUBackendDrawPolygonCommand::Vertex* C);
-
- template<bool shading_enable, bool texture_enable>
- void AddIDeltas_DX(i_group& ig, const i_deltas& idl, u32 count = 1);
-
- template<bool shading_enable, bool texture_enable>
- void AddIDeltas_DY(i_group& ig, const i_deltas& idl, u32 count = 1);
-
- template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
- void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, i_group ig,
- const i_deltas& idl);
-
- template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
- void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBackendDrawPolygonCommand::Vertex* v0,
- const GPUBackendDrawPolygonCommand::Vertex* v1, const GPUBackendDrawPolygonCommand::Vertex* v2);
-
- using DrawTriangleFunction = void (GPU_SW_Backend::*)(const GPUBackendDrawPolygonCommand* cmd,
- const GPUBackendDrawPolygonCommand::Vertex* v0,
- const GPUBackendDrawPolygonCommand::Vertex* v1,
- const GPUBackendDrawPolygonCommand::Vertex* v2);
- DrawTriangleFunction GetDrawTriangleFunction(bool shading_enable, bool texture_enable, bool raw_texture_enable,
- bool transparency_enable, bool dithering_enable);
-
- template<bool shading_enable, bool transparency_enable, bool dithering_enable>
- void DrawLine(const GPUBackendDrawLineCommand* cmd, const GPUBackendDrawLineCommand::Vertex* p0,
- const GPUBackendDrawLineCommand::Vertex* p1);
-
- using DrawLineFunction = void (GPU_SW_Backend::*)(const GPUBackendDrawLineCommand* cmd,
- const GPUBackendDrawLineCommand::Vertex* p0,
- const GPUBackendDrawLineCommand::Vertex* p1);
- DrawLineFunction GetDrawLineFunction(bool shading_enable, bool transparency_enable, bool dithering_enable);
+ void FlushRender() override;
};
diff --git a/src/core/gpu_sw_rasterizer.cpp b/src/core/gpu_sw_rasterizer.cpp
new file mode 100644
index 000000000..0df0c16c2
--- /dev/null
+++ b/src/core/gpu_sw_rasterizer.cpp
@@ -0,0 +1,100 @@
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin
+// SPDX-License-Identifier: CC-BY-NC-ND-4.0
+
+#include "gpu_sw_rasterizer.h"
+#include "gpu.h"
+
+#include "cpuinfo.h"
+
+#include "common/log.h"
+#include "common/string_util.h"
+
+Log_SetChannel(GPU_SW_Rasterizer);
+
+namespace GPU_SW_Rasterizer {
+// Default implementation, compatible with all ISAs.
+extern const DrawRectangleFunctionTable DrawRectangleFunctions;
+extern const DrawTriangleFunctionTable DrawTriangleFunctions;
+extern const DrawLineFunctionTable DrawLineFunctions;
+
+constinit const DitherLUT g_dither_lut = []() constexpr {
+ DitherLUT lut = {};
+ for (u32 i = 0; i < DITHER_MATRIX_SIZE; i++)
+ {
+ for (u32 j = 0; j < DITHER_MATRIX_SIZE; j++)
+ {
+ for (u32 value = 0; value < DITHER_LUT_SIZE; value++)
+ {
+ const s32 dithered_value = (static_cast<s32>(value) + DITHER_MATRIX[i][j]) >> 3;
+ lut[i][j][value] = static_cast<u8>((dithered_value < 0) ? 0 : ((dithered_value > 31) ? 31 : dithered_value));
+ }
+ }
+ }
+ return lut;
+}();
+
+GPUDrawingArea g_drawing_area = {};
+} // namespace GPU_SW_Rasterizer
+
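+// gpu_sw_rasterizer.inl is included once per implementation namespace below, so each ISA gets its
+// own copy of the draw function tables.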
+// Default implementation definitions.
+namespace GPU_SW_Rasterizer {
+#include "gpu_sw_rasterizer.inl"
+}
+
+// Default vector implementation definitions.
+#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
+namespace GPU_SW_Rasterizer::SIMD {
+#include "gpu_sw_rasterizer.inl"
+}
+#endif
+
+// Initialize with default implementation.
+namespace GPU_SW_Rasterizer {
+const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions = &DrawRectangleFunctions;
+const DrawTriangleFunctionTable* SelectedDrawTriangleFunctions = &DrawTriangleFunctions;
+const DrawLineFunctionTable* SelectedDrawLineFunctions = &DrawLineFunctions;
+} // namespace GPU_SW_Rasterizer
+
+// Declare alternative implementations.
+void GPU_SW_Rasterizer::SelectImplementation()
+{
+ static bool selected = false;
+ if (selected)
+ return;
+
+ selected = true;
+
+#define SELECT_ALTERNATIVE_RASTERIZER(isa) \
+ do \
+ { \
+ INFO_LOG("Using " #isa " software rasterizer implementation."); \
+ SelectedDrawRectangleFunctions = &isa::DrawRectangleFunctions; \
+ SelectedDrawTriangleFunctions = &isa::DrawTriangleFunctions; \
+ SelectedDrawLineFunctions = &isa::DrawLineFunctions; \
+ } while (0)
+
+#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
+ const char* use_isa = std::getenv("SW_USE_ISA");
+
+ // Default to scalar for now, until vector is finished.
+ use_isa = use_isa ? use_isa : "Scalar";
+
+#if defined(CPU_ARCH_SSE) && defined(_MSC_VER)
+ if (cpuinfo_has_x86_avx2() && (!use_isa || StringUtil::Strcasecmp(use_isa, "AVX2") == 0))
+ {
+ SELECT_ALTERNATIVE_RASTERIZER(AVX2);
+ return;
+ }
+#endif
+
+ if (!use_isa || StringUtil::Strcasecmp(use_isa, "SIMD") == 0)
+ {
+ SELECT_ALTERNATIVE_RASTERIZER(SIMD);
+ return;
+ }
+#endif
+
+ INFO_LOG("Using scalar software rasterizer implementation.");
+
+#undef SELECT_ALTERNATIVE_RASTERIZER
+}
diff --git a/src/core/gpu_sw_rasterizer.h b/src/core/gpu_sw_rasterizer.h
new file mode 100644
index 000000000..d6f3adace
--- /dev/null
+++ b/src/core/gpu_sw_rasterizer.h
@@ -0,0 +1,89 @@
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin
+// SPDX-License-Identifier: CC-BY-NC-ND-4.0
+
+#pragma once
+
+#include "gpu.h"
+#include "gpu_types.h"
+
+#include "common/intrin.h"
+#include "common/types.h"
+
+#include <array>
+#include <tuple>
+
+namespace GPU_SW_Rasterizer {
+
+// this is actually (31 * 255) >> 4) == 494, but to simplify addressing we use the next power of two (512)
+static constexpr u32 DITHER_LUT_SIZE = 512;
+using DitherLUT = std::array<std::array<std::array<u8, DITHER_LUT_SIZE>, DITHER_MATRIX_SIZE>, DITHER_MATRIX_SIZE>;
+extern const DitherLUT g_dither_lut;
+
+extern GPUDrawingArea g_drawing_area;
+
+using DrawRectangleFunction = void (*)(const GPUBackendDrawRectangleCommand* cmd);
+typedef const DrawRectangleFunction DrawRectangleFunctionTable[2][2][2];
+
+using DrawTriangleFunction = void (*)(const GPUBackendDrawPolygonCommand* cmd,
+ const GPUBackendDrawPolygonCommand::Vertex* v0,
+ const GPUBackendDrawPolygonCommand::Vertex* v1,
+ const GPUBackendDrawPolygonCommand::Vertex* v2);
+typedef const DrawTriangleFunction DrawTriangleFunctionTable[2][2][2][2][2];
+
+using DrawLineFunction = void (*)(const GPUBackendDrawLineCommand* cmd, const GPUBackendDrawLineCommand::Vertex* p0,
+ const GPUBackendDrawLineCommand::Vertex* p1);
+typedef const DrawLineFunction DrawLineFunctionTable[2][2][2];
+
+// Default implementation, compatible with all ISAs.
+extern const DrawRectangleFunctionTable DrawRectangleFunctions;
+extern const DrawTriangleFunctionTable DrawTriangleFunctions;
+extern const DrawLineFunctionTable DrawLineFunctions;
+
+// Current implementation, selected at runtime.
+extern const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions;
+extern const DrawTriangleFunctionTable* SelectedDrawTriangleFunctions;
+extern const DrawLineFunctionTable* SelectedDrawLineFunctions;
+
+extern void SelectImplementation();
+
+ALWAYS_INLINE static DrawLineFunction GetDrawLineFunction(bool shading_enable, bool transparency_enable,
+ bool dithering_enable)
+{
+ return (*SelectedDrawLineFunctions)[u8(shading_enable)][u8(transparency_enable)][u8(dithering_enable)];
+}
+
+ALWAYS_INLINE static DrawRectangleFunction GetDrawRectangleFunction(bool texture_enable, bool raw_texture_enable,
+ bool transparency_enable)
+{
+ return (*SelectedDrawRectangleFunctions)[u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)];
+}
+
+ALWAYS_INLINE static DrawTriangleFunction GetDrawTriangleFunction(bool shading_enable, bool texture_enable,
+ bool raw_texture_enable, bool transparency_enable,
+ bool dithering_enable)
+{
+ return (*SelectedDrawTriangleFunctions)[u8(shading_enable)][u8(texture_enable)][u8(raw_texture_enable)]
+ [u8(transparency_enable)][u8(dithering_enable)];
+}
+
+#define DECLARE_ALTERNATIVE_RASTERIZER(isa) \
+ namespace isa { \
+ extern const DrawRectangleFunctionTable DrawRectangleFunctions; \
+ extern const DrawTriangleFunctionTable DrawTriangleFunctions; \
+ extern const DrawLineFunctionTable DrawLineFunctions; \
+ }
+
+// Have to define the symbols globally, because clang won't include them otherwise.
+#if defined(CPU_ARCH_SSE) && defined(_MSC_VER)
+#define ALTERNATIVE_RASTERIZER_LIST() DECLARE_ALTERNATIVE_RASTERIZER(AVX2)
+#else
+#define ALTERNATIVE_RASTERIZER_LIST()
+#endif
+
+ALTERNATIVE_RASTERIZER_LIST()
+
+#undef DECLARE_ALTERNATIVE_RASTERIZER
+
+} // namespace GPU_SW_Rasterizer
+
+// static u32 s_bad_counter = 0;
diff --git a/src/core/gpu_sw_rasterizer.inl b/src/core/gpu_sw_rasterizer.inl
new file mode 100644
index 000000000..d434b49b9
--- /dev/null
+++ b/src/core/gpu_sw_rasterizer.inl
@@ -0,0 +1,1250 @@
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin
+// SPDX-License-Identifier: CC-BY-NC-ND-4.0
+
+#ifdef __INTELLISENSE__
+
+#include "common/gsvector.h"
+#include "gpu.h"
+#include
+
+#define USE_VECTOR 1
+#define GSVECTOR_HAS_SRLV 1
+
+extern GPU_SW_Rasterizer::DitherLUT g_dither_lut;
+
+namespace GPU_SW_Rasterizer {
+
+#endif
+
+// TODO: UpdateVRAM, FillVRAM, etc.
+
+#ifdef USE_VECTOR
+#if 0
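+// Debug aid (disabled): snapshot VRAM before a vector draw, then re-run the scalar drawer on the
+// snapshot and compare the two outputs pixel by pixel.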
+static u16 s_vram_backup[VRAM_WIDTH * VRAM_HEIGHT];
+static u16 s_new_vram[VRAM_WIDTH * VRAM_HEIGHT];
+#define BACKUP_VRAM() \
+ do \
+ { \
+ std::memcpy(s_vram_backup, g_vram, sizeof(g_vram)); \
+ s_bad_counter++; \
+ } while (0)
+#define CHECK_VRAM(drawer) \
+ do \
+ { \
+ std::memcpy(s_new_vram, g_vram, sizeof(g_vram)); \
+ std::memcpy(g_vram, s_vram_backup, sizeof(g_vram)); \
+ \
+ drawer; \
+ for (u32 vidx = 0; vidx < (VRAM_WIDTH * VRAM_HEIGHT); vidx++) \
+ { \
+ if (s_new_vram[vidx] != g_vram[vidx]) \
+ { \
+ fprintf(stderr, "[%u] Mismatch at %d,%d, expected %04x got %04x\n", s_bad_counter, (vidx % VRAM_WIDTH), \
+ (vidx / VRAM_WIDTH), g_vram[vidx], s_new_vram[vidx]); \
+ AssertMsg(false, "Mismatch"); \
+ } \
+ } \
+ /*Assert(std::memcmp(g_vram, s_new_vram, sizeof(g_vram)) == 0)*/ \
+ } while (0)
+#else
+#define BACKUP_VRAM()
+#define CHECK_VRAM(drawer)
+#endif
+#endif
+
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16 GetPixel(const u32 x, const u32 y)
+{
+ return g_vram[VRAM_WIDTH * y + x];
+}
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16* GetPixelPtr(const u32 x, const u32 y)
+{
+ return &g_vram[VRAM_WIDTH * y + x];
+}
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static void SetPixel(const u32 x, const u32 y, const u16 value)
+{
+ g_vram[VRAM_WIDTH * y + x] = value;
+}
+
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8> UnpackTexcoord(u16 texcoord)
+{
+ return std::make_tuple(static_cast<u8>(texcoord), static_cast<u8>(texcoord >> 8));
+}
+
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8, u8> UnpackColorRGB24(u32 rgb24)
+{
+ return std::make_tuple(static_cast<u8>(rgb24), static_cast<u8>(rgb24 >> 8), static_cast<u8>(rgb24 >> 16));
+}
+
+template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y,
+ u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x,
+ u8 texcoord_y)
+{
+ u16 color;
+ if constexpr (texture_enable)
+ {
+ // Apply texture window
+ texcoord_x = (texcoord_x & cmd->window.and_x) | cmd->window.or_x;
+ texcoord_y = (texcoord_y & cmd->window.and_y) | cmd->window.or_y;
+
+ u16 texture_color;
+ switch (cmd->draw_mode.texture_mode)
+ {
+ case GPUTextureMode::Palette4Bit:
+ {
+ const u16 palette_value =
+ GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x / 4)) % VRAM_WIDTH,
+ (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
+ const size_t palette_index = (palette_value >> ((texcoord_x % 4) * 4)) & 0x0Fu;
+ texture_color = g_gpu_clut[palette_index];
+ }
+ break;
+
+ case GPUTextureMode::Palette8Bit:
+ {
+ const u16 palette_value =
+ GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x / 2)) % VRAM_WIDTH,
+ (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
+ const size_t palette_index = (palette_value >> ((texcoord_x % 2) * 8)) & 0xFFu;
+ texture_color = g_gpu_clut[palette_index];
+ }
+ break;
+
+ default:
+ {
+ texture_color = GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x)) % VRAM_WIDTH,
+ (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
+ }
+ break;
+ }
+
+ if (texture_color == 0)
+ return;
+
+ if constexpr (raw_texture_enable)
+ {
+ color = texture_color;
+ }
+ else
+ {
+ const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u;
+ const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u;
+
+ color =
+ (ZeroExtend16(g_dither_lut[dither_y][dither_x][(u16(texture_color & 0x1Fu) * u16(color_r)) >> 4]) << 0) |
+ (ZeroExtend16(g_dither_lut[dither_y][dither_x][(u16((texture_color >> 5) & 0x1Fu) * u16(color_g)) >> 4]) << 5) |
+ (ZeroExtend16(g_dither_lut[dither_y][dither_x][(u16((texture_color >> 10) & 0x1Fu) * u16(color_b)) >> 4])
+ << 10) |
+ (texture_color & 0x8000u);
+ }
+ }
+ else
+ {
+ const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u;
+ const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u;
+
+ // Non-textured transparent polygons don't set bit 15, but are treated as transparent.
+ color = (ZeroExtend16(g_dither_lut[dither_y][dither_x][color_r]) << 0) |
+ (ZeroExtend16(g_dither_lut[dither_y][dither_x][color_g]) << 5) |
+ (ZeroExtend16(g_dither_lut[dither_y][dither_x][color_b]) << 10) | (transparency_enable ? 0x8000u : 0);
+ }
+
+ const u16 bg_color = GetPixel(static_cast<u32>(x), static_cast<u32>(y));
+ if constexpr (transparency_enable)
+ {
+ if (color & 0x8000u || !texture_enable)
+ {
+ // Based on blargg's efficient 15bpp pixel math.
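+ // All three 5-bit channels are summed in parallel in a single integer add; the 0x0421/0x8421
+ // masks pick out each channel's low/carry bits so carries cannot leak into the neighbouring
+ // channel, and (carry - (carry >> 5)) saturates any channel that overflowed.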
+ u32 bg_bits = ZeroExtend32(bg_color);
+ u32 fg_bits = ZeroExtend32(color);
+ switch (cmd->draw_mode.transparency_mode)
+ {
+ case GPUTransparencyMode::HalfBackgroundPlusHalfForeground:
+ {
+ bg_bits |= 0x8000u;
+ color = Truncate16(((fg_bits + bg_bits) - ((fg_bits ^ bg_bits) & 0x0421u)) >> 1);
+ }
+ break;
+
+ case GPUTransparencyMode::BackgroundPlusForeground:
+ {
+ bg_bits &= ~0x8000u;
+
+ const u32 sum = fg_bits + bg_bits;
+ const u32 carry = (sum - ((fg_bits ^ bg_bits) & 0x8421u)) & 0x8420u;
+
+ color = Truncate16((sum - carry) | (carry - (carry >> 5)));
+ }
+ break;
+
+ case GPUTransparencyMode::BackgroundMinusForeground:
+ {
+ bg_bits |= 0x8000u;
+ fg_bits &= ~0x8000u;
+
+ const u32 diff = bg_bits - fg_bits + 0x108420u;
+ const u32 borrow = (diff - ((bg_bits ^ fg_bits) & 0x108420u)) & 0x108420u;
+
+ color = Truncate16((diff - borrow) & (borrow - (borrow >> 5)));
+ }
+ break;
+
+ case GPUTransparencyMode::BackgroundPlusQuarterForeground:
+ {
+ bg_bits &= ~0x8000u;
+ fg_bits = ((fg_bits >> 2) & 0x1CE7u) | 0x8000u;
+
+ const u32 sum = fg_bits + bg_bits;
+ const u32 carry = (sum - ((fg_bits ^ bg_bits) & 0x8421u)) & 0x8420u;
+
+ color = Truncate16((sum - carry) | (carry - (carry >> 5)));
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ // See above.
+ if constexpr (!texture_enable)
+ color &= ~0x8000u;
+ }
+ }
+
+ const u16 mask_and = cmd->params.GetMaskAND();
+ if ((bg_color & mask_and) != 0)
+ return;
+
+ DebugAssert(static_cast<u32>(x) < VRAM_WIDTH && static_cast<u32>(y) < VRAM_HEIGHT);
+ SetPixel(static_cast<u32>(x), static_cast<u32>(y), color | cmd->params.GetMaskOR());
+}
+
+#ifndef USE_VECTOR
+
+template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
+static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
+{
+ const s32 origin_x = cmd->x;
+ const s32 origin_y = cmd->y;
+ const auto [r, g, b] = UnpackColorRGB24(cmd->color);
+ const auto [origin_texcoord_x, origin_texcoord_y] = UnpackTexcoord(cmd->texcoord);
+
+ for (u32 offset_y = 0; offset_y < cmd->height; offset_y++)
+ {
+ const s32 y = origin_y + static_cast<s32>(offset_y);
+ if (y < static_cast<s32>(g_drawing_area.top) || y > static_cast<s32>(g_drawing_area.bottom) ||
+ (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u)))
+ {
+ continue;
+ }
+
+ const u32 draw_y = static_cast<u32>(y) & VRAM_HEIGHT_MASK;
+ const u8 texcoord_y = Truncate8(ZeroExtend32(origin_texcoord_y) + offset_y);
+
+ for (u32 offset_x = 0; offset_x < cmd->width; offset_x++)
+ {
+ const s32 x = origin_x + static_cast<s32>(offset_x);
+ if (x < static_cast<s32>(g_drawing_area.left) || x > static_cast<s32>(g_drawing_area.right))
+ continue;
+
+ const u8 texcoord_x = Truncate8(ZeroExtend32(origin_texcoord_x) + offset_x);
+
+ ShadePixel<texture_enable, raw_texture_enable, transparency_enable, false>(cmd, static_cast<u32>(x), draw_y, r, g,
+ b, texcoord_x, texcoord_y);
+ }
+ }
+}
+
+#else // USE_VECTOR
+
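+// The vector path below shades four horizontally adjacent pixels per iteration. The gather helpers
+// emulate vector loads from VRAM and the CLUT, since texture fetches address non-contiguous pixels.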
+ALWAYS_INLINE_RELEASE static GSVector4i GatherVector(GSVector4i coord_x, GSVector4i coord_y)
+{
+ GSVector4i offsets = coord_y.sll32<11>(); // y * 2048 (1024 * sizeof(pixel))
+ offsets = offsets.add32(coord_x.sll32<1>()); // x * 2 (x * sizeof(pixel))
+
+ const u32 o0 = offsets.extract32<0>();
+ const u32 o1 = offsets.extract32<1>();
+ const u32 o2 = offsets.extract32<2>();
+ const u32 o3 = offsets.extract32<3>();
+
+ // TODO: split in two, merge, maybe could be zx loaded instead..
+ u16 p0, p1, p2, p3;
+ std::memcpy(&p0, reinterpret_cast<const u8*>(g_vram) + o0, sizeof(p0));
+ std::memcpy(&p1, reinterpret_cast<const u8*>(g_vram) + o1, sizeof(p1));
+ std::memcpy(&p2, reinterpret_cast<const u8*>(g_vram) + o2, sizeof(p2));
+ std::memcpy(&p3, reinterpret_cast<const u8*>(g_vram) + o3, sizeof(p3));
+ GSVector4i pixels = GSVector4i::load(p0);
+ pixels = pixels.insert16<2>(p1);
+ pixels = pixels.insert16<4>(p2);
+ pixels = pixels.insert16<6>(p3);
+
+ return pixels;
+}
+
+ALWAYS_INLINE_RELEASE static GSVector4i GatherCLUTVector(GSVector4i indices)
+{
+ const GSVector4i offsets = indices.sll32<1>(); // x * 2 (x * sizeof(pixel))
+ const u32 o0 = offsets.extract32<0>();
+ const u32 o1 = offsets.extract32<1>();
+ const u32 o2 = offsets.extract32<2>();
+ const u32 o3 = offsets.extract32<3>();
+
+ // TODO: split in two, merge, maybe could be zx loaded instead..
+ u16 p0, p1, p2, p3;
+ std::memcpy(&p0, reinterpret_cast<const u8*>(g_gpu_clut) + o0, sizeof(p0));
+ std::memcpy(&p1, reinterpret_cast<const u8*>(g_gpu_clut) + o1, sizeof(p1));
+ std::memcpy(&p2, reinterpret_cast<const u8*>(g_gpu_clut) + o2, sizeof(p2));
+ std::memcpy(&p3, reinterpret_cast<const u8*>(g_gpu_clut) + o3, sizeof(p3));
+ GSVector4i pixels = GSVector4i::load(p0);
+ pixels = pixels.insert16<2>(p1);
+ pixels = pixels.insert16<4>(p2);
+ pixels = pixels.insert16<6>(p3);
+
+ return pixels;
+}
+
+ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y)
+{
+ if (x <= (VRAM_WIDTH - 4))
+ {
+ return GSVector4i::loadl(&g_vram[y * VRAM_WIDTH + x]).u16to32();
+ }
+ else
+ {
+ const u16* line = &g_vram[y * VRAM_WIDTH];
+ GSVector4i pixels = GSVector4i(line[(x++) & VRAM_WIDTH_MASK]);
+ pixels = pixels.insert16<2>(line[(x++) & VRAM_WIDTH_MASK]);
+ pixels = pixels.insert16<4>(line[(x++) & VRAM_WIDTH_MASK]);
+ pixels = pixels.insert16<6>(line[x & VRAM_WIDTH_MASK]);
+ return pixels;
+ }
+}
+
+ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector4i color)
+{
+ if (x <= (VRAM_WIDTH - 4))
+ {
+ GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], color);
+ }
+ else
+ {
+ u16* line = &g_vram[y * VRAM_WIDTH];
+ line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<0>());
+ line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<1>());
+ line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<2>());
+ line[x & VRAM_WIDTH_MASK] = Truncate16(color.extract16<3>());
+ }
+}
+
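+// RGB5A1 pixels are unpacked into two vectors holding 16-bit R/G and B/A pairs per lane, giving
+// headroom for the 16-bit modulation and dither arithmetic below before repacking.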
+ALWAYS_INLINE_RELEASE static void RGB5A1ToRG_BA(GSVector4i rgb5a1, GSVector4i& rg, GSVector4i& ba)
+{
+ rg = rgb5a1 & GSVector4i::cxpr(0x1F); // R | R | R | R
+ rg = rg | (rgb5a1 & GSVector4i::cxpr(0x3E0)).sll32<11>(); // R0G0 | R0G0 | R0G0 | R0G0
+ ba = rgb5a1.srl32<10>() & GSVector4i::cxpr(0x1F); // B | B | B | B
+ ba = ba | (rgb5a1 & GSVector4i::cxpr(0x8000)).sll32<1>(); // B0A0 | B0A0 | B0A0 | B0A0
+}
+
+ALWAYS_INLINE_RELEASE static GSVector4i RG_BAToRGB5A1(GSVector4i rg, GSVector4i ba)
+{
+ GSVector4i res;
+
+ res = rg & GSVector4i::cxpr(0x1F); // R | R | R | R
+ res = res | (rg.srl32<11>() & GSVector4i::cxpr(0x3E0)); // RG | RG | RG | RG
+ res = res | ((ba & GSVector4i::cxpr(0x1F)).sll32<10>()); // RGB | RGB | RGB | RGB
+ res = res | ba.srl32<16>().sll32<15>(); // RGBA | RGBA | RGBA | RGBA
+
+ return res;
+}
+
+// Color repeated twice for RG packing, then duplicated to we can load based on the X offset.
+static constexpr s16 VECTOR_DITHER_MATRIX[4][16] = {
+#define P(m, n) static_cast<s16>(DITHER_MATRIX[m][n]), static_cast<s16>(DITHER_MATRIX[m][n])
+#define R(m) P(m, 0), P(m, 1), P(m, 2), P(m, 3), P(m, 0), P(m, 1), P(m, 2), P(m, 3)
+
+ {R(0)}, {R(1)}, {R(2)}, {R(3)}
+
+#undef R
+#undef P
+};
+
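+// Vector ShadePixel: shades four horizontally adjacent pixels at once. Lanes flagged in
+// preserve_mask (outside the draw area/width, transparent texel, or failing the mask test) keep
+// their original VRAM contents when the result is stored.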
+template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
+ALWAYS_INLINE_RELEASE static void
+ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vertex_color_rg, GSVector4i vertex_color_ba,
+ GSVector4i texcoord_x, GSVector4i texcoord_y, GSVector4i preserve_mask, GSVector4i dither)
+{
+ static constinit GSVector4i coord_mask_x = GSVector4i::cxpr(VRAM_WIDTH_MASK);
+ static constinit GSVector4i coord_mask_y = GSVector4i::cxpr(VRAM_HEIGHT_MASK);
+
+ GSVector4i color;
+
+ if constexpr (texture_enable)
+ {
+ // Apply texture window
+ texcoord_x = (texcoord_x & GSVector4i(cmd->window.and_x)) | GSVector4i(cmd->window.or_x);
+ texcoord_y = (texcoord_y & GSVector4i(cmd->window.and_y)) | GSVector4i(cmd->window.or_y);
+
+ const GSVector4i base_x = GSVector4i(cmd->draw_mode.GetTexturePageBaseX());
+ const GSVector4i base_y = GSVector4i(cmd->draw_mode.GetTexturePageBaseY());
+
+ texcoord_y = base_y.add32(texcoord_y) & coord_mask_y;
+
+ GSVector4i texture_color;
+ switch (cmd->draw_mode.texture_mode)
+ {
+ case GPUTextureMode::Palette4Bit:
+ {
+ GSVector4i load_texcoord_x = texcoord_x.srl32<2>();
+ load_texcoord_x = base_x.add32(load_texcoord_x);
+ load_texcoord_x = load_texcoord_x & coord_mask_x;
+
+ // todo: sse4 path
+ GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(3)).sll32<2>();
+ GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y);
+#ifdef GSVECTOR_HAS_SRLV
+ palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0x0F);
+#else
+ Assert(false && "Fixme");
+#endif
+
+ texture_color = GatherCLUTVector(palette_indices);
+ }
+ break;
+
+ case GPUTextureMode::Palette8Bit:
+ {
+ GSVector4i load_texcoord_x = texcoord_x.srl32<1>();
+ load_texcoord_x = base_x.add32(load_texcoord_x);
+ load_texcoord_x = load_texcoord_x & coord_mask_x;
+
+ GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(1)).sll32<3>();
+ GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y);
+#ifdef GSVECTOR_HAS_SRLV
+ palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0xFF);
+#else
+ Assert(false && "Fixme");
+#endif
+
+ texture_color = GatherCLUTVector(palette_indices);
+ }
+ break;
+
+ default:
+ {
+ texcoord_x = base_x.add32(texcoord_x);
+ texcoord_x = texcoord_x & coord_mask_x;
+ texture_color = GatherVector(texcoord_x, texcoord_y);
+ }
+ break;
+ }
+
+ // check for zero texture colour across the 4 pixels, early out if so
+ const GSVector4i texture_transparent_mask = texture_color.eq32(GSVector4i::zero());
+ if (texture_transparent_mask.alltrue())
+ return;
+
+ preserve_mask = preserve_mask | texture_transparent_mask;
+
+ if constexpr (raw_texture_enable)
+ {
+ color = texture_color;
+ }
+ else
+ {
+ GSVector4i trg, tba;
+ RGB5A1ToRG_BA(texture_color, trg, tba);
+
+ // now we have both the texture and vertex color in RG/GA pairs, for 4 pixels, which we can multiply
+ GSVector4i rg = trg.mul16l(vertex_color_rg);
+ GSVector4i ba = tba.mul16l(vertex_color_ba);
+
+ // TODO: Dither
+ // Convert to 5bit.
+ if constexpr (dithering_enable)
+ {
+ rg = rg.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+ ba = ba.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+ }
+ else
+ {
+ rg = rg.sra16<7>();
+ ba = ba.sra16<7>();
+ }
+
+ // Bit15 gets passed through as-is.
+ ba = ba.blend16<0xaa>(tba);
+
+ // Clamp to 5bit.
+ static constexpr GSVector4i colclamp = GSVector4i::cxpr16(0x1F);
+ rg = rg.min_u16(colclamp);
+ ba = ba.min_u16(colclamp);
+
+ // And interleave back to 16bpp.
+ color = RG_BAToRGB5A1(rg, ba);
+ }
+ }
+ else
+ {
+ // Non-textured transparent polygons don't set bit 15, but are treated as transparent.
+ if constexpr (dithering_enable)
+ {
+ GSVector4i rg = vertex_color_rg.add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+ GSVector4i ba = vertex_color_ba.add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+
+ // Clamp to 5bit. We use 32bit for BA to set a to zero.
+ rg = rg.min_u16(GSVector4i::cxpr16(0x1F));
+ ba = ba.min_u16(GSVector4i::cxpr(0x1F));
+
+ // And interleave back to 16bpp.
+ color = RG_BAToRGB5A1(rg, ba);
+ }
+ else
+ {
+ // Note that bit15 is set to 0 here, which the shift will do.
+ const GSVector4i rg = vertex_color_rg.srl16<3>();
+ const GSVector4i ba = vertex_color_ba.srl16<3>();
+ color = RG_BAToRGB5A1(rg, ba);
+ }
+ }
+
+ GSVector4i bg_color = LoadVector(start_x, y);
+
+ if constexpr (transparency_enable)
+ {
+ [[maybe_unused]] GSVector4i transparent_mask;
+ if constexpr (texture_enable)
+ {
+ // Compute transparent_mask, ffff per lane if transparent otherwise 0000
+ transparent_mask = color.sra16<15>();
+ }
+
+ // TODO: We don't need to OR color here with 0x8000 for textures.
+ // 0x8000 is added to match serial path.
+
+ GSVector4i blended_color;
+ switch (cmd->draw_mode.transparency_mode)
+ {
+ case GPUTransparencyMode::HalfBackgroundPlusHalfForeground:
+ {
+ const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u);
+ const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u);
+ const GSVector4i res = fg_bits.add32(bg_bits).sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x0421u)).srl32<1>();
+ blended_color = res & GSVector4i::cxpr(0xffff);
+ }
+ break;
+
+ case GPUTransparencyMode::BackgroundPlusForeground:
+ {
+ const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u);
+ const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu);
+ const GSVector4i sum = fg_bits.add32(bg_bits);
+ const GSVector4i carry =
+ (sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u))) & GSVector4i::cxpr(0x8420u);
+ const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
+ blended_color = res & GSVector4i::cxpr(0xffff);
+ }
+ break;
+
+ case GPUTransparencyMode::BackgroundMinusForeground:
+ {
+ const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u);
+ const GSVector4i fg_bits = color & GSVector4i::cxpr(0x7FFFu);
+ const GSVector4i diff = bg_bits.sub32(fg_bits).add32(GSVector4i::cxpr(0x108420u));
+ const GSVector4i borrow =
+ diff.sub32((bg_bits ^ fg_bits) & GSVector4i::cxpr(0x108420u)) & GSVector4i::cxpr(0x108420u);
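+ // 0x108420 seeds a guard bit above each 5-bit field so per-field underflow can be detected; the final
+ // AND with (borrow - (borrow >> 5)) zeroes out any component that went negative.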
+ const GSVector4i res = diff.sub32(borrow) & borrow.sub32(borrow.srl32<5>());
+ blended_color = res & GSVector4i::cxpr(0xffff);
+ }
+ break;
+
+ case GPUTransparencyMode::BackgroundPlusQuarterForeground:
+ default:
+ {
+ const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu);
+ const GSVector4i fg_bits =
+ ((color | GSVector4i::cxpr(0x8000)).srl32<2>() & GSVector4i::cxpr(0x1CE7u)) | GSVector4i::cxpr(0x8000u);
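+ // srl32<2> with the 0x1CE7 mask quarters each 5-bit component in place (the low two bits of every
+ // field are dropped), then the usual saturating add follows.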
+ const GSVector4i sum = fg_bits.add32(bg_bits);
+ const GSVector4i carry = sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u)) & GSVector4i::cxpr(0x8420u);
+ const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
+ blended_color = res & GSVector4i::cxpr(0xffff);
+ }
+ break;
+ }
+
+ // select the blended color for transparent pixels, keep the original color for opaque ones
+ // TODO: SSE2
+ if constexpr (texture_enable)
+ color = color.blend8(blended_color, transparent_mask);
+ else
+ color = blended_color & GSVector4i::cxpr(0x7fff);
+ }
+
+ // TODO: lift out to parent?
+ const GSVector4i mask_and = GSVector4i(cmd->params.GetMaskAND());
+ const GSVector4i mask_or = GSVector4i(cmd->params.GetMaskOR());
+
+ GSVector4i mask_bits_set = bg_color & mask_and; // 8000 if masked else 0000
+ mask_bits_set = mask_bits_set.sra16<15>(); // ffff if masked else 0000
+ preserve_mask = preserve_mask | mask_bits_set; // ffff if preserved else 0000
+
+ bg_color = bg_color & preserve_mask;
+ color = (color | mask_or).andnot(preserve_mask);
+ color = color | bg_color;
+
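+ // Pack the 32-bit lanes back down to 16-bit pixels and store them via StoreVector.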
+ const GSVector4i packed_color = color.pu32();
+ StoreVector(start_x, y, packed_color);
+}
+
+template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
+static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
+{
+ const s32 origin_x = cmd->x;
+ const s32 origin_y = cmd->y;
+
+ const GSVector4i rgba = GSVector4i(cmd->color); // RGBA | RGBA | RGBA | RGBA
+ GSVector4i rg = rgba.xxxxl(); // RGRG | RGRG | RGRG | RGRG
+ GSVector4i ba = rgba.yyyyl(); // BABA | BABA | BABA | BABA
+ rg = rg.u8to16(); // R0G0 | R0G0 | R0G0 | R0G0
+ ba = ba.u8to16(); // B0A0 | B0A0 | B0A0 | B0A0
+
+ const GSVector4i texcoord_x = GSVector4i(cmd->texcoord & 0xFF).add32(GSVector4i::cxpr(0, 1, 2, 3));
+ GSVector4i texcoord_y = GSVector4i(cmd->texcoord >> 8);
+
+ const GSVector4i clip_left = GSVector4i(g_drawing_area.left);
+ const GSVector4i clip_right = GSVector4i(g_drawing_area.right);
+ const u32 width = cmd->width;
+
+ BACKUP_VRAM();
+
+ for (u32 offset_y = 0; offset_y < cmd->height; offset_y++)
+ {
+ const s32 y = origin_y + static_cast<s32>(offset_y);
+ if (y < static_cast<s32>(g_drawing_area.top) || y > static_cast<s32>(g_drawing_area.bottom) ||
+ (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u)))
+ {
+ continue;
+ }
+
+ GSVector4i row_texcoord_x = texcoord_x;
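+ // xvec tracks the x coordinate of each of the 4 lanes; wvec counts down the remaining width and goes
+ // negative in lanes that fall past the end of the rectangle, feeding the preserve mask below.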
+ GSVector4i xvec = GSVector4i(origin_x).add32(GSVector4i::cxpr(0, 1, 2, 3));
+ GSVector4i wvec = GSVector4i(width).sub32(GSVector4i::cxpr(1, 2, 3, 4));
+
+ for (u32 offset_x = 0; offset_x < width; offset_x += 4)
+ {
+ const s32 x = origin_x + static_cast<s32>(offset_x);
+
+ // width test
+ GSVector4i preserve_mask = wvec.lt32(GSVector4i::zero());
+
+ // clip test, if all pixels are outside, skip
+ preserve_mask = preserve_mask | xvec.lt32(clip_left);
+ preserve_mask = preserve_mask | xvec.gt32(clip_right);
+ if (!preserve_mask.alltrue())
+ {
+ ShadePixel<texture_enable, raw_texture_enable, transparency_enable, false>(
+ cmd, x, y, rg, ba, row_texcoord_x, texcoord_y, preserve_mask, GSVector4i::zero());
+ }
+
+ xvec = xvec.add32(GSVector4i::cxpr(4));
+ wvec = wvec.sub32(GSVector4i::cxpr(4));
+
+ if constexpr (texture_enable)
+ row_texcoord_x = row_texcoord_x.add32(GSVector4i::cxpr(4)) & GSVector4i::cxpr(0xFF);
+ }
+
+ if constexpr (texture_enable)
+ texcoord_y = texcoord_y.add32(GSVector4i::cxpr(1)) & GSVector4i::cxpr(0xFF);
+ }
+
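+ // Assumption: BACKUP_VRAM()/CHECK_VRAM() are self-test hooks defined earlier in this file that compare
+ // the vector path's output against the scalar DrawRectangleFunctions implementation.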
+ CHECK_VRAM(GPU_SW_Rasterizer::DrawRectangleFunctions[texture_enable][raw_texture_enable][transparency_enable](cmd));
+}
+
+#endif // USE_VECTOR
+
+// TODO: Vectorize line draw.
+template<bool shading_enable, bool transparency_enable, bool dithering_enable>
+static void DrawLine(const GPUBackendDrawLineCommand* cmd, const GPUBackendDrawLineCommand::Vertex* p0,
+ const GPUBackendDrawLineCommand::Vertex* p1)
+{
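+ // Coordinates are stepped in 32.32 fixed point (XY_SHIFT) and colors in .12 fixed point (RGB_SHIFT),
+ // with a half-unit bias added up front so the plain truncation in unfp_* behaves like rounding.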
+ static constexpr u32 XY_SHIFT = 32;
+ static constexpr u32 RGB_SHIFT = 12;
+ static constexpr auto makefp_xy = [](s32 x) { return (static_cast<s64>(x) << XY_SHIFT) | (1LL << (XY_SHIFT - 1)); };
+ static constexpr auto unfp_xy = [](s64 x) { return static_cast<s32>(x >> XY_SHIFT) & 2047; };
+ static constexpr auto div_xy = [](s64 delta, s32 dk) {
+ return ((delta << XY_SHIFT) - ((delta < 0) ? (dk - 1) : 0) + ((delta > 0) ? (dk - 1) : 0)) / dk;
+ };
+ static constexpr auto makefp_rgb = [](u32 c) { return (static_cast<s32>(c) << RGB_SHIFT) | (1 << (RGB_SHIFT - 1)); };
+ static constexpr auto unfp_rgb = [](s32 c) { return static_cast<u8>(c >> RGB_SHIFT); };
+ static constexpr auto div_rgb = [](u32 c1, u32 c0, s32 dk) {
+ return ((static_cast<s32>(c1) - static_cast<s32>(c0)) << RGB_SHIFT) / dk;
+ };
+
+ const s32 i_dx = std::abs(p1->x - p0->x);
+ const s32 i_dy = std::abs(p1->y - p0->y);
+ const s32 k = (i_dx > i_dy) ? i_dx : i_dy;
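+ // The line is stepped k + 1 times, one pixel per step along the major axis.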
+ if (i_dx >= MAX_PRIMITIVE_WIDTH || i_dy >= MAX_PRIMITIVE_HEIGHT) [[unlikely]]
+ return;
+
+ if (p0->x >= p1->x && k > 0)
+ std::swap(p0, p1);
+
+ s64 dxdk = 0, dydk = 0;
+ [[maybe_unused]] s32 drdk = 0, dgdk = 0, dbdk = 0;
+ if (k != 0) [[likely]]
+ {
+ dxdk = div_xy(p1->x - p0->x, k);
+ dydk = div_xy(p1->y - p0->y, k);
+ if constexpr (shading_enable)
+ {
+ drdk = div_rgb(p1->r, p0->r, k);
+ dgdk = div_rgb(p1->g, p0->g, k);
+ dbdk = div_rgb(p1->b, p0->b, k);
+ }
+ }
+
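+ // The 1024 bias is 2^-22 of a pixel in 32.32 fixed point: a tiny nudge below the rounding boundary,
+ // presumably kept to match the original scalar rasterizer's behavior on exact midpoints.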
+ s64 curx = makefp_xy(p0->x) - 1024;
+ s64 cury = makefp_xy(p0->y) - ((dydk < 0) ? 1024 : 0);
+ [[maybe_unused]] s32 curr, curg, curb;
+ if constexpr (shading_enable)
+ {
+ curr = makefp_rgb(p0->r);
+ curg = makefp_rgb(p0->g);
+ curb = makefp_rgb(p0->b);
+ }
+
+ for (s32 i = 0; i <= k; i++)
+ {
+ const s32 x = unfp_xy(curx);
+ const s32 y = unfp_xy(cury);
+
+ if ((!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (Truncate8(static_cast<u32>(y)) & 1u)) &&
+ x >= static_cast<s32>(g_drawing_area.left) && x <= static_cast<s32>(g_drawing_area.right) &&
+ y >= static_cast<s32>(g_drawing_area.top) && y <= static_cast<s32>(g_drawing_area.bottom))
+ {
+ const u8 r = shading_enable ? unfp_rgb(curr) : p0->r;
+ const u8 g = shading_enable ? unfp_rgb(curg) : p0->g;
+ const u8 b = shading_enable ? unfp_rgb(curb) : p0->b;
+
+ ShadePixel<false, false, transparency_enable, dithering_enable>(
+ cmd, static_cast<u32>(x), static_cast<u32>(y) & VRAM_HEIGHT_MASK, r, g, b, 0, 0);
+ }
+
+ curx += dxdk;
+ cury += dydk;
+
+ if constexpr (shading_enable)
+ {
+ curr += drdk;
+ curg += dgdk;
+ curb += dbdk;
+ }
+ }
+}
+
+// DDA triangle rasterization algorithm originally from Mednafen, rewritten and vectorized for DuckStation.
+namespace {
+static constexpr u32 ATTRIB_SHIFT = 12;
+static constexpr u32 ATTRIB_POST_SHIFT = 12;
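+// UV/RGB attributes are interpolated in unsigned fixed point with ATTRIB_SHIFT + ATTRIB_POST_SHIFT (24)
+// fractional bits; the Get*() accessors below shift the integer part back out.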
+
+struct UVSteps
+{
+ u32 dudx;
+ u32 dvdx;
+ u32 dudy;
+ u32 dvdy;
+};
+
+struct UVStepper
+{
+ u32 u;
+ u32 v;
+
+ ALWAYS_INLINE u8 GetU() const { return Truncate8(u >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
+ ALWAYS_INLINE u8 GetV() const { return Truncate8(v >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
+
+ ALWAYS_INLINE void SetStart(u32 ustart, u32 vstart)
+ {
+ u = (((ustart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
+ v = (((vstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
+ }
+
+ ALWAYS_INLINE void StepX(const UVSteps& steps)
+ {
+ u = u + steps.dudx;
+ v = v + steps.dvdx;
+ }
+ ALWAYS_INLINE void StepXY(const UVSteps& steps, s32 x_count, s32 y_count)
+ {
+ u = u + (steps.dudx * static_cast<u32>(x_count)) + (steps.dudy * static_cast<u32>(y_count));
+ v = v + (steps.dvdx * static_cast<u32>(x_count)) + (steps.dvdy * static_cast<u32>(y_count));
+ }
+};
+
+struct RGBSteps
+{
+ u32 drdx;
+ u32 dgdx;
+ u32 dbdx;
+
+ u32 drdy;
+ u32 dgdy;
+ u32 dbdy;
+};
+
+struct RGBStepper
+{
+ u32 r;
+ u32 g;
+ u32 b;
+
+ ALWAYS_INLINE u8 GetR() const { return Truncate8(r >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
+ ALWAYS_INLINE u8 GetG() const { return Truncate8(g >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
+ ALWAYS_INLINE u8 GetB() const { return Truncate8(b >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
+
+ ALWAYS_INLINE void SetStart(u32 rstart, u32 gstart, u32 bstart)
+ {
+ r = (((rstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
+ g = (((gstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
+ b = (((bstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
+ }
+
+ ALWAYS_INLINE void StepX(const RGBSteps& steps)
+ {
+ r = r + steps.drdx;
+ g = g + steps.dgdx;
+ b = b + steps.dbdx;
+ }
+ ALWAYS_INLINE void StepXY(const RGBSteps& steps, s32 x_count, s32 y_count)
+ {
+ r = r + (steps.drdx * static_cast<u32>(x_count)) + (steps.drdy * static_cast<u32>(y_count));
+ g = g + (steps.dgdx * static_cast<u32>(x_count)) + (steps.dgdy * static_cast<u32>(y_count));
+ b = b + (steps.dbdx * static_cast<u32>(x_count)) + (steps.dbdy * static_cast<u32>(y_count));
+ }
+};
+
+struct TrianglePart
+{
+ // left/right edges
+ u64 start_x[2];
+ u64 step_x[2];
+
+ s32 start_y;
+ s32 end_y;
+
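+ // When set, this part is filled bottom-up: DrawTrianglePart pre-decrements y from start_y down to end_y.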
+ bool fill_upside_down;
+};
+} // namespace
+
+#ifndef USE_VECTOR
+
+template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
+static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, UVStepper uv,
+ const UVSteps& uvstep, RGBStepper rgb, const RGBSteps& rgbstep)
+{
+ if (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u))
+ return;
+
+ s32 width = x_bound - x_start;
+ s32 current_x = TruncateGPUVertexPosition(x_start);
+
+ // Skip pixels outside of the scissor rectangle.
+ if (current_x < static_cast<s32>(g_drawing_area.left))
+ {
+ const s32 delta = static_cast<s32>(g_drawing_area.left) - current_x;
+ x_start += delta;
+ current_x += delta;
+ width -= delta;
+ }
+
+ if ((current_x + width) > (static_cast<s32>(g_drawing_area.right) + 1))
+ width = static_cast<s32>(g_drawing_area.right) + 1 - current_x;
+
+ if (width <= 0)
+ return;
+
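+ // Note: x_start and y are absolute coordinates; the steppers arrive here seeded relative to (0, 0)
+ // (assumption, based on the StepXY usage), so a single StepXY jump lands on this span's first pixel.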
+ if constexpr (texture_enable)
+ uv.StepXY(uvstep, x_start, y);
+ if constexpr (shading_enable)
+ rgb.StepXY(rgbstep, x_start, y);
+
+ do
+ {
+ ShadePixel<texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
+ cmd, static_cast<u32>(current_x), static_cast<u32>(y), rgb.GetR(), rgb.GetG(), rgb.GetB(), uv.GetU(), uv.GetV());
+
+ current_x++;
+ if constexpr (texture_enable)
+ uv.StepX(uvstep);
+ if constexpr (shading_enable)
+ rgb.StepX(rgbstep);
+ } while (--width > 0);
+}
+
+#else // USE_VECTOR
+
+template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
+static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, UVStepper uv,
+ const UVSteps& uvstep, RGBStepper rgb, const RGBSteps& rgbstep)
+{
+ if (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u))
+ return;
+
+ s32 w = x_bound - x_start;
+ s32 x = TruncateGPUVertexPosition(x_start);
+
+ if (x < static_cast<s32>(g_drawing_area.left))
+ {
+ const s32 delta = static_cast<s32>(g_drawing_area.left) - x;
+ x_start += delta;
+ x += delta;
+ w -= delta;
+ }
+
+ if ((x + w) > (static_cast<s32>(g_drawing_area.right) + 1))
+ w = static_cast<s32>(g_drawing_area.right) + 1 - x;
+
+ if (w <= 0)
+ return;
+
+ // TODO: Precompute.
+
+ const auto clip_left = GSVector4i(g_drawing_area.left);
+ const auto clip_right = GSVector4i(g_drawing_area.right);
+
+ const GSVector4i dr_dx = GSVector4i(rgbstep.drdx * 4);
+ const GSVector4i dg_dx = GSVector4i(rgbstep.dgdx * 4);
+ const GSVector4i db_dx = GSVector4i(rgbstep.dbdx * 4);
+ const GSVector4i du_dx = GSVector4i(uvstep.dudx * 4);
+ const GSVector4i dv_dx = GSVector4i(uvstep.dvdx * 4);
+
+ // TODO: vectorize
+ const GSVector4i dr_dx_offset = GSVector4i(0, rgbstep.drdx, rgbstep.drdx * 2, rgbstep.drdx * 3);
+ const GSVector4i dg_dx_offset = GSVector4i(0, rgbstep.dgdx, rgbstep.dgdx * 2, rgbstep.dgdx * 3);
+ const GSVector4i db_dx_offset = GSVector4i(0, rgbstep.dbdx, rgbstep.dbdx * 2, rgbstep.dbdx * 3);
+ const GSVector4i du_dx_offset = GSVector4i(0, uvstep.dudx, uvstep.dudx * 2, uvstep.dudx * 3);
+ const GSVector4i dv_dx_offset = GSVector4i(0, uvstep.dvdx, uvstep.dvdx * 2, uvstep.dvdx * 3);
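+ // Lane i starts i single-pixel steps ahead of the span's base value, so one add32 of the *_dx
+ // (4-pixel) step advances all four lanes at once.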
+
+ GSVector4i dr, dg, db;
+ if constexpr (shading_enable)
+ {
+ dr = GSVector4i(rgb.r + rgbstep.drdx * x_start).add32(dr_dx_offset);
+ dg = GSVector4i(rgb.g + rgbstep.dgdx * x_start).add32(dg_dx_offset);
+ db = GSVector4i(rgb.b + rgbstep.dbdx * x_start).add32(db_dx_offset);
+ }
+ else
+ {
+ // precompute for flat shading; G is pre-shifted into the high halfword so the RG blend below needs no per-pixel shift
+ dr = GSVector4i(rgb.r >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT));
+ dg = GSVector4i((rgb.g >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)) << 16);
+ db = GSVector4i(rgb.b >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT));
+ }
+
+ GSVector4i du = GSVector4i(uv.u + uvstep.dudx * x_start).add32(du_dx_offset);
+ GSVector4i dv = GSVector4i(uv.v + uvstep.dvdx * x_start).add32(dv_dx_offset);
+
+ // TODO: Move to caller.
+ if constexpr (shading_enable)
+ {
+ // TODO: vectorize multiply?
+ dr = dr.add32(GSVector4i(rgbstep.drdy * y));
+ dg = dg.add32(GSVector4i(rgbstep.dgdy * y));
+ db = db.add32(GSVector4i(rgbstep.dbdy * y));
+ }
+
+ if constexpr (texture_enable)
+ {
+ du = du.add32(GSVector4i(uvstep.dudy * y));
+ dv = dv.add32(GSVector4i(uvstep.dvdy * y));
+ }
+
+ const GSVector4i dither =
+ GSVector4i::load(&VECTOR_DITHER_MATRIX[static_cast(y) & 3][(static_cast(x) & 3) * 2]);
+
+ GSVector4i xvec = GSVector4i(x).add32(GSVector4i::cxpr(0, 1, 2, 3));
+ GSVector4i wvec = GSVector4i(w).sub32(GSVector4i::cxpr(1, 2, 3, 4));
+
+ for (s32 count = (w + 3) / 4; count > 0; --count)
+ {
+ // R000 | R000 | R000 | R000
+ // R0G0 | R0G0 | R0G0 | R0G0
+ const GSVector4i r = shading_enable ? dr.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>() : dr;
+ const GSVector4i g =
+ shading_enable ? dg.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>().sll32<16>() : dg; // get G into the correct position
+ const GSVector4i b = shading_enable ? db.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>() : db;
+ const GSVector4i u = du.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>();
+ const GSVector4i v = dv.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>();
+
+ const GSVector4i rg = r.blend16<0xAA>(g);
+
+ // mask based on what's outside the span
+ auto preserve_mask = wvec.lt32(GSVector4i::zero());
+
+ // clip test, if all pixels are outside, skip
+ preserve_mask = preserve_mask | xvec.lt32(clip_left);
+ preserve_mask = preserve_mask | xvec.gt32(clip_right);
+ if (!preserve_mask.alltrue())
+ {
+ ShadePixel<texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
+ cmd, static_cast<u32>(x), static_cast<u32>(y), rg, b, u, v, preserve_mask, dither);
+ }
+
+ x += 4;
+
+ xvec = xvec.add32(GSVector4i::cxpr(4));
+ wvec = wvec.sub32(GSVector4i::cxpr(4));
+
+ if constexpr (shading_enable)
+ {
+ dr = dr.add32(dr_dx);
+ dg = dg.add32(dg_dx);
+ db = db.add32(db_dx);
+ }
+
+ if constexpr (texture_enable)
+ {
+ du = du.add32(du_dx);
+ dv = dv.add32(dv_dx);
+ }
+ }
+}
+
+#endif // USE_VECTOR
+
+template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
+ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCommand* cmd, const TrianglePart& tp,
+ const UVStepper& uv, const UVSteps& uvstep, const RGBStepper& rgb,
+ const RGBSteps& rgbstep)
+{
+ static constexpr auto unfp_xy = [](s64 xfp) -> s32 { return static_cast<s32>(static_cast<u64>(xfp) >> 32); };
+
+ const u64 left_x_step = tp.step_x[0];
+ const u64 right_x_step = tp.step_x[1];
+ const s32 end_y = tp.end_y;
+ u64 left_x = tp.start_x[0];
+ u64 right_x = tp.start_x[1];
+ s32 current_y = tp.start_y;
+
+ if (tp.fill_upside_down)
+ {
+ while (current_y > end_y)
+ {
+ current_y--;
+ left_x -= left_x_step;
+ right_x -= right_x_step;
+
+ const s32 y = TruncateGPUVertexPosition(current_y);
+ if (y < static_cast