diff --git a/src/core/core.vcxproj b/src/core/core.vcxproj
index 29574d56a..b23b1cbf3 100644
--- a/src/core/core.vcxproj
+++ b/src/core/core.vcxproj
@@ -51,6 +51,12 @@
+
+ AdvancedVectorExtensions2
+ %(AdditionalOptions) -mavx2
+ true
+ NotUsing
+
diff --git a/src/core/core.vcxproj.filters b/src/core/core.vcxproj.filters
index 2a3bcaef8..d0dd29200 100644
--- a/src/core/core.vcxproj.filters
+++ b/src/core/core.vcxproj.filters
@@ -69,6 +69,7 @@
+
diff --git a/src/core/gpu_sw_rasterizer.cpp b/src/core/gpu_sw_rasterizer.cpp
index 59d5ade53..47177c0da 100644
--- a/src/core/gpu_sw_rasterizer.cpp
+++ b/src/core/gpu_sw_rasterizer.cpp
@@ -7,6 +7,8 @@
#include "cpuinfo.h"
#include "common/log.h"
+#include "common/string_util.h"
+
Log_SetChannel(GPU_SW_Rasterizer);
namespace GPU_SW_Rasterizer {
@@ -39,6 +41,13 @@ namespace GPU_SW_Rasterizer {
#include "gpu_sw_rasterizer.inl"
}
+// Default vector implementation definitions (a second instantiation of gpu_sw_rasterizer.inl, SSE/NEON builds only).
+#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
+namespace GPU_SW_Rasterizer::Vector { // Referenced by SelectImplementation() as the "Vector" choice.
+#include "gpu_sw_rasterizer.inl" // NOTE(review): USE_VECTOR is not defined before this include in the lines visible here, so the .inl's "#ifndef USE_VECTOR" paths would be the ones compiled -- confirm that is intended.
+}
+#endif
+
// Initialize with default implementation.
namespace GPU_SW_Rasterizer {
const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions = &DrawRectangleFunctions;
@@ -49,4 +58,40 @@ const DrawLineFunctionTable* SelectedDrawLineFunctions = &DrawLineFunctions;
// Declare alternative implementations.
void GPU_SW_Rasterizer::SelectImplementation() // Chooses which rasterizer function tables the Selected*Functions pointers refer to.
{
+ static bool selected = false; // Latch: only the first call performs the selection.
+ if (selected)
+ return;
+
+ selected = true;
+
+#define SELECT_ALTERNATIVE_RASTERIZER(isa) \
+ do \
+ { \
+ INFO_LOG("* Using " #isa " software rasterizer implementation."); \
+ SelectedDrawRectangleFunctions = &isa::DrawRectangleFunctions; \
+ SelectedDrawTriangleFunctions = &isa::DrawTriangleFunctions; \
+ SelectedDrawLineFunctions = &isa::DrawLineFunctions; \
+ } while (0) // Logs the chosen namespace name and repoints all three Selected* tables at that namespace.
+
+#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
+ const char* use_isa = std::getenv("SW_USE_ISA"); // Optional override from the environment; null means no preference.
+
+#ifdef CPU_ARCH_SSE
+ if (cpuinfo_has_x86_avx2() && (!use_isa || StringUtil::Strcasecmp(use_isa, "AVX2") == 0)) // Needs the CPU check to pass and either no override or an "AVX2" override.
+ {
+ SELECT_ALTERNATIVE_RASTERIZER(AVX2);
+ return;
+ }
+#endif
+
+ if (!use_isa || StringUtil::Strcasecmp(use_isa, "Vector") == 0) // Taken when there is no override or the override is "Vector".
+ {
+ SELECT_ALTERNATIVE_RASTERIZER(Vector);
+ return;
+ }
+#endif
+
+ INFO_LOG("* Using scalar software rasterizer implementation."); // Reached when nothing above returned (e.g. an override matching neither name); the Selected* pointers are not modified on this path.
+
+#undef SELECT_ALTERNATIVE_RASTERIZER
}
diff --git a/src/core/gpu_sw_rasterizer.h b/src/core/gpu_sw_rasterizer.h
index f3d95f0bf..f183e26d3 100644
--- a/src/core/gpu_sw_rasterizer.h
+++ b/src/core/gpu_sw_rasterizer.h
@@ -1,4 +1,5 @@
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#pragma once
@@ -6,6 +7,7 @@
#include "gpu.h"
#include "gpu_types.h"
+#include "common/intrin.h"
#include "common/types.h"
#include
@@ -33,6 +35,11 @@ using DrawLineFunction = void (*)(const GPUBackendDrawLineCommand* cmd, const GP
const GPUBackendDrawLineCommand::Vertex* p1);
typedef const DrawLineFunction DrawLineFunctionTable[2][2][2];
+// Default implementation, compatible with all ISAs; gpu_sw_rasterizer.cpp initialises the Selected* pointers (e.g. SelectedDrawRectangleFunctions) to these tables.
+extern const DrawRectangleFunctionTable DrawRectangleFunctions;
+extern const DrawTriangleFunctionTable DrawTriangleFunctions;
+extern const DrawLineFunctionTable DrawLineFunctions;
+
// Current implementation, selected at runtime.
extern const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions;
extern const DrawTriangleFunctionTable* SelectedDrawTriangleFunctions;
@@ -60,4 +67,24 @@ ALWAYS_INLINE static DrawTriangleFunction GetDrawTriangleFunction(bool shading_e
[u8(transparency_enable)][u8(dithering_enable)];
}
+#define DECLARE_ALTERNATIVE_RASTERIZER(isa) \
+ namespace isa { \
+ extern const DrawRectangleFunctionTable DrawRectangleFunctions; \
+ extern const DrawTriangleFunctionTable DrawTriangleFunctions; \
+ extern const DrawLineFunctionTable DrawLineFunctions; \
+ } // Declares the three function tables inside a nested namespace named after the ISA.
+
+// Have to define the symbols globally, because clang won't include them otherwise.
+#if defined(CPU_ARCH_SSE)
+#define ALTERNATIVE_RASTERIZER_LIST() DECLARE_ALTERNATIVE_RASTERIZER(AVX2) // Only AVX2 is listed as an alternative on SSE builds.
+#else
+#define ALTERNATIVE_RASTERIZER_LIST() // No alternatives on other architectures.
+#endif
+
+ALTERNATIVE_RASTERIZER_LIST() // Emits the extern declarations for every listed alternative.
+
+#undef DECLARE_ALTERNATIVE_RASTERIZER // NOTE(review): ALTERNATIVE_RASTERIZER_LIST itself stays defined after this header -- presumably deliberate; confirm.
+
} // namespace GPU_SW_Rasterizer
+
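+// static u32 s_bad_counter = 0; // Debug only: draw counter used by the disabled BACKUP_VRAM()/CHECK_VRAM() macros in gpu_sw_rasterizer.inl.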
diff --git a/src/core/gpu_sw_rasterizer.inl b/src/core/gpu_sw_rasterizer.inl
index 15871b95b..acf6c5cfe 100644
--- a/src/core/gpu_sw_rasterizer.inl
+++ b/src/core/gpu_sw_rasterizer.inl
@@ -3,15 +3,55 @@
#ifdef __INTELLISENSE__
+#include "common/gsvector.h"
#include "gpu.h"
#include
+#define USE_VECTOR 1
+#define GSVECTOR_HAS_SRLV 1
+
extern GPU_SW_Rasterizer::DitherLUT g_dither_lut;
namespace GPU_SW_Rasterizer {
#endif
+// TODO: UpdateVRAM, FillVRAM, etc.
+
+#ifdef USE_VECTOR
+#if 0 // Disabled debugging aid: re-runs a draw through another implementation and compares the resulting VRAM.
+static u16 s_vram_backup[VRAM_WIDTH * VRAM_HEIGHT];
+static u16 s_new_vram[VRAM_WIDTH * VRAM_HEIGHT];
+#define BACKUP_VRAM() \
+ do \
+ { \
+ std::memcpy(s_vram_backup, g_vram, sizeof(g_vram)); \
+ s_bad_counter++; \
+ } while (0) // Snapshots VRAM before the draw and bumps the draw counter printed on mismatch.
+#define CHECK_VRAM(drawer) \
+ do \
+ { \
+ std::memcpy(s_new_vram, g_vram, sizeof(g_vram)); \
+ std::memcpy(g_vram, s_vram_backup, sizeof(g_vram)); \
+ \
+ drawer; \
+ for (u32 vidx = 0; vidx < (VRAM_WIDTH * VRAM_HEIGHT); vidx++) \
+ { \
+ if (s_new_vram[vidx] != g_vram[vidx]) \
+ { \
+ fprintf(stderr, "[%u] Mismatch at %d,%d, expected %04x got %04x\n", s_bad_counter, (vidx % VRAM_WIDTH), \
+ (vidx / VRAM_WIDTH), g_vram[vidx], s_new_vram[vidx]); \
+ AssertMsg(false, "Mismatch"); \
+ } \
+ } \
+ /*Assert(std::memcmp(g_vram, s_new_vram, sizeof(g_vram)) == 0)*/ \
+ } while (0) // Saves this draw's output, restores the snapshot, runs `drawer` on it, then reports the first differing pixel ("expected" is drawer's result).
+#else // Normal builds: both macros expand to nothing, so their arguments are never evaluated.
+#define BACKUP_VRAM()
+#define CHECK_VRAM(drawer)
+#endif
+#endif
+
namespace {
enum
{
@@ -49,32 +89,33 @@ struct line_fxp_step
};
} // namespace
-ALWAYS_INLINE_RELEASE static u16 GetPixel(const u32 x, const u32 y)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16 GetPixel(const u32 x, const u32 y) // Reads the VRAM pixel at (x, y); no bounds check or wrapping here. [[maybe_unused]] is presumably for builds that never call it.
{
return g_vram[VRAM_WIDTH * y + x];
}
-ALWAYS_INLINE_RELEASE static u16* GetPixelPtr(const u32 x, const u32 y)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16* GetPixelPtr(const u32 x, const u32 y) // Returns the address of the VRAM pixel at (x, y); no bounds check or wrapping here.
{
return &g_vram[VRAM_WIDTH * y + x];
}
-ALWAYS_INLINE_RELEASE static void SetPixel(const u32 x, const u32 y, const u16 value)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static void SetPixel(const u32 x, const u32 y, const u16 value) // Writes value to the VRAM pixel at (x, y); no bounds check or wrapping here.
{
g_vram[VRAM_WIDTH * y + x] = value;
}
-ALWAYS_INLINE_RELEASE static constexpr std::tuple UnpackTexcoord(u16 texcoord)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple UnpackTexcoord(u16 texcoord) // Splits a packed texcoord into (texcoord, texcoord >> 8); the cast target types are not visible in this view.
{
return std::make_tuple(static_cast(texcoord), static_cast(texcoord >> 8));
}
-ALWAYS_INLINE_RELEASE static constexpr std::tuple UnpackColorRGB24(u32 rgb24)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple UnpackColorRGB24(u32 rgb24) // Splits a packed colour into (rgb24, rgb24 >> 8, rgb24 >> 16); the cast target types are not visible in this view.
{
return std::make_tuple(static_cast(rgb24), static_cast(rgb24 >> 8), static_cast(rgb24 >> 16));
}
template
-ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y, u8 color_r, u8 color_g,
- u8 color_b, u8 texcoord_x, u8 texcoord_y)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y,
+ u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x,
+ u8 texcoord_y)
{
u16 color;
if constexpr (texture_enable)
@@ -215,6 +256,8 @@ ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u
SetPixel(static_cast(x), static_cast(y), color | cmd->params.GetMaskOR());
}
+#ifndef USE_VECTOR
+
template
static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
{
@@ -249,6 +292,413 @@ static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
}
}
+#else // USE_VECTOR
+
+ALWAYS_INLINE_RELEASE static GSVector4i GatherVector(GSVector4i coord_x, GSVector4i coord_y) // Reads four VRAM pixels, one per (coord_x, coord_y) lane pair.
+{ // No masking or bounds checking is done here; the coordinates are used as given.
+ GSVector4i offsets = coord_y.sll32<11>(); // y * 2048 (1024 * sizeof(pixel))
+ offsets = offsets.add32(coord_x.sll32<1>()); // x * 2 (x * sizeof(pixel))
+
+ const u32 o0 = offsets.extract32<0>(); // Per-lane byte offsets into g_vram.
+ const u32 o1 = offsets.extract32<1>();
+ const u32 o2 = offsets.extract32<2>();
+ const u32 o3 = offsets.extract32<3>();
+
+ // TODO: split in two, merge, maybe could be zx loaded instead..
+ u16 p0, p1, p2, p3;
+ std::memcpy(&p0, reinterpret_cast(g_vram) + o0, sizeof(p0)); // One 16-bit scalar read per lane.
+ std::memcpy(&p1, reinterpret_cast(g_vram) + o1, sizeof(p1));
+ std::memcpy(&p2, reinterpret_cast(g_vram) + o2, sizeof(p2));
+ std::memcpy(&p3, reinterpret_cast(g_vram) + o3, sizeof(p3));
+ GSVector4i pixels = GSVector4i::load(p0); // Presumably places p0 in lane 0 with the other lanes zeroed -- confirm in gsvector.h.
+ pixels = pixels.insert16<2>(p1); // 16-bit slots 2/4/6 are the low halves of 32-bit lanes 1/2/3.
+ pixels = pixels.insert16<4>(p2);
+ pixels = pixels.insert16<6>(p3);
+
+ return pixels;
+}
+
+ALWAYS_INLINE_RELEASE static GSVector4i GatherCLUTVector(GSVector4i indices) // Looks up four palette entries in g_gpu_clut, one per index lane.
+{ // Indices are used as given; no range check is done here.
+ const GSVector4i offsets = indices.sll32<1>(); // x * 2 (x * sizeof(pixel))
+ const u32 o0 = offsets.extract32<0>(); // Per-lane byte offsets into g_gpu_clut.
+ const u32 o1 = offsets.extract32<1>();
+ const u32 o2 = offsets.extract32<2>();
+ const u32 o3 = offsets.extract32<3>();
+
+ // TODO: split in two, merge, maybe could be zx loaded instead..
+ u16 p0, p1, p2, p3;
+ std::memcpy(&p0, reinterpret_cast(g_gpu_clut) + o0, sizeof(p0)); // One 16-bit scalar read per lane.
+ std::memcpy(&p1, reinterpret_cast(g_gpu_clut) + o1, sizeof(p1));
+ std::memcpy(&p2, reinterpret_cast(g_gpu_clut) + o2, sizeof(p2));
+ std::memcpy(&p3, reinterpret_cast(g_gpu_clut) + o3, sizeof(p3));
+ GSVector4i pixels = GSVector4i::load(p0); // Presumably places p0 in lane 0 with the other lanes zeroed -- confirm in gsvector.h.
+ pixels = pixels.insert16<2>(p1); // 16-bit slots 2/4/6 are the low halves of 32-bit lanes 1/2/3.
+ pixels = pixels.insert16<4>(p2);
+ pixels = pixels.insert16<6>(p3);
+
+ return pixels;
+}
+
+ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y) // Loads the four VRAM pixels starting at (x, y) on row y, one per 32-bit lane.
+{
+ if (x <= (VRAM_WIDTH - 4)) // All four pixels are contiguous within the row.
+ {
+ return GSVector4i::loadl(&g_vram[y * VRAM_WIDTH + x]).u16to32();
+ }
+ else // The group crosses the right edge (or x is out of range): fetch pixel by pixel, wrapping x with VRAM_WIDTH_MASK.
+ {
+ const u16* line = &g_vram[y * VRAM_WIDTH]; // y itself is not wrapped or checked here.
+ GSVector4i pixels = GSVector4i(line[(x++) & VRAM_WIDTH_MASK]);
+ pixels = pixels.insert16<2>(line[(x++) & VRAM_WIDTH_MASK]);
+ pixels = pixels.insert16<4>(line[(x++) & VRAM_WIDTH_MASK]);
+ pixels = pixels.insert16<6>(line[x & VRAM_WIDTH_MASK]);
+ return pixels;
+ }
+}
+
+ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector4i color) // Writes four pixels to row y starting at x; color holds them in its 16-bit slots 0..3.
+{
+ if (x <= (VRAM_WIDTH - 4)) // All four pixels are contiguous within the row.
+ {
+ GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], color);
+ }
+ else // The group crosses the right edge (or x is out of range): store pixel by pixel, wrapping x with VRAM_WIDTH_MASK.
+ {
+ u16* line = &g_vram[y * VRAM_WIDTH]; // y itself is not wrapped or checked here.
+ line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<0>());
+ line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<1>());
+ line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<2>());
+ line[x & VRAM_WIDTH_MASK] = Truncate16(color.extract16<3>());
+ }
+}
+
+ALWAYS_INLINE_RELEASE static void RGB5A1ToRG_BA(GSVector4i rgb5a1, GSVector4i& rg, GSVector4i& ba) // Splits four 16-bit RGB5A1 pixels (one per 32-bit lane) into R/G and B/A pairs held in separate 16-bit halves.
+{
+ rg = rgb5a1 & GSVector4i::cxpr(0x1F); // R | R | R | R (bits 0-4)
+ rg = rg | (rgb5a1 & GSVector4i::cxpr(0x3E0)).sll32<11>(); // R0G0 | R0G0 | R0G0 | R0G0 (G moves from bits 5-9 to bits 16-20)
+ ba = rgb5a1.srl32<10>() & GSVector4i::cxpr(0x1F); // B | B | B | B (bits 10-14 moved down to bits 0-4)
+ ba = ba | (rgb5a1 & GSVector4i::cxpr(0x8000)).sll32<1>(); // B0A0 | B0A0 | B0A0 | B0A0 (bit 15 moves to bit 16)
+}
+
+ALWAYS_INLINE_RELEASE static GSVector4i RG_BAToRGB5A1(GSVector4i rg, GSVector4i ba) // Inverse of RGB5A1ToRG_BA: repacks R/G and B/A pairs into one 16-bit RGB5A1 value per 32-bit lane.
+{
+ GSVector4i res;
+
+ res = rg & GSVector4i::cxpr(0x1F); // R | R | R | R
+ res = res | (rg.srl32<11>() & GSVector4i::cxpr(0x3E0)); // RG | RG | RG | RG (G moves from bits 16-20 back to bits 5-9)
+ res = res | ((ba & GSVector4i::cxpr(0x1F)).sll32<10>()); // RGB | RGB | RGB | RGB
+ res = res | ba.srl32<16>().sll32<15>(); // RGBA | RGBA | RGBA | RGBA (the whole upper half of ba is shifted in, so A must be 0 or 1)
+
+ return res;
+}
+
+// Each dither value is repeated twice for RG packing, and each row is duplicated so that we can load based on the X offset.
+static constexpr s16 VECTOR_DITHER_MATRIX[4][16] = {
+#define P(m, n) static_cast(DITHER_MATRIX[m][n]), static_cast(DITHER_MATRIX[m][n])
+#define R(m) P(m, 0), P(m, 1), P(m, 2), P(m, 3), P(m, 0), P(m, 1), P(m, 2), P(m, 3)
+
+ {R(0)}, {R(1)}, {R(2)}, {R(3)}
+
+#undef R
+#undef P
+};
+
+template // Vector overload of ShadePixel: shades the four pixels at (start_x .. start_x + 3, y) in one call.
+ALWAYS_INLINE_RELEASE static void
+ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vertex_color_rg, GSVector4i vertex_color_ba,
+ GSVector4i texcoord_x, GSVector4i texcoord_y, GSVector4i preserve_mask, GSVector4i dither)
+{ // Lanes set in preserve_mask keep the existing VRAM pixel; dither is only read on the dithering_enable paths.
+ static constinit GSVector4i coord_mask_x = GSVector4i::cxpr(VRAM_WIDTH_MASK);
+ static constinit GSVector4i coord_mask_y = GSVector4i::cxpr(VRAM_HEIGHT_MASK);
+
+ GSVector4i color;
+
+ if constexpr (texture_enable)
+ {
+ // Apply texture window
+ texcoord_x = (texcoord_x & GSVector4i(cmd->window.and_x)) | GSVector4i(cmd->window.or_x);
+ texcoord_y = (texcoord_y & GSVector4i(cmd->window.and_y)) | GSVector4i(cmd->window.or_y);
+
+ const GSVector4i base_x = GSVector4i(cmd->draw_mode.GetTexturePageBaseX());
+ const GSVector4i base_y = GSVector4i(cmd->draw_mode.GetTexturePageBaseY());
+ const GSVector4i palette_x = GSVector4i(cmd->palette.GetXBase()); // NOTE(review): palette_x/palette_y are not referenced again in this function.
+ const GSVector4i palette_y = GSVector4i(cmd->palette.GetYBase());
+
+ texcoord_y = base_y.add32(texcoord_y) & coord_mask_y;
+
+ GSVector4i texture_color;
+ switch (cmd->draw_mode.texture_mode)
+ {
+ case GPUTextureMode::Palette4Bit: // Four 4-bit indices per 16-bit VRAM word.
+ {
+ GSVector4i load_texcoord_x = texcoord_x.srl32<2>();
+ load_texcoord_x = base_x.add32(load_texcoord_x);
+ load_texcoord_x = load_texcoord_x & coord_mask_x;
+
+ // todo: sse4 path
+ GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(3)).sll32<2>(); // (u & 3) * 4: bit position of the index within the word.
+ GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y);
+#ifdef GSVECTOR_HAS_SRLV
+ palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0x0F);
+#else
+ Assert(false && "Fixme");
+#endif
+
+ texture_color = GatherCLUTVector(palette_indices);
+ }
+ break;
+
+ case GPUTextureMode::Palette8Bit: // Two 8-bit indices per 16-bit VRAM word.
+ {
+ GSVector4i load_texcoord_x = texcoord_x.srl32<1>();
+ load_texcoord_x = base_x.add32(load_texcoord_x);
+ load_texcoord_x = load_texcoord_x & coord_mask_x;
+
+ GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(1)).sll32<3>(); // (u & 1) * 8: bit position of the index within the word.
+ GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y);
+#ifdef GSVECTOR_HAS_SRLV
+ palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0xFF);
+#else
+ Assert(false && "Fixme");
+#endif
+
+ texture_color = GatherCLUTVector(palette_indices);
+ }
+ break;
+
+ default: // Remaining texture modes read the 16-bit texel directly.
+ {
+ texcoord_x = base_x.add32(texcoord_x);
+ texcoord_x = texcoord_x & coord_mask_x;
+ texture_color = GatherVector(texcoord_x, texcoord_y);
+ }
+ break;
+ }
+
+ // check for zero texture colour across the 4 pixels, early out if so
+ const GSVector4i texture_transparent_mask = texture_color.eq32(GSVector4i::zero());
+ if (texture_transparent_mask.alltrue())
+ return;
+
+ preserve_mask = preserve_mask | texture_transparent_mask; // Zero texels never overwrite VRAM.
+
+ if constexpr (raw_texture_enable)
+ {
+ color = texture_color;
+ }
+ else
+ {
+ GSVector4i trg, tba;
+ RGB5A1ToRG_BA(texture_color, trg, tba);
+
+ // now we have both the texture and vertex color in RG/BA pairs, for 4 pixels, which we can multiply
+ GSVector4i rg = trg.mul16l(vertex_color_rg);
+ GSVector4i ba = tba.mul16l(vertex_color_ba);
+
+ // TODO: Dither (NOTE(review): a dithered path exists just below -- this TODO may be stale)
+ // Convert to 5bit.
+ if constexpr (dithering_enable)
+ {
+ rg = rg.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+ ba = ba.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+ }
+ else
+ {
+ rg = rg.sra16<7>();
+ ba = ba.sra16<7>();
+ }
+
+ // Bit15 gets passed through as-is.
+ ba = ba.blend16<0xaa>(tba);
+
+ // Clamp to 5bit.
+ static constexpr GSVector4i colclamp = GSVector4i::cxpr16(0x1F);
+ rg = rg.min_u16(colclamp);
+ ba = ba.min_u16(colclamp);
+
+ // And interleave back to 16bpp.
+ color = RG_BAToRGB5A1(rg, ba);
+ }
+ }
+ else
+ {
+ // Non-textured transparent polygons don't set bit 15, but are treated as transparent.
+ if constexpr (dithering_enable)
+ {
+ GSVector4i rg = vertex_color_rg.add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+ GSVector4i ba = vertex_color_ba.add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+
+ // Clamp to 5bit. We use 32bit for BA to set a to zero.
+ rg = rg.min_u16(GSVector4i::cxpr16(0x1F));
+ ba = ba.min_u16(GSVector4i::cxpr(0x1F));
+
+ // And interleave back to 16bpp.
+ color = RG_BAToRGB5A1(rg, ba);
+ }
+ else
+ {
+ // Note that bit15 is set to 0 here, which the shift will do.
+ const GSVector4i rg = vertex_color_rg.srl16<3>();
+ const GSVector4i ba = vertex_color_ba.srl16<3>();
+ color = RG_BAToRGB5A1(rg, ba);
+ }
+ }
+
+ GSVector4i bg_color = LoadVector(start_x, y); // Existing VRAM contents, needed for blending and the mask test below.
+
+ if constexpr (transparency_enable)
+ {
+ [[maybe_unused]] GSVector4i transparent_mask;
+ if constexpr (texture_enable)
+ {
+ // Compute transparent_mask, ffff per lane if transparent otherwise 0000
+ transparent_mask = color.sra16<15>();
+ }
+
+ // TODO: We don't need to OR color here with 0x8000 for textures.
+ // 0x8000 is added to match serial path.
+
+ GSVector4i blended_color;
+ switch (cmd->draw_mode.transparency_mode)
+ {
+ case GPUTransparencyMode::HalfBackgroundPlusHalfForeground: // 0x0421 is the low bit of each 5-bit channel.
+ {
+ const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u);
+ const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u);
+ const GSVector4i res = fg_bits.add32(bg_bits).sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x0421u)).srl32<1>();
+ blended_color = res & GSVector4i::cxpr(0xffff);
+ }
+ break;
+
+ case GPUTransparencyMode::BackgroundPlusForeground: // carry marks channels that overflowed; those are forced to all ones.
+ {
+ const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u);
+ const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu);
+ const GSVector4i sum = fg_bits.add32(bg_bits);
+ const GSVector4i carry =
+ (sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u))) & GSVector4i::cxpr(0x8420u);
+ const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
+ blended_color = res & GSVector4i::cxpr(0xffff);
+ }
+ break;
+
+ case GPUTransparencyMode::BackgroundMinusForeground: // Here the borrow-derived mask is ANDed in, so underflowed channels become zero.
+ {
+ const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u);
+ const GSVector4i fg_bits = color & GSVector4i::cxpr(0x7FFFu);
+ const GSVector4i diff = bg_bits.sub32(fg_bits).add32(GSVector4i::cxpr(0x108420u));
+ const GSVector4i borrow =
+ diff.sub32((bg_bits ^ fg_bits) & GSVector4i::cxpr(0x108420u)) & GSVector4i::cxpr(0x108420u);
+ const GSVector4i res = diff.sub32(borrow) & borrow.sub32(borrow.srl32<5>());
+ blended_color = res & GSVector4i::cxpr(0xffff);
+ }
+ break;
+
+ case GPUTransparencyMode::BackgroundPlusQuarterForeground: // Foreground is shifted right by two and masked with 0x1CE7 before the saturating add.
+ default:
+ {
+ const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu);
+ const GSVector4i fg_bits =
+ ((color | GSVector4i::cxpr(0x8000)).srl32<2>() & GSVector4i::cxpr(0x1CE7u)) | GSVector4i::cxpr(0x8000u);
+ const GSVector4i sum = fg_bits.add32(bg_bits);
+ const GSVector4i carry = sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u)) & GSVector4i::cxpr(0x8420u);
+ const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
+ blended_color = res & GSVector4i::cxpr(0xffff);
+ }
+ break;
+ }
+
+ // select blended pixels for transparent pixels, otherwise consider opaque
+ // TODO: SSE2
+ if constexpr (texture_enable)
+ color = color.blend8(blended_color, transparent_mask);
+ else
+ color = blended_color & GSVector4i::cxpr(0x7fff);
+ }
+
+ // TODO: lift out to parent?
+ const GSVector4i mask_and = GSVector4i(cmd->params.GetMaskAND());
+ const GSVector4i mask_or = GSVector4i(cmd->params.GetMaskOR());
+
+ GSVector4i mask_bits_set = bg_color & mask_and; // 8000 if masked else 0000
+ mask_bits_set = mask_bits_set.sra16<15>(); // ffff if masked else 0000
+ preserve_mask = preserve_mask | mask_bits_set; // ffff if preserved else 0000
+
+ bg_color = bg_color & preserve_mask; // Keep the old pixel in preserved lanes...
+ color = (color | mask_or).andnot(preserve_mask); // ...and the new pixel (with the mask-OR bits) in the others.
+ color = color | bg_color;
+
+ const GSVector4i packed_color = color.pu32(); // Narrow the 32-bit lanes to the packed 16-bit form StoreVector() reads.
+ StoreVector(start_x, y, packed_color);
+}
+
+// Vectorised rectangle drawing: walks the rectangle row by row and hands ShadePixel() four horizontally adjacent
+// pixels at a time.
+//
+// Lanes that fall outside the rectangle width or outside the horizontal drawing area are flagged in preserve_mask so
+// that ShadePixel() leaves VRAM untouched for them; groups where every lane is flagged are skipped outright. Rows
+// outside the vertical drawing area, or on the line parity excluded by interlaced rendering, draw nothing.
+//
+// The texture V coordinate is advanced once per rectangle row whether or not that row is drawn. Previously a skipped
+// row left the loop via `continue` before the increment, which shifted the texture rows sampled by every later row.
+template
+static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
+{
+  const s32 origin_x = cmd->x;
+  const s32 origin_y = cmd->y;
+
+  const GSVector4i rgba = GSVector4i(cmd->color); // RGBA | RGBA | RGBA | RGBA
+  GSVector4i rg = rgba.xxxxl();                   // RGRG | RGRG | RGRG | RGRG
+  GSVector4i ba = rgba.yyyyl();                   // BABA | BABA | BABA | BABA
+  rg = rg.u8to16();                               // R0G0 | R0G0 | R0G0 | R0G0
+  ba = ba.u8to16();                               // B0A0 | B0A0 | B0A0 | B0A0
+
+  // U for the first four pixels of a row; V starts at the command's V and steps once per row.
+  const GSVector4i texcoord_x = GSVector4i(cmd->texcoord & 0xFF).add32(GSVector4i::cxpr(0, 1, 2, 3));
+  GSVector4i texcoord_y = GSVector4i(cmd->texcoord >> 8);
+
+  const GSVector4i clip_left = GSVector4i(g_drawing_area.left);
+  const GSVector4i clip_right = GSVector4i(g_drawing_area.right);
+  const u32 width = cmd->width;
+
+  BACKUP_VRAM();
+
+  for (u32 offset_y = 0; offset_y < cmd->height; offset_y++)
+  {
+    const s32 y = origin_y + static_cast(offset_y);
+
+    // Vertical clip and interlace test. A skipped row must still fall through to the V increment below.
+    const bool skip_row =
+      (y < static_cast(g_drawing_area.top) || y > static_cast(g_drawing_area.bottom) ||
+       (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast(y)) & 1u)));
+    if (!skip_row)
+    {
+      GSVector4i row_texcoord_x = texcoord_x;
+      GSVector4i xvec = GSVector4i(origin_x).add32(GSVector4i::cxpr(0, 1, 2, 3));
+      // Goes negative in the lanes that lie past the right-hand end of the rectangle.
+      GSVector4i wvec = GSVector4i(width).sub32(GSVector4i::cxpr(1, 2, 3, 4));
+
+      for (u32 offset_x = 0; offset_x < width; offset_x += 4)
+      {
+        const s32 x = origin_x + static_cast(offset_x);
+
+        // width test
+        GSVector4i preserve_mask = wvec.lt32(GSVector4i::zero());
+
+        // clip test, if all pixels are outside, skip
+        preserve_mask = preserve_mask | xvec.lt32(clip_left);
+        preserve_mask = preserve_mask | xvec.gt32(clip_right);
+        if (!preserve_mask.alltrue())
+        {
+          ShadePixel(
+            cmd, x, y, rg, ba, row_texcoord_x, texcoord_y, preserve_mask, GSVector4i::zero());
+        }
+
+        xvec = xvec.add32(GSVector4i::cxpr(4));
+        wvec = wvec.sub32(GSVector4i::cxpr(4));
+
+        if constexpr (texture_enable)
+          row_texcoord_x = row_texcoord_x.add32(GSVector4i::cxpr(4)) & GSVector4i::cxpr(0xFF);
+      }
+    }
+
+    // Step V for every rectangle row, drawn or not, so texture rows stay aligned with screen rows.
+    if constexpr (texture_enable)
+      texcoord_y = texcoord_y.add32(GSVector4i::cxpr(1)) & GSVector4i::cxpr(0xFF);
+  }
+
+  CHECK_VRAM(GPU_SW_Rasterizer::DrawRectangleFunctions[texture_enable][raw_texture_enable][transparency_enable](cmd));
+}
+
+#endif // USE_VECTOR
+
//////////////////////////////////////////////////////////////////////////
// Polygon and line rasterization ported from Mednafen
//////////////////////////////////////////////////////////////////////////
@@ -355,6 +805,8 @@ ALWAYS_INLINE_RELEASE static void AddIDeltas_DY(i_group& ig, const i_deltas& idl
}
}
+#ifndef USE_VECTOR
+
template
ALWAYS_INLINE_RELEASE static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound,
@@ -401,11 +853,150 @@ ALWAYS_INLINE_RELEASE static void DrawSpan(const GPUBackendDrawPolygonCommand* c
} while (--w > 0);
}
+#else // USE_VECTOR
+
+template // Vector DrawSpan: shades one horizontal span of a polygon on row y, four pixels per ShadePixel() call.
+ALWAYS_INLINE_RELEASE static void DrawSpan(const GPUBackendDrawCommand* cmd, s32 y, s32 x_start, s32 x_bound,
+ i_group ig, const i_deltas& idl)
+{ // ig is taken by value and only read; the per-pixel interpolants live in the local dr/dg/db/du/dv vectors.
+ if (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast(y)) & 1u))
+ return;
+
+ s32 x_ig_adjust = x_start; // X used to evaluate the interpolants; moved right together with x when clipping on the left.
+ s32 w = x_bound - x_start;
+ s32 x = TruncateGPUVertexPosition(x_start);
+
+ if (x < static_cast(g_drawing_area.left)) // Clip the span start to the left edge of the drawing area.
+ {
+ s32 delta = static_cast(g_drawing_area.left) - x;
+ x_ig_adjust += delta;
+ x += delta;
+ w -= delta;
+ }
+
+ if ((x + w) > (static_cast(g_drawing_area.right) + 1)) // Clip the span end; the right edge is inclusive.
+ w = static_cast(g_drawing_area.right) + 1 - x;
+
+ if (w <= 0)
+ return;
+
+ // TODO: Precompute.
+
+ const auto clip_left = GSVector4i(g_drawing_area.left);
+ const auto clip_right = GSVector4i(g_drawing_area.right);
+
+ const GSVector4i dr_dx = GSVector4i(idl.dr_dx * 4); // Per-iteration steps: four pixels are consumed each time round the loop.
+ const GSVector4i dg_dx = GSVector4i(idl.dg_dx * 4);
+ const GSVector4i db_dx = GSVector4i(idl.db_dx * 4);
+ const GSVector4i du_dx = GSVector4i(idl.du_dx * 4);
+ const GSVector4i dv_dx = GSVector4i(idl.dv_dx * 4);
+
+ // TODO: vectorize
+ const GSVector4i dr_dx_offset = GSVector4i(0, idl.dr_dx, idl.dr_dx * 2, idl.dr_dx * 3); // Lane n starts n pixels to the right of lane 0.
+ const GSVector4i dg_dx_offset = GSVector4i(0, idl.dg_dx, idl.dg_dx * 2, idl.dg_dx * 3);
+ const GSVector4i db_dx_offset = GSVector4i(0, idl.db_dx, idl.db_dx * 2, idl.db_dx * 3);
+ const GSVector4i du_dx_offset = GSVector4i(0, idl.du_dx, idl.du_dx * 2, idl.du_dx * 3);
+ const GSVector4i dv_dx_offset = GSVector4i(0, idl.dv_dx, idl.dv_dx * 2, idl.dv_dx * 3);
+
+ GSVector4i dr, dg, db;
+ if constexpr (shading_enable)
+ {
+ dr = GSVector4i(ig.r + idl.dr_dx * x_ig_adjust).add32(dr_dx_offset);
+ dg = GSVector4i(ig.g + idl.dg_dx * x_ig_adjust).add32(dg_dx_offset);
+ db = GSVector4i(ig.b + idl.db_dx * x_ig_adjust).add32(db_dx_offset);
+ }
+ else
+ {
+ // precompute for flat shading
+ dr = GSVector4i(ig.r >> (COORD_FBS + COORD_POST_PADDING));
+ dg = GSVector4i((ig.g >> (COORD_FBS + COORD_POST_PADDING)) << 16); // Already in the upper 16-bit half, ready for the RG blend in the loop.
+ db = GSVector4i(ig.b >> (COORD_FBS + COORD_POST_PADDING));
+ }
+
+ GSVector4i du = GSVector4i(ig.u + idl.du_dx * x_ig_adjust).add32(du_dx_offset); // Computed even when texture_enable is false.
+ GSVector4i dv = GSVector4i(ig.v + idl.dv_dx * x_ig_adjust).add32(dv_dx_offset);
+
+ // TODO: Move to caller.
+ if constexpr (shading_enable)
+ {
+ // TODO: vectorize multiply?
+ dr = dr.add32(GSVector4i(idl.dr_dy * y));
+ dg = dg.add32(GSVector4i(idl.dg_dy * y));
+ db = db.add32(GSVector4i(idl.db_dy * y));
+ }
+
+ if constexpr (texture_enable)
+ {
+ du = du.add32(GSVector4i(idl.du_dy * y));
+ dv = dv.add32(GSVector4i(idl.dv_dy * y));
+ }
+
+ const GSVector4i dither =
+ GSVector4i::load(&VECTOR_DITHER_MATRIX[static_cast(y) & 3][(static_cast(x) & 3) * 2]); // Row picked by y & 3; the start offset (x & 3) * 2 lines the pattern up with the first pixel, and is not recomputed as x advances by 4.
+
+ GSVector4i xvec = GSVector4i(x).add32(GSVector4i::cxpr(0, 1, 2, 3));
+ GSVector4i wvec = GSVector4i(w).sub32(GSVector4i::cxpr(1, 2, 3, 4)); // Goes negative in lanes past the end of the span.
+
+ for (s32 count = (w + 3) / 4; count > 0; --count) // One iteration per group of four pixels, rounding up.
+ {
+ // R000 | R000 | R000 | R000
+ // R0G0 | R0G0 | R0G0 | R0G0
+ const GSVector4i r = shading_enable ? dr.srl32() : dr;
+ const GSVector4i g =
+ shading_enable ? dg.srl32().sll32<16>() : dg; // get G into the correct position
+ const GSVector4i b = shading_enable ? db.srl32() : db;
+ const GSVector4i u = du.srl32();
+ const GSVector4i v = dv.srl32();
+
+ // TODO: no-sse4
+ const GSVector4i rg = r.blend16<0xAA>(g); // Odd 16-bit slots come from g, even slots from r.
+
+ // mask based on what's outside the span
+ auto preserve_mask = wvec.lt32(GSVector4i::zero());
+
+ // clip test, if all pixels are outside, skip
+ preserve_mask = preserve_mask | xvec.lt32(clip_left);
+ preserve_mask = preserve_mask | xvec.gt32(clip_right);
+ if (!preserve_mask.alltrue())
+ {
+ ShadePixel(
+ cmd, static_cast(x), static_cast(y), rg, b, u, v, preserve_mask, dither); // b is passed as the B/A pair.
+ }
+
+ x += 4;
+
+ xvec = xvec.add32(GSVector4i::cxpr(4));
+ wvec = wvec.sub32(GSVector4i::cxpr(4));
+
+ if constexpr (shading_enable)
+ {
+ dr = dr.add32(dr_dx);
+ dg = dg.add32(dg_dx);
+ db = db.add32(db_dx);
+ }
+
+ if constexpr (texture_enable)
+ {
+ du = du.add32(du_dx);
+ dv = dv.add32(dv_dx);
+ }
+ }
+}
+
+#endif // USE_VECTOR
+
template
static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBackendDrawPolygonCommand::Vertex* v0,
const GPUBackendDrawPolygonCommand::Vertex* v1, const GPUBackendDrawPolygonCommand::Vertex* v2)
{
+#if 0
+ const GPUBackendDrawPolygonCommand::Vertex* orig_v0 = v0;
+ const GPUBackendDrawPolygonCommand::Vertex* orig_v1 = v1;
+ const GPUBackendDrawPolygonCommand::Vertex* orig_v2 = v2;
+#endif
+
u32 core_vertex;
{
u32 cvtemp = 0;
@@ -480,6 +1071,10 @@ static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBacke
if (!CalcIDeltas(idl, v0, v1, v2))
return;
+#ifdef USE_VECTOR
+ BACKUP_VRAM();
+#endif
+
const GPUBackendDrawPolygonCommand::Vertex* vertices[3] = {v0, v1, v2};
i_group ig;
@@ -591,6 +1186,12 @@ static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBacke
}
}
}
+
+#ifdef USE_VECTOR
+ CHECK_VRAM(
+ GPU_SW_Rasterizer::DrawTriangleFunctions[shading_enable][texture_enable][raw_texture_enable][transparency_enable]
+ [dithering_enable](cmd, orig_v0, orig_v1, orig_v2));
+#endif
}
ALWAYS_INLINE_RELEASE static s64 LineDivide(s64 delta, s32 dk)
diff --git a/src/core/gpu_sw_rasterizer_avx2.cpp b/src/core/gpu_sw_rasterizer_avx2.cpp
new file mode 100644
index 000000000..c145ee98c
--- /dev/null
+++ b/src/core/gpu_sw_rasterizer_avx2.cpp
@@ -0,0 +1,12 @@
+// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#include "gpu_sw_rasterizer.h"
+
+#include "common/assert.h"
+#include "common/gsvector.h"
+
+namespace GPU_SW_Rasterizer::AVX2 { // Separate translation unit for the AVX2 rasterizer tables declared in gpu_sw_rasterizer.h.
+#define USE_VECTOR 1 // Selects the "#else // USE_VECTOR" code paths of the .inl.
+#include "gpu_sw_rasterizer.inl" // NOTE(review): presumably built with AVX2 code generation enabled for this file only -- confirm against the build files.
+}