From 724b1a7cc4d9e4c0c1bb01bc9e13475a89bbf4da Mon Sep 17 00:00:00 2001
From: Connor McLaughlin
Date: Thu, 31 Mar 2022 02:11:02 +1000
Subject: [PATCH] GPU/SW: SIMD-ify the software renderer

---
 src/core/core.vcxproj               |   6 +
 src/core/core.vcxproj.filters       |   1 +
 src/core/gpu_sw_rasterizer.cpp      |  45 ++
 src/core/gpu_sw_rasterizer.h        |  27 ++
 src/core/gpu_sw_rasterizer.inl      | 615 +++++++++++++++++++++++++++-
 src/core/gpu_sw_rasterizer_avx2.cpp |  12 +
 6 files changed, 699 insertions(+), 7 deletions(-)
 create mode 100644 src/core/gpu_sw_rasterizer_avx2.cpp

diff --git a/src/core/core.vcxproj b/src/core/core.vcxproj
index 29574d56a..b23b1cbf3 100644
--- a/src/core/core.vcxproj
+++ b/src/core/core.vcxproj
@@ -51,6 +51,12 @@
+
+      AdvancedVectorExtensions2
+      %(AdditionalOptions) -mavx2
+      true
+      NotUsing
+
 
diff --git a/src/core/core.vcxproj.filters b/src/core/core.vcxproj.filters
index 2a3bcaef8..d0dd29200 100644
--- a/src/core/core.vcxproj.filters
+++ b/src/core/core.vcxproj.filters
@@ -69,6 +69,7 @@
+
 
diff --git a/src/core/gpu_sw_rasterizer.cpp b/src/core/gpu_sw_rasterizer.cpp
index 59d5ade53..47177c0da 100644
--- a/src/core/gpu_sw_rasterizer.cpp
+++ b/src/core/gpu_sw_rasterizer.cpp
@@ -7,6 +7,8 @@
 #include "cpuinfo.h"
 
 #include "common/log.h"
+#include "common/string_util.h"
+
 Log_SetChannel(GPU_SW_Rasterizer);
 
 namespace GPU_SW_Rasterizer {
@@ -39,6 +41,15 @@ namespace GPU_SW_Rasterizer {
 #include "gpu_sw_rasterizer.inl"
 }
 
+// Default vector implementation definitions.
+#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
+namespace GPU_SW_Rasterizer::Vector {
+#define USE_VECTOR 1
+#include "gpu_sw_rasterizer.inl"
+#undef USE_VECTOR
+}
+#endif
+
 // Initialize with default implementation.
 namespace GPU_SW_Rasterizer {
 const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions = &DrawRectangleFunctions;
@@ -49,4 +58,40 @@ const DrawLineFunctionTable* SelectedDrawLineFunctions = &DrawLineFunctions;
 
 // Declare alternative implementations.
 void GPU_SW_Rasterizer::SelectImplementation()
 {
+  static bool selected = false;
+  if (selected)
+    return;
+
+  selected = true;
+
+#define SELECT_ALTERNATIVE_RASTERIZER(isa) \
+  do \
+  { \
+    INFO_LOG("* Using " #isa " software rasterizer implementation."); \
+    SelectedDrawRectangleFunctions = &isa::DrawRectangleFunctions; \
+    SelectedDrawTriangleFunctions = &isa::DrawTriangleFunctions; \
+    SelectedDrawLineFunctions = &isa::DrawLineFunctions; \
+  } while (0)
+
+#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
+  const char* use_isa = std::getenv("SW_USE_ISA");
+
+#ifdef CPU_ARCH_SSE
+  if (cpuinfo_has_x86_avx2() && (!use_isa || StringUtil::Strcasecmp(use_isa, "AVX2") == 0))
+  {
+    SELECT_ALTERNATIVE_RASTERIZER(AVX2);
+    return;
+  }
+#endif
+
+  if (!use_isa || StringUtil::Strcasecmp(use_isa, "Vector") == 0)
+  {
+    SELECT_ALTERNATIVE_RASTERIZER(Vector);
+    return;
+  }
+#endif
+
+  INFO_LOG("* Using scalar software rasterizer implementation.");
+
+#undef SELECT_ALTERNATIVE_RASTERIZER
 }
diff --git a/src/core/gpu_sw_rasterizer.h b/src/core/gpu_sw_rasterizer.h
index f3d95f0bf..f183e26d3 100644
--- a/src/core/gpu_sw_rasterizer.h
+++ b/src/core/gpu_sw_rasterizer.h
@@ -6,6 +6,7 @@
 #include "gpu.h"
 #include "gpu_types.h"
+#include "common/intrin.h"
 #include "common/types.h"
 #include
@@ -33,6 +34,11 @@ using DrawLineFunction = void (*)(const GPUBackendDrawLineCommand* cmd, const GP
                                   const GPUBackendDrawLineCommand::Vertex* p1);
 typedef const DrawLineFunction DrawLineFunctionTable[2][2][2];
 
+// Default implementation, compatible with all ISAs.
+extern const DrawRectangleFunctionTable DrawRectangleFunctions;
+extern const DrawTriangleFunctionTable DrawTriangleFunctions;
+extern const DrawLineFunctionTable DrawLineFunctions;
+
 // Current implementation, selected at runtime.
 extern const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions;
 extern const DrawTriangleFunctionTable* SelectedDrawTriangleFunctions;
@@ -60,4 +66,24 @@ ALWAYS_INLINE static DrawTriangleFunction GetDrawTriangleFunction(bool shading_e
     [u8(transparency_enable)][u8(dithering_enable)];
 }
 
+#define DECLARE_ALTERNATIVE_RASTERIZER(isa) \
+  namespace isa { \
+  extern const DrawRectangleFunctionTable DrawRectangleFunctions; \
+  extern const DrawTriangleFunctionTable DrawTriangleFunctions; \
+  extern const DrawLineFunctionTable DrawLineFunctions; \
+  }
+
+// Have to declare the symbols globally, because clang won't include them otherwise.
+#if defined(CPU_ARCH_SSE)
+#define ALTERNATIVE_RASTERIZER_LIST() DECLARE_ALTERNATIVE_RASTERIZER(AVX2)
+#else
+#define ALTERNATIVE_RASTERIZER_LIST()
+#endif
+
+ALTERNATIVE_RASTERIZER_LIST()
+
+#undef DECLARE_ALTERNATIVE_RASTERIZER
+
 } // namespace GPU_SW_Rasterizer
+
+// static u32 s_bad_counter = 0;
diff --git a/src/core/gpu_sw_rasterizer.inl b/src/core/gpu_sw_rasterizer.inl
index 15871b95b..acf6c5cfe 100644
--- a/src/core/gpu_sw_rasterizer.inl
+++ b/src/core/gpu_sw_rasterizer.inl
@@ -3,15 +3,55 @@
 
 #ifdef __INTELLISENSE__
 
+#include "common/gsvector.h"
 #include "gpu.h"
 #include
 
+#define USE_VECTOR 1
+#define GSVECTOR_HAS_SRLV 1
+
 extern GPU_SW_Rasterizer::DitherLUT g_dither_lut;
 
 namespace GPU_SW_Rasterizer {
 #endif
 
+// TODO: UpdateVRAM, FillVRAM, etc.
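Note on the mechanics before the rasterizer internals: the same gpu_sw_rasterizer.inl is compiled once per ISA, each time wrapped in a different namespace, and SelectImplementation() simply repoints the Selected* table pointers. Below is a compilable miniature of that pattern; every name in it is an illustrative stand-in, not DuckStation's (the real tables are multi-dimensional and declared in gpu_sw_rasterizer.h).

// Toy version of the per-ISA table dispatch. The real tables are indexed
// by [texture][raw_texture][transparency] etc.; a 2-entry table stands in.
#include <cstdio>

using DrawFunction = void (*)(int);
typedef const DrawFunction DrawFunctionTable[2];

namespace Scalar {
static void Draw(int x) { std::printf("scalar %d\n", x); }
constexpr DrawFunctionTable DrawFunctions = {&Draw, &Draw};
} // namespace Scalar

namespace AVX2 {
static void Draw(int x) { std::printf("avx2 %d\n", x); }
constexpr DrawFunctionTable DrawFunctions = {&Draw, &Draw};
} // namespace AVX2

// Starts out pointing at the baseline table, like SelectedDraw*Functions.
static DrawFunctionTable* SelectedDrawFunctions = &Scalar::DrawFunctions;

static void SelectImplementation(bool has_avx2)
{
  // The patch keys this off cpuinfo_has_x86_avx2() and the SW_USE_ISA
  // environment variable; a plain bool stands in for both here.
  if (has_avx2)
    SelectedDrawFunctions = &AVX2::DrawFunctions;
}

int main()
{
  SelectImplementation(/*has_avx2=*/true);
  (*SelectedDrawFunctions)[1](42); // dispatches to AVX2::Draw
}

Dispatching through a table pointer keeps the per-draw indexing identical for every backend; selection costs one pointer assignment at startup.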
+
+#ifdef USE_VECTOR
+#if 0
+static u16 s_vram_backup[VRAM_WIDTH * VRAM_HEIGHT];
+static u16 s_new_vram[VRAM_WIDTH * VRAM_HEIGHT];
+#define BACKUP_VRAM() \
+  do \
+  { \
+    std::memcpy(s_vram_backup, g_vram, sizeof(g_vram)); \
+    s_bad_counter++; \
+  } while (0)
+#define CHECK_VRAM(drawer) \
+  do \
+  { \
+    std::memcpy(s_new_vram, g_vram, sizeof(g_vram)); \
+    std::memcpy(g_vram, s_vram_backup, sizeof(g_vram)); \
+ \
+    drawer; \
+    for (u32 vidx = 0; vidx < (VRAM_WIDTH * VRAM_HEIGHT); vidx++) \
+    { \
+      if (s_new_vram[vidx] != g_vram[vidx]) \
+      { \
+        fprintf(stderr, "[%u] Mismatch at %d,%d, expected %04x got %04x\n", s_bad_counter, (vidx % VRAM_WIDTH), \
+                (vidx / VRAM_WIDTH), g_vram[vidx], s_new_vram[vidx]); \
+        AssertMsg(false, "Mismatch"); \
+      } \
+    } \
+    /*Assert(std::memcmp(g_vram, s_new_vram, sizeof(g_vram)) == 0)*/ \
+  } while (0)
+#else
+#define BACKUP_VRAM()
+#define CHECK_VRAM(drawer)
+#endif
+#endif
+
 namespace {
 enum {
@@ -49,32 +89,33 @@ struct line_fxp_step
 };
 } // namespace
 
-ALWAYS_INLINE_RELEASE static u16 GetPixel(const u32 x, const u32 y)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16 GetPixel(const u32 x, const u32 y)
 {
   return g_vram[VRAM_WIDTH * y + x];
 }
-ALWAYS_INLINE_RELEASE static u16* GetPixelPtr(const u32 x, const u32 y)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16* GetPixelPtr(const u32 x, const u32 y)
 {
   return &g_vram[VRAM_WIDTH * y + x];
 }
-ALWAYS_INLINE_RELEASE static void SetPixel(const u32 x, const u32 y, const u16 value)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static void SetPixel(const u32 x, const u32 y, const u16 value)
 {
   g_vram[VRAM_WIDTH * y + x] = value;
 }
 
-ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8> UnpackTexcoord(u16 texcoord)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8> UnpackTexcoord(u16 texcoord)
 {
   return std::make_tuple(static_cast<u8>(texcoord), static_cast<u8>(texcoord >> 8));
 }
 
-ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8, u8> UnpackColorRGB24(u32 rgb24)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8, u8> UnpackColorRGB24(u32 rgb24)
 {
   return std::make_tuple(static_cast<u8>(rgb24), static_cast<u8>(rgb24 >> 8), static_cast<u8>(rgb24 >> 16));
 }
 
 template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
-ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y, u8 color_r, u8 color_g,
-                                             u8 color_b, u8 texcoord_x, u8 texcoord_y)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y,
+                                                              u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x,
+                                                              u8 texcoord_y)
 {
   u16 color;
   if constexpr (texture_enable)
@@ -215,6 +256,8 @@ ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u
   SetPixel(static_cast<u32>(x), static_cast<u32>(y), color | cmd->params.GetMaskOR());
 }
 
+#ifndef USE_VECTOR
+
 template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
 static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
 {
@@ -249,6 +292,413 @@ static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
   }
 }
 
+#else // USE_VECTOR
+
+ALWAYS_INLINE_RELEASE static GSVector4i GatherVector(GSVector4i coord_x, GSVector4i coord_y)
+{
+  GSVector4i offsets = coord_y.sll32<11>();    // y * 2048 (1024 * sizeof(pixel))
+  offsets = offsets.add32(coord_x.sll32<1>()); // x * 2 (x * sizeof(pixel))
+
+  const u32 o0 = offsets.extract32<0>();
+  const u32 o1 = offsets.extract32<1>();
+  const u32 o2 = offsets.extract32<2>();
+  const u32 o3 = offsets.extract32<3>();
+
+  // TODO: split in two, merge, maybe could be zx loaded instead..
+  u16 p0, p1, p2, p3;
+  std::memcpy(&p0, reinterpret_cast<const u8*>(g_vram) + o0, sizeof(p0));
+  std::memcpy(&p1, reinterpret_cast<const u8*>(g_vram) + o1, sizeof(p1));
+  std::memcpy(&p2, reinterpret_cast<const u8*>(g_vram) + o2, sizeof(p2));
+  std::memcpy(&p3, reinterpret_cast<const u8*>(g_vram) + o3, sizeof(p3));
+  GSVector4i pixels = GSVector4i::load(p0);
+  pixels = pixels.insert16<2>(p1);
+  pixels = pixels.insert16<4>(p2);
+  pixels = pixels.insert16<6>(p3);
+
+  return pixels;
+}
+
+ALWAYS_INLINE_RELEASE static GSVector4i GatherCLUTVector(GSVector4i indices)
+{
+  const GSVector4i offsets = indices.sll32<1>(); // x * 2 (x * sizeof(pixel))
+  const u32 o0 = offsets.extract32<0>();
+  const u32 o1 = offsets.extract32<1>();
+  const u32 o2 = offsets.extract32<2>();
+  const u32 o3 = offsets.extract32<3>();
+
+  // TODO: split in two, merge, maybe could be zx loaded instead..
+  u16 p0, p1, p2, p3;
+  std::memcpy(&p0, reinterpret_cast<const u8*>(g_gpu_clut) + o0, sizeof(p0));
+  std::memcpy(&p1, reinterpret_cast<const u8*>(g_gpu_clut) + o1, sizeof(p1));
+  std::memcpy(&p2, reinterpret_cast<const u8*>(g_gpu_clut) + o2, sizeof(p2));
+  std::memcpy(&p3, reinterpret_cast<const u8*>(g_gpu_clut) + o3, sizeof(p3));
+  GSVector4i pixels = GSVector4i::load(p0);
+  pixels = pixels.insert16<2>(p1);
+  pixels = pixels.insert16<4>(p2);
+  pixels = pixels.insert16<6>(p3);
+
+  return pixels;
+}
+
+ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y)
+{
+  if (x <= (VRAM_WIDTH - 4))
+  {
+    return GSVector4i::loadl(&g_vram[y * VRAM_WIDTH + x]).u16to32();
+  }
+  else
+  {
+    const u16* line = &g_vram[y * VRAM_WIDTH];
+    GSVector4i pixels = GSVector4i(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<2>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<4>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<6>(line[x & VRAM_WIDTH_MASK]);
+    return pixels;
+  }
+}
+
+ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector4i color)
+{
+  if (x <= (VRAM_WIDTH - 4))
+  {
+    GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], color);
+  }
+  else
+  {
+    u16* line = &g_vram[y * VRAM_WIDTH];
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<0>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<1>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<2>());
+    line[x & VRAM_WIDTH_MASK] = Truncate16(color.extract16<3>());
+  }
+}
+
+ALWAYS_INLINE_RELEASE static void RGB5A1ToRG_BA(GSVector4i rgb5a1, GSVector4i& rg, GSVector4i& ba)
+{
+  rg = rgb5a1 & GSVector4i::cxpr(0x1F);                     // R | R | R | R
+  rg = rg | (rgb5a1 & GSVector4i::cxpr(0x3E0)).sll32<11>(); // R0G0 | R0G0 | R0G0 | R0G0
+  ba = rgb5a1.srl32<10>() & GSVector4i::cxpr(0x1F);         // B | B | B | B
+  ba = ba | (rgb5a1 & GSVector4i::cxpr(0x8000)).sll32<1>(); // B0A0 | B0A0 | B0A0 | B0A0
+}
+
+ALWAYS_INLINE_RELEASE static GSVector4i RG_BAToRGB5A1(GSVector4i rg, GSVector4i ba)
+{
+  GSVector4i res;
+
+  res = rg & GSVector4i::cxpr(0x1F);                       // R | R | R | R
+  res = res | (rg.srl32<11>() & GSVector4i::cxpr(0x3E0));  // RG | RG | RG | RG
+  res = res | ((ba & GSVector4i::cxpr(0x1F)).sll32<10>()); // RGB | RGB | RGB | RGB
+  res = res | ba.srl32<16>().sll32<15>();                  // RGBA | RGBA | RGBA | RGBA
+
+  return res;
+}
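RGB5A1ToRG_BA() and RG_BAToRGB5A1() widen each 16-bit 1:5:5:5 pixel so that R/G and B/A each occupy a 16-bit half of a 32-bit lane, giving the per-component multiplies and adds headroom without bleeding into a neighbouring field. A scalar model of what happens in one lane; the helper names here are hypothetical, but the bit positions match the vector code above:

// Scalar model of the RG/BA split used by the vector rasterizer. One u32
// models one GSVector4i lane.
#include <cassert>
#include <cstdint>

static void SplitRGB5A1(uint32_t rgb5a1, uint32_t& rg, uint32_t& ba)
{
  rg = rgb5a1 & 0x1F;           // R stays in bits 0..4
  rg |= (rgb5a1 & 0x3E0) << 11; // G moves from bits 5..9 to bits 16..20
  ba = (rgb5a1 >> 10) & 0x1F;   // B moves from bits 10..14 to bits 0..4
  ba |= (rgb5a1 & 0x8000) << 1; // A (mask bit 15) moves to bit 16
}

static uint32_t MergeRGB5A1(uint32_t rg, uint32_t ba)
{
  uint32_t res = rg & 0x1F;   // R back to bits 0..4
  res |= (rg >> 11) & 0x3E0;  // G back to bits 5..9
  res |= (ba & 0x1F) << 10;   // B back to bits 10..14
  res |= (ba >> 16) << 15;    // A back to bit 15
  return res;
}

int main()
{
  // The split/merge pair is lossless for every 16-bit pixel value.
  for (uint32_t pixel = 0; pixel <= 0xFFFF; pixel++)
  {
    uint32_t rg, ba;
    SplitRGB5A1(pixel, rg, ba);
    assert(MergeRGB5A1(rg, ba) == pixel);
  }
}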
+
+// Color repeated twice for RG packing, then duplicated so we can load based on the X offset.
+static constexpr s16 VECTOR_DITHER_MATRIX[4][16] = {
+#define P(m, n) static_cast<s16>(DITHER_MATRIX[m][n]), static_cast<s16>(DITHER_MATRIX[m][n])
+#define R(m) P(m, 0), P(m, 1), P(m, 2), P(m, 3), P(m, 0), P(m, 1), P(m, 2), P(m, 3)
+
+  {R(0)}, {R(1)}, {R(2)}, {R(3)}
+
+#undef R
+#undef P
+};
+
+template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
+ALWAYS_INLINE_RELEASE static void
+ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vertex_color_rg,
+           GSVector4i vertex_color_ba, GSVector4i texcoord_x, GSVector4i texcoord_y, GSVector4i preserve_mask,
+           GSVector4i dither)
+{
+  static constinit GSVector4i coord_mask_x = GSVector4i::cxpr(VRAM_WIDTH_MASK);
+  static constinit GSVector4i coord_mask_y = GSVector4i::cxpr(VRAM_HEIGHT_MASK);
+
+  GSVector4i color;
+
+  if constexpr (texture_enable)
+  {
+    // Apply texture window
+    texcoord_x = (texcoord_x & GSVector4i(cmd->window.and_x)) | GSVector4i(cmd->window.or_x);
+    texcoord_y = (texcoord_y & GSVector4i(cmd->window.and_y)) | GSVector4i(cmd->window.or_y);
+
+    const GSVector4i base_x = GSVector4i(cmd->draw_mode.GetTexturePageBaseX());
+    const GSVector4i base_y = GSVector4i(cmd->draw_mode.GetTexturePageBaseY());
+    const GSVector4i palette_x = GSVector4i(cmd->palette.GetXBase());
+    const GSVector4i palette_y = GSVector4i(cmd->palette.GetYBase());
+
+    texcoord_y = base_y.add32(texcoord_y) & coord_mask_y;
+
+    GSVector4i texture_color;
+    switch (cmd->draw_mode.texture_mode)
+    {
+      case GPUTextureMode::Palette4Bit:
+      {
+        GSVector4i load_texcoord_x = texcoord_x.srl32<2>();
+        load_texcoord_x = base_x.add32(load_texcoord_x);
+        load_texcoord_x = load_texcoord_x & coord_mask_x;
+
+        // todo: sse4 path
+        GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(3)).sll32<2>();
+        GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y);
+#ifdef GSVECTOR_HAS_SRLV
+        palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0x0F);
+#else
+        Assert(false && "Fixme");
+#endif
+
+        texture_color = GatherCLUTVector(palette_indices);
+      }
+      break;
+
+      case GPUTextureMode::Palette8Bit:
+      {
+        GSVector4i load_texcoord_x = texcoord_x.srl32<1>();
+        load_texcoord_x = base_x.add32(load_texcoord_x);
+        load_texcoord_x = load_texcoord_x & coord_mask_x;
+
+        GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(1)).sll32<3>();
+        GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y);
+#ifdef GSVECTOR_HAS_SRLV
+        palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0xFF);
+#else
+        Assert(false && "Fixme");
+#endif
+
+        texture_color = GatherCLUTVector(palette_indices);
+      }
+      break;
+
+      default:
+      {
+        texcoord_x = base_x.add32(texcoord_x);
+        texcoord_x = texcoord_x & coord_mask_x;
+        texture_color = GatherVector(texcoord_x, texcoord_y);
+      }
+      break;
+    }
+
+    // check for zero texture colour across the 4 pixels, early out if so
+    const GSVector4i texture_transparent_mask = texture_color.eq32(GSVector4i::zero());
+    if (texture_transparent_mask.alltrue())
+      return;
+
+    preserve_mask = preserve_mask | texture_transparent_mask;
+
+    if constexpr (raw_texture_enable)
+    {
+      color = texture_color;
+    }
+    else
+    {
+      GSVector4i trg, tba;
+      RGB5A1ToRG_BA(texture_color, trg, tba);
+
+      // now we have both the texture and vertex color in RG/BA pairs, for 4 pixels, which we can multiply
+      GSVector4i rg = trg.mul16l(vertex_color_rg);
+      GSVector4i ba = tba.mul16l(vertex_color_ba);
+
+      // TODO: Dither
+      // Convert to 5bit.
+      if constexpr (dithering_enable)
+      {
+        rg = rg.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+        ba = ba.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+      }
+      else
+      {
+        rg = rg.sra16<7>();
+        ba = ba.sra16<7>();
+      }
+
+      // Bit15 gets passed through as-is.
+      ba = ba.blend16<0xaa>(tba);
+
+      // Clamp to 5bit.
+      static constexpr GSVector4i colclamp = GSVector4i::cxpr16(0x1F);
+      rg = rg.min_u16(colclamp);
+      ba = ba.min_u16(colclamp);
+
+      // And interleave back to 16bpp.
+      color = RG_BAToRGB5A1(rg, ba);
+    }
+  }
+  else
+  {
+    // Non-textured transparent polygons don't set bit 15, but are treated as transparent.
+    if constexpr (dithering_enable)
+    {
+      GSVector4i rg = vertex_color_rg.add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+      GSVector4i ba = vertex_color_ba.add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
+
+      // Clamp to 5bit. We use a 32-bit splat for BA so that the A lanes get clamped to zero.
+      rg = rg.min_u16(GSVector4i::cxpr16(0x1F));
+      ba = ba.min_u16(GSVector4i::cxpr(0x1F));
+
+      // And interleave back to 16bpp.
+      color = RG_BAToRGB5A1(rg, ba);
+    }
+    else
+    {
+      // Note that bit15 is set to 0 here, which the shift will do.
+      const GSVector4i rg = vertex_color_rg.srl16<3>();
+      const GSVector4i ba = vertex_color_ba.srl16<3>();
+      color = RG_BAToRGB5A1(rg, ba);
+    }
+  }
+
+  GSVector4i bg_color = LoadVector(start_x, y);
+
+  if constexpr (transparency_enable)
+  {
+    [[maybe_unused]] GSVector4i transparent_mask;
+    if constexpr (texture_enable)
+    {
+      // Compute transparent_mask, ffff per lane if transparent otherwise 0000
+      transparent_mask = color.sra16<15>();
+    }
+
+    // TODO: We don't need to OR color here with 0x8000 for textures.
+    // 0x8000 is added to match the scalar path.
+
+    GSVector4i blended_color;
+    switch (cmd->draw_mode.transparency_mode)
+    {
+      case GPUTransparencyMode::HalfBackgroundPlusHalfForeground:
+      {
+        const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u);
+        const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u);
+        const GSVector4i res =
+          fg_bits.add32(bg_bits).sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x0421u)).srl32<1>();
+        blended_color = res & GSVector4i::cxpr(0xffff);
+      }
+      break;
+
+      case GPUTransparencyMode::BackgroundPlusForeground:
+      {
+        const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u);
+        const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu);
+        const GSVector4i sum = fg_bits.add32(bg_bits);
+        const GSVector4i carry =
+          (sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u))) & GSVector4i::cxpr(0x8420u);
+        const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
+        blended_color = res & GSVector4i::cxpr(0xffff);
+      }
+      break;
+
+      case GPUTransparencyMode::BackgroundMinusForeground:
+      {
+        const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u);
+        const GSVector4i fg_bits = color & GSVector4i::cxpr(0x7FFFu);
+        const GSVector4i diff = bg_bits.sub32(fg_bits).add32(GSVector4i::cxpr(0x108420u));
+        const GSVector4i borrow =
+          diff.sub32((bg_bits ^ fg_bits) & GSVector4i::cxpr(0x108420u)) & GSVector4i::cxpr(0x108420u);
+        const GSVector4i res = diff.sub32(borrow) & borrow.sub32(borrow.srl32<5>());
+        blended_color = res & GSVector4i::cxpr(0xffff);
+      }
+      break;
+
+      case GPUTransparencyMode::BackgroundPlusQuarterForeground:
+      default:
+      {
+        const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu);
+        const GSVector4i fg_bits =
+          ((color | GSVector4i::cxpr(0x8000)).srl32<2>() & GSVector4i::cxpr(0x1CE7u)) | GSVector4i::cxpr(0x8000u);
+        const GSVector4i sum = fg_bits.add32(bg_bits);
+        const GSVector4i
+          carry = sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u)) & GSVector4i::cxpr(0x8420u);
+        const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
+        blended_color = res & GSVector4i::cxpr(0xffff);
+      }
+      break;
+    }
+
+    // select blended pixels for transparent pixels, otherwise consider opaque
+    // TODO: SSE2
+    if constexpr (texture_enable)
+      color = color.blend8(blended_color, transparent_mask);
+    else
+      color = blended_color & GSVector4i::cxpr(0x7fff);
+  }
+
+  // TODO: lift out to parent?
+  const GSVector4i mask_and = GSVector4i(cmd->params.GetMaskAND());
+  const GSVector4i mask_or = GSVector4i(cmd->params.GetMaskOR());
+
+  GSVector4i mask_bits_set = bg_color & mask_and; // 8000 if masked else 0000
+  mask_bits_set = mask_bits_set.sra16<15>();      // ffff if masked else 0000
+  preserve_mask = preserve_mask | mask_bits_set;  // ffff if preserved else 0000
+
+  bg_color = bg_color & preserve_mask;
+  color = (color | mask_or).andnot(preserve_mask);
+  color = color | bg_color;
+
+  const GSVector4i packed_color = color.pu32();
+  StoreVector(start_x, y, packed_color);
+}
+
+template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
+static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
+{
+  const s32 origin_x = cmd->x;
+  const s32 origin_y = cmd->y;
+
+  const GSVector4i rgba = GSVector4i(cmd->color); // RGBA | RGBA | RGBA | RGBA
+  GSVector4i rg = rgba.xxxxl();                   // RGRG | RGRG | RGRG | RGRG
+  GSVector4i ba = rgba.yyyyl();                   // BABA | BABA | BABA | BABA
+  rg = rg.u8to16();                               // R0G0 | R0G0 | R0G0 | R0G0
+  ba = ba.u8to16();                               // B0A0 | B0A0 | B0A0 | B0A0
+
+  const GSVector4i texcoord_x = GSVector4i(cmd->texcoord & 0xFF).add32(GSVector4i::cxpr(0, 1, 2, 3));
+  GSVector4i texcoord_y = GSVector4i(cmd->texcoord >> 8);
+
+  const GSVector4i clip_left = GSVector4i(g_drawing_area.left);
+  const GSVector4i clip_right = GSVector4i(g_drawing_area.right);
+  const u32 width = cmd->width;
+
+  BACKUP_VRAM();
+
+  for (u32 offset_y = 0; offset_y < cmd->height; offset_y++)
+  {
+    const s32 y = origin_y + static_cast<s32>(offset_y);
+    if (y < static_cast<s32>(g_drawing_area.top) || y > static_cast<s32>(g_drawing_area.bottom) ||
+        (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u)))
+    {
+      continue;
+    }
+
+    GSVector4i row_texcoord_x = texcoord_x;
+    GSVector4i xvec = GSVector4i(origin_x).add32(GSVector4i::cxpr(0, 1, 2, 3));
+    GSVector4i wvec = GSVector4i(width).sub32(GSVector4i::cxpr(1, 2, 3, 4));
+
+    for (u32 offset_x = 0; offset_x < width; offset_x += 4)
+    {
+      const s32 x = origin_x + static_cast<s32>(offset_x);
+
+      // width test
+      GSVector4i preserve_mask = wvec.lt32(GSVector4i::zero());
+
+      // clip test, if all pixels are outside, skip
+      preserve_mask = preserve_mask | xvec.lt32(clip_left);
+      preserve_mask = preserve_mask | xvec.gt32(clip_right);
+      if (!preserve_mask.alltrue())
+      {
+        ShadePixel<texture_enable, raw_texture_enable, transparency_enable, false>(
+          cmd, x, y, rg, ba, row_texcoord_x, texcoord_y, preserve_mask, GSVector4i::zero());
+      }
+
+      xvec = xvec.add32(GSVector4i::cxpr(4));
+      wvec = wvec.sub32(GSVector4i::cxpr(4));
+
+      if constexpr (texture_enable)
+        row_texcoord_x = row_texcoord_x.add32(GSVector4i::cxpr(4)) & GSVector4i::cxpr(0xFF);
+    }
+
+    if constexpr (texture_enable)
+      texcoord_y = texcoord_y.add32(GSVector4i::cxpr(1)) & GSVector4i::cxpr(0xFF);
+  }
+
+  CHECK_VRAM(GPU_SW_Rasterizer::DrawRectangleFunctions[texture_enable][raw_texture_enable][transparency_enable](cmd));
+}
+
+#endif // USE_VECTOR
+
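The transparency arithmetic above blends all three 5-bit fields of four packed pixels without unpacking them, relying on the identity a + b = (a ^ b) + 2(a & b): clearing the per-field low bits of the XOR makes each field's contribution to the sum even, so the final halving shift cannot drag a bit across a field boundary, and the 32-bit lanes absorb the carry out of bit 15. A scalar model of the HalfBackgroundPlusHalfForeground case follows; AverageRGB555 is a hypothetical helper written for this note, not the patch's code:

// Scalar model of the carry-free 50%/50% blend on a packed 1:5:5:5 pixel.
#include <cassert>
#include <cstdint>

static uint32_t AverageRGB555(uint32_t fg, uint32_t bg)
{
  fg |= 0x8000u; // set both mask bits, as the vector path does
  bg |= 0x8000u;
  // 0x0421 selects the lowest bit of the R, G and B fields. Subtracting it
  // where fg and bg differ makes every field's sum even before halving.
  return ((fg + bg) - ((fg ^ bg) & 0x0421u)) >> 1;
}

int main()
{
  // Check against the per-component reference for a sample of inputs.
  for (uint32_t a = 0; a < 0x8000u; a += 97)
  {
    for (uint32_t b = 0; b < 0x8000u; b += 89)
    {
      const uint32_t avg = AverageRGB555(a, b) & 0x7FFFu;
      for (int shift = 0; shift <= 10; shift += 5)
      {
        const uint32_t ca = (a >> shift) & 0x1F, cb = (b >> shift) & 0x1F;
        assert(((avg >> shift) & 0x1F) == (ca + cb) / 2);
      }
    }
  }
}

The other cases play the same game in reverse: the 0x8421/0x8420 and 0x108420 masks detect per-field carries and borrows so additive and subtractive blends can saturate each field independently.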
 //////////////////////////////////////////////////////////////////////////
 // Polygon and line rasterization ported from Mednafen
 //////////////////////////////////////////////////////////////////////////
@@ -355,6 +805,8 @@ ALWAYS_INLINE_RELEASE static void AddIDeltas_DY(i_group& ig, const i_deltas& idl
   }
 }
 
+#ifndef USE_VECTOR
+
 template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
          bool dithering_enable>
 ALWAYS_INLINE_RELEASE static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound,
@@ -401,11 +853,150 @@ ALWAYS_INLINE_RELEASE static void DrawSpan(const GPUBackendDrawPolygonCommand* c
   } while (--w > 0);
 }
 
+#else // USE_VECTOR
+
+template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
+         bool dithering_enable>
+ALWAYS_INLINE_RELEASE static void DrawSpan(const GPUBackendDrawCommand* cmd, s32 y, s32 x_start, s32 x_bound,
+                                           i_group ig, const i_deltas& idl)
+{
+  if (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u))
+    return;
+
+  s32 x_ig_adjust = x_start;
+  s32 w = x_bound - x_start;
+  s32 x = TruncateGPUVertexPosition(x_start);
+
+  if (x < static_cast<s32>(g_drawing_area.left))
+  {
+    s32 delta = static_cast<s32>(g_drawing_area.left) - x;
+    x_ig_adjust += delta;
+    x += delta;
+    w -= delta;
+  }
+
+  if ((x + w) > (static_cast<s32>(g_drawing_area.right) + 1))
+    w = static_cast<s32>(g_drawing_area.right) + 1 - x;
+
+  if (w <= 0)
+    return;
+
+  // TODO: Precompute.
+
+  const auto clip_left = GSVector4i(g_drawing_area.left);
+  const auto clip_right = GSVector4i(g_drawing_area.right);
+
+  const GSVector4i dr_dx = GSVector4i(idl.dr_dx * 4);
+  const GSVector4i dg_dx = GSVector4i(idl.dg_dx * 4);
+  const GSVector4i db_dx = GSVector4i(idl.db_dx * 4);
+  const GSVector4i du_dx = GSVector4i(idl.du_dx * 4);
+  const GSVector4i dv_dx = GSVector4i(idl.dv_dx * 4);
+
+  // TODO: vectorize
+  const GSVector4i dr_dx_offset = GSVector4i(0, idl.dr_dx, idl.dr_dx * 2, idl.dr_dx * 3);
+  const GSVector4i dg_dx_offset = GSVector4i(0, idl.dg_dx, idl.dg_dx * 2, idl.dg_dx * 3);
+  const GSVector4i db_dx_offset = GSVector4i(0, idl.db_dx, idl.db_dx * 2, idl.db_dx * 3);
+  const GSVector4i du_dx_offset = GSVector4i(0, idl.du_dx, idl.du_dx * 2, idl.du_dx * 3);
+  const GSVector4i dv_dx_offset = GSVector4i(0, idl.dv_dx, idl.dv_dx * 2, idl.dv_dx * 3);
+
+  GSVector4i dr, dg, db;
+  if constexpr (shading_enable)
+  {
+    dr = GSVector4i(ig.r + idl.dr_dx * x_ig_adjust).add32(dr_dx_offset);
+    dg = GSVector4i(ig.g + idl.dg_dx * x_ig_adjust).add32(dg_dx_offset);
+    db = GSVector4i(ig.b + idl.db_dx * x_ig_adjust).add32(db_dx_offset);
+  }
+  else
+  {
+    // precompute for flat shading
+    dr = GSVector4i(ig.r >> (COORD_FBS + COORD_POST_PADDING));
+    dg = GSVector4i((ig.g >> (COORD_FBS + COORD_POST_PADDING)) << 16);
+    db = GSVector4i(ig.b >> (COORD_FBS + COORD_POST_PADDING));
+  }
+
+  GSVector4i du = GSVector4i(ig.u + idl.du_dx * x_ig_adjust).add32(du_dx_offset);
+  GSVector4i dv = GSVector4i(ig.v + idl.dv_dx * x_ig_adjust).add32(dv_dx_offset);
+
+  // TODO: Move to caller.
+  if constexpr (shading_enable)
+  {
+    // TODO: vectorize multiply?
+    dr = dr.add32(GSVector4i(idl.dr_dy * y));
+    dg = dg.add32(GSVector4i(idl.dg_dy * y));
+    db = db.add32(GSVector4i(idl.db_dy * y));
+  }
+
+  if constexpr (texture_enable)
+  {
+    du = du.add32(GSVector4i(idl.du_dy * y));
+    dv = dv.add32(GSVector4i(idl.dv_dy * y));
+  }
+
+  const GSVector4i dither =
+    GSVector4i::load<false>(&VECTOR_DITHER_MATRIX[static_cast<u32>(y) & 3][(static_cast<u32>(x) & 3) * 2]);
+
+  GSVector4i xvec = GSVector4i(x).add32(GSVector4i::cxpr(0, 1, 2, 3));
+  GSVector4i wvec = GSVector4i(w).sub32(GSVector4i::cxpr(1, 2, 3, 4));
+
+  for (s32 count = (w + 3) / 4; count > 0; --count)
+  {
+    // R000 | R000 | R000 | R000
+    // R0G0 | R0G0 | R0G0 | R0G0
+    const GSVector4i r = shading_enable ? dr.srl32<COORD_FBS + COORD_POST_PADDING>() : dr;
+    const GSVector4i g =
+      shading_enable ? dg.srl32<COORD_FBS + COORD_POST_PADDING>().sll32<16>() : dg; // get G into the correct position
+    const GSVector4i b = shading_enable ? db.srl32<COORD_FBS + COORD_POST_PADDING>() : db;
+    const GSVector4i u = du.srl32<COORD_FBS + COORD_POST_PADDING>();
+    const GSVector4i v = dv.srl32<COORD_FBS + COORD_POST_PADDING>();
+
+    // TODO: no-sse4
+    const GSVector4i rg = r.blend16<0xAA>(g);
+
+    // mask based on what's outside the span
+    auto preserve_mask = wvec.lt32(GSVector4i::zero());
+
+    // clip test, if all pixels are outside, skip
+    preserve_mask = preserve_mask | xvec.lt32(clip_left);
+    preserve_mask = preserve_mask | xvec.gt32(clip_right);
+    if (!preserve_mask.alltrue())
+    {
+      ShadePixel<texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
+        cmd, static_cast<u32>(x), static_cast<u32>(y), rg, b, u, v, preserve_mask, dither);
+    }
+
+    x += 4;
+
+    xvec = xvec.add32(GSVector4i::cxpr(4));
+    wvec = wvec.sub32(GSVector4i::cxpr(4));
+
+    if constexpr (shading_enable)
+    {
+      dr = dr.add32(dr_dx);
+      dg = dg.add32(dg_dx);
+      db = db.add32(db_dx);
+    }
+
+    if constexpr (texture_enable)
+    {
+      du = du.add32(du_dx);
+      dv = dv.add32(dv_dx);
+    }
+  }
+}
+
+#endif // USE_VECTOR
+
 template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
          bool dithering_enable>
 static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBackendDrawPolygonCommand::Vertex* v0,
                          const GPUBackendDrawPolygonCommand::Vertex* v1, const GPUBackendDrawPolygonCommand::Vertex* v2)
 {
+#if 0
+  const GPUBackendDrawPolygonCommand::Vertex* orig_v0 = v0;
+  const GPUBackendDrawPolygonCommand::Vertex* orig_v1 = v1;
+  const GPUBackendDrawPolygonCommand::Vertex* orig_v2 = v2;
+#endif
+
   u32 core_vertex;
   {
     u32 cvtemp = 0;
@@ -480,6 +1071,10 @@ static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBacke
   if (!CalcIDeltas(idl, v0, v1, v2))
     return;
 
+#ifdef USE_VECTOR
+  BACKUP_VRAM();
+#endif
+
   const GPUBackendDrawPolygonCommand::Vertex* vertices[3] = {v0, v1, v2};
 
   i_group ig;
@@ -591,6 +1186,12 @@ static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBacke
       }
     }
   }
+
+#ifdef USE_VECTOR
+  CHECK_VRAM(
+    GPU_SW_Rasterizer::DrawTriangleFunctions[shading_enable][texture_enable][raw_texture_enable][transparency_enable]
+                                            [dithering_enable](cmd, orig_v0, orig_v1, orig_v2));
+#endif
 }
 
 ALWAYS_INLINE_RELEASE static s64 LineDivide(s64 delta, s32 dk)
diff --git a/src/core/gpu_sw_rasterizer_avx2.cpp b/src/core/gpu_sw_rasterizer_avx2.cpp
new file mode 100644
index 000000000..c145ee98c
--- /dev/null
+++ b/src/core/gpu_sw_rasterizer_avx2.cpp
@@ -0,0 +1,12 @@
+// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#include "gpu_sw_rasterizer.h"
+
+#include "common/assert.h"
+#include "common/gsvector.h"
+
+namespace GPU_SW_Rasterizer::AVX2 {
+#define USE_VECTOR 1
+#include "gpu_sw_rasterizer.inl"
+}
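One loose end in the patch: the Palette4Bit/Palette8Bit paths need a per-lane variable right shift (srlv32, AVX2's VPSRLVD), which is gated behind GSVECTOR_HAS_SRLV; on targets without it the code currently stops at Assert(false && "Fixme"). A possible SSE2-level fallback, sketched here with raw intrinsics rather than the GSVector4i wrapper (an assumption of this note, not something the patch provides, and it accepts a round trip through memory):

// Emulate a per-lane variable 32-bit right shift on plain SSE2.
#include <cstdint>
#include <emmintrin.h>

static __m128i srlv32_fallback(__m128i value, __m128i shift)
{
  alignas(16) uint32_t v[4], s[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(v), value);
  _mm_store_si128(reinterpret_cast<__m128i*>(s), shift);
  for (int i = 0; i < 4; i++)
    v[i] >>= s[i]; // shift amounts must be < 32; the palette paths use 0-12
  return _mm_load_si128(reinterpret_cast<const __m128i*>(v));
}

int main()
{
  const __m128i value = _mm_setr_epi32(0x10, 0x100, 0x1000, 0x10000);
  const __m128i shift = _mm_setr_epi32(4, 8, 12, 16);
  alignas(16) uint32_t out[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(out), srlv32_fallback(value, shift));
  return (out[0] == 1 && out[3] == 1) ? 0 : 1; // every lane shifts down to 1
}

Since the shift amounts in the rasterizer are always one of a few constants per draw (0/4/8/12 for 4-bit CLUTs, 0/8 for 8-bit), a shuffle- or multiply-based trick could avoid the memory round trip; the scalar spill just keeps the sketch simple.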