GPU/SW: SIMD-ify the software renderer

This commit is contained in:
Connor McLaughlin 2022-03-31 02:11:02 +10:00 committed by Stenzek
parent 7386ad23b8
commit 724b1a7cc4
No known key found for this signature in database
6 changed files with 699 additions and 7 deletions

View File

@ -51,6 +51,12 @@
<ClCompile Include="gpu_sw.cpp" />
<ClCompile Include="gpu_sw_backend.cpp" />
<ClCompile Include="gpu_sw_rasterizer.cpp" />
<ClCompile Include="gpu_sw_rasterizer_avx2.cpp">
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
<AdditionalOptions Condition="$(Configuration.Contains(Clang))">%(AdditionalOptions) -mavx2</AdditionalOptions>
<ExcludedFromBuild Condition="'$(Platform)'!='x64'">true</ExcludedFromBuild>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="gte.cpp" />
<ClCompile Include="dma.cpp" />
<ClCompile Include="gpu.cpp" />

View File

@ -69,6 +69,7 @@
<ClCompile Include="pine_server.cpp" />
<ClCompile Include="gdb_server.cpp" />
<ClCompile Include="gpu_sw_rasterizer.cpp" />
<ClCompile Include="gpu_sw_rasterizer_avx2.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="types.h" />

View File

@ -7,6 +7,8 @@
#include "cpuinfo.h"
#include "common/log.h"
#include "common/string_util.h"
Log_SetChannel(GPU_SW_Rasterizer);
namespace GPU_SW_Rasterizer {
@ -39,6 +41,13 @@ namespace GPU_SW_Rasterizer {
#include "gpu_sw_rasterizer.inl"
}
// Default vector implementation definitions.
// Includes the shared rasterizer implementation file a second time, inside the Vector namespace, on targets that
// define CPU_ARCH_SSE or CPU_ARCH_NEON.
// NOTE(review): unlike the AVX2 translation unit, USE_VECTOR is not visibly defined before this include, so this
// instance may be compiling the scalar (#ifndef USE_VECTOR) code paths -- confirm whether that is intentional.
#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
namespace GPU_SW_Rasterizer::Vector {
#include "gpu_sw_rasterizer.inl"
}
#endif
// Initialize with default implementation.
namespace GPU_SW_Rasterizer {
const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions = &DrawRectangleFunctions;
@ -49,4 +58,40 @@ const DrawLineFunctionTable* SelectedDrawLineFunctions = &DrawLineFunctions;
// Declare alternative implementations.
void GPU_SW_Rasterizer::SelectImplementation()
{
  // The choice is made on the first call only; every later call returns immediately.
  static bool s_selection_done = false;
  if (s_selection_done)
    return;

  s_selection_done = true;

// Logs the chosen ISA and repoints all three Selected*Functions tables at that implementation's namespace.
#define SELECT_ALTERNATIVE_RASTERIZER(isa)                                                                             \
  do                                                                                                                   \
  {                                                                                                                    \
    INFO_LOG("* Using " #isa " software rasterizer implementation.");                                                  \
    SelectedDrawRectangleFunctions = &isa::DrawRectangleFunctions;                                                     \
    SelectedDrawTriangleFunctions = &isa::DrawTriangleFunctions;                                                       \
    SelectedDrawLineFunctions = &isa::DrawLineFunctions;                                                               \
  } while (0)

#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
  // The SW_USE_ISA environment variable, when set, restricts which implementation may be picked.
  // When it is unset, the first available candidate below wins.
  const char* const requested_isa = std::getenv("SW_USE_ISA");
  const bool no_override = (requested_isa == nullptr);

#ifdef CPU_ARCH_SSE
  // AVX2 needs both runtime CPU support and either no override or an explicit "AVX2" request.
  if (cpuinfo_has_x86_avx2() && (no_override || StringUtil::Strcasecmp(requested_isa, "AVX2") == 0))
  {
    SELECT_ALTERNATIVE_RASTERIZER(AVX2);
    return;
  }
#endif

  // Baseline vector implementation: chosen by default, or when "Vector" is requested.
  if (no_override || StringUtil::Strcasecmp(requested_isa, "Vector") == 0)
  {
    SELECT_ALTERNATIVE_RASTERIZER(Vector);
    return;
  }
#endif

#undef SELECT_ALTERNATIVE_RASTERIZER

  // Nothing else matched (or no vector ISA on this target): the Selected* tables keep their current values.
  INFO_LOG("* Using scalar software rasterizer implementation.");
}

View File

@ -1,4 +1,5 @@
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#pragma once
@ -6,6 +7,7 @@
#include "gpu.h"
#include "gpu_types.h"
#include "common/intrin.h"
#include "common/types.h"
#include <algorithm>
@ -33,6 +35,11 @@ using DrawLineFunction = void (*)(const GPUBackendDrawLineCommand* cmd, const GP
const GPUBackendDrawLineCommand::Vertex* p1);
typedef const DrawLineFunction DrawLineFunctionTable[2][2][2];
// Default implementation, compatible with all ISAs.
extern const DrawRectangleFunctionTable DrawRectangleFunctions;
extern const DrawTriangleFunctionTable DrawTriangleFunctions;
extern const DrawLineFunctionTable DrawLineFunctions;
// Current implementation, selected at runtime.
extern const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions;
extern const DrawTriangleFunctionTable* SelectedDrawTriangleFunctions;
@ -60,4 +67,24 @@ ALWAYS_INLINE static DrawTriangleFunction GetDrawTriangleFunction(bool shading_e
[u8(transparency_enable)][u8(dithering_enable)];
}
// Declares (extern) the three function tables of one alternative rasterizer implementation, inside a nested
// namespace named after the ISA. The macro only declares them; the definitions presumably come from a
// translation unit that instantiates the implementation for that ISA.
#define DECLARE_ALTERNATIVE_RASTERIZER(isa) \
namespace isa { \
extern const DrawRectangleFunctionTable DrawRectangleFunctions; \
extern const DrawTriangleFunctionTable DrawTriangleFunctions; \
extern const DrawLineFunctionTable DrawLineFunctions; \
}
// Have to define the symbols globally, because clang won't include them otherwise.
// Builds that define CPU_ARCH_SSE declare one alternative (AVX2); all other builds expand the list to nothing.
#if defined(CPU_ARCH_SSE)
#define ALTERNATIVE_RASTERIZER_LIST() DECLARE_ALTERNATIVE_RASTERIZER(AVX2)
#else
#define ALTERNATIVE_RASTERIZER_LIST()
#endif
// Emit the declarations here, then drop the helper macro so it does not leak to includers.
ALTERNATIVE_RASTERIZER_LIST()
#undef DECLARE_ALTERNATIVE_RASTERIZER
} // namespace GPU_SW_Rasterizer
// static u32 s_bad_counter = 0;

View File

@ -3,15 +3,55 @@
// Editor-only scaffolding: this file is meant to be #included inside a namespace by other translation units, so on
// its own it lacks headers, macros and an enclosing namespace. When the IDE's IntelliSense parser defines
// __INTELLISENSE__, supply those so the vector code paths can be parsed standalone.
// NOTE(review): the namespace opened here is not closed in this block -- presumably a matching guarded close exists
// at the end of the file; confirm.
#ifdef __INTELLISENSE__
#include "common/gsvector.h"
#include "gpu.h"
#include <algorithm>
#define USE_VECTOR 1
#define GSVECTOR_HAS_SRLV 1
extern GPU_SW_Rasterizer::DitherLUT g_dither_lut;
namespace GPU_SW_Rasterizer {
#endif
// TODO: UpdateVRAM, FillVRAM, etc.
// Debug harness for validating the vector rasterizer against a reference draw. It is compiled out (#if 0); in
// that case both macros expand to nothing, so their arguments are never evaluated or even compiled.
#ifdef USE_VECTOR
#if 0
// Snapshot of VRAM taken before the vector draw, and a copy of VRAM as left by the vector draw.
static u16 s_vram_backup[VRAM_WIDTH * VRAM_HEIGHT];
static u16 s_new_vram[VRAM_WIDTH * VRAM_HEIGHT];
// BACKUP_VRAM(): saves the current VRAM contents and bumps the draw counter used in mismatch messages.
// NOTE(review): s_bad_counter is not declared in this block -- it must be declared elsewhere before enabling this.
#define BACKUP_VRAM() \
do \
{ \
std::memcpy(s_vram_backup, g_vram, sizeof(g_vram)); \
s_bad_counter++; \
} while (0)
// CHECK_VRAM(drawer): stashes the vector result, restores the pre-draw snapshot, runs `drawer` (the reference
// draw) on it, then compares every pixel. Each mismatch is printed with the reference value as "expected" and the
// vector value as "got", followed by an assertion failure. VRAM is left holding the output of `drawer`.
#define CHECK_VRAM(drawer) \
do \
{ \
std::memcpy(s_new_vram, g_vram, sizeof(g_vram)); \
std::memcpy(g_vram, s_vram_backup, sizeof(g_vram)); \
\
drawer; \
for (u32 vidx = 0; vidx < (VRAM_WIDTH * VRAM_HEIGHT); vidx++) \
{ \
if (s_new_vram[vidx] != g_vram[vidx]) \
{ \
fprintf(stderr, "[%u] Mismatch at %d,%d, expected %04x got %04x\n", s_bad_counter, (vidx % VRAM_WIDTH), \
(vidx / VRAM_WIDTH), g_vram[vidx], s_new_vram[vidx]); \
AssertMsg(false, "Mismatch"); \
} \
} \
/*Assert(std::memcmp(g_vram, s_new_vram, sizeof(g_vram)) == 0)*/ \
} while (0)
#else
// Verification disabled: both macros are no-ops.
#define BACKUP_VRAM()
#define CHECK_VRAM(drawer)
#endif
#endif
namespace {
enum
{
@ -49,32 +89,33 @@ struct line_fxp_step
};
} // namespace
// Returns the 16-bit VRAM word at (x, y). The coordinates are used as-is; no wrapping or bounds check happens here.
ALWAYS_INLINE_RELEASE static u16 GetPixel(const u32 x, const u32 y)
[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16 GetPixel(const u32 x, const u32 y)
{
return g_vram[VRAM_WIDTH * y + x];
}
// Returns a pointer to the 16-bit VRAM word at (x, y). The coordinates are used as-is (no wrapping or bounds check).
ALWAYS_INLINE_RELEASE static u16* GetPixelPtr(const u32 x, const u32 y)
[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16* GetPixelPtr(const u32 x, const u32 y)
{
return &g_vram[VRAM_WIDTH * y + x];
}
// Writes `value` to the 16-bit VRAM word at (x, y). The coordinates are used as-is (no wrapping or bounds check).
ALWAYS_INLINE_RELEASE static void SetPixel(const u32 x, const u32 y, const u16 value)
[[maybe_unused]] ALWAYS_INLINE_RELEASE static void SetPixel(const u32 x, const u32 y, const u16 value)
{
g_vram[VRAM_WIDTH * y + x] = value;
}
// Splits a packed 16-bit texture coordinate into (low byte, high byte).
ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8> UnpackTexcoord(u16 texcoord)
[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8> UnpackTexcoord(u16 texcoord)
{
return std::make_tuple(static_cast<u8>(texcoord), static_cast<u8>(texcoord >> 8));
}
// Splits a packed 24-bit colour into its three bytes, returned as (bits 0-7, bits 8-15, bits 16-23).
ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8, u8> UnpackColorRGB24(u32 rgb24)
[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8, u8> UnpackColorRGB24(u32 rgb24)
{
return std::make_tuple(static_cast<u8>(rgb24), static_cast<u8>(rgb24 >> 8), static_cast<u8>(rgb24 >> 16));
}
template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y, u8 color_r, u8 color_g,
u8 color_b, u8 texcoord_x, u8 texcoord_y)
[[maybe_unused]] ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y,
u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x,
u8 texcoord_y)
{
u16 color;
if constexpr (texture_enable)
@ -215,6 +256,8 @@ ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u
SetPixel(static_cast<u32>(x), static_cast<u32>(y), color | cmd->params.GetMaskOR());
}
#ifndef USE_VECTOR
template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
{
@ -249,6 +292,413 @@ static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
}
}
#else // USE_VECTOR
// Shared gather helper: reads one 16-bit value from `base` at each of the four byte offsets held in the 32-bit
// lanes of `byte_offsets`, and returns them with value N placed in the low 16 bits of 32-bit lane N.
// Used by both GatherVector() and GatherCLUTVector(), which previously duplicated this sequence.
ALWAYS_INLINE_RELEASE static GSVector4i GatherU16FromByteOffsets(const u8* base, GSVector4i byte_offsets)
{
  const u32 o0 = byte_offsets.extract32<0>();
  const u32 o1 = byte_offsets.extract32<1>();
  const u32 o2 = byte_offsets.extract32<2>();
  const u32 o3 = byte_offsets.extract32<3>();

  // TODO: split in two, merge, maybe could be zx loaded instead..
  // memcpy is used for the 16-bit reads so no alignment or aliasing assumptions are made about `base`.
  u16 p0, p1, p2, p3;
  std::memcpy(&p0, base + o0, sizeof(p0));
  std::memcpy(&p1, base + o1, sizeof(p1));
  std::memcpy(&p2, base + o2, sizeof(p2));
  std::memcpy(&p3, base + o3, sizeof(p3));

  // The first value initialises the vector; the others are inserted at 16-bit positions 2, 4 and 6, i.e. the low
  // halves of 32-bit lanes 1, 2 and 3.
  GSVector4i pixels = GSVector4i::load(p0);
  pixels = pixels.insert16<2>(p1);
  pixels = pixels.insert16<4>(p2);
  pixels = pixels.insert16<6>(p3);
  return pixels;
}

// Gathers four VRAM pixels, one per lane, from the per-lane coordinates (coord_x, coord_y).
// The caller is responsible for wrapping the coordinates into VRAM before calling.
ALWAYS_INLINE_RELEASE static GSVector4i GatherVector(GSVector4i coord_x, GSVector4i coord_y)
{
  GSVector4i offsets = coord_y.sll32<11>();    // y * 2048 (1024 * sizeof(pixel))
  offsets = offsets.add32(coord_x.sll32<1>()); // x * 2 (x * sizeof(pixel))
  return GatherU16FromByteOffsets(reinterpret_cast<const u8*>(g_vram), offsets);
}

// Gathers four CLUT entries, one per lane, for the per-lane palette indices.
ALWAYS_INLINE_RELEASE static GSVector4i GatherCLUTVector(GSVector4i indices)
{
  const GSVector4i offsets = indices.sll32<1>(); // x * 2 (x * sizeof(pixel))
  return GatherU16FromByteOffsets(reinterpret_cast<const u8*>(g_gpu_clut), offsets);
}
// Loads the four VRAM pixels starting at (x, y) on row y, one pixel per 32-bit lane.
// X coordinates that run past the right edge of VRAM wrap around to the start of the same row.
ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y)
{
  const u16* const row = &g_vram[y * VRAM_WIDTH];

  // Fast path: the whole group lies inside the row, so the four adjacent pixels are loaded together and widened.
  if (x <= (VRAM_WIDTH - 4))
    return GSVector4i::loadl(&row[x]).u16to32();

  // Slow path: the group crosses the right edge, so every X coordinate is wrapped individually.
  GSVector4i result = GSVector4i(row[x & VRAM_WIDTH_MASK]);
  result = result.insert16<2>(row[(x + 1) & VRAM_WIDTH_MASK]);
  result = result.insert16<4>(row[(x + 2) & VRAM_WIDTH_MASK]);
  result = result.insert16<6>(row[(x + 3) & VRAM_WIDTH_MASK]);
  return result;
}
// Stores four packed 16-bit pixels (the first four 16-bit elements of `color`) to VRAM starting at (x, y).
// X coordinates that run past the right edge of VRAM wrap around to the start of the same row.
ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector4i color)
{
  u16* const row = &g_vram[y * VRAM_WIDTH];

  // Fast path: the whole group lies inside the row, so it is written with a single store.
  if (x <= (VRAM_WIDTH - 4))
  {
    GSVector4i::storel(&row[x], color);
    return;
  }

  // Slow path: the group crosses the right edge, so each pixel is written to its wrapped X coordinate.
  row[x & VRAM_WIDTH_MASK] = Truncate16(color.extract16<0>());
  row[(x + 1) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<1>());
  row[(x + 2) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<2>());
  row[(x + 3) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<3>());
}
// Splits four RGB5A1 pixels (one per 32-bit lane) into two vectors of 16-bit halves:
//   rg: low half = 5-bit red,  high half = 5-bit green
//   ba: low half = 5-bit blue, high half = the bit-15 flag as 0 or 1
ALWAYS_INLINE_RELEASE static void RGB5A1ToRG_BA(GSVector4i rgb5a1, GSVector4i& rg, GSVector4i& ba)
{
  const GSVector4i red = rgb5a1 & GSVector4i::cxpr(0x1F);                  // bits 0-4 stay in place
  const GSVector4i green = (rgb5a1 & GSVector4i::cxpr(0x3E0)).sll32<11>(); // bits 5-9 -> bits 16-20
  const GSVector4i blue = rgb5a1.srl32<10>() & GSVector4i::cxpr(0x1F);     // bits 10-14 -> bits 0-4
  const GSVector4i alpha = (rgb5a1 & GSVector4i::cxpr(0x8000)).sll32<1>(); // bit 15 -> bit 16

  rg = red | green; // R0G0 | R0G0 | R0G0 | R0G0
  ba = blue | alpha; // B0A0 | B0A0 | B0A0 | B0A0
}
// Inverse of RGB5A1ToRG_BA(): recombines the RG and BA half-word vectors into four RGB5A1 pixels, one per
// 32-bit lane. Only the low 5 bits of red, green and blue are used; the high half of `ba` supplies bit 15.
ALWAYS_INLINE_RELEASE static GSVector4i RG_BAToRGB5A1(GSVector4i rg, GSVector4i ba)
{
  const GSVector4i red = rg & GSVector4i::cxpr(0x1F);                  // bits 0-4
  const GSVector4i green = rg.srl32<11>() & GSVector4i::cxpr(0x3E0);   // bits 16-20 -> bits 5-9
  const GSVector4i blue = (ba & GSVector4i::cxpr(0x1F)).sll32<10>();   // bits 0-4 -> bits 10-14
  const GSVector4i alpha = ba.srl32<16>().sll32<15>();                 // high half -> bit 15 upwards

  return red | green | blue | alpha; // RGBA | RGBA | RGBA | RGBA
}
// Vector-friendly copy of DITHER_MATRIX. Each dither value is stored twice in a row (P), once for each 16-bit half
// of a packed RG/BA lane, and each row of four values is stored twice back to back (R), so that a window of eight
// 16-bit values (four pixels) can be loaded starting at any X offset 0-3 without running off the end of the row.
static constexpr s16 VECTOR_DITHER_MATRIX[4][16] = {
#define P(m, n) static_cast<s16>(DITHER_MATRIX[m][n]), static_cast<s16>(DITHER_MATRIX[m][n])
#define R(m) P(m, 0), P(m, 1), P(m, 2), P(m, 3), P(m, 0), P(m, 1), P(m, 2), P(m, 3)
{R(0)}, {R(1)}, {R(2)}, {R(3)}
#undef R
#undef P
};
// Vector variant of ShadePixel: shades four horizontally adjacent pixels starting at (start_x, y) and writes the
// result back to VRAM. Each 32-bit lane of the vector arguments corresponds to one pixel.
//   vertex_color_rg / vertex_color_ba: per-pixel colour as 16-bit halves (R,G in one vector, B,A in the other).
//   texcoord_x / texcoord_y:           per-pixel texture coordinates; only read when texture_enable is set.
//   preserve_mask:                     lanes that are all-ones keep the existing VRAM pixel.
//   dither:                            per-half-word dither offsets; only read when dithering_enable is set.
template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
ALWAYS_INLINE_RELEASE static void
ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vertex_color_rg, GSVector4i vertex_color_ba,
GSVector4i texcoord_x, GSVector4i texcoord_y, GSVector4i preserve_mask, GSVector4i dither)
{
// Masks used to wrap texture fetch coordinates into VRAM.
static constinit GSVector4i coord_mask_x = GSVector4i::cxpr(VRAM_WIDTH_MASK);
static constinit GSVector4i coord_mask_y = GSVector4i::cxpr(VRAM_HEIGHT_MASK);
GSVector4i color;
if constexpr (texture_enable)
{
// Apply texture window
texcoord_x = (texcoord_x & GSVector4i(cmd->window.and_x)) | GSVector4i(cmd->window.or_x);
texcoord_y = (texcoord_y & GSVector4i(cmd->window.and_y)) | GSVector4i(cmd->window.or_y);
const GSVector4i base_x = GSVector4i(cmd->draw_mode.GetTexturePageBaseX());
const GSVector4i base_y = GSVector4i(cmd->draw_mode.GetTexturePageBaseY());
// NOTE(review): palette_x/palette_y are not referenced again in this function; the CLUT gather below reads
// g_gpu_clut instead -- confirm whether these two are leftovers.
const GSVector4i palette_x = GSVector4i(cmd->palette.GetXBase());
const GSVector4i palette_y = GSVector4i(cmd->palette.GetYBase());
texcoord_y = base_y.add32(texcoord_y) & coord_mask_y;
GSVector4i texture_color;
switch (cmd->draw_mode.texture_mode)
{
case GPUTextureMode::Palette4Bit:
{
// Four 4-bit indices per 16-bit VRAM word: fetch the word at x/4, then shift the wanted nibble down.
GSVector4i load_texcoord_x = texcoord_x.srl32<2>();
load_texcoord_x = base_x.add32(load_texcoord_x);
load_texcoord_x = load_texcoord_x & coord_mask_x;
// todo: sse4 path
GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(3)).sll32<2>();
GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y);
#ifdef GSVECTOR_HAS_SRLV
palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0x0F);
#else
// No per-lane variable shift available: this path is not implemented and asserts.
Assert(false && "Fixme");
#endif
texture_color = GatherCLUTVector(palette_indices);
}
break;
case GPUTextureMode::Palette8Bit:
{
// Two 8-bit indices per 16-bit VRAM word: fetch the word at x/2, then shift the wanted byte down.
GSVector4i load_texcoord_x = texcoord_x.srl32<1>();
load_texcoord_x = base_x.add32(load_texcoord_x);
load_texcoord_x = load_texcoord_x & coord_mask_x;
GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(1)).sll32<3>();
GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y);
#ifdef GSVECTOR_HAS_SRLV
palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0xFF);
#else
// No per-lane variable shift available: this path is not implemented and asserts.
Assert(false && "Fixme");
#endif
texture_color = GatherCLUTVector(palette_indices);
}
break;
default:
{
// Direct colour: the VRAM word is the texel itself.
texcoord_x = base_x.add32(texcoord_x);
texcoord_x = texcoord_x & coord_mask_x;
texture_color = GatherVector(texcoord_x, texcoord_y);
}
break;
}
// check for zero texture colour across the 4 pixels, early out if so
const GSVector4i texture_transparent_mask = texture_color.eq32(GSVector4i::zero());
if (texture_transparent_mask.alltrue())
return;
// Zero texels are never written, so fold them into the lanes that keep the existing VRAM pixel.
preserve_mask = preserve_mask | texture_transparent_mask;
if constexpr (raw_texture_enable)
{
// Raw texturing: the texel is used unmodified.
color = texture_color;
}
else
{
GSVector4i trg, tba;
RGB5A1ToRG_BA(texture_color, trg, tba);
// now we have both the texture and vertex color in RG/BA pairs, for 4 pixels, which we can multiply
GSVector4i rg = trg.mul16l(vertex_color_rg);
GSVector4i ba = tba.mul16l(vertex_color_ba);
// Convert to 5bit, adding the dither offset before the final shift when dithering is enabled.
if constexpr (dithering_enable)
{
rg = rg.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
ba = ba.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
}
else
{
rg = rg.sra16<7>();
ba = ba.sra16<7>();
}
// Bit15 gets passed through as-is.
ba = ba.blend16<0xaa>(tba);
// Clamp to 5bit.
static constexpr GSVector4i colclamp = GSVector4i::cxpr16(0x1F);
rg = rg.min_u16(colclamp);
ba = ba.min_u16(colclamp);
// And interleave back to 16bpp.
color = RG_BAToRGB5A1(rg, ba);
}
}
else
{
// Non-textured transparent polygons don't set bit 15, but are treated as transparent.
if constexpr (dithering_enable)
{
GSVector4i rg = vertex_color_rg.add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
GSVector4i ba = vertex_color_ba.add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
// Clamp to 5bit. We use 32bit for BA to set a to zero.
rg = rg.min_u16(GSVector4i::cxpr16(0x1F));
ba = ba.min_u16(GSVector4i::cxpr(0x1F));
// And interleave back to 16bpp.
color = RG_BAToRGB5A1(rg, ba);
}
else
{
// Note that bit15 is set to 0 here, which the shift will do.
const GSVector4i rg = vertex_color_rg.srl16<3>();
const GSVector4i ba = vertex_color_ba.srl16<3>();
color = RG_BAToRGB5A1(rg, ba);
}
}
// Existing VRAM contents for these four pixels; needed for blending, mask testing and preserved lanes.
GSVector4i bg_color = LoadVector(start_x, y);
if constexpr (transparency_enable)
{
[[maybe_unused]] GSVector4i transparent_mask;
if constexpr (texture_enable)
{
// Compute transparent_mask, ffff per lane if transparent otherwise 0000
transparent_mask = color.sra16<15>();
}
// TODO: We don't need to OR color here with 0x8000 for textures.
// 0x8000 is added to match serial path.
// Each mode below blends all three 5-bit channels at once inside the packed 16-bit value, using the
// 0x0421/0x8421/0x8420 constants to handle the per-channel low bits and carries/borrows.
GSVector4i blended_color;
switch (cmd->draw_mode.transparency_mode)
{
case GPUTransparencyMode::HalfBackgroundPlusHalfForeground:
{
const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u);
const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u);
const GSVector4i res = fg_bits.add32(bg_bits).sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x0421u)).srl32<1>();
blended_color = res & GSVector4i::cxpr(0xffff);
}
break;
case GPUTransparencyMode::BackgroundPlusForeground:
{
const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u);
const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu);
const GSVector4i sum = fg_bits.add32(bg_bits);
const GSVector4i carry =
(sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u))) & GSVector4i::cxpr(0x8420u);
const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
blended_color = res & GSVector4i::cxpr(0xffff);
}
break;
case GPUTransparencyMode::BackgroundMinusForeground:
{
const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u);
const GSVector4i fg_bits = color & GSVector4i::cxpr(0x7FFFu);
const GSVector4i diff = bg_bits.sub32(fg_bits).add32(GSVector4i::cxpr(0x108420u));
const GSVector4i borrow =
diff.sub32((bg_bits ^ fg_bits) & GSVector4i::cxpr(0x108420u)) & GSVector4i::cxpr(0x108420u);
const GSVector4i res = diff.sub32(borrow) & borrow.sub32(borrow.srl32<5>());
blended_color = res & GSVector4i::cxpr(0xffff);
}
break;
case GPUTransparencyMode::BackgroundPlusQuarterForeground:
default:
{
const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu);
const GSVector4i fg_bits =
((color | GSVector4i::cxpr(0x8000)).srl32<2>() & GSVector4i::cxpr(0x1CE7u)) | GSVector4i::cxpr(0x8000u);
const GSVector4i sum = fg_bits.add32(bg_bits);
const GSVector4i carry = sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u)) & GSVector4i::cxpr(0x8420u);
const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
blended_color = res & GSVector4i::cxpr(0xffff);
}
break;
}
// select blended pixels for transparent pixels, otherwise consider opaque
// TODO: SSE2
if constexpr (texture_enable)
color = color.blend8(blended_color, transparent_mask);
else
color = blended_color & GSVector4i::cxpr(0x7fff);
}
// TODO: lift out to parent?
const GSVector4i mask_and = GSVector4i(cmd->params.GetMaskAND());
const GSVector4i mask_or = GSVector4i(cmd->params.GetMaskOR());
GSVector4i mask_bits_set = bg_color & mask_and; // 8000 if masked else 0000
mask_bits_set = mask_bits_set.sra16<15>(); // ffff if masked else 0000
preserve_mask = preserve_mask | mask_bits_set; // ffff if preserved else 0000
// Final select: preserved lanes take the old VRAM pixel, all others take the new colour with mask_or applied.
bg_color = bg_color & preserve_mask;
color = (color | mask_or).andnot(preserve_mask);
color = color | bg_color;
// Pack the four 32-bit lanes down to four 16-bit pixels and write them out.
const GSVector4i packed_color = color.pu32();
StoreVector(start_x, y, packed_color);
}
// Vector variant of DrawRectangle: fills the cmd->width x cmd->height rectangle at (cmd->x, cmd->y) four pixels at
// a time. Rows outside the vertical drawing area, and rows skipped by interlaced rendering, are not drawn. Within
// a row, pixels beyond the rectangle width or outside the horizontal drawing area are masked off per lane rather
// than skipped, so a group of four may be only partially written.
//
// The texture V coordinate advances once per rectangle row whether or not that row is drawn. It was previously
// advanced only for drawn rows, which shifted the texture whenever rows were clipped or interlace-skipped.
template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
{
  const s32 origin_x = cmd->x;
  const s32 origin_y = cmd->y;

  // Expand the flat rectangle colour into the half-word RG/BA layout ShadePixel() expects.
  const GSVector4i rgba = GSVector4i(cmd->color); // RGBA | RGBA | RGBA | RGBA
  GSVector4i rg = rgba.xxxxl();                   // RGRG | RGRG | RGRG | RGRG
  GSVector4i ba = rgba.yyyyl();                   // BABA | BABA | BABA | BABA
  rg = rg.u8to16();                               // R0G0 | R0G0 | R0G0 | R0G0
  ba = ba.u8to16();                               // B0A0 | B0A0 | B0A0 | B0A0

  // Lane N starts at texture U + N; V starts at the rectangle's first row.
  const GSVector4i texcoord_x = GSVector4i(cmd->texcoord & 0xFF).add32(GSVector4i::cxpr(0, 1, 2, 3));
  GSVector4i texcoord_y = GSVector4i(cmd->texcoord >> 8);

  const GSVector4i clip_left = GSVector4i(g_drawing_area.left);
  const GSVector4i clip_right = GSVector4i(g_drawing_area.right);
  const u32 width = cmd->width;

  BACKUP_VRAM();

  for (u32 offset_y = 0; offset_y < cmd->height; offset_y++)
  {
    const s32 y = origin_y + static_cast<s32>(offset_y);

    const bool row_skipped =
      (y < static_cast<s32>(g_drawing_area.top) || y > static_cast<s32>(g_drawing_area.bottom) ||
       (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u)));

    if (!row_skipped)
    {
      GSVector4i row_texcoord_x = texcoord_x;

      // xvec holds the X coordinate of each lane; wvec goes negative in lanes past the rectangle's right edge.
      GSVector4i xvec = GSVector4i(origin_x).add32(GSVector4i::cxpr(0, 1, 2, 3));
      GSVector4i wvec = GSVector4i(width).sub32(GSVector4i::cxpr(1, 2, 3, 4));

      for (u32 offset_x = 0; offset_x < width; offset_x += 4)
      {
        const s32 x = origin_x + static_cast<s32>(offset_x);

        // width test
        GSVector4i preserve_mask = wvec.lt32(GSVector4i::zero());

        // clip test, if all pixels are outside, skip
        preserve_mask = preserve_mask | xvec.lt32(clip_left);
        preserve_mask = preserve_mask | xvec.gt32(clip_right);
        if (!preserve_mask.alltrue())
        {
          ShadePixel<texture_enable, raw_texture_enable, transparency_enable, false>(
            cmd, x, y, rg, ba, row_texcoord_x, texcoord_y, preserve_mask, GSVector4i::zero());
        }

        xvec = xvec.add32(GSVector4i::cxpr(4));
        wvec = wvec.sub32(GSVector4i::cxpr(4));

        // U wraps at 8 bits and advances for every group, drawn or not.
        if constexpr (texture_enable)
          row_texcoord_x = row_texcoord_x.add32(GSVector4i::cxpr(4)) & GSVector4i::cxpr(0xFF);
      }
    }

    // V wraps at 8 bits and must advance for every rectangle row, including rows that were not drawn, so that
    // the texture stays aligned with the rectangle.
    if constexpr (texture_enable)
      texcoord_y = texcoord_y.add32(GSVector4i::cxpr(1)) & GSVector4i::cxpr(0xFF);
  }

  CHECK_VRAM(GPU_SW_Rasterizer::DrawRectangleFunctions[texture_enable][raw_texture_enable][transparency_enable](cmd));
}
#endif // USE_VECTOR
//////////////////////////////////////////////////////////////////////////
// Polygon and line rasterization ported from Mednafen
//////////////////////////////////////////////////////////////////////////
@ -355,6 +805,8 @@ ALWAYS_INLINE_RELEASE static void AddIDeltas_DY(i_group& ig, const i_deltas& idl
}
}
#ifndef USE_VECTOR
template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
bool dithering_enable>
ALWAYS_INLINE_RELEASE static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound,
@ -401,11 +853,150 @@ ALWAYS_INLINE_RELEASE static void DrawSpan(const GPUBackendDrawPolygonCommand* c
} while (--w > 0);
}
#else // USE_VECTOR
// Vector variant of DrawSpan: draws the pixels [x_start, x_bound) of scanline y in groups of four, after clipping
// the span horizontally to the drawing area. Interpolated values are evaluated per lane as
// ig + d/dx * x + d/dy * y, with lane N sitting N pixels to the right of lane 0, and stepped by 4 * d/dx per group.
// Lines skipped by interlaced rendering are not drawn at all.
template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
bool dithering_enable>
ALWAYS_INLINE_RELEASE static void DrawSpan(const GPUBackendDrawCommand* cmd, s32 y, s32 x_start, s32 x_bound,
i_group ig, const i_deltas& idl)
{
if (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u))
return;
// x_ig_adjust tracks the X used for interpolation, x the (truncated) X used for addressing, w the pixel count.
s32 x_ig_adjust = x_start;
s32 w = x_bound - x_start;
s32 x = TruncateGPUVertexPosition(x_start);
// Clip against the left edge, moving the interpolation start by the same amount.
if (x < static_cast<s32>(g_drawing_area.left))
{
s32 delta = static_cast<s32>(g_drawing_area.left) - x;
x_ig_adjust += delta;
x += delta;
w -= delta;
}
// Clip against the right edge (inclusive).
if ((x + w) > (static_cast<s32>(g_drawing_area.right) + 1))
w = static_cast<s32>(g_drawing_area.right) + 1 - x;
if (w <= 0)
return;
// TODO: Precompute.
const auto clip_left = GSVector4i(g_drawing_area.left);
const auto clip_right = GSVector4i(g_drawing_area.right);
// Per-group steps: each loop iteration advances four pixels.
const GSVector4i dr_dx = GSVector4i(idl.dr_dx * 4);
const GSVector4i dg_dx = GSVector4i(idl.dg_dx * 4);
const GSVector4i db_dx = GSVector4i(idl.db_dx * 4);
const GSVector4i du_dx = GSVector4i(idl.du_dx * 4);
const GSVector4i dv_dx = GSVector4i(idl.dv_dx * 4);
// TODO: vectorize
// Per-lane offsets within a group: lane N is N pixels to the right of lane 0.
const GSVector4i dr_dx_offset = GSVector4i(0, idl.dr_dx, idl.dr_dx * 2, idl.dr_dx * 3);
const GSVector4i dg_dx_offset = GSVector4i(0, idl.dg_dx, idl.dg_dx * 2, idl.dg_dx * 3);
const GSVector4i db_dx_offset = GSVector4i(0, idl.db_dx, idl.db_dx * 2, idl.db_dx * 3);
const GSVector4i du_dx_offset = GSVector4i(0, idl.du_dx, idl.du_dx * 2, idl.du_dx * 3);
const GSVector4i dv_dx_offset = GSVector4i(0, idl.dv_dx, idl.dv_dx * 2, idl.dv_dx * 3);
GSVector4i dr, dg, db;
if constexpr (shading_enable)
{
// Shaded: keep the colours in fixed point so they can be stepped across the span.
dr = GSVector4i(ig.r + idl.dr_dx * x_ig_adjust).add32(dr_dx_offset);
dg = GSVector4i(ig.g + idl.dg_dx * x_ig_adjust).add32(dg_dx_offset);
db = GSVector4i(ig.b + idl.db_dx * x_ig_adjust).add32(db_dx_offset);
}
else
{
// precompute for flat shading
// (green is pre-shifted into the high half-word, matching the layout built inside the loop)
dr = GSVector4i(ig.r >> (COORD_FBS + COORD_POST_PADDING));
dg = GSVector4i((ig.g >> (COORD_FBS + COORD_POST_PADDING)) << 16);
db = GSVector4i(ig.b >> (COORD_FBS + COORD_POST_PADDING));
}
GSVector4i du = GSVector4i(ig.u + idl.du_dx * x_ig_adjust).add32(du_dx_offset);
GSVector4i dv = GSVector4i(ig.v + idl.dv_dx * x_ig_adjust).add32(dv_dx_offset);
// TODO: Move to caller.
// Add the vertical contribution for this scanline.
if constexpr (shading_enable)
{
// TODO: vectorize multiply?
dr = dr.add32(GSVector4i(idl.dr_dy * y));
dg = dg.add32(GSVector4i(idl.dg_dy * y));
db = db.add32(GSVector4i(idl.db_dy * y));
}
if constexpr (texture_enable)
{
du = du.add32(GSVector4i(idl.du_dy * y));
dv = dv.add32(GSVector4i(idl.dv_dy * y));
}
// Dither offsets for this span, selected by (y & 3) and the starting (x & 3). They are loaded once because x
// advances by 4 per group, which leaves (x & 3) unchanged.
const GSVector4i dither =
GSVector4i::load<false>(&VECTOR_DITHER_MATRIX[static_cast<u32>(y) & 3][(static_cast<u32>(x) & 3) * 2]);
// xvec holds the X coordinate of each lane; wvec goes negative in lanes past the end of the span.
GSVector4i xvec = GSVector4i(x).add32(GSVector4i::cxpr(0, 1, 2, 3));
GSVector4i wvec = GSVector4i(w).sub32(GSVector4i::cxpr(1, 2, 3, 4));
for (s32 count = (w + 3) / 4; count > 0; --count)
{
// R000 | R000 | R000 | R000
// R0G0 | R0G0 | R0G0 | R0G0
const GSVector4i r = shading_enable ? dr.srl32<COORD_FBS + COORD_POST_PADDING>() : dr;
const GSVector4i g =
shading_enable ? dg.srl32<COORD_FBS + COORD_POST_PADDING>().sll32<16>() : dg; // get G into the correct position
const GSVector4i b = shading_enable ? db.srl32<COORD_FBS + COORD_POST_PADDING>() : db;
const GSVector4i u = du.srl32<COORD_FBS + COORD_POST_PADDING>();
const GSVector4i v = dv.srl32<COORD_FBS + COORD_POST_PADDING>();
// TODO: no-sse4
// Merge R (low half-words) and G (high half-words) into one vector.
const GSVector4i rg = r.blend16<0xAA>(g);
// mask based on what's outside the span
auto preserve_mask = wvec.lt32(GSVector4i::zero());
// clip test, if all pixels are outside, skip
preserve_mask = preserve_mask | xvec.lt32(clip_left);
preserve_mask = preserve_mask | xvec.gt32(clip_right);
if (!preserve_mask.alltrue())
{
ShadePixel<texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
cmd, static_cast<u32>(x), static_cast<u32>(y), rg, b, u, v, preserve_mask, dither);
}
// Advance to the next group of four pixels.
x += 4;
xvec = xvec.add32(GSVector4i::cxpr(4));
wvec = wvec.sub32(GSVector4i::cxpr(4));
if constexpr (shading_enable)
{
dr = dr.add32(dr_dx);
dg = dg.add32(dg_dx);
db = db.add32(db_dx);
}
if constexpr (texture_enable)
{
du = du.add32(du_dx);
dv = dv.add32(dv_dx);
}
}
}
#endif // USE_VECTOR
template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
bool dithering_enable>
static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBackendDrawPolygonCommand::Vertex* v0,
const GPUBackendDrawPolygonCommand::Vertex* v1, const GPUBackendDrawPolygonCommand::Vertex* v2)
{
#if 0
const GPUBackendDrawPolygonCommand::Vertex* orig_v0 = v0;
const GPUBackendDrawPolygonCommand::Vertex* orig_v1 = v1;
const GPUBackendDrawPolygonCommand::Vertex* orig_v2 = v2;
#endif
u32 core_vertex;
{
u32 cvtemp = 0;
@ -480,6 +1071,10 @@ static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBacke
if (!CalcIDeltas<shading_enable, texture_enable>(idl, v0, v1, v2))
return;
#ifdef USE_VECTOR
BACKUP_VRAM();
#endif
const GPUBackendDrawPolygonCommand::Vertex* vertices[3] = {v0, v1, v2};
i_group ig;
@ -591,6 +1186,12 @@ static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBacke
}
}
}
#ifdef USE_VECTOR
CHECK_VRAM(
GPU_SW_Rasterizer::DrawTriangleFunctions[shading_enable][texture_enable][raw_texture_enable][transparency_enable]
[dithering_enable](cmd, orig_v0, orig_v1, orig_v2));
#endif
}
ALWAYS_INLINE_RELEASE static s64 LineDivide(s64 delta, s32 dk)

View File

@ -0,0 +1,12 @@
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#include "gpu_sw_rasterizer.h"
#include "common/assert.h"
#include "common/gsvector.h"
// Instantiates the shared rasterizer implementation file inside the AVX2 namespace with USE_VECTOR defined, which
// selects its vector (#ifdef USE_VECTOR) code paths. This translation unit is expected to be compiled with AVX2
// code generation enabled, so nothing in this namespace should run before AVX2 support has been checked.
namespace GPU_SW_Rasterizer::AVX2 {
#define USE_VECTOR 1
#include "gpu_sw_rasterizer.inl"
}