@@ -3,15 +3,55 @@

#ifdef __INTELLISENSE__

#include "common/gsvector.h"
#include "gpu.h"
#include <algorithm>

#define USE_VECTOR 1
#define GSVECTOR_HAS_SRLV 1

extern GPU_SW_Rasterizer::DitherLUT g_dither_lut;

namespace GPU_SW_Rasterizer {

#endif

// TODO: UpdateVRAM, FillVRAM, etc.

#ifdef USE_VECTOR
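// Verification harness: flip the #if below to 1 and every vectorized draw is cross-checked against
// the scalar rasterizer. BACKUP_VRAM() snapshots VRAM before the vector draw runs; CHECK_VRAM()
// then replays the same command through the scalar path and asserts both produced identical pixels.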
#if 0
static u16 s_vram_backup[VRAM_WIDTH * VRAM_HEIGHT];
static u16 s_new_vram[VRAM_WIDTH * VRAM_HEIGHT];
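// Assumed declaration (not shown in this hunk): the failure counter used by the macros below.
static u32 s_bad_counter = 0;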
#define BACKUP_VRAM() \
  do \
  { \
    std::memcpy(s_vram_backup, g_vram, sizeof(g_vram)); \
    s_bad_counter++; \
  } while (0)
#define CHECK_VRAM(drawer) \
  do \
  { \
    std::memcpy(s_new_vram, g_vram, sizeof(g_vram)); \
    std::memcpy(g_vram, s_vram_backup, sizeof(g_vram)); \
    \
    drawer; \
    for (u32 vidx = 0; vidx < (VRAM_WIDTH * VRAM_HEIGHT); vidx++) \
    { \
      if (s_new_vram[vidx] != g_vram[vidx]) \
      { \
        fprintf(stderr, "[%u] Mismatch at %d,%d, expected %04x got %04x\n", s_bad_counter, (vidx % VRAM_WIDTH), \
                (vidx / VRAM_WIDTH), g_vram[vidx], s_new_vram[vidx]); \
        AssertMsg(false, "Mismatch"); \
      } \
    } \
    /*Assert(std::memcmp(g_vram, s_new_vram, sizeof(g_vram)) == 0)*/ \
  } while (0)
#else
#define BACKUP_VRAM()
#define CHECK_VRAM(drawer)
#endif
#endif

namespace {
enum
{
@@ -49,32 +89,33 @@ struct line_fxp_step
};
} // namespace

-ALWAYS_INLINE_RELEASE static u16 GetPixel(const u32 x, const u32 y)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16 GetPixel(const u32 x, const u32 y)
{
  return g_vram[VRAM_WIDTH * y + x];
}
-ALWAYS_INLINE_RELEASE static u16* GetPixelPtr(const u32 x, const u32 y)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16* GetPixelPtr(const u32 x, const u32 y)
{
  return &g_vram[VRAM_WIDTH * y + x];
}
-ALWAYS_INLINE_RELEASE static void SetPixel(const u32 x, const u32 y, const u16 value)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static void SetPixel(const u32 x, const u32 y, const u16 value)
{
  g_vram[VRAM_WIDTH * y + x] = value;
}

-ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8> UnpackTexcoord(u16 texcoord)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8> UnpackTexcoord(u16 texcoord)
{
  return std::make_tuple(static_cast<u8>(texcoord), static_cast<u8>(texcoord >> 8));
}

-ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8, u8> UnpackColorRGB24(u32 rgb24)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8, u8> UnpackColorRGB24(u32 rgb24)
{
  return std::make_tuple(static_cast<u8>(rgb24), static_cast<u8>(rgb24 >> 8), static_cast<u8>(rgb24 >> 16));
}

template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
-ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y, u8 color_r, u8 color_g,
-                                             u8 color_b, u8 texcoord_x, u8 texcoord_y)
+[[maybe_unused]] ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y,
+                                                              u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x,
+                                                              u8 texcoord_y)
{
  u16 color;
  if constexpr (texture_enable)
@@ -215,6 +256,8 @@ ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u
  SetPixel(static_cast<u32>(x), static_cast<u32>(y), color | cmd->params.GetMaskOR());
}

#ifndef USE_VECTOR

template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
{
@@ -249,6 +292,413 @@ static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
  }
}

#else // USE_VECTOR
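
// Fetches four u16 VRAM texels at (coord_x[i], coord_y[i]). There is no 16-bit hardware gather, so
// the per-lane byte offsets are extracted and each texel is read with std::memcpy (alignment- and
// aliasing-safe), then packed back into the low halfword of each 32-bit lane.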
ALWAYS_INLINE_RELEASE static GSVector4i GatherVector(GSVector4i coord_x, GSVector4i coord_y)
{
  GSVector4i offsets = coord_y.sll32<11>();    // y * 2048 (1024 * sizeof(pixel))
  offsets = offsets.add32(coord_x.sll32<1>()); // x * 2 (x * sizeof(pixel))

  const u32 o0 = offsets.extract32<0>();
  const u32 o1 = offsets.extract32<1>();
  const u32 o2 = offsets.extract32<2>();
  const u32 o3 = offsets.extract32<3>();

  // TODO: split in two, merge, maybe could be zx loaded instead..
  u16 p0, p1, p2, p3;
  std::memcpy(&p0, reinterpret_cast<const u8*>(g_vram) + o0, sizeof(p0));
  std::memcpy(&p1, reinterpret_cast<const u8*>(g_vram) + o1, sizeof(p1));
  std::memcpy(&p2, reinterpret_cast<const u8*>(g_vram) + o2, sizeof(p2));
  std::memcpy(&p3, reinterpret_cast<const u8*>(g_vram) + o3, sizeof(p3));
  GSVector4i pixels = GSVector4i::load(p0);
  pixels = pixels.insert16<2>(p1);
  pixels = pixels.insert16<4>(p2);
  pixels = pixels.insert16<6>(p3);

  return pixels;
}

ALWAYS_INLINE_RELEASE static GSVector4i GatherCLUTVector(GSVector4i indices)
{
  const GSVector4i offsets = indices.sll32<1>(); // x * 2 (x * sizeof(pixel))
  const u32 o0 = offsets.extract32<0>();
  const u32 o1 = offsets.extract32<1>();
  const u32 o2 = offsets.extract32<2>();
  const u32 o3 = offsets.extract32<3>();

  // TODO: split in two, merge, maybe could be zx loaded instead..
  u16 p0, p1, p2, p3;
  std::memcpy(&p0, reinterpret_cast<const u8*>(g_gpu_clut) + o0, sizeof(p0));
  std::memcpy(&p1, reinterpret_cast<const u8*>(g_gpu_clut) + o1, sizeof(p1));
  std::memcpy(&p2, reinterpret_cast<const u8*>(g_gpu_clut) + o2, sizeof(p2));
  std::memcpy(&p3, reinterpret_cast<const u8*>(g_gpu_clut) + o3, sizeof(p3));
  GSVector4i pixels = GSVector4i::load(p0);
  pixels = pixels.insert16<2>(p1);
  pixels = pixels.insert16<4>(p2);
  pixels = pixels.insert16<6>(p3);

  return pixels;
}
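
// Loads four consecutive pixels starting at (x, y), widening each u16 into its own 32-bit lane.
// The slow path covers spans that would run past the 1024-pixel line, wrapping each x with
// VRAM_WIDTH_MASK instead.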
ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y)
{
  if (x <= (VRAM_WIDTH - 4))
  {
    return GSVector4i::loadl(&g_vram[y * VRAM_WIDTH + x]).u16to32();
  }
  else
  {
    const u16* line = &g_vram[y * VRAM_WIDTH];
    GSVector4i pixels = GSVector4i(line[(x++) & VRAM_WIDTH_MASK]);
    pixels = pixels.insert16<2>(line[(x++) & VRAM_WIDTH_MASK]);
    pixels = pixels.insert16<4>(line[(x++) & VRAM_WIDTH_MASK]);
    pixels = pixels.insert16<6>(line[x & VRAM_WIDTH_MASK]);
    return pixels;
  }
}

ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector4i color)
{
  if (x <= (VRAM_WIDTH - 4))
  {
    GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], color);
  }
  else
  {
    u16* line = &g_vram[y * VRAM_WIDTH];
    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<0>());
    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<1>());
    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<2>());
    line[x & VRAM_WIDTH_MASK] = Truncate16(color.extract16<3>());
  }
}
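
// Splits four RGB5A1 pixels into planar halves: rg holds R in bits 0-4 and G in bits 16-20 of each
// lane, ba holds B in bits 0-4 and the mask/transparency bit in bit 16. Every 5-bit component thus
// gets a full 16-bit halfword of headroom for the modulation/dither arithmetic, and RG_BAToRGB5A1()
// packs the result back afterwards.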
ALWAYS_INLINE_RELEASE static void RGB5A1ToRG_BA(GSVector4i rgb5a1, GSVector4i& rg, GSVector4i& ba)
{
  rg = rgb5a1 & GSVector4i::cxpr(0x1F);                     // R | R | R | R
  rg = rg | (rgb5a1 & GSVector4i::cxpr(0x3E0)).sll32<11>(); // R0G0 | R0G0 | R0G0 | R0G0
  ba = rgb5a1.srl32<10>() & GSVector4i::cxpr(0x1F);         // B | B | B | B
  ba = ba | (rgb5a1 & GSVector4i::cxpr(0x8000)).sll32<1>(); // B0A0 | B0A0 | B0A0 | B0A0
}

ALWAYS_INLINE_RELEASE static GSVector4i RG_BAToRGB5A1(GSVector4i rg, GSVector4i ba)
{
  GSVector4i res;

  res = rg & GSVector4i::cxpr(0x1F);                       // R | R | R | R
  res = res | (rg.srl32<11>() & GSVector4i::cxpr(0x3E0));  // RG | RG | RG | RG
  res = res | ((ba & GSVector4i::cxpr(0x1F)).sll32<10>()); // RGB | RGB | RGB | RGB
  res = res | ba.srl32<16>().sll32<15>();                  // RGBA | RGBA | RGBA | RGBA

  return res;
}

// Color repeated twice for RG packing, then duplicated so we can load based on the X offset.
static constexpr s16 VECTOR_DITHER_MATRIX[4][16] = {
#define P(m, n) static_cast<s16>(DITHER_MATRIX[m][n]), static_cast<s16>(DITHER_MATRIX[m][n])
#define R(m) P(m, 0), P(m, 1), P(m, 2), P(m, 3), P(m, 0), P(m, 1), P(m, 2), P(m, 3)

  {R(0)}, {R(1)}, {R(2)}, {R(3)}

#undef R
#undef P
};
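
// Vector ShadePixel: shades four horizontally adjacent pixels per call. preserve_mask carries ffff
// in each 32-bit lane whose pixel must not be written (outside the span or clip rect, transparent
// texel, or protected by the mask bit); those lanes keep the existing VRAM contents on store.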
template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
ALWAYS_INLINE_RELEASE static void
ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vertex_color_rg, GSVector4i vertex_color_ba,
           GSVector4i texcoord_x, GSVector4i texcoord_y, GSVector4i preserve_mask, GSVector4i dither)
{
  static constinit GSVector4i coord_mask_x = GSVector4i::cxpr(VRAM_WIDTH_MASK);
  static constinit GSVector4i coord_mask_y = GSVector4i::cxpr(VRAM_HEIGHT_MASK);

  GSVector4i color;

  if constexpr (texture_enable)
  {
    // Apply texture window
    texcoord_x = (texcoord_x & GSVector4i(cmd->window.and_x)) | GSVector4i(cmd->window.or_x);
    texcoord_y = (texcoord_y & GSVector4i(cmd->window.and_y)) | GSVector4i(cmd->window.or_y);

    const GSVector4i base_x = GSVector4i(cmd->draw_mode.GetTexturePageBaseX());
    const GSVector4i base_y = GSVector4i(cmd->draw_mode.GetTexturePageBaseY());
    const GSVector4i palette_x = GSVector4i(cmd->palette.GetXBase());
    const GSVector4i palette_y = GSVector4i(cmd->palette.GetYBase());

    texcoord_y = base_y.add32(texcoord_y) & coord_mask_y;

    GSVector4i texture_color;
    switch (cmd->draw_mode.texture_mode)
    {
      case GPUTextureMode::Palette4Bit:
      {
        GSVector4i load_texcoord_x = texcoord_x.srl32<2>();
        load_texcoord_x = base_x.add32(load_texcoord_x);
        load_texcoord_x = load_texcoord_x & coord_mask_x;

        // todo: sse4 path
        GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(3)).sll32<2>();
        GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y);
#ifdef GSVECTOR_HAS_SRLV
        palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0x0F);
#else
        Assert(false && "Fixme");
#endif

        texture_color = GatherCLUTVector(palette_indices);
      }
      break;

      case GPUTextureMode::Palette8Bit:
      {
        GSVector4i load_texcoord_x = texcoord_x.srl32<1>();
        load_texcoord_x = base_x.add32(load_texcoord_x);
        load_texcoord_x = load_texcoord_x & coord_mask_x;

        GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(1)).sll32<3>();
        GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y);
#ifdef GSVECTOR_HAS_SRLV
        palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0xFF);
#else
        Assert(false && "Fixme");
#endif

        texture_color = GatherCLUTVector(palette_indices);
      }
      break;

      default:
      {
        texcoord_x = base_x.add32(texcoord_x);
        texcoord_x = texcoord_x & coord_mask_x;
        texture_color = GatherVector(texcoord_x, texcoord_y);
      }
      break;
    }

    // check for zero texture colour across the 4 pixels, early out if so
    const GSVector4i texture_transparent_mask = texture_color.eq32(GSVector4i::zero());
    if (texture_transparent_mask.alltrue())
      return;

    preserve_mask = preserve_mask | texture_transparent_mask;

    if constexpr (raw_texture_enable)
    {
      color = texture_color;
    }
    else
    {
      GSVector4i trg, tba;
      RGB5A1ToRG_BA(texture_color, trg, tba);

      // now we have both the texture and vertex color in RG/BA pairs, for 4 pixels, which we can multiply
      GSVector4i rg = trg.mul16l(vertex_color_rg);
      GSVector4i ba = tba.mul16l(vertex_color_ba);

      // Convert to 5bit.
      if constexpr (dithering_enable)
      {
        rg = rg.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
        ba = ba.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
      }
      else
      {
        rg = rg.sra16<7>();
        ba = ba.sra16<7>();
      }

      // Bit15 gets passed through as-is.
      ba = ba.blend16<0xaa>(tba);

      // Clamp to 5bit.
      static constexpr GSVector4i colclamp = GSVector4i::cxpr16(0x1F);
      rg = rg.min_u16(colclamp);
      ba = ba.min_u16(colclamp);

      // And interleave back to 16bpp.
      color = RG_BAToRGB5A1(rg, ba);
    }
  }
  else
  {
    // Non-textured transparent polygons don't set bit 15, but are treated as transparent.
    if constexpr (dithering_enable)
    {
      GSVector4i rg = vertex_color_rg.add16(dither).max_i16(GSVector4i::zero()).sra16<3>();
      GSVector4i ba = vertex_color_ba.add16(dither).max_i16(GSVector4i::zero()).sra16<3>();

      // Clamp to 5bit. We use a 32-bit constant for BA so the A halfword is clamped to zero.
      rg = rg.min_u16(GSVector4i::cxpr16(0x1F));
      ba = ba.min_u16(GSVector4i::cxpr(0x1F));

      // And interleave back to 16bpp.
      color = RG_BAToRGB5A1(rg, ba);
    }
    else
    {
      // Note that bit15 is set to 0 here, which the shift will do.
      const GSVector4i rg = vertex_color_rg.srl16<3>();
      const GSVector4i ba = vertex_color_ba.srl16<3>();
      color = RG_BAToRGB5A1(rg, ba);
    }
  }

  GSVector4i bg_color = LoadVector(start_x, y);

  if constexpr (transparency_enable)
  {
    [[maybe_unused]] GSVector4i transparent_mask;
    if constexpr (texture_enable)
    {
      // Compute transparent_mask, ffff per lane if transparent otherwise 0000
      transparent_mask = color.sra16<15>();
    }

    // TODO: We don't need to OR color here with 0x8000 for textures.
    // 0x8000 is added to match serial path.

    GSVector4i blended_color;
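    // Each mode below blends all three 5-bit fields in one 32-bit op (SWAR): the inter-field
    // carry/borrow is recovered via the XOR trick and masked with field-boundary constants
    // (e.g. 0x0421 = low bit of each field, 0x8420 = carry-out positions) so channels saturate
    // independently without spilling into their neighbours.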
    switch (cmd->draw_mode.transparency_mode)
    {
      case GPUTransparencyMode::HalfBackgroundPlusHalfForeground:
      {
        const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u);
        const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u);
        const GSVector4i res = fg_bits.add32(bg_bits).sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x0421u)).srl32<1>();
        blended_color = res & GSVector4i::cxpr(0xffff);
      }
      break;

      case GPUTransparencyMode::BackgroundPlusForeground:
      {
        const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u);
        const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu);
        const GSVector4i sum = fg_bits.add32(bg_bits);
        const GSVector4i carry =
          (sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u))) & GSVector4i::cxpr(0x8420u);
        const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
        blended_color = res & GSVector4i::cxpr(0xffff);
      }
      break;

      case GPUTransparencyMode::BackgroundMinusForeground:
      {
        const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u);
        const GSVector4i fg_bits = color & GSVector4i::cxpr(0x7FFFu);
        const GSVector4i diff = bg_bits.sub32(fg_bits).add32(GSVector4i::cxpr(0x108420u));
        const GSVector4i borrow =
          diff.sub32((bg_bits ^ fg_bits) & GSVector4i::cxpr(0x108420u)) & GSVector4i::cxpr(0x108420u);
        const GSVector4i res = diff.sub32(borrow) & borrow.sub32(borrow.srl32<5>());
        blended_color = res & GSVector4i::cxpr(0xffff);
      }
      break;

      case GPUTransparencyMode::BackgroundPlusQuarterForeground:
      default:
      {
        const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu);
        const GSVector4i fg_bits =
          ((color | GSVector4i::cxpr(0x8000)).srl32<2>() & GSVector4i::cxpr(0x1CE7u)) | GSVector4i::cxpr(0x8000u);
        const GSVector4i sum = fg_bits.add32(bg_bits);
        const GSVector4i carry = sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u)) & GSVector4i::cxpr(0x8420u);
        const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
        blended_color = res & GSVector4i::cxpr(0xffff);
      }
      break;
    }

    // select blended pixels for transparent pixels, otherwise consider opaque
    // TODO: SSE2
    if constexpr (texture_enable)
      color = color.blend8(blended_color, transparent_mask);
    else
      color = blended_color & GSVector4i::cxpr(0x7fff);
  }

  // TODO: lift out to parent?
  const GSVector4i mask_and = GSVector4i(cmd->params.GetMaskAND());
  const GSVector4i mask_or = GSVector4i(cmd->params.GetMaskOR());

  GSVector4i mask_bits_set = bg_color & mask_and; // 8000 if masked else 0000
  mask_bits_set = mask_bits_set.sra16<15>();      // ffff if masked else 0000
  preserve_mask = preserve_mask | mask_bits_set;  // ffff if preserved else 0000

  bg_color = bg_color & preserve_mask;
  color = (color | mask_or).andnot(preserve_mask);
  color = color | bg_color;

  const GSVector4i packed_color = color.pu32();
  StoreVector(start_x, y, packed_color);
}

template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
{
  const s32 origin_x = cmd->x;
  const s32 origin_y = cmd->y;

  const GSVector4i rgba = GSVector4i(cmd->color); // RGBA | RGBA | RGBA | RGBA
  GSVector4i rg = rgba.xxxxl();                   // RGRG | RGRG | RGRG | RGRG
  GSVector4i ba = rgba.yyyyl();                   // BABA | BABA | BABA | BABA
  rg = rg.u8to16();                               // R0G0 | R0G0 | R0G0 | R0G0
  ba = ba.u8to16();                               // B0A0 | B0A0 | B0A0 | B0A0

  const GSVector4i texcoord_x = GSVector4i(cmd->texcoord & 0xFF).add32(GSVector4i::cxpr(0, 1, 2, 3));
  GSVector4i texcoord_y = GSVector4i(cmd->texcoord >> 8);

  const GSVector4i clip_left = GSVector4i(g_drawing_area.left);
  const GSVector4i clip_right = GSVector4i(g_drawing_area.right);
  const u32 width = cmd->width;

  BACKUP_VRAM();

  for (u32 offset_y = 0; offset_y < cmd->height; offset_y++)
  {
    const s32 y = origin_y + static_cast<s32>(offset_y);
    if (y < static_cast<s32>(g_drawing_area.top) || y > static_cast<s32>(g_drawing_area.bottom) ||
        (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u)))
    {
      continue;
    }

    GSVector4i row_texcoord_x = texcoord_x;
    GSVector4i xvec = GSVector4i(origin_x).add32(GSVector4i::cxpr(0, 1, 2, 3));
    GSVector4i wvec = GSVector4i(width).sub32(GSVector4i::cxpr(1, 2, 3, 4));
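
    // wvec counts pixels remaining in the row: lane i goes negative once offset_x + i reaches the
    // rectangle width, which feeds the per-lane width test below.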
    for (u32 offset_x = 0; offset_x < width; offset_x += 4)
    {
      const s32 x = origin_x + static_cast<s32>(offset_x);

      // width test
      GSVector4i preserve_mask = wvec.lt32(GSVector4i::zero());

      // clip test, if all pixels are outside, skip
      preserve_mask = preserve_mask | xvec.lt32(clip_left);
      preserve_mask = preserve_mask | xvec.gt32(clip_right);
      if (!preserve_mask.alltrue())
      {
        ShadePixel<texture_enable, raw_texture_enable, transparency_enable, false>(
          cmd, x, y, rg, ba, row_texcoord_x, texcoord_y, preserve_mask, GSVector4i::zero());
      }

      xvec = xvec.add32(GSVector4i::cxpr(4));
      wvec = wvec.sub32(GSVector4i::cxpr(4));

      if constexpr (texture_enable)
        row_texcoord_x = row_texcoord_x.add32(GSVector4i::cxpr(4)) & GSVector4i::cxpr(0xFF);
    }

    if constexpr (texture_enable)
      texcoord_y = texcoord_y.add32(GSVector4i::cxpr(1)) & GSVector4i::cxpr(0xFF);
  }

  CHECK_VRAM(GPU_SW_Rasterizer::DrawRectangleFunctions[texture_enable][raw_texture_enable][transparency_enable](cmd));
}

#endif // USE_VECTOR

//////////////////////////////////////////////////////////////////////////
// Polygon and line rasterization ported from Mednafen
//////////////////////////////////////////////////////////////////////////
@@ -355,6 +805,8 @@ ALWAYS_INLINE_RELEASE static void AddIDeltas_DY(i_group& ig, const i_deltas& idl
  }
}

#ifndef USE_VECTOR

template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
         bool dithering_enable>
ALWAYS_INLINE_RELEASE static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound,
@@ -401,11 +853,150 @@ ALWAYS_INLINE_RELEASE static void DrawSpan(const GPUBackendDrawPolygonCommand* c
  } while (--w > 0);
}

#else // USE_VECTOR

template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
         bool dithering_enable>
ALWAYS_INLINE_RELEASE static void DrawSpan(const GPUBackendDrawCommand* cmd, s32 y, s32 x_start, s32 x_bound,
                                           i_group ig, const i_deltas& idl)
{
  if (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u))
    return;

  s32 x_ig_adjust = x_start;
  s32 w = x_bound - x_start;
  s32 x = TruncateGPUVertexPosition(x_start);

  if (x < static_cast<s32>(g_drawing_area.left))
  {
    s32 delta = static_cast<s32>(g_drawing_area.left) - x;
    x_ig_adjust += delta;
    x += delta;
    w -= delta;
  }

  if ((x + w) > (static_cast<s32>(g_drawing_area.right) + 1))
    w = static_cast<s32>(g_drawing_area.right) + 1 - x;

  if (w <= 0)
    return;

  // TODO: Precompute.

  const auto clip_left = GSVector4i(g_drawing_area.left);
  const auto clip_right = GSVector4i(g_drawing_area.right);
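
  // Per-lane steppers: the d*_dx vectors advance each fixed-point attribute by four pixels per
  // loop iteration, while the *_dx_offset vectors stagger lanes 0-3 by zero to three one-pixel
  // steps.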
  const GSVector4i dr_dx = GSVector4i(idl.dr_dx * 4);
  const GSVector4i dg_dx = GSVector4i(idl.dg_dx * 4);
  const GSVector4i db_dx = GSVector4i(idl.db_dx * 4);
  const GSVector4i du_dx = GSVector4i(idl.du_dx * 4);
  const GSVector4i dv_dx = GSVector4i(idl.dv_dx * 4);

  // TODO: vectorize
  const GSVector4i dr_dx_offset = GSVector4i(0, idl.dr_dx, idl.dr_dx * 2, idl.dr_dx * 3);
  const GSVector4i dg_dx_offset = GSVector4i(0, idl.dg_dx, idl.dg_dx * 2, idl.dg_dx * 3);
  const GSVector4i db_dx_offset = GSVector4i(0, idl.db_dx, idl.db_dx * 2, idl.db_dx * 3);
  const GSVector4i du_dx_offset = GSVector4i(0, idl.du_dx, idl.du_dx * 2, idl.du_dx * 3);
  const GSVector4i dv_dx_offset = GSVector4i(0, idl.dv_dx, idl.dv_dx * 2, idl.dv_dx * 3);

  GSVector4i dr, dg, db;
  if constexpr (shading_enable)
  {
    dr = GSVector4i(ig.r + idl.dr_dx * x_ig_adjust).add32(dr_dx_offset);
    dg = GSVector4i(ig.g + idl.dg_dx * x_ig_adjust).add32(dg_dx_offset);
    db = GSVector4i(ig.b + idl.db_dx * x_ig_adjust).add32(db_dx_offset);
  }
  else
  {
    // precompute for flat shading
    dr = GSVector4i(ig.r >> (COORD_FBS + COORD_POST_PADDING));
    dg = GSVector4i((ig.g >> (COORD_FBS + COORD_POST_PADDING)) << 16);
    db = GSVector4i(ig.b >> (COORD_FBS + COORD_POST_PADDING));
  }

  GSVector4i du = GSVector4i(ig.u + idl.du_dx * x_ig_adjust).add32(du_dx_offset);
  GSVector4i dv = GSVector4i(ig.v + idl.dv_dx * x_ig_adjust).add32(dv_dx_offset);

  // TODO: Move to caller.
  if constexpr (shading_enable)
  {
    // TODO: vectorize multiply?
    dr = dr.add32(GSVector4i(idl.dr_dy * y));
    dg = dg.add32(GSVector4i(idl.dg_dy * y));
    db = db.add32(GSVector4i(idl.db_dy * y));
  }

  if constexpr (texture_enable)
  {
    du = du.add32(GSVector4i(idl.du_dy * y));
    dv = dv.add32(GSVector4i(idl.dv_dy * y));
  }

  const GSVector4i dither =
    GSVector4i::load<false>(&VECTOR_DITHER_MATRIX[static_cast<u32>(y) & 3][(static_cast<u32>(x) & 3) * 2]);
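
  // Each row holds its four dither values duplicated per 16-bit pair (for the R0G0/B0A0 layout) and
  // then the whole sequence repeated (16 s16 entries), so this unaligned load yields a correctly
  // rotated dither vector for any starting x & 3.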
  GSVector4i xvec = GSVector4i(x).add32(GSVector4i::cxpr(0, 1, 2, 3));
  GSVector4i wvec = GSVector4i(w).sub32(GSVector4i::cxpr(1, 2, 3, 4));

  for (s32 count = (w + 3) / 4; count > 0; --count)
  {
    // R000 | R000 | R000 | R000
    // R0G0 | R0G0 | R0G0 | R0G0
    const GSVector4i r = shading_enable ? dr.srl32<COORD_FBS + COORD_POST_PADDING>() : dr;
    const GSVector4i g =
      shading_enable ? dg.srl32<COORD_FBS + COORD_POST_PADDING>().sll32<16>() : dg; // get G into the correct position
    const GSVector4i b = shading_enable ? db.srl32<COORD_FBS + COORD_POST_PADDING>() : db;
    const GSVector4i u = du.srl32<COORD_FBS + COORD_POST_PADDING>();
    const GSVector4i v = dv.srl32<COORD_FBS + COORD_POST_PADDING>();

    // TODO: no-sse4
    const GSVector4i rg = r.blend16<0xAA>(g);

    // mask based on what's outside the span
    auto preserve_mask = wvec.lt32(GSVector4i::zero());

    // clip test, if all pixels are outside, skip
    preserve_mask = preserve_mask | xvec.lt32(clip_left);
    preserve_mask = preserve_mask | xvec.gt32(clip_right);
    if (!preserve_mask.alltrue())
    {
      ShadePixel<texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
        cmd, static_cast<u32>(x), static_cast<u32>(y), rg, b, u, v, preserve_mask, dither);
    }

    x += 4;

    xvec = xvec.add32(GSVector4i::cxpr(4));
    wvec = wvec.sub32(GSVector4i::cxpr(4));

    if constexpr (shading_enable)
    {
      dr = dr.add32(dr_dx);
      dg = dg.add32(dg_dx);
      db = db.add32(db_dx);
    }

    if constexpr (texture_enable)
    {
      du = du.add32(du_dx);
      dv = dv.add32(dv_dx);
    }
  }
}

#endif // USE_VECTOR

template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
         bool dithering_enable>
static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBackendDrawPolygonCommand::Vertex* v0,
                         const GPUBackendDrawPolygonCommand::Vertex* v1, const GPUBackendDrawPolygonCommand::Vertex* v2)
{
#if 0
  const GPUBackendDrawPolygonCommand::Vertex* orig_v0 = v0;
  const GPUBackendDrawPolygonCommand::Vertex* orig_v1 = v1;
  const GPUBackendDrawPolygonCommand::Vertex* orig_v2 = v2;
#endif
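  // orig_v0/v1/v2 above are only consumed by the CHECK_VRAM() call at the bottom of this function;
  // when the harness is disabled, CHECK_VRAM() discards its argument, so both #if blocks toggle
  // together.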

  u32 core_vertex;
  {
    u32 cvtemp = 0;
@@ -480,6 +1071,10 @@ static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBacke
  if (!CalcIDeltas<shading_enable, texture_enable>(idl, v0, v1, v2))
    return;

#ifdef USE_VECTOR
  BACKUP_VRAM();
#endif

  const GPUBackendDrawPolygonCommand::Vertex* vertices[3] = {v0, v1, v2};

  i_group ig;
@@ -591,6 +1186,12 @@ static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBacke
      }
    }
  }

#ifdef USE_VECTOR
  CHECK_VRAM(
    GPU_SW_Rasterizer::DrawTriangleFunctions[shading_enable][texture_enable][raw_texture_enable][transparency_enable]
                                            [dithering_enable](cmd, orig_v0, orig_v1, orig_v2));
#endif
}

ALWAYS_INLINE_RELEASE static s64 LineDivide(s64 delta, s32 dk)