diff --git a/src/core/gpu_sw_rasterizer.cpp b/src/core/gpu_sw_rasterizer.cpp
index 0df0c16c2..371766ddb 100644
--- a/src/core/gpu_sw_rasterizer.cpp
+++ b/src/core/gpu_sw_rasterizer.cpp
@@ -44,7 +44,9 @@ namespace GPU_SW_Rasterizer {
 // Default vector implementation definitions.
 #if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
 namespace GPU_SW_Rasterizer::SIMD {
+#define USE_VECTOR 1
 #include "gpu_sw_rasterizer.inl"
+#undef USE_VECTOR
 }
 #endif
diff --git a/src/core/gpu_sw_rasterizer.inl b/src/core/gpu_sw_rasterizer.inl
index ebfeb4efd..be4ad94b2 100644
--- a/src/core/gpu_sw_rasterizer.inl
+++ b/src/core/gpu_sw_rasterizer.inl
@@ -9,6 +9,7 @@
 #define USE_VECTOR 1
 #define GSVECTOR_HAS_SRLV 1
+#define GSVECTOR_HAS_256 1
 
 extern GPU_SW_Rasterizer::DitherLUT g_dither_lut;
 
@@ -22,6 +23,7 @@ namespace GPU_SW_Rasterizer {
 #if 0
 static u16 s_vram_backup[VRAM_WIDTH * VRAM_HEIGHT];
 static u16 s_new_vram[VRAM_WIDTH * VRAM_HEIGHT];
+static u32 s_bad_counter = 0;
 #define BACKUP_VRAM() \
   do \
   { \
@@ -257,50 +259,138 @@ static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
 
 #else // USE_VECTOR
 
+#ifdef GSVECTOR_HAS_256
+using GSVectorNi = GSVector8i;
+static constexpr GSVector8i SPAN_OFFSET_VEC = GSVector8i::cxpr(0, 1, 2, 3, 4, 5, 6, 7);
+static constexpr GSVector8i SPAN_WIDTH_VEC = GSVector8i::cxpr(1, 2, 3, 4, 5, 6, 7, 8);
+static constexpr GSVector8i PIXELS_PER_VEC_VEC = GSVector8i::cxpr(8);
+static constexpr u32 PIXELS_PER_VEC = 8;
+#else
+using GSVectorNi = GSVector4i;
+static constexpr GSVector4i SPAN_OFFSET_VEC = GSVector4i::cxpr(0, 1, 2, 3);
+static constexpr GSVector4i SPAN_WIDTH_VEC = GSVector4i::cxpr(1, 2, 3, 4);
+static constexpr GSVector4i PIXELS_PER_VEC_VEC = GSVector4i::cxpr(4);
+static constexpr u32 PIXELS_PER_VEC = 4;
+#endif
+
+#ifdef GSVECTOR_HAS_256
+
+ALWAYS_INLINE_RELEASE static GSVector8i GatherVector(GSVector8i coord_x, GSVector8i coord_y)
+{
+  const GSVector8i offsets = coord_y.sll32<10>().add32(coord_x); // y * 1024 + x
+
+  GSVector8i pixels = GSVector8i::zext32(g_vram[static_cast<u32>(offsets.extract32<0>())]);
+  pixels = pixels.insert16<2>(g_vram[static_cast<u32>(offsets.extract32<1>())]);
+  pixels = pixels.insert16<4>(g_vram[static_cast<u32>(offsets.extract32<2>())]);
+  pixels = pixels.insert16<6>(g_vram[static_cast<u32>(offsets.extract32<3>())]);
+  pixels = pixels.insert16<8>(g_vram[static_cast<u32>(offsets.extract32<4>())]);
+  pixels = pixels.insert16<10>(g_vram[static_cast<u32>(offsets.extract32<5>())]);
+  pixels = pixels.insert16<12>(g_vram[static_cast<u32>(offsets.extract32<6>())]);
+  pixels = pixels.insert16<14>(g_vram[static_cast<u32>(offsets.extract32<7>())]);
+  return pixels;
+}
+
+template<u32 mask>
+ALWAYS_INLINE_RELEASE static GSVector8i GatherCLUTVector(GSVector8i indices, GSVector8i shifts)
+{
+  const GSVector8i offsets = indices.srlv32(shifts) & GSVector8i::cxpr(mask);
+  GSVector8i pixels = GSVector8i::zext32(g_gpu_clut[static_cast<u32>(offsets.extract32<0>())]);
+  pixels = pixels.insert16<2>(g_gpu_clut[static_cast<u32>(offsets.extract32<1>())]);
+  pixels = pixels.insert16<4>(g_gpu_clut[static_cast<u32>(offsets.extract32<2>())]);
+  pixels = pixels.insert16<6>(g_gpu_clut[static_cast<u32>(offsets.extract32<3>())]);
+  pixels = pixels.insert16<8>(g_gpu_clut[static_cast<u32>(offsets.extract32<4>())]);
+  pixels = pixels.insert16<10>(g_gpu_clut[static_cast<u32>(offsets.extract32<5>())]);
+  pixels = pixels.insert16<12>(g_gpu_clut[static_cast<u32>(offsets.extract32<6>())]);
+  pixels = pixels.insert16<14>(g_gpu_clut[static_cast<u32>(offsets.extract32<7>())]);
+  return pixels;
+}
+
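Neither SSE/AVX2 nor NEON has a 16-bit gather, so the 8-wide gathers above are assembled lane by lane: zext32 seeds 16-bit element 0, and each insert16<2*n> fills the low half of 32-bit lane n, leaving the high halves zero. Per lane this is equivalent to the following scalar fetch (an illustrative sketch, not part of the patch; GatherScalar is a hypothetical name):

// Scalar equivalent of one GatherVector lane. Offsets are texel indices,
// not byte offsets, because g_vram is indexed as u16 (hence y * 1024 + x
// rather than the old y * 2048 byte computation).
static u16 GatherScalar(u32 x, u32 y)
{
  return g_vram[(y << 10) + x]; // VRAM_WIDTH == 1024
}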
+ALWAYS_INLINE_RELEASE static GSVector8i LoadVector(u32 x, u32 y)
+{
+  // TODO: Split into high/low
+  if (x <= (VRAM_WIDTH - 8))
+  {
+    return GSVector8i::u16to32(GSVector4i::load<false>(&g_vram[y * VRAM_WIDTH + x]));
+  }
+  else
+  {
+    // TODO: Avoid loads for masked pixels if a contiguous region is masked
+    const u16* line = &g_vram[y * VRAM_WIDTH];
+    GSVector8i pixels = GSVector8i::zero();
+    pixels = pixels.insert16<0>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<2>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<4>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<6>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<8>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<10>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<12>(line[(x++) & VRAM_WIDTH_MASK]);
+    pixels = pixels.insert16<14>(line[(x++) & VRAM_WIDTH_MASK]);
+    return pixels;
+  }
+}
+
+ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector8i color)
+{
+  // TODO: Split into high/low
+  const GSVector4i packed = color.low128().pu32(color.high128());
+  if (x <= (VRAM_WIDTH - 8))
+  {
+    GSVector4i::store<false>(&g_vram[y * VRAM_WIDTH + x], packed);
+  }
+  else
+  {
+    // TODO: Avoid stores for masked pixels if a contiguous region is masked
+    u16* line = &g_vram[y * VRAM_WIDTH];
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<0>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<1>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<2>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<3>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<4>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<5>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<6>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<7>());
+  }
+}
+
+#else
+
 ALWAYS_INLINE_RELEASE static GSVector4i GatherVector(GSVector4i coord_x, GSVector4i coord_y)
 {
-  GSVector4i offsets = coord_y.sll32<11>();    // y * 2048 (1024 * sizeof(pixel))
-  offsets = offsets.add32(coord_x.sll32<1>()); // x * 2 (x * sizeof(pixel))
+  const GSVector4i offsets = coord_y.sll32<10>().add32(coord_x); // y * 1024 + x
 
-  const u32 o0 = offsets.extract32<0>();
-  const u32 o1 = offsets.extract32<1>();
-  const u32 o2 = offsets.extract32<2>();
-  const u32 o3 = offsets.extract32<3>();
-
-  // TODO: split in two, merge, maybe could be zx loaded instead..
-  u16 p0, p1, p2, p3;
-  std::memcpy(&p0, reinterpret_cast<const u8*>(g_vram) + o0, sizeof(p0));
-  std::memcpy(&p1, reinterpret_cast<const u8*>(g_vram) + o1, sizeof(p1));
-  std::memcpy(&p2, reinterpret_cast<const u8*>(g_vram) + o2, sizeof(p2));
-  std::memcpy(&p3, reinterpret_cast<const u8*>(g_vram) + o3, sizeof(p3));
-  GSVector4i pixels = GSVector4i::zext32(p0);
-  pixels = pixels.insert16<2>(p1);
-  pixels = pixels.insert16<4>(p2);
-  pixels = pixels.insert16<6>(p3);
+  // Clang seems to optimize this directly into pextrd+pinsrw, good.
+  GSVector4i pixels = GSVector4i::zext32(g_vram[static_cast<u32>(offsets.extract32<0>())]);
+  pixels = pixels.insert16<2>(g_vram[static_cast<u32>(offsets.extract32<1>())]);
+  pixels = pixels.insert16<4>(g_vram[static_cast<u32>(offsets.extract32<2>())]);
+  pixels = pixels.insert16<6>(g_vram[static_cast<u32>(offsets.extract32<3>())]);
   return pixels;
 }
 
-ALWAYS_INLINE_RELEASE static GSVector4i GatherCLUTVector(GSVector4i indices)
+template<u32 mask>
+ALWAYS_INLINE_RELEASE static GSVector4i GatherCLUTVector(GSVector4i indices, GSVector4i shifts)
 {
-  const GSVector4i offsets = indices.sll32<1>(); // x * 2 (x * sizeof(pixel))
-  const u32 o0 = offsets.extract32<0>();
-  const u32 o1 = offsets.extract32<1>();
-  const u32 o2 = offsets.extract32<2>();
-  const u32 o3 = offsets.extract32<3>();
-
-  // TODO: split in two, merge, maybe could be zx loaded instead..
-  u16 p0, p1, p2, p3;
-  std::memcpy(&p0, reinterpret_cast<const u8*>(g_gpu_clut) + o0, sizeof(p0));
-  std::memcpy(&p1, reinterpret_cast<const u8*>(g_gpu_clut) + o1, sizeof(p1));
-  std::memcpy(&p2, reinterpret_cast<const u8*>(g_gpu_clut) + o2, sizeof(p2));
-  std::memcpy(&p3, reinterpret_cast<const u8*>(g_gpu_clut) + o3, sizeof(p3));
-  GSVector4i pixels = GSVector4i::zext32(p0);
-  pixels = pixels.insert16<2>(p1);
-  pixels = pixels.insert16<4>(p2);
-  pixels = pixels.insert16<6>(p3);
-
+#ifdef GSVECTOR_HAS_SRLV
+  // Everywhere except RISC-V, we can do the shl 1 (* 2) as part of the load instruction.
+  const GSVector4i offsets = indices.srlv32(shifts) & GSVector4i::cxpr(mask);
+  GSVector4i pixels = GSVector4i::zext32(g_gpu_clut[static_cast<u32>(offsets.extract32<0>())]);
+  pixels = pixels.insert16<2>(g_gpu_clut[static_cast<u32>(offsets.extract32<1>())]);
+  pixels = pixels.insert16<4>(g_gpu_clut[static_cast<u32>(offsets.extract32<2>())]);
+  pixels = pixels.insert16<6>(g_gpu_clut[static_cast<u32>(offsets.extract32<3>())]);
   return pixels;
+#else
+  // Without variable shifts, it's probably quicker to do it without vectors.
+  // Because otherwise we have to do 4 separate vector shifts, as well as broadcasting the shifts...
+  // Clang seems to turn this into a bunch of extracts, and skips memory. Nice.
+  alignas(VECTOR_ALIGNMENT) s32 indices_array[4], shifts_array[4];
+  GSVector4i::store<true>(indices_array, indices);
+  GSVector4i::store<true>(shifts_array, shifts);
+
+  GSVector4i pixels = GSVector4i::zext32(g_gpu_clut[((indices_array[0] >> shifts_array[0]) & mask)]);
+  pixels = pixels.insert16<2>(g_gpu_clut[((indices_array[1] >> shifts_array[1]) & mask)]);
+  pixels = pixels.insert16<4>(g_gpu_clut[((indices_array[2] >> shifts_array[2]) & mask)]);
+  pixels = pixels.insert16<6>(g_gpu_clut[((indices_array[3] >> shifts_array[3]) & mask)]);
+  return pixels;
+#endif
 }
 
 ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y)
@@ -322,42 +412,45 @@ ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y)
 
 ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector4i color)
 {
+  const GSVector4i packed_color = color.pu32();
   if (x <= (VRAM_WIDTH - 4))
   {
-    GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], color);
+    GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], packed_color);
   }
   else
   {
     u16* line = &g_vram[y * VRAM_WIDTH];
-    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<0>());
-    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<1>());
-    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(color.extract16<2>());
-    line[x & VRAM_WIDTH_MASK] = Truncate16(color.extract16<3>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed_color.extract16<0>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed_color.extract16<1>());
+    line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed_color.extract16<2>());
+    line[x & VRAM_WIDTH_MASK] = Truncate16(packed_color.extract16<3>());
   }
 }
 
-ALWAYS_INLINE_RELEASE static void RGB5A1ToRG_BA(GSVector4i rgb5a1, GSVector4i& rg, GSVector4i& ba)
+#endif
+
+ALWAYS_INLINE_RELEASE static void RGB5A1ToRG_BA(GSVectorNi rgb5a1, GSVectorNi& rg, GSVectorNi& ba)
 {
-  rg = rgb5a1 & GSVector4i::cxpr(0x1F);                      // R | R | R | R
-  rg = rg | (rgb5a1 & GSVector4i::cxpr(0x3E0)).sll32<11>();  // R0G0 | R0G0 | R0G0 | R0G0
-  ba = rgb5a1.srl32<10>() & GSVector4i::cxpr(0x1F);          // B | B | B | B
-  ba = ba | (rgb5a1 & GSVector4i::cxpr(0x8000)).sll32<1>();  // B0A0 | B0A0 | B0A0 | B0A0
+  rg = rgb5a1 & GSVectorNi::cxpr(0x1F);                      // R | R | R | R
+  rg = rg | (rgb5a1 & GSVectorNi::cxpr(0x3E0)).sll32<11>();  // R0G0 | R0G0 | R0G0 | R0G0
+  ba = rgb5a1.srl32<10>() & GSVectorNi::cxpr(0x1F);          // B | B | B | B
+  ba = ba | (rgb5a1 & GSVectorNi::cxpr(0x8000)).sll32<1>();  // B0A0 | B0A0 | B0A0 | B0A0
 }
 
-ALWAYS_INLINE_RELEASE static GSVector4i RG_BAToRGB5A1(GSVector4i rg, GSVector4i ba)
+ALWAYS_INLINE_RELEASE static GSVectorNi RG_BAToRGB5A1(GSVectorNi rg, GSVectorNi ba)
 {
-  GSVector4i res;
+  GSVectorNi res;
 
-  res = rg & GSVector4i::cxpr(0x1F);                        // R | R | R | R
-  res = res | (rg.srl32<11>() & GSVector4i::cxpr(0x3E0));   // RG | RG | RG | RG
-  res = res | ((ba & GSVector4i::cxpr(0x1F)).sll32<10>());  // RGB | RGB | RGB | RGB
+  res = rg & GSVectorNi::cxpr(0x1F);                        // R | R | R | R
+  res = res | (rg.srl32<11>() & GSVectorNi::cxpr(0x3E0));   // RG | RG | RG | RG
+  res = res | ((ba & GSVectorNi::cxpr(0x1F)).sll32<10>());  // RGB | RGB | RGB | RGB
   res = res | ba.srl32<16>().sll32<15>();                   // RGBA | RGBA | RGBA | RGBA
 
   return res;
 }
 
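For reference, RGB5A1ToRG_BA and RG_BAToRGB5A1 above split a 15-bit pixel so each 5-bit channel sits in its own 16-bit half lane, leaving headroom for the texture/vertex colour modulation multiply. A scalar round trip of the same bit layout (an illustrative sketch, not part of the patch):

// rg holds R in bits 0-4 and G in bits 16-20; ba holds B in bits 0-4 and the
// mask/alpha bit (bit 15 of the source pixel) in bit 16, matching the vector
// layout above.
static void UnpackRGB5A1(u16 px, u32& rg, u32& ba)
{
  rg = (px & 0x1Fu) | ((px & 0x3E0u) << 11);
  ba = ((px >> 10) & 0x1Fu) | ((px & 0x8000u) << 1);
}

static u16 PackRGB5A1(u32 rg, u32 ba)
{
  return static_cast<u16>((rg & 0x1Fu) | ((rg >> 11) & 0x3E0u) | ((ba & 0x1Fu) << 10) |
                          ((ba >> 16) << 15));
}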
 // Color repeated twice for RG packing, then duplicated so we can load based on the X offset.
-static constexpr s16 VECTOR_DITHER_MATRIX[4][16] = {
+alignas(VECTOR_ALIGNMENT) static constexpr s16 VECTOR_DITHER_MATRIX[4][16] = {
 
 #define P(m, n) static_cast<s16>(DITHER_MATRIX[m][n]), static_cast<s16>(DITHER_MATRIX[m][n])
 #define R(m) P(m, 0), P(m, 1), P(m, 2), P(m, 3), P(m, 0), P(m, 1), P(m, 2), P(m, 3)
@@ -367,70 +460,97 @@ static constexpr s16 VECTOR_DITHER_MATRIX[4][16] = {
 #undef P
 };
 
+namespace {
+template<bool texture_enable>
+struct PixelVectors
+{
+  struct UnusedField
+  {
+  };
+
+  GSVectorNi clip_left;
+  GSVectorNi clip_right;
+
+  GSVectorNi mask_and;
+  GSVectorNi mask_or;
+
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_window_and_x;
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_window_or_x;
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_window_and_y;
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_window_or_y;
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_base_x;
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_base_y;
+
+  PixelVectors(const GPUBackendDrawCommand* cmd)
+  {
+    clip_left = GSVectorNi(g_drawing_area.left);
+    clip_right = GSVectorNi(g_drawing_area.right);
+
+    mask_and = GSVectorNi(cmd->params.GetMaskAND());
+    mask_or = GSVectorNi(cmd->params.GetMaskOR());
+
+    if constexpr (texture_enable)
+    {
+      texture_window_and_x = GSVectorNi(cmd->window.and_x);
+      texture_window_or_x = GSVectorNi(cmd->window.or_x);
+      texture_window_and_y = GSVectorNi(cmd->window.and_y);
+      texture_window_or_y = GSVectorNi(cmd->window.or_y);
+      texture_base_x = GSVectorNi(cmd->draw_mode.GetTexturePageBaseX());
+      texture_base_y = GSVectorNi(cmd->draw_mode.GetTexturePageBaseY());
+    }
+  }
+};
+} // namespace
+
 template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
 ALWAYS_INLINE_RELEASE static void
-ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vertex_color_rg, GSVector4i vertex_color_ba,
-           GSVector4i texcoord_x, GSVector4i texcoord_y, GSVector4i preserve_mask, GSVector4i dither)
+ShadePixel(const PixelVectors<texture_enable>& pv, GPUTextureMode texture_mode, GPUTransparencyMode transparency_mode,
+           u32 start_x, u32 y, GSVectorNi vertex_color_rg, GSVectorNi vertex_color_ba, GSVectorNi texcoord_x,
+           GSVectorNi texcoord_y, GSVectorNi preserve_mask, GSVectorNi dither)
 {
-  static constinit GSVector4i coord_mask_x = GSVector4i::cxpr(VRAM_WIDTH_MASK);
-  static constinit GSVector4i coord_mask_y = GSVector4i::cxpr(VRAM_HEIGHT_MASK);
+  static constexpr GSVectorNi coord_mask_x = GSVectorNi::cxpr(VRAM_WIDTH_MASK);
+  static constexpr GSVectorNi coord_mask_y = GSVectorNi::cxpr(VRAM_HEIGHT_MASK);
 
-  GSVector4i color;
+  GSVectorNi color;
 
   if constexpr (texture_enable)
   {
     // Apply texture window
-    texcoord_x = (texcoord_x & GSVector4i(cmd->window.and_x)) | GSVector4i(cmd->window.or_x);
-    texcoord_y = (texcoord_y & GSVector4i(cmd->window.and_y)) | GSVector4i(cmd->window.or_y);
+    texcoord_x = (texcoord_x & pv.texture_window_and_x) | pv.texture_window_or_x;
+    texcoord_y = (texcoord_y & pv.texture_window_and_y) | pv.texture_window_or_y;
 
-    const GSVector4i base_x = GSVector4i(cmd->draw_mode.GetTexturePageBaseX());
-    const GSVector4i base_y = GSVector4i(cmd->draw_mode.GetTexturePageBaseY());
+    texcoord_y = pv.texture_base_y.add32(texcoord_y) & coord_mask_y;
 
-    texcoord_y = base_y.add32(texcoord_y) & coord_mask_y;
-
-    GSVector4i texture_color;
-    switch (cmd->draw_mode.texture_mode)
+    GSVectorNi texture_color;
+    switch (texture_mode)
     {
       case GPUTextureMode::Palette4Bit:
       {
-        GSVector4i load_texcoord_x = texcoord_x.srl32<2>();
-        load_texcoord_x = base_x.add32(load_texcoord_x);
+        GSVectorNi load_texcoord_x = texcoord_x.srl32<2>();
+        load_texcoord_x = pv.texture_base_x.add32(load_texcoord_x);
         load_texcoord_x = load_texcoord_x & coord_mask_x;
 
-        //
todo: sse4 path - GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(3)).sll32<2>(); - GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y); -#ifdef GSVECTOR_HAS_SRLV - palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0x0F); -#else - Assert(false && "Fixme"); -#endif - - texture_color = GatherCLUTVector(palette_indices); + const GSVectorNi palette_shift = (texcoord_x & GSVectorNi::cxpr(3)).sll32<2>(); + const GSVectorNi palette_indices = GatherVector(load_texcoord_x, texcoord_y); + texture_color = GatherCLUTVector<0x0F>(palette_indices, palette_shift); } break; case GPUTextureMode::Palette8Bit: { - GSVector4i load_texcoord_x = texcoord_x.srl32<1>(); - load_texcoord_x = base_x.add32(load_texcoord_x); + GSVectorNi load_texcoord_x = texcoord_x.srl32<1>(); + load_texcoord_x = pv.texture_base_x.add32(load_texcoord_x); load_texcoord_x = load_texcoord_x & coord_mask_x; - GSVector4i palette_shift = (texcoord_x & GSVector4i::cxpr(1)).sll32<3>(); - GSVector4i palette_indices = GatherVector(load_texcoord_x, texcoord_y); -#ifdef GSVECTOR_HAS_SRLV - palette_indices = palette_indices.srlv32(palette_shift) & GSVector4i::cxpr(0xFF); -#else - Assert(false && "Fixme"); -#endif - - texture_color = GatherCLUTVector(palette_indices); + const GSVectorNi palette_shift = (texcoord_x & GSVectorNi::cxpr(1)).sll32<3>(); + const GSVectorNi palette_indices = GatherVector(load_texcoord_x, texcoord_y); + texture_color = GatherCLUTVector<0xFF>(palette_indices, palette_shift); } break; default: { - texcoord_x = base_x.add32(texcoord_x); + texcoord_x = pv.texture_base_x.add32(texcoord_x); texcoord_x = texcoord_x & coord_mask_x; texture_color = GatherVector(texcoord_x, texcoord_y); } @@ -438,7 +558,7 @@ ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vert } // check for zero texture colour across the 4 pixels, early out if so - const GSVector4i texture_transparent_mask = texture_color.eq32(GSVector4i::zero()); + const GSVectorNi texture_transparent_mask = texture_color.eq32(GSVectorNi::zero()); if (texture_transparent_mask.alltrue()) return; @@ -450,19 +570,18 @@ ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vert } else { - GSVector4i trg, tba; + GSVectorNi trg, tba; RGB5A1ToRG_BA(texture_color, trg, tba); // now we have both the texture and vertex color in RG/GA pairs, for 4 pixels, which we can multiply - GSVector4i rg = trg.mul16l(vertex_color_rg); - GSVector4i ba = tba.mul16l(vertex_color_ba); + GSVectorNi rg = trg.mul16l(vertex_color_rg); + GSVectorNi ba = tba.mul16l(vertex_color_ba); - // TODO: Dither // Convert to 5bit. if constexpr (dithering_enable) { - rg = rg.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>(); - ba = ba.sra16<4>().add16(dither).max_i16(GSVector4i::zero()).sra16<3>(); + rg = rg.sra16<4>().add16(dither).max_i16(GSVectorNi::zero()).sra16<3>(); + ba = ba.sra16<4>().add16(dither).max_i16(GSVectorNi::zero()).sra16<3>(); } else { @@ -474,7 +593,7 @@ ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vert ba = ba.blend16<0xaa>(tba); // Clamp to 5bit. - static constexpr GSVector4i colclamp = GSVector4i::cxpr16(0x1F); + static constexpr GSVectorNi colclamp = GSVectorNi::cxpr16(0x1F); rg = rg.min_u16(colclamp); ba = ba.min_u16(colclamp); @@ -487,12 +606,12 @@ ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vert // Non-textured transparent polygons don't set bit 15, but are treated as transparent. 
if constexpr (dithering_enable) { - GSVector4i rg = vertex_color_rg.add16(dither).max_i16(GSVector4i::zero()).sra16<3>(); - GSVector4i ba = vertex_color_ba.add16(dither).max_i16(GSVector4i::zero()).sra16<3>(); + GSVectorNi rg = vertex_color_rg.add16(dither).max_i16(GSVectorNi::zero()).sra16<3>(); + GSVectorNi ba = vertex_color_ba.add16(dither).max_i16(GSVectorNi::zero()).sra16<3>(); // Clamp to 5bit. We use 32bit for BA to set a to zero. - rg = rg.min_u16(GSVector4i::cxpr16(0x1F)); - ba = ba.min_u16(GSVector4i::cxpr(0x1F)); + rg = rg.min_u16(GSVectorNi::cxpr16(0x1F)); + ba = ba.min_u16(GSVectorNi::cxpr(0x1F)); // And interleave back to 16bpp. color = RG_BAToRGB5A1(rg, ba); @@ -500,17 +619,17 @@ ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vert else { // Note that bit15 is set to 0 here, which the shift will do. - const GSVector4i rg = vertex_color_rg.srl16<3>(); - const GSVector4i ba = vertex_color_ba.srl16<3>(); + const GSVectorNi rg = vertex_color_rg.srl16<3>(); + const GSVectorNi ba = vertex_color_ba.srl16<3>(); color = RG_BAToRGB5A1(rg, ba); } } - GSVector4i bg_color = LoadVector(start_x, y); + GSVectorNi bg_color = LoadVector(start_x, y); if constexpr (transparency_enable) { - [[maybe_unused]] GSVector4i transparent_mask; + [[maybe_unused]] GSVectorNi transparent_mask; if constexpr (texture_enable) { // Compute transparent_mask, ffff per lane if transparent otherwise 0000 @@ -520,78 +639,72 @@ ShadePixel(const GPUBackendDrawCommand* cmd, u32 start_x, u32 y, GSVector4i vert // TODO: We don't need to OR color here with 0x8000 for textures. // 0x8000 is added to match serial path. - GSVector4i blended_color; - switch (cmd->draw_mode.transparency_mode) + GSVectorNi blended_color; + switch (transparency_mode) { case GPUTransparencyMode::HalfBackgroundPlusHalfForeground: { - const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u); - const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u); - const GSVector4i res = fg_bits.add32(bg_bits).sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x0421u)).srl32<1>(); - blended_color = res & GSVector4i::cxpr(0xffff); + const GSVectorNi fg_bits = color | GSVectorNi::cxpr(0x8000u); + const GSVectorNi bg_bits = bg_color | GSVectorNi::cxpr(0x8000u); + const GSVectorNi res = fg_bits.add32(bg_bits).sub32((fg_bits ^ bg_bits) & GSVectorNi::cxpr(0x0421u)).srl32<1>(); + blended_color = res & GSVectorNi::cxpr(0xffff); } break; case GPUTransparencyMode::BackgroundPlusForeground: { - const GSVector4i fg_bits = color | GSVector4i::cxpr(0x8000u); - const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu); - const GSVector4i sum = fg_bits.add32(bg_bits); - const GSVector4i carry = - (sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u))) & GSVector4i::cxpr(0x8420u); - const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>()); - blended_color = res & GSVector4i::cxpr(0xffff); + const GSVectorNi fg_bits = color | GSVectorNi::cxpr(0x8000u); + const GSVectorNi bg_bits = bg_color & GSVectorNi::cxpr(0x7FFFu); + const GSVectorNi sum = fg_bits.add32(bg_bits); + const GSVectorNi carry = + (sum.sub32((fg_bits ^ bg_bits) & GSVectorNi::cxpr(0x8421u))) & GSVectorNi::cxpr(0x8420u); + const GSVectorNi res = sum.sub32(carry) | carry.sub32(carry.srl32<5>()); + blended_color = res & GSVectorNi::cxpr(0xffff); } break; case GPUTransparencyMode::BackgroundMinusForeground: { - const GSVector4i bg_bits = bg_color | GSVector4i::cxpr(0x8000u); - const GSVector4i fg_bits = color & GSVector4i::cxpr(0x7FFFu); - const GSVector4i 
diff = bg_bits.sub32(fg_bits).add32(GSVector4i::cxpr(0x108420u)); - const GSVector4i borrow = - diff.sub32((bg_bits ^ fg_bits) & GSVector4i::cxpr(0x108420u)) & GSVector4i::cxpr(0x108420u); - const GSVector4i res = diff.sub32(borrow) & borrow.sub32(borrow.srl32<5>()); - blended_color = res & GSVector4i::cxpr(0xffff); + const GSVectorNi bg_bits = bg_color | GSVectorNi::cxpr(0x8000u); + const GSVectorNi fg_bits = color & GSVectorNi::cxpr(0x7FFFu); + const GSVectorNi diff = bg_bits.sub32(fg_bits).add32(GSVectorNi::cxpr(0x108420u)); + const GSVectorNi borrow = + diff.sub32((bg_bits ^ fg_bits) & GSVectorNi::cxpr(0x108420u)) & GSVectorNi::cxpr(0x108420u); + const GSVectorNi res = diff.sub32(borrow) & borrow.sub32(borrow.srl32<5>()); + blended_color = res & GSVectorNi::cxpr(0xffff); } break; case GPUTransparencyMode::BackgroundPlusQuarterForeground: default: { - const GSVector4i bg_bits = bg_color & GSVector4i::cxpr(0x7FFFu); - const GSVector4i fg_bits = - ((color | GSVector4i::cxpr(0x8000)).srl32<2>() & GSVector4i::cxpr(0x1CE7u)) | GSVector4i::cxpr(0x8000u); - const GSVector4i sum = fg_bits.add32(bg_bits); - const GSVector4i carry = sum.sub32((fg_bits ^ bg_bits) & GSVector4i::cxpr(0x8421u)) & GSVector4i::cxpr(0x8420u); - const GSVector4i res = sum.sub32(carry) | carry.sub32(carry.srl32<5>()); - blended_color = res & GSVector4i::cxpr(0xffff); + const GSVectorNi bg_bits = bg_color & GSVectorNi::cxpr(0x7FFFu); + const GSVectorNi fg_bits = + ((color | GSVectorNi::cxpr(0x8000)).srl32<2>() & GSVectorNi::cxpr(0x1CE7u)) | GSVectorNi::cxpr(0x8000u); + const GSVectorNi sum = fg_bits.add32(bg_bits); + const GSVectorNi carry = sum.sub32((fg_bits ^ bg_bits) & GSVectorNi::cxpr(0x8421u)) & GSVectorNi::cxpr(0x8420u); + const GSVectorNi res = sum.sub32(carry) | carry.sub32(carry.srl32<5>()); + blended_color = res & GSVectorNi::cxpr(0xffff); } break; } // select blended pixels for transparent pixels, otherwise consider opaque - // TODO: SSE2 if constexpr (texture_enable) color = color.blend8(blended_color, transparent_mask); else - color = blended_color & GSVector4i::cxpr(0x7fff); + color = blended_color & GSVectorNi::cxpr(0x7fff); } - // TODO: lift out to parent? 
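The transparency cases above are SWAR blends: all three 5-bit channels are processed in one lane, per-channel carries (or borrows) are recovered at bits 5/10/15, and carry - (carry >> 5) widens each carry bit into a 0x1F saturation mask for the channel below it. A scalar transcription of the BackgroundPlusForeground case, for reference:

// Scalar form of the vector B+F blend above: saturating add of R/G/B packed
// as 5:5:5 in a single word.
static u16 BlendAddSaturate(u16 fg, u16 bg)
{
  const u32 fg_bits = fg | 0x8000u;
  const u32 bg_bits = bg & 0x7FFFu;
  const u32 sum = fg_bits + bg_bits;
  const u32 carry = (sum - ((fg_bits ^ bg_bits) & 0x8421u)) & 0x8420u;
  const u32 res = (sum - carry) | (carry - (carry >> 5));
  return static_cast<u16>(res); // e.g. carry bit 5 set -> R field forced to 0x1F
}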
- const GSVector4i mask_and = GSVector4i(cmd->params.GetMaskAND()); - const GSVector4i mask_or = GSVector4i(cmd->params.GetMaskOR()); - - GSVector4i mask_bits_set = bg_color & mask_and; // 8000 if masked else 0000 - mask_bits_set = mask_bits_set.sra16<15>(); // ffff if masked else 0000 - preserve_mask = preserve_mask | mask_bits_set; // ffff if preserved else 0000 + GSVectorNi mask_bits_set = bg_color & pv.mask_and; // 8000 if masked else 0000 + mask_bits_set = mask_bits_set.sra16<15>(); // ffff if masked else 0000 + preserve_mask = preserve_mask | mask_bits_set; // ffff if preserved else 0000 bg_color = bg_color & preserve_mask; - color = (color | mask_or).andnot(preserve_mask); + color = (color | pv.mask_or).andnot(preserve_mask); color = color | bg_color; - const GSVector4i packed_color = color.pu32(); - StoreVector(start_x, y, packed_color); + StoreVector(start_x, y, color); } template @@ -600,17 +713,16 @@ static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd) const s32 origin_x = cmd->x; const s32 origin_y = cmd->y; - const GSVector4i rgba = GSVector4i(cmd->color); // RGBA | RGBA | RGBA | RGBA - GSVector4i rg = rgba.xxxxl(); // RGRG | RGRG | RGRG | RGRG - GSVector4i ba = rgba.yyyyl(); // BABA | BABA | BABA | BABA - rg = rg.u8to16(); // R0G0 | R0G0 | R0G0 | R0G0 - ba = ba.u8to16(); // B0A0 | B0A0 | B0A0 | B0A0 + const GSVector4i rgba = GSVector4i(cmd->color); // RGBA | RGBA | RGBA | RGBA + const GSVector4i rgp = rgba.xxxxl(); // RGRG | RGRG | RGRG | RGRG + const GSVector4i bap = rgba.yyyyl(); // BABA | BABA | BABA | BABA + const GSVectorNi rg = GSVectorNi::broadcast128(rgp.u8to16()); // R0G0 | R0G0 | R0G0 | R0G0 + const GSVectorNi ba = GSVectorNi::broadcast128(bap.u8to16()); // B0A0 | B0A0 | B0A0 | B0A0 - const GSVector4i texcoord_x = GSVector4i(cmd->texcoord & 0xFF).add32(GSVector4i::cxpr(0, 1, 2, 3)); - GSVector4i texcoord_y = GSVector4i(cmd->texcoord >> 8); + const GSVectorNi texcoord_x = GSVectorNi(cmd->texcoord & 0xFF).add32(SPAN_OFFSET_VEC); + GSVectorNi texcoord_y = GSVectorNi(cmd->texcoord >> 8); - const GSVector4i clip_left = GSVector4i(g_drawing_area.left); - const GSVector4i clip_right = GSVector4i(g_drawing_area.right); + const PixelVectors pv(cmd); const u32 width = cmd->width; BACKUP_VRAM(); @@ -618,41 +730,42 @@ static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd) for (u32 offset_y = 0; offset_y < cmd->height; offset_y++) { const s32 y = origin_y + static_cast(offset_y); - if (y < static_cast(g_drawing_area.top) || y > static_cast(g_drawing_area.bottom) || - (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast(y)) & 1u))) + if (y >= static_cast(g_drawing_area.top) && y <= static_cast(g_drawing_area.bottom) && + (!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (Truncate8(static_cast(y)) & 1u))) { - continue; - } + const s32 draw_y = (y & VRAM_HEIGHT_MASK); - GSVector4i row_texcoord_x = texcoord_x; - GSVector4i xvec = GSVector4i(origin_x).add32(GSVector4i::cxpr(0, 1, 2, 3)); - GSVector4i wvec = GSVector4i(width).sub32(GSVector4i::cxpr(1, 2, 3, 4)); + GSVectorNi row_texcoord_x = texcoord_x; + GSVectorNi xvec = GSVectorNi(origin_x).add32(SPAN_OFFSET_VEC); + GSVectorNi wvec = GSVectorNi(width).sub32(SPAN_WIDTH_VEC); - for (u32 offset_x = 0; offset_x < width; offset_x += 4) - { - const s32 x = origin_x + static_cast(offset_x); - - // width test - GSVector4i preserve_mask = wvec.lt32(GSVector4i::zero()); - - // clip test, if all pixels are outside, skip - preserve_mask = preserve_mask | 
xvec.lt32(clip_left); - preserve_mask = preserve_mask | xvec.gt32(clip_right); - if (!preserve_mask.alltrue()) + for (u32 offset_x = 0; offset_x < width; offset_x += PIXELS_PER_VEC) { - ShadePixel( - cmd, x, y, rg, ba, row_texcoord_x, texcoord_y, preserve_mask, GSVector4i::zero()); + const s32 x = origin_x + static_cast(offset_x); + + // width test + GSVectorNi preserve_mask = wvec.lt32(GSVectorNi::zero()); + + // clip test, if all pixels are outside, skip + preserve_mask = preserve_mask | xvec.lt32(pv.clip_left); + preserve_mask = preserve_mask | xvec.gt32(pv.clip_right); + if (!preserve_mask.alltrue()) + { + ShadePixel( + pv, cmd->draw_mode.texture_mode, cmd->draw_mode.transparency_mode, x, draw_y, rg, ba, row_texcoord_x, + texcoord_y, preserve_mask, GSVectorNi::zero()); + } + + xvec = xvec.add32(PIXELS_PER_VEC_VEC); + wvec = wvec.sub32(PIXELS_PER_VEC_VEC); + + if constexpr (texture_enable) + row_texcoord_x = row_texcoord_x.add32(PIXELS_PER_VEC_VEC) & GSVectorNi::cxpr(0xFF); } - - xvec = xvec.add32(GSVector4i::cxpr(4)); - wvec = wvec.sub32(GSVector4i::cxpr(4)); - - if constexpr (texture_enable) - row_texcoord_x = row_texcoord_x.add32(GSVector4i::cxpr(4)) & GSVector4i::cxpr(0xFF); } if constexpr (texture_enable) - texcoord_y = texcoord_y.add32(GSVector4i::cxpr(1)) & GSVector4i::cxpr(0xFF); + texcoord_y = texcoord_y.add32(GSVectorNi::cxpr(1)) & GSVectorNi::cxpr(0xFF); } CHECK_VRAM(GPU_SW_Rasterizer::DrawRectangleFunctions[texture_enable][raw_texture_enable][transparency_enable](cmd)); @@ -761,7 +874,7 @@ struct UVStepper ALWAYS_INLINE u8 GetU() const { return Truncate8(u >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); } ALWAYS_INLINE u8 GetV() const { return Truncate8(v >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); } - ALWAYS_INLINE void SetStart(u32 ustart, u32 vstart) + ALWAYS_INLINE void Init(u32 ustart, u32 vstart) { u = (((ustart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT); v = (((vstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT); @@ -772,10 +885,24 @@ struct UVStepper u = u + steps.dudx; v = v + steps.dvdx; } - ALWAYS_INLINE void StepXY(const UVSteps& steps, s32 x_count, s32 y_count) + + ALWAYS_INLINE void StepX(const UVSteps& steps, s32 count) { - u = u + (steps.dudx * static_cast(x_count)) + (steps.dudy * static_cast(y_count)); - v = v + (steps.dvdx * static_cast(x_count)) + (steps.dvdy * static_cast(y_count)); + u = u + static_cast(static_cast(steps.dudx) * count); + v = v + static_cast(static_cast(steps.dvdx) * count); + } + + template + ALWAYS_INLINE void StepY(const UVSteps& steps) + { + u = upside_down ? (u - steps.dudy) : (u + steps.dudy); + v = upside_down ? 
(v - steps.dvdy) : (v + steps.dvdy); + } + + ALWAYS_INLINE void StepY(const UVSteps& steps, s32 count) + { + u = u + static_cast(static_cast(steps.dudy) * count); + v = v + static_cast(static_cast(steps.dvdy) * count); } }; @@ -800,7 +927,7 @@ struct RGBStepper ALWAYS_INLINE u8 GetG() const { return Truncate8(g >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); } ALWAYS_INLINE u8 GetB() const { return Truncate8(b >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); } - ALWAYS_INLINE void SetStart(u32 rstart, u32 gstart, u32 bstart) + ALWAYS_INLINE void Init(u32 rstart, u32 gstart, u32 bstart) { r = (((rstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT); g = (((gstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT); @@ -813,11 +940,27 @@ struct RGBStepper g = g + steps.dgdx; b = b + steps.dbdx; } - ALWAYS_INLINE void StepXY(const RGBSteps& steps, s32 x_count, s32 y_count) + + ALWAYS_INLINE void StepX(const RGBSteps& steps, s32 count) { - r = r + (steps.drdx * static_cast(x_count)) + (steps.drdy * static_cast(y_count)); - g = g + (steps.dgdx * static_cast(x_count)) + (steps.dgdy * static_cast(y_count)); - b = b + (steps.dbdx * static_cast(x_count)) + (steps.dbdy * static_cast(y_count)); + r = r + static_cast(static_cast(steps.drdx) * count); + g = g + static_cast(static_cast(steps.dgdx) * count); + b = b + static_cast(static_cast(steps.dbdx) * count); + } + + template + ALWAYS_INLINE void StepY(const RGBSteps& steps) + { + r = upside_down ? (r - steps.drdy) : (r + steps.drdy); + g = upside_down ? (g - steps.dgdy) : (g + steps.dgdy); + b = upside_down ? (b - steps.dbdy) : (b + steps.dbdy); + } + + ALWAYS_INLINE void StepY(const RGBSteps& steps, s32 count) + { + r = r + static_cast(static_cast(steps.drdy) * count); + g = g + static_cast(static_cast(steps.dgdy) * count); + b = b + static_cast(static_cast(steps.dbdy) * count); } }; @@ -841,9 +984,6 @@ templateparams.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast(y)) & 1u)) - return; - s32 width = x_bound - x_start; s32 current_x = TruncateGPUVertexPosition(x_start); @@ -863,9 +1003,9 @@ static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start return; if constexpr (texture_enable) - uv.StepXY(uvstep, x_start, y); + uv.StepX(uvstep, x_start); if constexpr (shading_enable) - rgb.StepXY(rgbstep, x_start, y); + rgb.StepX(rgbstep, x_start); do { @@ -880,137 +1020,6 @@ static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start } while (--width > 0); } -#else // USE_VECTOR - -template -static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, UVStepper uv, - const UVSteps& uvstep, RGBStepper rgb, const RGBSteps& rgbstep) -{ - if (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast(y)) & 1u)) - return; - - s32 w = x_bound - x_start; - s32 x = TruncateGPUVertexPosition(x_start); - - if (x < static_cast(g_drawing_area.left)) - { - const s32 delta = static_cast(g_drawing_area.left) - x; - x_start += delta; - x += delta; - w -= delta; - } - - if ((x + w) > (static_cast(g_drawing_area.right) + 1)) - w = static_cast(g_drawing_area.right) + 1 - x; - - if (w <= 0) - return; - - // TODO: Precompute. 
- - const auto clip_left = GSVector4i(g_drawing_area.left); - const auto clip_right = GSVector4i(g_drawing_area.right); - - const GSVector4i dr_dx = GSVector4i(rgbstep.drdx * 4); - const GSVector4i dg_dx = GSVector4i(rgbstep.dgdx * 4); - const GSVector4i db_dx = GSVector4i(rgbstep.dbdx * 4); - const GSVector4i du_dx = GSVector4i(uvstep.dudx * 4); - const GSVector4i dv_dx = GSVector4i(uvstep.dvdx * 4); - - // TODO: vectorize - const GSVector4i dr_dx_offset = GSVector4i(0, rgbstep.drdx, rgbstep.drdx * 2, rgbstep.drdx * 3); - const GSVector4i dg_dx_offset = GSVector4i(0, rgbstep.dgdx, rgbstep.dgdx * 2, rgbstep.dgdx * 3); - const GSVector4i db_dx_offset = GSVector4i(0, rgbstep.dbdx, rgbstep.dbdx * 2, rgbstep.dbdx * 3); - const GSVector4i du_dx_offset = GSVector4i(0, uvstep.dudx, uvstep.dudx * 2, uvstep.dudx * 3); - const GSVector4i dv_dx_offset = GSVector4i(0, uvstep.dvdx, uvstep.dvdx * 2, uvstep.dvdx * 3); - - GSVector4i dr, dg, db; - if constexpr (shading_enable) - { - dr = GSVector4i(rgb.r + rgbstep.drdx * x_start).add32(dr_dx_offset); - dg = GSVector4i(rgb.g + rgbstep.dgdx * x_start).add32(dg_dx_offset); - db = GSVector4i(rgb.b + rgbstep.dbdx * x_start).add32(db_dx_offset); - } - else - { - // precompute for flat shading - dr = GSVector4i(rgb.r >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); - dg = GSVector4i((rgb.g >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)) << 16); - db = GSVector4i(rgb.b >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); - } - - GSVector4i du = GSVector4i(uv.u + uvstep.dudx * x_start).add32(du_dx_offset); - GSVector4i dv = GSVector4i(uv.v + uvstep.dvdx * x_start).add32(dv_dx_offset); - - // TODO: Move to caller. - if constexpr (shading_enable) - { - // TODO: vectorize multiply? - dr = dr.add32(GSVector4i(rgbstep.drdy * y)); - dg = dg.add32(GSVector4i(rgbstep.dgdy * y)); - db = db.add32(GSVector4i(rgbstep.dbdy * y)); - } - - if constexpr (texture_enable) - { - du = du.add32(GSVector4i(uvstep.dudy * y)); - dv = dv.add32(GSVector4i(uvstep.dvdy * y)); - } - - const GSVector4i dither = - GSVector4i::load(&VECTOR_DITHER_MATRIX[static_cast(y) & 3][(static_cast(x) & 3) * 2]); - - GSVector4i xvec = GSVector4i(x).add32(GSVector4i::cxpr(0, 1, 2, 3)); - GSVector4i wvec = GSVector4i(w).sub32(GSVector4i::cxpr(1, 2, 3, 4)); - - for (s32 count = (w + 3) / 4; count > 0; --count) - { - // R000 | R000 | R000 | R000 - // R0G0 | R0G0 | R0G0 | R0G0 - const GSVector4i r = shading_enable ? dr.srl32() : dr; - const GSVector4i g = - shading_enable ? dg.srl32().sll32<16>() : dg; // get G into the correct position - const GSVector4i b = shading_enable ? 
db.srl32() : db; - const GSVector4i u = du.srl32(); - const GSVector4i v = dv.srl32(); - - const GSVector4i rg = r.blend16<0xAA>(g); - - // mask based on what's outside the span - auto preserve_mask = wvec.lt32(GSVector4i::zero()); - - // clip test, if all pixels are outside, skip - preserve_mask = preserve_mask | xvec.lt32(clip_left); - preserve_mask = preserve_mask | xvec.gt32(clip_right); - if (!preserve_mask.alltrue()) - { - ShadePixel( - cmd, static_cast(x), static_cast(y), rg, b, u, v, preserve_mask, dither); - } - - x += 4; - - xvec = xvec.add32(GSVector4i::cxpr(4)); - wvec = wvec.sub32(GSVector4i::cxpr(4)); - - if constexpr (shading_enable) - { - dr = dr.add32(dr_dx); - dg = dg.add32(dg_dx); - db = db.add32(db_dx); - } - - if constexpr (texture_enable) - { - du = du.add32(du_dx); - dv = dv.add32(dv_dx); - } - } -} - -#endif // USE_VECTOR - template ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCommand* cmd, const TrianglePart& tp, @@ -1028,7 +1037,18 @@ ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCo if (tp.fill_upside_down) { - while (current_y > end_y) + if (current_y <= end_y) + return; + + UVStepper luv = uv; + if constexpr (texture_enable) + luv.StepY(uvstep, current_y); + + RGBStepper lrgb = rgb; + if constexpr (shading_enable) + lrgb.StepY(rgbstep, current_y); + + do { current_y--; left_x -= left_x_step; @@ -1037,16 +1057,37 @@ ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCo const s32 y = TruncateGPUVertexPosition(current_y); if (y < static_cast(g_drawing_area.top)) break; - else if (y > static_cast(g_drawing_area.bottom)) + + // Opposite direction means we need to subtract when stepping instead of adding. + if constexpr (texture_enable) + luv.StepY(uvstep); + if constexpr (shading_enable) + lrgb.StepY(rgbstep); + + if (y > static_cast(g_drawing_area.bottom) || + (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (static_cast(current_y) & 1u))) + { continue; + } DrawSpan( - cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), uv, uvstep, rgb, rgbstep); - } + cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), luv, uvstep, lrgb, rgbstep); + } while (current_y > end_y); } else { - while (current_y < end_y) + if (current_y >= end_y) + return; + + UVStepper luv = uv; + if constexpr (texture_enable) + luv.StepY(uvstep, current_y); + + RGBStepper lrgb = rgb; + if constexpr (shading_enable) + lrgb.StepY(rgbstep, current_y); + + do { const s32 y = TruncateGPUVertexPosition(current_y); @@ -1054,19 +1095,273 @@ ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCo { break; } - else if (y >= static_cast(g_drawing_area.top)) + if (y >= static_cast(g_drawing_area.top) && + (!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (static_cast(current_y) & 1u))) { DrawSpan( - cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), uv, uvstep, rgb, rgbstep); + cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), luv, uvstep, lrgb, rgbstep); } current_y++; left_x += left_x_step; right_x += right_x_step; + + if constexpr (texture_enable) + luv.StepY(uvstep); + if constexpr (shading_enable) + lrgb.StepY(rgbstep); + } while (current_y < end_y); + } +} + +#else // USE_VECTOR + +namespace { +template +struct TriangleVectors : PixelVectors +{ + using UnusedField = PixelVectors::UnusedField; + + typename std::conditional_t drdx; + typename std::conditional_t dgdx; + typename std::conditional_t dbdx; + typename 
std::conditional_t<texture_enable, GSVectorNi, UnusedField> dudx;
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> dvdx;
+  typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> drdx_0123;
+  typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> dgdx_0123;
+  typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> dbdx_0123;
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> dudx_0123;
+  typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> dvdx_0123;
+
+  TriangleVectors(const GPUBackendDrawCommand* cmd, const UVSteps& uvstep, const RGBSteps& rgbstep)
+    : PixelVectors<texture_enable>(cmd)
+  {
+    if constexpr (shading_enable)
+    {
+      drdx = GSVectorNi(rgbstep.drdx * PIXELS_PER_VEC);
+      dgdx = GSVectorNi(rgbstep.dgdx * PIXELS_PER_VEC);
+      dbdx = GSVectorNi(rgbstep.dbdx * PIXELS_PER_VEC);
+
+      drdx_0123 = GSVectorNi(rgbstep.drdx).mul32l(SPAN_OFFSET_VEC);
+      dgdx_0123 = GSVectorNi(rgbstep.dgdx).mul32l(SPAN_OFFSET_VEC);
+      dbdx_0123 = GSVectorNi(rgbstep.dbdx).mul32l(SPAN_OFFSET_VEC);
+    }
+
+    if constexpr (texture_enable)
+    {
+      dudx = GSVectorNi(uvstep.dudx * PIXELS_PER_VEC);
+      dvdx = GSVectorNi(uvstep.dvdx * PIXELS_PER_VEC);
+      dudx_0123 = GSVectorNi(uvstep.dudx).mul32l(SPAN_OFFSET_VEC);
+      dvdx_0123 = GSVectorNi(uvstep.dvdx).mul32l(SPAN_OFFSET_VEC);
+    }
+  }
+};
+} // namespace
+
+template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
+         bool dithering_enable>
+static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, UVStepper uv,
+                     const UVSteps& uvstep, RGBStepper rgb, const RGBSteps& rgbstep,
+                     const TriangleVectors<shading_enable, texture_enable>& tv)
+{
+  s32 width = x_bound - x_start;
+  s32 current_x = TruncateGPUVertexPosition(x_start);
+
+  // Skip pixels outside of the scissor rectangle.
+  if (current_x < static_cast<s32>(g_drawing_area.left))
+  {
+    const s32 delta = static_cast<s32>(g_drawing_area.left) - current_x;
+    x_start += delta;
+    current_x += delta;
+    width -= delta;
+  }
+
+  if ((current_x + width) > (static_cast<s32>(g_drawing_area.right) + 1))
+    width = static_cast<s32>(g_drawing_area.right) + 1 - current_x;
+
+  if (width <= 0)
+    return;
+
+  GSVectorNi dr, dg, db;
+  if constexpr (shading_enable)
+  {
+    dr = GSVectorNi(rgb.r + rgbstep.drdx * x_start).add32(tv.drdx_0123);
+    dg = GSVectorNi(rgb.g + rgbstep.dgdx * x_start).add32(tv.dgdx_0123);
+    db = GSVectorNi(rgb.b + rgbstep.dbdx * x_start).add32(tv.dbdx_0123);
+  }
+  else
+  {
+    // precompute for flat shading
+    dr = GSVectorNi(rgb.r >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT));
+    dg = GSVectorNi((rgb.g >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)) << 16);
+    db = GSVectorNi(rgb.b >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT));
+  }
+
+  GSVectorNi du, dv;
+  if constexpr (texture_enable)
+  {
+    du = GSVectorNi(uv.u + uvstep.dudx * x_start).add32(tv.dudx_0123);
+    dv = GSVectorNi(uv.v + uvstep.dvdx * x_start).add32(tv.dvdx_0123);
+  }
+  else
+  {
+    // Hopefully optimized out...
+    du = GSVectorNi::zero();
+    dv = GSVectorNi::zero();
+  }
+
+  const GSVectorNi dither = GSVectorNi::broadcast128(
+    &VECTOR_DITHER_MATRIX[static_cast<u32>(y) & 3][(static_cast<u32>(current_x) & 3) * 2]);
+
+  GSVectorNi xvec = GSVectorNi(current_x).add32(SPAN_OFFSET_VEC);
+  GSVectorNi wvec = GSVectorNi(width).sub32(SPAN_WIDTH_VEC);
+
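The loop below tests span bounds and scissor per lane instead of per pixel: a lane whose width counter has gone negative, or whose x coordinate falls outside the drawing area, contributes to preserve_mask and keeps its destination pixel untouched. The per-pixel predicate it vectorizes looks like this (an illustrative sketch; PreservePixel is not part of the patch):

// Lane i holds x = current_x + i and w = width - 1 - i, so w < 0 marks lanes
// past the end of the span.
static bool PreservePixel(s32 x, s32 w)
{
  return (w < 0) || (x < static_cast<s32>(g_drawing_area.left)) ||
         (x > static_cast<s32>(g_drawing_area.right));
}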
+  for (s32 count = (width + (PIXELS_PER_VEC - 1)) / PIXELS_PER_VEC; count > 0; --count)
+  {
+    // R000 | R000 | R000 | R000
+    // R0G0 | R0G0 | R0G0 | R0G0
+    const GSVectorNi r = shading_enable ? dr.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>() : dr;
+    const GSVectorNi g =
+      shading_enable ? dg.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>().sll32<16>() : dg; // get G into the correct position
+    const GSVectorNi b = shading_enable ? db.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>() : db;
+    const GSVectorNi u = du.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>();
+    const GSVectorNi v = dv.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>();
+
+    const GSVectorNi rg = r.blend16<0xAA>(g);
+
+    // mask based on what's outside the span
+    GSVectorNi preserve_mask = wvec.lt32(GSVectorNi::zero());
+
+    // clip test, if all pixels are outside, skip
+    preserve_mask = preserve_mask | xvec.lt32(tv.clip_left);
+    preserve_mask = preserve_mask | xvec.gt32(tv.clip_right);
+    if (!preserve_mask.alltrue())
+    {
+      ShadePixel<texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
+        tv, cmd->draw_mode.texture_mode, cmd->draw_mode.transparency_mode, static_cast<u32>(current_x),
+        static_cast<u32>(y), rg, b, u, v, preserve_mask, dither);
+    }
+
+    current_x += PIXELS_PER_VEC;
+
+    xvec = xvec.add32(PIXELS_PER_VEC_VEC);
+    wvec = wvec.sub32(PIXELS_PER_VEC_VEC);
+
+    if constexpr (shading_enable)
+    {
+      dr = dr.add32(tv.drdx);
+      dg = dg.add32(tv.dgdx);
+      db = db.add32(tv.dbdx);
+    }
+
+    if constexpr (texture_enable)
+    {
+      du = du.add32(tv.dudx);
+      dv = dv.add32(tv.dvdx);
+    }
+  }
+}
+
+template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
+         bool dithering_enable>
+ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCommand* cmd, const TrianglePart& tp,
+                                                   const UVStepper& uv, const UVSteps& uvstep, const RGBStepper& rgb,
+                                                   const RGBSteps& rgbstep)
+{
+  static constexpr auto unfp_xy = [](s64 xfp) -> s32 { return static_cast<s32>(static_cast<u64>(xfp) >> 32); };
+
+  const u64 left_x_step = tp.step_x[0];
+  const u64 right_x_step = tp.step_x[1];
+  const s32 end_y = tp.end_y;
+  u64 left_x = tp.start_x[0];
+  u64 right_x = tp.start_x[1];
+  s32 current_y = tp.start_y;
+
+  if (tp.fill_upside_down)
+  {
+    if (current_y <= end_y)
+      return;
+
+    UVStepper luv = uv;
+    if constexpr (texture_enable)
+      luv.StepY(uvstep, current_y);
+
+    RGBStepper lrgb = rgb;
+    if constexpr (shading_enable)
+      lrgb.StepY(rgbstep, current_y);
+
+    const TriangleVectors<shading_enable, texture_enable> tv(cmd, uvstep, rgbstep);
+
+    do
+    {
+      current_y--;
+      left_x -= left_x_step;
+      right_x -= right_x_step;
+
+      const s32 y = TruncateGPUVertexPosition(current_y);
+      if (y < static_cast<s32>(g_drawing_area.top))
+        break;
+
+      // Opposite direction means we need to subtract when stepping instead of adding.
+ if constexpr (texture_enable) + luv.StepY(uvstep); + if constexpr (shading_enable) + lrgb.StepY(rgbstep); + + if (y > static_cast(g_drawing_area.bottom) || + (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (static_cast(current_y) & 1u))) + { + continue; + } + + DrawSpan( + cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), luv, uvstep, lrgb, rgbstep, tv); + } while (current_y > end_y); + } + else + { + if (current_y >= end_y) + return; + + UVStepper luv = uv; + if constexpr (texture_enable) + luv.StepY(uvstep, current_y); + + RGBStepper lrgb = rgb; + if constexpr (shading_enable) + lrgb.StepY(rgbstep, current_y); + + const TriangleVectors tv(cmd, uvstep, rgbstep); + + do + { + const s32 y = TruncateGPUVertexPosition(current_y); + + if (y > static_cast(g_drawing_area.bottom)) + { + break; + } + if (y >= static_cast(g_drawing_area.top) && + (!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (static_cast(current_y) & 1u))) + { + DrawSpan( + cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), luv, uvstep, lrgb, rgbstep, tv); + } + + current_y++; + left_x += left_x_step; + right_x += right_x_step; + + if constexpr (texture_enable) + luv.StepY(uvstep); + if constexpr (shading_enable) + lrgb.StepY(rgbstep); + } while (current_y < end_y); + } +} + +#endif // USE_VECTOR + template static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBackendDrawPolygonCommand::Vertex* v0, @@ -1079,31 +1374,31 @@ static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBacke #endif // Sort vertices so that v0 is the top vertex, v1 is the bottom vertex, and v2 is the side vertex. - u32 vc = 0; + u32 tl = 0; if (v1->x <= v0->x) - vc = (v2->x <= v1->x) ? 4 : 2; + tl = (v2->x <= v1->x) ? 4 : 2; else if (v2->x < v0->x) - vc = 4; + tl = 4; else - vc = 1; + tl = 1; if (v2->y < v1->y) { std::swap(v2, v1); - vc = ((vc >> 1) & 0x2) | ((vc << 1) & 0x4) | (vc & 0x1); + tl = ((tl >> 1) & 0x2) | ((tl << 1) & 0x4) | (tl & 0x1); } if (v1->y < v0->y) { std::swap(v1, v0); - vc = ((vc >> 1) & 0x1) | ((vc << 1) & 0x2) | (vc & 0x4); + tl = ((tl >> 1) & 0x1) | ((tl << 1) & 0x2) | (tl & 0x4); } if (v2->y < v1->y) { std::swap(v2, v1); - vc = ((vc >> 1) & 0x2) | ((vc << 1) & 0x4) | (vc & 0x1); + tl = ((tl >> 1) & 0x2) | ((tl << 1) & 0x4) | (tl & 0x1); } const GPUBackendDrawPolygonCommand::Vertex* vertices[3] = {v0, v1, v2}; - vc = vc >> 1; + tl = tl >> 1; // Invalid size early culling. if (static_cast(std::abs(v2->x - v0->x)) >= MAX_PRIMITIVE_WIDTH || @@ -1123,8 +1418,8 @@ static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBacke const s64 base_step = makestep_xy(v2->x - v0->x, v2->y - v0->y); const s64 bound_coord_us = (v1->y == v0->y) ? 0 : makestep_xy(v1->x - v0->x, v1->y - v0->y); const s64 bound_coord_ls = (v2->y == v1->y) ? 0 : makestep_xy(v2->x - v1->x, v2->y - v1->y); - const u32 vo = (vc != 0) ? 1 : 0; - const u32 vp = (vc == 2) ? 3 : 0; + const u32 vo = (tl != 0) ? 1 : 0; + const u32 vp = (tl == 2) ? 3 : 0; const bool right_facing = (v1->y == v0->y) ? (v1->x > v0->x) : (bound_coord_us > base_step); const u32 rfi = BoolToUInt32(right_facing); const u32 ofi = BoolToUInt32(!right_facing); @@ -1182,21 +1477,28 @@ static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBacke // Undo the start of the vertex, so that when we add the offset for each line, it starts at the beginning value. 
UVStepper uv; RGBStepper rgb; - const GPUBackendDrawPolygonCommand::Vertex* core_vertex = vertices[vc]; + const GPUBackendDrawPolygonCommand::Vertex* top_left_vertex = vertices[tl]; if constexpr (texture_enable) { - uv.SetStart(core_vertex->u, core_vertex->v); - uv.StepXY(uvstep, -core_vertex->x, -core_vertex->y); + uv.Init(top_left_vertex->u, top_left_vertex->v); + uv.StepX(uvstep, -top_left_vertex->x); + uv.StepY(uvstep, -top_left_vertex->y); } else { - // Not actually used, but shut up the compiler. Should get optimized out. uv = {}; } - rgb.SetStart(core_vertex->r, core_vertex->g, core_vertex->b); if constexpr (shading_enable) - rgb.StepXY(rgbstep, -core_vertex->x, -core_vertex->y); + { + rgb.Init(top_left_vertex->r, top_left_vertex->g, top_left_vertex->b); + rgb.StepX(rgbstep, -top_left_vertex->x); + rgb.StepY(rgbstep, -top_left_vertex->y); + } + else + { + rgb.Init(top_left_vertex->r, top_left_vertex->g, top_left_vertex->b); + } #ifdef USE_VECTOR BACKUP_VRAM();
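Taken together, the stepper changes replace the old per-span StepXY multiply-accumulate with one StepY add (or subtract, for upside-down fills) per scanline; DrawSpan then applies only the x offset once at the span start. A minimal model of the new traversal for a single interpolated attribute, using the codebase's s32 type (a sketch, not the patch's code):

// One interpolated attribute: advance by dady once per row, by dadx per pixel.
struct AttributeModel
{
  s32 a, dadx, dady;
  void StepRow(bool upside_down) { a = upside_down ? (a - dady) : (a + dady); }
  s32 SpanStart(s32 x_start) const { return a + dadx * x_start; } // what DrawSpan adds once
};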