From c46ec398dccd901e394f4d3bae6ec305ba05c018 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Thu, 26 Sep 2024 18:29:22 +1000 Subject: [PATCH] GPU: Move software fill/write/copy into rasterizer namespace --- src/core/gpu.cpp | 191 +-------------------------- src/core/gpu.h | 22 +--- src/core/gpu_hw.cpp | 1 + src/core/gpu_sw_backend.cpp | 176 +------------------------ src/core/gpu_sw_rasterizer.cpp | 44 ++++--- src/core/gpu_sw_rasterizer.h | 26 ++-- src/core/gpu_sw_rasterizer.inl | 231 +++++++++++++++++++++++++++++++++ src/core/gpu_types.h | 2 - 8 files changed, 282 insertions(+), 411 deletions(-) diff --git a/src/core/gpu.cpp b/src/core/gpu.cpp index 6de97d785..7c3723105 100644 --- a/src/core/gpu.cpp +++ b/src/core/gpu.cpp @@ -4,6 +4,7 @@ #include "gpu.h" #include "dma.h" #include "gpu_shadergen.h" +#include "gpu_sw_rasterizer.h" #include "host.h" #include "interrupt_controller.h" #include "settings.h" @@ -72,6 +73,7 @@ static void JoinScreenshotThreads(); GPU::GPU() { + GPU_SW_Rasterizer::SelectImplementation(); ResetStatistics(); } @@ -1529,195 +1531,6 @@ void GPU::ClearDisplay() DestroyDeinterlaceTextures(); } -void GPU::ReadVRAM(u32 x, u32 y, u32 width, u32 height) -{ -} - -void GPU::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color) -{ - const u16 color16 = VRAMRGBA8888ToRGBA5551(color); - const GSVector4i fill = GSVector4i(color16, color16, color16, color16, color16, color16, color16, color16); - constexpr u32 vector_width = 8; - const u32 aligned_width = Common::AlignDownPow2(width, vector_width); - - if ((x + width) <= VRAM_WIDTH && !IsInterlacedRenderingEnabled()) - { - for (u32 yoffs = 0; yoffs < height; yoffs++) - { - const u32 row = (y + yoffs) % VRAM_HEIGHT; - - u16* row_ptr = &g_vram[row * VRAM_WIDTH + x]; - u32 xoffs = 0; - for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width) - GSVector4i::store(row_ptr, fill); - for (; xoffs < width; xoffs++) - *(row_ptr++) = color16; - } - } - else if (IsInterlacedRenderingEnabled()) - { - // Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field. - if (IsCRTCScanlinePending()) - SynchronizeCRTC(); - - const u32 active_field = GetActiveLineLSB(); - if ((x + width) <= VRAM_WIDTH) - { - for (u32 yoffs = 0; yoffs < height; yoffs++) - { - const u32 row = (y + yoffs) % VRAM_HEIGHT; - if ((row & u32(1)) == active_field) - continue; - - u16* row_ptr = &g_vram[row * VRAM_WIDTH + x]; - u32 xoffs = 0; - for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width) - GSVector4i::store(row_ptr, fill); - for (; xoffs < width; xoffs++) - *(row_ptr++) = color16; - } - } - else - { - for (u32 yoffs = 0; yoffs < height; yoffs++) - { - const u32 row = (y + yoffs) % VRAM_HEIGHT; - if ((row & u32(1)) == active_field) - continue; - - u16* row_ptr = &g_vram[row * VRAM_WIDTH]; - for (u32 xoffs = 0; xoffs < width; xoffs++) - { - const u32 col = (x + xoffs) % VRAM_WIDTH; - row_ptr[col] = color16; - } - } - } - } - else - { - for (u32 yoffs = 0; yoffs < height; yoffs++) - { - const u32 row = (y + yoffs) % VRAM_HEIGHT; - u16* row_ptr = &g_vram[row * VRAM_WIDTH]; - for (u32 xoffs = 0; xoffs < width; xoffs++) - { - const u32 col = (x + xoffs) % VRAM_WIDTH; - row_ptr[col] = color16; - } - } - } -} - -void GPU::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask) -{ - // Fast path when the copy is not oversized. - if ((x + width) <= VRAM_WIDTH && (y + height) <= VRAM_HEIGHT && !set_mask && !check_mask) - { - const u16* src_ptr = static_cast(data); - u16* dst_ptr = &g_vram[y * VRAM_WIDTH + x]; - for (u32 yoffs = 0; yoffs < height; yoffs++) - { - std::copy_n(src_ptr, width, dst_ptr); - src_ptr += width; - dst_ptr += VRAM_WIDTH; - } - } - else - { - // Slow path when we need to handle wrap-around. - // During transfer/render operations, if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or } - const u16* src_ptr = static_cast(data); - const u16 mask_and = check_mask ? 0x8000 : 0; - const u16 mask_or = set_mask ? 0x8000 : 0; - - for (u32 row = 0; row < height;) - { - u16* dst_row_ptr = &g_vram[((y + row++) % VRAM_HEIGHT) * VRAM_WIDTH]; - for (u32 col = 0; col < width;) - { - // TODO: Handle unaligned reads... - u16* pixel_ptr = &dst_row_ptr[(x + col++) % VRAM_WIDTH]; - if (((*pixel_ptr) & mask_and) == 0) - *pixel_ptr = *(src_ptr++) | mask_or; - } - } - } -} - -void GPU::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) -{ - // Break up oversized copies. This behavior has not been verified on console. - if ((src_x + width) > VRAM_WIDTH || (dst_x + width) > VRAM_WIDTH) - { - u32 remaining_rows = height; - u32 current_src_y = src_y; - u32 current_dst_y = dst_y; - while (remaining_rows > 0) - { - const u32 rows_to_copy = - std::min(remaining_rows, std::min(VRAM_HEIGHT - current_src_y, VRAM_HEIGHT - current_dst_y)); - - u32 remaining_columns = width; - u32 current_src_x = src_x; - u32 current_dst_x = dst_x; - while (remaining_columns > 0) - { - const u32 columns_to_copy = - std::min(remaining_columns, std::min(VRAM_WIDTH - current_src_x, VRAM_WIDTH - current_dst_x)); - CopyVRAM(current_src_x, current_src_y, current_dst_x, current_dst_y, columns_to_copy, rows_to_copy); - current_src_x = (current_src_x + columns_to_copy) % VRAM_WIDTH; - current_dst_x = (current_dst_x + columns_to_copy) % VRAM_WIDTH; - remaining_columns -= columns_to_copy; - } - - current_src_y = (current_src_y + rows_to_copy) % VRAM_HEIGHT; - current_dst_y = (current_dst_y + rows_to_copy) % VRAM_HEIGHT; - remaining_rows -= rows_to_copy; - } - - return; - } - - // This doesn't have a fast path, but do we really need one? It's not common. - const u16 mask_and = m_GPUSTAT.GetMaskAND(); - const u16 mask_or = m_GPUSTAT.GetMaskOR(); - - // Copy in reverse when src_x < dst_x, this is verified on console. - if (src_x < dst_x || ((src_x + width - 1) % VRAM_WIDTH) < ((dst_x + width - 1) % VRAM_WIDTH)) - { - for (u32 row = 0; row < height; row++) - { - const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH]; - u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH]; - - for (s32 col = static_cast(width - 1); col >= 0; col--) - { - const u16 src_pixel = src_row_ptr[(src_x + static_cast(col)) % VRAM_WIDTH]; - u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + static_cast(col)) % VRAM_WIDTH]; - if ((*dst_pixel_ptr & mask_and) == 0) - *dst_pixel_ptr = src_pixel | mask_or; - } - } - } - else - { - for (u32 row = 0; row < height; row++) - { - const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH]; - u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH]; - - for (u32 col = 0; col < width; col++) - { - const u16 src_pixel = src_row_ptr[(src_x + col) % VRAM_WIDTH]; - u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + col) % VRAM_WIDTH]; - if ((*dst_pixel_ptr & mask_and) == 0) - *dst_pixel_ptr = src_pixel | mask_or; - } - } - } -} - void GPU::SetClampedDrawingArea() { if (m_drawing_area.left > m_drawing_area.right || m_drawing_area.top > m_drawing_area.bottom) [[unlikely]] diff --git a/src/core/gpu.h b/src/core/gpu.h index fa76559f8..03283ca13 100644 --- a/src/core/gpu.h +++ b/src/core/gpu.h @@ -322,10 +322,10 @@ protected: bool IsCLUTValid() const; // Rendering in the backend - virtual void ReadVRAM(u32 x, u32 y, u32 width, u32 height); - virtual void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color); - virtual void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask); - virtual void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height); + virtual void ReadVRAM(u32 x, u32 y, u32 width, u32 height) = 0; + virtual void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color) = 0; + virtual void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask) = 0; + virtual void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) = 0; virtual void DispatchRenderCommand() = 0; virtual void UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit) = 0; virtual void UpdateDisplay() = 0; @@ -416,6 +416,8 @@ protected: union GPUSTAT { + // During transfer/render operations, if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or } + u32 bits; BitField texture_page_x_base; BitField texture_page_y_base; @@ -459,18 +461,6 @@ protected: static constexpr u32 ACTIVE = (1 << 19) | (1 << 22); return ((bits & ACTIVE) == ACTIVE); } - - // During transfer/render operations, if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or } - ALWAYS_INLINE u16 GetMaskAND() const - { - // return check_mask_before_draw ? 0x8000 : 0x0000; - return Truncate16((bits << 3) & 0x8000); - } - ALWAYS_INLINE u16 GetMaskOR() const - { - // return set_mask_while_drawing ? 0x8000 : 0x0000; - return Truncate16((bits << 4) & 0x8000); - } } m_GPUSTAT = {}; struct DrawMode diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp index f01fa0d2d..ea0f24d48 100644 --- a/src/core/gpu_hw.cpp +++ b/src/core/gpu_hw.cpp @@ -6,6 +6,7 @@ #include "cpu_pgxp.h" #include "gpu_hw_shadergen.h" #include "gpu_sw_backend.h" +#include "gpu_sw_rasterizer.h" #include "host.h" #include "settings.h" #include "system.h" diff --git a/src/core/gpu_sw_backend.cpp b/src/core/gpu_sw_backend.cpp index 6168f0d39..57e5e5196 100644 --- a/src/core/gpu_sw_backend.cpp +++ b/src/core/gpu_sw_backend.cpp @@ -16,8 +16,6 @@ GPU_SW_Backend::~GPU_SW_Backend() = default; bool GPU_SW_Backend::Initialize(bool force_thread) { - GPU_SW_Rasterizer::SelectImplementation(); - return GPUBackend::Initialize(force_thread); } @@ -59,186 +57,20 @@ void GPU_SW_Backend::DrawLine(const GPUBackendDrawLineCommand* cmd) void GPU_SW_Backend::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params) { - const u16 color16 = VRAMRGBA8888ToRGBA5551(color); - const GSVector4i fill = GSVector4i(color16, color16, color16, color16, color16, color16, color16, color16); - constexpr u32 vector_width = 8; - const u32 aligned_width = Common::AlignDownPow2(width, vector_width); - - if ((x + width) <= VRAM_WIDTH && !params.interlaced_rendering) - { - for (u32 yoffs = 0; yoffs < height; yoffs++) - { - const u32 row = (y + yoffs) % VRAM_HEIGHT; - - u16* row_ptr = &g_vram[row * VRAM_WIDTH + x]; - u32 xoffs = 0; - for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width) - GSVector4i::store(row_ptr, fill); - for (; xoffs < width; xoffs++) - *(row_ptr++) = color16; - } - } - else if (params.interlaced_rendering) - { - // Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field. - const u32 active_field = params.active_line_lsb; - - if ((x + width) <= VRAM_WIDTH) - { - for (u32 yoffs = 0; yoffs < height; yoffs++) - { - const u32 row = (y + yoffs) % VRAM_HEIGHT; - if ((row & u32(1)) == active_field) - continue; - - u16* row_ptr = &g_vram[row * VRAM_WIDTH + x]; - u32 xoffs = 0; - for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width) - GSVector4i::store(row_ptr, fill); - for (; xoffs < width; xoffs++) - *(row_ptr++) = color16; - } - } - else - { - for (u32 yoffs = 0; yoffs < height; yoffs++) - { - const u32 row = (y + yoffs) % VRAM_HEIGHT; - if ((row & u32(1)) == active_field) - continue; - - u16* row_ptr = &g_vram[row * VRAM_WIDTH]; - for (u32 xoffs = 0; xoffs < width; xoffs++) - { - const u32 col = (x + xoffs) % VRAM_WIDTH; - row_ptr[col] = color16; - } - } - } - } - else - { - for (u32 yoffs = 0; yoffs < height; yoffs++) - { - const u32 row = (y + yoffs) % VRAM_HEIGHT; - u16* row_ptr = &g_vram[row * VRAM_WIDTH]; - for (u32 xoffs = 0; xoffs < width; xoffs++) - { - const u32 col = (x + xoffs) % VRAM_WIDTH; - row_ptr[col] = color16; - } - } - } + GPU_SW_Rasterizer::FillVRAM(x, y, width, height, color, params.interlaced_rendering, params.active_line_lsb); } void GPU_SW_Backend::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, GPUBackendCommandParameters params) { - // Fast path when the copy is not oversized. - if ((x + width) <= VRAM_WIDTH && (y + height) <= VRAM_HEIGHT && !params.IsMaskingEnabled()) - { - const u16* src_ptr = static_cast(data); - u16* dst_ptr = &g_vram[y * VRAM_WIDTH + x]; - for (u32 yoffs = 0; yoffs < height; yoffs++) - { - std::copy_n(src_ptr, width, dst_ptr); - src_ptr += width; - dst_ptr += VRAM_WIDTH; - } - } - else - { - // Slow path when we need to handle wrap-around. - const u16* src_ptr = static_cast(data); - const u16 mask_and = params.GetMaskAND(); - const u16 mask_or = params.GetMaskOR(); - - for (u32 row = 0; row < height;) - { - u16* dst_row_ptr = &g_vram[((y + row++) % VRAM_HEIGHT) * VRAM_WIDTH]; - for (u32 col = 0; col < width;) - { - // TODO: Handle unaligned reads... - u16* pixel_ptr = &dst_row_ptr[(x + col++) % VRAM_WIDTH]; - if (((*pixel_ptr) & mask_and) == 0) - *pixel_ptr = *(src_ptr++) | mask_or; - } - } - } + GPU_SW_Rasterizer::WriteVRAM(x, y, width, height, data, params.set_mask_while_drawing, params.check_mask_before_draw); } void GPU_SW_Backend::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height, GPUBackendCommandParameters params) { - // Break up oversized copies. This behavior has not been verified on console. - if ((src_x + width) > VRAM_WIDTH || (dst_x + width) > VRAM_WIDTH) - { - u32 remaining_rows = height; - u32 current_src_y = src_y; - u32 current_dst_y = dst_y; - while (remaining_rows > 0) - { - const u32 rows_to_copy = - std::min(remaining_rows, std::min(VRAM_HEIGHT - current_src_y, VRAM_HEIGHT - current_dst_y)); - - u32 remaining_columns = width; - u32 current_src_x = src_x; - u32 current_dst_x = dst_x; - while (remaining_columns > 0) - { - const u32 columns_to_copy = - std::min(remaining_columns, std::min(VRAM_WIDTH - current_src_x, VRAM_WIDTH - current_dst_x)); - CopyVRAM(current_src_x, current_src_y, current_dst_x, current_dst_y, columns_to_copy, rows_to_copy, params); - current_src_x = (current_src_x + columns_to_copy) % VRAM_WIDTH; - current_dst_x = (current_dst_x + columns_to_copy) % VRAM_WIDTH; - remaining_columns -= columns_to_copy; - } - - current_src_y = (current_src_y + rows_to_copy) % VRAM_HEIGHT; - current_dst_y = (current_dst_y + rows_to_copy) % VRAM_HEIGHT; - remaining_rows -= rows_to_copy; - } - - return; - } - - // This doesn't have a fast path, but do we really need one? It's not common. - const u16 mask_and = params.GetMaskAND(); - const u16 mask_or = params.GetMaskOR(); - - // Copy in reverse when src_x < dst_x, this is verified on console. - if (src_x < dst_x || ((src_x + width - 1) % VRAM_WIDTH) < ((dst_x + width - 1) % VRAM_WIDTH)) - { - for (u32 row = 0; row < height; row++) - { - const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH]; - u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH]; - - for (s32 col = static_cast(width - 1); col >= 0; col--) - { - const u16 src_pixel = src_row_ptr[(src_x + static_cast(col)) % VRAM_WIDTH]; - u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + static_cast(col)) % VRAM_WIDTH]; - if ((*dst_pixel_ptr & mask_and) == 0) - *dst_pixel_ptr = src_pixel | mask_or; - } - } - } - else - { - for (u32 row = 0; row < height; row++) - { - const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH]; - u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH]; - - for (u32 col = 0; col < width; col++) - { - const u16 src_pixel = src_row_ptr[(src_x + col) % VRAM_WIDTH]; - u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + col) % VRAM_WIDTH]; - if ((*dst_pixel_ptr & mask_and) == 0) - *dst_pixel_ptr = src_pixel | mask_or; - } - } - } + GPU_SW_Rasterizer::CopyVRAM(src_x, src_y, dst_x, dst_y, width, height, params.set_mask_while_drawing, + params.check_mask_before_draw); } void GPU_SW_Backend::UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit) diff --git a/src/core/gpu_sw_rasterizer.cpp b/src/core/gpu_sw_rasterizer.cpp index 816cf3076..87fd65134 100644 --- a/src/core/gpu_sw_rasterizer.cpp +++ b/src/core/gpu_sw_rasterizer.cpp @@ -12,11 +12,6 @@ LOG_CHANNEL(GPU_SW_Rasterizer); namespace GPU_SW_Rasterizer { -// Default implementation, compatible with all ISAs. -extern const DrawRectangleFunctionTable DrawRectangleFunctions; -extern const DrawTriangleFunctionTable DrawTriangleFunctions; -extern const DrawLineFunctionTable DrawLineFunctions; - constinit const DitherLUT g_dither_lut = []() constexpr { DitherLUT lut = {}; for (u32 i = 0; i < DITHER_MATRIX_SIZE; i++) @@ -33,30 +28,33 @@ constinit const DitherLUT g_dither_lut = []() constexpr { return lut; }(); +const DrawRectangleFunctionTable* DrawRectangleFunctions = nullptr; +const DrawTriangleFunctionTable* DrawTriangleFunctions = nullptr; +const DrawLineFunctionTable* DrawLineFunctions = nullptr; +FillVRAMFunction FillVRAM = nullptr; +WriteVRAMFunction WriteVRAM = nullptr; +CopyVRAMFunction CopyVRAM = nullptr; GPUDrawingArea g_drawing_area = {}; } // namespace GPU_SW_Rasterizer -// Default implementation definitions. -namespace GPU_SW_Rasterizer { +// Default scalar implementation definitions. +namespace GPU_SW_Rasterizer::Scalar { +namespace { #include "gpu_sw_rasterizer.inl" } +} // namespace GPU_SW_Rasterizer::Scalar // Default vector implementation definitions. #if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON) namespace GPU_SW_Rasterizer::SIMD { +namespace { #define USE_VECTOR 1 #include "gpu_sw_rasterizer.inl" #undef USE_VECTOR +} // namespace } // namespace GPU_SW_Rasterizer::SIMD #endif -// Initialize with default implementation. -namespace GPU_SW_Rasterizer { -const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions = &DrawRectangleFunctions; -const DrawTriangleFunctionTable* SelectedDrawTriangleFunctions = &DrawTriangleFunctions; -const DrawLineFunctionTable* SelectedDrawLineFunctions = &DrawLineFunctions; -} // namespace GPU_SW_Rasterizer - // Declare alternative implementations. void GPU_SW_Rasterizer::SelectImplementation() { @@ -66,13 +64,16 @@ void GPU_SW_Rasterizer::SelectImplementation() selected = true; -#define SELECT_ALTERNATIVE_RASTERIZER(isa) \ +#define SELECT_IMPLEMENTATION(isa) \ do \ { \ INFO_LOG("Using " #isa " software rasterizer implementation."); \ - SelectedDrawRectangleFunctions = &isa::DrawRectangleFunctions; \ - SelectedDrawTriangleFunctions = &isa::DrawTriangleFunctions; \ - SelectedDrawLineFunctions = &isa::DrawLineFunctions; \ + DrawRectangleFunctions = &isa::DrawRectangleFunctions; \ + DrawTriangleFunctions = &isa::DrawTriangleFunctions; \ + DrawLineFunctions = &isa::DrawLineFunctions; \ + FillVRAM = &isa::FillVRAMImpl; \ + WriteVRAM = &isa::WriteVRAMImpl; \ + CopyVRAM = &isa::CopyVRAMImpl; \ } while (0) #if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON) @@ -83,19 +84,20 @@ void GPU_SW_Rasterizer::SelectImplementation() #if defined(CPU_ARCH_SSE) && defined(_MSC_VER) && 0 if (cpuinfo_has_x86_avx2() && (!use_isa || StringUtil::Strcasecmp(use_isa, "AVX2") == 0)) { - SELECT_ALTERNATIVE_RASTERIZER(AVX2); + SELECT_IMPLEMENTATION(AVX2); return; } #endif if (!use_isa || StringUtil::Strcasecmp(use_isa, "SIMD") == 0) { - SELECT_ALTERNATIVE_RASTERIZER(SIMD); + SELECT_IMPLEMENTATION(SIMD); return; } #endif INFO_LOG("Using scalar software rasterizer implementation."); + SELECT_IMPLEMENTATION(Scalar); -#undef SELECT_ALTERNATIVE_RASTERIZER +#undef SELECT_IMPLEMENTATION } diff --git a/src/core/gpu_sw_rasterizer.h b/src/core/gpu_sw_rasterizer.h index a3eebe062..3861d9945 100644 --- a/src/core/gpu_sw_rasterizer.h +++ b/src/core/gpu_sw_rasterizer.h @@ -34,34 +34,38 @@ using DrawLineFunction = void (*)(const GPUBackendDrawLineCommand* cmd, const GP const GPUBackendDrawLineCommand::Vertex* p1); typedef const DrawLineFunction DrawLineFunctionTable[2][2]; -// Default implementation, compatible with all ISAs. -extern const DrawRectangleFunctionTable DrawRectangleFunctions; -extern const DrawTriangleFunctionTable DrawTriangleFunctions; -extern const DrawLineFunctionTable DrawLineFunctions; +using FillVRAMFunction = void (*)(u32 x, u32 y, u32 width, u32 height, u32 color, bool interlaced, u8 active_line_lsb); +using WriteVRAMFunction = void (*)(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, + bool check_mask); +using CopyVRAMFunction = void (*)(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height, bool set_mask, + bool check_mask); // Current implementation, selected at runtime. -extern const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions; -extern const DrawTriangleFunctionTable* SelectedDrawTriangleFunctions; -extern const DrawLineFunctionTable* SelectedDrawLineFunctions; +extern const DrawRectangleFunctionTable* DrawRectangleFunctions; +extern const DrawTriangleFunctionTable* DrawTriangleFunctions; +extern const DrawLineFunctionTable* DrawLineFunctions; +extern FillVRAMFunction FillVRAM; +extern WriteVRAMFunction WriteVRAM; +extern CopyVRAMFunction CopyVRAM; extern void SelectImplementation(); ALWAYS_INLINE static DrawLineFunction GetDrawLineFunction(bool shading_enable, bool transparency_enable) { - return (*SelectedDrawLineFunctions)[u8(shading_enable)][u8(transparency_enable)]; + return (*DrawLineFunctions)[u8(shading_enable)][u8(transparency_enable)]; } ALWAYS_INLINE static DrawRectangleFunction GetDrawRectangleFunction(bool texture_enable, bool raw_texture_enable, bool transparency_enable) { - return (*SelectedDrawRectangleFunctions)[u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)]; + return (*DrawRectangleFunctions)[u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)]; } ALWAYS_INLINE static DrawTriangleFunction GetDrawTriangleFunction(bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable) { - return (*SelectedDrawTriangleFunctions)[u8(shading_enable)][u8(texture_enable)][u8(raw_texture_enable)] - [u8(transparency_enable)]; + return ( + *DrawTriangleFunctions)[u8(shading_enable)][u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)]; } #define DECLARE_ALTERNATIVE_RASTERIZER(isa) \ diff --git a/src/core/gpu_sw_rasterizer.inl b/src/core/gpu_sw_rasterizer.inl index b49d344f7..0a1ed95e3 100644 --- a/src/core/gpu_sw_rasterizer.inl +++ b/src/core/gpu_sw_rasterizer.inl @@ -1519,6 +1519,237 @@ constinit const DrawTriangleFunctionTable DrawTriangleFunctions = { {{&DrawTriangle, &DrawTriangle}, {&DrawTriangle, &DrawTriangle}}}}; +static void FillVRAMImpl(u32 x, u32 y, u32 width, u32 height, u32 color, bool interlaced, u8 active_line_lsb) +{ +#ifdef USE_VECTOR + const u16 color16 = VRAMRGBA8888ToRGBA5551(color); + const GSVector4i fill = GSVector4i(color16, color16, color16, color16, color16, color16, color16, color16); + constexpr u32 vector_width = 8; + const u32 aligned_width = Common::AlignDownPow2(width, vector_width); + + if ((x + width) <= VRAM_WIDTH && !interlaced) + { + for (u32 yoffs = 0; yoffs < height; yoffs++) + { + const u32 row = (y + yoffs) % VRAM_HEIGHT; + + u16* row_ptr = &g_vram[row * VRAM_WIDTH + x]; + u32 xoffs = 0; + for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width) + GSVector4i::store(row_ptr, fill); + for (; xoffs < width; xoffs++) + *(row_ptr++) = color16; + } + } + else if (interlaced) + { + // Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field. + const u32 active_field = active_line_lsb; + + if ((x + width) <= VRAM_WIDTH) + { + for (u32 yoffs = 0; yoffs < height; yoffs++) + { + const u32 row = (y + yoffs) % VRAM_HEIGHT; + if ((row & u32(1)) == active_field) + continue; + + u16* row_ptr = &g_vram[row * VRAM_WIDTH + x]; + u32 xoffs = 0; + for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width) + GSVector4i::store(row_ptr, fill); + for (; xoffs < width; xoffs++) + *(row_ptr++) = color16; + } + } + else + { + for (u32 yoffs = 0; yoffs < height; yoffs++) + { + const u32 row = (y + yoffs) % VRAM_HEIGHT; + if ((row & u32(1)) == active_field) + continue; + + u16* row_ptr = &g_vram[row * VRAM_WIDTH]; + for (u32 xoffs = 0; xoffs < width; xoffs++) + { + const u32 col = (x + xoffs) % VRAM_WIDTH; + row_ptr[col] = color16; + } + } + } + } + else + { + for (u32 yoffs = 0; yoffs < height; yoffs++) + { + const u32 row = (y + yoffs) % VRAM_HEIGHT; + u16* row_ptr = &g_vram[row * VRAM_WIDTH]; + for (u32 xoffs = 0; xoffs < width; xoffs++) + { + const u32 col = (x + xoffs) % VRAM_WIDTH; + row_ptr[col] = color16; + } + } + } +#else + const u16 color16 = VRAMRGBA8888ToRGBA5551(color); + if ((x + width) <= VRAM_WIDTH && !interlaced) + { + for (u32 yoffs = 0; yoffs < height; yoffs++) + { + const u32 row = (y + yoffs) % VRAM_HEIGHT; + std::fill_n(&g_vram[row * VRAM_WIDTH + x], width, color16); + } + } + else if (interlaced) + { + // Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field. + const u32 active_field = active_line_lsb; + + for (u32 yoffs = 0; yoffs < height; yoffs++) + { + const u32 row = (y + yoffs) % VRAM_HEIGHT; + if ((row & u32(1)) == active_field) + continue; + + u16* row_ptr = &g_vram[row * VRAM_WIDTH]; + for (u32 xoffs = 0; xoffs < width; xoffs++) + { + const u32 col = (x + xoffs) % VRAM_WIDTH; + row_ptr[col] = color16; + } + } + } + else + { + for (u32 yoffs = 0; yoffs < height; yoffs++) + { + const u32 row = (y + yoffs) % VRAM_HEIGHT; + u16* row_ptr = &g_vram[row * VRAM_WIDTH]; + for (u32 xoffs = 0; xoffs < width; xoffs++) + { + const u32 col = (x + xoffs) % VRAM_WIDTH; + row_ptr[col] = color16; + } + } + } +#endif +} + +static void WriteVRAMImpl(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask) +{ + // TODO: Vector implementation + + // Fast path when the copy is not oversized. + if ((x + width) <= VRAM_WIDTH && (y + height) <= VRAM_HEIGHT && !set_mask && !check_mask) + { + const u16* src_ptr = static_cast(data); + u16* dst_ptr = &g_vram[y * VRAM_WIDTH + x]; + for (u32 yoffs = 0; yoffs < height; yoffs++) + { + std::copy_n(src_ptr, width, dst_ptr); + src_ptr += width; + dst_ptr += VRAM_WIDTH; + } + } + else + { + // Slow path when we need to handle wrap-around. + // During transfer/render operations, if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or } + const u16* src_ptr = static_cast(data); + const u16 mask_and = check_mask ? 0x8000u : 0x0000u; + const u16 mask_or = set_mask ? 0x8000u : 0x0000u; + + for (u32 row = 0; row < height;) + { + u16* dst_row_ptr = &g_vram[((y + row++) % VRAM_HEIGHT) * VRAM_WIDTH]; + for (u32 col = 0; col < width;) + { + // TODO: Handle unaligned reads... + u16* pixel_ptr = &dst_row_ptr[(x + col++) % VRAM_WIDTH]; + if (((*pixel_ptr) & mask_and) == 0) + *pixel_ptr = *(src_ptr++) | mask_or; + } + } + } +} + +static void CopyVRAMImpl(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height, bool set_mask, + bool check_mask) +{ + // TODO: Vector implementation. + + // Break up oversized copies. This behavior has not been verified on console. + if ((src_x + width) > VRAM_WIDTH || (dst_x + width) > VRAM_WIDTH) + { + u32 remaining_rows = height; + u32 current_src_y = src_y; + u32 current_dst_y = dst_y; + while (remaining_rows > 0) + { + const u32 rows_to_copy = + std::min(remaining_rows, std::min(VRAM_HEIGHT - current_src_y, VRAM_HEIGHT - current_dst_y)); + + u32 remaining_columns = width; + u32 current_src_x = src_x; + u32 current_dst_x = dst_x; + while (remaining_columns > 0) + { + const u32 columns_to_copy = + std::min(remaining_columns, std::min(VRAM_WIDTH - current_src_x, VRAM_WIDTH - current_dst_x)); + CopyVRAM(current_src_x, current_src_y, current_dst_x, current_dst_y, columns_to_copy, rows_to_copy, set_mask, + check_mask); + current_src_x = (current_src_x + columns_to_copy) % VRAM_WIDTH; + current_dst_x = (current_dst_x + columns_to_copy) % VRAM_WIDTH; + remaining_columns -= columns_to_copy; + } + + current_src_y = (current_src_y + rows_to_copy) % VRAM_HEIGHT; + current_dst_y = (current_dst_y + rows_to_copy) % VRAM_HEIGHT; + remaining_rows -= rows_to_copy; + } + + return; + } + + // This doesn't have a fast path, but do we really need one? It's not common. + const u16 mask_and = check_mask ? 0x8000u : 0x0000u; + const u16 mask_or = set_mask ? 0x8000u : 0x0000u; + + // Copy in reverse when src_x < dst_x, this is verified on console. + if (src_x < dst_x || ((src_x + width - 1) % VRAM_WIDTH) < ((dst_x + width - 1) % VRAM_WIDTH)) + { + for (u32 row = 0; row < height; row++) + { + const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH]; + u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH]; + + for (s32 col = static_cast(width - 1); col >= 0; col--) + { + const u16 src_pixel = src_row_ptr[(src_x + static_cast(col)) % VRAM_WIDTH]; + u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + static_cast(col)) % VRAM_WIDTH]; + *dst_pixel_ptr = ((*dst_pixel_ptr & mask_and) == 0) ? (src_pixel | mask_or) : *dst_pixel_ptr; + } + } + } + else + { + for (u32 row = 0; row < height; row++) + { + const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH]; + u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH]; + + for (u32 col = 0; col < width; col++) + { + const u16 src_pixel = src_row_ptr[(src_x + col) % VRAM_WIDTH]; + u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + col) % VRAM_WIDTH]; + *dst_pixel_ptr = ((*dst_pixel_ptr & mask_and) == 0) ? (src_pixel | mask_or) : *dst_pixel_ptr; + } + } + } +} + #ifdef __INTELLISENSE__ } #endif diff --git a/src/core/gpu_types.h b/src/core/gpu_types.h index 99fa8932f..95e153e5e 100644 --- a/src/core/gpu_types.h +++ b/src/core/gpu_types.h @@ -275,8 +275,6 @@ union GPUBackendCommandParameters BitField set_mask_while_drawing; BitField check_mask_before_draw; - ALWAYS_INLINE bool IsMaskingEnabled() const { return (bits & 12u) != 0u; } - // During transfer/render operations, if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or } u16 GetMaskAND() const {