From d5b9b54a6989235e75253312fc16515f7267a235 Mon Sep 17 00:00:00 2001
From: Stenzek
Date: Fri, 29 Nov 2024 18:05:49 +1000
Subject: [PATCH] GPU/SW: Vectorize VRAM writes/copies

---
 src/core/gpu_sw_rasterizer.inl | 86 ++++++++++++++++++++++++++++++----
 1 file changed, 78 insertions(+), 8 deletions(-)

diff --git a/src/core/gpu_sw_rasterizer.inl b/src/core/gpu_sw_rasterizer.inl
index 3343862a9..803dbfdef 100644
--- a/src/core/gpu_sw_rasterizer.inl
+++ b/src/core/gpu_sw_rasterizer.inl
@@ -1639,8 +1639,6 @@ static void FillVRAMImpl(u32 x, u32 y, u32 width, u32 height, u32 color, bool in
 
 static void WriteVRAMImpl(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask)
 {
-  // TODO: Vector implementation
-
   // Fast path when the copy is not oversized.
   if ((x + width) <= VRAM_WIDTH && (y + height) <= VRAM_HEIGHT && !set_mask && !check_mask)
   {
@@ -1661,10 +1659,49 @@ static void WriteVRAMImpl(u32 x, u32 y, u32 width, u32 height, const void* data,
   const u16 mask_and = check_mask ? 0x8000u : 0x0000u;
   const u16 mask_or = set_mask ? 0x8000u : 0x0000u;
 
+#ifdef USE_VECTOR
+  constexpr u32 write_pixels_per_vec = sizeof(GSVectorNi) / sizeof(u16);
+  const u32 aligned_width = Common::AlignDownPow2(std::min(width, VRAM_WIDTH - x), write_pixels_per_vec);
+  const GSVectorNi mask_or_vec = GSVectorNi::cxpr16(mask_or);
+  const GSVectorNi mask_and_vec = GSVectorNi::cxpr16(mask_and);
+#endif
+
   for (u32 row = 0; row < height;)
   {
     u16* dst_row_ptr = &g_vram[((y + row++) % VRAM_HEIGHT) * VRAM_WIDTH];
-    for (u32 col = 0; col < width;)
+
+    u32 col = 0;
+
+#ifdef USE_VECTOR
+    // This doesn't do wraparound.
+    if (mask_and != 0)
+    {
+      for (; col < aligned_width; col += write_pixels_per_vec)
+      {
+        const GSVectorNi src = GSVectorNi::load(src_ptr);
+        src_ptr += write_pixels_per_vec;
+
+        GSVectorNi dst = GSVectorNi::load(&dst_row_ptr[x + col]);
+
+        const GSVectorNi mask = (dst & mask_and_vec).sra16<15>();
+        dst = (dst & mask) | src.andnot(mask) | mask_or_vec;
+
+        GSVectorNi::store(&dst_row_ptr[x + col], dst);
+      }
+    }
+    else
+    {
+      for (; col < aligned_width; col += write_pixels_per_vec)
+      {
+        const GSVectorNi src = GSVectorNi::load(src_ptr);
+        src_ptr += write_pixels_per_vec;
+
+        GSVectorNi::store(&dst_row_ptr[x + col], src | mask_or_vec);
+      }
+    }
+#endif
+
+    for (; col < width;)
     {
       // TODO: Handle unaligned reads...
       u16* pixel_ptr = &dst_row_ptr[(x + col++) % VRAM_WIDTH];
@@ -1678,8 +1715,6 @@ static void WriteVRAMImpl(u32 x, u32 y, u32 width, u32 height, const void* data,
 static void CopyVRAMImpl(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height, bool set_mask,
                          bool check_mask)
 {
-  // TODO: Vector implementation.
-
   // Break up oversized copies. This behavior has not been verified on console.
   if ((src_x + width) > VRAM_WIDTH || (dst_x + width) > VRAM_WIDTH)
   {
@@ -1698,8 +1733,8 @@ static void CopyVRAMImpl(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width,
       {
         const u32 columns_to_copy =
           std::min(remaining_columns, std::min(VRAM_WIDTH - current_src_x, VRAM_WIDTH - current_dst_x));
-        CopyVRAM(current_src_x, current_src_y, current_dst_x, current_dst_y, columns_to_copy, rows_to_copy, set_mask,
-                 check_mask);
+        CopyVRAMImpl(current_src_x, current_src_y, current_dst_x, current_dst_y, columns_to_copy, rows_to_copy,
+                     set_mask, check_mask);
         current_src_x = (current_src_x + columns_to_copy) % VRAM_WIDTH;
         current_dst_x = (current_dst_x + columns_to_copy) % VRAM_WIDTH;
         remaining_columns -= columns_to_copy;
@@ -1735,12 +1770,47 @@ static void CopyVRAMImpl(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width,
   }
   else
   {
+#ifdef USE_VECTOR
+    constexpr u32 copy_pixels_per_vec = sizeof(GSVectorNi) / sizeof(u16);
+    const u32 aligned_width = Common::AlignDownPow2(
+      std::min(width, std::min(VRAM_WIDTH - src_x, VRAM_WIDTH - dst_x)), copy_pixels_per_vec);
+    const GSVectorNi mask_or_vec = GSVectorNi::cxpr16(mask_or);
+    const GSVectorNi mask_and_vec = GSVectorNi::cxpr16(mask_and);
+#endif
+
     for (u32 row = 0; row < height; row++)
     {
       const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
       u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
 
-      for (u32 col = 0; col < width; col++)
+      u32 col = 0;
+
+#ifdef USE_VECTOR
+      // This doesn't do wraparound.
+      if (mask_and != 0)
+      {
+        for (; col < aligned_width; col += copy_pixels_per_vec)
+        {
+          const GSVectorNi src = GSVectorNi::load(&src_row_ptr[src_x + col]);
+          GSVectorNi dst = GSVectorNi::load(&dst_row_ptr[dst_x + col]);
+
+          const GSVectorNi mask = (dst & mask_and_vec).sra16<15>();
+          dst = (dst & mask) | src.andnot(mask) | mask_or_vec;
+
+          GSVectorNi::store(&dst_row_ptr[dst_x + col], dst);
+        }
+      }
+      else
+      {
+        for (; col < aligned_width; col += copy_pixels_per_vec)
+        {
+          const GSVectorNi src = GSVectorNi::load(&src_row_ptr[src_x + col]);
+          GSVectorNi::store(&dst_row_ptr[dst_x + col], src | mask_or_vec);
+        }
+      }
+#endif
+
+      for (; col < width; col++)
      {
         const u16 src_pixel = src_row_ptr[(src_x + col) % VRAM_WIDTH];
         u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + col) % VRAM_WIDTH];
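
Note (not part of the patch): both vector paths use the same branch-free
mask-test blend. In the check_mask case, (dst & mask_and_vec).sra16<15>()
isolates bit 15 of each destination pixel and arithmetic-shifts it across the
whole 16-bit lane, so a protected pixel (bit 15 set) becomes an all-ones
"keep" mask and a writable pixel becomes all-zeros; the blend
(dst & mask) | src.andnot(mask) | mask_or_vec then selects dst or src per
lane without branching. ORing mask_or_vec into protected lanes is harmless,
since a protected pixel already has bit 15 set, so this matches the scalar
path. Below is a minimal standalone sketch of the same blend using raw SSE2
intrinsics in place of the GSVectorNi wrapper; the function name and
signature are illustrative only, not from the patch:

  #include <emmintrin.h> // SSE2

  // Blend eight u16 pixels: keep dst where its mask bit (bit 15) is set and
  // mask checking is enabled, take src elsewhere, then OR in the set-mask bit.
  static inline __m128i masked_vram_write8(__m128i src, __m128i dst,
                                           __m128i mask_and, __m128i mask_or)
  {
    // Isolate the mask bit and sign-extend it across the lane, like
    // sra16<15>(): 0x8000 -> 0xFFFF (protected), 0x0000 -> 0x0000 (writable).
    const __m128i keep = _mm_srai_epi16(_mm_and_si128(dst, mask_and), 15);
    // Per-lane select: (dst & keep) | (src & ~keep); _mm_andnot_si128(a, b)
    // computes ~a & b, matching GSVector's src.andnot(mask).
    const __m128i blended =
      _mm_or_si128(_mm_and_si128(dst, keep), _mm_andnot_si128(keep, src));
    return _mm_or_si128(blended, mask_or);
  }

The sign-extend trick avoids a per-lane compare instruction entirely: because
the mask bit is already the sign bit of the u16 lane, one AND plus one
arithmetic shift yields the full-width select mask.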