GPU: Move software fill/write/copy into rasterizer namespace
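In brief: the software VRAM fill, write, and copy loops that previously lived in GPU::FillVRAM/UpdateVRAM/CopyVRAM and in GPU_SW_Backend are moved into gpu_sw_rasterizer.inl, compiled once per ISA (scalar always, SIMD where available), and exposed from the GPU_SW_Rasterizer namespace as runtime-selected function pointers next to the existing draw-function tables. A condensed sketch of the resulting call surface, summarised from the header changes further down in this diff (not a verbatim copy of the file):

namespace GPU_SW_Rasterizer {
using FillVRAMFunction = void (*)(u32 x, u32 y, u32 width, u32 height, u32 color, bool interlaced, u8 active_line_lsb);
using WriteVRAMFunction = void (*)(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask);
using CopyVRAMFunction = void (*)(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height, bool set_mask, bool check_mask);

// Selected at runtime by SelectImplementation(), pointing at the Scalar:: or SIMD:: definitions.
extern FillVRAMFunction FillVRAM;
extern WriteVRAMFunction WriteVRAM;
extern CopyVRAMFunction CopyVRAM;
extern void SelectImplementation();
} // namespace GPU_SW_Rasterizer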

parent 495a0da8d4
commit c46ec398dc

src/core/gpu.cpp
@@ -4,6 +4,7 @@
 #include "gpu.h"
 #include "dma.h"
 #include "gpu_shadergen.h"
+#include "gpu_sw_rasterizer.h"
 #include "host.h"
 #include "interrupt_controller.h"
 #include "settings.h"
@@ -72,6 +73,7 @@ static void JoinScreenshotThreads();

 GPU::GPU()
 {
+  GPU_SW_Rasterizer::SelectImplementation();
   ResetStatistics();
 }

@@ -1529,195 +1531,6 @@ void GPU::ClearDisplay()
   DestroyDeinterlaceTextures();
 }

-void GPU::ReadVRAM(u32 x, u32 y, u32 width, u32 height)
-{
-}
-
-void GPU::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
-{
-  const u16 color16 = VRAMRGBA8888ToRGBA5551(color);
-  const GSVector4i fill = GSVector4i(color16, color16, color16, color16, color16, color16, color16, color16);
-  constexpr u32 vector_width = 8;
-  const u32 aligned_width = Common::AlignDownPow2(width, vector_width);
-
-  if ((x + width) <= VRAM_WIDTH && !IsInterlacedRenderingEnabled())
-  {
-    for (u32 yoffs = 0; yoffs < height; yoffs++)
-    {
-      const u32 row = (y + yoffs) % VRAM_HEIGHT;
-
-      u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
-      u32 xoffs = 0;
-      for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
-        GSVector4i::store<false>(row_ptr, fill);
-      for (; xoffs < width; xoffs++)
-        *(row_ptr++) = color16;
-    }
-  }
-  else if (IsInterlacedRenderingEnabled())
-  {
-    // Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field.
-    if (IsCRTCScanlinePending())
-      SynchronizeCRTC();
-
-    const u32 active_field = GetActiveLineLSB();
-    if ((x + width) <= VRAM_WIDTH)
-    {
-      for (u32 yoffs = 0; yoffs < height; yoffs++)
-      {
-        const u32 row = (y + yoffs) % VRAM_HEIGHT;
-        if ((row & u32(1)) == active_field)
-          continue;
-
-        u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
-        u32 xoffs = 0;
-        for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
-          GSVector4i::store<false>(row_ptr, fill);
-        for (; xoffs < width; xoffs++)
-          *(row_ptr++) = color16;
-      }
-    }
-    else
-    {
-      for (u32 yoffs = 0; yoffs < height; yoffs++)
-      {
-        const u32 row = (y + yoffs) % VRAM_HEIGHT;
-        if ((row & u32(1)) == active_field)
-          continue;
-
-        u16* row_ptr = &g_vram[row * VRAM_WIDTH];
-        for (u32 xoffs = 0; xoffs < width; xoffs++)
-        {
-          const u32 col = (x + xoffs) % VRAM_WIDTH;
-          row_ptr[col] = color16;
-        }
-      }
-    }
-  }
-  else
-  {
-    for (u32 yoffs = 0; yoffs < height; yoffs++)
-    {
-      const u32 row = (y + yoffs) % VRAM_HEIGHT;
-      u16* row_ptr = &g_vram[row * VRAM_WIDTH];
-      for (u32 xoffs = 0; xoffs < width; xoffs++)
-      {
-        const u32 col = (x + xoffs) % VRAM_WIDTH;
-        row_ptr[col] = color16;
-      }
-    }
-  }
-}
-
-void GPU::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask)
-{
-  // Fast path when the copy is not oversized.
-  if ((x + width) <= VRAM_WIDTH && (y + height) <= VRAM_HEIGHT && !set_mask && !check_mask)
-  {
-    const u16* src_ptr = static_cast<const u16*>(data);
-    u16* dst_ptr = &g_vram[y * VRAM_WIDTH + x];
-    for (u32 yoffs = 0; yoffs < height; yoffs++)
-    {
-      std::copy_n(src_ptr, width, dst_ptr);
-      src_ptr += width;
-      dst_ptr += VRAM_WIDTH;
-    }
-  }
-  else
-  {
-    // Slow path when we need to handle wrap-around.
-    // During transfer/render operations, if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or }
-    const u16* src_ptr = static_cast<const u16*>(data);
-    const u16 mask_and = check_mask ? 0x8000 : 0;
-    const u16 mask_or = set_mask ? 0x8000 : 0;
-
-    for (u32 row = 0; row < height;)
-    {
-      u16* dst_row_ptr = &g_vram[((y + row++) % VRAM_HEIGHT) * VRAM_WIDTH];
-      for (u32 col = 0; col < width;)
-      {
-        // TODO: Handle unaligned reads...
-        u16* pixel_ptr = &dst_row_ptr[(x + col++) % VRAM_WIDTH];
-        if (((*pixel_ptr) & mask_and) == 0)
-          *pixel_ptr = *(src_ptr++) | mask_or;
-      }
-    }
-  }
-}
-
-void GPU::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height)
-{
-  // Break up oversized copies. This behavior has not been verified on console.
-  if ((src_x + width) > VRAM_WIDTH || (dst_x + width) > VRAM_WIDTH)
-  {
-    u32 remaining_rows = height;
-    u32 current_src_y = src_y;
-    u32 current_dst_y = dst_y;
-    while (remaining_rows > 0)
-    {
-      const u32 rows_to_copy =
-        std::min<u32>(remaining_rows, std::min<u32>(VRAM_HEIGHT - current_src_y, VRAM_HEIGHT - current_dst_y));
-
-      u32 remaining_columns = width;
-      u32 current_src_x = src_x;
-      u32 current_dst_x = dst_x;
-      while (remaining_columns > 0)
-      {
-        const u32 columns_to_copy =
-          std::min<u32>(remaining_columns, std::min<u32>(VRAM_WIDTH - current_src_x, VRAM_WIDTH - current_dst_x));
-        CopyVRAM(current_src_x, current_src_y, current_dst_x, current_dst_y, columns_to_copy, rows_to_copy);
-        current_src_x = (current_src_x + columns_to_copy) % VRAM_WIDTH;
-        current_dst_x = (current_dst_x + columns_to_copy) % VRAM_WIDTH;
-        remaining_columns -= columns_to_copy;
-      }
-
-      current_src_y = (current_src_y + rows_to_copy) % VRAM_HEIGHT;
-      current_dst_y = (current_dst_y + rows_to_copy) % VRAM_HEIGHT;
-      remaining_rows -= rows_to_copy;
-    }
-
-    return;
-  }
-
-  // This doesn't have a fast path, but do we really need one? It's not common.
-  const u16 mask_and = m_GPUSTAT.GetMaskAND();
-  const u16 mask_or = m_GPUSTAT.GetMaskOR();
-
-  // Copy in reverse when src_x < dst_x, this is verified on console.
-  if (src_x < dst_x || ((src_x + width - 1) % VRAM_WIDTH) < ((dst_x + width - 1) % VRAM_WIDTH))
-  {
-    for (u32 row = 0; row < height; row++)
-    {
-      const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-      u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-
-      for (s32 col = static_cast<s32>(width - 1); col >= 0; col--)
-      {
-        const u16 src_pixel = src_row_ptr[(src_x + static_cast<u32>(col)) % VRAM_WIDTH];
-        u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + static_cast<u32>(col)) % VRAM_WIDTH];
-        if ((*dst_pixel_ptr & mask_and) == 0)
-          *dst_pixel_ptr = src_pixel | mask_or;
-      }
-    }
-  }
-  else
-  {
-    for (u32 row = 0; row < height; row++)
-    {
-      const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-      u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-
-      for (u32 col = 0; col < width; col++)
-      {
-        const u16 src_pixel = src_row_ptr[(src_x + col) % VRAM_WIDTH];
-        u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + col) % VRAM_WIDTH];
-        if ((*dst_pixel_ptr & mask_and) == 0)
-          *dst_pixel_ptr = src_pixel | mask_or;
-      }
-    }
-  }
-}
-
 void GPU::SetClampedDrawingArea()
 {
   if (m_drawing_area.left > m_drawing_area.right || m_drawing_area.top > m_drawing_area.bottom) [[unlikely]]
@@ -322,10 +322,10 @@ protected:
   bool IsCLUTValid() const;

   // Rendering in the backend
-  virtual void ReadVRAM(u32 x, u32 y, u32 width, u32 height);
-  virtual void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color);
-  virtual void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask);
-  virtual void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height);
+  virtual void ReadVRAM(u32 x, u32 y, u32 width, u32 height) = 0;
+  virtual void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color) = 0;
+  virtual void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask) = 0;
+  virtual void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) = 0;
   virtual void DispatchRenderCommand() = 0;
   virtual void UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit) = 0;
   virtual void UpdateDisplay() = 0;
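With the base-class fallbacks deleted above, every GPU implementation must now provide these VRAM operations itself. A declaration-level sketch of what a derived renderer looks like after this change (the class name and layout here are illustrative and it will not compile on its own without the project's headers):

class ExampleRenderer final : public GPU
{
protected:
  // All four VRAM operations are now pure virtual in GPU, so overrides are mandatory.
  void ReadVRAM(u32 x, u32 y, u32 width, u32 height) override;
  void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color) override;
  void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask) override;
  void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) override;
  // ...plus the existing pure virtuals (DispatchRenderCommand, UpdateCLUT, UpdateDisplay, ...).
};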
@@ -416,6 +416,8 @@ protected:

   union GPUSTAT
   {
+    // During transfer/render operations, if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or }
+
     u32 bits;
     BitField<u32, u8, 0, 4> texture_page_x_base;
     BitField<u32, u8, 4, 1> texture_page_y_base;
@@ -459,18 +461,6 @@ protected:
       static constexpr u32 ACTIVE = (1 << 19) | (1 << 22);
       return ((bits & ACTIVE) == ACTIVE);
     }
-
-    // During transfer/render operations, if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or }
-    ALWAYS_INLINE u16 GetMaskAND() const
-    {
-      // return check_mask_before_draw ? 0x8000 : 0x0000;
-      return Truncate16((bits << 3) & 0x8000);
-    }
-    ALWAYS_INLINE u16 GetMaskOR() const
-    {
-      // return set_mask_while_drawing ? 0x8000 : 0x0000;
-      return Truncate16((bits << 4) & 0x8000);
-    }
   } m_GPUSTAT = {};

   struct DrawMode
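The deleted helpers leaned on GPUSTAT's bit layout rather than reading the BitFields: shifting the status word left by 3 or 4 moves the check-mask and set-mask flags into bit 15, which is exactly the mask bit of a 15-bit VRAM pixel, so the commented-out ternaries and the shifts are equivalent. A small self-contained illustration of the same trick, using plain integer types instead of the emulator's aliases:

#include <cassert>
#include <cstdint>

int main()
{
  // GPUSTAT bit 11 = set_mask_while_drawing, bit 12 = check_mask_before_draw.
  const uint32_t gpustat = (1u << 11) | (1u << 12);

  // (bits << 3) moves bit 12 into bit 15; (bits << 4) moves bit 11 into bit 15.
  const uint16_t mask_and = static_cast<uint16_t>((gpustat << 3) & 0x8000);
  const uint16_t mask_or = static_cast<uint16_t>((gpustat << 4) & 0x8000);

  // Same results as: check_mask_before_draw ? 0x8000 : 0 and set_mask_while_drawing ? 0x8000 : 0.
  assert(mask_and == 0x8000 && mask_or == 0x8000);
  return 0;
}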
@@ -6,6 +6,7 @@
 #include "cpu_pgxp.h"
 #include "gpu_hw_shadergen.h"
 #include "gpu_sw_backend.h"
+#include "gpu_sw_rasterizer.h"
 #include "host.h"
 #include "settings.h"
 #include "system.h"
@@ -16,8 +16,6 @@ GPU_SW_Backend::~GPU_SW_Backend() = default;

 bool GPU_SW_Backend::Initialize(bool force_thread)
 {
-  GPU_SW_Rasterizer::SelectImplementation();
-
   return GPUBackend::Initialize(force_thread);
 }

@@ -59,186 +57,20 @@ void GPU_SW_Backend::DrawLine(const GPUBackendDrawLineCommand* cmd)

 void GPU_SW_Backend::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params)
 {
-  const u16 color16 = VRAMRGBA8888ToRGBA5551(color);
-  const GSVector4i fill = GSVector4i(color16, color16, color16, color16, color16, color16, color16, color16);
-  constexpr u32 vector_width = 8;
-  const u32 aligned_width = Common::AlignDownPow2(width, vector_width);
-
-  if ((x + width) <= VRAM_WIDTH && !params.interlaced_rendering)
-  {
-    for (u32 yoffs = 0; yoffs < height; yoffs++)
-    {
-      const u32 row = (y + yoffs) % VRAM_HEIGHT;
-
-      u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
-      u32 xoffs = 0;
-      for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
-        GSVector4i::store<false>(row_ptr, fill);
-      for (; xoffs < width; xoffs++)
-        *(row_ptr++) = color16;
-    }
-  }
-  else if (params.interlaced_rendering)
-  {
-    // Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field.
-    const u32 active_field = params.active_line_lsb;
-
-    if ((x + width) <= VRAM_WIDTH)
-    {
-      for (u32 yoffs = 0; yoffs < height; yoffs++)
-      {
-        const u32 row = (y + yoffs) % VRAM_HEIGHT;
-        if ((row & u32(1)) == active_field)
-          continue;
-
-        u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
-        u32 xoffs = 0;
-        for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
-          GSVector4i::store<false>(row_ptr, fill);
-        for (; xoffs < width; xoffs++)
-          *(row_ptr++) = color16;
-      }
-    }
-    else
-    {
-      for (u32 yoffs = 0; yoffs < height; yoffs++)
-      {
-        const u32 row = (y + yoffs) % VRAM_HEIGHT;
-        if ((row & u32(1)) == active_field)
-          continue;
-
-        u16* row_ptr = &g_vram[row * VRAM_WIDTH];
-        for (u32 xoffs = 0; xoffs < width; xoffs++)
-        {
-          const u32 col = (x + xoffs) % VRAM_WIDTH;
-          row_ptr[col] = color16;
-        }
-      }
-    }
-  }
-  else
-  {
-    for (u32 yoffs = 0; yoffs < height; yoffs++)
-    {
-      const u32 row = (y + yoffs) % VRAM_HEIGHT;
-      u16* row_ptr = &g_vram[row * VRAM_WIDTH];
-      for (u32 xoffs = 0; xoffs < width; xoffs++)
-      {
-        const u32 col = (x + xoffs) % VRAM_WIDTH;
-        row_ptr[col] = color16;
-      }
-    }
-  }
+  GPU_SW_Rasterizer::FillVRAM(x, y, width, height, color, params.interlaced_rendering, params.active_line_lsb);
 }

 void GPU_SW_Backend::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data,
                                 GPUBackendCommandParameters params)
 {
-  // Fast path when the copy is not oversized.
-  if ((x + width) <= VRAM_WIDTH && (y + height) <= VRAM_HEIGHT && !params.IsMaskingEnabled())
-  {
-    const u16* src_ptr = static_cast<const u16*>(data);
-    u16* dst_ptr = &g_vram[y * VRAM_WIDTH + x];
-    for (u32 yoffs = 0; yoffs < height; yoffs++)
-    {
-      std::copy_n(src_ptr, width, dst_ptr);
-      src_ptr += width;
-      dst_ptr += VRAM_WIDTH;
-    }
-  }
-  else
-  {
-    // Slow path when we need to handle wrap-around.
-    const u16* src_ptr = static_cast<const u16*>(data);
-    const u16 mask_and = params.GetMaskAND();
-    const u16 mask_or = params.GetMaskOR();
-
-    for (u32 row = 0; row < height;)
-    {
-      u16* dst_row_ptr = &g_vram[((y + row++) % VRAM_HEIGHT) * VRAM_WIDTH];
-      for (u32 col = 0; col < width;)
-      {
-        // TODO: Handle unaligned reads...
-        u16* pixel_ptr = &dst_row_ptr[(x + col++) % VRAM_WIDTH];
-        if (((*pixel_ptr) & mask_and) == 0)
-          *pixel_ptr = *(src_ptr++) | mask_or;
-      }
-    }
-  }
+  GPU_SW_Rasterizer::WriteVRAM(x, y, width, height, data, params.set_mask_while_drawing, params.check_mask_before_draw);
 }

 void GPU_SW_Backend::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height,
                               GPUBackendCommandParameters params)
 {
-  // Break up oversized copies. This behavior has not been verified on console.
-  if ((src_x + width) > VRAM_WIDTH || (dst_x + width) > VRAM_WIDTH)
-  {
-    u32 remaining_rows = height;
-    u32 current_src_y = src_y;
-    u32 current_dst_y = dst_y;
-    while (remaining_rows > 0)
-    {
-      const u32 rows_to_copy =
-        std::min<u32>(remaining_rows, std::min<u32>(VRAM_HEIGHT - current_src_y, VRAM_HEIGHT - current_dst_y));
-
-      u32 remaining_columns = width;
-      u32 current_src_x = src_x;
-      u32 current_dst_x = dst_x;
-      while (remaining_columns > 0)
-      {
-        const u32 columns_to_copy =
-          std::min<u32>(remaining_columns, std::min<u32>(VRAM_WIDTH - current_src_x, VRAM_WIDTH - current_dst_x));
-        CopyVRAM(current_src_x, current_src_y, current_dst_x, current_dst_y, columns_to_copy, rows_to_copy, params);
-        current_src_x = (current_src_x + columns_to_copy) % VRAM_WIDTH;
-        current_dst_x = (current_dst_x + columns_to_copy) % VRAM_WIDTH;
-        remaining_columns -= columns_to_copy;
-      }
-
-      current_src_y = (current_src_y + rows_to_copy) % VRAM_HEIGHT;
-      current_dst_y = (current_dst_y + rows_to_copy) % VRAM_HEIGHT;
-      remaining_rows -= rows_to_copy;
-    }
-
-    return;
-  }
-
-  // This doesn't have a fast path, but do we really need one? It's not common.
-  const u16 mask_and = params.GetMaskAND();
-  const u16 mask_or = params.GetMaskOR();
-
-  // Copy in reverse when src_x < dst_x, this is verified on console.
-  if (src_x < dst_x || ((src_x + width - 1) % VRAM_WIDTH) < ((dst_x + width - 1) % VRAM_WIDTH))
-  {
-    for (u32 row = 0; row < height; row++)
-    {
-      const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-      u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-
-      for (s32 col = static_cast<s32>(width - 1); col >= 0; col--)
-      {
-        const u16 src_pixel = src_row_ptr[(src_x + static_cast<u32>(col)) % VRAM_WIDTH];
-        u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + static_cast<u32>(col)) % VRAM_WIDTH];
-        if ((*dst_pixel_ptr & mask_and) == 0)
-          *dst_pixel_ptr = src_pixel | mask_or;
-      }
-    }
-  }
-  else
-  {
-    for (u32 row = 0; row < height; row++)
-    {
-      const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-      u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-
-      for (u32 col = 0; col < width; col++)
-      {
-        const u16 src_pixel = src_row_ptr[(src_x + col) % VRAM_WIDTH];
-        u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + col) % VRAM_WIDTH];
-        if ((*dst_pixel_ptr & mask_and) == 0)
-          *dst_pixel_ptr = src_pixel | mask_or;
-      }
-    }
-  }
+  GPU_SW_Rasterizer::CopyVRAM(src_x, src_y, dst_x, dst_y, width, height, params.set_mask_while_drawing,
+                              params.check_mask_before_draw);
 }

 void GPU_SW_Backend::UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit)
@@ -12,11 +12,6 @@
 LOG_CHANNEL(GPU_SW_Rasterizer);

 namespace GPU_SW_Rasterizer {
-// Default implementation, compatible with all ISAs.
-extern const DrawRectangleFunctionTable DrawRectangleFunctions;
-extern const DrawTriangleFunctionTable DrawTriangleFunctions;
-extern const DrawLineFunctionTable DrawLineFunctions;
-
 constinit const DitherLUT g_dither_lut = []() constexpr {
   DitherLUT lut = {};
   for (u32 i = 0; i < DITHER_MATRIX_SIZE; i++)
@@ -33,30 +28,33 @@ constinit const DitherLUT g_dither_lut = []() constexpr {
   return lut;
 }();

+const DrawRectangleFunctionTable* DrawRectangleFunctions = nullptr;
+const DrawTriangleFunctionTable* DrawTriangleFunctions = nullptr;
+const DrawLineFunctionTable* DrawLineFunctions = nullptr;
+FillVRAMFunction FillVRAM = nullptr;
+WriteVRAMFunction WriteVRAM = nullptr;
+CopyVRAMFunction CopyVRAM = nullptr;
 GPUDrawingArea g_drawing_area = {};
 } // namespace GPU_SW_Rasterizer

-// Default implementation definitions.
-namespace GPU_SW_Rasterizer {
+// Default scalar implementation definitions.
+namespace GPU_SW_Rasterizer::Scalar {
+namespace {
 #include "gpu_sw_rasterizer.inl"
 }
+} // namespace GPU_SW_Rasterizer::Scalar

 // Default vector implementation definitions.
 #if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
 namespace GPU_SW_Rasterizer::SIMD {
+namespace {
 #define USE_VECTOR 1
 #include "gpu_sw_rasterizer.inl"
 #undef USE_VECTOR
+} // namespace
 } // namespace GPU_SW_Rasterizer::SIMD
 #endif

-// Initialize with default implementation.
-namespace GPU_SW_Rasterizer {
-const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions = &DrawRectangleFunctions;
-const DrawTriangleFunctionTable* SelectedDrawTriangleFunctions = &DrawTriangleFunctions;
-const DrawLineFunctionTable* SelectedDrawLineFunctions = &DrawLineFunctions;
-} // namespace GPU_SW_Rasterizer
-
 // Declare alternative implementations.
 void GPU_SW_Rasterizer::SelectImplementation()
 {
@@ -66,13 +64,16 @@ void GPU_SW_Rasterizer::SelectImplementation()

   selected = true;

-#define SELECT_ALTERNATIVE_RASTERIZER(isa) \
+#define SELECT_IMPLEMENTATION(isa) \
   do \
   { \
     INFO_LOG("Using " #isa " software rasterizer implementation."); \
-    SelectedDrawRectangleFunctions = &isa::DrawRectangleFunctions; \
-    SelectedDrawTriangleFunctions = &isa::DrawTriangleFunctions; \
-    SelectedDrawLineFunctions = &isa::DrawLineFunctions; \
+    DrawRectangleFunctions = &isa::DrawRectangleFunctions; \
+    DrawTriangleFunctions = &isa::DrawTriangleFunctions; \
+    DrawLineFunctions = &isa::DrawLineFunctions; \
+    FillVRAM = &isa::FillVRAMImpl; \
+    WriteVRAM = &isa::WriteVRAMImpl; \
+    CopyVRAM = &isa::CopyVRAMImpl; \
   } while (0)

 #if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
|
@ -83,19 +84,20 @@ void GPU_SW_Rasterizer::SelectImplementation()
|
||||||
#if defined(CPU_ARCH_SSE) && defined(_MSC_VER) && 0
|
#if defined(CPU_ARCH_SSE) && defined(_MSC_VER) && 0
|
||||||
if (cpuinfo_has_x86_avx2() && (!use_isa || StringUtil::Strcasecmp(use_isa, "AVX2") == 0))
|
if (cpuinfo_has_x86_avx2() && (!use_isa || StringUtil::Strcasecmp(use_isa, "AVX2") == 0))
|
||||||
{
|
{
|
||||||
SELECT_ALTERNATIVE_RASTERIZER(AVX2);
|
SELECT_IMPLEMENTATION(AVX2);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (!use_isa || StringUtil::Strcasecmp(use_isa, "SIMD") == 0)
|
if (!use_isa || StringUtil::Strcasecmp(use_isa, "SIMD") == 0)
|
||||||
{
|
{
|
||||||
SELECT_ALTERNATIVE_RASTERIZER(SIMD);
|
SELECT_IMPLEMENTATION(SIMD);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
INFO_LOG("Using scalar software rasterizer implementation.");
|
INFO_LOG("Using scalar software rasterizer implementation.");
|
||||||
|
SELECT_IMPLEMENTATION(Scalar);
|
||||||
|
|
||||||
#undef SELECT_ALTERNATIVE_RASTERIZER
|
#undef SELECT_IMPLEMENTATION
|
||||||
}
|
}
|
||||||
|
|
|
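The selection logic follows the same pattern as the existing draw-function tables: the shared gpu_sw_rasterizer.inl is compiled once per ISA namespace (Scalar always, SIMD when SSE/NEON are available, AVX2 currently compiled out), and SelectImplementation() repoints the namespace-level function pointers at whichever set was chosen. A stripped-down, generic sketch of that pattern (names here are illustrative, not the project's):

#include <cstdio>

namespace Rasterizer {
using FillFunction = void (*)(int value);
FillFunction Fill = nullptr; // runtime-selected entry point

namespace Scalar { void FillImpl(int value) { std::printf("scalar fill %d\n", value); } }
namespace SIMD   { void FillImpl(int value) { std::printf("simd fill %d\n", value); } }

void SelectImplementation(bool have_simd)
{
  // Mirrors SELECT_IMPLEMENTATION(isa): point the shared entry at one ISA's definitions.
  Fill = have_simd ? &SIMD::FillImpl : &Scalar::FillImpl;
}
} // namespace Rasterizer

int main()
{
  Rasterizer::SelectImplementation(true);
  Rasterizer::Fill(42); // dispatches through the selected pointer
  return 0;
}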
@@ -34,34 +34,38 @@ using DrawLineFunction = void (*)(const GPUBackendDrawLineCommand* cmd, const GP
                                   const GPUBackendDrawLineCommand::Vertex* p1);
 typedef const DrawLineFunction DrawLineFunctionTable[2][2];

-// Default implementation, compatible with all ISAs.
-extern const DrawRectangleFunctionTable DrawRectangleFunctions;
-extern const DrawTriangleFunctionTable DrawTriangleFunctions;
-extern const DrawLineFunctionTable DrawLineFunctions;
+using FillVRAMFunction = void (*)(u32 x, u32 y, u32 width, u32 height, u32 color, bool interlaced, u8 active_line_lsb);
+using WriteVRAMFunction = void (*)(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask,
+                                   bool check_mask);
+using CopyVRAMFunction = void (*)(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height, bool set_mask,
+                                  bool check_mask);

 // Current implementation, selected at runtime.
-extern const DrawRectangleFunctionTable* SelectedDrawRectangleFunctions;
-extern const DrawTriangleFunctionTable* SelectedDrawTriangleFunctions;
-extern const DrawLineFunctionTable* SelectedDrawLineFunctions;
+extern const DrawRectangleFunctionTable* DrawRectangleFunctions;
+extern const DrawTriangleFunctionTable* DrawTriangleFunctions;
+extern const DrawLineFunctionTable* DrawLineFunctions;
+extern FillVRAMFunction FillVRAM;
+extern WriteVRAMFunction WriteVRAM;
+extern CopyVRAMFunction CopyVRAM;

 extern void SelectImplementation();

 ALWAYS_INLINE static DrawLineFunction GetDrawLineFunction(bool shading_enable, bool transparency_enable)
 {
-  return (*SelectedDrawLineFunctions)[u8(shading_enable)][u8(transparency_enable)];
+  return (*DrawLineFunctions)[u8(shading_enable)][u8(transparency_enable)];
 }

 ALWAYS_INLINE static DrawRectangleFunction GetDrawRectangleFunction(bool texture_enable, bool raw_texture_enable,
                                                                     bool transparency_enable)
 {
-  return (*SelectedDrawRectangleFunctions)[u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)];
+  return (*DrawRectangleFunctions)[u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)];
 }

 ALWAYS_INLINE static DrawTriangleFunction GetDrawTriangleFunction(bool shading_enable, bool texture_enable,
                                                                   bool raw_texture_enable, bool transparency_enable)
 {
-  return (*SelectedDrawTriangleFunctions)[u8(shading_enable)][u8(texture_enable)][u8(raw_texture_enable)]
-                                         [u8(transparency_enable)];
+  return (
+    *DrawTriangleFunctions)[u8(shading_enable)][u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)];
 }

 #define DECLARE_ALTERNATIVE_RASTERIZER(isa) \
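With these declarations, backends no longer carry their own loop bodies: they call straight through the selected pointers, as the GPU_SW_Backend hunk earlier in this diff shows. A minimal call-shape sketch (it assumes gpu_sw_rasterizer.h is included and SelectImplementation() has already run; the helper name and coordinates are illustrative, not part of the diff):

// Clear a 64x64 block at (0,0) to black, non-interlaced, then copy it elsewhere
// without touching mask bits. Both calls go through the runtime-selected pointers.
static void ExampleClearAndCopy()
{
  GPU_SW_Rasterizer::FillVRAM(0, 0, 64, 64, 0x00000000u, /*interlaced=*/false, /*active_line_lsb=*/0);
  GPU_SW_Rasterizer::CopyVRAM(0, 0, 128, 128, 64, 64, /*set_mask=*/false, /*check_mask=*/false);
}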
@@ -1519,6 +1519,237 @@ constinit const DrawTriangleFunctionTable DrawTriangleFunctions = {
   {{&DrawTriangle<true, true, false, false>, &DrawTriangle<true, true, false, true>},
    {&DrawTriangle<true, true, true, false>, &DrawTriangle<true, true, true, true>}}}};

+static void FillVRAMImpl(u32 x, u32 y, u32 width, u32 height, u32 color, bool interlaced, u8 active_line_lsb)
+{
+#ifdef USE_VECTOR
+  const u16 color16 = VRAMRGBA8888ToRGBA5551(color);
+  const GSVector4i fill = GSVector4i(color16, color16, color16, color16, color16, color16, color16, color16);
+  constexpr u32 vector_width = 8;
+  const u32 aligned_width = Common::AlignDownPow2(width, vector_width);
+
+  if ((x + width) <= VRAM_WIDTH && !interlaced)
+  {
+    for (u32 yoffs = 0; yoffs < height; yoffs++)
+    {
+      const u32 row = (y + yoffs) % VRAM_HEIGHT;
+
+      u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
+      u32 xoffs = 0;
+      for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
+        GSVector4i::store<false>(row_ptr, fill);
+      for (; xoffs < width; xoffs++)
+        *(row_ptr++) = color16;
+    }
+  }
+  else if (interlaced)
+  {
+    // Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field.
+    const u32 active_field = active_line_lsb;
+
+    if ((x + width) <= VRAM_WIDTH)
+    {
+      for (u32 yoffs = 0; yoffs < height; yoffs++)
+      {
+        const u32 row = (y + yoffs) % VRAM_HEIGHT;
+        if ((row & u32(1)) == active_field)
+          continue;
+
+        u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
+        u32 xoffs = 0;
+        for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
+          GSVector4i::store<false>(row_ptr, fill);
+        for (; xoffs < width; xoffs++)
+          *(row_ptr++) = color16;
+      }
+    }
+    else
+    {
+      for (u32 yoffs = 0; yoffs < height; yoffs++)
+      {
+        const u32 row = (y + yoffs) % VRAM_HEIGHT;
+        if ((row & u32(1)) == active_field)
+          continue;
+
+        u16* row_ptr = &g_vram[row * VRAM_WIDTH];
+        for (u32 xoffs = 0; xoffs < width; xoffs++)
+        {
+          const u32 col = (x + xoffs) % VRAM_WIDTH;
+          row_ptr[col] = color16;
+        }
+      }
+    }
+  }
+  else
+  {
+    for (u32 yoffs = 0; yoffs < height; yoffs++)
+    {
+      const u32 row = (y + yoffs) % VRAM_HEIGHT;
+      u16* row_ptr = &g_vram[row * VRAM_WIDTH];
+      for (u32 xoffs = 0; xoffs < width; xoffs++)
+      {
+        const u32 col = (x + xoffs) % VRAM_WIDTH;
+        row_ptr[col] = color16;
+      }
+    }
+  }
+#else
+  const u16 color16 = VRAMRGBA8888ToRGBA5551(color);
+  if ((x + width) <= VRAM_WIDTH && !interlaced)
+  {
+    for (u32 yoffs = 0; yoffs < height; yoffs++)
+    {
+      const u32 row = (y + yoffs) % VRAM_HEIGHT;
+      std::fill_n(&g_vram[row * VRAM_WIDTH + x], width, color16);
+    }
+  }
+  else if (interlaced)
+  {
+    // Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field.
+    const u32 active_field = active_line_lsb;
+
+    for (u32 yoffs = 0; yoffs < height; yoffs++)
+    {
+      const u32 row = (y + yoffs) % VRAM_HEIGHT;
+      if ((row & u32(1)) == active_field)
+        continue;
+
+      u16* row_ptr = &g_vram[row * VRAM_WIDTH];
+      for (u32 xoffs = 0; xoffs < width; xoffs++)
+      {
+        const u32 col = (x + xoffs) % VRAM_WIDTH;
+        row_ptr[col] = color16;
+      }
+    }
+  }
+  else
+  {
+    for (u32 yoffs = 0; yoffs < height; yoffs++)
+    {
+      const u32 row = (y + yoffs) % VRAM_HEIGHT;
+      u16* row_ptr = &g_vram[row * VRAM_WIDTH];
+      for (u32 xoffs = 0; xoffs < width; xoffs++)
+      {
+        const u32 col = (x + xoffs) % VRAM_WIDTH;
+        row_ptr[col] = color16;
+      }
+    }
+  }
+#endif
+}
+
+static void WriteVRAMImpl(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask)
+{
+  // TODO: Vector implementation
+
+  // Fast path when the copy is not oversized.
+  if ((x + width) <= VRAM_WIDTH && (y + height) <= VRAM_HEIGHT && !set_mask && !check_mask)
+  {
+    const u16* src_ptr = static_cast<const u16*>(data);
+    u16* dst_ptr = &g_vram[y * VRAM_WIDTH + x];
+    for (u32 yoffs = 0; yoffs < height; yoffs++)
+    {
+      std::copy_n(src_ptr, width, dst_ptr);
+      src_ptr += width;
+      dst_ptr += VRAM_WIDTH;
+    }
+  }
+  else
+  {
+    // Slow path when we need to handle wrap-around.
+    // During transfer/render operations, if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or }
+    const u16* src_ptr = static_cast<const u16*>(data);
+    const u16 mask_and = check_mask ? 0x8000u : 0x0000u;
+    const u16 mask_or = set_mask ? 0x8000u : 0x0000u;
+
+    for (u32 row = 0; row < height;)
+    {
+      u16* dst_row_ptr = &g_vram[((y + row++) % VRAM_HEIGHT) * VRAM_WIDTH];
+      for (u32 col = 0; col < width;)
+      {
+        // TODO: Handle unaligned reads...
+        u16* pixel_ptr = &dst_row_ptr[(x + col++) % VRAM_WIDTH];
+        if (((*pixel_ptr) & mask_and) == 0)
+          *pixel_ptr = *(src_ptr++) | mask_or;
+      }
+    }
+  }
+}
+
+static void CopyVRAMImpl(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height, bool set_mask,
+                         bool check_mask)
+{
+  // TODO: Vector implementation.
+
+  // Break up oversized copies. This behavior has not been verified on console.
+  if ((src_x + width) > VRAM_WIDTH || (dst_x + width) > VRAM_WIDTH)
+  {
+    u32 remaining_rows = height;
+    u32 current_src_y = src_y;
+    u32 current_dst_y = dst_y;
+    while (remaining_rows > 0)
+    {
+      const u32 rows_to_copy =
+        std::min<u32>(remaining_rows, std::min<u32>(VRAM_HEIGHT - current_src_y, VRAM_HEIGHT - current_dst_y));
+
+      u32 remaining_columns = width;
+      u32 current_src_x = src_x;
+      u32 current_dst_x = dst_x;
+      while (remaining_columns > 0)
+      {
+        const u32 columns_to_copy =
+          std::min<u32>(remaining_columns, std::min<u32>(VRAM_WIDTH - current_src_x, VRAM_WIDTH - current_dst_x));
+        CopyVRAM(current_src_x, current_src_y, current_dst_x, current_dst_y, columns_to_copy, rows_to_copy, set_mask,
+                 check_mask);
+        current_src_x = (current_src_x + columns_to_copy) % VRAM_WIDTH;
+        current_dst_x = (current_dst_x + columns_to_copy) % VRAM_WIDTH;
+        remaining_columns -= columns_to_copy;
+      }
+
+      current_src_y = (current_src_y + rows_to_copy) % VRAM_HEIGHT;
+      current_dst_y = (current_dst_y + rows_to_copy) % VRAM_HEIGHT;
+      remaining_rows -= rows_to_copy;
+    }
+
+    return;
+  }
+
+  // This doesn't have a fast path, but do we really need one? It's not common.
+  const u16 mask_and = check_mask ? 0x8000u : 0x0000u;
+  const u16 mask_or = set_mask ? 0x8000u : 0x0000u;
+
+  // Copy in reverse when src_x < dst_x, this is verified on console.
+  if (src_x < dst_x || ((src_x + width - 1) % VRAM_WIDTH) < ((dst_x + width - 1) % VRAM_WIDTH))
+  {
+    for (u32 row = 0; row < height; row++)
+    {
+      const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
+      u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
+
+      for (s32 col = static_cast<s32>(width - 1); col >= 0; col--)
+      {
+        const u16 src_pixel = src_row_ptr[(src_x + static_cast<u32>(col)) % VRAM_WIDTH];
+        u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + static_cast<u32>(col)) % VRAM_WIDTH];
+        *dst_pixel_ptr = ((*dst_pixel_ptr & mask_and) == 0) ? (src_pixel | mask_or) : *dst_pixel_ptr;
+      }
+    }
+  }
+  else
+  {
+    for (u32 row = 0; row < height; row++)
+    {
+      const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
+      u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
+
+      for (u32 col = 0; col < width; col++)
+      {
+        const u16 src_pixel = src_row_ptr[(src_x + col) % VRAM_WIDTH];
+        u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + col) % VRAM_WIDTH];
+        *dst_pixel_ptr = ((*dst_pixel_ptr & mask_and) == 0) ? (src_pixel | mask_or) : *dst_pixel_ptr;
+      }
+    }
+  }
+}
+
 #ifdef __INTELLISENSE__
 }
 #endif
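Unlike the removed GPU_SW_Backend versions, CopyVRAMImpl writes the destination pixel unconditionally through a ternary instead of guarding the store with an if; both forms implement the same mask rule, and the branchless shape is presumably what the "TODO: Vector implementation" would map onto a select. A tiny self-contained check of the equivalence:

#include <cassert>
#include <cstdint>

static uint16_t masked_write_branch(uint16_t dst, uint16_t src, uint16_t mask_and, uint16_t mask_or)
{
  if ((dst & mask_and) == 0) // old form: store only when the destination's mask bit allows it
    dst = static_cast<uint16_t>(src | mask_or);
  return dst;
}

static uint16_t masked_write_select(uint16_t dst, uint16_t src, uint16_t mask_and, uint16_t mask_or)
{
  // new branchless form used by CopyVRAMImpl
  return ((dst & mask_and) == 0) ? static_cast<uint16_t>(src | mask_or) : dst;
}

int main()
{
  for (int check_mask = 0; check_mask < 2; check_mask++)
  {
    const uint16_t mask_and = check_mask ? 0x8000 : 0x0000;
    for (uint32_t dst = 0; dst <= 0xFFFF; dst += 0x1357)
      for (uint32_t src = 0; src <= 0xFFFF; src += 0x0F0F)
        assert(masked_write_branch(uint16_t(dst), uint16_t(src), mask_and, 0x8000) ==
               masked_write_select(uint16_t(dst), uint16_t(src), mask_and, 0x8000));
  }
  return 0;
}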
@@ -275,8 +275,6 @@ union GPUBackendCommandParameters
   BitField<u8, bool, 2, 1> set_mask_while_drawing;
   BitField<u8, bool, 3, 1> check_mask_before_draw;

-  ALWAYS_INLINE bool IsMaskingEnabled() const { return (bits & 12u) != 0u; }
-
   // During transfer/render operations, if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or }
   u16 GetMaskAND() const
   {