From 9e024b7a51cf1651ea6927cfd3862de2701f1a38 Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Sat, 11 Apr 2020 16:09:03 +1000 Subject: [PATCH] GPU: Handle VRAM wrap-around behavior on scanout --- src/core/gpu.cpp | 31 ++++----- src/core/gpu_hw_d3d11.cpp | 10 +-- src/core/gpu_hw_opengl.cpp | 11 +-- src/core/gpu_hw_shadergen.cpp | 20 +++--- src/core/gpu_sw.cpp | 127 +++++++++++++++++++++++++--------- src/core/gpu_sw.h | 6 +- 6 files changed, 131 insertions(+), 74 deletions(-) diff --git a/src/core/gpu.cpp b/src/core/gpu.cpp index 93e1fc03f..cb9273bb1 100644 --- a/src/core/gpu.cpp +++ b/src/core/gpu.cpp @@ -487,22 +487,19 @@ void GPU::UpdateCRTCDisplayParameters() if (horizontal_display_end <= horizontal_visible_end_tick) { - cs.display_vram_width = std::min( + cs.display_vram_width = std::max((((horizontal_display_end - std::max(horizontal_display_start, horizontal_visible_start_tick)) + (cs.dot_clock_divider - 1)) / cs.dot_clock_divider), - 1u), - VRAM_WIDTH - cs.display_vram_left); + 1u); } else { - cs.display_vram_width = std::min( - std::max( - (((horizontal_visible_end_tick - std::max(horizontal_display_start, horizontal_visible_start_tick)) + - (cs.dot_clock_divider - 1)) / - cs.dot_clock_divider), - 1u), - VRAM_WIDTH - cs.display_vram_left); + cs.display_vram_width = std::max( + (((horizontal_visible_end_tick - std::max(horizontal_display_start, horizontal_visible_start_tick)) + + (cs.dot_clock_divider - 1)) / + cs.dot_clock_divider), + 1u); } if (vertical_display_start >= vertical_visible_start_line) @@ -513,21 +510,19 @@ void GPU::UpdateCRTCDisplayParameters() else { cs.display_origin_top = 0; - cs.display_vram_top = std::min( - m_crtc_state.regs.Y + ((vertical_visible_start_line - vertical_display_start) << height_shift), VRAM_HEIGHT - 1); + cs.display_vram_top = + m_crtc_state.regs.Y + ((vertical_visible_start_line - vertical_display_start) << height_shift); } if (vertical_display_end <= vertical_visible_end_line) { - cs.display_vram_height = 
std::min( - (vertical_display_end - std::max(vertical_display_start, vertical_visible_start_line)) << height_shift, - VRAM_HEIGHT - cs.display_vram_top); + cs.display_vram_height = (vertical_display_end - std::max(vertical_display_start, vertical_visible_start_line)) + << height_shift; } else { - cs.display_vram_height = std::min( - (vertical_visible_end_line - std::max(vertical_display_start, vertical_visible_start_line)) << height_shift, - VRAM_HEIGHT - cs.display_vram_top); + cs.display_vram_height = (vertical_visible_end_line - std::max(vertical_display_start, vertical_visible_start_line)) + << height_shift; } } diff --git a/src/core/gpu_hw_d3d11.cpp b/src/core/gpu_hw_d3d11.cpp index 82c937a43..d1d243948 100644 --- a/src/core/gpu_hw_d3d11.cpp +++ b/src/core/gpu_hw_d3d11.cpp @@ -553,7 +553,9 @@ void GPU_HW_D3D11::UpdateDisplay() { m_host_display->ClearDisplayTexture(); } - else if (!m_GPUSTAT.display_area_color_depth_24 && !interlaced) + else if (!m_GPUSTAT.display_area_color_depth_24 && !interlaced && + (scaled_vram_offset_x + scaled_display_width) <= m_vram_texture.GetWidth() && + (scaled_vram_offset_y + scaled_display_height) <= m_vram_texture.GetHeight()) { m_host_display->SetDisplayTexture(m_vram_texture.GetD3DSRV(), m_vram_texture.GetWidth(), m_vram_texture.GetHeight(), scaled_vram_offset_x, scaled_vram_offset_y, @@ -567,15 +569,15 @@ void GPU_HW_D3D11::UpdateDisplay() const u32 reinterpret_field_offset = GetInterlacedField(); const u32 reinterpret_start_x = m_crtc_state.regs.X * m_resolution_scale; const u32 reinterpret_width = scaled_display_width + (m_crtc_state.display_vram_left - m_crtc_state.regs.X); - const u32 uniforms[4] = {reinterpret_field_offset, reinterpret_start_x}; + const u32 uniforms[4] = {reinterpret_start_x, scaled_vram_offset_y, reinterpret_field_offset}; ID3D11PixelShader* display_pixel_shader = m_display_pixel_shaders[BoolToUInt8(m_GPUSTAT.display_area_color_depth_24)][BoolToUInt8(interlaced)].Get(); - 
SetViewportAndScissor(reinterpret_start_x, scaled_vram_offset_y, reinterpret_width, scaled_display_height); + SetViewportAndScissor(0, reinterpret_field_offset, reinterpret_width, scaled_display_height); DrawUtilityShader(display_pixel_shader, uniforms, sizeof(uniforms)); m_host_display->SetDisplayTexture(m_display_texture.GetD3DSRV(), m_display_texture.GetWidth(), - m_display_texture.GetHeight(), scaled_vram_offset_x, scaled_vram_offset_y, + m_display_texture.GetHeight(), scaled_vram_offset_x - reinterpret_start_x, 0, scaled_display_width, scaled_display_height); RestoreGraphicsAPIState(); diff --git a/src/core/gpu_hw_opengl.cpp b/src/core/gpu_hw_opengl.cpp index c20039245..a70bc192a 100644 --- a/src/core/gpu_hw_opengl.cpp +++ b/src/core/gpu_hw_opengl.cpp @@ -506,7 +506,9 @@ void GPU_HW_OpenGL::UpdateDisplay() { m_host_display->ClearDisplayTexture(); } - else if (!m_GPUSTAT.display_area_color_depth_24 && !interlaced) + else if (!m_GPUSTAT.display_area_color_depth_24 && !interlaced && + (scaled_vram_offset_x + scaled_display_width) <= m_vram_texture.GetWidth() && + (scaled_vram_offset_y + scaled_display_height) <= m_vram_texture.GetHeight()) { m_host_display->SetDisplayTexture(reinterpret_cast(static_cast(m_vram_texture.GetGLId())), m_vram_texture.GetWidth(), m_vram_texture.GetHeight(), scaled_vram_offset_x, @@ -525,20 +527,19 @@ void GPU_HW_OpenGL::UpdateDisplay() const u32 flipped_vram_offset_y = VRAM_HEIGHT - vram_offset_y - display_height; const u32 scaled_flipped_vram_offset_y = m_vram_texture.GetHeight() - scaled_vram_offset_y - scaled_display_height; - const u32 reinterpret_field_offset = GetInterlacedField(); const u32 reinterpret_start_x = m_crtc_state.regs.X * m_resolution_scale; const u32 reinterpret_width = scaled_display_width + (m_crtc_state.display_vram_left - m_crtc_state.regs.X); - const u32 uniforms[4] = {reinterpret_field_offset, reinterpret_start_x}; + const u32 uniforms[4] = {reinterpret_start_x, scaled_flipped_vram_offset_y, 
reinterpret_field_offset}; UploadUniformBlock(uniforms, sizeof(uniforms)); m_batch_ubo_dirty = true; - glViewport(reinterpret_start_x, scaled_flipped_vram_offset_y, reinterpret_width, scaled_display_height); + glViewport(0, reinterpret_field_offset, reinterpret_width, scaled_display_height); glDrawArrays(GL_TRIANGLES, 0, 3); m_host_display->SetDisplayTexture(reinterpret_cast(static_cast(m_display_texture.GetGLId())), m_display_texture.GetWidth(), m_display_texture.GetHeight(), - scaled_vram_offset_x, m_vram_texture.GetHeight() - scaled_vram_offset_y, + scaled_vram_offset_x - reinterpret_start_x, scaled_display_height, scaled_display_width, -static_cast(scaled_display_height)); // restore state diff --git a/src/core/gpu_hw_shadergen.cpp b/src/core/gpu_hw_shadergen.cpp index 81523f990..386eab304 100644 --- a/src/core/gpu_hw_shadergen.cpp +++ b/src/core/gpu_hw_shadergen.cpp @@ -770,37 +770,39 @@ std::string GPU_HW_ShaderGen::GenerateDisplayFragmentShader(bool depth_24bit, bo DefineMacro(ss, "INTERLACED", interlaced); WriteCommonFunctions(ss); - DeclareUniformBuffer(ss, {"int u_field_offset", "int u_vram_start_x"}); + DeclareUniformBuffer(ss, {"uint2 u_vram_offset", "uint u_field_offset"}); DeclareTexture(ss, "samp0", 0); DeclareFragmentEntryPoint(ss, 0, 1, {}, true, false); ss << R"( { - int2 icoords = int2(v_pos.xy); + uint2 icoords = uint2(v_pos.xy) + u_vram_offset; #if INTERLACED - if (((fixYCoord(icoords.y) / RESOLUTION_SCALE) & 1) != u_field_offset) + if (((icoords.y / uint(RESOLUTION_SCALE)) & 1u) != u_field_offset) discard; #endif + //icoords.y = uint(fixYCoord(int(icoords.y))); + #if DEPTH_24BIT // relative to start of scanout - int relative_x = (icoords.x - u_vram_start_x) / RESOLUTION_SCALE; - icoords.x = u_vram_start_x + ((relative_x * 3) / 2) * RESOLUTION_SCALE; + uint relative_x = (icoords.x - u_vram_offset.x) / uint(RESOLUTION_SCALE); + icoords.x = u_vram_offset.x + ((relative_x * 3u) / 2u) * uint(RESOLUTION_SCALE); // load adjacent 16-bit texels - uint 
s0 = RGBA8ToRGBA5551(LOAD_TEXTURE(samp0, icoords, 0)); - uint s1 = RGBA8ToRGBA5551(LOAD_TEXTURE(samp0, icoords + int2(RESOLUTION_SCALE, 0), 0)); + uint s0 = RGBA8ToRGBA5551(LOAD_TEXTURE(samp0, int2(icoords % uint2(VRAM_SIZE)), 0)); + uint s1 = RGBA8ToRGBA5551(LOAD_TEXTURE(samp0, int2((icoords + uint2(uint(RESOLUTION_SCALE), 0)) % uint2(VRAM_SIZE)), 0)); // select which part of the combined 16-bit texels we are currently shading - uint s1s0 = ((s1 << 16) | s0) >> ((relative_x & 1) * 8); + uint s1s0 = ((s1 << 16) | s0) >> ((relative_x & 1u) * 8u); // extract components and normalize o_col0 = float4(float(s1s0 & 0xFFu) / 255.0, float((s1s0 >> 8u) & 0xFFu) / 255.0, float((s1s0 >> 16u) & 0xFFu) / 255.0, 1.0); #else // load and return - o_col0 = LOAD_TEXTURE(samp0, icoords, 0); + o_col0 = LOAD_TEXTURE(samp0, int2(icoords % uint2(VRAM_SIZE)), 0); #endif } )"; diff --git a/src/core/gpu_sw.cpp b/src/core/gpu_sw.cpp index 89640cced..14cfebb28 100644 --- a/src/core/gpu_sw.cpp +++ b/src/core/gpu_sw.cpp @@ -41,38 +41,99 @@ void GPU_SW::Reset() m_vram.fill(0); } -void GPU_SW::CopyOut15Bit(const u16* src_ptr, u32 src_stride, u32* dst_ptr, u32 dst_stride, u32 width, u32 height) +void GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32* dst_ptr, u32 dst_stride, u32 width, u32 height, bool interlaced) { - for (u32 row = 0; row < height; row++) - { - const u16* src_row_ptr = src_ptr; - u32* dst_row_ptr = dst_ptr; - for (u32 col = 0; col < width; col++) - *(dst_row_ptr++) = RGBA5551ToRGBA8888(*(src_row_ptr++)); + const u8 interlaced_shift = BoolToUInt8(interlaced); - src_ptr += src_stride; - dst_ptr += dst_stride; + // Fast path when not wrapping around. 
+ if ((src_x + width) <= VRAM_WIDTH && (src_y + height) <= VRAM_HEIGHT) + { + dst_stride <<= interlaced_shift; + height >>= interlaced_shift; + + const u16* src_ptr = &m_vram[src_y * VRAM_WIDTH + src_x]; + const u32 src_stride = VRAM_WIDTH << interlaced_shift; + for (u32 row = 0; row < height; row++) + { + const u16* src_row_ptr = src_ptr; + u32* dst_row_ptr = dst_ptr; + for (u32 col = 0; col < width; col++) + *(dst_row_ptr++) = RGBA5551ToRGBA8888(*(src_row_ptr++)); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } + } + else + { + dst_stride <<= interlaced_shift; + height >>= interlaced_shift; + + const u32 end_x = src_x + width; + for (u32 row = 0; row < height; row++) + { + const u16* src_row_ptr = &m_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH]; + u32* dst_row_ptr = dst_ptr; + + for (u32 col = src_x; col < end_x; col++) + *(dst_row_ptr++) = RGBA5551ToRGBA8888(src_row_ptr[col % VRAM_WIDTH]); + + src_y += (1 << interlaced_shift); + dst_ptr += dst_stride; + } } } -void GPU_SW::CopyOut24Bit(const u16* src_ptr, u32 src_stride, u32* dst_ptr, u32 dst_stride, u32 width, u32 height) +void GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32* dst_ptr, u32 dst_stride, u32 width, u32 height, bool interlaced) { - for (u32 row = 0; row < height; row++) + const u8 interlaced_shift = BoolToUInt8(interlaced); + + if ((src_x + ((width * 3 + 1) / 2)) <= VRAM_WIDTH && (src_y + height) <= VRAM_HEIGHT) { - const u8* src_row_ptr = reinterpret_cast(src_ptr); - u32* dst_row_ptr = dst_ptr; + dst_stride <<= interlaced_shift; + height >>= interlaced_shift; - // Beware unaligned accesses. 
- for (u32 col = 0; col < width; col++) + const u8* src_ptr = reinterpret_cast(&m_vram[src_y * VRAM_WIDTH + src_x]); + const u32 src_stride = (VRAM_WIDTH << interlaced_shift) * sizeof(u16); + for (u32 row = 0; row < height; row++) { - // This will fill the alpha channel with junk, but that's okay since we don't use it - std::memcpy(dst_row_ptr, src_row_ptr, sizeof(u32)); - src_row_ptr += 3; - dst_row_ptr++; - } + const u8* src_row_ptr = src_ptr; + u8* dst_row_ptr = reinterpret_cast(dst_ptr); + for (u32 col = 0; col < width; col++) + { + *(dst_row_ptr++) = *(src_row_ptr++); + *(dst_row_ptr++) = *(src_row_ptr++); + *(dst_row_ptr++) = *(src_row_ptr++); + *(dst_row_ptr++) = 0xFF; + } - src_ptr += src_stride; - dst_ptr += dst_stride; + src_ptr += src_stride; + dst_ptr += dst_stride; + } + } + else + { + dst_stride <<= interlaced_shift; + height >>= interlaced_shift; + + const u32 end_x = src_x + width; + for (u32 row = 0; row < height; row++) + { + const u16* src_row_ptr = &m_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH]; + u32* dst_row_ptr = dst_ptr; + + for (u32 col = 0; col < width; col++) + { + const u32 offset = (src_x + ((col * 3) / 2)); + const u16 s0 = src_row_ptr[offset % VRAM_WIDTH]; + const u16 s1 = src_row_ptr[(offset + 1) % VRAM_WIDTH]; + const u8 shift = static_cast(col & 1u) * 8; + *(dst_row_ptr++) = (((ZeroExtend32(s1) << 16) | ZeroExtend32(s0)) >> shift) | 0xFF000000u; + } + + src_y += (1 << interlaced_shift); + dst_ptr += dst_stride; + } } } @@ -98,34 +159,32 @@ void GPU_SW::UpdateDisplay() const u32 field = GetInterlacedField(); if (m_GPUSTAT.display_area_color_depth_24) { - CopyOut24Bit(m_vram.data() + (vram_offset_y + field) * VRAM_WIDTH + m_crtc_state.regs.X, VRAM_WIDTH * 2, - m_display_texture_buffer.data() + field * display_width, display_width * 2, display_width, - display_height / 2); + CopyOut24Bit(m_crtc_state.regs.X, vram_offset_y + field, m_display_texture_buffer.data() + field * VRAM_WIDTH, + VRAM_WIDTH, display_width, display_height, true); } 
else { - CopyOut15Bit(m_vram.data() + (vram_offset_y + field) * VRAM_WIDTH + m_crtc_state.regs.X, VRAM_WIDTH * 2, - m_display_texture_buffer.data() + field * display_width, display_width * 2, display_width, - display_height / 2); + CopyOut15Bit(m_crtc_state.regs.X, vram_offset_y + field, m_display_texture_buffer.data() + field * VRAM_WIDTH, + VRAM_WIDTH, display_width, display_height, true); } } else { if (m_GPUSTAT.display_area_color_depth_24) { - CopyOut24Bit(m_vram.data() + vram_offset_y * VRAM_WIDTH + m_crtc_state.regs.X, VRAM_WIDTH, - m_display_texture_buffer.data(), display_width, display_width, display_height); + CopyOut24Bit(m_crtc_state.regs.X, vram_offset_y, m_display_texture_buffer.data(), VRAM_WIDTH, display_width, + display_height, false); } else { - CopyOut15Bit(m_vram.data() + vram_offset_y * VRAM_WIDTH + m_crtc_state.regs.X, VRAM_WIDTH, - m_display_texture_buffer.data(), display_width, display_width, display_height); + CopyOut15Bit(m_crtc_state.regs.X, vram_offset_y, m_display_texture_buffer.data(), VRAM_WIDTH, display_width, + display_height, false); } } const u32 texture_offset_x = m_crtc_state.display_vram_left - m_crtc_state.regs.X; m_host_display->UpdateTexture(m_display_texture.get(), texture_offset_x, 0, display_width, display_height, - m_display_texture_buffer.data(), display_width * sizeof(u32)); + m_display_texture_buffer.data(), VRAM_WIDTH * sizeof(u32)); m_host_display->SetDisplayTexture(m_display_texture->GetHandle(), VRAM_WIDTH, VRAM_HEIGHT, texture_offset_x, 0, display_width, display_height); m_host_display->SetDisplayParameters(m_crtc_state.display_width, m_crtc_state.display_height, @@ -135,7 +194,7 @@ void GPU_SW::UpdateDisplay() } else { - CopyOut15Bit(m_vram.data(), VRAM_WIDTH, m_display_texture_buffer.data(), VRAM_WIDTH, VRAM_WIDTH, VRAM_HEIGHT); + CopyOut15Bit(0, 0, m_display_texture_buffer.data(), VRAM_WIDTH, VRAM_WIDTH, VRAM_HEIGHT, false); m_host_display->UpdateTexture(m_display_texture.get(), 0, 0, VRAM_WIDTH, VRAM_HEIGHT, 
m_display_texture_buffer.data(), VRAM_WIDTH * sizeof(u32)); m_host_display->SetDisplayTexture(m_display_texture->GetHandle(), VRAM_WIDTH, VRAM_HEIGHT, 0, 0, VRAM_WIDTH, diff --git a/src/core/gpu_sw.h b/src/core/gpu_sw.h index 5bdf20be6..dae7bb340 100644 --- a/src/core/gpu_sw.h +++ b/src/core/gpu_sw.h @@ -43,10 +43,8 @@ protected: ////////////////////////////////////////////////////////////////////////// // Scanout ////////////////////////////////////////////////////////////////////////// - static void CopyOut15Bit(const u16* src_ptr, u32 src_stride, u32* dst_ptr, u32 dst_stride, u32 width, u32 height); - - static void CopyOut24Bit(const u16* src_ptr, u32 src_stride, u32* dst_ptr, u32 dst_stride, u32 width, u32 height); - + void CopyOut15Bit(u32 src_x, u32 src_y, u32* dst_ptr, u32 dst_stride, u32 width, u32 height, bool interlaced); + void CopyOut24Bit(u32 src_x, u32 src_y, u32* dst_ptr, u32 dst_stride, u32 width, u32 height, bool interlaced); void UpdateDisplay() override; //////////////////////////////////////////////////////////////////////////