// Copyright 2009 Dolphin Emulator Project // Licensed under GPLv2+ // Refer to the license.txt file included. #include "VideoBackends/Software/EfbInterface.h" #include #include #include #include #include #include "Common/CommonTypes.h" #include "Common/Logging/Log.h" #include "Common/Swap.h" #include "VideoBackends/Software/CopyRegion.h" #include "VideoCommon/BPMemory.h" #include "VideoCommon/LookUpTables.h" #include "VideoCommon/PerfQueryBase.h" namespace EfbInterface { static std::array efb; static std::array perf_values; static inline u32 GetColorOffset(u16 x, u16 y) { return (x + y * EFB_WIDTH) * 3; } static inline u32 GetDepthOffset(u16 x, u16 y) { constexpr u32 depth_buffer_start = EFB_WIDTH * EFB_HEIGHT * 3; return (x + y * EFB_WIDTH) * 3 + depth_buffer_start; } static void SetPixelAlphaOnly(u32 offset, u8 a) { switch (bpmem.zcontrol.pixel_format) { case PEControl::RGB8_Z24: case PEControl::Z24: case PEControl::RGB565_Z16: // do nothing break; case PEControl::RGBA6_Z24: { u32 a32 = a; u32* dst = (u32*)&efb[offset]; u32 val = *dst & 0xffffffc0; val |= (a32 >> 2) & 0x0000003f; *dst = val; } break; default: ERROR_LOG(VIDEO, "Unsupported pixel format: %i", static_cast(bpmem.zcontrol.pixel_format)); } } static void SetPixelColorOnly(u32 offset, u8* rgb) { switch (bpmem.zcontrol.pixel_format) { case PEControl::RGB8_Z24: case PEControl::Z24: { u32 src = *(u32*)rgb; u32* dst = (u32*)&efb[offset]; u32 val = *dst & 0xff000000; val |= src >> 8; *dst = val; } break; case PEControl::RGBA6_Z24: { u32 src = *(u32*)rgb; u32* dst = (u32*)&efb[offset]; u32 val = *dst & 0xff00003f; val |= (src >> 4) & 0x00000fc0; // blue val |= (src >> 6) & 0x0003f000; // green val |= (src >> 8) & 0x00fc0000; // red *dst = val; } break; case PEControl::RGB565_Z16: { INFO_LOG(VIDEO, "RGB565_Z16 is not supported correctly yet"); u32 src = *(u32*)rgb; u32* dst = (u32*)&efb[offset]; u32 val = *dst & 0xff000000; val |= src >> 8; *dst = val; } break; default: ERROR_LOG(VIDEO, "Unsupported pixel format: %i", static_cast(bpmem.zcontrol.pixel_format)); } } static void SetPixelAlphaColor(u32 offset, u8* color) { switch (bpmem.zcontrol.pixel_format) { case PEControl::RGB8_Z24: case PEControl::Z24: { u32 src = *(u32*)color; u32* dst = (u32*)&efb[offset]; u32 val = *dst & 0xff000000; val |= src >> 8; *dst = val; } break; case PEControl::RGBA6_Z24: { u32 src = *(u32*)color; u32* dst = (u32*)&efb[offset]; u32 val = *dst & 0xff000000; val |= (src >> 2) & 0x0000003f; // alpha val |= (src >> 4) & 0x00000fc0; // blue val |= (src >> 6) & 0x0003f000; // green val |= (src >> 8) & 0x00fc0000; // red *dst = val; } break; case PEControl::RGB565_Z16: { INFO_LOG(VIDEO, "RGB565_Z16 is not supported correctly yet"); u32 src = *(u32*)color; u32* dst = (u32*)&efb[offset]; u32 val = *dst & 0xff000000; val |= src >> 8; *dst = val; } break; default: ERROR_LOG(VIDEO, "Unsupported pixel format: %i", static_cast(bpmem.zcontrol.pixel_format)); } } static u32 GetPixelColor(u32 offset) { u32 src; std::memcpy(&src, &efb[offset], sizeof(u32)); switch (bpmem.zcontrol.pixel_format) { case PEControl::RGB8_Z24: case PEControl::Z24: return 0xff | ((src & 0x00ffffff) << 8); case PEControl::RGBA6_Z24: return Convert6To8(src & 0x3f) | // Alpha Convert6To8((src >> 6) & 0x3f) << 8 | // Blue Convert6To8((src >> 12) & 0x3f) << 16 | // Green Convert6To8((src >> 18) & 0x3f) << 24; // Red case PEControl::RGB565_Z16: INFO_LOG(VIDEO, "RGB565_Z16 is not supported correctly yet"); return 0xff | ((src & 0x00ffffff) << 8); default: ERROR_LOG(VIDEO, "Unsupported pixel format: %i", static_cast(bpmem.zcontrol.pixel_format)); return 0; } } static void SetPixelDepth(u32 offset, u32 depth) { switch (bpmem.zcontrol.pixel_format) { case PEControl::RGB8_Z24: case PEControl::RGBA6_Z24: case PEControl::Z24: { u32* dst = (u32*)&efb[offset]; u32 val = *dst & 0xff000000; val |= depth & 0x00ffffff; *dst = val; } break; case PEControl::RGB565_Z16: { INFO_LOG(VIDEO, "RGB565_Z16 is not supported correctly yet"); u32* dst = (u32*)&efb[offset]; u32 val = *dst & 0xff000000; val |= depth & 0x00ffffff; *dst = val; } break; default: ERROR_LOG(VIDEO, "Unsupported pixel format: %i", static_cast(bpmem.zcontrol.pixel_format)); } } static u32 GetPixelDepth(u32 offset) { u32 depth = 0; switch (bpmem.zcontrol.pixel_format) { case PEControl::RGB8_Z24: case PEControl::RGBA6_Z24: case PEControl::Z24: { depth = (*(u32*)&efb[offset]) & 0x00ffffff; } break; case PEControl::RGB565_Z16: { INFO_LOG(VIDEO, "RGB565_Z16 is not supported correctly yet"); depth = (*(u32*)&efb[offset]) & 0x00ffffff; } break; default: ERROR_LOG(VIDEO, "Unsupported pixel format: %i", static_cast(bpmem.zcontrol.pixel_format)); } return depth; } static u32 GetSourceFactor(u8* srcClr, u8* dstClr, BlendMode::BlendFactor mode) { switch (mode) { case BlendMode::ZERO: return 0; case BlendMode::ONE: return 0xffffffff; case BlendMode::DSTCLR: return *(u32*)dstClr; case BlendMode::INVDSTCLR: return 0xffffffff - *(u32*)dstClr; case BlendMode::SRCALPHA: { u8 alpha = srcClr[ALP_C]; u32 factor = alpha << 24 | alpha << 16 | alpha << 8 | alpha; return factor; } case BlendMode::INVSRCALPHA: { u8 alpha = 0xff - srcClr[ALP_C]; u32 factor = alpha << 24 | alpha << 16 | alpha << 8 | alpha; return factor; } case BlendMode::DSTALPHA: { u8 alpha = dstClr[ALP_C]; u32 factor = alpha << 24 | alpha << 16 | alpha << 8 | alpha; return factor; } case BlendMode::INVDSTALPHA: { u8 alpha = 0xff - dstClr[ALP_C]; u32 factor = alpha << 24 | alpha << 16 | alpha << 8 | alpha; return factor; } } return 0; } static u32 GetDestinationFactor(u8* srcClr, u8* dstClr, BlendMode::BlendFactor mode) { switch (mode) { case BlendMode::ZERO: return 0; case BlendMode::ONE: return 0xffffffff; case BlendMode::SRCCLR: return *(u32*)srcClr; case BlendMode::INVSRCCLR: return 0xffffffff - *(u32*)srcClr; case BlendMode::SRCALPHA: { u8 alpha = srcClr[ALP_C]; u32 factor = alpha << 24 | alpha << 16 | alpha << 8 | alpha; return factor; } case BlendMode::INVSRCALPHA: { u8 alpha = 0xff - srcClr[ALP_C]; u32 factor = alpha << 24 | alpha << 16 | alpha << 8 | alpha; return factor; } case BlendMode::DSTALPHA: { u8 alpha = dstClr[ALP_C] & 0xff; u32 factor = alpha << 24 | alpha << 16 | alpha << 8 | alpha; return factor; } case BlendMode::INVDSTALPHA: { u8 alpha = 0xff - dstClr[ALP_C]; u32 factor = alpha << 24 | alpha << 16 | alpha << 8 | alpha; return factor; } } return 0; } static void BlendColor(u8* srcClr, u8* dstClr) { u32 srcFactor = GetSourceFactor(srcClr, dstClr, bpmem.blendmode.srcfactor); u32 dstFactor = GetDestinationFactor(srcClr, dstClr, bpmem.blendmode.dstfactor); for (int i = 0; i < 4; i++) { // add MSB of factors to make their range 0 -> 256 u32 sf = (srcFactor & 0xff); sf += sf >> 7; u32 df = (dstFactor & 0xff); df += df >> 7; u32 color = (srcClr[i] * sf + dstClr[i] * df) >> 8; dstClr[i] = (color > 255) ? 255 : color; dstFactor >>= 8; srcFactor >>= 8; } } static void LogicBlend(u32 srcClr, u32* dstClr, BlendMode::LogicOp op) { switch (op) { case BlendMode::CLEAR: *dstClr = 0; break; case BlendMode::AND: *dstClr = srcClr & *dstClr; break; case BlendMode::AND_REVERSE: *dstClr = srcClr & (~*dstClr); break; case BlendMode::COPY: *dstClr = srcClr; break; case BlendMode::AND_INVERTED: *dstClr = (~srcClr) & *dstClr; break; case BlendMode::NOOP: // Do nothing break; case BlendMode::XOR: *dstClr = srcClr ^ *dstClr; break; case BlendMode::OR: *dstClr = srcClr | *dstClr; break; case BlendMode::NOR: *dstClr = ~(srcClr | *dstClr); break; case BlendMode::EQUIV: *dstClr = ~(srcClr ^ *dstClr); break; case BlendMode::INVERT: *dstClr = ~*dstClr; break; case BlendMode::OR_REVERSE: *dstClr = srcClr | (~*dstClr); break; case BlendMode::COPY_INVERTED: *dstClr = ~srcClr; break; case BlendMode::OR_INVERTED: *dstClr = (~srcClr) | *dstClr; break; case BlendMode::NAND: *dstClr = ~(srcClr & *dstClr); break; case BlendMode::SET: *dstClr = 0xffffffff; break; } } static void SubtractBlend(u8* srcClr, u8* dstClr) { for (int i = 0; i < 4; i++) { int c = (int)dstClr[i] - (int)srcClr[i]; dstClr[i] = (c < 0) ? 0 : c; } } static void Dither(u16 x, u16 y, u8* color) { // No blending for RGB8 mode if (!bpmem.blendmode.dither || bpmem.zcontrol.pixel_format != PEControl::PixelFormat::RGBA6_Z24) return; // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering static const u8 dither[2][2] = {{0, 2}, {3, 1}}; // Only the color channels are dithered? for (int i = BLU_C; i <= RED_C; i++) color[i] = ((color[i] - (color[i] >> 6)) + dither[y & 1][x & 1]) & 0xfc; } void BlendTev(u16 x, u16 y, u8* color) { const u32 offset = GetColorOffset(x, y); u32 dstClr = GetPixelColor(offset); u8* dstClrPtr = (u8*)&dstClr; if (bpmem.blendmode.blendenable) { if (bpmem.blendmode.subtract) SubtractBlend(color, dstClrPtr); else BlendColor(color, dstClrPtr); } else if (bpmem.blendmode.logicopenable) { LogicBlend(*((u32*)color), &dstClr, bpmem.blendmode.logicmode); } else { dstClrPtr = color; } if (bpmem.dstalpha.enable) dstClrPtr[ALP_C] = bpmem.dstalpha.alpha; if (bpmem.blendmode.colorupdate) { Dither(x, y, dstClrPtr); if (bpmem.blendmode.alphaupdate) SetPixelAlphaColor(offset, dstClrPtr); else SetPixelColorOnly(offset, dstClrPtr); } else if (bpmem.blendmode.alphaupdate) { SetPixelAlphaOnly(offset, dstClrPtr[ALP_C]); } } void SetColor(u16 x, u16 y, u8* color) { u32 offset = GetColorOffset(x, y); if (bpmem.blendmode.colorupdate) { if (bpmem.blendmode.alphaupdate) SetPixelAlphaColor(offset, color); else SetPixelColorOnly(offset, color); } else if (bpmem.blendmode.alphaupdate) { SetPixelAlphaOnly(offset, color[ALP_C]); } } void SetDepth(u16 x, u16 y, u32 depth) { if (bpmem.zmode.updateenable) SetPixelDepth(GetDepthOffset(x, y), depth); } u32 GetColor(u16 x, u16 y) { u32 offset = GetColorOffset(x, y); return GetPixelColor(offset); } static u32 VerticalFilter(const std::array& colors, const std::array& filterCoefficients) { u8 in_colors[3][4]; std::memcpy(&in_colors, colors.data(), sizeof(in_colors)); // Alpha channel is not used u8 out_color[4]; out_color[ALP_C] = 0; // All Coefficients should sum to 64, otherwise the total brightness will change, which many games // do on purpose to implement a brightness filter across the whole copy. for (int i = BLU_C; i <= RED_C; i++) { // TODO: implement support for multisampling. // In non-multisampling mode: // * Coefficients 2, 3 and 4 sample from the current pixel. // * Coefficients 0 and 1 sample from the pixel above this one // * Coefficients 5 and 6 sample from the pixel below this one int sum = in_colors[0][i] * (filterCoefficients[0] + filterCoefficients[1]) + in_colors[1][i] * (filterCoefficients[2] + filterCoefficients[3] + filterCoefficients[4]) + in_colors[2][i] * (filterCoefficients[5] + filterCoefficients[6]); // TODO: this clamping behavior appears to be correct, but isn't confirmed on hardware. out_color[i] = std::min(255, sum >> 6); // clamp larger values to 255 } u32 out_color32; std::memcpy(&out_color32, out_color, sizeof(out_color32)); return out_color32; } static u32 GammaCorrection(u32 color, const float gamma_rcp) { u8 in_colors[4]; std::memcpy(&in_colors, &color, sizeof(in_colors)); u8 out_color[4]; for (int i = BLU_C; i <= RED_C; i++) { out_color[i] = static_cast( MathUtil::Clamp(std::pow(in_colors[i] / 255.0f, gamma_rcp) * 255.0f, 0.0f, 255.0f)); } u32 out_color32; std::memcpy(&out_color32, out_color, sizeof(out_color32)); return out_color32; } // For internal used only, return a non-normalized value, which saves work later. static yuv444 ConvertColorToYUV(u32 color) { const u8 red = static_cast(color >> 24); const u8 green = static_cast(color >> 16); const u8 blue = static_cast(color >> 8); // GameCube/Wii uses the BT.601 standard algorithm for converting to YCbCr; see // http://www.equasys.de/colorconversion.html#YCbCr-RGBColorFormatConversion return {static_cast(0.257f * red + 0.504f * green + 0.098f * blue), static_cast(-0.148f * red + -0.291f * green + 0.439f * blue), static_cast(0.439f * red + -0.368f * green + -0.071f * blue)}; } u32 GetDepth(u16 x, u16 y) { u32 offset = GetDepthOffset(x, y); return GetPixelDepth(offset); } u8* GetPixelPointer(u16 x, u16 y, bool depth) { if (depth) return &efb[GetDepthOffset(x, y)]; return &efb[GetColorOffset(x, y)]; } void EncodeXFB(u8* xfb_in_ram, u32 memory_stride, const MathUtil::Rectangle& source_rect, float y_scale, float gamma) { if (!xfb_in_ram) { WARN_LOG(VIDEO, "Tried to copy to invalid XFB address"); return; } const int left = source_rect.left; const int right = source_rect.right; const bool clamp_top = bpmem.triggerEFBCopy.clamp_top; const bool clamp_bottom = bpmem.triggerEFBCopy.clamp_bottom; const float gamma_rcp = 1.0f / gamma; const auto filter_coefficients = bpmem.copyfilter.GetCoefficients(); // this assumes copies will always start on an even (YU) pixel and the // copy always has an even width, which might not be true. if (left & 1 || right & 1) { WARN_LOG(VIDEO, "Trying to copy XFB to from unaligned EFB source"); // this will show up as wrongly encoded } // Scanline buffer, leave room for borders yuv444 scanline[EFB_WIDTH + 2]; static std::vector source; source.resize(EFB_WIDTH * EFB_HEIGHT); yuv422_packed* src_ptr = &source[0]; for (int y = source_rect.top; y < source_rect.bottom; y++) { // Clamping behavior // NOTE: when the clamp bits aren't set, the hardware will happily read beyond the EFB, // which returns random garbage from the empty bus (confirmed by hardware tests). // // In our implementation, the garbage just so happens to be the top or bottom row. // Statistically, that could happen. const u16 y_prev = static_cast(std::max(clamp_top ? source_rect.top : 0, y - 1)); const u16 y_next = static_cast(std::min(clamp_bottom ? source_rect.bottom : EFB_HEIGHT, y + 1)); // Get a scanline of YUV pixels in 4:4:4 format for (int i = 1, x = left; x < right; i++, x++) { // Get RGB colors std::array colors = {{GetColor(x, y_prev), GetColor(x, y), GetColor(x, y_next)}}; // Vertical Filter (Multisampling resolve, deflicker, brightness) u32 filtered = VerticalFilter(colors, filter_coefficients); // Gamma correction happens here. filtered = GammaCorrection(filtered, gamma_rcp); scanline[i] = ConvertColorToYUV(filtered); } // Flipper clamps the border colors scanline[0] = scanline[1]; scanline[right + 1] = scanline[right]; // And Downsample them to 4:2:2 for (int i = 1, x = left; x < right; i += 2, x += 2) { // YU pixel src_ptr[x].Y = scanline[i].Y + 16; // we mix our color differences in 10 bit space so it will round more accurately // U[i] = 1/4 * U[i-1] + 1/2 * U[i] + 1/4 * U[i+1] src_ptr[x].UV = 128 + ((scanline[i - 1].U + (scanline[i].U << 1) + scanline[i + 1].U) >> 2); // YV pixel src_ptr[x + 1].Y = scanline[i + 1].Y + 16; // V[i] = 1/4 * V[i-1] + 1/2 * V[i] + 1/4 * V[i+1] src_ptr[x + 1].UV = 128 + ((scanline[i - 1].V + (scanline[i].V << 1) + scanline[i + 1].V) >> 2); } src_ptr += memory_stride; } auto dest_rect = MathUtil::Rectangle{source_rect.left, source_rect.top, source_rect.right, static_cast(static_cast(source_rect.bottom) * y_scale)}; const std::size_t destination_size = dest_rect.GetWidth() * dest_rect.GetHeight() * 2; static std::vector destination; destination.resize(dest_rect.GetWidth() * dest_rect.GetHeight()); SW::CopyRegion(source.data(), source_rect, destination.data(), dest_rect); memcpy(xfb_in_ram, destination.data(), destination_size); } bool ZCompare(u16 x, u16 y, u32 z) { u32 offset = GetDepthOffset(x, y); u32 depth = GetPixelDepth(offset); bool pass; switch (bpmem.zmode.func) { case ZMode::NEVER: pass = false; break; case ZMode::LESS: pass = z < depth; break; case ZMode::EQUAL: pass = z == depth; break; case ZMode::LEQUAL: pass = z <= depth; break; case ZMode::GREATER: pass = z > depth; break; case ZMode::NEQUAL: pass = z != depth; break; case ZMode::GEQUAL: pass = z >= depth; break; case ZMode::ALWAYS: pass = true; break; default: pass = false; ERROR_LOG(VIDEO, "Bad Z compare mode %i", (int)bpmem.zmode.func); } if (pass && bpmem.zmode.updateenable) { SetPixelDepth(offset, z); } return pass; } u32 GetPerfQueryResult(PerfQueryType type) { return perf_values[type]; } void ResetPerfQuery() { perf_values = {}; } void IncPerfCounterQuadCount(PerfQueryType type) { // NOTE: hardware doesn't process individual pixels but quads instead. // Current software renderer architecture works on pixels though, so // we have this "quad" hack here to only increment the registers on // every fourth rendered pixel static u32 quad[PQ_NUM_MEMBERS]; if (++quad[type] != 3) return; quad[type] = 0; ++perf_values[type]; } }