diff --git a/Source/Core/Core/CMakeLists.txt b/Source/Core/Core/CMakeLists.txt index ab0eff73f3..4f7176185e 100644 --- a/Source/Core/Core/CMakeLists.txt +++ b/Source/Core/Core/CMakeLists.txt @@ -199,6 +199,7 @@ if(_M_X86) PowerPC/Jit64/Jit_Paired.cpp PowerPC/Jit64/JitRegCache.cpp PowerPC/Jit64/Jit_SystemRegisters.cpp + PowerPC/Jit64Common/Jit64AsmCommon.cpp PowerPC/JitCommon/JitBackpatch.cpp PowerPC/JitCommon/Jit_Util.cpp PowerPC/JitCommon/TrampolineCache.cpp) diff --git a/Source/Core/Core/Core.vcxproj b/Source/Core/Core/Core.vcxproj index bf34fbabbe..23f0167763 100644 --- a/Source/Core/Core/Core.vcxproj +++ b/Source/Core/Core/Core.vcxproj @@ -235,6 +235,7 @@ + @@ -417,6 +418,7 @@ + diff --git a/Source/Core/Core/Core.vcxproj.filters b/Source/Core/Core/Core.vcxproj.filters index d8213794ae..3fc6e92433 100644 --- a/Source/Core/Core/Core.vcxproj.filters +++ b/Source/Core/Core/Core.vcxproj.filters @@ -631,6 +631,9 @@ PowerPC + + PowerPC\Jit64Common + PowerPC\JitCommon @@ -1184,6 +1187,9 @@ PowerPC + + PowerPC\Jit64Common + PowerPC\JitCommon @@ -1229,4 +1235,4 @@ - \ No newline at end of file + diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.h b/Source/Core/Core/PowerPC/Jit64/JitAsm.h index cd6e1bde08..1738ec97b6 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitAsm.h +++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.h @@ -4,7 +4,7 @@ #pragma once -#include "Core/PowerPC/JitCommon/JitAsmCommon.h" +#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h" // In Dolphin, we don't use inline assembly. Instead, we generate all machine-near // code at runtime. In the case of fixed code like this, after writing it, we write diff --git a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp new file mode 100644 index 0000000000..11961dd1a8 --- /dev/null +++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp @@ -0,0 +1,601 @@ +// Copyright 2013 Dolphin Emulator Project +// Licensed under GPLv2 +// Refer to the license.txt file included. + +#include "Common/MathUtil.h" +#include "Common/x64ABI.h" +#include "Common/x64Emitter.h" + +#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h" +#include "Core/PowerPC/JitCommon/JitBase.h" + +#define QUANTIZED_REGS_TO_SAVE \ + (ABI_ALL_CALLER_SAVED & ~BitSet32 { \ + RSCRATCH, RSCRATCH2, RSCRATCH_EXTRA, XMM0+16, XMM1+16 \ + }) + +#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | BitSet32 { RSCRATCH2 }) + +using namespace Gen; + +void CommonAsmRoutines::GenFifoWrite(int size) +{ + // Assume value in RSCRATCH + u32 gather_pipe = (u32)(u64)GPFifo::m_gatherPipe; + _assert_msg_(DYNA_REC, gather_pipe <= 0x7FFFFFFF, "Gather pipe not in low 2GB of memory!"); + MOV(32, R(RSCRATCH2), M(&GPFifo::m_gatherPipeCount)); + SwapAndStore(size, MDisp(RSCRATCH2, gather_pipe), RSCRATCH); + ADD(32, R(RSCRATCH2), Imm8(size >> 3)); + MOV(32, M(&GPFifo::m_gatherPipeCount), R(RSCRATCH2)); + RET(); +} + +void CommonAsmRoutines::GenFrsqrte() +{ + // Assume input in XMM0. + // This function clobbers all three RSCRATCH. + MOVQ_xmm(R(RSCRATCH), XMM0); + + // Negative and zero inputs set an exception and take the complex path. + TEST(64, R(RSCRATCH), R(RSCRATCH)); + FixupBranch zero = J_CC(CC_Z, true); + FixupBranch negative = J_CC(CC_S, true); + MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH)); + SHR(64, R(RSCRATCH_EXTRA), Imm8(52)); + + // Zero and max exponents (non-normal floats) take the complex path. 
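The checks for those cases follow directly below; for reference, the fast path they guard encodes roughly the scalar computation sketched here, restating the inline comments and the MathUtil tables the emitted code indexes. The helper name, the std::memcpy bit-casts and the <cstring> include are illustrative only and assume Dolphin's u64/s64 typedefs; zero, negative and non-normal inputs instead fall through to MathUtil::ApproximateReciprocalSquareRoot, as in the generated code.

#include <cstring>

// Rough scalar sketch of the frsqrte fast path (illustrative, not part of this change).
static double FrsqrteFastPathSketch(double val)
{
    u64 bits;
    std::memcpy(&bits, &val, sizeof(bits));

    s64 exponent = bits & (0x7FFLL << 52);
    s64 mantissa = bits & ((1LL << 52) - 1);

    // exponent = ((0x3FFLL << 52) - ((exponent - (0x3FELL << 52)) / 2)) & (0x7FFLL << 52);
    s64 vali = ((0x3FFLL << 52) - ((exponent - (0x3FELL << 52)) / 2)) & (0x7FFLL << 52);

    // int index = i / 2048 + (odd_exponent ? 16 : 0);
    bool odd_exponent = !(exponent & (1LL << 52));
    int i = static_cast<int>(mantissa >> 37);
    int index = i / 2048 + (odd_exponent ? 16 : 0);

    // vali |= (s64)(frsqrte_expected_base[index] - frsqrte_expected_dec[index] * (i % 2048)) << 26;
    vali |= static_cast<s64>(MathUtil::frsqrte_expected_base[index] -
                             MathUtil::frsqrte_expected_dec[index] * (i % 2048)) << 26;

    double result;
    std::memcpy(&result, &vali, sizeof(result));
    return result;
}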
+ FixupBranch complex1 = J_CC(CC_Z, true); + CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF)); + FixupBranch complex2 = J_CC(CC_E, true); + + SUB(32, R(RSCRATCH_EXTRA), Imm32(0x3FD)); + SAR(32, R(RSCRATCH_EXTRA), Imm8(1)); + MOV(32, R(RSCRATCH2), Imm32(0x3FF)); + SUB(32, R(RSCRATCH2), R(RSCRATCH_EXTRA)); + SHL(64, R(RSCRATCH2), Imm8(52)); // exponent = ((0x3FFLL << 52) - ((exponent - (0x3FELL << 52)) / 2)) & (0x7FFLL << 52); + + MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH)); + SHR(64, R(RSCRATCH_EXTRA), Imm8(48)); + AND(32, R(RSCRATCH_EXTRA), Imm8(0x1F)); + XOR(32, R(RSCRATCH_EXTRA), Imm8(0x10)); // int index = i / 2048 + (odd_exponent ? 16 : 0); + + SHR(64, R(RSCRATCH), Imm8(37)); + AND(32, R(RSCRATCH), Imm32(0x7FF)); + IMUL(32, RSCRATCH, MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_dec)); + MOV(32, R(RSCRATCH_EXTRA), MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_base)); + SUB(32, R(RSCRATCH_EXTRA), R(RSCRATCH)); + SHL(64, R(RSCRATCH_EXTRA), Imm8(26)); + OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(frsqrte_expected_base[index] - frsqrte_expected_dec[index] * (i % 2048)) << 26; + MOVQ_xmm(XMM0, R(RSCRATCH2)); + RET(); + + // Exception flags for zero input. + SetJumpTarget(zero); + TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX)); + FixupBranch skip_set_fx1 = J_CC(CC_NZ); + OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX)); + FixupBranch complex3 = J(); + + // Exception flags for negative input. + SetJumpTarget(negative); + TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_VXSQRT)); + FixupBranch skip_set_fx2 = J_CC(CC_NZ); + OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_VXSQRT)); + + SetJumpTarget(skip_set_fx1); + SetJumpTarget(skip_set_fx2); + SetJumpTarget(complex1); + SetJumpTarget(complex2); + SetJumpTarget(complex3); + ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8); + ABI_CallFunction((void *)&MathUtil::ApproximateReciprocalSquareRoot); + ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8); + RET(); +} + +void CommonAsmRoutines::GenFres() +{ + // Assume input in XMM0. + // This function clobbers all three RSCRATCH. + MOVQ_xmm(R(RSCRATCH), XMM0); + + // Zero inputs set an exception and take the complex path. + TEST(64, R(RSCRATCH), R(RSCRATCH)); + FixupBranch zero = J_CC(CC_Z); + + MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH)); + SHR(64, R(RSCRATCH_EXTRA), Imm8(52)); + MOV(32, R(RSCRATCH2), R(RSCRATCH_EXTRA)); + AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF)); // exp + AND(32, R(RSCRATCH2), Imm32(0x800)); // sign + CMP(32, R(RSCRATCH_EXTRA), Imm32(895)); + // Take the complex path for very large/small exponents. 
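Here too the range checks come immediately below; the fast path they guard corresponds roughly to this scalar computation, again restating the inline comments (illustrative helper only, same assumptions as the frsqrte sketch above):

// Rough scalar sketch of the fres fast path (illustrative, not part of this change).
// Zero inputs and exponent fields outside [895, 1149) fall through to
// MathUtil::ApproximateReciprocal, as in the emitted code.
static double FresFastPathSketch(double val)
{
    u64 bits;
    std::memcpy(&bits, &val, sizeof(bits));

    u64 sign = bits & (1ULL << 63);
    u64 exponent = bits & (0x7FFULL << 52);
    u64 mantissa = bits & ((1ULL << 52) - 1);

    // vali = sign | exponent
    u64 vali = sign | ((0x7FDULL << 52) - exponent);

    // vali |= (s64)(fres_expected_base[i / 1024] - (fres_expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29
    int i = static_cast<int>(mantissa >> 37);
    vali |= static_cast<u64>(MathUtil::fres_expected_base[i / 1024] -
                             (MathUtil::fres_expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29;

    double result;
    std::memcpy(&result, &vali, sizeof(result));
    return result;
}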
+ FixupBranch complex1 = J_CC(CC_L); + CMP(32, R(RSCRATCH_EXTRA), Imm32(1149)); + FixupBranch complex2 = J_CC(CC_GE); + + SUB(32, R(RSCRATCH_EXTRA), Imm32(0x7FD)); + NEG(32, R(RSCRATCH_EXTRA)); + OR(32, R(RSCRATCH_EXTRA), R(RSCRATCH2)); + SHL(64, R(RSCRATCH_EXTRA), Imm8(52)); // vali = sign | exponent + + MOV(64, R(RSCRATCH2), R(RSCRATCH)); + SHR(64, R(RSCRATCH), Imm8(37)); + SHR(64, R(RSCRATCH2), Imm8(47)); + AND(32, R(RSCRATCH), Imm32(0x3FF)); // i % 1024 + AND(32, R(RSCRATCH2), Imm8(0x1F)); // i / 1024 + + IMUL(32, RSCRATCH, MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_dec)); + ADD(32, R(RSCRATCH), Imm8(1)); + SHR(32, R(RSCRATCH), Imm8(1)); + + MOV(32, R(RSCRATCH2), MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_base)); + SUB(32, R(RSCRATCH2), R(RSCRATCH)); + SHL(64, R(RSCRATCH2), Imm8(29)); + OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(fres_expected_base[i / 1024] - (fres_expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29 + MOVQ_xmm(XMM0, R(RSCRATCH2)); + RET(); + + // Exception flags for zero input. + SetJumpTarget(zero); + TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX)); + FixupBranch skip_set_fx1 = J_CC(CC_NZ); + OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX)); + SetJumpTarget(skip_set_fx1); + + SetJumpTarget(complex1); + SetJumpTarget(complex2); + ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8); + ABI_CallFunction((void *)&MathUtil::ApproximateReciprocal); + ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8); + RET(); +} + +void CommonAsmRoutines::GenMfcr() +{ + // Input: none + // Output: RSCRATCH + // This function clobbers all three RSCRATCH. + X64Reg dst = RSCRATCH; + X64Reg tmp = RSCRATCH2; + X64Reg cr_val = RSCRATCH_EXTRA; + XOR(32, R(dst), R(dst)); + // we only need to zero the high bits of tmp once + XOR(32, R(tmp), R(tmp)); + for (int i = 0; i < 8; i++) + { + static const u32 m_flagTable[8] = { 0x0, 0x1, 0x8, 0x9, 0x0, 0x1, 0x8, 0x9 }; + if (i != 0) + SHL(32, R(dst), Imm8(4)); + + MOV(64, R(cr_val), PPCSTATE(cr_val[i])); + + // EQ: Bits 31-0 == 0; set flag bit 1 + TEST(32, R(cr_val), R(cr_val)); + // FIXME: is there a better way to do this without the partial register merging? + SETcc(CC_Z, R(tmp)); + LEA(32, dst, MComplex(dst, tmp, SCALE_2, 0)); + + // GT: Value > 0; set flag bit 2 + TEST(64, R(cr_val), R(cr_val)); + SETcc(CC_G, R(tmp)); + LEA(32, dst, MComplex(dst, tmp, SCALE_4, 0)); + + // SO: Bit 61 set; set flag bit 0 + // LT: Bit 62 set; set flag bit 3 + SHR(64, R(cr_val), Imm8(61)); + OR(32, R(dst), MScaled(cr_val, SCALE_4, (u32)(u64)m_flagTable)); + } + RET(); +} + +// Safe + Fast Quantizers, originally from JITIL by magumagu +static const float GC_ALIGNED16(m_65535[4]) = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; +static const float GC_ALIGNED16(m_32767) = 32767.0f; +static const float GC_ALIGNED16(m_m32768) = -32768.0f; +static const float GC_ALIGNED16(m_255) = 255.0f; +static const float GC_ALIGNED16(m_127) = 127.0f; +static const float GC_ALIGNED16(m_m128) = -128.0f; + +#define QUANTIZE_OVERFLOW_SAFE + +// according to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value is out of int32 range +// while it's OK for large negatives, it isn't for positives +// I don't know whether the overflow actually happens in any games +// but it potentially can cause problems, so we need some clamping + +// See comment in header for in/outs. 
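As a scalar model of what the generated store routines below compute per element: the input float is scaled by the m_quantizeTableS entry selected through RSCRATCH2, clamped to the destination type's range, and truncated toward zero. The helper below is illustrative only (it assumes <algorithm> and <limits>, and it clamps explicitly where the packed paths rely on PACKSSDW/PACKUSWB saturation plus the MINPS m_65535 overflow guard):

// Illustrative scalar model of one quantized store element (hypothetical helper).
// 'scale_factor' stands for the m_quantizeTableS entry picked via RSCRATCH2;
// T is u8, s8, u16 or s16 depending on the quantize type.
template <typename T>
static T QuantizeElementSketch(float value, float scale_factor)
{
    float scaled = value * scale_factor;                            // MULPS / MULSS
    float lo = static_cast<float>(std::numeric_limits<T>::min());  // 0, m_m128 or m_m32768
    float hi = static_cast<float>(std::numeric_limits<T>::max());  // m_255, m_127, m_65535 or m_32767
    scaled = std::max(lo, std::min(scaled, hi));                    // clamp
    return static_cast<T>(scaled);                                  // truncate, as CVTTPS2DQ/CVTTSS2SI do
}

The storePairedU8 case, for example, amounts to QuantizeElementSketch<u8> applied to both lanes before the 16-bit store; storeSingleS16 is the <s16> case followed by a 16-bit store.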
+void CommonAsmRoutines::GenQuantizedStores() +{ + const u8* storePairedIllegal = AlignCode4(); + UD2(); + + const u8* storePairedFloat = AlignCode4(); + if (cpu_info.bSSSE3) + { + PSHUFB(XMM0, M((void *)pbswapShuffle2x4)); + MOVQ_xmm(R(RSCRATCH), XMM0); + } + else + { + MOVQ_xmm(R(RSCRATCH), XMM0); + ROL(64, R(RSCRATCH), Imm8(32)); + BSWAP(64, RSCRATCH); + } + SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); + + RET(); + + const u8* storePairedU8 = AlignCode4(); + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); + MULPS(XMM0, R(XMM1)); +#ifdef QUANTIZE_OVERFLOW_SAFE + MINPS(XMM0, M(m_65535)); +#endif + CVTTPS2DQ(XMM0, R(XMM0)); + PACKSSDW(XMM0, R(XMM0)); + PACKUSWB(XMM0, R(XMM0)); + MOVD_xmm(R(RSCRATCH), XMM0); + SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); + + RET(); + + const u8* storePairedS8 = AlignCode4(); + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); + MULPS(XMM0, R(XMM1)); +#ifdef QUANTIZE_OVERFLOW_SAFE + MINPS(XMM0, M(m_65535)); +#endif + CVTTPS2DQ(XMM0, R(XMM0)); + PACKSSDW(XMM0, R(XMM0)); + PACKSSWB(XMM0, R(XMM0)); + MOVD_xmm(R(RSCRATCH), XMM0); + + SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); + + RET(); + + const u8* storePairedU16 = AlignCode4(); + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); + MULPS(XMM0, R(XMM1)); + + if (cpu_info.bSSE4_1) + { +#ifdef QUANTIZE_OVERFLOW_SAFE + MINPS(XMM0, M(m_65535)); +#endif + CVTTPS2DQ(XMM0, R(XMM0)); + PACKUSDW(XMM0, R(XMM0)); + MOVD_xmm(R(RSCRATCH), XMM0); + BSWAP(32, RSCRATCH); + ROL(32, R(RSCRATCH), Imm8(16)); + } + else + { + XORPS(XMM1, R(XMM1)); + MAXPS(XMM0, R(XMM1)); + MINPS(XMM0, M(m_65535)); + + CVTTPS2DQ(XMM0, R(XMM0)); + PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____ + MOVD_xmm(R(RSCRATCH), XMM0); + BSWAP(32, RSCRATCH); + } + + SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); + + RET(); + + const u8* storePairedS16 = AlignCode4(); + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); + MULPS(XMM0, R(XMM1)); +#ifdef QUANTIZE_OVERFLOW_SAFE + MINPS(XMM0, M(m_65535)); +#endif + CVTTPS2DQ(XMM0, R(XMM0)); + PACKSSDW(XMM0, R(XMM0)); + MOVD_xmm(R(RSCRATCH), XMM0); + BSWAP(32, RSCRATCH); + ROL(32, R(RSCRATCH), Imm8(16)); + SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); + + RET(); + + pairedStoreQuantized = reinterpret_cast(const_cast(AlignCode16())); + ReserveCodeSpace(8 * sizeof(u8*)); + + pairedStoreQuantized[0] = storePairedFloat; + pairedStoreQuantized[1] = storePairedIllegal; + pairedStoreQuantized[2] = storePairedIllegal; + pairedStoreQuantized[3] = storePairedIllegal; + pairedStoreQuantized[4] = storePairedU8; + pairedStoreQuantized[5] = storePairedU16; + pairedStoreQuantized[6] = storePairedS8; + pairedStoreQuantized[7] = storePairedS16; +} + +// See comment in header for in/outs. +void CommonAsmRoutines::GenQuantizedSingleStores() +{ + const u8* storeSingleIllegal = AlignCode4(); + UD2(); + + // Easy! 
+ const u8* storeSingleFloat = AlignCode4(); + MOVD_xmm(R(RSCRATCH), XMM0); + SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); + RET(); + + const u8* storeSingleU8 = AlignCode4(); // Used by MKWii + SHR(32, R(RSCRATCH2), Imm8(5)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); + XORPS(XMM1, R(XMM1)); + MAXSS(XMM0, R(XMM1)); + MINSS(XMM0, M(&m_255)); + CVTTSS2SI(RSCRATCH, R(XMM0)); + SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); + RET(); + + const u8* storeSingleS8 = AlignCode4(); + SHR(32, R(RSCRATCH2), Imm8(5)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); + MAXSS(XMM0, M(&m_m128)); + MINSS(XMM0, M(&m_127)); + CVTTSS2SI(RSCRATCH, R(XMM0)); + SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); + RET(); + + const u8* storeSingleU16 = AlignCode4(); // Used by MKWii + SHR(32, R(RSCRATCH2), Imm8(5)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); + XORPS(XMM1, R(XMM1)); + MAXSS(XMM0, R(XMM1)); + MINSS(XMM0, M(m_65535)); + CVTTSS2SI(RSCRATCH, R(XMM0)); + SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); + RET(); + + const u8* storeSingleS16 = AlignCode4(); + SHR(32, R(RSCRATCH2), Imm8(5)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); + MAXSS(XMM0, M(&m_m32768)); + MINSS(XMM0, M(&m_32767)); + CVTTSS2SI(RSCRATCH, R(XMM0)); + SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); + RET(); + + singleStoreQuantized = reinterpret_cast(const_cast(AlignCode16())); + ReserveCodeSpace(8 * sizeof(u8*)); + + singleStoreQuantized[0] = storeSingleFloat; + singleStoreQuantized[1] = storeSingleIllegal; + singleStoreQuantized[2] = storeSingleIllegal; + singleStoreQuantized[3] = storeSingleIllegal; + singleStoreQuantized[4] = storeSingleU8; + singleStoreQuantized[5] = storeSingleU16; + singleStoreQuantized[6] = storeSingleS8; + singleStoreQuantized[7] = storeSingleS16; +} + +void CommonAsmRoutines::GenQuantizedLoads() +{ + const u8* loadPairedIllegal = AlignCode4(); + UD2(); + + // FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e. + // don't need hardware access handling. This will definitely crash if paired loads occur + // from non-RAM areas, but as far as I know, this never happens. I don't know if this is + // for a good reason, or merely because no game does this. + // If we find something that actually does do this, maybe this should be changed. How + // much of a performance hit would it be? 
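Setting that question aside, the arithmetic in the load routines below reduces to a small scalar model: each integer element (byte-swapped beforehand by LoadAndSwap, PSHUFB or SafeLoadToReg) is converted to float and multiplied by the m_dequantizeTableS entry selected through RSCRATCH2, and the one-element variants leave 1.0f (m_one) in the second slot; the plain float cases skip the multiply. The helper is illustrative only:

// Illustrative scalar model of a dequantizing paired load (hypothetical helper).
// 'scale_factor' stands for the m_dequantizeTableS entry picked via RSCRATCH2;
// T is u8, s8, u16 or s16, and count is 1 or 2.
template <typename T>
static void DequantizeSketch(const T* src, int count, float scale_factor, float out[2])
{
    out[1] = 1.0f;  // the "One" variants keep ps1 == 1.0f (the UNPCKLPS with m_one)
    for (int n = 0; n < count; n++)
        out[n] = static_cast<float>(src[n]) * scale_factor;  // CVTDQ2PS/CVTSI2SS + MULPS/MULSS
}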
+ const u8* loadPairedFloatTwo = AlignCode4(); + if (jit->js.memcheck) + { + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); + ROL(64, R(RSCRATCH_EXTRA), Imm8(32)); + MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA)); + } + else if (cpu_info.bSSSE3) + { + MOVQ_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0)); + PSHUFB(XMM0, M(pbswapShuffle2x4)); + } + else + { + LoadAndSwap(64, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0)); + ROL(64, R(RSCRATCH_EXTRA), Imm8(32)); + MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA)); + } + RET(); + + const u8* loadPairedFloatOne = AlignCode4(); + if (jit->js.memcheck) + { + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); + MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); + UNPCKLPS(XMM0, M(m_one)); + } + else if (cpu_info.bSSSE3) + { + MOVD_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0)); + PSHUFB(XMM0, M(pbswapShuffle1x4)); + UNPCKLPS(XMM0, M(m_one)); + } + else + { + LoadAndSwap(32, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0)); + MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); + UNPCKLPS(XMM0, M(m_one)); + } + RET(); + + const u8* loadPairedU8Two = AlignCode4(); + if (jit->js.memcheck) + { + // TODO: Support not swapping in safeLoadToReg to avoid bswapping twice + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); + ROR(16, R(RSCRATCH_EXTRA), Imm8(8)); + } + else + { + UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0); + } + MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); + if (cpu_info.bSSE4_1) + { + PMOVZXBD(XMM0, R(XMM0)); + } + else + { + PXOR(XMM1, R(XMM1)); + PUNPCKLBW(XMM0, R(XMM1)); + PUNPCKLWD(XMM0, R(XMM1)); + } + CVTDQ2PS(XMM0, R(XMM0)); + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); + MULPS(XMM0, R(XMM1)); + RET(); + + const u8* loadPairedU8One = AlignCode4(); + if (jit->js.memcheck) + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); + else + UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx + CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); + SHR(32, R(RSCRATCH2), Imm8(5)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); + UNPCKLPS(XMM0, M(m_one)); + RET(); + + const u8* loadPairedS8Two = AlignCode4(); + if (jit->js.memcheck) + { + // TODO: Support not swapping in safeLoadToReg to avoid bswapping twice + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); + ROR(16, R(RSCRATCH_EXTRA), Imm8(8)); + } + else + { + UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0); + } + MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); + if (cpu_info.bSSE4_1) + { + PMOVSXBD(XMM0, R(XMM0)); + } + else + { + PUNPCKLBW(XMM0, R(XMM0)); + PUNPCKLWD(XMM0, R(XMM0)); + PSRAD(XMM0, 24); + } + CVTDQ2PS(XMM0, R(XMM0)); + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); + MULPS(XMM0, R(XMM1)); + RET(); + + const u8* loadPairedS8One = AlignCode4(); + if (jit->js.memcheck) + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); + else + UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true); + 
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); + SHR(32, R(RSCRATCH2), Imm8(5)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); + UNPCKLPS(XMM0, M(m_one)); + RET(); + + const u8* loadPairedU16Two = AlignCode4(); + // TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice + if (jit->js.memcheck) + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); + else + UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false); + ROL(32, R(RSCRATCH_EXTRA), Imm8(16)); + MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); + if (cpu_info.bSSE4_1) + { + PMOVZXWD(XMM0, R(XMM0)); + } + else + { + PXOR(XMM1, R(XMM1)); + PUNPCKLWD(XMM0, R(XMM1)); + } + CVTDQ2PS(XMM0, R(XMM0)); + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); + MULPS(XMM0, R(XMM1)); + RET(); + + const u8* loadPairedU16One = AlignCode4(); + if (jit->js.memcheck) + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); + else + UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false); + CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); + SHR(32, R(RSCRATCH2), Imm8(5)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); + UNPCKLPS(XMM0, M(m_one)); + RET(); + + const u8* loadPairedS16Two = AlignCode4(); + if (jit->js.memcheck) + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); + else + UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false); + ROL(32, R(RSCRATCH_EXTRA), Imm8(16)); + MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); + if (cpu_info.bSSE4_1) + { + PMOVSXWD(XMM0, R(XMM0)); + } + else + { + PUNPCKLWD(XMM0, R(XMM0)); + PSRAD(XMM0, 16); + } + CVTDQ2PS(XMM0, R(XMM0)); + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); + MULPS(XMM0, R(XMM1)); + RET(); + + const u8* loadPairedS16One = AlignCode4(); + if (jit->js.memcheck) + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); + else + UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true); + CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); + SHR(32, R(RSCRATCH2), Imm8(5)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); + UNPCKLPS(XMM0, M(m_one)); + RET(); + + pairedLoadQuantized = reinterpret_cast(const_cast(AlignCode16())); + ReserveCodeSpace(16 * sizeof(u8*)); + + pairedLoadQuantized[0] = loadPairedFloatTwo; + pairedLoadQuantized[1] = loadPairedIllegal; + pairedLoadQuantized[2] = loadPairedIllegal; + pairedLoadQuantized[3] = loadPairedIllegal; + pairedLoadQuantized[4] = loadPairedU8Two; + pairedLoadQuantized[5] = loadPairedU16Two; + pairedLoadQuantized[6] = loadPairedS8Two; + pairedLoadQuantized[7] = loadPairedS16Two; + + pairedLoadQuantized[8] = loadPairedFloatOne; + pairedLoadQuantized[9] = loadPairedIllegal; + pairedLoadQuantized[10] = loadPairedIllegal; + pairedLoadQuantized[11] = loadPairedIllegal; + pairedLoadQuantized[12] = loadPairedU8One; + pairedLoadQuantized[13] = loadPairedU16One; + pairedLoadQuantized[14] = loadPairedS8One; + pairedLoadQuantized[15] = loadPairedS16One; +} diff --git a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h new file mode 100644 index 0000000000..8d93b4f252 --- /dev/null +++ 
b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h @@ -0,0 +1,22 @@ +// Copyright 2013 Dolphin Emulator Project +// Licensed under GPLv2 +// Refer to the license.txt file included. + +#pragma once + +#include "Core/PowerPC/JitCommon/Jit_Util.h" +#include "Core/PowerPC/JitCommon/JitAsmCommon.h" + +class CommonAsmRoutines : public CommonAsmRoutinesBase, public EmuCodeBlock +{ +protected: + void GenQuantizedLoads(); + void GenQuantizedStores(); + void GenQuantizedSingleStores(); + +public: + void GenFifoWrite(int size); + void GenFrsqrte(); + void GenFres(); + void GenMfcr(); +}; diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp index 7514167c86..dd693006ea 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp @@ -2,194 +2,7 @@ // Licensed under GPLv2 // Refer to the license.txt file included. -#include "Common/CPUDetect.h" -#include "Common/MathUtil.h" -#include "Common/MemoryUtil.h" - #include "Core/PowerPC/JitCommon/JitAsmCommon.h" -#include "Core/PowerPC/JitCommon/JitBase.h" - -#define QUANTIZED_REGS_TO_SAVE \ - (ABI_ALL_CALLER_SAVED & ~BitSet32 { \ - RSCRATCH, RSCRATCH2, RSCRATCH_EXTRA, XMM0+16, XMM1+16 \ - }) - -#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | BitSet32 { RSCRATCH2 }) - -using namespace Gen; - -void CommonAsmRoutines::GenFifoWrite(int size) -{ - // Assume value in RSCRATCH - u32 gather_pipe = (u32)(u64)GPFifo::m_gatherPipe; - _assert_msg_(DYNA_REC, gather_pipe <= 0x7FFFFFFF, "Gather pipe not in low 2GB of memory!"); - MOV(32, R(RSCRATCH2), M(&GPFifo::m_gatherPipeCount)); - SwapAndStore(size, MDisp(RSCRATCH2, gather_pipe), RSCRATCH); - ADD(32, R(RSCRATCH2), Imm8(size >> 3)); - MOV(32, M(&GPFifo::m_gatherPipeCount), R(RSCRATCH2)); - RET(); -} - -void CommonAsmRoutines::GenFrsqrte() -{ - // Assume input in XMM0. - // This function clobbers all three RSCRATCH. - MOVQ_xmm(R(RSCRATCH), XMM0); - - // Negative and zero inputs set an exception and take the complex path. - TEST(64, R(RSCRATCH), R(RSCRATCH)); - FixupBranch zero = J_CC(CC_Z, true); - FixupBranch negative = J_CC(CC_S, true); - MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH)); - SHR(64, R(RSCRATCH_EXTRA), Imm8(52)); - - // Zero and max exponents (non-normal floats) take the complex path. - FixupBranch complex1 = J_CC(CC_Z, true); - CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF)); - FixupBranch complex2 = J_CC(CC_E, true); - - SUB(32, R(RSCRATCH_EXTRA), Imm32(0x3FD)); - SAR(32, R(RSCRATCH_EXTRA), Imm8(1)); - MOV(32, R(RSCRATCH2), Imm32(0x3FF)); - SUB(32, R(RSCRATCH2), R(RSCRATCH_EXTRA)); - SHL(64, R(RSCRATCH2), Imm8(52)); // exponent = ((0x3FFLL << 52) - ((exponent - (0x3FELL << 52)) / 2)) & (0x7FFLL << 52); - - MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH)); - SHR(64, R(RSCRATCH_EXTRA), Imm8(48)); - AND(32, R(RSCRATCH_EXTRA), Imm8(0x1F)); - XOR(32, R(RSCRATCH_EXTRA), Imm8(0x10)); // int index = i / 2048 + (odd_exponent ? 
16 : 0); - - SHR(64, R(RSCRATCH), Imm8(37)); - AND(32, R(RSCRATCH), Imm32(0x7FF)); - IMUL(32, RSCRATCH, MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_dec)); - MOV(32, R(RSCRATCH_EXTRA), MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_base)); - SUB(32, R(RSCRATCH_EXTRA), R(RSCRATCH)); - SHL(64, R(RSCRATCH_EXTRA), Imm8(26)); - OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(frsqrte_expected_base[index] - frsqrte_expected_dec[index] * (i % 2048)) << 26; - MOVQ_xmm(XMM0, R(RSCRATCH2)); - RET(); - - // Exception flags for zero input. - SetJumpTarget(zero); - TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX)); - FixupBranch skip_set_fx1 = J_CC(CC_NZ); - OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX)); - FixupBranch complex3 = J(); - - // Exception flags for negative input. - SetJumpTarget(negative); - TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_VXSQRT)); - FixupBranch skip_set_fx2 = J_CC(CC_NZ); - OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_VXSQRT)); - - SetJumpTarget(skip_set_fx1); - SetJumpTarget(skip_set_fx2); - SetJumpTarget(complex1); - SetJumpTarget(complex2); - SetJumpTarget(complex3); - ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8); - ABI_CallFunction((void *)&MathUtil::ApproximateReciprocalSquareRoot); - ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8); - RET(); -} - -void CommonAsmRoutines::GenFres() -{ - // Assume input in XMM0. - // This function clobbers all three RSCRATCH. - MOVQ_xmm(R(RSCRATCH), XMM0); - - // Zero inputs set an exception and take the complex path. - TEST(64, R(RSCRATCH), R(RSCRATCH)); - FixupBranch zero = J_CC(CC_Z); - - MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH)); - SHR(64, R(RSCRATCH_EXTRA), Imm8(52)); - MOV(32, R(RSCRATCH2), R(RSCRATCH_EXTRA)); - AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF)); // exp - AND(32, R(RSCRATCH2), Imm32(0x800)); // sign - CMP(32, R(RSCRATCH_EXTRA), Imm32(895)); - // Take the complex path for very large/small exponents. - FixupBranch complex1 = J_CC(CC_L); - CMP(32, R(RSCRATCH_EXTRA), Imm32(1149)); - FixupBranch complex2 = J_CC(CC_GE); - - SUB(32, R(RSCRATCH_EXTRA), Imm32(0x7FD)); - NEG(32, R(RSCRATCH_EXTRA)); - OR(32, R(RSCRATCH_EXTRA), R(RSCRATCH2)); - SHL(64, R(RSCRATCH_EXTRA), Imm8(52)); // vali = sign | exponent - - MOV(64, R(RSCRATCH2), R(RSCRATCH)); - SHR(64, R(RSCRATCH), Imm8(37)); - SHR(64, R(RSCRATCH2), Imm8(47)); - AND(32, R(RSCRATCH), Imm32(0x3FF)); // i % 1024 - AND(32, R(RSCRATCH2), Imm8(0x1F)); // i / 1024 - - IMUL(32, RSCRATCH, MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_dec)); - ADD(32, R(RSCRATCH), Imm8(1)); - SHR(32, R(RSCRATCH), Imm8(1)); - - MOV(32, R(RSCRATCH2), MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_base)); - SUB(32, R(RSCRATCH2), R(RSCRATCH)); - SHL(64, R(RSCRATCH2), Imm8(29)); - OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(fres_expected_base[i / 1024] - (fres_expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29 - MOVQ_xmm(XMM0, R(RSCRATCH2)); - RET(); - - // Exception flags for zero input. 
- SetJumpTarget(zero); - TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX)); - FixupBranch skip_set_fx1 = J_CC(CC_NZ); - OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX)); - SetJumpTarget(skip_set_fx1); - - SetJumpTarget(complex1); - SetJumpTarget(complex2); - ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8); - ABI_CallFunction((void *)&MathUtil::ApproximateReciprocal); - ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8); - RET(); -} - -void CommonAsmRoutines::GenMfcr() -{ - // Input: none - // Output: RSCRATCH - // This function clobbers all three RSCRATCH. - X64Reg dst = RSCRATCH; - X64Reg tmp = RSCRATCH2; - X64Reg cr_val = RSCRATCH_EXTRA; - XOR(32, R(dst), R(dst)); - // we only need to zero the high bits of tmp once - XOR(32, R(tmp), R(tmp)); - for (int i = 0; i < 8; i++) - { - static const u32 m_flagTable[8] = { 0x0, 0x1, 0x8, 0x9, 0x0, 0x1, 0x8, 0x9 }; - if (i != 0) - SHL(32, R(dst), Imm8(4)); - - MOV(64, R(cr_val), PPCSTATE(cr_val[i])); - - // EQ: Bits 31-0 == 0; set flag bit 1 - TEST(32, R(cr_val), R(cr_val)); - // FIXME: is there a better way to do this without the partial register merging? - SETcc(CC_Z, R(tmp)); - LEA(32, dst, MComplex(dst, tmp, SCALE_2, 0)); - - // GT: Value > 0; set flag bit 2 - TEST(64, R(cr_val), R(cr_val)); - SETcc(CC_G, R(tmp)); - LEA(32, dst, MComplex(dst, tmp, SCALE_4, 0)); - - // SO: Bit 61 set; set flag bit 0 - // LT: Bit 62 set; set flag bit 3 - SHR(64, R(cr_val), Imm8(61)); - OR(32, R(dst), MScaled(cr_val, SCALE_4, (u32)(u64)m_flagTable)); - } - RET(); -} - -// Safe + Fast Quantizers, originally from JITIL by magumagu const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 }; @@ -250,414 +63,4 @@ const float GC_ALIGNED16(m_dequantizeTableS[]) = (1ULL << 4), (1ULL << 4), (1ULL << 3), (1ULL << 3), (1ULL << 2), (1ULL << 2), (1ULL << 1), (1ULL << 1), }; -static const float GC_ALIGNED16(m_65535[4]) = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; -static const float GC_ALIGNED16(m_32767) = 32767.0f; -static const float GC_ALIGNED16(m_m32768) = -32768.0f; -static const float GC_ALIGNED16(m_255) = 255.0f; -static const float GC_ALIGNED16(m_127) = 127.0f; -static const float GC_ALIGNED16(m_m128) = -128.0f; - const float GC_ALIGNED16(m_one[]) = { 1.0f, 0.0f, 0.0f, 0.0f }; - -#define QUANTIZE_OVERFLOW_SAFE - -// according to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value is out of int32 range -// while it's OK for large negatives, it isn't for positives -// I don't know whether the overflow actually happens in any games -// but it potentially can cause problems, so we need some clamping - -// See comment in header for in/outs. 
-void CommonAsmRoutines::GenQuantizedStores() -{ - const u8* storePairedIllegal = AlignCode4(); - UD2(); - - const u8* storePairedFloat = AlignCode4(); - if (cpu_info.bSSSE3) - { - PSHUFB(XMM0, M((void *)pbswapShuffle2x4)); - MOVQ_xmm(R(RSCRATCH), XMM0); - } - else - { - MOVQ_xmm(R(RSCRATCH), XMM0); - ROL(64, R(RSCRATCH), Imm8(32)); - BSWAP(64, RSCRATCH); - } - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - - RET(); - - const u8* storePairedU8 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(5)); - MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - MULPS(XMM0, R(XMM1)); -#ifdef QUANTIZE_OVERFLOW_SAFE - MINPS(XMM0, M(m_65535)); -#endif - CVTTPS2DQ(XMM0, R(XMM0)); - PACKSSDW(XMM0, R(XMM0)); - PACKUSWB(XMM0, R(XMM0)); - MOVD_xmm(R(RSCRATCH), XMM0); - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - - RET(); - - const u8* storePairedS8 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(5)); - MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - MULPS(XMM0, R(XMM1)); -#ifdef QUANTIZE_OVERFLOW_SAFE - MINPS(XMM0, M(m_65535)); -#endif - CVTTPS2DQ(XMM0, R(XMM0)); - PACKSSDW(XMM0, R(XMM0)); - PACKSSWB(XMM0, R(XMM0)); - MOVD_xmm(R(RSCRATCH), XMM0); - - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - - RET(); - - const u8* storePairedU16 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(5)); - MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - MULPS(XMM0, R(XMM1)); - - if (cpu_info.bSSE4_1) - { -#ifdef QUANTIZE_OVERFLOW_SAFE - MINPS(XMM0, M(m_65535)); -#endif - CVTTPS2DQ(XMM0, R(XMM0)); - PACKUSDW(XMM0, R(XMM0)); - MOVD_xmm(R(RSCRATCH), XMM0); - BSWAP(32, RSCRATCH); - ROL(32, R(RSCRATCH), Imm8(16)); - } - else - { - XORPS(XMM1, R(XMM1)); - MAXPS(XMM0, R(XMM1)); - MINPS(XMM0, M(m_65535)); - - CVTTPS2DQ(XMM0, R(XMM0)); - PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____ - MOVD_xmm(R(RSCRATCH), XMM0); - BSWAP(32, RSCRATCH); - } - - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - - RET(); - - const u8* storePairedS16 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(5)); - MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - MULPS(XMM0, R(XMM1)); -#ifdef QUANTIZE_OVERFLOW_SAFE - MINPS(XMM0, M(m_65535)); -#endif - CVTTPS2DQ(XMM0, R(XMM0)); - PACKSSDW(XMM0, R(XMM0)); - MOVD_xmm(R(RSCRATCH), XMM0); - BSWAP(32, RSCRATCH); - ROL(32, R(RSCRATCH), Imm8(16)); - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - - RET(); - - pairedStoreQuantized = reinterpret_cast(const_cast(AlignCode16())); - ReserveCodeSpace(8 * sizeof(u8*)); - - pairedStoreQuantized[0] = storePairedFloat; - pairedStoreQuantized[1] = storePairedIllegal; - pairedStoreQuantized[2] = storePairedIllegal; - pairedStoreQuantized[3] = storePairedIllegal; - pairedStoreQuantized[4] = storePairedU8; - pairedStoreQuantized[5] = storePairedU16; - pairedStoreQuantized[6] = storePairedS8; - pairedStoreQuantized[7] = storePairedS16; -} - -// See comment in header for in/outs. -void CommonAsmRoutines::GenQuantizedSingleStores() -{ - const u8* storeSingleIllegal = AlignCode4(); - UD2(); - - // Easy! 
- const u8* storeSingleFloat = AlignCode4(); - MOVD_xmm(R(RSCRATCH), XMM0); - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - RET(); - - const u8* storeSingleU8 = AlignCode4(); // Used by MKWii - SHR(32, R(RSCRATCH2), Imm8(5)); - MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - XORPS(XMM1, R(XMM1)); - MAXSS(XMM0, R(XMM1)); - MINSS(XMM0, M(&m_255)); - CVTTSS2SI(RSCRATCH, R(XMM0)); - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - RET(); - - const u8* storeSingleS8 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(5)); - MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - MAXSS(XMM0, M(&m_m128)); - MINSS(XMM0, M(&m_127)); - CVTTSS2SI(RSCRATCH, R(XMM0)); - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - RET(); - - const u8* storeSingleU16 = AlignCode4(); // Used by MKWii - SHR(32, R(RSCRATCH2), Imm8(5)); - MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - XORPS(XMM1, R(XMM1)); - MAXSS(XMM0, R(XMM1)); - MINSS(XMM0, M(m_65535)); - CVTTSS2SI(RSCRATCH, R(XMM0)); - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - RET(); - - const u8* storeSingleS16 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(5)); - MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - MAXSS(XMM0, M(&m_m32768)); - MINSS(XMM0, M(&m_32767)); - CVTTSS2SI(RSCRATCH, R(XMM0)); - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - RET(); - - singleStoreQuantized = reinterpret_cast(const_cast(AlignCode16())); - ReserveCodeSpace(8 * sizeof(u8*)); - - singleStoreQuantized[0] = storeSingleFloat; - singleStoreQuantized[1] = storeSingleIllegal; - singleStoreQuantized[2] = storeSingleIllegal; - singleStoreQuantized[3] = storeSingleIllegal; - singleStoreQuantized[4] = storeSingleU8; - singleStoreQuantized[5] = storeSingleU16; - singleStoreQuantized[6] = storeSingleS8; - singleStoreQuantized[7] = storeSingleS16; -} - -void CommonAsmRoutines::GenQuantizedLoads() -{ - const u8* loadPairedIllegal = AlignCode4(); - UD2(); - - // FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e. - // don't need hardware access handling. This will definitely crash if paired loads occur - // from non-RAM areas, but as far as I know, this never happens. I don't know if this is - // for a good reason, or merely because no game does this. - // If we find something that actually does do this, maybe this should be changed. How - // much of a performance hit would it be? 
- const u8* loadPairedFloatTwo = AlignCode4(); - if (jit->js.memcheck) - { - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - ROL(64, R(RSCRATCH_EXTRA), Imm8(32)); - MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA)); - } - else if (cpu_info.bSSSE3) - { - MOVQ_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0)); - PSHUFB(XMM0, M(pbswapShuffle2x4)); - } - else - { - LoadAndSwap(64, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0)); - ROL(64, R(RSCRATCH_EXTRA), Imm8(32)); - MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA)); - } - RET(); - - const u8* loadPairedFloatOne = AlignCode4(); - if (jit->js.memcheck) - { - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); - UNPCKLPS(XMM0, M(m_one)); - } - else if (cpu_info.bSSSE3) - { - MOVD_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0)); - PSHUFB(XMM0, M(pbswapShuffle1x4)); - UNPCKLPS(XMM0, M(m_one)); - } - else - { - LoadAndSwap(32, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0)); - MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); - UNPCKLPS(XMM0, M(m_one)); - } - RET(); - - const u8* loadPairedU8Two = AlignCode4(); - if (jit->js.memcheck) - { - // TODO: Support not swapping in safeLoadToReg to avoid bswapping twice - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - ROR(16, R(RSCRATCH_EXTRA), Imm8(8)); - } - else - { - UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0); - } - MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); - if (cpu_info.bSSE4_1) - { - PMOVZXBD(XMM0, R(XMM0)); - } - else - { - PXOR(XMM1, R(XMM1)); - PUNPCKLBW(XMM0, R(XMM1)); - PUNPCKLWD(XMM0, R(XMM1)); - } - CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH2), Imm8(5)); - MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - MULPS(XMM0, R(XMM1)); - RET(); - - const u8* loadPairedU8One = AlignCode4(); - if (jit->js.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - else - UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx - CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); - SHR(32, R(RSCRATCH2), Imm8(5)); - MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - UNPCKLPS(XMM0, M(m_one)); - RET(); - - const u8* loadPairedS8Two = AlignCode4(); - if (jit->js.memcheck) - { - // TODO: Support not swapping in safeLoadToReg to avoid bswapping twice - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - ROR(16, R(RSCRATCH_EXTRA), Imm8(8)); - } - else - { - UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0); - } - MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); - if (cpu_info.bSSE4_1) - { - PMOVSXBD(XMM0, R(XMM0)); - } - else - { - PUNPCKLBW(XMM0, R(XMM0)); - PUNPCKLWD(XMM0, R(XMM0)); - PSRAD(XMM0, 24); - } - CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH2), Imm8(5)); - MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - MULPS(XMM0, R(XMM1)); - RET(); - - const u8* loadPairedS8One = AlignCode4(); - if (jit->js.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - else - UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true); - 
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); - SHR(32, R(RSCRATCH2), Imm8(5)); - MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - UNPCKLPS(XMM0, M(m_one)); - RET(); - - const u8* loadPairedU16Two = AlignCode4(); - // TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice - if (jit->js.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - else - UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false); - ROL(32, R(RSCRATCH_EXTRA), Imm8(16)); - MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); - if (cpu_info.bSSE4_1) - { - PMOVZXWD(XMM0, R(XMM0)); - } - else - { - PXOR(XMM1, R(XMM1)); - PUNPCKLWD(XMM0, R(XMM1)); - } - CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH2), Imm8(5)); - MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - MULPS(XMM0, R(XMM1)); - RET(); - - const u8* loadPairedU16One = AlignCode4(); - if (jit->js.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - else - UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false); - CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); - SHR(32, R(RSCRATCH2), Imm8(5)); - MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - UNPCKLPS(XMM0, M(m_one)); - RET(); - - const u8* loadPairedS16Two = AlignCode4(); - if (jit->js.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - else - UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false); - ROL(32, R(RSCRATCH_EXTRA), Imm8(16)); - MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); - if (cpu_info.bSSE4_1) - { - PMOVSXWD(XMM0, R(XMM0)); - } - else - { - PUNPCKLWD(XMM0, R(XMM0)); - PSRAD(XMM0, 16); - } - CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH2), Imm8(5)); - MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - MULPS(XMM0, R(XMM1)); - RET(); - - const u8* loadPairedS16One = AlignCode4(); - if (jit->js.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - else - UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true); - CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); - SHR(32, R(RSCRATCH2), Imm8(5)); - MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - UNPCKLPS(XMM0, M(m_one)); - RET(); - - pairedLoadQuantized = reinterpret_cast(const_cast(AlignCode16())); - ReserveCodeSpace(16 * sizeof(u8*)); - - pairedLoadQuantized[0] = loadPairedFloatTwo; - pairedLoadQuantized[1] = loadPairedIllegal; - pairedLoadQuantized[2] = loadPairedIllegal; - pairedLoadQuantized[3] = loadPairedIllegal; - pairedLoadQuantized[4] = loadPairedU8Two; - pairedLoadQuantized[5] = loadPairedU16Two; - pairedLoadQuantized[6] = loadPairedS8Two; - pairedLoadQuantized[7] = loadPairedS16Two; - - pairedLoadQuantized[8] = loadPairedFloatOne; - pairedLoadQuantized[9] = loadPairedIllegal; - pairedLoadQuantized[10] = loadPairedIllegal; - pairedLoadQuantized[11] = loadPairedIllegal; - pairedLoadQuantized[12] = loadPairedU8One; - pairedLoadQuantized[13] = loadPairedU16One; - pairedLoadQuantized[14] = loadPairedS8One; - pairedLoadQuantized[15] = loadPairedS16One; -} diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h index c872865fa7..4df759c6c5 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h +++ 
b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
@@ -4,7 +4,7 @@
 
 #pragma once
 
-#include "Core/PowerPC/JitCommon/Jit_Util.h"
+#include "Common/CommonTypes.h"
 
 extern const u8 GC_ALIGNED16(pbswapShuffle1x4[16]);
 extern const u8 GC_ALIGNED16(pbswapShuffle2x4[16]);
@@ -15,7 +15,6 @@ extern const float GC_ALIGNED16(m_dequantizeTableS[]);
 class CommonAsmRoutinesBase
 {
 public:
-
   const u8 *fifoDirectWrite8;
   const u8 *fifoDirectWrite16;
   const u8 *fifoDirectWrite32;
@@ -51,19 +50,5 @@ public:
   // In: ECX: Address to write to.
   // In: XMM0: Bottom 32-bit slot holds the float to be written.
   const u8 **singleStoreQuantized;
-
 };
-class CommonAsmRoutines : public CommonAsmRoutinesBase, public EmuCodeBlock
-{
-protected:
-  void GenQuantizedLoads();
-  void GenQuantizedStores();
-  void GenQuantizedSingleStores();
-
-public:
-  void GenFifoWrite(int size);
-  void GenFrsqrte();
-  void GenFres();
-  void GenMfcr();
-};
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h
index 6205f7d420..3ff7d74158 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h
@@ -23,8 +23,8 @@
 #include "Core/PowerPC/PowerPC.h"
 #include "Core/PowerPC/PPCAnalyst.h"
 #include "Core/PowerPC/PPCTables.h"
+#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"
 #include "Core/PowerPC/JitCommon/Jit_Util.h"
-#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
 #include "Core/PowerPC/JitCommon/JitCache.h"
 #include "Core/PowerPC/JitCommon/TrampolineCache.h"
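One note on how these generated routines are reached, as far as the table assignments above show: the three-bit quantize type indexes the dispatch tables directly (0 = float, 4 = u8, 5 = u16, 6 = s8, 7 = s16, with types 1-3 pointing at the UD2 stub), and pairedLoadQuantized additionally keeps the one-element variants at type + 8, which is why it reserves 16 slots where the store tables reserve 8. A caller might pick an entry roughly like this (hypothetical code, for illustration only; the type presumably comes from the relevant GQR field):

// Hypothetical illustration of indexing the dispatch tables built above.
static const u8* SelectPairedLoadRoutine(const CommonAsmRoutinesBase& routines, u32 type, bool single)
{
    // Types 1-3 still resolve to the loadPairedIllegal stub.
    return routines.pairedLoadQuantized[type + (single ? 8 : 0)];
}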