diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h index 28b1e3452e..06ed7be813 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h @@ -145,7 +145,6 @@ enum Opcode { StoreDouble, StoreFReg, FDCmpCR, - CFloatOne, // Store 1.0f into the specified floating register // "Trinary" operators // FIXME: Need to change representation! @@ -287,7 +286,7 @@ public: return FoldUOp(StoreGReg, value, reg); } InstLoc EmitNot(InstLoc op1) { - return EmitXor(op1, EmitIntConst(-1U)); + return EmitXor(op1, EmitIntConst(0xFFFFFFFFU)); } InstLoc EmitAnd(InstLoc op1, InstLoc op2) { return FoldBiOp(And, op1, op2); @@ -517,9 +516,6 @@ public: InstLoc EmitFDCmpCR(InstLoc op1, InstLoc op2) { return FoldBiOp(FDCmpCR, op1, op2); } - InstLoc EmitCFloatOne() { - return FoldZeroOp(CFloatOne, 0); - } InstLoc EmitLoadGQR(unsigned gqr) { return FoldZeroOp(LoadGQR, gqr); } diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/IR_X86.cpp index aa9dee3cd8..7faccf3d02 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR_X86.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR_X86.cpp @@ -725,7 +725,6 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak case LoadDouble: case LoadSingle: case LoadPaired: - case CFloatOne: if (thisUsed) regMarkUse(RI, I, getOp1(I), 1); break; @@ -1170,16 +1169,6 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak regNormalRegClear(RI, I); break; } - case CFloatOne: { - if (!thisUsed) break; - X64Reg reg = fregFindFreeReg(RI); - static const float one = 1.0f; - Jit->MOV(32, R(ECX), Imm32(*(u32*)&one)); - Jit->MOVD_xmm(reg, R(ECX)); - RI.fregs[reg] = I; - regNormalRegClear(RI, I); - break; - } case LoadDouble: { if (!thisUsed) break; X64Reg reg = fregFindFreeReg(RI); @@ -1200,9 +1189,12 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak regSpill(RI, EAX); regSpill(RI, EDX); X64Reg reg = fregFindFreeReg(RI); - unsigned int quantreg = *I >> 16; + // The lower 3 bits is for GQR index. The next 1 bit is for inst.W + unsigned int quantreg = (*I >> 16) & 0x7; + unsigned int w = *I >> 19; Jit->MOVZX(32, 16, EAX, M(((char *)&GQR(quantreg)) + 2)); Jit->MOVZX(32, 8, EDX, R(AL)); + Jit->OR(32, R(EDX), Imm8(w << 3)); // FIXME: Fix ModR/M encoding to allow [EDX*4+disp32]! (MComplex can do this, no?) #ifdef _M_IX86 Jit->SHL(32, R(EDX), Imm8(2)); diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL_LoadStorePaired.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL_LoadStorePaired.cpp index 8fca0bf194..079a42e689 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL_LoadStorePaired.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL_LoadStorePaired.cpp @@ -38,6 +38,7 @@ void JitIL::psq_st(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(LoadStorePaired) if (js.memcheck) { Default(inst); return; } + if (inst.W) {Default(inst); return;} IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_12), val; if (inst.RA) addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA)); @@ -45,14 +46,7 @@ void JitIL::psq_st(UGeckoInstruction inst) ibuild.EmitStoreGReg(addr, inst.RA); val = ibuild.EmitLoadFReg(inst.RS); val = ibuild.EmitCompactMRegToPacked(val); - if (inst.W == 0) { - ibuild.EmitStorePaired(val, addr, inst.I); - } else { - IREmitter::InstLoc addr4 = ibuild.EmitAdd(addr, ibuild.EmitIntConst(4)); - IREmitter::InstLoc backup = ibuild.EmitLoad32(addr4); - ibuild.EmitStorePaired(val, addr, inst.I); - ibuild.EmitStore32(backup, addr4); - } + ibuild.EmitStorePaired(val, addr, inst.I); } void JitIL::psq_l(UGeckoInstruction inst) @@ -65,10 +59,7 @@ void JitIL::psq_l(UGeckoInstruction inst) addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA)); if (inst.OPCD == 57) ibuild.EmitStoreGReg(addr, inst.RA); - val = ibuild.EmitLoadPaired(addr, inst.I); - if (inst.W) { - val = ibuild.EmitFPMerge00(val, ibuild.EmitCFloatOne()); - } + val = ibuild.EmitLoadPaired(addr, inst.I | (inst.W << 3)); // The lower 3 bits is for GQR index. The next 1 bit is for inst.W val = ibuild.EmitExpandPackedToMReg(val); ibuild.EmitStoreFReg(val, inst.RD); } diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL_SystemRegisters.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL_SystemRegisters.cpp index 4e9f765773..4ff1d6a29b 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL_SystemRegisters.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL_SystemRegisters.cpp @@ -184,23 +184,23 @@ void JitIL::crXX(UGeckoInstruction inst) break; case 129: // crandc - ecx = ibuild.EmitXor(ecx, ibuild.EmitIntConst(-1U)); + ecx = ibuild.EmitXor(ecx, ibuild.EmitIntConst(0xFFFFFFFFU)); eax = ibuild.EmitAnd(eax, ecx); break; case 289: // creqv eax = ibuild.EmitXor(eax, ecx); - eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(-1U)); + eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(0xFFFFFFFFU)); break; case 225: // crnand eax = ibuild.EmitAnd(eax, ecx); - eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(-1U)); + eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(0xFFFFFFFFU)); break; case 33: // crnor eax = ibuild.EmitOr(eax, ecx); - eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(-1U)); + eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(0xFFFFFFFFU)); break; case 449: // cror @@ -208,7 +208,7 @@ void JitIL::crXX(UGeckoInstruction inst) break; case 417: // crorc - ecx = ibuild.EmitXor(ecx, ibuild.EmitIntConst(-1U)); + ecx = ibuild.EmitXor(ecx, ibuild.EmitIntConst(0xFFFFFFFFU)); eax = ibuild.EmitOr(eax, ecx); break; case 193: diff --git a/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp index 5bd9baabf8..061ba36662 100644 --- a/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp +++ b/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp @@ -148,6 +148,7 @@ void CommonAsmRoutines::GenFifoXmm64Write() // Safe + Fast Quantizers, originally from JITIL by magumagu +static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; static const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15}; static const float GC_ALIGNED16(m_quantizeTableS[]) = @@ -199,6 +200,8 @@ static const float GC_ALIGNED16(m_255) = 255.0f; static const float GC_ALIGNED16(m_127) = 127.0f; static const float GC_ALIGNED16(m_m128) = -128.0f; +static const float GC_ALIGNED16(m_one[]) = {1.0f, 0.0f, 0.0f, 0.0f}; + #define QUANTIZE_OVERFLOW_SAFE // according to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value is out of int32 range @@ -426,7 +429,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores() { void CommonAsmRoutines::GenQuantizedLoads() { const u8* loadPairedIllegal = AlignCode4(); UD2(); - const u8* loadPairedFloat = AlignCode4(); + + const u8* loadPairedFloatTwo = AlignCode4(); if (cpu_info.bSSSE3) { #ifdef _M_X64 MOVQ_xmm(XMM0, MComplex(RBX, RCX, 1, 0)); @@ -465,7 +469,33 @@ void CommonAsmRoutines::GenQuantizedLoads() { } RET(); - const u8* loadPairedU8 = AlignCode4(); + const u8* loadPairedFloatOne = AlignCode4(); + if (cpu_info.bSSSE3) { +#ifdef _M_X64 + MOVD_xmm(XMM0, MComplex(RBX, RCX, 1, 0)); +#else + AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); + MOVD_xmm(XMM0, MDisp(ECX, (u32)Memory::base)); +#endif + PSHUFB(XMM0, M((void *)pbswapShuffle1x4)); + UNPCKLPS(XMM0, M((void*)m_one)); + } else { +#ifdef _M_X64 + MOV(32, R(RCX), MComplex(RBX, RCX, 1, 0)); + BSWAP(32, RCX); + MOVD_xmm(XMM0, R(RCX)); + UNPCKLPS(XMM0, M((void*)m_one)); +#else + AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); + MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base)); + BSWAP(32, EAX); + MOVD_xmm(XMM0, M(&psTemp[0])); + UNPCKLPS(XMM0, M((void*)m_one)); +#endif + } + RET(); + + const u8* loadPairedU8Two = AlignCode4(); UnsafeLoadRegToRegNoSwap(ECX, ECX, 16, 0); MOVD_xmm(XMM0, R(ECX)); PXOR(XMM1, R(XMM1)); @@ -478,7 +508,17 @@ void CommonAsmRoutines::GenQuantizedLoads() { MULPS(XMM0, R(XMM1)); RET(); - const u8* loadPairedS8 = AlignCode4(); + const u8* loadPairedU8One = AlignCode4(); + UnsafeLoadRegToRegNoSwap(ECX, ECX, 8, 0); // ECX = 0x000000xx + MOVD_xmm(XMM0, R(ECX)); + CVTDQ2PS(XMM0, R(XMM0)); // Is CVTSI2SS better? + SHR(32, R(EAX), Imm8(6)); + MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_dequantizeTableS)); + MULSS(XMM0, R(XMM1)); + UNPCKLPS(XMM0, M((void*)m_one)); + RET(); + + const u8* loadPairedS8Two = AlignCode4(); UnsafeLoadRegToRegNoSwap(ECX, ECX, 16, 0); MOVD_xmm(XMM0, R(ECX)); PUNPCKLBW(XMM0, R(XMM0)); @@ -491,7 +531,19 @@ void CommonAsmRoutines::GenQuantizedLoads() { MULPS(XMM0, R(XMM1)); RET(); - const u8* loadPairedU16 = AlignCode4(); + const u8* loadPairedS8One = AlignCode4(); + UnsafeLoadRegToRegNoSwap(ECX, ECX, 8, 0); + SHL(32, R(ECX), Imm8(24)); + SAR(32, R(ECX), Imm8(24)); + MOVD_xmm(XMM0, R(ECX)); + CVTDQ2PS(XMM0, R(XMM0)); + SHR(32, R(EAX), Imm8(6)); + MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_dequantizeTableS)); + MULSS(XMM0, R(XMM1)); + UNPCKLPS(XMM0, M((void*)m_one)); + RET(); + + const u8* loadPairedU16Two = AlignCode4(); UnsafeLoadRegToReg(ECX, ECX, 32, 0, false); ROL(32, R(ECX), Imm8(16)); MOVD_xmm(XMM0, R(ECX)); @@ -504,7 +556,18 @@ void CommonAsmRoutines::GenQuantizedLoads() { MULPS(XMM0, R(XMM1)); RET(); - const u8* loadPairedS16 = AlignCode4(); + const u8* loadPairedU16One = AlignCode4(); + UnsafeLoadRegToReg(ECX, ECX, 32, 0, false); + SHR(32, R(ECX), Imm8(16)); + MOVD_xmm(XMM0, R(ECX)); + CVTDQ2PS(XMM0, R(XMM0)); + SHR(32, R(EAX), Imm8(6)); + MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_dequantizeTableS)); + MULSS(XMM0, R(XMM1)); + UNPCKLPS(XMM0, M((void*)m_one)); + RET(); + + const u8* loadPairedS16Two = AlignCode4(); UnsafeLoadRegToReg(ECX, ECX, 32, 0, false); ROL(32, R(ECX), Imm8(16)); MOVD_xmm(XMM0, R(ECX)); @@ -518,12 +581,33 @@ void CommonAsmRoutines::GenQuantizedLoads() { MULPS(XMM0, R(XMM1)); RET(); - pairedLoadQuantized[0] = loadPairedFloat; + const u8* loadPairedS16One = AlignCode4(); + UnsafeLoadRegToReg(ECX, ECX, 32, 0, false); + SAR(32, R(ECX), Imm8(16)); + MOVD_xmm(XMM0, R(ECX)); + CVTDQ2PS(XMM0, R(XMM0)); + SHR(32, R(EAX), Imm8(6)); + AND(32, R(EAX), Imm32(0xFC)); + MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_dequantizeTableS)); + MULSS(XMM0, R(XMM1)); + UNPCKLPS(XMM0, M((void*)m_one)); + RET(); + + pairedLoadQuantized[0] = loadPairedFloatTwo; pairedLoadQuantized[1] = loadPairedIllegal; pairedLoadQuantized[2] = loadPairedIllegal; pairedLoadQuantized[3] = loadPairedIllegal; - pairedLoadQuantized[4] = loadPairedU8; - pairedLoadQuantized[5] = loadPairedU16; - pairedLoadQuantized[6] = loadPairedS8; - pairedLoadQuantized[7] = loadPairedS16; + pairedLoadQuantized[4] = loadPairedU8Two; + pairedLoadQuantized[5] = loadPairedU16Two; + pairedLoadQuantized[6] = loadPairedS8Two; + pairedLoadQuantized[7] = loadPairedS16Two; + + pairedLoadQuantized[8] = loadPairedFloatOne; + pairedLoadQuantized[9] = loadPairedIllegal; + pairedLoadQuantized[10] = loadPairedIllegal; + pairedLoadQuantized[11] = loadPairedIllegal; + pairedLoadQuantized[12] = loadPairedU8One; + pairedLoadQuantized[13] = loadPairedU16One; + pairedLoadQuantized[14] = loadPairedS8One; + pairedLoadQuantized[15] = loadPairedS16One; } diff --git a/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.h b/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.h index 65788fea3b..ce298f7661 100644 --- a/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.h +++ b/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.h @@ -56,7 +56,7 @@ public: // Out: XMM0: Bottom two 32-bit slots hold the read value, // converted to a pair of floats. // Trashes: EAX ECX EDX - const u8 GC_ALIGNED16(*pairedLoadQuantized[8]); + const u8 GC_ALIGNED16(*pairedLoadQuantized[16]); // In: array index: GQR to use. // In: ECX: Address to write to.