JitIL: Modified psq_l implementation. Reverted psq_st. Removed compile warnings.
git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6114 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
20704fca3d
commit
62c4d439ce
|
@ -145,7 +145,6 @@ enum Opcode {
|
|||
StoreDouble,
|
||||
StoreFReg,
|
||||
FDCmpCR,
|
||||
CFloatOne, // Store 1.0f into the specified floating register
|
||||
|
||||
// "Trinary" operators
|
||||
// FIXME: Need to change representation!
|
||||
|
@ -287,7 +286,7 @@ public:
|
|||
return FoldUOp(StoreGReg, value, reg);
|
||||
}
|
||||
InstLoc EmitNot(InstLoc op1) {
|
||||
return EmitXor(op1, EmitIntConst(-1U));
|
||||
return EmitXor(op1, EmitIntConst(0xFFFFFFFFU));
|
||||
}
|
||||
InstLoc EmitAnd(InstLoc op1, InstLoc op2) {
|
||||
return FoldBiOp(And, op1, op2);
|
||||
|
@ -517,9 +516,6 @@ public:
|
|||
InstLoc EmitFDCmpCR(InstLoc op1, InstLoc op2) {
|
||||
return FoldBiOp(FDCmpCR, op1, op2);
|
||||
}
|
||||
InstLoc EmitCFloatOne() {
|
||||
return FoldZeroOp(CFloatOne, 0);
|
||||
}
|
||||
InstLoc EmitLoadGQR(unsigned gqr) {
|
||||
return FoldZeroOp(LoadGQR, gqr);
|
||||
}
|
||||
|
|
|
@ -725,7 +725,6 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak
|
|||
case LoadDouble:
|
||||
case LoadSingle:
|
||||
case LoadPaired:
|
||||
case CFloatOne:
|
||||
if (thisUsed)
|
||||
regMarkUse(RI, I, getOp1(I), 1);
|
||||
break;
|
||||
|
@ -1170,16 +1169,6 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak
|
|||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case CFloatOne: {
|
||||
if (!thisUsed) break;
|
||||
X64Reg reg = fregFindFreeReg(RI);
|
||||
static const float one = 1.0f;
|
||||
Jit->MOV(32, R(ECX), Imm32(*(u32*)&one));
|
||||
Jit->MOVD_xmm(reg, R(ECX));
|
||||
RI.fregs[reg] = I;
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case LoadDouble: {
|
||||
if (!thisUsed) break;
|
||||
X64Reg reg = fregFindFreeReg(RI);
|
||||
|
@ -1200,9 +1189,12 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak
|
|||
regSpill(RI, EAX);
|
||||
regSpill(RI, EDX);
|
||||
X64Reg reg = fregFindFreeReg(RI);
|
||||
unsigned int quantreg = *I >> 16;
|
||||
// The lower 3 bits hold the GQR index; the next bit holds inst.W.
|
||||
unsigned int quantreg = (*I >> 16) & 0x7;
|
||||
unsigned int w = *I >> 19;
|
||||
Jit->MOVZX(32, 16, EAX, M(((char *)&GQR(quantreg)) + 2));
|
||||
Jit->MOVZX(32, 8, EDX, R(AL));
|
||||
Jit->OR(32, R(EDX), Imm8(w << 3));
|
||||
// FIXME: Fix ModR/M encoding to allow [EDX*4+disp32]! (MComplex can do this, no?)
|
||||
#ifdef _M_IX86
|
||||
Jit->SHL(32, R(EDX), Imm8(2));
|
||||
|
|
|
@ -38,6 +38,7 @@ void JitIL::psq_st(UGeckoInstruction inst)
|
|||
INSTRUCTION_START
|
||||
JITDISABLE(LoadStorePaired)
|
||||
if (js.memcheck) { Default(inst); return; }
|
||||
if (inst.W) {Default(inst); return;}
|
||||
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_12), val;
|
||||
if (inst.RA)
|
||||
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
|
||||
|
@ -45,14 +46,7 @@ void JitIL::psq_st(UGeckoInstruction inst)
|
|||
ibuild.EmitStoreGReg(addr, inst.RA);
|
||||
val = ibuild.EmitLoadFReg(inst.RS);
|
||||
val = ibuild.EmitCompactMRegToPacked(val);
|
||||
if (inst.W == 0) {
|
||||
ibuild.EmitStorePaired(val, addr, inst.I);
|
||||
} else {
|
||||
IREmitter::InstLoc addr4 = ibuild.EmitAdd(addr, ibuild.EmitIntConst(4));
|
||||
IREmitter::InstLoc backup = ibuild.EmitLoad32(addr4);
|
||||
ibuild.EmitStorePaired(val, addr, inst.I);
|
||||
ibuild.EmitStore32(backup, addr4);
|
||||
}
|
||||
}
|
||||
|
||||
void JitIL::psq_l(UGeckoInstruction inst)
|
||||
|
@ -65,10 +59,7 @@ void JitIL::psq_l(UGeckoInstruction inst)
|
|||
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
|
||||
if (inst.OPCD == 57)
|
||||
ibuild.EmitStoreGReg(addr, inst.RA);
|
||||
val = ibuild.EmitLoadPaired(addr, inst.I);
|
||||
if (inst.W) {
|
||||
val = ibuild.EmitFPMerge00(val, ibuild.EmitCFloatOne());
|
||||
}
|
||||
val = ibuild.EmitLoadPaired(addr, inst.I | (inst.W << 3)); // The lower 3 bits is for GQR index. The next 1 bit is for inst.W
|
||||
val = ibuild.EmitExpandPackedToMReg(val);
|
||||
ibuild.EmitStoreFReg(val, inst.RD);
|
||||
}
|
||||
|
|
|
@ -184,23 +184,23 @@ void JitIL::crXX(UGeckoInstruction inst)
|
|||
break;
|
||||
case 129:
|
||||
// crandc
|
||||
ecx = ibuild.EmitXor(ecx, ibuild.EmitIntConst(-1U));
|
||||
ecx = ibuild.EmitXor(ecx, ibuild.EmitIntConst(0xFFFFFFFFU));
|
||||
eax = ibuild.EmitAnd(eax, ecx);
|
||||
break;
|
||||
case 289:
|
||||
// creqv
|
||||
eax = ibuild.EmitXor(eax, ecx);
|
||||
eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(-1U));
|
||||
eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(0xFFFFFFFFU));
|
||||
break;
|
||||
case 225:
|
||||
// crnand
|
||||
eax = ibuild.EmitAnd(eax, ecx);
|
||||
eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(-1U));
|
||||
eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(0xFFFFFFFFU));
|
||||
break;
|
||||
case 33:
|
||||
// crnor
|
||||
eax = ibuild.EmitOr(eax, ecx);
|
||||
eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(-1U));
|
||||
eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(0xFFFFFFFFU));
|
||||
break;
|
||||
case 449:
|
||||
// cror
|
||||
|
@ -208,7 +208,7 @@ void JitIL::crXX(UGeckoInstruction inst)
|
|||
break;
|
||||
case 417:
|
||||
// crorc
|
||||
ecx = ibuild.EmitXor(ecx, ibuild.EmitIntConst(-1U));
|
||||
ecx = ibuild.EmitXor(ecx, ibuild.EmitIntConst(0xFFFFFFFFU));
|
||||
eax = ibuild.EmitOr(eax, ecx);
|
||||
break;
|
||||
case 193:
|
||||
|
|
|
@ -148,6 +148,7 @@ void CommonAsmRoutines::GenFifoXmm64Write()
|
|||
|
||||
// Safe + Fast Quantizers, originally from JITIL by magumagu
|
||||
|
||||
static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
static const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
|
||||
static const float GC_ALIGNED16(m_quantizeTableS[]) =
|
||||
|
@ -199,6 +200,8 @@ static const float GC_ALIGNED16(m_255) = 255.0f;
|
|||
static const float GC_ALIGNED16(m_127) = 127.0f;
|
||||
static const float GC_ALIGNED16(m_m128) = -128.0f;
|
||||
|
||||
static const float GC_ALIGNED16(m_one[]) = {1.0f, 0.0f, 0.0f, 0.0f};
|
||||
|
||||
#define QUANTIZE_OVERFLOW_SAFE
|
||||
|
||||
// according to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value is out of int32 range
|
||||
|
@ -426,7 +429,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {
|
|||
void CommonAsmRoutines::GenQuantizedLoads() {
|
||||
const u8* loadPairedIllegal = AlignCode4();
|
||||
UD2();
|
||||
const u8* loadPairedFloat = AlignCode4();
|
||||
|
||||
const u8* loadPairedFloatTwo = AlignCode4();
|
||||
if (cpu_info.bSSSE3) {
|
||||
#ifdef _M_X64
|
||||
MOVQ_xmm(XMM0, MComplex(RBX, RCX, 1, 0));
|
||||
|
@ -465,7 +469,33 @@ void CommonAsmRoutines::GenQuantizedLoads() {
|
|||
}
|
||||
RET();
|
||||
|
||||
const u8* loadPairedU8 = AlignCode4();
|
||||
const u8* loadPairedFloatOne = AlignCode4();
|
||||
if (cpu_info.bSSSE3) {
|
||||
#ifdef _M_X64
|
||||
MOVD_xmm(XMM0, MComplex(RBX, RCX, 1, 0));
|
||||
#else
|
||||
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOVD_xmm(XMM0, MDisp(ECX, (u32)Memory::base));
|
||||
#endif
|
||||
PSHUFB(XMM0, M((void *)pbswapShuffle1x4));
|
||||
UNPCKLPS(XMM0, M((void*)m_one));
|
||||
} else {
|
||||
#ifdef _M_X64
|
||||
MOV(32, R(RCX), MComplex(RBX, RCX, 1, 0));
|
||||
BSWAP(32, RCX);
|
||||
MOVD_xmm(XMM0, R(RCX));
|
||||
UNPCKLPS(XMM0, M((void*)m_one));
|
||||
#else
|
||||
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base));
|
||||
BSWAP(32, EAX);
|
||||
MOVD_xmm(XMM0, M(&psTemp[0]));
|
||||
UNPCKLPS(XMM0, M((void*)m_one));
|
||||
#endif
|
||||
}
|
||||
RET();
|
||||
|
||||
const u8* loadPairedU8Two = AlignCode4();
|
||||
UnsafeLoadRegToRegNoSwap(ECX, ECX, 16, 0);
|
||||
MOVD_xmm(XMM0, R(ECX));
|
||||
PXOR(XMM1, R(XMM1));
|
||||
|
@ -478,7 +508,17 @@ void CommonAsmRoutines::GenQuantizedLoads() {
|
|||
MULPS(XMM0, R(XMM1));
|
||||
RET();
|
||||
|
||||
const u8* loadPairedS8 = AlignCode4();
|
||||
const u8* loadPairedU8One = AlignCode4();
|
||||
UnsafeLoadRegToRegNoSwap(ECX, ECX, 8, 0); // ECX = 0x000000xx
|
||||
MOVD_xmm(XMM0, R(ECX));
|
||||
CVTDQ2PS(XMM0, R(XMM0)); // Is CVTSI2SS better?
|
||||
SHR(32, R(EAX), Imm8(6));
|
||||
MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_dequantizeTableS));
|
||||
MULSS(XMM0, R(XMM1));
|
||||
UNPCKLPS(XMM0, M((void*)m_one));
|
||||
RET();
|
||||
|
||||
const u8* loadPairedS8Two = AlignCode4();
|
||||
UnsafeLoadRegToRegNoSwap(ECX, ECX, 16, 0);
|
||||
MOVD_xmm(XMM0, R(ECX));
|
||||
PUNPCKLBW(XMM0, R(XMM0));
|
||||
|
@ -491,7 +531,19 @@ void CommonAsmRoutines::GenQuantizedLoads() {
|
|||
MULPS(XMM0, R(XMM1));
|
||||
RET();
|
||||
|
||||
const u8* loadPairedU16 = AlignCode4();
|
||||
const u8* loadPairedS8One = AlignCode4();
|
||||
UnsafeLoadRegToRegNoSwap(ECX, ECX, 8, 0);
|
||||
SHL(32, R(ECX), Imm8(24));
|
||||
SAR(32, R(ECX), Imm8(24));
|
||||
MOVD_xmm(XMM0, R(ECX));
|
||||
CVTDQ2PS(XMM0, R(XMM0));
|
||||
SHR(32, R(EAX), Imm8(6));
|
||||
MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_dequantizeTableS));
|
||||
MULSS(XMM0, R(XMM1));
|
||||
UNPCKLPS(XMM0, M((void*)m_one));
|
||||
RET();
|
||||
|
||||
const u8* loadPairedU16Two = AlignCode4();
|
||||
UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
|
||||
ROL(32, R(ECX), Imm8(16));
|
||||
MOVD_xmm(XMM0, R(ECX));
|
||||
|
@ -504,7 +556,18 @@ void CommonAsmRoutines::GenQuantizedLoads() {
|
|||
MULPS(XMM0, R(XMM1));
|
||||
RET();
|
||||
|
||||
const u8* loadPairedS16 = AlignCode4();
|
||||
const u8* loadPairedU16One = AlignCode4();
|
||||
UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
|
||||
SHR(32, R(ECX), Imm8(16));
|
||||
MOVD_xmm(XMM0, R(ECX));
|
||||
CVTDQ2PS(XMM0, R(XMM0));
|
||||
SHR(32, R(EAX), Imm8(6));
|
||||
MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_dequantizeTableS));
|
||||
MULSS(XMM0, R(XMM1));
|
||||
UNPCKLPS(XMM0, M((void*)m_one));
|
||||
RET();
|
||||
|
||||
const u8* loadPairedS16Two = AlignCode4();
|
||||
UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
|
||||
ROL(32, R(ECX), Imm8(16));
|
||||
MOVD_xmm(XMM0, R(ECX));
|
||||
|
@ -518,12 +581,33 @@ void CommonAsmRoutines::GenQuantizedLoads() {
|
|||
MULPS(XMM0, R(XMM1));
|
||||
RET();
|
||||
|
||||
pairedLoadQuantized[0] = loadPairedFloat;
|
||||
const u8* loadPairedS16One = AlignCode4();
|
||||
UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
|
||||
SAR(32, R(ECX), Imm8(16));
|
||||
MOVD_xmm(XMM0, R(ECX));
|
||||
CVTDQ2PS(XMM0, R(XMM0));
|
||||
SHR(32, R(EAX), Imm8(6));
|
||||
AND(32, R(EAX), Imm32(0xFC));
|
||||
MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_dequantizeTableS));
|
||||
MULSS(XMM0, R(XMM1));
|
||||
UNPCKLPS(XMM0, M((void*)m_one));
|
||||
RET();
|
||||
|
||||
pairedLoadQuantized[0] = loadPairedFloatTwo;
|
||||
pairedLoadQuantized[1] = loadPairedIllegal;
|
||||
pairedLoadQuantized[2] = loadPairedIllegal;
|
||||
pairedLoadQuantized[3] = loadPairedIllegal;
|
||||
pairedLoadQuantized[4] = loadPairedU8;
|
||||
pairedLoadQuantized[5] = loadPairedU16;
|
||||
pairedLoadQuantized[6] = loadPairedS8;
|
||||
pairedLoadQuantized[7] = loadPairedS16;
|
||||
pairedLoadQuantized[4] = loadPairedU8Two;
|
||||
pairedLoadQuantized[5] = loadPairedU16Two;
|
||||
pairedLoadQuantized[6] = loadPairedS8Two;
|
||||
pairedLoadQuantized[7] = loadPairedS16Two;
|
||||
|
||||
pairedLoadQuantized[8] = loadPairedFloatOne;
|
||||
pairedLoadQuantized[9] = loadPairedIllegal;
|
||||
pairedLoadQuantized[10] = loadPairedIllegal;
|
||||
pairedLoadQuantized[11] = loadPairedIllegal;
|
||||
pairedLoadQuantized[12] = loadPairedU8One;
|
||||
pairedLoadQuantized[13] = loadPairedU16One;
|
||||
pairedLoadQuantized[14] = loadPairedS8One;
|
||||
pairedLoadQuantized[15] = loadPairedS16One;
|
||||
}
|
||||
|
|
|
@ -56,7 +56,7 @@ public:
|
|||
// Out: XMM0: Bottom two 32-bit slots hold the read value,
|
||||
// converted to a pair of floats.
|
||||
// Trashes: EAX ECX EDX
|
||||
const u8 GC_ALIGNED16(*pairedLoadQuantized[8]);
|
||||
const u8 GC_ALIGNED16(*pairedLoadQuantized[16]);
|
||||
|
||||
// In: array index: GQR to use.
|
||||
// In: ECX: Address to write to.
|
||||
|
|
Loading…
Reference in New Issue