JitIL: Modified psq_l implementation. Reverted psq_st. Removed compile warnings.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6114 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
nodchip 2010-08-21 06:29:47 +00:00
parent 20704fca3d
commit 62c4d439ce
6 changed files with 108 additions and 45 deletions

View File

@ -145,7 +145,6 @@ enum Opcode {
StoreDouble,
StoreFReg,
FDCmpCR,
CFloatOne, // Store 1.0f into the specified floating register
// "Trinary" operators
// FIXME: Need to change representation!
@ -287,7 +286,7 @@ public:
return FoldUOp(StoreGReg, value, reg);
}
InstLoc EmitNot(InstLoc op1) {
return EmitXor(op1, EmitIntConst(-1U));
return EmitXor(op1, EmitIntConst(0xFFFFFFFFU));
}
InstLoc EmitAnd(InstLoc op1, InstLoc op2) {
return FoldBiOp(And, op1, op2);
@ -517,9 +516,6 @@ public:
InstLoc EmitFDCmpCR(InstLoc op1, InstLoc op2) {
return FoldBiOp(FDCmpCR, op1, op2);
}
InstLoc EmitCFloatOne() {
return FoldZeroOp(CFloatOne, 0);
}
InstLoc EmitLoadGQR(unsigned gqr) {
return FoldZeroOp(LoadGQR, gqr);
}

View File

@ -725,7 +725,6 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak
case LoadDouble:
case LoadSingle:
case LoadPaired:
case CFloatOne:
if (thisUsed)
regMarkUse(RI, I, getOp1(I), 1);
break;
@ -1170,16 +1169,6 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak
regNormalRegClear(RI, I);
break;
}
case CFloatOne: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
static const float one = 1.0f;
Jit->MOV(32, R(ECX), Imm32(*(u32*)&one));
Jit->MOVD_xmm(reg, R(ECX));
RI.fregs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case LoadDouble: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
@ -1200,9 +1189,12 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, bool UseProfile, bool Mak
regSpill(RI, EAX);
regSpill(RI, EDX);
X64Reg reg = fregFindFreeReg(RI);
unsigned int quantreg = *I >> 16;
// The lower 3 bits are the GQR index. The next bit is inst.W.
unsigned int quantreg = (*I >> 16) & 0x7;
unsigned int w = *I >> 19;
Jit->MOVZX(32, 16, EAX, M(((char *)&GQR(quantreg)) + 2));
Jit->MOVZX(32, 8, EDX, R(AL));
Jit->OR(32, R(EDX), Imm8(w << 3));
// FIXME: Fix ModR/M encoding to allow [EDX*4+disp32]! (MComplex can do this, no?)
#ifdef _M_IX86
Jit->SHL(32, R(EDX), Imm8(2));

View File

@ -38,6 +38,7 @@ void JitIL::psq_st(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(LoadStorePaired)
if (js.memcheck) { Default(inst); return; }
if (inst.W) {Default(inst); return;}
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_12), val;
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
@ -45,14 +46,7 @@ void JitIL::psq_st(UGeckoInstruction inst)
ibuild.EmitStoreGReg(addr, inst.RA);
val = ibuild.EmitLoadFReg(inst.RS);
val = ibuild.EmitCompactMRegToPacked(val);
if (inst.W == 0) {
ibuild.EmitStorePaired(val, addr, inst.I);
} else {
IREmitter::InstLoc addr4 = ibuild.EmitAdd(addr, ibuild.EmitIntConst(4));
IREmitter::InstLoc backup = ibuild.EmitLoad32(addr4);
ibuild.EmitStorePaired(val, addr, inst.I);
ibuild.EmitStore32(backup, addr4);
}
ibuild.EmitStorePaired(val, addr, inst.I);
}
void JitIL::psq_l(UGeckoInstruction inst)
@ -65,10 +59,7 @@ void JitIL::psq_l(UGeckoInstruction inst)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
if (inst.OPCD == 57)
ibuild.EmitStoreGReg(addr, inst.RA);
val = ibuild.EmitLoadPaired(addr, inst.I);
if (inst.W) {
val = ibuild.EmitFPMerge00(val, ibuild.EmitCFloatOne());
}
val = ibuild.EmitLoadPaired(addr, inst.I | (inst.W << 3)); // The lower 3 bits is for GQR index. The next 1 bit is for inst.W
val = ibuild.EmitExpandPackedToMReg(val);
ibuild.EmitStoreFReg(val, inst.RD);
}

View File

@ -184,23 +184,23 @@ void JitIL::crXX(UGeckoInstruction inst)
break;
case 129:
// crandc
ecx = ibuild.EmitXor(ecx, ibuild.EmitIntConst(-1U));
ecx = ibuild.EmitXor(ecx, ibuild.EmitIntConst(0xFFFFFFFFU));
eax = ibuild.EmitAnd(eax, ecx);
break;
case 289:
// creqv
eax = ibuild.EmitXor(eax, ecx);
eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(-1U));
eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(0xFFFFFFFFU));
break;
case 225:
// crnand
eax = ibuild.EmitAnd(eax, ecx);
eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(-1U));
eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(0xFFFFFFFFU));
break;
case 33:
// crnor
eax = ibuild.EmitOr(eax, ecx);
eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(-1U));
eax = ibuild.EmitXor(eax, ibuild.EmitIntConst(0xFFFFFFFFU));
break;
case 449:
// cror
@ -208,7 +208,7 @@ void JitIL::crXX(UGeckoInstruction inst)
break;
case 417:
// crorc
ecx = ibuild.EmitXor(ecx, ibuild.EmitIntConst(-1U));
ecx = ibuild.EmitXor(ecx, ibuild.EmitIntConst(0xFFFFFFFFU));
eax = ibuild.EmitOr(eax, ecx);
break;
case 193:

View File

@ -148,6 +148,7 @@ void CommonAsmRoutines::GenFifoXmm64Write()
// Safe + Fast Quantizers, originally from JITIL by magumagu
static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
static const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
static const float GC_ALIGNED16(m_quantizeTableS[]) =
@ -199,6 +200,8 @@ static const float GC_ALIGNED16(m_255) = 255.0f;
static const float GC_ALIGNED16(m_127) = 127.0f;
static const float GC_ALIGNED16(m_m128) = -128.0f;
static const float GC_ALIGNED16(m_one[]) = {1.0f, 0.0f, 0.0f, 0.0f};
#define QUANTIZE_OVERFLOW_SAFE
// According to the Intel docs, CVTPS2DQ writes 0x80000000 if the source floating-point value is out of int32 range.
@ -426,7 +429,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {
void CommonAsmRoutines::GenQuantizedLoads() {
const u8* loadPairedIllegal = AlignCode4();
UD2();
const u8* loadPairedFloat = AlignCode4();
const u8* loadPairedFloatTwo = AlignCode4();
if (cpu_info.bSSSE3) {
#ifdef _M_X64
MOVQ_xmm(XMM0, MComplex(RBX, RCX, 1, 0));
@ -465,7 +469,33 @@ void CommonAsmRoutines::GenQuantizedLoads() {
}
RET();
const u8* loadPairedU8 = AlignCode4();
const u8* loadPairedFloatOne = AlignCode4();
if (cpu_info.bSSSE3) {
#ifdef _M_X64
MOVD_xmm(XMM0, MComplex(RBX, RCX, 1, 0));
#else
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOVD_xmm(XMM0, MDisp(ECX, (u32)Memory::base));
#endif
PSHUFB(XMM0, M((void *)pbswapShuffle1x4));
UNPCKLPS(XMM0, M((void*)m_one));
} else {
#ifdef _M_X64
MOV(32, R(RCX), MComplex(RBX, RCX, 1, 0));
BSWAP(32, RCX);
MOVD_xmm(XMM0, R(RCX));
UNPCKLPS(XMM0, M((void*)m_one));
#else
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base));
BSWAP(32, EAX);
MOVD_xmm(XMM0, M(&psTemp[0]));
UNPCKLPS(XMM0, M((void*)m_one));
#endif
}
RET();
const u8* loadPairedU8Two = AlignCode4();
UnsafeLoadRegToRegNoSwap(ECX, ECX, 16, 0);
MOVD_xmm(XMM0, R(ECX));
PXOR(XMM1, R(XMM1));
@ -478,7 +508,17 @@ void CommonAsmRoutines::GenQuantizedLoads() {
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedS8 = AlignCode4();
const u8* loadPairedU8One = AlignCode4();
UnsafeLoadRegToRegNoSwap(ECX, ECX, 8, 0); // ECX = 0x000000xx
MOVD_xmm(XMM0, R(ECX));
CVTDQ2PS(XMM0, R(XMM0)); // Is CVTSI2SS better?
SHR(32, R(EAX), Imm8(6));
MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_dequantizeTableS));
MULSS(XMM0, R(XMM1));
UNPCKLPS(XMM0, M((void*)m_one));
RET();
const u8* loadPairedS8Two = AlignCode4();
UnsafeLoadRegToRegNoSwap(ECX, ECX, 16, 0);
MOVD_xmm(XMM0, R(ECX));
PUNPCKLBW(XMM0, R(XMM0));
@ -491,7 +531,19 @@ void CommonAsmRoutines::GenQuantizedLoads() {
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedU16 = AlignCode4();
const u8* loadPairedS8One = AlignCode4();
UnsafeLoadRegToRegNoSwap(ECX, ECX, 8, 0);
SHL(32, R(ECX), Imm8(24));
SAR(32, R(ECX), Imm8(24));
MOVD_xmm(XMM0, R(ECX));
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(EAX), Imm8(6));
MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_dequantizeTableS));
MULSS(XMM0, R(XMM1));
UNPCKLPS(XMM0, M((void*)m_one));
RET();
const u8* loadPairedU16Two = AlignCode4();
UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
ROL(32, R(ECX), Imm8(16));
MOVD_xmm(XMM0, R(ECX));
@ -504,7 +556,18 @@ void CommonAsmRoutines::GenQuantizedLoads() {
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedS16 = AlignCode4();
const u8* loadPairedU16One = AlignCode4();
UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
SHR(32, R(ECX), Imm8(16));
MOVD_xmm(XMM0, R(ECX));
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(EAX), Imm8(6));
MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_dequantizeTableS));
MULSS(XMM0, R(XMM1));
UNPCKLPS(XMM0, M((void*)m_one));
RET();
const u8* loadPairedS16Two = AlignCode4();
UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
ROL(32, R(ECX), Imm8(16));
MOVD_xmm(XMM0, R(ECX));
@ -518,12 +581,33 @@ void CommonAsmRoutines::GenQuantizedLoads() {
MULPS(XMM0, R(XMM1));
RET();
pairedLoadQuantized[0] = loadPairedFloat;
const u8* loadPairedS16One = AlignCode4();
UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
SAR(32, R(ECX), Imm8(16));
MOVD_xmm(XMM0, R(ECX));
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(EAX), Imm8(6));
AND(32, R(EAX), Imm32(0xFC));
MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_dequantizeTableS));
MULSS(XMM0, R(XMM1));
UNPCKLPS(XMM0, M((void*)m_one));
RET();
pairedLoadQuantized[0] = loadPairedFloatTwo;
pairedLoadQuantized[1] = loadPairedIllegal;
pairedLoadQuantized[2] = loadPairedIllegal;
pairedLoadQuantized[3] = loadPairedIllegal;
pairedLoadQuantized[4] = loadPairedU8;
pairedLoadQuantized[5] = loadPairedU16;
pairedLoadQuantized[6] = loadPairedS8;
pairedLoadQuantized[7] = loadPairedS16;
pairedLoadQuantized[4] = loadPairedU8Two;
pairedLoadQuantized[5] = loadPairedU16Two;
pairedLoadQuantized[6] = loadPairedS8Two;
pairedLoadQuantized[7] = loadPairedS16Two;
pairedLoadQuantized[8] = loadPairedFloatOne;
pairedLoadQuantized[9] = loadPairedIllegal;
pairedLoadQuantized[10] = loadPairedIllegal;
pairedLoadQuantized[11] = loadPairedIllegal;
pairedLoadQuantized[12] = loadPairedU8One;
pairedLoadQuantized[13] = loadPairedU16One;
pairedLoadQuantized[14] = loadPairedS8One;
pairedLoadQuantized[15] = loadPairedS16One;
}

View File

@ -56,7 +56,7 @@ public:
// Out: XMM0: Bottom two 32-bit slots hold the read value,
// converted to a pair of floats.
// Trashes: EAX ECX EDX
const u8 GC_ALIGNED16(*pairedLoadQuantized[8]);
const u8 GC_ALIGNED16(*pairedLoadQuantized[16]);
// In: array index: GQR to use.
// In: ECX: Address to write to.