diff --git a/src/xenia/cpu/frontend/ppc_emit_altivec.cc b/src/xenia/cpu/frontend/ppc_emit_altivec.cc index ec7ed2b2b..4d54b9437 100644 --- a/src/xenia/cpu/frontend/ppc_emit_altivec.cc +++ b/src/xenia/cpu/frontend/ppc_emit_altivec.cc @@ -245,14 +245,26 @@ XEEMITTER(lvlxl128, VX128_1(4, 1539), VX128_1)(PPCHIRBuilder& f, InstrData& i) { int InstrEmit_lvrx_(PPCHIRBuilder& f, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { + // NOTE: if eb == 0 (so 16b aligned) then no data is loaded. This is important + // as often times memcpy's will use this to handle the remaining <=16b of a + // buffer, which sometimes may be nothing and hang off the end of the valid + // page area. We still need to zero the resulting register, though. Value* ea = CalculateEA_0(f, ra, rb); Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF)); + // Skip if %16=0 (just load zero). + auto load_label = f.NewLabel(); + auto end_label = f.NewLabel(); + f.BranchTrue(eb, load_label); + f.StoreVR(vd, f.LoadZeroVec128()); + f.Branch(end_label); + f.MarkLabel(load_label); // ea &= ~0xF ea = f.And(ea, f.LoadConstantUint64(~0xFull)); // v = (new >> (16 - eb)) Value* v = f.Permute(f.LoadVectorShl(eb), f.LoadZeroVec128(), f.ByteSwap(f.Load(ea, VEC128_TYPE)), INT8_TYPE); f.StoreVR(vd, v); + f.MarkLabel(end_label); return 0; } XEEMITTER(lvrx, 0x7C00044E, X)(PPCHIRBuilder& f, InstrData& i) { @@ -304,10 +316,15 @@ XEEMITTER(stvlxl128, VX128_1(4, 1795), VX128_1)(PPCHIRBuilder& f, int InstrEmit_stvrx_(PPCHIRBuilder& f, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { - // NOTE: if eb == 0 (so 16b aligned) this equals new_value - // we could optimize this to prevent the other load/mask, in that case. + // NOTE: if eb == 0 (so 16b aligned) then no data is loaded. This is important + // as often times memcpy's will use this to handle the remaining <=16b of a + // buffer, which sometimes may be nothing and hang off the end of the valid + // page area. Value* ea = CalculateEA_0(f, ra, rb); Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF)); + // Skip if %16=0 (no data to store). + auto skip_label = f.NewLabel(); + f.BranchFalse(eb, skip_label); // ea &= ~0xF ea = f.And(ea, f.LoadConstantUint64(~0xFull)); // v = (old & ~mask) | ((new << eb) & mask) @@ -320,6 +337,7 @@ int InstrEmit_stvrx_(PPCHIRBuilder& f, InstrData& i, uint32_t vd, uint32_t ra, Value* v = f.Or(f.And(old_value, f.Not(mask)), f.And(new_value, mask)); // ea &= ~0xF (handled above) f.Store(ea, f.ByteSwap(v)); + f.MarkLabel(skip_label); return 0; } XEEMITTER(stvrx, 0x7C00054E, X)(PPCHIRBuilder& f, InstrData& i) { diff --git a/src/xenia/cpu/frontend/testing/instr_lvr.s b/src/xenia/cpu/frontend/testing/instr_lvr.s index 7cb9ff489..706a08a19 100644 --- a/src/xenia/cpu/frontend/testing/instr_lvr.s +++ b/src/xenia/cpu/frontend/testing/instr_lvr.s @@ -18,3 +18,23 @@ test_lvr_1_constant: #_ REGISTER_OUT r4 0x100010B7 #_ REGISTER_OUT r5 0x10 #_ REGISTER_OUT v3 [00000000, 00000000, 000D0E10, 11121314] + +test_lvr_2: + #_ REGISTER_IN r4 0x20000000 + #_ REGISTER_IN r5 0x10 + #_ REGISTER_IN v3 [FFFFFFFF, FFFFFFFF, FFFFFFFF, FFFFFFFF] + lvrx v3, r4, r5 + blr + #_ REGISTER_OUT r4 0x20000000 + #_ REGISTER_OUT r5 0x10 + #_ REGISTER_OUT v3 [00000000, 00000000, 00000000, 00000000] + +test_lvr_2_constant: + #_ REGISTER_IN v3 [FFFFFFFF, FFFFFFFF, FFFFFFFF, FFFFFFFF] + lis r4, 0x2000 + li r5, 0x10 + lvrx v3, r4, r5 + blr + #_ REGISTER_OUT r4 0x20000000 + #_ REGISTER_OUT r5 0x10 + #_ REGISTER_OUT v3 [00000000, 00000000, 00000000, 00000000] diff --git a/src/xenia/cpu/frontend/testing/instr_stvr.s b/src/xenia/cpu/frontend/testing/instr_stvr.s index 151349749..cc5bc2212 100644 --- a/src/xenia/cpu/frontend/testing/instr_stvr.s +++ b/src/xenia/cpu/frontend/testing/instr_stvr.s @@ -47,3 +47,21 @@ test_stvr_2_constant: #_ REGISTER_OUT r4 0x10001044 #_ REGISTER_OUT v3 [F0F1F2F3, F4F5F6F7, F8F9FAFB, FCFDFEFF] #_ MEMORY_OUT 10001040 FCFDFEFF 04050607 08090A0B 0C0D0E0F + +test_stvr_3: + #_ REGISTER_IN r4 0x10010000 + #_ REGISTER_IN r5 0x0 + #_ REGISTER_IN v3 [BE74FCBD, BD912ABA, BF317BBB, BF2D135F] + stvrx v3, r4, r5 + blr + #_ REGISTER_OUT r4 0x10010000 + #_ REGISTER_OUT r5 0x0 + #_ REGISTER_OUT v3 [BE74FCBD, BD912ABA, BF317BBB, BF2D135F] + +test_stvr_3_constant: + #_ REGISTER_IN v3 [BE74FCBD, BD912ABA, BF317BBB, BF2D135F] + lis r4, 0x1001 + stvrx v3, r4, r0 + blr + #_ REGISTER_OUT r4 0x10010000 + #_ REGISTER_OUT v3 [BE74FCBD, BD912ABA, BF317BBB, BF2D135F]