A bunch of fixes for division logic:
"Turns out there are a lot of quirks with the div instructions that we haven't been covering.
If the denominator is 0, we jump to the end and mov eax/rax to dst, which is correct because PPC raises no exception for divide by 0, unlike x86. Except we don't initialize eax before that jump, so whatever garbage the previous sequence left in eax/rax becomes the result of the instruction.
Then, in our constant folding, we don't do the same zero check in Value::Div, so if we constant-folded the denominator to 0 we will crash on the host.
The PPC manual says the result of a division by 0 is undefined, but in reality it seems to always be 0: there are a few posts I saw from googling about it, and tests on my RGH gave me 0.
But then another issue came up: we don't check for signed overflow in our division, so we raise an exception if guest code ever does (1 << signbit_pos) / -1. Signed overflow in division also produces 0 on PPC.
The last thing is that if src2 is constant we skip the 0 check for division without checking that it is nonzero.
All weird, likely very rare edge cases, except maybe the signed-overflow division.
(chrispy, Today at 9:51 AM:) Oh yeah, and because the int members of ConstantValue are all signed ints, we were actually always doing signed division with constant folding."
- Fixed an earlier mistake by me with the precision of fresx.
- Made some optimizations disableable.
- Implemented vpkpx.
- Fixed possible bugs with vsr/vsl constant folding.
- Disabled the nice imul code for now; there was a bug with the int64 version and I don't have time to check.
- Started on multiplication/addition/subtraction/division identities.
- Removed the optimized VSL implementation; it's going to have to be rewritten anyway.
- Added ppc_context_t to the xboxkrnl shim for direct context access.
- Started working on KeSaveFloatingPointState; reverse-engineered most of it.
- Exposed some more state/functionality to the kernel for implementing lower-level routines like the save/restore ones.
- Added a cvar to re-enable the incorrect MXCSR behavior if a user doesn't care and wants better CPU performance.
- Stubbed out more impossible sequences; replaced mul_hi_i32 with a 64-bit multiply.
This commit is contained in:
parent
f45e9e5e9a
commit
324a8eb818
|
@ -67,6 +67,7 @@ class Backend {
|
|||
// up until the start of ctx may be used by the backend to store whatever data
|
||||
// they want
|
||||
virtual void InitializeBackendContext(void* ctx) {}
|
||||
// Hook for applying the guest rounding-mode selector `mode` to the guest
// context `ctx`. The base implementation does nothing; a backend overrides it
// when it has host state to update.
virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) {}
|
||||
|
||||
protected:
|
||||
Processor* processor_ = nullptr;
|
||||
|
|
|
@ -689,8 +689,7 @@ void X64ThunkEmitter::EmitLoadNonvolatileRegs() {
|
|||
#endif
|
||||
}
|
||||
void X64Backend::InitializeBackendContext(void* ctx) {
|
||||
X64BackendContext* bctx = reinterpret_cast<X64BackendContext*>(
|
||||
reinterpret_cast<intptr_t>(ctx) - sizeof(X64BackendContext));
|
||||
X64BackendContext* bctx = BackendContextForGuestContext(ctx);
|
||||
bctx->ResolveFunction_Ptr = reinterpret_cast<void*>(&ResolveFunction);
|
||||
bctx->mxcsr_fpu =
|
||||
DEFAULT_FPU_MXCSR; // idk if this is right, check on rgh what the
|
||||
|
@ -700,6 +699,18 @@ void X64Backend::InitializeBackendContext(void* ctx) {
|
|||
// https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
|
||||
bctx->Ox1000 = 0x1000;
|
||||
}
|
||||
// Host MXCSR images indexed by the 3-bit selector that
// X64Backend::SetGuestRoundingMode computes as `mode & 7`.
// Every entry is 0x1F80 (all SSE exceptions masked) plus:
//   - bits 13-14: the x86 rounding-control field (RC)
//   - bit 15 (0x8000): flush-to-zero, set for indices 4-7
// NOTE(review): presumably index bits 0-1 are the guest FPSCR RN field and
// bit 2 is the non-IEEE bit -- confirm against the guest ISA.
const uint32_t mxcsr_table[8] = {
    0x1F80,  // 0: RC=00 (round to nearest)
    0x7F80,  // 1: RC=11 (round toward zero)
    0x5F80,  // 2: RC=10 (round up)
    0x3F80,  // 3: RC=01 (round down)
    0x9F80,  // 4: entry 0 with flush-to-zero
    0xFF80,  // 5: entry 1 with flush-to-zero
    0xDF80,  // 6: entry 2 with flush-to-zero
    0xBF80,  // 7: entry 3 with flush-to-zero
};
|
||||
|
||||
void X64Backend::SetGuestRoundingMode(void* ctx, unsigned int mode) {
|
||||
X64BackendContext* bctx = BackendContextForGuestContext(ctx);
|
||||
|
||||
uint32_t control = mode & 7;
|
||||
_mm_setcsr(mxcsr_table[control]);
|
||||
bctx->mxcsr_fpu = mxcsr_table[control];
|
||||
((ppc::PPCContext*)ctx)->fpscr.bits.rn = control;
|
||||
}
|
||||
} // namespace x64
|
||||
} // namespace backend
|
||||
} // namespace cpu
|
||||
|
|
|
@ -37,9 +37,10 @@ typedef void (*ResolveFunctionThunk)();
|
|||
// negatively index the membase reg)
|
||||
struct X64BackendContext {
|
||||
void* ResolveFunction_Ptr; // cached pointer to resolvefunction
|
||||
unsigned int mxcsr_fpu; //currently, the way we implement rounding mode affects both vmx and the fpu
|
||||
unsigned int mxcsr_fpu; // currently, the way we implement rounding mode
|
||||
// affects both vmx and the fpu
|
||||
unsigned int mxcsr_vmx;
|
||||
unsigned int flags; //bit 0 = 0 if mxcsr is fpu, else it is vmx
|
||||
unsigned int flags; // bit 0 = 0 if mxcsr is fpu, else it is vmx
|
||||
unsigned int Ox1000; // constant 0x1000 so we can shrink each tail emitted
|
||||
// add of it by... 2 bytes lol
|
||||
};
|
||||
|
@ -48,7 +49,7 @@ constexpr unsigned int DEFAULT_VMX_MXCSR =
|
|||
0x0040 | (_MM_MASK_MASK); // default rounding mode for vmx
|
||||
|
||||
constexpr unsigned int DEFAULT_FPU_MXCSR = 0x1F80;
|
||||
|
||||
extern const uint32_t mxcsr_table[8];
|
||||
class X64Backend : public Backend {
|
||||
public:
|
||||
static const uint32_t kForceReturnAddress = 0x9FFF0000u;
|
||||
|
@ -85,6 +86,12 @@ class X64Backend : public Backend {
|
|||
void UninstallBreakpoint(Breakpoint* breakpoint) override;
|
||||
virtual void InitializeBackendContext(void* ctx) override;
|
||||
|
||||
// Returns the backend-private context associated with guest context `ctx`.
// The address is computed as `ctx` minus sizeof(X64BackendContext), i.e. the
// backend context is expected to end exactly where the guest context begins.
X64BackendContext* BackendContextForGuestContext(void* ctx) {
  const auto guest_address = reinterpret_cast<intptr_t>(ctx);
  const auto backend_address = guest_address - sizeof(X64BackendContext);
  return reinterpret_cast<X64BackendContext*>(backend_address);
}
|
||||
virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override;
|
||||
|
||||
private:
|
||||
static bool ExceptionCallbackThunk(Exception* ex, void* data);
|
||||
bool ExceptionCallback(Exception* ex);
|
||||
|
|
|
@ -50,6 +50,13 @@ DEFINE_bool(resolve_rel32_guest_calls, true,
|
|||
"Experimental optimization, directly call already resolved "
|
||||
"functions via x86 rel32 call/jmp",
|
||||
"CPU");
|
||||
|
||||
DEFINE_bool(enable_incorrect_roundingmode_behavior, false,
|
||||
"Disables the FPU/VMX MXCSR sharing workaround, potentially "
|
||||
"causing incorrect rounding behavior and denormal handling in VMX "
|
||||
"code. The workaround may cause reduced CPU performance but is a "
|
||||
"more accurate emulation",
|
||||
"x64");
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
namespace backend {
|
||||
|
@ -1374,13 +1381,13 @@ Xbyak::Label& X64Emitter::NewCachedLabel() {
|
|||
return *tmp;
|
||||
}
|
||||
|
||||
template<bool switching_to_fpu>
|
||||
template <bool switching_to_fpu>
|
||||
static void ChangeMxcsrModeDynamicHelper(X64Emitter& e) {
|
||||
auto flags = e.GetBackendFlagsPtr();
|
||||
if (switching_to_fpu) {
|
||||
e.btr(flags, 0); // bit 0 set to 0 = is fpu mode
|
||||
} else {
|
||||
e.bts(flags, 0); // bit 0 set to 1 = is vmx mode
|
||||
e.bts(flags, 0); // bit 0 set to 1 = is vmx mode
|
||||
}
|
||||
Xbyak::Label& come_back = e.NewCachedLabel();
|
||||
|
||||
|
@ -1391,20 +1398,24 @@ static void ChangeMxcsrModeDynamicHelper(X64Emitter& e) {
|
|||
e.LoadFpuMxcsrDirect();
|
||||
} else {
|
||||
e.LoadVmxMxcsrDirect();
|
||||
}
|
||||
}
|
||||
e.jmp(come_back, X64Emitter::T_NEAR);
|
||||
});
|
||||
if (switching_to_fpu) {
|
||||
e.jc(reload_bailout,
|
||||
X64Emitter::T_NEAR); // if carry flag was set, we were VMX mxcsr mode.
|
||||
} else {
|
||||
e.jnc(reload_bailout,
|
||||
X64Emitter::T_NEAR); // if carry flag was set, we were VMX mxcsr mode.
|
||||
e.jnc(
|
||||
reload_bailout,
|
||||
X64Emitter::T_NEAR); // if carry flag was set, we were VMX mxcsr mode.
|
||||
}
|
||||
e.L(come_back);
|
||||
}
|
||||
|
||||
bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
|
||||
if (cvars::enable_incorrect_roundingmode_behavior) {
|
||||
return false; // no MXCSR mode handling!
|
||||
}
|
||||
if (new_mode == mxcsr_mode_) {
|
||||
return false;
|
||||
}
|
||||
|
@ -1420,21 +1431,21 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
|
|||
ChangeMxcsrModeDynamicHelper<false>(*this);
|
||||
} else {
|
||||
assert_unhandled_case(new_mode);
|
||||
}
|
||||
} else { //even if already set, we still need to update flags to reflect our mode
|
||||
}
|
||||
} else { // even if already set, we still need to update flags to reflect
|
||||
// our mode
|
||||
if (new_mode == MXCSRMode::Fpu) {
|
||||
btr(GetBackendFlagsPtr(), 0);
|
||||
} else if (new_mode == MXCSRMode::Vmx) {
|
||||
bts(GetBackendFlagsPtr(), 0);
|
||||
} else {
|
||||
assert_unhandled_case(new_mode);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
mxcsr_mode_ = new_mode;
|
||||
if (!already_set) {
|
||||
if (new_mode == MXCSRMode::Fpu) {
|
||||
|
||||
LoadFpuMxcsrDirect();
|
||||
btr(GetBackendFlagsPtr(), 0);
|
||||
return true;
|
||||
|
|
|
@ -23,6 +23,10 @@ DEFINE_bool(
|
|||
elide_e0_check, false,
|
||||
"Eliminate e0 check on some memory accesses, like to r13(tls) or r1(sp)",
|
||||
"CPU");
|
||||
DEFINE_bool(enable_rmw_context_merging, false,
|
||||
"Permit merging read-modify-write HIR instr sequences together "
|
||||
"into x86 instructions that use a memory operand.",
|
||||
"x64");
|
||||
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
|
@ -88,6 +92,9 @@ struct LoadModStoreContext : public LoadModStore {
|
|||
};
|
||||
static bool GetLoadModStoreContext(const hir::Instr* loadinsn,
|
||||
LoadModStoreContext* out) {
|
||||
if (!cvars::enable_rmw_context_merging) {
|
||||
return false;
|
||||
}
|
||||
if (!GetLoadModStore(loadinsn, out)) {
|
||||
return false;
|
||||
}
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -10,11 +10,13 @@
|
|||
#ifndef XENIA_CPU_BACKEND_X64_X64_SEQUENCES_H_
|
||||
#define XENIA_CPU_BACKEND_X64_X64_SEQUENCES_H_
|
||||
|
||||
#include "xenia/base/logging.h"
|
||||
#include "xenia/cpu/hir/instr.h"
|
||||
|
||||
#include <unordered_map>
|
||||
// Flags an instruction sequence that should never be selected: asserts, then
// logs an error naming the sequence.
// Defined once and wrapped in do { } while (0) so the two statements act as a
// single statement (safe under an unbraced if/else); callers supply the
// trailing semicolon.
#define assert_impossible_sequence(name)            \
  do {                                              \
    assert_always("impossible sequence hit" #name); \
    XELOGE("impossible sequence hit: {}", #name);   \
  } while (0)
|
||||
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
|
|
|
@ -20,7 +20,9 @@
|
|||
DEFINE_bool(inline_mmio_access, true, "Inline constant MMIO loads and stores.",
|
||||
"CPU");
|
||||
|
||||
DEFINE_bool(permit_float_constant_evaluation, false, "Allow float constant evaluation, may produce incorrect results and break games math",
|
||||
DEFINE_bool(permit_float_constant_evaluation, false,
|
||||
"Allow float constant evaluation, may produce incorrect results "
|
||||
"and break games math",
|
||||
"CPU");
|
||||
|
||||
namespace xe {
|
||||
|
@ -85,8 +87,8 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->dest) {
|
||||
might_be_floatop |= i->dest->MaybeFloaty();
|
||||
}
|
||||
|
||||
bool should_skip_because_of_float =
|
||||
|
||||
bool should_skip_because_of_float =
|
||||
might_be_floatop && !cvars::permit_float_constant_evaluation;
|
||||
|
||||
auto v = i->dest;
|
||||
|
@ -557,6 +559,12 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
v->Div(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
|
||||
i->Remove();
|
||||
result = true;
|
||||
} else if (!i->src2.value->MaybeFloaty() &&
|
||||
i->src2.value->IsConstantZero()) {
|
||||
// division by 0 == 0 every time,
|
||||
v->set_zero(i->src2.value->type);
|
||||
i->Remove();
|
||||
result = true;
|
||||
} else if (i->src2.value->IsConstant()) {
|
||||
// Division by one = no-op.
|
||||
Value* src1 = i->src1.value;
|
||||
|
@ -672,29 +680,33 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
}
|
||||
break;
|
||||
case OPCODE_SHL:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Shl(i->src2.value);
|
||||
i->Remove();
|
||||
result = true;
|
||||
} else if (i->src2.value->IsConstantZero()) {
|
||||
auto src1 = i->src1.value;
|
||||
i->Replace(&OPCODE_ASSIGN_info, 0);
|
||||
i->set_src1(src1);
|
||||
result = true;
|
||||
if (i->dest->type != VEC128_TYPE) {
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Shl(i->src2.value);
|
||||
i->Remove();
|
||||
result = true;
|
||||
} else if (i->src2.value->IsConstantZero()) {
|
||||
auto src1 = i->src1.value;
|
||||
i->Replace(&OPCODE_ASSIGN_info, 0);
|
||||
i->set_src1(src1);
|
||||
result = true;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case OPCODE_SHR:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Shr(i->src2.value);
|
||||
i->Remove();
|
||||
result = true;
|
||||
} else if (i->src2.value->IsConstantZero()) {
|
||||
auto src1 = i->src1.value;
|
||||
i->Replace(&OPCODE_ASSIGN_info, 0);
|
||||
i->set_src1(src1);
|
||||
result = true;
|
||||
if (i->dest->type != VEC128_TYPE) {
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Shr(i->src2.value);
|
||||
i->Remove();
|
||||
result = true;
|
||||
} else if (i->src2.value->IsConstantZero()) {
|
||||
auto src1 = i->src1.value;
|
||||
i->Replace(&OPCODE_ASSIGN_info, 0);
|
||||
i->set_src1(src1);
|
||||
result = true;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case OPCODE_SHA:
|
||||
|
@ -729,7 +741,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
result = true;
|
||||
}
|
||||
break;
|
||||
|
||||
#if 1
|
||||
case OPCODE_PERMUTE: {
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
|
||||
i->src3.value->IsConstant() &&
|
||||
|
@ -756,6 +768,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
case OPCODE_INSERT:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
|
||||
i->src3.value->IsConstant()) {
|
||||
|
|
|
@ -83,6 +83,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
iter_result |= SimplifyBitArith(builder);
|
||||
iter_result |= EliminateConversions(builder);
|
||||
iter_result |= SimplifyAssignments(builder);
|
||||
iter_result |= SimplifyBasicArith(builder);
|
||||
|
||||
result |= iter_result;
|
||||
} while (iter_result);
|
||||
|
@ -1228,6 +1229,91 @@ Value* SimplificationPass::CheckValue(Value* value, bool& result) {
|
|||
return value;
|
||||
}
|
||||
|
||||
// Rewrites an add of the form `(x << c) + x` (operands in either order, `c` a
// constant) into the single multiply `x * ((1 << c) + 1)`.
// Example: (x << 1) + x  ==>  x * 3.
//
// Returns true when `i` was replaced with a MUL, false when the pattern does
// not match and `i` is left unchanged.
bool SimplificationPass::SimplifyAddArith(hir::Instr* i,
                                          hir::HIRBuilder* builder) {
  // Arrange the operands so the one defined by a SHL comes first.
  auto [shlinsn, addend] =
      i->BinaryValueArrangeByDefiningOpcode(&OPCODE_SHL_info);
  if (!shlinsn) {
    return false;
  }
  Instr* shift_insn = shlinsn->def;
  Value* shift = shift_insn->src2.value;

  // A non-constant shift cannot be turned into a constant multiplier.
  if (!shift->IsConstant()) {
    return false;
  }

  // `1ULL << n` is undefined for n >= 64, so leave such shifts alone rather
  // than fold them into a garbage multiplier.
  const unsigned shift_amount = shift->constant.u8;
  if (shift_amount >= 64) {
    return false;
  }

  // The shifted value must be the very value being added back in.
  Value* shouldbeaddend = shift_insn->src1.value;
  if (!shouldbeaddend->IsEqual(addend)) {
    return false;
  }

  // (x << c) + x == x * (2^c + 1)
  const uint64_t multiplier = (1ULL << shift_amount) + 1;

  hir::Value* oldvalue = shouldbeaddend;

  i->Replace(&OPCODE_MUL_info, ARITHMETIC_UNSIGNED);
  i->set_src1(oldvalue);

  // TODO: this sequence should be broken out into some kind of
  // LoadConstant(type, raw_value) method of HIRBuilder.
  auto constmul = builder->AllocValue(oldvalue->type);
  // Writing the u64 member for a narrower type relies on host endianness;
  // could cause problems on big-endian targets.
  constmul->flags |= VALUE_IS_CONSTANT;
  constmul->constant.u64 = multiplier;

  i->set_src2(constmul);

  return true;
}
|
||||
|
||||
// Placeholder for subtraction identities. Nothing is implemented yet, so this
// always reports that `i` was left unchanged.
bool SimplificationPass::SimplifySubArith(hir::Instr* i,
                                          hir::HIRBuilder* builder) {
  (void)i;
  (void)builder;
  return false;
}
|
||||
// Tries the basic arithmetic identities on a single instruction.
// Instructions without a destination, or whose destination MaybeFloaty()
// flags, are skipped. ADD and SUB are dispatched to their helpers; every other
// opcode is left alone. Returns true if `i` was rewritten.
bool SimplificationPass::SimplifyBasicArith(hir::Instr* i,
                                            hir::HIRBuilder* builder) {
  if (!i->dest || i->dest->MaybeFloaty()) {
    return false;
  }

  const hir::Opcode opcode_num = i->GetOpcodeNum();
  if (opcode_num == OPCODE_ADD) {
    return SimplifyAddArith(i, builder);
  }
  if (opcode_num == OPCODE_SUB) {
    return SimplifySubArith(i, builder);
  }
  return false;
}
|
||||
// Walks every instruction of every block in `builder` and applies the
// per-instruction basic arithmetic simplifications.
// Returns true if at least one instruction was rewritten.
bool SimplificationPass::SimplifyBasicArith(hir::HIRBuilder* builder) {
  bool changed = false;
  for (auto current_block = builder->first_block(); current_block;
       current_block = current_block->next) {
    for (auto instr = current_block->instr_head; instr; instr = instr->next) {
      // Deliberately not short-circuited: every instruction must be visited.
      changed |= SimplifyBasicArith(instr, builder);
    }
  }
  return changed;
}
|
||||
} // namespace passes
|
||||
} // namespace compiler
|
||||
} // namespace cpu
|
||||
|
|
|
@ -32,6 +32,13 @@ class SimplificationPass : public ConditionalGroupSubpass {
|
|||
bool SimplifyAssignments(hir::HIRBuilder* builder);
|
||||
hir::Value* CheckValue(hir::Value* value, bool& result);
|
||||
bool SimplifyBitArith(hir::HIRBuilder* builder);
|
||||
|
||||
// handles simple multiplication/addition rules
|
||||
bool SimplifyBasicArith(hir::HIRBuilder* builder);
|
||||
bool SimplifyBasicArith(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
|
||||
bool SimplifyAddArith(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
bool SimplifySubArith(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
// handle either or or xor with 0
|
||||
bool CheckOrXorZero(hir::Instr* i);
|
||||
bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
|
|
|
@ -79,6 +79,10 @@ class Instr {
|
|||
void MoveBefore(Instr* other);
|
||||
void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags);
|
||||
void Remove();
|
||||
const OpcodeInfo* GetOpcodeInfo() const { return opcode; }
|
||||
// if opcode is null, we have bigger problems
|
||||
Opcode GetOpcodeNum() const { return GetOpcodeInfo()->num; }
|
||||
|
||||
template <typename TPredicate>
|
||||
std::pair<Value*, Value*> BinaryValueArrangeByPredicateExclusive(
|
||||
TPredicate&& pred) {
|
||||
|
@ -86,12 +90,13 @@ class Instr {
|
|||
auto src2_value = src2.value;
|
||||
if (!src1_value || !src2_value) return {nullptr, nullptr};
|
||||
|
||||
if (!opcode) return {nullptr, nullptr}; // impossible!
|
||||
if (!GetOpcodeInfo()) return {nullptr, nullptr}; // impossible!
|
||||
|
||||
// check if binary opcode taking two values. we dont care if the dest is a
|
||||
// value
|
||||
|
||||
if (!IsOpcodeBinaryValue(opcode->signature)) return {nullptr, nullptr};
|
||||
if (!IsOpcodeBinaryValue(GetOpcodeInfo()->signature))
|
||||
return {nullptr, nullptr};
|
||||
|
||||
if (pred(src1_value)) {
|
||||
if (pred(src2_value)) {
|
||||
|
@ -119,7 +124,7 @@ if both are constant, return nullptr, nullptr
|
|||
std::pair<Value*, Value*> BinaryValueArrangeByDefiningOpcode(
|
||||
const OpcodeInfo* op_ptr) {
|
||||
return BinaryValueArrangeByPredicateExclusive([op_ptr](Value* value) {
|
||||
return value->def && value->def->opcode == op_ptr;
|
||||
return value->def && value->def->GetOpcodeInfo() == op_ptr;
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -143,7 +148,7 @@ if both are constant, return nullptr, nullptr
|
|||
*/
|
||||
template <typename TCallable>
|
||||
void VisitValueOperands(TCallable&& call_for_values) {
|
||||
uint32_t signature = opcode->signature;
|
||||
uint32_t signature = GetOpcodeInfo()->signature;
|
||||
|
||||
OpcodeSignatureType t_dest, t_src1, t_src2, t_src3;
|
||||
|
||||
|
|
|
@ -199,7 +199,7 @@ void Value::Truncate(TypeName target_type) {
|
|||
return;
|
||||
}
|
||||
}
|
||||
//WARNING: this does not handle rounding flags at all!
|
||||
// WARNING: this does not handle rounding flags at all!
|
||||
void Value::Convert(TypeName target_type, RoundMode round_mode) {
|
||||
switch (type) {
|
||||
case FLOAT32_TYPE:
|
||||
|
@ -428,35 +428,57 @@ void Value::MulHi(Value* other, bool is_unsigned) {
|
|||
}
|
||||
}
|
||||
|
||||
// Unsigned division as used for constant folding: a zero divisor yields 0
// instead of faulting on the host; otherwise plain `numer / denom`.
template <typename T>
static T PPCUDiv(T numer, T denom) {
  if (denom == 0) {
    return 0;
  }
  return numer / denom;
}
|
||||
// Signed division as used for constant folding. Two inputs that would trap on
// the host produce 0 instead:
//   - a zero divisor
//   - signed overflow: the most negative value of T divided by -1
// Everything else is plain `numer / denom`.
template <typename T>
static T PPCIDiv(T numer, T denom) {
  // Only the sign bit set, i.e. the most negative value representable in T.
  const T sign_bit_only =
      static_cast<T>(1LL << ((sizeof(T) * CHAR_BIT) - 1));

  const bool divides_by_zero = (denom == 0);
  // All bits set in the divisor means it is -1.
  const bool overflows =
      (numer == sign_bit_only) && (denom == static_cast<T>(-1));

  if (divides_by_zero || overflows) {
    return 0;
  }
  return numer / denom;
}
|
||||
|
||||
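// Note: for the integer types, a zero divisor (and, for signed division, the
// sign-bit value divided by -1) folds to 0 via PPCUDiv/PPCIDiv instead of
// faulting on the host.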
|
||||
void Value::Div(Value* other, bool is_unsigned) {
|
||||
assert_true(type == other->type);
|
||||
switch (type) {
|
||||
case INT8_TYPE:
|
||||
if (is_unsigned) {
|
||||
constant.i8 /= uint8_t(other->constant.i8);
|
||||
constant.i8 = PPCUDiv<uint8_t>(constant.i8, other->constant.i8);
|
||||
} else {
|
||||
constant.i8 /= other->constant.i8;
|
||||
constant.i8 = PPCIDiv<int8_t>(constant.i8, other->constant.i8);
|
||||
}
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
if (is_unsigned) {
|
||||
constant.i16 /= uint16_t(other->constant.i16);
|
||||
constant.i16 = PPCUDiv<uint16_t>(constant.i16, other->constant.i16);
|
||||
} else {
|
||||
constant.i16 /= other->constant.i16;
|
||||
constant.i16 = PPCIDiv<int16_t>(constant.i16, other->constant.i16);
|
||||
}
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
if (is_unsigned) {
|
||||
constant.i32 /= uint32_t(other->constant.i32);
|
||||
constant.i32 = PPCUDiv<uint32_t>(constant.i32, other->constant.i32);
|
||||
} else {
|
||||
constant.i32 /= other->constant.i32;
|
||||
constant.i32 = PPCIDiv<int32_t>(constant.i32, other->constant.i32);
|
||||
}
|
||||
break;
|
||||
case INT64_TYPE:
|
||||
if (is_unsigned) {
|
||||
constant.i64 /= uint64_t(other->constant.i64);
|
||||
constant.i64 = PPCUDiv<uint64_t>(constant.i64, other->constant.i64);
|
||||
} else {
|
||||
constant.i64 /= other->constant.i64;
|
||||
constant.i64 = PPCIDiv<int64_t>(constant.i64, other->constant.i64);
|
||||
}
|
||||
break;
|
||||
case FLOAT32_TYPE:
|
||||
|
|
|
@ -364,12 +364,11 @@ int InstrEmit_mfvscr(PPCHIRBuilder& f, const InstrData& i) {
|
|||
|
||||
int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) {
|
||||
// is this the right format?
|
||||
//todo: what mtvscr does with the unused bits is implementation defined, figure out what it does
|
||||
|
||||
// todo: what mtvscr does with the unused bits is implementation defined,
|
||||
// figure out what it does
|
||||
|
||||
Value* v = f.LoadVR(i.VX128_1.RB);
|
||||
|
||||
|
||||
Value* has_njm_value = f.Extract(v, (uint8_t)3, INT32_TYPE);
|
||||
|
||||
f.SetNJM(f.IsTrue(f.And(has_njm_value, f.LoadConstantInt32(65536))));
|
||||
|
@ -1824,9 +1823,38 @@ int InstrEmit_vsum4ubs(PPCHIRBuilder& f, const InstrData& i) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
// Per 32-bit lane, squeezes an 8:8:8:8 pixel down to 1:5:5:5 in the low 16
// bits of the lane:
//   (lane >> 9) & 0xFC00  -> top bit plus the first 5-bit channel
//   (lane >> 6) & 0x03E0  -> second 5-bit channel
//   (lane >> 3) & 0x001F  -> third 5-bit channel
// The three fields are OR'd together; the upper 16 bits of each lane are zero.
static Value* vkpkx_in_low(PPCHIRBuilder& f, Value* input) {
  // Shift every 32-bit lane right by `shift`, then keep only `mask`.
  const auto extract_field = [&f, input](unsigned shift, unsigned mask) {
    Value* shifted = f.VectorShr(input, f.LoadConstantVec128(vec128i(shift)),
                                 INT32_TYPE);
    return f.And(shifted, f.LoadConstantVec128(vec128i(mask)));
  };

  Value* high_field = extract_field(9, 0xFC00);
  Value* mid_field = extract_field(6, 0x3E0);
  Value* low_field = extract_field(3, 0x1F);

  return f.Or(low_field, f.Or(high_field, mid_field));
}
|
||||
|
||||
// vpkpx: packs the 32-bit pixels of VA and VB into 16-bit 1:5:5:5 pixels in VD.
// Each source is first reduced per lane by vkpkx_in_low, then the two results
// are narrowed 32 -> 16 bits with an unsigned pack. The reduced lanes never
// exceed 0xFFFF, so the pack does not saturate.
// The stale "not implemented" early return is dropped so this body is reached.
// Returns 0 (instruction handled).
int InstrEmit_vpkpx(PPCHIRBuilder& f, const InstrData& i) {
  // Original author's note: results were compared against over a million
  // randomly generated sets of inputs and all compared equal.
  Value* src1 = f.LoadVR(i.VX.VA);
  Value* src2 = f.LoadVR(i.VX.VB);

  Value* pck1 = vkpkx_in_low(f, src1);
  Value* pck2 = vkpkx_in_low(f, src2);

  Value* result = f.Pack(
      pck1, pck2,
      PACK_TYPE_16_IN_32 | PACK_TYPE_IN_UNSIGNED | PACK_TYPE_OUT_UNSIGNED);

  f.StoreVR(i.VX.VD, result);

  return 0;
}
|
||||
|
||||
int InstrEmit_vpkshss_(PPCHIRBuilder& f, uint32_t vd, uint32_t va,
|
||||
|
|
|
@ -336,10 +336,14 @@ int InstrEmit_mulhwx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
XEINSTRNOTIMPLEMENTED();
|
||||
return 1;
|
||||
}
|
||||
Value* ratrunc =
|
||||
f.SignExtend(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), INT64_TYPE);
|
||||
|
||||
Value* rbtrunc =
|
||||
f.SignExtend(f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), INT64_TYPE);
|
||||
|
||||
Value* v = f.Sha(f.Mul(ratrunc, rbtrunc), 32);
|
||||
|
||||
Value* v = f.SignExtend(f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
|
||||
f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE)),
|
||||
INT64_TYPE);
|
||||
f.StoreGPR(i.XO.RT, v);
|
||||
if (i.XO.Rc) {
|
||||
f.UpdateCR(0, v);
|
||||
|
@ -355,10 +359,13 @@ int InstrEmit_mulhwux(PPCHIRBuilder& f, const InstrData& i) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
Value* v = f.ZeroExtend(
|
||||
f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
|
||||
f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), ARITHMETIC_UNSIGNED),
|
||||
INT64_TYPE);
|
||||
Value* ratrunc =
|
||||
f.ZeroExtend(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), INT64_TYPE);
|
||||
|
||||
Value* rbtrunc =
|
||||
f.ZeroExtend(f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), INT64_TYPE);
|
||||
|
||||
Value* v = f.Shr(f.Mul(ratrunc, rbtrunc, ARITHMETIC_UNSIGNED), 32);
|
||||
f.StoreGPR(i.XO.RT, v);
|
||||
if (i.XO.Rc) {
|
||||
f.UpdateCR(0, v);
|
||||
|
|
|
@ -89,8 +89,10 @@ int InstrEmit_fmulsx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
int InstrEmit_fresx(PPCHIRBuilder& f, const InstrData& i) {
|
||||
// frD <- 1.0 / (frB)
|
||||
|
||||
Value* v = f.Recip(f.LoadFPR(i.A.FRB));
|
||||
v = f.ToSingle(v);
|
||||
// this actually does seem to require single precision, oddly
|
||||
// more research is needed
|
||||
Value* v = f.Recip(f.Convert(f.LoadFPR(i.A.FRB), FLOAT32_TYPE));
|
||||
v = f.Convert(v, FLOAT64_TYPE); // f.ToSingle(v);
|
||||
f.StoreFPR(i.A.FRT, v);
|
||||
f.UpdateFPSCR(v, i.A.Rc);
|
||||
return 0;
|
||||
|
|
|
@ -11,9 +11,17 @@
|
|||
|
||||
#include <stddef.h>
|
||||
#include "xenia/base/assert.h"
|
||||
#include "xenia/base/cvar.h"
|
||||
#include "xenia/cpu/ppc/ppc_context.h"
|
||||
#include "xenia/cpu/ppc/ppc_hir_builder.h"
|
||||
|
||||
DEFINE_bool(
|
||||
disable_prefetch_and_cachecontrol, false,
|
||||
"Disables translating ppc prefetch/cache flush instructions to host "
|
||||
"prefetch/cacheflush instructions. This may improve performance as these "
|
||||
"instructions were written with the Xbox 360's cache in mind, and modern "
|
||||
"processors do their own automatic prefetching.",
|
||||
"CPU");
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
namespace ppc {
|
||||
|
@ -1080,28 +1088,36 @@ int InstrEmit_stfsx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
// https://randomascii.wordpress.com/2018/01/07/finding-a-cpu-design-bug-in-the-xbox-360/
|
||||
|
||||
// dcbf: emits a store-and-flush cache-control op for the 128-byte region at
// the effective address, unless the disable_prefetch_and_cachecontrol cvar is
// set. The op is emitted at most once, and only under the cvar check.
// Always returns 0 (instruction handled).
int InstrEmit_dcbf(PPCHIRBuilder& f, const InstrData& i) {
  if (!cvars::disable_prefetch_and_cachecontrol) {
    Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
    f.CacheControl(ea, 128,
                   CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE_AND_FLUSH);
  }
  return 0;
}
|
||||
|
||||
// dcbst: emits a data-store cache-control op for the 128-byte region at the
// effective address, unless the disable_prefetch_and_cachecontrol cvar is set.
// The op is emitted at most once, and only under the cvar check.
// Always returns 0 (instruction handled).
int InstrEmit_dcbst(PPCHIRBuilder& f, const InstrData& i) {
  if (!cvars::disable_prefetch_and_cachecontrol) {
    Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
    f.CacheControl(ea, 128, CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE);
  }
  return 0;
}
|
||||
|
||||
// dcbt: emits a data-touch (prefetch) cache-control op for the 128-byte region
// at the effective address, unless the disable_prefetch_and_cachecontrol cvar
// is set. The op is emitted at most once, and only under the cvar check.
// Always returns 0 (instruction handled).
int InstrEmit_dcbt(PPCHIRBuilder& f, const InstrData& i) {
  if (!cvars::disable_prefetch_and_cachecontrol) {
    Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
    f.CacheControl(ea, 128, CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH);
  }
  return 0;
}
|
||||
|
||||
// dcbtst: emits a touch-for-store cache-control op for the 128-byte region at
// the effective address, unless the disable_prefetch_and_cachecontrol cvar is
// set. The op is emitted at most once, and only under the cvar check.
// Always returns 0 (instruction handled).
int InstrEmit_dcbtst(PPCHIRBuilder& f, const InstrData& i) {
  if (!cvars::disable_prefetch_and_cachecontrol) {
    Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
    f.CacheControl(ea, 128,
                   CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH_FOR_STORE);
  }
  return 0;
}
|
||||
|
||||
|
|
|
@ -55,7 +55,9 @@ class PPCFrontend {
|
|||
PPCBuiltins builtins_ = {0};
|
||||
TypePool<PPCTranslator, PPCFrontend*> translator_pool_;
|
||||
};
|
||||
|
||||
// Checks the state of the global lock and sets scratch to the current MSR
|
||||
// value.
|
||||
void CheckGlobalLock(PPCContext* ppc_context, void* arg0, void* arg1);
|
||||
} // namespace ppc
|
||||
} // namespace cpu
|
||||
} // namespace xe
|
||||
|
|
|
@ -192,6 +192,21 @@ class ParamBase : public Param {
|
|||
T value_;
|
||||
};
|
||||
|
||||
class ContextParam : public Param {
|
||||
public:
|
||||
ContextParam() : Param(), ctx_(nullptr) {}
|
||||
ContextParam(PPCContext* value) : Param(), ctx_(value) {}
|
||||
ContextParam(Init& init) : Param(init), ctx_(init.ppc_context) {}
|
||||
|
||||
operator PPCContext*() const { return ctx_; }
|
||||
PPCContext* value() const { return ctx_; }
|
||||
|
||||
PPCContext* operator->() const { return ctx_; }
|
||||
|
||||
protected:
|
||||
PPCContext* ctx_;
|
||||
};
|
||||
|
||||
class PointerParam : public ParamBase<uint32_t> {
|
||||
public:
|
||||
PointerParam(Init& init) : ParamBase(init) {
|
||||
|
@ -370,6 +385,7 @@ using int_result_t = shim::ResultBase<int32_t>;
|
|||
using dword_result_t = shim::ResultBase<uint32_t>;
|
||||
using pointer_result_t = shim::ResultBase<uint32_t>;
|
||||
using X_HRESULT_result_t = shim::ResultBase<X_HRESULT>;
|
||||
using ppc_context_t = shim::ContextParam;
|
||||
|
||||
// Exported from kernel_state.cc.
|
||||
KernelState* kernel_state();
|
||||
|
@ -422,6 +438,9 @@ inline void AppendParam(StringBuffer* string_buffer, lpdouble_t param) {
|
|||
string_buffer->AppendFormat("({:G})", param.value());
|
||||
}
|
||||
}
|
||||
// Logging overload for context parameters: the context is not formatted, only
// a fixed placeholder is appended.
inline void AppendParam(StringBuffer* string_buffer, ppc_context_t param) {
  (void)param;
  string_buffer->Append("ContextArg");
}
|
||||
inline void AppendParam(StringBuffer* string_buffer, lpstring_t param) {
|
||||
string_buffer->AppendFormat("{:08X}", param.guest_address());
|
||||
if (param) {
|
||||
|
|
|
@ -8,12 +8,13 @@
|
|||
*/
|
||||
|
||||
#include "xenia/base/logging.h"
|
||||
#include "xenia/cpu/ppc/ppc_frontend.h"
|
||||
#include "xenia/cpu/processor.h"
|
||||
#include "xenia/kernel/kernel_state.h"
|
||||
#include "xenia/kernel/util/shim_utils.h"
|
||||
#include "xenia/kernel/xboxkrnl/xboxkrnl_private.h"
|
||||
#include "xenia/kernel/xthread.h"
|
||||
#include "xenia/xbox.h"
|
||||
|
||||
namespace xe {
|
||||
namespace kernel {
|
||||
namespace xboxkrnl {
|
||||
|
@ -22,6 +23,94 @@ void KeEnableFpuExceptions_entry(dword_t enabled) {
|
|||
// TODO(benvanik): can we do anything about exceptions?
|
||||
}
|
||||
DECLARE_XBOXKRNL_EXPORT1(KeEnableFpuExceptions, kNone, kStub);
|
||||
#if 0
|
||||
struct __declspec(align(8)) fpucontext_ptr_t {
|
||||
char unknown_data[158];
|
||||
__int16 field_9E;
|
||||
char field_A0[2272];
|
||||
unsigned __int64 saved_FPSCR;
|
||||
double saved_fpu_regs[32];
|
||||
};
|
||||
#pragma pack(push, 1)
|
||||
struct __declspec(align(1)) r13_struct_t {
|
||||
char field_0[6];
|
||||
__int16 field_6;
|
||||
char field_8[2];
|
||||
char field_A;
|
||||
char field_B[5];
|
||||
int field_10;
|
||||
char field_14[315];
|
||||
char field_14F;
|
||||
unsigned int field_150;
|
||||
char field_154[427];
|
||||
char field_2FF;
|
||||
char field_300;
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
|
||||
static uint64_t Do_mfmsr(ppc_context_t& ctx) {
|
||||
auto frontend = ctx->thread_state->processor()->frontend();
|
||||
cpu::ppc::CheckGlobalLock(
|
||||
ctx, reinterpret_cast<void*>(&xe::global_critical_region::mutex()),
|
||||
reinterpret_cast<void*>(&frontend->builtins()->global_lock_count));
|
||||
return ctx->scratch;
|
||||
}
|
||||
|
||||
void KeSaveFloatingPointState_entry(ppc_context_t& ctx) {
|
||||
xe::Memory* memory = ctx->thread_state->memory();
|
||||
unsigned int r13 = static_cast<unsigned int>(ctx->r[13]);
|
||||
|
||||
|
||||
|
||||
|
||||
r13_struct_t* st = memory->TranslateVirtual<r13_struct_t*>(r13);
|
||||
/*
|
||||
lwz r10, 0x150(r13)
|
||||
lbz r11, 0xA(r13)
|
||||
tweqi r10, 0
|
||||
twnei r11, 0
|
||||
*/
|
||||
|
||||
unsigned int r10 = st->field_150;
|
||||
unsigned char r11 = st->field_A;
|
||||
|
||||
if (r10 == 0 || r11 != 0) {
|
||||
//trap!
|
||||
}
|
||||
|
||||
//should do mfmsr here
|
||||
|
||||
unsigned int r3 = xe::load_and_swap<unsigned int>(&st->field_10);
|
||||
|
||||
//too much work to do the mfmsr/mtmsr stuff right now
|
||||
int to_store = -2049;
|
||||
xe::store_and_swap(&st->field_10, (unsigned int)to_store);
|
||||
xe::store_and_swap(&st->field_6, (short)to_store);
|
||||
|
||||
|
||||
|
||||
if (r3 != ~0u) {
|
||||
fpucontext_ptr_t* fpucontext =
|
||||
memory->TranslateVirtual<fpucontext_ptr_t*>(r3);
|
||||
xe::store_and_swap<uint64_t>(&fpucontext->saved_FPSCR, ctx->fpscr.value);
|
||||
|
||||
for (unsigned int i = 0; i < 32; ++i) {
|
||||
xe::store_and_swap(&fpucontext->saved_fpu_regs[i], ctx->f[i]);
|
||||
}
|
||||
xe::store_and_swap<unsigned short>(&fpucontext->field_9E, 0xD7FF);
|
||||
}
|
||||
ctx->processor->backend()->SetGuestRoundingMode(ctx.value(), 0);
|
||||
ctx->fpscr.value = 0;
|
||||
st->field_A = 1;
|
||||
|
||||
xe::store_and_swap(&st->field_10, r13 + 0x300);
|
||||
ctx->r[3] = r3;
|
||||
|
||||
}
|
||||
|
||||
DECLARE_XBOXKRNL_EXPORT1(KeSaveFloatingPointState, kNone, kImplemented);
|
||||
#endif
|
||||
|
||||
} // namespace xboxkrnl
|
||||
} // namespace kernel
|
||||
|
|
Loading…
Reference in New Issue