Merge pull request #58 from chrisps/canary_experimental
[CPU] VKPKX Implementation, miscellaneous fixes
This commit is contained in:
commit
3ac99e0d7d
|
@ -67,6 +67,7 @@ class Backend {
|
|||
// up until the start of ctx may be used by the backend to store whatever data
|
||||
// they want
|
||||
virtual void InitializeBackendContext(void* ctx) {}
|
||||
virtual void SetGuestRoundingMode(void* ctx, unsigned int mode){};
|
||||
|
||||
protected:
|
||||
Processor* processor_ = nullptr;
|
||||
|
|
|
@ -689,8 +689,7 @@ void X64ThunkEmitter::EmitLoadNonvolatileRegs() {
|
|||
#endif
|
||||
}
|
||||
void X64Backend::InitializeBackendContext(void* ctx) {
|
||||
X64BackendContext* bctx = reinterpret_cast<X64BackendContext*>(
|
||||
reinterpret_cast<intptr_t>(ctx) - sizeof(X64BackendContext));
|
||||
X64BackendContext* bctx = BackendContextForGuestContext(ctx);
|
||||
bctx->ResolveFunction_Ptr = reinterpret_cast<void*>(&ResolveFunction);
|
||||
bctx->mxcsr_fpu =
|
||||
DEFAULT_FPU_MXCSR; // idk if this is right, check on rgh what the
|
||||
|
@ -700,6 +699,18 @@ void X64Backend::InitializeBackendContext(void* ctx) {
|
|||
// https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
|
||||
bctx->Ox1000 = 0x1000;
|
||||
}
|
||||
const uint32_t mxcsr_table[8] = {
|
||||
0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80,
|
||||
};
|
||||
|
||||
void X64Backend::SetGuestRoundingMode(void* ctx, unsigned int mode) {
|
||||
X64BackendContext* bctx = BackendContextForGuestContext(ctx);
|
||||
|
||||
uint32_t control = mode & 7;
|
||||
_mm_setcsr(mxcsr_table[control]);
|
||||
bctx->mxcsr_fpu = mxcsr_table[control];
|
||||
((ppc::PPCContext*)ctx)->fpscr.bits.rn = control;
|
||||
}
|
||||
} // namespace x64
|
||||
} // namespace backend
|
||||
} // namespace cpu
|
||||
|
|
|
@ -37,9 +37,10 @@ typedef void (*ResolveFunctionThunk)();
|
|||
// negatively index the membase reg)
|
||||
// Backend-private state that BackendContextForGuestContext locates
// sizeof(X64BackendContext) bytes below the guest context pointer.
// Emitted code addresses these fields, so the member order must not change.
struct X64BackendContext {
  void* ResolveFunction_Ptr;  // cached pointer to resolvefunction
  unsigned int mxcsr_fpu;     // currently, the way we implement rounding mode
                              // affects both vmx and the fpu
  unsigned int mxcsr_vmx;
  unsigned int flags;   // bit 0 = 0 if mxcsr is fpu, else it is vmx
  unsigned int Ox1000;  // constant 0x1000 so we can shrink each tail emitted
                        // add of it by... 2 bytes lol
};
|
||||
|
@ -48,7 +49,7 @@ constexpr unsigned int DEFAULT_VMX_MXCSR =
|
|||
0x0040 | (_MM_MASK_MASK); // default rounding mode for vmx
|
||||
|
||||
constexpr unsigned int DEFAULT_FPU_MXCSR = 0x1F80;
|
||||
|
||||
extern const uint32_t mxcsr_table[8];
|
||||
class X64Backend : public Backend {
|
||||
public:
|
||||
static const uint32_t kForceReturnAddress = 0x9FFF0000u;
|
||||
|
@ -85,6 +86,12 @@ class X64Backend : public Backend {
|
|||
void UninstallBreakpoint(Breakpoint* breakpoint) override;
|
||||
virtual void InitializeBackendContext(void* ctx) override;
|
||||
|
||||
// Returns the backend context belonging to a guest context: the address
// sizeof(X64BackendContext) bytes below ctx.
X64BackendContext* BackendContextForGuestContext(void* ctx) {
  const intptr_t guest_address = reinterpret_cast<intptr_t>(ctx);
  return reinterpret_cast<X64BackendContext*>(guest_address -
                                              sizeof(X64BackendContext));
}
|
||||
virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override;
|
||||
|
||||
private:
|
||||
static bool ExceptionCallbackThunk(Exception* ex, void* data);
|
||||
bool ExceptionCallback(Exception* ex);
|
||||
|
|
|
@ -50,6 +50,13 @@ DEFINE_bool(resolve_rel32_guest_calls, true,
|
|||
"Experimental optimization, directly call already resolved "
|
||||
"functions via x86 rel32 call/jmp",
|
||||
"CPU");
|
||||
|
||||
DEFINE_bool(enable_incorrect_roundingmode_behavior, false,
|
||||
"Disables the FPU/VMX MXCSR sharing workaround, potentially "
|
||||
"causing incorrect rounding behavior and denormal handling in VMX "
|
||||
"code. The workaround may cause reduced CPU performance but is a "
|
||||
"more accurate emulation",
|
||||
"x64");
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
namespace backend {
|
||||
|
@ -1374,7 +1381,7 @@ Xbyak::Label& X64Emitter::NewCachedLabel() {
|
|||
return *tmp;
|
||||
}
|
||||
|
||||
template<bool switching_to_fpu>
|
||||
template <bool switching_to_fpu>
|
||||
static void ChangeMxcsrModeDynamicHelper(X64Emitter& e) {
|
||||
auto flags = e.GetBackendFlagsPtr();
|
||||
if (switching_to_fpu) {
|
||||
|
@ -1398,13 +1405,17 @@ static void ChangeMxcsrModeDynamicHelper(X64Emitter& e) {
|
|||
e.jc(reload_bailout,
|
||||
X64Emitter::T_NEAR); // if carry flag was set, we were VMX mxcsr mode.
|
||||
} else {
|
||||
e.jnc(reload_bailout,
|
||||
e.jnc(
|
||||
reload_bailout,
|
||||
X64Emitter::T_NEAR); // if carry flag was set, we were VMX mxcsr mode.
|
||||
}
|
||||
e.L(come_back);
|
||||
}
|
||||
|
||||
bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
|
||||
if (cvars::enable_incorrect_roundingmode_behavior) {
|
||||
return false; // no MXCSR mode handling!
|
||||
}
|
||||
if (new_mode == mxcsr_mode_) {
|
||||
return false;
|
||||
}
|
||||
|
@ -1421,7 +1432,8 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
|
|||
} else {
|
||||
assert_unhandled_case(new_mode);
|
||||
}
|
||||
} else { //even if already set, we still need to update flags to reflect our mode
|
||||
} else { // even if already set, we still need to update flags to reflect
|
||||
// our mode
|
||||
if (new_mode == MXCSRMode::Fpu) {
|
||||
btr(GetBackendFlagsPtr(), 0);
|
||||
} else if (new_mode == MXCSRMode::Vmx) {
|
||||
|
@ -1434,7 +1446,6 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
|
|||
mxcsr_mode_ = new_mode;
|
||||
if (!already_set) {
|
||||
if (new_mode == MXCSRMode::Fpu) {
|
||||
|
||||
LoadFpuMxcsrDirect();
|
||||
btr(GetBackendFlagsPtr(), 0);
|
||||
return true;
|
||||
|
|
|
@ -23,6 +23,10 @@ DEFINE_bool(
|
|||
elide_e0_check, false,
|
||||
"Eliminate e0 check on some memory accesses, like to r13(tls) or r1(sp)",
|
||||
"CPU");
|
||||
DEFINE_bool(enable_rmw_context_merging, false,
|
||||
"Permit merging read-modify-write HIR instr sequences together "
|
||||
"into x86 instructions that use a memory operand.",
|
||||
"x64");
|
||||
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
|
@ -88,6 +92,9 @@ struct LoadModStoreContext : public LoadModStore {
|
|||
};
|
||||
static bool GetLoadModStoreContext(const hir::Instr* loadinsn,
|
||||
LoadModStoreContext* out) {
|
||||
if (!cvars::enable_rmw_context_merging) {
|
||||
return false;
|
||||
}
|
||||
if (!GetLoadModStore(loadinsn, out)) {
|
||||
return false;
|
||||
}
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -10,11 +10,13 @@
|
|||
#ifndef XENIA_CPU_BACKEND_X64_X64_SEQUENCES_H_
|
||||
#define XENIA_CPU_BACKEND_X64_X64_SEQUENCES_H_
|
||||
|
||||
#include "xenia/base/logging.h"
|
||||
#include "xenia/cpu/hir/instr.h"
|
||||
|
||||
#include <unordered_map>
|
||||
// Reports a sequence that should never be reached: fires assert_always and
// logs an error naming the sequence. The body is wrapped in do/while(0) so
// both statements act as one (for example under an unbraced if); the caller
// supplies the trailing semicolon.
#define assert_impossible_sequence(name)            \
  do {                                              \
    assert_always("impossible sequence hit" #name); \
    XELOGE("impossible sequence hit: {}", #name);   \
  } while (0)
|
||||
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
|
|
|
@ -20,7 +20,9 @@
|
|||
DEFINE_bool(inline_mmio_access, true, "Inline constant MMIO loads and stores.",
|
||||
"CPU");
|
||||
|
||||
DEFINE_bool(permit_float_constant_evaluation, false, "Allow float constant evaluation, may produce incorrect results and break games math",
|
||||
DEFINE_bool(permit_float_constant_evaluation, false,
|
||||
"Allow float constant evaluation, may produce incorrect results "
|
||||
"and break games math",
|
||||
"CPU");
|
||||
|
||||
namespace xe {
|
||||
|
@ -557,6 +559,12 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
v->Div(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
|
||||
i->Remove();
|
||||
result = true;
|
||||
} else if (!i->src2.value->MaybeFloaty() &&
|
||||
i->src2.value->IsConstantZero()) {
|
||||
// division by 0 == 0 every time,
|
||||
v->set_zero(i->src2.value->type);
|
||||
i->Remove();
|
||||
result = true;
|
||||
} else if (i->src2.value->IsConstant()) {
|
||||
// Division by one = no-op.
|
||||
Value* src1 = i->src1.value;
|
||||
|
@ -672,6 +680,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
}
|
||||
break;
|
||||
case OPCODE_SHL:
|
||||
if (i->dest->type != VEC128_TYPE) {
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Shl(i->src2.value);
|
||||
|
@ -683,8 +692,10 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
i->set_src1(src1);
|
||||
result = true;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case OPCODE_SHR:
|
||||
if (i->dest->type != VEC128_TYPE) {
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Shr(i->src2.value);
|
||||
|
@ -696,6 +707,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
i->set_src1(src1);
|
||||
result = true;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case OPCODE_SHA:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
|
@ -729,7 +741,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
result = true;
|
||||
}
|
||||
break;
|
||||
|
||||
#if 1
|
||||
case OPCODE_PERMUTE: {
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
|
||||
i->src3.value->IsConstant() &&
|
||||
|
@ -756,6 +768,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
case OPCODE_INSERT:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
|
||||
i->src3.value->IsConstant()) {
|
||||
|
|
|
@ -83,6 +83,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
iter_result |= SimplifyBitArith(builder);
|
||||
iter_result |= EliminateConversions(builder);
|
||||
iter_result |= SimplifyAssignments(builder);
|
||||
iter_result |= SimplifyBasicArith(builder);
|
||||
|
||||
result |= iter_result;
|
||||
} while (iter_result);
|
||||
|
@ -1228,6 +1229,91 @@ Value* SimplificationPass::CheckValue(Value* value, bool& result) {
|
|||
return value;
|
||||
}
|
||||
|
||||
// Rewrites an ADD whose one operand is defined by `x << c` (c constant) and
// whose other operand equals x into a single multiply: x * ((1 << c) + 1).
//
//   example: (x << 1) + x == (x * 3)
//
// i       - the ADD instruction to inspect; replaced in place on success.
// builder - used to allocate the constant multiplier value.
//
// Returns true if the instruction was rewritten, false if it was left alone.
// NOTE(review): SimplifyBasicArith rejects MaybeFloaty destinations before
// calling this; other callers are presumably expected to do the same.
bool SimplificationPass::SimplifyAddArith(hir::Instr* i,
                                          hir::HIRBuilder* builder) {
  auto [shlinsn, addend] =
      i->BinaryValueArrangeByDefiningOpcode(&OPCODE_SHL_info);
  if (!shlinsn) {
    return false;
  }
  Instr* shift_insn = shlinsn->def;

  Value* shift = shift_insn->src2.value;

  // if not a constant shift, we cant combine to a multiply
  if (!shift->IsConstant()) {
    return false;
  }

  // 1ULL << n is undefined for n >= 64, so such shifts cannot be turned into
  // a multiplier here; leave the instruction untouched.
  const uint8_t shift_amount = shift->constant.u8;
  if (shift_amount >= 64) {
    return false;
  }

  // the shifted value must be the very value that is being added
  Value* shouldbeaddend = shift_insn->src1.value;

  if (!shouldbeaddend->IsEqual(addend)) {
    return false;
  }

  // (x << c) + x == x * (2^c + 1)
  uint64_t multiplier = 1ULL << shift_amount;

  multiplier++;

  hir::Value* oldvalue = shouldbeaddend;

  i->Replace(&OPCODE_MUL_info, ARITHMETIC_UNSIGNED);
  i->set_src1(oldvalue);

  // this sequence needs to be broken out into some kind of LoadConstant(type,
  // raw_value) method of hirbuilder
  auto constmul = builder->AllocValue(oldvalue->type);
  constmul->flags |= VALUE_IS_CONSTANT;
  // the full 64-bit field is written regardless of the value's type, which
  // could cause problems on big endian targets...
  constmul->constant.u64 = multiplier;

  i->set_src2(constmul);

  return true;
}
|
||||
|
||||
// Placeholder for SUB simplifications: no rewrite is attempted, so this
// always reports that nothing changed.
bool SimplificationPass::SimplifySubArith(hir::Instr* i,
                                          hir::HIRBuilder* builder) {
  return false;
}
|
||||
// Tries the basic arithmetic rewrites on a single instruction.
// Instructions without a destination, or whose destination MaybeFloaty(),
// are skipped. ADD and SUB are forwarded to their dedicated helpers; every
// other opcode is left alone. Returns true if the instruction was changed.
bool SimplificationPass::SimplifyBasicArith(hir::Instr* i,
                                            hir::HIRBuilder* builder) {
  if (!i->dest || i->dest->MaybeFloaty()) {
    return false;
  }

  const hir::Opcode opcode_num = i->GetOpcodeNum();

  if (opcode_num == OPCODE_ADD) {
    return SimplifyAddArith(i, builder);
  }
  if (opcode_num == OPCODE_SUB) {
    return SimplifySubArith(i, builder);
  }
  return false;
}
|
||||
// Runs the per-instruction basic arithmetic simplification over every
// instruction of every block, in list order. Returns true if at least one
// instruction was rewritten.
bool SimplificationPass::SimplifyBasicArith(hir::HIRBuilder* builder) {
  bool any_changed = false;
  for (auto block = builder->first_block(); block; block = block->next) {
    for (auto instr = block->instr_head; instr; instr = instr->next) {
      any_changed |= SimplifyBasicArith(instr, builder);
    }
  }
  return any_changed;
}
|
||||
} // namespace passes
|
||||
} // namespace compiler
|
||||
} // namespace cpu
|
||||
|
|
|
@ -32,6 +32,13 @@ class SimplificationPass : public ConditionalGroupSubpass {
|
|||
bool SimplifyAssignments(hir::HIRBuilder* builder);
|
||||
hir::Value* CheckValue(hir::Value* value, bool& result);
|
||||
bool SimplifyBitArith(hir::HIRBuilder* builder);
|
||||
|
||||
// handles simple multiplication/addition rules
|
||||
bool SimplifyBasicArith(hir::HIRBuilder* builder);
|
||||
bool SimplifyBasicArith(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
|
||||
bool SimplifyAddArith(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
bool SimplifySubArith(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
// handle either or or xor with 0
|
||||
bool CheckOrXorZero(hir::Instr* i);
|
||||
bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
|
|
|
@ -79,6 +79,10 @@ class Instr {
|
|||
void MoveBefore(Instr* other);
|
||||
void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags);
|
||||
void Remove();
|
||||
const OpcodeInfo* GetOpcodeInfo() const { return opcode; }
|
||||
// if opcode is null, we have bigger problems
|
||||
Opcode GetOpcodeNum() const { return GetOpcodeInfo()->num; }
|
||||
|
||||
template <typename TPredicate>
|
||||
std::pair<Value*, Value*> BinaryValueArrangeByPredicateExclusive(
|
||||
TPredicate&& pred) {
|
||||
|
@ -86,12 +90,13 @@ class Instr {
|
|||
auto src2_value = src2.value;
|
||||
if (!src1_value || !src2_value) return {nullptr, nullptr};
|
||||
|
||||
if (!opcode) return {nullptr, nullptr}; // impossible!
|
||||
if (!GetOpcodeInfo()) return {nullptr, nullptr}; // impossible!
|
||||
|
||||
// check if binary opcode taking two values. we dont care if the dest is a
|
||||
// value
|
||||
|
||||
if (!IsOpcodeBinaryValue(opcode->signature)) return {nullptr, nullptr};
|
||||
if (!IsOpcodeBinaryValue(GetOpcodeInfo()->signature))
|
||||
return {nullptr, nullptr};
|
||||
|
||||
if (pred(src1_value)) {
|
||||
if (pred(src2_value)) {
|
||||
|
@ -119,7 +124,7 @@ if both are constant, return nullptr, nullptr
|
|||
std::pair<Value*, Value*> BinaryValueArrangeByDefiningOpcode(
|
||||
const OpcodeInfo* op_ptr) {
|
||||
return BinaryValueArrangeByPredicateExclusive([op_ptr](Value* value) {
|
||||
return value->def && value->def->opcode == op_ptr;
|
||||
return value->def && value->def->GetOpcodeInfo() == op_ptr;
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -143,7 +148,7 @@ if both are constant, return nullptr, nullptr
|
|||
*/
|
||||
template <typename TCallable>
|
||||
void VisitValueOperands(TCallable&& call_for_values) {
|
||||
uint32_t signature = opcode->signature;
|
||||
uint32_t signature = GetOpcodeInfo()->signature;
|
||||
|
||||
OpcodeSignatureType t_dest, t_src1, t_src2, t_src3;
|
||||
|
||||
|
|
|
@ -199,7 +199,7 @@ void Value::Truncate(TypeName target_type) {
|
|||
return;
|
||||
}
|
||||
}
|
||||
//WARNING: this does not handle rounding flags at all!
|
||||
// WARNING: this does not handle rounding flags at all!
|
||||
void Value::Convert(TypeName target_type, RoundMode round_mode) {
|
||||
switch (type) {
|
||||
case FLOAT32_TYPE:
|
||||
|
@ -428,35 +428,57 @@ void Value::MulHi(Value* other, bool is_unsigned) {
|
|||
}
|
||||
}
|
||||
|
||||
// Unsigned integer division that yields 0 for a zero divisor instead of
// faulting on the host.
template <typename T>
static T PPCUDiv(T numer, T denom) {
  if (!denom) {
    return 0;
  }
  return numer / denom;
}

// Signed integer division that yields 0 for the two cases the host cannot
// divide: a zero divisor, and the most negative value divided by -1 (signed
// overflow).
template <typename T>
static T PPCIDiv(T numer, T denom) {
  // Build the sign-bit pattern with an unsigned shift: shifting a signed 1
  // into the sign bit is not portable before C++20.
  const T sign_bit = static_cast<T>(1ULL << ((sizeof(T) * CHAR_BIT) - 1));

  if (!denom) {
    return 0;
  }
  if (numer == sign_bit && denom == static_cast<T>(-1)) {
    // numer is only the sign bit and denom is all ones: signed overflow
    return 0;
  }
  return numer / denom;
}
|
||||
|
||||
// note: the integer cases below go through PPCUDiv / PPCIDiv, which return 0
// for a zero divisor (and for signed overflow) instead of faulting on the host
|
||||
void Value::Div(Value* other, bool is_unsigned) {
|
||||
assert_true(type == other->type);
|
||||
switch (type) {
|
||||
case INT8_TYPE:
|
||||
if (is_unsigned) {
|
||||
constant.i8 /= uint8_t(other->constant.i8);
|
||||
constant.i8 = PPCUDiv<uint8_t>(constant.i8, other->constant.i8);
|
||||
} else {
|
||||
constant.i8 /= other->constant.i8;
|
||||
constant.i8 = PPCIDiv<int8_t>(constant.i8, other->constant.i8);
|
||||
}
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
if (is_unsigned) {
|
||||
constant.i16 /= uint16_t(other->constant.i16);
|
||||
constant.i16 = PPCUDiv<uint16_t>(constant.i16, other->constant.i16);
|
||||
} else {
|
||||
constant.i16 /= other->constant.i16;
|
||||
constant.i16 = PPCIDiv<int16_t>(constant.i16, other->constant.i16);
|
||||
}
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
if (is_unsigned) {
|
||||
constant.i32 /= uint32_t(other->constant.i32);
|
||||
constant.i32 = PPCUDiv<uint32_t>(constant.i32, other->constant.i32);
|
||||
} else {
|
||||
constant.i32 /= other->constant.i32;
|
||||
constant.i32 = PPCIDiv<int32_t>(constant.i32, other->constant.i32);
|
||||
}
|
||||
break;
|
||||
case INT64_TYPE:
|
||||
if (is_unsigned) {
|
||||
constant.i64 /= uint64_t(other->constant.i64);
|
||||
constant.i64 = PPCUDiv<uint64_t>(constant.i64, other->constant.i64);
|
||||
} else {
|
||||
constant.i64 /= other->constant.i64;
|
||||
constant.i64 = PPCIDiv<int64_t>(constant.i64, other->constant.i64);
|
||||
}
|
||||
break;
|
||||
case FLOAT32_TYPE:
|
||||
|
|
|
@ -364,12 +364,11 @@ int InstrEmit_mfvscr(PPCHIRBuilder& f, const InstrData& i) {
|
|||
|
||||
int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) {
|
||||
// is this the right format?
|
||||
//todo: what mtvscr does with the unused bits is implementation defined, figure out what it does
|
||||
|
||||
// todo: what mtvscr does with the unused bits is implementation defined,
|
||||
// figure out what it does
|
||||
|
||||
Value* v = f.LoadVR(i.VX128_1.RB);
|
||||
|
||||
|
||||
Value* has_njm_value = f.Extract(v, (uint8_t)3, INT32_TYPE);
|
||||
|
||||
f.SetNJM(f.IsTrue(f.And(has_njm_value, f.LoadConstantInt32(65536))));
|
||||
|
@ -1824,9 +1823,38 @@ int InstrEmit_vsum4ubs(PPCHIRBuilder& f, const InstrData& i) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
// Reduces every 32-bit lane of `input` to a 16-bit 1:5:5:5 pixel held in the
// lane's low bits (argb8888 -> 1 bit alpha, 5 bit red, 5 bit green, 5 bit
// blue). Each field is produced by shifting the lane right and masking.
static Value* vkpkx_in_low(PPCHIRBuilder& f, Value* input) {
  auto shift_then_mask = [&f, input](unsigned shift, unsigned msk) {
    Value* shifted = f.VectorShr(input, f.LoadConstantVec128(vec128i(shift)),
                                 INT32_TYPE);
    return f.And(shifted, f.LoadConstantVec128(vec128i(msk)));
  };

  Value* alpha_red = shift_then_mask(9, 0xFC00);  // result bits 15..10
  Value* green = shift_then_mask(6, 0x3E0);       // result bits 9..5
  Value* blue = shift_then_mask(3, 0x1F);         // result bits 4..0

  Value* upper = f.Or(alpha_red, green);
  return f.Or(blue, upper);
}
|
||||
|
||||
// Emits HIR for vpkpx: both source vectors (VA, VB) are reduced per 32-bit
// lane by vkpkx_in_low, the two results are packed together with an unsigned
// 16-in-32 pack, and the packed vector is stored to VD.
// Returns 0 (the instruction is implemented).
int InstrEmit_vpkpx(PPCHIRBuilder& f, const InstrData& i) {
  // I compared the results of this against over a million randomly generated
  // sets of inputs and all compared equal

  Value* src1 = f.LoadVR(i.VX.VA);

  Value* src2 = f.LoadVR(i.VX.VB);

  Value* pck1 = vkpkx_in_low(f, src1);
  Value* pck2 = vkpkx_in_low(f, src2);

  Value* result = f.Pack(
      pck1, pck2,
      PACK_TYPE_16_IN_32 | PACK_TYPE_IN_UNSIGNED | PACK_TYPE_OUT_UNSIGNED);

  f.StoreVR(i.VX.VD, result);

  return 0;
}
|
||||
|
||||
int InstrEmit_vpkshss_(PPCHIRBuilder& f, uint32_t vd, uint32_t va,
|
||||
|
|
|
@ -336,10 +336,14 @@ int InstrEmit_mulhwx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
XEINSTRNOTIMPLEMENTED();
|
||||
return 1;
|
||||
}
|
||||
Value* ratrunc =
|
||||
f.SignExtend(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), INT64_TYPE);
|
||||
|
||||
Value* rbtrunc =
|
||||
f.SignExtend(f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), INT64_TYPE);
|
||||
|
||||
Value* v = f.Sha(f.Mul(ratrunc, rbtrunc), 32);
|
||||
|
||||
Value* v = f.SignExtend(f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
|
||||
f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE)),
|
||||
INT64_TYPE);
|
||||
f.StoreGPR(i.XO.RT, v);
|
||||
if (i.XO.Rc) {
|
||||
f.UpdateCR(0, v);
|
||||
|
@ -355,10 +359,13 @@ int InstrEmit_mulhwux(PPCHIRBuilder& f, const InstrData& i) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
Value* v = f.ZeroExtend(
|
||||
f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
|
||||
f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), ARITHMETIC_UNSIGNED),
|
||||
INT64_TYPE);
|
||||
Value* ratrunc =
|
||||
f.ZeroExtend(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), INT64_TYPE);
|
||||
|
||||
Value* rbtrunc =
|
||||
f.ZeroExtend(f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), INT64_TYPE);
|
||||
|
||||
Value* v = f.Shr(f.Mul(ratrunc, rbtrunc, ARITHMETIC_UNSIGNED), 32);
|
||||
f.StoreGPR(i.XO.RT, v);
|
||||
if (i.XO.Rc) {
|
||||
f.UpdateCR(0, v);
|
||||
|
|
|
@ -89,8 +89,10 @@ int InstrEmit_fmulsx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
int InstrEmit_fresx(PPCHIRBuilder& f, const InstrData& i) {
|
||||
// frD <- 1.0 / (frB)
|
||||
|
||||
Value* v = f.Recip(f.LoadFPR(i.A.FRB));
|
||||
v = f.ToSingle(v);
|
||||
// this actually does seem to require single precision, oddly
|
||||
// more research is needed
|
||||
Value* v = f.Recip(f.Convert(f.LoadFPR(i.A.FRB), FLOAT32_TYPE));
|
||||
v = f.Convert(v, FLOAT64_TYPE); // f.ToSingle(v);
|
||||
f.StoreFPR(i.A.FRT, v);
|
||||
f.UpdateFPSCR(v, i.A.Rc);
|
||||
return 0;
|
||||
|
|
|
@ -11,9 +11,17 @@
|
|||
|
||||
#include <stddef.h>
|
||||
#include "xenia/base/assert.h"
|
||||
#include "xenia/base/cvar.h"
|
||||
#include "xenia/cpu/ppc/ppc_context.h"
|
||||
#include "xenia/cpu/ppc/ppc_hir_builder.h"
|
||||
|
||||
DEFINE_bool(
|
||||
disable_prefetch_and_cachecontrol, false,
|
||||
"Disables translating ppc prefetch/cache flush instructions to host "
|
||||
"prefetch/cacheflush instructions. This may improve performance as these "
|
||||
"instructions were written with the Xbox 360's cache in mind, and modern "
|
||||
"processors do their own automatic prefetching.",
|
||||
"CPU");
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
namespace ppc {
|
||||
|
@ -1080,28 +1088,36 @@ int InstrEmit_stfsx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
// https://randomascii.wordpress.com/2018/01/07/finding-a-cpu-design-bug-in-the-xbox-360/
|
||||
|
||||
// dcbf: emits a 128-byte store-and-flush cache control at EA(RA|0, RB),
// unless the disable_prefetch_and_cachecontrol cvar is set. Always returns 0.
int InstrEmit_dcbf(PPCHIRBuilder& f, const InstrData& i) {
  if (cvars::disable_prefetch_and_cachecontrol) {
    return 0;
  }
  Value* address = CalculateEA_0(f, i.X.RA, i.X.RB);
  f.CacheControl(address, 128,
                 CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE_AND_FLUSH);
  return 0;
}
|
||||
|
||||
// dcbst: emits a 128-byte data-store cache control at EA(RA|0, RB), unless
// the disable_prefetch_and_cachecontrol cvar is set. Always returns 0.
int InstrEmit_dcbst(PPCHIRBuilder& f, const InstrData& i) {
  if (cvars::disable_prefetch_and_cachecontrol) {
    return 0;
  }
  Value* address = CalculateEA_0(f, i.X.RA, i.X.RB);
  f.CacheControl(address, 128, CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE);
  return 0;
}
|
||||
|
||||
// dcbt: emits a 128-byte data-touch cache control at EA(RA|0, RB), unless
// the disable_prefetch_and_cachecontrol cvar is set. Always returns 0.
int InstrEmit_dcbt(PPCHIRBuilder& f, const InstrData& i) {
  if (cvars::disable_prefetch_and_cachecontrol) {
    return 0;
  }
  Value* address = CalculateEA_0(f, i.X.RA, i.X.RB);
  f.CacheControl(address, 128, CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH);
  return 0;
}
|
||||
|
||||
// dcbtst: emits a 128-byte touch-for-store cache control at EA(RA|0, RB),
// unless the disable_prefetch_and_cachecontrol cvar is set. Always returns 0.
int InstrEmit_dcbtst(PPCHIRBuilder& f, const InstrData& i) {
  if (cvars::disable_prefetch_and_cachecontrol) {
    return 0;
  }
  Value* address = CalculateEA_0(f, i.X.RA, i.X.RB);
  f.CacheControl(address, 128,
                 CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH_FOR_STORE);
  return 0;
}
|
||||
|
||||
|
|
|
@ -55,7 +55,9 @@ class PPCFrontend {
|
|||
PPCBuiltins builtins_ = {0};
|
||||
TypePool<PPCTranslator, PPCFrontend*> translator_pool_;
|
||||
};
|
||||
|
||||
// Checks the state of the global lock and sets scratch to the current MSR
|
||||
// value.
|
||||
void CheckGlobalLock(PPCContext* ppc_context, void* arg0, void* arg1);
|
||||
} // namespace ppc
|
||||
} // namespace cpu
|
||||
} // namespace xe
|
||||
|
|
|
@ -192,6 +192,21 @@ class ParamBase : public Param {
|
|||
T value_;
|
||||
};
|
||||
|
||||
class ContextParam : public Param {
|
||||
public:
|
||||
ContextParam() : Param(), ctx_(nullptr) {}
|
||||
ContextParam(PPCContext* value) : Param(), ctx_(value) {}
|
||||
ContextParam(Init& init) : Param(init), ctx_(init.ppc_context) {}
|
||||
|
||||
operator PPCContext*() const { return ctx_; }
|
||||
PPCContext* value() const { return ctx_; }
|
||||
|
||||
PPCContext* operator->() const { return ctx_; }
|
||||
|
||||
protected:
|
||||
PPCContext* ctx_;
|
||||
};
|
||||
|
||||
class PointerParam : public ParamBase<uint32_t> {
|
||||
public:
|
||||
PointerParam(Init& init) : ParamBase(init) {
|
||||
|
@ -370,6 +385,7 @@ using int_result_t = shim::ResultBase<int32_t>;
|
|||
using dword_result_t = shim::ResultBase<uint32_t>;
|
||||
using pointer_result_t = shim::ResultBase<uint32_t>;
|
||||
using X_HRESULT_result_t = shim::ResultBase<X_HRESULT>;
|
||||
using ppc_context_t = shim::ContextParam;
|
||||
|
||||
// Exported from kernel_state.cc.
|
||||
KernelState* kernel_state();
|
||||
|
@ -422,6 +438,9 @@ inline void AppendParam(StringBuffer* string_buffer, lpdouble_t param) {
|
|||
string_buffer->AppendFormat("({:G})", param.value());
|
||||
}
|
||||
}
|
||||
inline void AppendParam(StringBuffer* string_buffer, ppc_context_t param) {
|
||||
string_buffer->Append("ContextArg");
|
||||
}
|
||||
inline void AppendParam(StringBuffer* string_buffer, lpstring_t param) {
|
||||
string_buffer->AppendFormat("{:08X}", param.guest_address());
|
||||
if (param) {
|
||||
|
|
|
@ -8,12 +8,13 @@
|
|||
*/
|
||||
|
||||
#include "xenia/base/logging.h"
|
||||
#include "xenia/cpu/ppc/ppc_frontend.h"
|
||||
#include "xenia/cpu/processor.h"
|
||||
#include "xenia/kernel/kernel_state.h"
|
||||
#include "xenia/kernel/util/shim_utils.h"
|
||||
#include "xenia/kernel/xboxkrnl/xboxkrnl_private.h"
|
||||
#include "xenia/kernel/xthread.h"
|
||||
#include "xenia/xbox.h"
|
||||
|
||||
namespace xe {
|
||||
namespace kernel {
|
||||
namespace xboxkrnl {
|
||||
|
@ -22,6 +23,94 @@ void KeEnableFpuExceptions_entry(dword_t enabled) {
|
|||
// TODO(benvanik): can we do anything about exceptions?
|
||||
}
|
||||
DECLARE_XBOXKRNL_EXPORT1(KeEnableFpuExceptions, kNone, kStub);
|
||||
#if 0
|
||||
struct __declspec(align(8)) fpucontext_ptr_t {
|
||||
char unknown_data[158];
|
||||
__int16 field_9E;
|
||||
char field_A0[2272];
|
||||
unsigned __int64 saved_FPSCR;
|
||||
double saved_fpu_regs[32];
|
||||
};
|
||||
#pragma pack(push, 1)
|
||||
struct __declspec(align(1)) r13_struct_t {
|
||||
char field_0[6];
|
||||
__int16 field_6;
|
||||
char field_8[2];
|
||||
char field_A;
|
||||
char field_B[5];
|
||||
int field_10;
|
||||
char field_14[315];
|
||||
char field_14F;
|
||||
unsigned int field_150;
|
||||
char field_154[427];
|
||||
char field_2FF;
|
||||
char field_300;
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
|
||||
static uint64_t Do_mfmsr(ppc_context_t& ctx) {
|
||||
auto frontend = ctx->thread_state->processor()->frontend();
|
||||
cpu::ppc::CheckGlobalLock(
|
||||
ctx, reinterpret_cast<void*>(&xe::global_critical_region::mutex()),
|
||||
reinterpret_cast<void*>(&frontend->builtins()->global_lock_count));
|
||||
return ctx->scratch;
|
||||
}
|
||||
|
||||
void KeSaveFloatingPointState_entry(ppc_context_t& ctx) {
|
||||
xe::Memory* memory = ctx->thread_state->memory();
|
||||
unsigned int r13 = static_cast<unsigned int>(ctx->r[13]);
|
||||
|
||||
|
||||
|
||||
|
||||
r13_struct_t* st = memory->TranslateVirtual<r13_struct_t*>(r13);
|
||||
/*
|
||||
lwz r10, 0x150(r13)
|
||||
lbz r11, 0xA(r13)
|
||||
tweqi r10, 0
|
||||
twnei r11, 0
|
||||
*/
|
||||
|
||||
unsigned int r10 = st->field_150;
|
||||
unsigned char r11 = st->field_A;
|
||||
|
||||
if (r10 == 0 || r11 != 0) {
|
||||
//trap!
|
||||
}
|
||||
|
||||
//should do mfmsr here
|
||||
|
||||
unsigned int r3 = xe::load_and_swap<unsigned int>(&st->field_10);
|
||||
|
||||
//too much work to do the mfmsr/mtmsr stuff right now
|
||||
int to_store = -2049;
|
||||
xe::store_and_swap(&st->field_10, (unsigned int)to_store);
|
||||
xe::store_and_swap(&st->field_6, (short)to_store);
|
||||
|
||||
|
||||
|
||||
if (r3 != ~0u) {
|
||||
fpucontext_ptr_t* fpucontext =
|
||||
memory->TranslateVirtual<fpucontext_ptr_t*>(r3);
|
||||
xe::store_and_swap<uint64_t>(&fpucontext->saved_FPSCR, ctx->fpscr.value);
|
||||
|
||||
for (unsigned int i = 0; i < 32; ++i) {
|
||||
xe::store_and_swap(&fpucontext->saved_fpu_regs[i], ctx->f[i]);
|
||||
}
|
||||
xe::store_and_swap<unsigned short>(&fpucontext->field_9E, 0xD7FF);
|
||||
}
|
||||
ctx->processor->backend()->SetGuestRoundingMode(ctx.value(), 0);
|
||||
ctx->fpscr.value = 0;
|
||||
st->field_A = 1;
|
||||
|
||||
xe::store_and_swap(&st->field_10, r13 + 0x300);
|
||||
ctx->r[3] = r3;
|
||||
|
||||
}
|
||||
|
||||
DECLARE_XBOXKRNL_EXPORT1(KeSaveFloatingPointState, kNone, kImplemented);
|
||||
#endif
|
||||
|
||||
} // namespace xboxkrnl
|
||||
} // namespace kernel
|
||||
|
|
Loading…
Reference in New Issue