Merge pull request #58 from chrisps/canary_experimental

[CPU] VPKPX Implementation, miscellaneous fixes
This commit is contained in:
Radosław Gliński 2022-08-08 07:54:26 +02:00 committed by GitHub
commit 3ac99e0d7d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 512 additions and 477 deletions

View File

@ -67,6 +67,7 @@ class Backend {
// up until the start of ctx may be used by the backend to store whatever data
// they want
virtual void InitializeBackendContext(void* ctx) {}
virtual void SetGuestRoundingMode(void* ctx, unsigned int mode){};
protected:
Processor* processor_ = nullptr;

View File

@ -689,8 +689,7 @@ void X64ThunkEmitter::EmitLoadNonvolatileRegs() {
#endif
}
void X64Backend::InitializeBackendContext(void* ctx) {
X64BackendContext* bctx = reinterpret_cast<X64BackendContext*>(
reinterpret_cast<intptr_t>(ctx) - sizeof(X64BackendContext));
X64BackendContext* bctx = BackendContextForGuestContext(ctx);
bctx->ResolveFunction_Ptr = reinterpret_cast<void*>(&ResolveFunction);
bctx->mxcsr_fpu =
DEFAULT_FPU_MXCSR; // idk if this is right, check on rgh what the
@ -700,6 +699,18 @@ void X64Backend::InitializeBackendContext(void* ctx) {
// https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
bctx->Ox1000 = 0x1000;
}
// Host MXCSR values used by SetGuestRoundingMode, indexed by the low three
// bits of the requested mode. Every entry contains 0x1F80 (all SSE exception
// mask bits set) and differs only in MXCSR bits 13-15:
//   [0] 0x1F80  round to nearest
//   [1] 0x7F80  round toward zero
//   [2] 0x5F80  round toward +infinity
//   [3] 0x3F80  round toward -infinity
//   [4]-[7]     the same four rounding modes with flush-to-zero (0x8000) set
// NOTE(review): index bits 0-1 presumably follow the PPC FPSCR RN encoding and
// bit 2 the non-IEEE mode bit -- confirm against the callers.
const uint32_t mxcsr_table[8] = {
0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80,
};
void X64Backend::SetGuestRoundingMode(void* ctx, unsigned int mode) {
X64BackendContext* bctx = BackendContextForGuestContext(ctx);
uint32_t control = mode & 7;
_mm_setcsr(mxcsr_table[control]);
bctx->mxcsr_fpu = mxcsr_table[control];
((ppc::PPCContext*)ctx)->fpscr.bits.rn = control;
}
} // namespace x64
} // namespace backend
} // namespace cpu

View File

@ -37,9 +37,10 @@ typedef void (*ResolveFunctionThunk)();
// negatively index the membase reg)
struct X64BackendContext {
void* ResolveFunction_Ptr; // cached pointer to resolvefunction
unsigned int mxcsr_fpu; //currently, the way we implement rounding mode affects both vmx and the fpu
unsigned int mxcsr_fpu; // currently, the way we implement rounding mode
// affects both vmx and the fpu
unsigned int mxcsr_vmx;
unsigned int flags; //bit 0 = 0 if mxcsr is fpu, else it is vmx
unsigned int flags; // bit 0 = 0 if mxcsr is fpu, else it is vmx
unsigned int Ox1000; // constant 0x1000 so we can shrink each tail emitted
// add of it by... 2 bytes lol
};
@ -48,7 +49,7 @@ constexpr unsigned int DEFAULT_VMX_MXCSR =
0x0040 | (_MM_MASK_MASK); // default rounding mode for vmx
constexpr unsigned int DEFAULT_FPU_MXCSR = 0x1F80;
extern const uint32_t mxcsr_table[8];
class X64Backend : public Backend {
public:
static const uint32_t kForceReturnAddress = 0x9FFF0000u;
@ -85,6 +86,12 @@ class X64Backend : public Backend {
void UninstallBreakpoint(Breakpoint* breakpoint) override;
virtual void InitializeBackendContext(void* ctx) override;
// Returns the backend context for a guest context pointer: the address
// sizeof(X64BackendContext) bytes before |ctx|.
X64BackendContext* BackendContextForGuestContext(void* ctx) {
  const auto guest_address = reinterpret_cast<intptr_t>(ctx);
  const auto backend_address = guest_address - sizeof(X64BackendContext);
  return reinterpret_cast<X64BackendContext*>(backend_address);
}
virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override;
private:
static bool ExceptionCallbackThunk(Exception* ex, void* data);
bool ExceptionCallback(Exception* ex);

View File

@ -50,6 +50,13 @@ DEFINE_bool(resolve_rel32_guest_calls, true,
"Experimental optimization, directly call already resolved "
"functions via x86 rel32 call/jmp",
"CPU");
DEFINE_bool(enable_incorrect_roundingmode_behavior, false,
"Disables the FPU/VMX MXCSR sharing workaround, potentially "
"causing incorrect rounding behavior and denormal handling in VMX "
"code. The workaround may cause reduced CPU performance but is a "
"more accurate emulation",
"x64");
namespace xe {
namespace cpu {
namespace backend {
@ -1374,13 +1381,13 @@ Xbyak::Label& X64Emitter::NewCachedLabel() {
return *tmp;
}
template<bool switching_to_fpu>
template <bool switching_to_fpu>
static void ChangeMxcsrModeDynamicHelper(X64Emitter& e) {
auto flags = e.GetBackendFlagsPtr();
if (switching_to_fpu) {
e.btr(flags, 0); // bit 0 set to 0 = is fpu mode
} else {
e.bts(flags, 0); // bit 0 set to 1 = is vmx mode
e.bts(flags, 0); // bit 0 set to 1 = is vmx mode
}
Xbyak::Label& come_back = e.NewCachedLabel();
@ -1391,20 +1398,24 @@ static void ChangeMxcsrModeDynamicHelper(X64Emitter& e) {
e.LoadFpuMxcsrDirect();
} else {
e.LoadVmxMxcsrDirect();
}
}
e.jmp(come_back, X64Emitter::T_NEAR);
});
if (switching_to_fpu) {
e.jc(reload_bailout,
X64Emitter::T_NEAR); // if carry flag was set, we were VMX mxcsr mode.
} else {
e.jnc(reload_bailout,
X64Emitter::T_NEAR); // if carry flag was set, we were VMX mxcsr mode.
e.jnc(
reload_bailout,
X64Emitter::T_NEAR); // if carry flag was set, we were VMX mxcsr mode.
}
e.L(come_back);
}
bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
if (cvars::enable_incorrect_roundingmode_behavior) {
return false; // no MXCSR mode handling!
}
if (new_mode == mxcsr_mode_) {
return false;
}
@ -1420,21 +1431,21 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
ChangeMxcsrModeDynamicHelper<false>(*this);
} else {
assert_unhandled_case(new_mode);
}
} else { //even if already set, we still need to update flags to reflect our mode
}
} else { // even if already set, we still need to update flags to reflect
// our mode
if (new_mode == MXCSRMode::Fpu) {
btr(GetBackendFlagsPtr(), 0);
} else if (new_mode == MXCSRMode::Vmx) {
bts(GetBackendFlagsPtr(), 0);
} else {
assert_unhandled_case(new_mode);
}
}
}
}
} else {
mxcsr_mode_ = new_mode;
if (!already_set) {
if (new_mode == MXCSRMode::Fpu) {
LoadFpuMxcsrDirect();
btr(GetBackendFlagsPtr(), 0);
return true;

View File

@ -23,6 +23,10 @@ DEFINE_bool(
elide_e0_check, false,
"Eliminate e0 check on some memory accesses, like to r13(tls) or r1(sp)",
"CPU");
DEFINE_bool(enable_rmw_context_merging, false,
"Permit merging read-modify-write HIR instr sequences together "
"into x86 instructions that use a memory operand.",
"x64");
namespace xe {
namespace cpu {
@ -88,6 +92,9 @@ struct LoadModStoreContext : public LoadModStore {
};
static bool GetLoadModStoreContext(const hir::Instr* loadinsn,
LoadModStoreContext* out) {
if (!cvars::enable_rmw_context_merging) {
return false;
}
if (!GetLoadModStore(loadinsn, out)) {
return false;
}

File diff suppressed because one or more lines are too long

View File

@ -10,11 +10,13 @@
#ifndef XENIA_CPU_BACKEND_X64_X64_SEQUENCES_H_
#define XENIA_CPU_BACKEND_X64_X64_SEQUENCES_H_
#include "xenia/base/logging.h"
#include "xenia/cpu/hir/instr.h"
#include <unordered_map>
#define assert_impossible_sequence(name) \
assert_always("impossible sequence hit" #name);
#define assert_impossible_sequence(name) \
assert_always("impossible sequence hit" #name); \
XELOGE("impossible sequence hit: {}", #name)
namespace xe {
namespace cpu {

View File

@ -20,7 +20,9 @@
DEFINE_bool(inline_mmio_access, true, "Inline constant MMIO loads and stores.",
"CPU");
DEFINE_bool(permit_float_constant_evaluation, false, "Allow float constant evaluation, may produce incorrect results and break games math",
DEFINE_bool(permit_float_constant_evaluation, false,
"Allow float constant evaluation, may produce incorrect results "
"and break games math",
"CPU");
namespace xe {
@ -85,8 +87,8 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
if (i->dest) {
might_be_floatop |= i->dest->MaybeFloaty();
}
bool should_skip_because_of_float =
bool should_skip_because_of_float =
might_be_floatop && !cvars::permit_float_constant_evaluation;
auto v = i->dest;
@ -557,6 +559,12 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
v->Div(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
i->Remove();
result = true;
} else if (!i->src2.value->MaybeFloaty() &&
i->src2.value->IsConstantZero()) {
// division by 0 == 0 every time,
v->set_zero(i->src2.value->type);
i->Remove();
result = true;
} else if (i->src2.value->IsConstant()) {
// Division by one = no-op.
Value* src1 = i->src1.value;
@ -672,29 +680,33 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
}
break;
case OPCODE_SHL:
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
v->set_from(i->src1.value);
v->Shl(i->src2.value);
i->Remove();
result = true;
} else if (i->src2.value->IsConstantZero()) {
auto src1 = i->src1.value;
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(src1);
result = true;
if (i->dest->type != VEC128_TYPE) {
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
v->set_from(i->src1.value);
v->Shl(i->src2.value);
i->Remove();
result = true;
} else if (i->src2.value->IsConstantZero()) {
auto src1 = i->src1.value;
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(src1);
result = true;
}
}
break;
case OPCODE_SHR:
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
v->set_from(i->src1.value);
v->Shr(i->src2.value);
i->Remove();
result = true;
} else if (i->src2.value->IsConstantZero()) {
auto src1 = i->src1.value;
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(src1);
result = true;
if (i->dest->type != VEC128_TYPE) {
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
v->set_from(i->src1.value);
v->Shr(i->src2.value);
i->Remove();
result = true;
} else if (i->src2.value->IsConstantZero()) {
auto src1 = i->src1.value;
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(src1);
result = true;
}
}
break;
case OPCODE_SHA:
@ -729,7 +741,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
result = true;
}
break;
#if 1
case OPCODE_PERMUTE: {
if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
i->src3.value->IsConstant() &&
@ -756,6 +768,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
break;
}
#endif
case OPCODE_INSERT:
if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
i->src3.value->IsConstant()) {

View File

@ -83,6 +83,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
iter_result |= SimplifyBitArith(builder);
iter_result |= EliminateConversions(builder);
iter_result |= SimplifyAssignments(builder);
iter_result |= SimplifyBasicArith(builder);
result |= iter_result;
} while (iter_result);
@ -1228,6 +1229,91 @@ Value* SimplificationPass::CheckValue(Value* value, bool& result) {
return value;
}
// Rewrites an add of the form (x << c) + x, where c is a constant, into the
// single multiply x * ((1 << c) + 1); e.g. (x << 1) + x becomes x * 3.
// Returns true if |i| was rewritten, false if it was left untouched.
bool SimplificationPass::SimplifyAddArith(hir::Instr* i,
                                          hir::HIRBuilder* builder) {
  // Arrange the two operands as "the one defined by a SHL" and "the other
  // one". A null first element means the add does not have that shape.
  auto [shlinsn, addend] =
      i->BinaryValueArrangeByDefiningOpcode(&OPCODE_SHL_info);
  if (!shlinsn) {
    return false;
  }
  Instr* shift_insn = shlinsn->def;
  Value* shift = shift_insn->src2.value;
  // if not a constant shift, we cant combine to a multiply
  if (!shift->IsConstant()) {
    return false;
  }

  // The multiplier is computed in a 64-bit host integer. Shifting that by 64
  // or more is undefined behaviour in the compiler itself, so such
  // instructions are left alone.
  const uint8_t shift_count = shift->constant.u8;
  if (shift_count >= 64) {
    return false;
  }

  // The shifted value must be the very value being added.
  Value* shouldbeaddend = shift_insn->src1.value;
  if (!shouldbeaddend->IsEqual(addend)) {
    return false;
  }

  // (x << c) + x == x * (2^c + 1)
  uint64_t multiplier = 1ULL << shift_count;
  multiplier++;

  // Grab the operand before Replace() turns the instruction into a MUL.
  hir::Value* oldvalue = shouldbeaddend;
  i->Replace(&OPCODE_MUL_info, ARITHMETIC_UNSIGNED);
  i->set_src1(oldvalue);

  // this sequence needs to be broken out into some kind of LoadConstant(type,
  // raw_value) method of hirbuilder
  auto constmul = builder->AllocValue(oldvalue->type);
  // could cause problems on big endian targets...
  constmul->flags |= VALUE_IS_CONSTANT;
  constmul->constant.u64 = multiplier;

  i->set_src2(constmul);
  return true;
}
// Subtraction hook for the basic arithmetic simplifier. No rewrite is
// performed here: the instruction is always reported as unchanged.
bool SimplificationPass::SimplifySubArith(hir::Instr* i,
                                          hir::HIRBuilder* builder) {
  constexpr bool kChanged = false;
  return kChanged;
}
// Dispatches a single instruction to the add/sub simplifier that matches its
// opcode. Instructions without a destination, or whose destination reports
// MaybeFloaty(), are never touched. Returns true if |i| was rewritten.
bool SimplificationPass::SimplifyBasicArith(hir::Instr* i,
                                            hir::HIRBuilder* builder) {
  if (!i->dest || i->dest->MaybeFloaty()) {
    return false;
  }

  const hir::Opcode opcode_num = i->GetOpcodeNum();
  if (opcode_num == OPCODE_ADD) {
    return SimplifyAddArith(i, builder);
  }
  if (opcode_num == OPCODE_SUB) {
    return SimplifySubArith(i, builder);
  }
  return false;
}
// Runs the per-instruction basic arithmetic simplifier over every instruction
// of every block, in order. Returns true if any instruction was rewritten.
bool SimplificationPass::SimplifyBasicArith(hir::HIRBuilder* builder) {
  bool any_changed = false;
  for (auto block = builder->first_block(); block; block = block->next) {
    for (auto instr = block->instr_head; instr; instr = instr->next) {
      any_changed |= SimplifyBasicArith(instr, builder);
    }
  }
  return any_changed;
}
} // namespace passes
} // namespace compiler
} // namespace cpu

View File

@ -32,6 +32,13 @@ class SimplificationPass : public ConditionalGroupSubpass {
bool SimplifyAssignments(hir::HIRBuilder* builder);
hir::Value* CheckValue(hir::Value* value, bool& result);
bool SimplifyBitArith(hir::HIRBuilder* builder);
// handles simple multiplication/addition rules
bool SimplifyBasicArith(hir::HIRBuilder* builder);
bool SimplifyBasicArith(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifyAddArith(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifySubArith(hir::Instr* i, hir::HIRBuilder* builder);
// handle either or or xor with 0
bool CheckOrXorZero(hir::Instr* i);
bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder);

View File

@ -79,6 +79,10 @@ class Instr {
void MoveBefore(Instr* other);
void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags);
void Remove();
// Returns the opcode descriptor this instruction currently holds.
const OpcodeInfo* GetOpcodeInfo() const {
  return this->opcode;
}
// Returns the numeric opcode taken from the descriptor. The descriptor is
// dereferenced unchecked: if opcode is null, we have bigger problems.
Opcode GetOpcodeNum() const {
  const OpcodeInfo* const info = GetOpcodeInfo();
  return info->num;
}
template <typename TPredicate>
std::pair<Value*, Value*> BinaryValueArrangeByPredicateExclusive(
TPredicate&& pred) {
@ -86,12 +90,13 @@ class Instr {
auto src2_value = src2.value;
if (!src1_value || !src2_value) return {nullptr, nullptr};
if (!opcode) return {nullptr, nullptr}; // impossible!
if (!GetOpcodeInfo()) return {nullptr, nullptr}; // impossible!
// check if binary opcode taking two values. we dont care if the dest is a
// value
if (!IsOpcodeBinaryValue(opcode->signature)) return {nullptr, nullptr};
if (!IsOpcodeBinaryValue(GetOpcodeInfo()->signature))
return {nullptr, nullptr};
if (pred(src1_value)) {
if (pred(src2_value)) {
@ -119,7 +124,7 @@ if both are constant, return nullptr, nullptr
std::pair<Value*, Value*> BinaryValueArrangeByDefiningOpcode(
const OpcodeInfo* op_ptr) {
return BinaryValueArrangeByPredicateExclusive([op_ptr](Value* value) {
return value->def && value->def->opcode == op_ptr;
return value->def && value->def->GetOpcodeInfo() == op_ptr;
});
}
@ -143,7 +148,7 @@ if both are constant, return nullptr, nullptr
*/
template <typename TCallable>
void VisitValueOperands(TCallable&& call_for_values) {
uint32_t signature = opcode->signature;
uint32_t signature = GetOpcodeInfo()->signature;
OpcodeSignatureType t_dest, t_src1, t_src2, t_src3;

View File

@ -199,7 +199,7 @@ void Value::Truncate(TypeName target_type) {
return;
}
}
//WARNING: this does not handle rounding flags at all!
// WARNING: this does not handle rounding flags at all!
void Value::Convert(TypeName target_type, RoundMode round_mode) {
switch (type) {
case FLOAT32_TYPE:
@ -428,35 +428,57 @@ void Value::MulHi(Value* other, bool is_unsigned) {
}
}
// Unsigned division for constant folding: a zero divisor yields 0 instead of
// faulting on the host; every other input is a plain numer / denom.
template <typename T>
static T PPCUDiv(T numer, T denom) {
  if (denom) {
    return numer / denom;
  }
  return 0;
}
// Signed division for constant folding. The two inputs that would fault on
// the host both yield 0: a zero divisor, and the overflowing case where the
// numerator has only its sign bit set and the divisor is all ones (-1).
template <typename T>
static T PPCIDiv(T numer, T denom) {
  // Value of T with only the most significant bit set.
  const T sign_bit_only = static_cast<T>(1LL << ((sizeof(T) * CHAR_BIT) - 1));
  const bool divide_by_zero = !denom;
  const bool signed_overflow = numer == sign_bit_only && !~denom;
  if (divide_by_zero || signed_overflow) {
    return 0;
  }
  return numer / denom;
}
// warning : we tolerate division by 0 in x64_sequences, but here we do not
void Value::Div(Value* other, bool is_unsigned) {
assert_true(type == other->type);
switch (type) {
case INT8_TYPE:
if (is_unsigned) {
constant.i8 /= uint8_t(other->constant.i8);
constant.i8 = PPCUDiv<uint8_t>(constant.i8, other->constant.i8);
} else {
constant.i8 /= other->constant.i8;
constant.i8 = PPCIDiv<int8_t>(constant.i8, other->constant.i8);
}
break;
case INT16_TYPE:
if (is_unsigned) {
constant.i16 /= uint16_t(other->constant.i16);
constant.i16 = PPCUDiv<uint16_t>(constant.i16, other->constant.i16);
} else {
constant.i16 /= other->constant.i16;
constant.i16 = PPCIDiv<int16_t>(constant.i16, other->constant.i16);
}
break;
case INT32_TYPE:
if (is_unsigned) {
constant.i32 /= uint32_t(other->constant.i32);
constant.i32 = PPCUDiv<uint32_t>(constant.i32, other->constant.i32);
} else {
constant.i32 /= other->constant.i32;
constant.i32 = PPCIDiv<int32_t>(constant.i32, other->constant.i32);
}
break;
case INT64_TYPE:
if (is_unsigned) {
constant.i64 /= uint64_t(other->constant.i64);
constant.i64 = PPCUDiv<uint64_t>(constant.i64, other->constant.i64);
} else {
constant.i64 /= other->constant.i64;
constant.i64 = PPCIDiv<int64_t>(constant.i64, other->constant.i64);
}
break;
case FLOAT32_TYPE:

View File

@ -364,12 +364,11 @@ int InstrEmit_mfvscr(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) {
// is this the right format?
//todo: what mtvscr does with the unused bits is implementation defined, figure out what it does
// todo: what mtvscr does with the unused bits is implementation defined,
// figure out what it does
Value* v = f.LoadVR(i.VX128_1.RB);
Value* has_njm_value = f.Extract(v, (uint8_t)3, INT32_TYPE);
f.SetNJM(f.IsTrue(f.And(has_njm_value, f.LoadConstantInt32(65536))));
@ -1824,9 +1823,38 @@ int InstrEmit_vsum4ubs(PPCHIRBuilder& f, const InstrData& i) {
return 1;
}
// Reduces every 32-bit argb8888 lane of |input| to a 1:5:5:5 value (1 bit
// alpha, 5 bits each of red, green and blue) held in the lane's low 16 bits.
static Value* vkpkx_in_low(PPCHIRBuilder& f, Value* input) {
  // Shifts each 32-bit lane right by |shift|, then keeps only the |mask| bits.
  auto extract_field = [&f, input](unsigned shift, unsigned mask) {
    auto shifted =
        f.VectorShr(input, f.LoadConstantVec128(vec128i(shift)), INT32_TYPE);
    return f.And(shifted, f.LoadConstantVec128(vec128i(mask)));
  };

  auto alpha_and_red = extract_field(9, 0xFC00);  // result bits 15..10
  auto green = extract_field(6, 0x3E0);           // result bits 9..5
  auto blue = extract_field(3, 0x1F);             // result bits 4..0
  return f.Or(blue, f.Or(alpha_and_red, green));
}
int InstrEmit_vpkpx(PPCHIRBuilder& f, const InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
// I compared the results of this against over a million randomly generated
// sets of inputs and all compared equal
Value* src1 = f.LoadVR(i.VX.VA);
Value* src2 = f.LoadVR(i.VX.VB);
Value* pck1 = vkpkx_in_low(f, src1);
Value* pck2 = vkpkx_in_low(f, src2);
Value* result = f.Pack(
pck1, pck2,
PACK_TYPE_16_IN_32 | PACK_TYPE_IN_UNSIGNED | PACK_TYPE_OUT_UNSIGNED);
f.StoreVR(i.VX.VD, result);
return 0;
}
int InstrEmit_vpkshss_(PPCHIRBuilder& f, uint32_t vd, uint32_t va,

View File

@ -336,10 +336,14 @@ int InstrEmit_mulhwx(PPCHIRBuilder& f, const InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
}
Value* ratrunc =
f.SignExtend(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), INT64_TYPE);
Value* rbtrunc =
f.SignExtend(f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), INT64_TYPE);
Value* v = f.Sha(f.Mul(ratrunc, rbtrunc), 32);
Value* v = f.SignExtend(f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE)),
INT64_TYPE);
f.StoreGPR(i.XO.RT, v);
if (i.XO.Rc) {
f.UpdateCR(0, v);
@ -355,10 +359,13 @@ int InstrEmit_mulhwux(PPCHIRBuilder& f, const InstrData& i) {
return 1;
}
Value* v = f.ZeroExtend(
f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), ARITHMETIC_UNSIGNED),
INT64_TYPE);
Value* ratrunc =
f.ZeroExtend(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), INT64_TYPE);
Value* rbtrunc =
f.ZeroExtend(f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), INT64_TYPE);
Value* v = f.Shr(f.Mul(ratrunc, rbtrunc, ARITHMETIC_UNSIGNED), 32);
f.StoreGPR(i.XO.RT, v);
if (i.XO.Rc) {
f.UpdateCR(0, v);

View File

@ -89,8 +89,10 @@ int InstrEmit_fmulsx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fresx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- 1.0 / (frB)
Value* v = f.Recip(f.LoadFPR(i.A.FRB));
v = f.ToSingle(v);
// this actually does seem to require single precision, oddly
// more research is needed
Value* v = f.Recip(f.Convert(f.LoadFPR(i.A.FRB), FLOAT32_TYPE));
v = f.Convert(v, FLOAT64_TYPE); // f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;

View File

@ -11,9 +11,17 @@
#include <stddef.h>
#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/cpu/ppc/ppc_context.h"
#include "xenia/cpu/ppc/ppc_hir_builder.h"
DEFINE_bool(
disable_prefetch_and_cachecontrol, false,
"Disables translating ppc prefetch/cache flush instructions to host "
"prefetch/cacheflush instructions. This may improve performance as these "
"instructions were written with the Xbox 360's cache in mind, and modern "
"processors do their own automatic prefetching.",
"CPU");
namespace xe {
namespace cpu {
namespace ppc {
@ -1080,28 +1088,36 @@ int InstrEmit_stfsx(PPCHIRBuilder& f, const InstrData& i) {
// https://randomascii.wordpress.com/2018/01/07/finding-a-cpu-design-bug-in-the-xbox-360/
int InstrEmit_dcbf(PPCHIRBuilder& f, const InstrData& i) {
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
f.CacheControl(ea, 128,
CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE_AND_FLUSH);
if (!cvars::disable_prefetch_and_cachecontrol) {
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
f.CacheControl(ea, 128,
CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE_AND_FLUSH);
}
return 0;
}
int InstrEmit_dcbst(PPCHIRBuilder& f, const InstrData& i) {
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
f.CacheControl(ea, 128, CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE);
if (!cvars::disable_prefetch_and_cachecontrol) {
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
f.CacheControl(ea, 128, CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE);
}
return 0;
}
int InstrEmit_dcbt(PPCHIRBuilder& f, const InstrData& i) {
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
f.CacheControl(ea, 128, CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH);
if (!cvars::disable_prefetch_and_cachecontrol) {
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
f.CacheControl(ea, 128, CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH);
}
return 0;
}
int InstrEmit_dcbtst(PPCHIRBuilder& f, const InstrData& i) {
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
f.CacheControl(ea, 128,
CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH_FOR_STORE);
if (!cvars::disable_prefetch_and_cachecontrol) {
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
f.CacheControl(ea, 128,
CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH_FOR_STORE);
}
return 0;
}

View File

@ -55,7 +55,9 @@ class PPCFrontend {
PPCBuiltins builtins_ = {0};
TypePool<PPCTranslator, PPCFrontend*> translator_pool_;
};
// Checks the state of the global lock and sets scratch to the current MSR
// value.
void CheckGlobalLock(PPCContext* ppc_context, void* arg0, void* arg1);
} // namespace ppc
} // namespace cpu
} // namespace xe

View File

@ -192,6 +192,21 @@ class ParamBase : public Param {
T value_;
};
class ContextParam : public Param {
public:
ContextParam() : Param(), ctx_(nullptr) {}
ContextParam(PPCContext* value) : Param(), ctx_(value) {}
ContextParam(Init& init) : Param(init), ctx_(init.ppc_context) {}
operator PPCContext*() const { return ctx_; }
PPCContext* value() const { return ctx_; }
PPCContext* operator->() const { return ctx_; }
protected:
PPCContext* ctx_;
};
class PointerParam : public ParamBase<uint32_t> {
public:
PointerParam(Init& init) : ParamBase(init) {
@ -370,6 +385,7 @@ using int_result_t = shim::ResultBase<int32_t>;
using dword_result_t = shim::ResultBase<uint32_t>;
using pointer_result_t = shim::ResultBase<uint32_t>;
using X_HRESULT_result_t = shim::ResultBase<X_HRESULT>;
using ppc_context_t = shim::ContextParam;
// Exported from kernel_state.cc.
KernelState* kernel_state();
@ -422,6 +438,9 @@ inline void AppendParam(StringBuffer* string_buffer, lpdouble_t param) {
string_buffer->AppendFormat("({:G})", param.value());
}
}
// Appends a fixed placeholder for a context parameter; |param| itself is not
// inspected.
inline void AppendParam(StringBuffer* string_buffer, ppc_context_t param) {
  static_cast<void>(param);
  string_buffer->Append("ContextArg");
}
inline void AppendParam(StringBuffer* string_buffer, lpstring_t param) {
string_buffer->AppendFormat("{:08X}", param.guest_address());
if (param) {

View File

@ -8,12 +8,13 @@
*/
#include "xenia/base/logging.h"
#include "xenia/cpu/ppc/ppc_frontend.h"
#include "xenia/cpu/processor.h"
#include "xenia/kernel/kernel_state.h"
#include "xenia/kernel/util/shim_utils.h"
#include "xenia/kernel/xboxkrnl/xboxkrnl_private.h"
#include "xenia/kernel/xthread.h"
#include "xenia/xbox.h"
namespace xe {
namespace kernel {
namespace xboxkrnl {
@ -22,6 +23,94 @@ void KeEnableFpuExceptions_entry(dword_t enabled) {
// TODO(benvanik): can we do anything about exceptions?
}
DECLARE_XBOXKRNL_EXPORT1(KeEnableFpuExceptions, kNone, kStub);
#if 0
// Guest-memory layout of the FPU save area filled in by
// KeSaveFloatingPointState_entry. The numbered field names give the byte
// offset of the field (158 == 0x9E, 158 + 2 == 0xA0).
struct __declspec(align(8)) fpucontext_ptr_t {
// Bytes 0x00-0x9D; not interpreted by the visible code.
char unknown_data[158];
// 16-bit field at 0x9E; the save routine stores 0xD7FF here.
__int16 field_9E;
// Bytes from 0xA0 up to saved_FPSCR; not interpreted by the visible code.
char field_A0[2272];
// Receives ctx->fpscr.value (byte-swapped) on save.
unsigned __int64 saved_FPSCR;
// Receives ctx->f[0..31] (byte-swapped) on save.
double saved_fpu_regs[32];
};
#pragma pack(push, 1)
// Guest-memory layout of the structure addressed by guest register r13, as
// far as KeSaveFloatingPointState_entry needs it. Declared under pack(1), so
// each numbered field name is also the field's byte offset.
// NOTE(review): the meaning of the individual fields is not established by
// the code here; only field_6, field_A, field_10 and field_150 are accessed,
// and the guest address r13 + 0x300 (field_300) is stored into field_10.
struct __declspec(align(1)) r13_struct_t {
char field_0[6];
// 16-bit field at 0x6; overwritten with (short)-2049 on save.
__int16 field_6;
char field_8[2];
// Byte at 0xA; must be 0 on entry to the save routine, set to 1 on exit.
char field_A;
char field_B[5];
// 32-bit field at 0x10; read (byte-swapped) as a guest address of an
// fpucontext_ptr_t, where ~0u is treated as "no save area".
int field_10;
char field_14[315];
char field_14F;
// 32-bit field at 0x150; must be non-zero on entry to the save routine.
unsigned int field_150;
char field_154[427];
char field_2FF;
char field_300;
};
#pragma pack(pop)
// Runs CheckGlobalLock with the global critical region mutex and the
// frontend's global lock counter, then returns the value that call left in
// ctx->scratch.
static uint64_t Do_mfmsr(ppc_context_t& ctx) {
  auto builtins = ctx->thread_state->processor()->frontend()->builtins();
  void* const lock_mutex =
      reinterpret_cast<void*>(&xe::global_critical_region::mutex());
  void* const lock_count =
      reinterpret_cast<void*>(&builtins->global_lock_count);
  cpu::ppc::CheckGlobalLock(ctx, lock_mutex, lock_count);
  return ctx->scratch;
}
// Re-implementation of the guest KeSaveFloatingPointState routine (currently
// compiled out). Saves FPSCR and the 32 FPU registers into the save area
// referenced by the r13 structure, resets the rounding mode and FPSCR to 0,
// and returns the previous field_10 value in r3.
// NOTE(review): the MSR save/restore and the traps of the original routine
// are not reproduced -- see the inline comments.
void KeSaveFloatingPointState_entry(ppc_context_t& ctx) {
xe::Memory* memory = ctx->thread_state->memory();
// Guest r13 truncated to 32 bits and used as the guest address of the struct.
unsigned int r13 = static_cast<unsigned int>(ctx->r[13]);
r13_struct_t* st = memory->TranslateVirtual<r13_struct_t*>(r13);
/*
lwz r10, 0x150(r13)
lbz r11, 0xA(r13)
tweqi r10, 0
twnei r11, 0
*/
// Read without byte swapping: both values are only compared against zero.
unsigned int r10 = st->field_150;
unsigned char r11 = st->field_A;
if (r10 == 0 || r11 != 0) {
// trap! (the guest code traps here; this implementation does nothing)
}
// should do mfmsr here
// Previous field_10 value: guest address of the save area, or ~0u for none.
unsigned int r3 = xe::load_and_swap<unsigned int>(&st->field_10);
// too much work to do the mfmsr/mtmsr stuff right now
// -2049 is 0xFFFFF7FF as 32 bits and 0xF7FF as 16 bits.
int to_store = -2049;
xe::store_and_swap(&st->field_10, (unsigned int)to_store);
xe::store_and_swap(&st->field_6, (short)to_store);
if (r3 != ~0u) {
// A save area exists: write FPSCR and f0..f31 into it in guest byte order.
fpucontext_ptr_t* fpucontext =
memory->TranslateVirtual<fpucontext_ptr_t*>(r3);
xe::store_and_swap<uint64_t>(&fpucontext->saved_FPSCR, ctx->fpscr.value);
for (unsigned int i = 0; i < 32; ++i) {
xe::store_and_swap(&fpucontext->saved_fpu_regs[i], ctx->f[i]);
}
xe::store_and_swap<unsigned short>(&fpucontext->field_9E, 0xD7FF);
}
// Reset rounding mode 0 on the host, then clear the whole guest FPSCR.
// NOTE(review): this reaches the processor through ctx->processor, while
// Do_mfmsr uses ctx->thread_state->processor() -- confirm both are valid.
ctx->processor->backend()->SetGuestRoundingMode(ctx.value(), 0);
ctx->fpscr.value = 0;
st->field_A = 1;
// field_10 now holds the guest address of field_300 inside the r13 struct.
xe::store_and_swap(&st->field_10, r13 + 0x300);
// Return the previous field_10 value to the guest in r3.
ctx->r[3] = r3;
}
DECLARE_XBOXKRNL_EXPORT1(KeSaveFloatingPointState, kNone, kImplemented);
#endif
} // namespace xboxkrnl
} // namespace kernel