Changing the way the global lock works. Some things are better, I think.

Regressions are likely.
Ben Vanik 2015-09-01 09:45:14 -07:00
parent 5355183590
commit f5e374f9b5
14 changed files with 199 additions and 128 deletions

View File

@ -13,6 +13,7 @@
#include <cstdint>
#include <string>
#include "xenia/base/mutex.h"
#include "xenia/base/vec128.h"
namespace xe {
@ -404,8 +405,9 @@ typedef struct alignas(64) PPCContext_s {
// Thread ID assigned to this context.
uint32_t thread_id;
// Reserve address for load acquire/store release. Shared.
uint64_t* reserve_address;
// Global interrupt lock, held while interrupts are disabled or interrupts are
// executing. This is shared among all threads and comes from the processor.
xe::recursive_mutex* global_mutex;
// Used to shuttle data into externs. Contents volatile.
uint64_t scratch;
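The shape of the change in miniature: the processor owns a single recursive mutex and every per-thread context only points at it, so "disabling interrupts" on one guest thread serializes against all other guest threads and against host interrupt dispatch. A standalone model of that ownership, with std::recursive_mutex standing in for xe::recursive_mutex and purely illustrative names:

#include <memory>
#include <mutex>
#include <vector>

// Illustrative stand-ins for Processor and PPCContext.
struct ModelContext {
  std::recursive_mutex* global_mutex;  // shared; owned by the processor
};

struct ModelProcessor {
  std::recursive_mutex global_mutex_;
  std::vector<std::unique_ptr<ModelContext>> contexts_;

  ModelContext* CreateThreadContext() {
    auto context = std::make_unique<ModelContext>();
    context->global_mutex = &global_mutex_;  // every context shares one mutex
    contexts_.push_back(std::move(context));
    return contexts_.back().get();
  }
};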

View File

@ -11,6 +11,7 @@
#include "xenia/base/assert.h"
#include "xenia/cpu/frontend/ppc_context.h"
#include "xenia/cpu/frontend/ppc_frontend.h"
#include "xenia/cpu/frontend/ppc_hir_builder.h"
namespace xe {
@ -699,14 +700,28 @@ XEEMITTER(mtspr, 0x7C0003A6, XFX)(PPCHIRBuilder& f, InstrData& i) {
// without the lock here threads can livelock.
XEEMITTER(mfmsr, 0x7C0000A6, X)(PPCHIRBuilder& f, InstrData& i) {
f.StoreGPR(i.X.RT, f.LoadMSR());
// bit 48 = EE; interrupt enabled
// bit 62 = RI; recoverable interrupt
// return 8000h if unlocked (interrupts enabled), else 0
f.CallExtern(f.builtins()->check_global_lock);
f.StoreGPR(i.X.RT, f.LoadContext(offsetof(PPCContext, scratch), INT64_TYPE));
return 0;
}
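A minimal standalone model of what the guest now observes through mfmsr, assuming the lock-count scheme defined in the frontend below; std::recursive_mutex stands in for xe::recursive_mutex and the Model* names are illustrative:

#include <cstdint>
#include <mutex>

std::recursive_mutex model_global_mutex;
int32_t model_global_lock_count = 0;  // > 0 while "interrupts are disabled"

// Models CheckGlobalLock: mfmsr reads back 0x8000 (EE set) only when no
// guest thread is currently inside an interrupts-disabled region.
uint64_t ModelReadMSR() {
  std::lock_guard<std::recursive_mutex> lock(model_global_mutex);
  return model_global_lock_count ? 0 : 0x8000;
}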
XEEMITTER(mtmsr, 0x7C000124, X)(PPCHIRBuilder& f, InstrData& i) {
if (i.X.RA & 0x01) {
// L = 1
f.StoreMSR(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE));
// iff storing from r13
f.StoreContext(
offsetof(PPCContext, scratch),
f.ZeroExtend(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE), INT64_TYPE));
if (i.X.RT == 13) {
// iff storing from r13 we are taking a lock (disable interrupts).
f.CallExtern(f.builtins()->enter_global_lock);
} else {
// Otherwise we are restoring interrupts (probably).
f.CallExtern(f.builtins()->leave_global_lock);
}
return 0;
} else {
// L = 0
@ -718,7 +733,15 @@ XEEMITTER(mtmsr, 0x7C000124, X)(PPCHIRBuilder& f, InstrData& i) {
XEEMITTER(mtmsrd, 0x7C000164, X)(PPCHIRBuilder& f, InstrData& i) {
if (i.X.RA & 0x01) {
// L = 1
f.StoreMSR(f.LoadGPR(i.X.RT));
f.StoreContext(offsetof(PPCContext, scratch),
f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE));
if (i.X.RT == 13) {
// iff storing from r13 we are taking a lock (disable interrupts).
f.CallExtern(f.builtins()->enter_global_lock);
} else {
// Otherwise we are restoring interrupts (probably).
f.CallExtern(f.builtins()->leave_global_lock);
}
return 0;
} else {
// L = 0
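Continuing the model above, the L = 1 paths of mtmsr/mtmsrd reduce to the following host-side behavior; the r13 test mirrors the commit's heuristic that guest code disables interrupts by moving r13 into the MSR, and ModelWriteMSR is an illustrative name:

// Uses model_global_mutex and model_global_lock_count from the mfmsr sketch.
// Writing the MSR from r13 takes the lock; writing it from any other
// register releases it.
void ModelWriteMSR(uint32_t rt) {
  if (rt == 13) {
    model_global_mutex.lock();   // EnterGlobalLock: interrupts "disabled"
    ++model_global_lock_count;
  } else {
    --model_global_lock_count;   // LeaveGlobalLock: interrupts "restored"
    model_global_mutex.unlock();
  }
}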

View File

@ -683,8 +683,16 @@ XEEMITTER(ldarx, 0x7C0000A8, X)(PPCHIRBuilder& f, InstrData& i) {
// RESERVE_LENGTH <- 8
// RESERVE_ADDR <- real_addr(EA)
// RT <- MEM(EA, 8)
// NOTE: we assume we are within a global lock.
// We could assert here that the block (or its parent) has already taken the
// global lock, but so far I have only seen interrupt callbacks (which always
// run under the global lock) do that.
// We issue a memory barrier here to make sure we observe the latest values
// written by other threads.
f.MemoryBarrier();
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ByteSwap(f.LoadAcquire(ea, INT64_TYPE));
Value* rt = f.ByteSwap(f.Load(ea, INT64_TYPE));
f.StoreGPR(i.X.RT, rt);
return 0;
}
@ -699,9 +707,16 @@ XEEMITTER(lwarx, 0x7C000028, X)(PPCHIRBuilder& f, InstrData& i) {
// RESERVE_LENGTH <- 4
// RESERVE_ADDR <- real_addr(EA)
// RT <- i32.0 || MEM(EA, 4)
// NOTE: we assume we are within a global lock.
// We could assert here that the block (or its parent) has already taken the
// global lock, but so far I have only seen interrupt callbacks (which always
// run under the global lock) do that.
// We issue a memory barrier here to make sure we observe the latest values
// written by other threads.
f.MemoryBarrier();
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt =
f.ZeroExtend(f.ByteSwap(f.LoadAcquire(ea, INT32_TYPE)), INT64_TYPE);
Value* rt = f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE);
f.StoreGPR(i.X.RT, rt);
return 0;
}
@ -716,9 +731,22 @@ XEEMITTER(stdcx, 0x7C0001AD, X)(PPCHIRBuilder& f, InstrData& i) {
// MEM(EA, 8) <- (RS)
// n <- 1 if store performed
// CR0[LT GT EQ SO] = 0b00 || n || XER[SO]
// NOTE: we assume we are within a global lock.
// As we have been executing exclusively this entire time, no one else could
// have touched the memory, so the store must always succeed.
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ByteSwap(f.LoadGPR(i.X.RT));
f.StoreRelease(ea, rt); // also updates cr0
f.Store(ea, rt);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1));
f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8());
f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8());
// Issue a memory barrier so that other threads see our updates once we leave
// the lock.
f.MemoryBarrier();
return 0;
}
@ -732,9 +760,22 @@ XEEMITTER(stwcx, 0x7C00012D, X)(PPCHIRBuilder& f, InstrData& i) {
// MEM(EA, 4) <- (RS)[32:63]
// n <- 1 if store performed
// CR0[LT GT EQ SO] = 0b00 || n || XER[SO]
// NOTE: we assume we are within a global lock.
// As we have been executing exclusively this entire time, no one else could
// have touched the memory, so the store must always succeed.
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ByteSwap(f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE));
f.StoreRelease(ea, rt); // also updates cr0
f.Store(ea, rt);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1));
f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8());
f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8());
// Issue a memory barrier so that other threads see our updates once we leave
// the lock.
f.MemoryBarrier();
return 0;
}
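Continuing the model, a host-side analogue of what an lwarx/stwcx. pair now amounts to. The lock_guard stands for the global lock the guest is assumed to already hold (taken via mtmsr r13 or by interrupt dispatch), not something the emitted code acquires itself; the increment payload is just an example:

#include <atomic>

// With the global lock held for the whole reserved sequence, the reservation
// can never be lost: a plain load and store bracketed by barriers suffice,
// and the conditional store "always succeeds" (cr0.eq is set to 1).
uint32_t ModelAtomicIncrement(uint32_t* guest_word) {
  std::lock_guard<std::recursive_mutex> lock(model_global_mutex);
  std::atomic_thread_fence(std::memory_order_seq_cst);  // barrier at lwarx
  uint32_t old_value = *guest_word;                      // plain load
  *guest_word = old_value + 1;                           // plain store (stwcx.)
  std::atomic_thread_fence(std::memory_order_seq_cst);  // barrier at stwcx.
  return old_value;
}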

View File

@ -57,32 +57,41 @@ PPCFrontend::~PPCFrontend() {
Memory* PPCFrontend::memory() const { return processor_->memory(); }
// Checks the state of the global lock and sets scratch to the current MSR
// value.
void CheckGlobalLock(PPCContext* ppc_context, void* arg0, void* arg1) {
ppc_context->scratch = 0x8000;
auto global_mutex = reinterpret_cast<xe::recursive_mutex*>(arg0);
auto global_lock_count = reinterpret_cast<int32_t*>(arg1);
std::lock_guard<xe::recursive_mutex> lock(*global_mutex);
ppc_context->scratch = *global_lock_count ? 0 : 0x8000;
}
void HandleGlobalLock(PPCContext* ppc_context, void* arg0, void* arg1) {
auto global_lock = reinterpret_cast<xe::mutex*>(arg0);
volatile bool* global_lock_taken = reinterpret_cast<bool*>(arg1);
uint64_t value = ppc_context->scratch;
if (value == 0x8000) {
global_lock->lock();
*global_lock_taken = false;
global_lock->unlock();
} else if (value == ppc_context->r[13]) {
global_lock->lock();
*global_lock_taken = true;
global_lock->unlock();
}
// Enters the global lock. Safe for recursion.
void EnterGlobalLock(PPCContext* ppc_context, void* arg0, void* arg1) {
auto global_mutex = reinterpret_cast<xe::recursive_mutex*>(arg0);
auto global_lock_count = reinterpret_cast<int32_t*>(arg1);
global_mutex->lock();
*global_lock_count = *global_lock_count + 1;
}
// Leaves the global lock. Safe for recursion.
void LeaveGlobalLock(PPCContext* ppc_context, void* arg0, void* arg1) {
auto global_mutex = reinterpret_cast<xe::recursive_mutex*>(arg0);
auto global_lock_count = reinterpret_cast<int32_t*>(arg1);
*global_lock_count = *global_lock_count - 1;
assert_true(*global_lock_count >= 0);
global_mutex->unlock();
}
bool PPCFrontend::Initialize() {
void* arg0 = reinterpret_cast<void*>(&builtins_.global_lock);
void* arg1 = reinterpret_cast<void*>(&builtins_.global_lock_taken);
void* arg0 = reinterpret_cast<void*>(processor_->global_mutex());
void* arg1 = reinterpret_cast<void*>(&builtins_.global_lock_count);
builtins_.check_global_lock =
processor_->DefineBuiltin("CheckGlobalLock", CheckGlobalLock, arg0, arg1);
builtins_.handle_global_lock = processor_->DefineBuiltin(
"HandleGlobalLock", HandleGlobalLock, arg0, arg1);
builtins_.enter_global_lock =
processor_->DefineBuiltin("EnterGlobalLock", EnterGlobalLock, arg0, arg1);
builtins_.leave_global_lock =
processor_->DefineBuiltin("LeaveGlobalLock", LeaveGlobalLock, arg0, arg1);
return true;
}
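A quick driver for the model functions above, showing why the pair is recursion-safe: the recursive mutex lets the same thread nest critical regions, and the count keeps CheckGlobalLock reporting "locked" until the outermost leave:

#include <cassert>

int main() {
  assert(ModelReadMSR() == 0x8000);  // unlocked: interrupts "enabled"
  ModelWriteMSR(13);                 // outer enter (mtmsr r13)
  ModelWriteMSR(13);                 // nested enter on the same thread: no deadlock
  assert(ModelReadMSR() == 0);       // still reported as "disabled"
  ModelWriteMSR(10);                 // inner leave (restore from another register)
  ModelWriteMSR(10);                 // outer leave
  assert(ModelReadMSR() == 0x8000);  // back to "enabled"
  return 0;
}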

View File

@ -32,10 +32,10 @@ namespace frontend {
class PPCTranslator;
struct PPCBuiltins {
xe::mutex global_lock;
bool global_lock_taken;
int32_t global_lock_count;
Function* check_global_lock;
Function* handle_global_lock;
Function* enter_global_lock;
Function* leave_global_lock;
};
class PPCFrontend {

View File

@ -39,6 +39,8 @@ PPCHIRBuilder::PPCHIRBuilder(PPCFrontend* frontend)
PPCHIRBuilder::~PPCHIRBuilder() = default;
PPCBuiltins* PPCHIRBuilder::builtins() const { return frontend_->builtins(); }
void PPCHIRBuilder::Reset() {
function_ = nullptr;
start_address_ = 0;
@ -345,20 +347,6 @@ void PPCHIRBuilder::UpdateCR6(Value* src_value) {
// TODO(benvanik): trace CR.
}
Value* PPCHIRBuilder::LoadMSR() {
// bit 48 = EE; interrupt enabled
// bit 62 = RI; recoverable interrupt
// return 8000h if unlocked, else 0
CallExtern(frontend_->builtins()->check_global_lock);
return LoadContext(offsetof(PPCContext, scratch), INT64_TYPE);
}
void PPCHIRBuilder::StoreMSR(Value* value) {
// if & 0x8000 == 0, lock, else unlock
StoreContext(offsetof(PPCContext, scratch), ZeroExtend(value, INT64_TYPE));
CallExtern(frontend_->builtins()->handle_global_lock);
}
Value* PPCHIRBuilder::LoadFPSCR() {
return LoadContext(offsetof(PPCContext, fpscr), INT64_TYPE);
}
@ -444,30 +432,6 @@ void PPCHIRBuilder::StoreVR(uint32_t reg, Value* value) {
trace_reg.value = value;
}
Value* PPCHIRBuilder::LoadAcquire(Value* address, TypeName type,
uint32_t load_flags) {
AtomicExchange(LoadContext(offsetof(PPCContext, reserve_address), INT64_TYPE),
Truncate(address, INT32_TYPE));
return Load(address, type, load_flags);
}
Value* PPCHIRBuilder::StoreRelease(Value* address, Value* value,
uint32_t store_flags) {
Value* old_address = AtomicExchange(
LoadContext(offsetof(PPCContext, reserve_address), INT64_TYPE),
LoadZeroInt32());
// Ensure the reservation addresses match.
Value* eq = CompareEQ(Truncate(address, INT32_TYPE), old_address);
StoreContext(offsetof(PPCContext, cr0.cr0_eq), eq);
StoreContext(offsetof(PPCContext, cr0.cr0_lt), LoadZeroInt8());
StoreContext(offsetof(PPCContext, cr0.cr0_gt), LoadZeroInt8());
auto skip_label = NewLabel();
BranchFalse(eq, skip_label, BRANCH_UNLIKELY);
Store(address, value, store_flags);
MarkLabel(skip_label);
return eq;
}
} // namespace frontend
} // namespace cpu
} // namespace xe

View File

@ -18,6 +18,7 @@ namespace xe {
namespace cpu {
namespace frontend {
struct PPCBuiltins;
class PPCFrontend;
class PPCHIRBuilder : public hir::HIRBuilder {
@ -29,6 +30,8 @@ class PPCHIRBuilder : public hir::HIRBuilder {
explicit PPCHIRBuilder(PPCFrontend* frontend);
~PPCHIRBuilder() override;
PPCBuiltins* builtins() const;
void Reset() override;
enum EmitFlags {
@ -54,8 +57,6 @@ class PPCHIRBuilder : public hir::HIRBuilder {
void UpdateCR(uint32_t n, Value* lhs, bool is_signed = true);
void UpdateCR(uint32_t n, Value* lhs, Value* rhs, bool is_signed = true);
void UpdateCR6(Value* src_value);
Value* LoadMSR();
void StoreMSR(Value* value);
Value* LoadFPSCR();
void StoreFPSCR(Value* value);
Value* LoadXER();
@ -75,10 +76,6 @@ class PPCHIRBuilder : public hir::HIRBuilder {
Value* LoadVR(uint32_t reg);
void StoreVR(uint32_t reg, Value* value);
Value* LoadAcquire(Value* address, hir::TypeName type,
uint32_t load_flags = 0);
Value* StoreRelease(Value* address, Value* value, uint32_t store_flags = 0);
private:
void AnnotateLabel(uint32_t address, Label* label);

View File

@ -327,6 +327,39 @@ uint64_t Processor::Execute(ThreadState* thread_state, uint32_t address,
return context->r[3];
}
uint64_t Processor::ExecuteInterrupt(ThreadState* thread_state,
uint32_t address, uint64_t args[],
size_t arg_count) {
SCOPE_profile_cpu_f("cpu");
// Hold the global lock during interrupt dispatch.
// This will block if any code is in a critical region (has interrupts
// disabled) or if any other interrupt is executing.
std::lock_guard<xe::recursive_mutex> lock(global_mutex_);
PPCContext* context = thread_state->context();
assert_true(arg_count <= 5);
for (size_t i = 0; i < arg_count; ++i) {
context->r[3 + i] = args[i];
}
// The TLS ptr must be zero during interrupts. Some games check it and
// early-exit their routines when running under an interrupt.
auto pcr_address =
memory_->TranslateVirtual(static_cast<uint32_t>(context->r[13]));
uint32_t old_tls_ptr = xe::load_and_swap<uint32_t>(pcr_address);
xe::store_and_swap<uint32_t>(pcr_address, 0);
if (!Execute(thread_state, address)) {
return 0xDEADBABE;
}
// Restores TLS ptr.
xe::store_and_swap<uint32_t>(pcr_address, old_tls_ptr);
return context->r[3];
}
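The payoff of routing interrupts through ExecuteInterrupt is that dispatch and guest critical regions now contend on the same recursive mutex. Continuing the model, a sketch of that blocking behavior; ModelDispatchInterrupt is illustrative, and the real path also marshals the r3+ arguments and zeroes the PCR's TLS slot as shown above:

#include <functional>

// Runs on the host interrupt-delivery thread. If another guest thread is
// inside an interrupts-disabled region, this blocks until that region is
// left, which is the behavior a real EE=0 window provides.
void ModelDispatchInterrupt(const std::function<void()>& guest_handler) {
  std::lock_guard<std::recursive_mutex> lock(model_global_mutex);
  guest_handler();  // the guest interrupt callback runs with the lock held
}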
Irql Processor::RaiseIrql(Irql new_value) {
return static_cast<Irql>(
xe::atomic_exchange(static_cast<uint32_t>(new_value),

View File

@ -11,7 +11,6 @@
#define XENIA_CPU_PROCESSOR_H_
#include <memory>
#include <mutex>
#include <string>
#include <vector>
@ -84,7 +83,10 @@ class Processor {
bool Execute(ThreadState* thread_state, uint32_t address);
uint64_t Execute(ThreadState* thread_state, uint32_t address, uint64_t args[],
size_t arg_count);
uint64_t ExecuteInterrupt(ThreadState* thread_state, uint32_t address,
uint64_t args[], size_t arg_count);
xe::recursive_mutex* global_mutex() { return &global_mutex_; }
Irql RaiseIrql(Irql new_value);
void LowerIrql(Irql old_value);
@ -108,6 +110,7 @@ class Processor {
uint32_t next_builtin_address_ = 0xFFFF0000u;
Irql irql_;
xe::recursive_mutex global_mutex_;
};
} // namespace cpu

View File

@ -90,7 +90,7 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
std::memset(context_, 0, sizeof(PPCContext));
// Stash pointers to common structures that callbacks may need.
context_->reserve_address = memory_->reserve_address();
context_->global_mutex = processor_->global_mutex();
context_->virtual_membase = memory_->virtual_membase();
context_->physical_membase = memory_->physical_membase();
context_->processor = processor_;

View File

@ -80,8 +80,8 @@ void GraphicsSystem::DispatchInterruptCallback(uint32_t source, uint32_t cpu) {
interrupt_callback_, source, cpu);
uint64_t args[] = {source, interrupt_callback_data_};
processor_->Execute(thread->thread_state(), interrupt_callback_, args,
xe::countof(args));
processor_->ExecuteInterrupt(thread->thread_state(), interrupt_callback_,
args, xe::countof(args));
}
} // namespace gpu

View File

@ -403,7 +403,6 @@ SHIM_CALL XamShowDirtyDiscErrorUI_shim(PPCContext* ppc_context,
// This is death, and should never return.
// TODO(benvanik): cleaner exit.
assert_always();
exit(1);
}

View File

@ -1279,23 +1279,23 @@ pointer_result_t InterlockedPushEntrySList(
assert_not_null(plist_ptr);
assert_not_null(entry);
alignas(8) X_SLIST_HEADER old_hdr = {0};
// Hold a global lock during this method. Once in the lock we assume we have
// exclusive access to the structure.
std::lock_guard<xe::recursive_mutex> lock(
*kernel_state()->processor()->global_mutex());
alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr;
alignas(8) X_SLIST_HEADER new_hdr = {0};
uint32_t old_head = 0;
new_hdr.depth = old_hdr.depth + 1;
new_hdr.sequence = old_hdr.sequence + 1;
do {
// Kill the guest reservation
xe::atomic_exchange(0, kernel_memory()->reserve_address());
uint32_t old_head = old_hdr.next.next;
entry->next = old_hdr.next.next;
new_hdr.next.next = entry.guest_address();
old_hdr = *plist_ptr;
new_hdr.depth = old_hdr.depth + 1;
new_hdr.sequence = old_hdr.sequence + 1;
old_head = old_hdr.next.next;
entry->next = old_hdr.next.next;
new_hdr.next.next = entry.guest_address();
} while (!xe::atomic_cas(*(uint64_t*)&old_hdr, *(uint64_t*)&new_hdr,
(uint64_t*)plist_ptr.host_address()));
xe::atomic_cas(*reinterpret_cast<uint64_t*>(&old_hdr),
*reinterpret_cast<uint64_t*>(&new_hdr),
reinterpret_cast<uint64_t*>(plist_ptr.host_address()));
return old_head;
}
@ -1305,27 +1305,29 @@ DECLARE_XBOXKRNL_EXPORT(InterlockedPushEntrySList,
pointer_result_t InterlockedPopEntrySList(pointer_t<X_SLIST_HEADER> plist_ptr) {
assert_not_null(plist_ptr);
// Hold a global lock during this method. Once in the lock we assume we have
// exclusive access to the structure.
std::lock_guard<xe::recursive_mutex> lock(
*kernel_state()->processor()->global_mutex());
uint32_t popped = 0;
alignas(8) X_SLIST_HEADER old_hdr = {0};
alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr;
alignas(8) X_SLIST_HEADER new_hdr = {0};
do {
// Kill the guest reservation (guest InterlockedPushEntrySList uses this)
xe::atomic_exchange(0, kernel_memory()->reserve_address());
auto next = kernel_memory()->TranslateVirtual<X_SINGLE_LIST_ENTRY*>(
old_hdr.next.next);
if (!old_hdr.next.next) {
return 0;
}
popped = old_hdr.next.next;
old_hdr = *plist_ptr;
auto next = kernel_memory()->TranslateVirtual<X_SINGLE_LIST_ENTRY*>(
old_hdr.next.next);
if (!old_hdr.next.next) {
return 0;
}
popped = old_hdr.next.next;
new_hdr.depth = old_hdr.depth - 1;
new_hdr.next.next = next->next;
new_hdr.sequence = old_hdr.sequence;
new_hdr.depth = old_hdr.depth - 1;
new_hdr.next.next = next->next;
new_hdr.sequence = old_hdr.sequence;
} while (!xe::atomic_cas(*(uint64_t*)&old_hdr, *(uint64_t*)&new_hdr,
(uint64_t*)plist_ptr.host_address()));
xe::atomic_cas(*reinterpret_cast<uint64_t*>(&old_hdr),
*reinterpret_cast<uint64_t*>(&new_hdr),
reinterpret_cast<uint64_t*>(plist_ptr.host_address()));
return popped;
}
@ -1333,22 +1335,23 @@ DECLARE_XBOXKRNL_EXPORT(InterlockedPopEntrySList,
ExportTag::kImplemented | ExportTag::kHighFrequency);
pointer_result_t InterlockedFlushSList(pointer_t<X_SLIST_HEADER> plist_ptr) {
alignas(8) X_SLIST_HEADER old_hdr = {0};
assert_not_null(plist_ptr);
// Hold a global lock during this method. Once in the lock we assume we have
// exclusive access to the structure.
std::lock_guard<xe::recursive_mutex> lock(
*kernel_state()->processor()->global_mutex());
alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr;
alignas(8) X_SLIST_HEADER new_hdr = {0};
uint32_t first = 0;
uint32_t first = old_hdr.next.next;
new_hdr.next.next = 0;
new_hdr.depth = 0;
new_hdr.sequence = 0;
do {
// Kill the guest reservation
xe::atomic_exchange(0, kernel_memory()->reserve_address());
old_hdr = *plist_ptr;
first = old_hdr.next.next;
new_hdr.next.next = 0;
new_hdr.depth = 0;
new_hdr.sequence = 0;
} while (!xe::atomic_cas(*(uint64_t*)&old_hdr, *(uint64_t*)&new_hdr,
(uint64_t*)plist_ptr.host_address()));
xe::atomic_cas(*reinterpret_cast<uint64_t*>(&old_hdr),
*reinterpret_cast<uint64_t*>(&new_hdr),
reinterpret_cast<uint64_t*>(plist_ptr.host_address()));
return first;
}
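The three exports above share one pattern: host code that mutates a structure the guest also updates no longer spins on the guest reservation, it simply holds the processor's global mutex for the duration. A hedged helper sketch of that pattern; WithGlobalLock is hypothetical and not part of the commit, while kernel_state(), processor() and global_mutex() are the accessors already used above:

#include <functional>
#include <mutex>

// Serialize a host-side mutation of guest-shared data against guest
// critical regions and interrupt dispatch by holding the global mutex.
void WithGlobalLock(const std::function<void()>& body) {
  std::lock_guard<xe::recursive_mutex> lock(
      *kernel_state()->processor()->global_mutex());
  body();
}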

View File

@ -181,8 +181,6 @@ class Memory {
(guest_address & 0x1FFFFFFF));
}
inline uint64_t* reserve_address() { return &reserve_address_; }
// TODO(benvanik): make poly memory utils for these.
void Zero(uint32_t address, uint32_t size);
void Fill(uint32_t address, uint32_t size, uint8_t value);
@ -219,7 +217,6 @@ class Memory {
uint32_t system_page_size_ = 0;
uint8_t* virtual_membase_ = nullptr;
uint8_t* physical_membase_ = nullptr;
uint64_t reserve_address_ = 0;
xe::memory::FileMappingHandle mapping_ = nullptr;
uint8_t* mapping_base_ = nullptr;