From a0b387e0a959f9fc3ed968d0b84c7e7883847f1e Mon Sep 17 00:00:00 2001
From: Eladash
Date: Fri, 2 Oct 2020 15:15:23 +0300
Subject: [PATCH] cellSpurs: Fix HLE workload signalling, taskset fixes

---
 rpcs3/Emu/Cell/Modules/cellSpurs.cpp    | 108 +++++++++++++-----
 rpcs3/Emu/Cell/Modules/cellSpurs.h      |  22 +++-
 rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp | 140 ++++++++++++++----------
 3 files changed, 178 insertions(+), 92 deletions(-)

diff --git a/rpcs3/Emu/Cell/Modules/cellSpurs.cpp b/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
index 6202348115..d38c6cb07a 100644
--- a/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
@@ -3700,15 +3700,37 @@ s32 _spurs::create_task(vm::ptr<CellSpursTaskset> taskset, vm::ptr<u32> task_id,
 	// TODO: Verify the ELF header is proper and all its load segments are at address >= 0x3000
 
 	u32 tmp_task_id;
-	for (tmp_task_id = 0; tmp_task_id < CELL_SPURS_MAX_TASK; tmp_task_id++)
 	{
-		if (!taskset->enabled.value()._bit[tmp_task_id])
+		auto addr = taskset.ptr(&CellSpursTaskset::enabled).addr();
+		auto [res, rtime] = vm::reservation_lock(addr, 16, vm::dma_lockb);
+
+		// NOTE: Realfw processes this using 4 32-bit atomic loops
+		// But here it's processed within a single 128-bit atomic op
+		vm::_ref<atomic_be_t<v128>>(addr).fetch_op([&](be_t<v128>& value)
 		{
-			auto enabled              = taskset->enabled.value();
-			enabled._bit[tmp_task_id] = true;
-			taskset->enabled          = enabled;
-			break;
-		}
+			auto value0 = value.value();
+
+			if (auto pos = std::countl_one(+value0._u64[0]); pos != 64)
+			{
+				tmp_task_id = pos;
+				value0._u64[0] |= (1ull << 63) >> pos;
+				value = value0;
+				return true;
+			}
+
+			if (auto pos = std::countl_one(+value0._u64[1]); pos != 64)
+			{
+				tmp_task_id = pos + 64;
+				value0._u64[1] |= (1ull << 63) >> pos;
+				value = value0;
+				return true;
+			}
+
+			tmp_task_id = CELL_SPURS_MAX_TASK;
+			return false;
+		});
+
+		res.release(rtime + 128);
 	}
 
 	if (tmp_task_id >= CELL_SPURS_MAX_TASK)
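For reference, the slot search above relies on std::countl_one, which counts leading one bits and therefore yields the MSB-based index of the first clear bit. A minimal standalone sketch of the same scan (illustrative only: claim_first_free is a made-up name, and plain u64 halves stand in for the emulator's be_t<v128>):

#include <bit>
#include <cstdint>

// Find and claim the first free slot in a 128-bit MSB-first bitmask
// stored as two u64 halves; returns 128 when every slot is taken.
unsigned claim_first_free(uint64_t& hi, uint64_t& lo)
{
	// countl_one() returns the index (counted from the MSB) of the
	// first zero bit, or 64 if all bits are set
	if (unsigned pos = std::countl_one(hi); pos != 64)
	{
		hi |= (1ull << 63) >> pos; // mark slot 'pos' as used
		return pos;
	}

	if (unsigned pos = std::countl_one(lo); pos != 64)
	{
		lo |= (1ull << 63) >> pos;
		return pos + 64;
	}

	return 128; // no free task slot
}

In the patch the same logic runs inside a fetch_op over the whole 128-bit enabled mask, so the scan and the bit set are a single atomic operation.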
@@ -3730,13 +3752,14 @@
 
 s32 _spurs::task_start(ppu_thread& ppu, vm::ptr<CellSpursTaskset> taskset, u32 taskId)
 {
-	auto pendingReady         = taskset->pending_ready.value();
-	pendingReady._bit[taskId] = true;
-	taskset->pending_ready    = pendingReady;
+	auto [res, rtime] = vm::reservation_lock(taskset.ptr(&CellSpursTaskset::pending_ready).addr(), 16, vm::dma_lockb);
+	taskset->pending_ready.values[taskId / 32] |= (1u << 31) >> (taskId % 32);
+	res.release(rtime + 128);
 
-	cellSpursSendWorkloadSignal(ppu, taskset->spurs, taskset->wid);
+	auto spurs = +taskset->spurs;
+	ppu_execute<&cellSpursSendWorkloadSignal>(ppu, spurs, +taskset->wid);
 
-	if (s32 rc = cellSpursWakeUp(ppu, taskset->spurs))
+	if (s32 rc = ppu_execute<&cellSpursWakeUp>(ppu, spurs))
 	{
 		if (rc + 0u == CELL_SPURS_POLICY_MODULE_ERROR_STAT)
 		{
@@ -3782,6 +3805,8 @@ s32 cellSpursCreateTask(ppu_thread& ppu, vm::ptr<CellSpursTaskset> taskset, vm::
 
 s32 _cellSpursSendSignal(ppu_thread& ppu, vm::ptr<CellSpursTaskset> taskset, u32 taskId)
 {
+	cellSpurs.trace("_cellSpursSendSignal(taskset=*0x%x, taskId=0x%x)", taskset, taskId);
+
 	if (!taskset)
 	{
 		return CELL_SPURS_TASK_ERROR_NULL_POINTER;
@@ -3797,30 +3822,59 @@ s32 _cellSpursSendSignal(ppu_thread& ppu, vm::ptr<CellSpursTaskset> taskset, u32
 		return CELL_SPURS_TASK_ERROR_INVAL;
 	}
 
-	be_t<v128> _0(v128::from32(0));
-	bool disabled = taskset->enabled.value()._bit[taskId];
-	auto invalid  = (taskset->ready & taskset->pending_ready) != _0 || (taskset->running & taskset->waiting) != _0 || disabled ||
-					((taskset->running | taskset->ready | taskset->pending_ready | taskset->waiting | taskset->signalled) & ~taskset->enabled) != _0;
-
-	if (invalid)
+	int signal;
+	for (;;)
 	{
-		return CELL_SPURS_TASK_ERROR_SRCH;
+		const u32 addr = taskset.ptr(&CellSpursTaskset::signalled).ptr(&decltype(CellSpursTaskset::signalled)::values, taskId / 32).addr();
+		u32 signalled = ppu_lwarx(ppu, addr);
+
+		const u32 running = taskset->running.values[taskId / 32];
+		const u32 ready = taskset->ready.values[taskId / 32];
+		const u32 waiting = taskset->waiting.values[taskId / 32];
+		const u32 enabled = taskset->enabled.values[taskId / 32];
+		const u32 pready = taskset->pending_ready.values[taskId / 32];
+
+		const u32 mask = (1u << 31) >> (taskId % 32);
+
+		if ((running & waiting) || (ready & pready) ||
+			((signalled | waiting | pready | running | ready) & ~enabled) || !(enabled & mask))
+		{
+			// Error conditions:
+			// 1) Cannot have a waiting bit and running bit set at the same time
+			// 2) Cannot have a ready bit and pending_ready bit at the same time
+			// 3) A disabled task must not have any other state bit set
+			// 4) Specified task must be enabled
+			signal = -1;
+		}
+		else
+		{
+			signal = !!(~signalled & waiting & mask);
+			signalled |= (signal ? mask : 0);
+		}
+
+		if (ppu_stwcx(ppu, addr, signalled))
+		{
+			break;
+		}
 	}
 
-	auto shouldSignal = ((taskset->waiting & ~taskset->signalled) & be_t<v128>(v128::fromBit(taskId))) != _0 ? true : false;
-	auto signalled    = taskset->signalled.value();
-	signalled._bit[taskId] = true;
-	taskset->signalled = signalled;
-	if (shouldSignal)
+	switch (signal)
 	{
-		cellSpursSendWorkloadSignal(ppu, taskset->spurs, taskset->wid);
-		auto rc = cellSpursWakeUp(ppu, taskset->spurs);
+	case 0: break;
+	case 1:
+	{
+		auto spurs = +taskset->spurs;
+
+		ppu_execute<&cellSpursSendWorkloadSignal>(ppu, spurs, +taskset->wid);
+		auto rc = ppu_execute<&cellSpursWakeUp>(ppu, spurs);
 		if (rc + 0u == CELL_SPURS_POLICY_MODULE_ERROR_STAT)
 		{
 			return CELL_SPURS_TASK_ERROR_STAT;
 		}
 
-		ASSERT(rc == CELL_OK);
+		return rc;
+	}
+	default: return CELL_SPURS_TASK_ERROR_SRCH;
 	}
 
 	return CELL_OK;
diff --git a/rpcs3/Emu/Cell/Modules/cellSpurs.h b/rpcs3/Emu/Cell/Modules/cellSpurs.h
index 28eb851b8b..cbb130b3ea 100644
--- a/rpcs3/Emu/Cell/Modules/cellSpurs.h
+++ b/rpcs3/Emu/Cell/Modules/cellSpurs.h
@@ -811,12 +811,22 @@ struct alignas(128) CellSpursTaskset
 
 	CHECK_SIZE(TaskInfo, 48);
 
-	be_t<v128> running;       // 0x00
-	be_t<v128> ready;         // 0x10
-	be_t<v128> pending_ready; // 0x20
-	be_t<v128> enabled;       // 0x30
-	be_t<v128> signalled;     // 0x40
-	be_t<v128> waiting;       // 0x50
+	struct atomic_tasks_bitset
+	{
+		atomic_be_t<u32> values[4];
+
+		u32 get_bit(u32 bit) const
+		{
+			return values[bit / 32] & ((1u << 31) >> (bit % 32));
+		}
+	};
+
+	atomic_tasks_bitset running;       // 0x00
+	atomic_tasks_bitset ready;         // 0x10
+	atomic_tasks_bitset pending_ready; // 0x20
+	atomic_tasks_bitset enabled;       // 0x30
+	atomic_tasks_bitset signalled;     // 0x40
+	atomic_tasks_bitset waiting;       // 0x50
 	vm::bptr<CellSpurs> spurs;         // 0x60
 	be_t<u64> args;                    // 0x68
 	u8 enable_clear_ls;                // 0x70
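The header change above stores each task state set MSB-first across four 32-bit words, and the _cellSpursSendSignal loop retries an emulated lwarx/stwcx pair on one of those words until the conditional store succeeds. Roughly the same lock-free pattern, sketched with std::atomic and compare-exchange instead of the emulated reservation pair (illustrative names; the state validation against the other words is omitted):

#include <atomic>
#include <cstdint>

// Illustrative only: 'word' stands for the 32-bit signalled word the
// task's bit lives in, 'waiting' for the matching waiting word, and
// 'mask' selects the task bit (MSB-first, as in the patch).
// Returns 1 if a wakeup is needed, 0 if the signal was only latched.
int send_signal(std::atomic<uint32_t>& word, uint32_t waiting, uint32_t mask)
{
	uint32_t signalled = word.load();

	for (;;)
	{
		// Wake only if the task is waiting and not already signalled
		const int signal = (~signalled & waiting & mask) ? 1 : 0;
		const uint32_t desired = signalled | (signal ? mask : 0);

		// A failed compare-exchange reloads 'signalled' and retries,
		// mirroring a failed store-conditional (stwcx)
		if (word.compare_exchange_weak(signalled, desired))
		{
			return signal;
		}
	}
}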
diff --git a/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp b/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp
index 5264309cdc..ebe3f7b0c8 100644
--- a/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp
@@ -2,6 +2,7 @@
 
 #include "Loader/ELF.h"
 #include "Emu/Cell/PPUModule.h"
+#include "Emu/Memory/vm_reservation.h"
 #include "Emu/Cell/SPUThread.h"
 #include "Emu/Cell/SPURecompiler.h"
 #include "Emu/Cell/lv2/sys_lwmutex.h"
@@ -120,24 +121,29 @@ void cellSpursModuleExit(spu_thread& spu)
 }
 
 // Execute a DMA operation
-bool spursDma(spu_thread& spu, u32 cmd, u64 ea, u32 lsa, u32 size, u32 tag)
+bool spursDma(spu_thread& spu, const spu_mfc_cmd& args)
 {
-	spu.set_ch_value(MFC_LSA, lsa);
-	spu.set_ch_value(MFC_EAH, static_cast<u32>(ea >> 32));
-	spu.set_ch_value(MFC_EAL, static_cast<u32>(ea));
-	spu.set_ch_value(MFC_Size, size);
-	spu.set_ch_value(MFC_TagID, tag);
-	spu.set_ch_value(MFC_Cmd, cmd);
+	spu.ch_mfc_cmd = args;
 
-	if (cmd == MFC_GETLLAR_CMD || cmd == MFC_PUTLLC_CMD || cmd == MFC_PUTLLUC_CMD)
+	if (!spu.process_mfc_cmd())
 	{
-		const u32 rv = static_cast<u32>(spu.get_ch_value(MFC_RdAtomicStat));
-		return cmd == MFC_PUTLLC_CMD ? !rv : true;
+		spu_runtime::g_escape(&spu);
+	}
+
+	if (args.cmd == MFC_GETLLAR_CMD || args.cmd == MFC_PUTLLC_CMD || args.cmd == MFC_PUTLLUC_CMD)
+	{
+		return static_cast<u32>(spu.get_ch_value(MFC_RdAtomicStat)) != MFC_PUTLLC_FAILURE;
 	}
 
 	return true;
 }
 
+// Execute a DMA operation
+bool spursDma(spu_thread& spu, u32 cmd, u64 ea, u32 lsa, u32 size, u32 tag)
+{
+	return spursDma(spu, {MFC(cmd), static_cast<u8>(tag & 0x1f), static_cast<u16>(size & 0x7fff), lsa, static_cast<u32>(ea), static_cast<u32>(ea >> 32)});
+}
+
 // Get the status of DMA operations
 u32 spursDmaGetCompletionStatus(spu_thread& spu, u32 tagMask)
 {
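The second spursDma overload above is a compatibility shim: the old scalar-argument signature now just packs its arguments into the MFC command struct and forwards. A self-contained sketch of that adapter pattern (mfc_cmd and do_dma are hypothetical stand-ins for spu_mfc_cmd and the real struct-based entry point):

#include <cstdint>

// Hypothetical stand-in for spu_mfc_cmd; the field order matches the
// brace-initializer used by the patched wrapper.
struct mfc_cmd
{
	uint32_t cmd;
	uint8_t  tag;
	uint16_t size;
	uint32_t lsa;
	uint32_t eal;
	uint32_t eah;
};

// Stub for the struct-based entry point (in the emulator the real work
// happens in process_mfc_cmd)
bool do_dma(const mfc_cmd& args)
{
	(void)args;
	return true;
}

// Legacy scalar signature kept as a thin shim: mask the tag (5 bits)
// and size (15 bits) and split the 64-bit EA into low/high words
bool do_dma(uint32_t cmd, uint64_t ea, uint32_t lsa, uint32_t size, uint32_t tag)
{
	return do_dma({cmd, static_cast<uint8_t>(tag & 0x1f),
		static_cast<uint16_t>(size & 0x7fff), lsa,
		static_cast<uint32_t>(ea), static_cast<uint32_t>(ea >> 32)});
}

Keeping the old signature as a one-line forwarder lets every existing call site compile unchanged while new code builds the command struct directly.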
@@ -1402,76 +1408,93 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i
 	auto ctxt = spu._ptr<SpursTasksetContext>(0x2700);
 
 	s32 rc = CELL_OK;
-	s32 numNewlyReadyTasks;
+	s32 numNewlyReadyTasks = 0;
+
 	//vm::reservation_op(vm::cast(ctxt->taskset.addr(), HERE), 128, [&]()
 	{
-		auto taskset = ctxt->taskset.get_ptr();
+		auto taskset = ctxt->taskset;
+		v128 waiting = vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::waiting));
+		v128 running = vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::running));
+		v128 ready = vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::ready));
+		v128 pready = vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::pending_ready));
+		v128 enabled = vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::enabled));
+		v128 signalled = vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::signalled));
 
 		// Verify taskset state is valid
-		be_t<v128> _0(v128::from32(0));
-		if ((taskset->waiting & taskset->running) != _0 || (taskset->ready & taskset->pending_ready) != _0 ||
-			((taskset->running | taskset->ready | taskset->pending_ready | taskset->signalled | taskset->waiting) & ~taskset->enabled) != _0)
+		if ((waiting & running) != v128{} || (ready & pready) != v128{} ||
+			(v128::andnot(enabled, running | ready | pready | signalled | waiting) != v128{}))
 		{
 			spu_log.error("Invalid taskset state");
 			spursHalt(spu);
 		}
 
-		// Find the number of tasks that have become ready since the last iteration
-		auto newlyReadyTasks = (taskset->signalled | taskset->pending_ready) & ~taskset->ready.value();
-		numNewlyReadyTasks   = 0;
-		for (auto i = 0; i < 128; i++)
+		// Find the number of tasks that have become ready since the last iteration
 		{
-			if (newlyReadyTasks._bit[i])
+			auto newlyReadyTasks = v128::andnot(ready, signalled | pready);
+
+			// TODO: Optimize this with std::popcount when it's known to be fixed
+			for (auto i = 0; i < 128; i++)
 			{
-				numNewlyReadyTasks++;
+				if (newlyReadyTasks._bit[i])
+				{
+					numNewlyReadyTasks++;
+				}
 			}
 		}
 
 		v128 readyButNotRunning;
 		u8   selectedTaskId;
-		v128 running   = taskset->running.value();
-		v128 waiting   = taskset->waiting.value();
-		v128 enabled   = taskset->enabled.value();
-		v128 signalled = (taskset->signalled & (taskset->ready | taskset->pending_ready));
-		v128 ready     = (taskset->signalled | taskset->ready | taskset->pending_ready);
+		v128 signalled0 = (signalled & (ready | pready));
+		v128 ready0 = (signalled | ready | pready);
 
 		switch (request)
 		{
 		case SPURS_TASKSET_REQUEST_POLL_SIGNAL:
-			rc = signalled._bit[ctxt->taskId] ? 1 : 0;
-			signalled._bit[ctxt->taskId] = false;
+		{
+			rc = signalled0._bit[ctxt->taskId] ? 1 : 0;
+			signalled0._bit[ctxt->taskId] = false;
 			break;
+		}
 		case SPURS_TASKSET_REQUEST_DESTROY_TASK:
+		{
 			numNewlyReadyTasks--;
 			running._bit[ctxt->taskId] = false;
 			enabled._bit[ctxt->taskId] = false;
-			signalled._bit[ctxt->taskId] = false;
-			ready._bit[ctxt->taskId] = false;
+			signalled0._bit[ctxt->taskId] = false;
+			ready0._bit[ctxt->taskId] = false;
 			break;
+		}
 		case SPURS_TASKSET_REQUEST_YIELD_TASK:
+		{
 			running._bit[ctxt->taskId] = false;
 			waiting._bit[ctxt->taskId] = true;
 			break;
+		}
 		case SPURS_TASKSET_REQUEST_WAIT_SIGNAL:
-			if (signalled._bit[ctxt->taskId] == false)
+		{
+			if (signalled0._bit[ctxt->taskId] == false)
 			{
 				numNewlyReadyTasks--;
 				running._bit[ctxt->taskId] = false;
 				waiting._bit[ctxt->taskId] = true;
-				signalled._bit[ctxt->taskId] = false;
-				ready._bit[ctxt->taskId] = false;
+				signalled0._bit[ctxt->taskId] = false;
+				ready0._bit[ctxt->taskId] = false;
 			}
 			break;
+		}
 		case SPURS_TASKSET_REQUEST_POLL:
-			readyButNotRunning = ready & ~running;
+		{
+			readyButNotRunning = v128::andnot(running, ready0);
 			if (taskset->wkl_flag_wait_task < CELL_SPURS_MAX_TASK)
 			{
-				readyButNotRunning = readyButNotRunning & ~(v128::fromBit(taskset->wkl_flag_wait_task));
+				readyButNotRunning._bit[taskset->wkl_flag_wait_task] = false;
 			}
 
-			rc = readyButNotRunning != _0 ? 1 : 0;
+			rc = readyButNotRunning != v128{} ? 1 : 0;
 			break;
+		}
 		case SPURS_TASKSET_REQUEST_WAIT_WKL_FLAG:
+		{
 			if (taskset->wkl_flag_wait_task == 0x81)
 			{
 				// A workload flag is already pending so consume it
@@ -1493,11 +1516,13 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i
 				rc = CELL_SPURS_TASK_ERROR_BUSY;
 			}
 			break;
+		}
 		case SPURS_TASKSET_REQUEST_SELECT_TASK:
-			readyButNotRunning = ready & ~running;
+		{
+			readyButNotRunning = v128::andnot(running, ready0);
 			if (taskset->wkl_flag_wait_task < CELL_SPURS_MAX_TASK)
 			{
-				readyButNotRunning = readyButNotRunning & ~(v128::fromBit(taskset->wkl_flag_wait_task));
+				readyButNotRunning._bit[taskset->wkl_flag_wait_task] = false;
 			}
 
 			// Select a task from the readyButNotRunning set to run. Start from the task after the last scheduled task to ensure fairness.
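The TODO above notes that the per-bit counting loop could collapse into std::popcount once it is known to be reliable. A sketch of that replacement under illustrative names, treating the 128-bit task mask as two u64 halves:

#include <bit>
#include <cstdint>

// Count newly ready tasks: bits set in (signalled | pending_ready)
// but clear in ready, i.e. v128::andnot(ready, signalled | pready),
// which the patched loop still evaluates bit by bit.
int count_newly_ready(uint64_t sig_hi, uint64_t sig_lo,
                      uint64_t pready_hi, uint64_t pready_lo,
                      uint64_t ready_hi, uint64_t ready_lo)
{
	const uint64_t hi = (sig_hi | pready_hi) & ~ready_hi;
	const uint64_t lo = (sig_lo | pready_lo) & ~ready_lo;
	return std::popcount(hi) + std::popcount(lo);
}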
@@ -1534,7 +1559,9 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i
 				waiting._bit[selectedTaskId] = false;
 			}
 			break;
+		}
 		case SPURS_TASKSET_REQUEST_RECV_WKL_FLAG:
+		{
 			if (taskset->wkl_flag_wait_task < CELL_SPURS_MAX_TASK)
 			{
 				// There is a task waiting for the workload flag
@@ -1549,41 +1576,36 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i
 				rc = 0;
 			}
 			break;
+		}
 		default:
 			spu_log.error("Unknown taskset request");
 			spursHalt(spu);
 		}
 
-		taskset->pending_ready = _0;
-		taskset->running       = running;
-		taskset->waiting       = waiting;
-		taskset->enabled       = enabled;
-		taskset->signalled     = signalled;
-		taskset->ready         = ready;
+		vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::waiting)) = waiting;
+		vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::running)) = running;
+		vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::ready)) = ready;
+		vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::pending_ready)) = v128{};
+		vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::enabled)) = enabled;
+		vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::signalled)) = signalled;
 
-		std::memcpy(spu._ptr<void>(0x2700), taskset, 128);
+		std::memcpy(spu._ptr<void>(0x2700), spu._ptr<void>(0x100), 128); // Copy data
 	}//);
 
 	// Increment the ready count of the workload by the number of tasks that have become ready
-	//vm::reservation_op(vm::cast(kernelCtxt->spurs.addr(), HERE), 128, [&]()
+	if (numNewlyReadyTasks)
 	{
-		auto spurs = kernelCtxt->spurs.get_ptr();
+		auto spurs = kernelCtxt->spurs;
 
-		s32 readyCount = kernelCtxt->wklCurrentId < CELL_SPURS_MAX_WORKLOAD ? spurs->wklReadyCount1[kernelCtxt->wklCurrentId].load() : spurs->wklIdleSpuCountOrReadyCount2[kernelCtxt->wklCurrentId & 0x0F].load();
-		readyCount += numNewlyReadyTasks;
-		readyCount  = readyCount < 0 ? 0 : readyCount > 0xFF ? 0xFF : readyCount;
-
-		if (kernelCtxt->wklCurrentId < CELL_SPURS_MAX_WORKLOAD)
+		auto [res, rtime] = vm::reservation_lock(spurs.addr(), 128, vm::dma_lockb);
+		spurs->readyCount(kernelCtxt->wklCurrentId).fetch_op([&](u8& val)
 		{
-			spurs->wklReadyCount1[kernelCtxt->wklCurrentId] = readyCount;
-		}
-		else
-		{
-			spurs->wklIdleSpuCountOrReadyCount2[kernelCtxt->wklCurrentId & 0x0F] = readyCount;
-		}
+			const s32 _new = val + numNewlyReadyTasks;
+			val = static_cast<u8>(std::clamp(_new, 0, 0xFF));
+		});
 
-		std::memcpy(spu._ptr<void>(0x100), spurs, 128);
-	}//);
+		res.release(rtime + 128);
+	}
 
 	return rc;
 }
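The ready-count update at the end of the last hunk performs a saturating add inside a single atomic fetch_op, clamping the result to [0, 0xFF] instead of the old load-modify-store sequence. The same idea with a plain std::atomic compare-exchange loop (illustrative names, not the emulator's API):

#include <algorithm>
#include <atomic>
#include <cstdint>

// Saturating add on an 8-bit ready count, clamped to [0, 0xFF],
// applied atomically like the fetch_op in the patch (delta may be
// negative, e.g. after DESTROY_TASK or WAIT_SIGNAL).
void add_ready_count(std::atomic<uint8_t>& count, int delta)
{
	uint8_t old = count.load();
	uint8_t desired;

	do
	{
		desired = static_cast<uint8_t>(std::clamp(old + delta, 0, 0xFF));
	}
	while (!count.compare_exchange_weak(old, desired));
}

Doing the clamp inside the retry loop means a concurrent update never pushes the counter past its bounds, which the old read-then-write sequence could not guarantee.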