mfc: Fix support for the list transfer stall bit, with partial support for out-of-order execution. Also give sync commands a size so they are properly detected by queue checks

This commit is contained in:
Jake 2017-11-30 20:50:01 -06:00 committed by kd-11
parent 8b476b5bfa
commit 34e01ba3d8
3 changed files with 135 additions and 106 deletions
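
The core of this change is the gating rule the reworked queue loop applies before executing each queued command: an incomplete (stalled) list with a barrier blocks every later command on the same tag, and a fenced command must wait behind any incomplete command on its tag, while unrelated tags may proceed out of order. Below is a minimal sketch of that rule under simplified assumptions; queued_cmd, BARRIER_BIT, FENCE_BIT and process_queue are illustrative stand-ins, not the emulator's spu_mfc_cmd or MFC_*_MASK definitions.

#include <cstdint>
#include <vector>

// Simplified stand-in for a queued MFC command (illustration only).
struct queued_cmd
{
    std::uint32_t cmd;  // command word, may carry barrier/fence bits
    std::uint32_t tag;  // tag group, 0..31
    std::uint32_t size; // non-zero while the transfer is incomplete
};

constexpr std::uint32_t BARRIER_BIT = 0x10; // placeholder for MFC_BARRIER_MASK
constexpr std::uint32_t FENCE_BIT   = 0x20; // placeholder for MFC_FENCE_MASK

// Walk the queue front to back; skip commands whose tag is blocked by an earlier
// incomplete command with a barrier, or (for fenced commands) by any earlier
// incomplete command on the same tag.
void process_queue(std::vector<queued_cmd>& queue)
{
    std::uint32_t barrier_mask = 0;
    std::uint32_t fence_mask = 0;

    for (auto& cmd : queue)
    {
        const std::uint32_t tag_bit = 1u << cmd.tag;

        const bool blocked_by_barrier = (barrier_mask & tag_bit) != 0;
        const bool blocked_by_fence = (cmd.cmd & FENCE_BIT) && (fence_mask & tag_bit);

        if (cmd.size != 0 && (blocked_by_barrier || blocked_by_fence))
        {
            continue; // leave this command queued; later tags may still run
        }

        // ... execute the transfer here; a stall-and-notify leaves it incomplete ...

        if (cmd.size != 0)
        {
            // Still incomplete (e.g. a stalled list): record what it blocks behind it.
            if (cmd.cmd & BARRIER_BIT)
                barrier_mask |= tag_bit;
            else
                fence_mask |= tag_bit;
        }
    }
}
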

View File

@@ -142,107 +142,142 @@ void mfc_thread::cpu_task()
if (queue_size)
{
auto& cmd = spu.mfc_queue[0];
if ((cmd.cmd & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK)) == MFC_PUTQLLUC_CMD)
u32 fence_mask = 0; // Using this instead of stall_mask to avoid a possible race condition
u32 barrier_mask = 0;
bool first = true;
for (u32 i = 0; i < spu.mfc_queue.size(); i++, first = false)
{
auto& data = vm::ps3::_ref<decltype(spu.rdata)>(cmd.eal);
const auto to_write = spu._ref<decltype(spu.rdata)>(cmd.lsa & 0x3ffff);
auto& cmd = spu.mfc_queue[i];
cmd.size = 0;
no_updates = 0;
// This check revolves around a potential 'stalled list' in the queue, as it's the one thing that can currently cause out-of-order MFC list execution.
// A list with a barrier hard-blocks that tag until it has been dealt with,
// and a new command that has a fence can't be executed until the stalled list has been dealt with.
if ((cmd.size != 0) && ((barrier_mask & (1u << cmd.tag)) || ((cmd.cmd & MFC_FENCE_MASK) && ((1 << cmd.tag) & fence_mask))))
continue;
vm::reservation_acquire(cmd.eal, 128);
// Store unconditionally
if (s_use_rtm && utils::transaction_enter())
if ((cmd.cmd & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK)) == MFC_PUTQLLUC_CMD)
{
if (!vm::reader_lock{vm::try_to_lock})
{
_xabort(0);
}
auto& data = vm::ps3::_ref<decltype(spu.rdata)>(cmd.eal);
const auto to_write = spu._ref<decltype(spu.rdata)>(cmd.lsa & 0x3ffff);
data = to_write;
vm::reservation_update(cmd.eal, 128);
vm::notify(cmd.eal, 128);
_xend();
}
else
{
vm::writer_lock lock(0);
data = to_write;
vm::reservation_update(cmd.eal, 128);
vm::notify(cmd.eal, 128);
}
}
else if (cmd.cmd & MFC_LIST_MASK)
{
struct list_element
{
be_t<u16> sb; // Stall-and-Notify bit (0x8000)
be_t<u16> ts; // List Transfer Size
be_t<u32> ea; // External Address Low
};
if (cmd.size && (spu.ch_stall_mask & (1u << cmd.tag)) == 0)
{
cmd.lsa &= 0x3fff0;
const list_element item = spu._ref<list_element>(cmd.eal & 0x3fff8);
const u32 size = item.ts;
const u32 addr = item.ea;
if (size)
{
spu_mfc_cmd transfer;
transfer.eal = addr;
transfer.eah = 0;
transfer.lsa = cmd.lsa | (addr & 0xf);
transfer.tag = cmd.tag;
transfer.cmd = MFC(cmd.cmd & ~MFC_LIST_MASK);
transfer.size = size;
spu.do_dma_transfer(transfer);
cmd.lsa += std::max<u32>(size, 16);
}
cmd.eal += 8;
cmd.size -= 8;
cmd.size = 0;
no_updates = 0;
if (item.sb & 0x8000)
vm::reservation_acquire(cmd.eal, 128);
// Store unconditionally
if (s_use_rtm && utils::transaction_enter())
{
spu.ch_stall_stat.push_or(spu, 1 << cmd.tag);
const u32 evt = spu.ch_event_stat.fetch_or(SPU_EVENT_SN);
if (evt & SPU_EVENT_WAITING)
if (!vm::reader_lock{ vm::try_to_lock })
{
spu.notify();
}
else if (evt & SPU_EVENT_INTR_ENABLED)
{
spu.state += cpu_flag::suspend;
_xabort(0);
}
data = to_write;
vm::reservation_update(cmd.eal, 128);
vm::notify(cmd.eal, 128);
_xend();
}
else
{
vm::writer_lock lock(0);
data = to_write;
vm::reservation_update(cmd.eal, 128);
vm::notify(cmd.eal, 128);
}
}
}
else if (LIKELY(cmd.size))
{
spu.do_dma_transfer(cmd);
cmd.size = 0;
}
else if (UNLIKELY((cmd.cmd & ~0xc) == MFC_BARRIER_CMD))
{
// TODO (MFC_BARRIER_CMD, MFC_EIEIO_CMD, MFC_SYNC_CMD)
_mm_mfence();
}
else if (cmd.cmd & MFC_LIST_MASK)
{
struct list_element
{
be_t<u16> sb; // Stall-and-Notify bit (0x8000)
be_t<u16> ts; // List Transfer Size
be_t<u32> ea; // External Address Low
};
if (!cmd.size)
{
spu.mfc_queue.end_pop();
no_updates = 0;
if (cmd.size && (spu.ch_stall_mask & (1u << cmd.tag)) == 0)
{
cmd.lsa &= 0x3fff0;
// try to get the whole list done in one go
while (cmd.size != 0)
{
const list_element item = spu._ref<list_element>(cmd.eal & 0x3fff8);
const u32 size = item.ts;
const u32 addr = item.ea;
if (size)
{
spu_mfc_cmd transfer;
transfer.eal = addr;
transfer.eah = 0;
transfer.lsa = cmd.lsa | (addr & 0xf);
transfer.tag = cmd.tag;
transfer.cmd = MFC(cmd.cmd & ~MFC_LIST_MASK);
transfer.size = size;
spu.do_dma_transfer(transfer);
cmd.lsa += std::max<u32>(size, 16);
}
cmd.eal += 8;
cmd.size -= 8;
no_updates = 0;
// Don't stall for the last 'item' in the list
if ((item.sb & 0x8000) && (cmd.size != 0))
{
spu.ch_stall_mask |= (1 << cmd.tag);
spu.ch_stall_stat.push_or(spu, 1 << cmd.tag);
const u32 evt = spu.ch_event_stat.fetch_or(SPU_EVENT_SN);
if (evt & SPU_EVENT_WAITING)
{
spu.notify();
}
break;
}
}
}
if (cmd.size != 0 && (cmd.cmd & MFC_BARRIER_MASK))
barrier_mask |= (1 << cmd.tag);
else if (cmd.size != 0)
fence_mask |= (1 << cmd.tag);
}
else if (UNLIKELY((cmd.cmd & ~0xc) == MFC_BARRIER_CMD))
{
// Raw barrier / sync commands are tag-agnostic and hard-sync the MFC list
// Need to guarantee everything ahead of this has been processed before it executes
if (first)
cmd.size = 0;
else
break;
}
else if (LIKELY(cmd.size))
{
spu.do_dma_transfer(cmd);
cmd.size = 0;
}
if (!cmd.size && first)
{
spu.mfc_queue.end_pop();
no_updates = 0;
break;
}
else if (!cmd.size && i == 1)
{
// Nasty hack: shove the stalled list down one slot.
// This *works* because the only thing that could have been passed over in position 0 is a stalled list.
// TODO: this can still report the MFC queue as full when it actually isn't, which can cause a rough deadlock between the SPU and the MFC:
// the SPU ends up waiting for the queue to open up but hasn't signaled the stall yet.
spu.mfc_queue[1] = spu.mfc_queue[0];
spu.mfc_queue.end_pop();
no_updates = 0;
break;
}
}
}
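
The slot shuffle near the end of the loop above (copying spu.mfc_queue[0] into slot 1 and then calling end_pop) exists because the queue only pops from the front: when the command in slot 1 completes while a stalled list still occupies slot 0, the stalled list is copied down into the freed slot and the front is popped, which effectively removes the completed command. A toy illustration of that idea, assuming a plain std::deque rather than the emulator's queue type:

#include <deque>

// Toy stand-in: only the front element can be popped, mirroring end_pop().
// Precondition: queue[0] is a stalled (incomplete) list and queue[1] has just completed.
template <typename Cmd>
void pop_completed_second(std::deque<Cmd>& queue)
{
    queue[1] = queue[0]; // move the stalled list into the slot that just finished
    queue.pop_front();   // popping the front now discards the completed command
}
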
@@ -252,26 +287,21 @@ void mfc_thread::cpu_task()
{
// Mask incomplete transfers
u32 completed = spu.ch_tag_mask;
for (u32 i = 0; i < spu.mfc_queue.size(); i++)
{
const auto& _cmd = spu.mfc_queue[i];
if (_cmd.size)
for (u32 i = 0; i < spu.mfc_queue.size(); i++)
{
if (spu.ch_tag_upd == 1)
{
const auto& _cmd = spu.mfc_queue[i];
if (_cmd.size)
completed &= ~(1u << _cmd.tag);
}
else
{
completed = 0;
break;
}
}
}
if (completed && spu.ch_tag_upd.exchange(0))
if (completed && spu.ch_tag_upd.compare_and_swap_test(1, 0))
{
spu.ch_tag_stat.push(spu, completed);
no_updates = 0;
}
else if (completed && spu.ch_tag_mask == completed && spu.ch_tag_upd.compare_and_swap_test(2, 0))
{
spu.ch_tag_stat.push(spu, completed);
no_updates = 0;
@@ -280,7 +310,6 @@ void mfc_thread::cpu_task()
test_state();
}
if (no_updates++)
{
if (no_updates >= 3)
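
The tag-update path above only treats a tag group as complete when no queued command on that tag still has a non-zero size, which is also why the second file below now gives MFC_EIEIO_CMD / MFC_SYNC_CMD a size of 1: they stay visible to this check until the queue has processed them. A minimal sketch of that completion mask, using simplified stand-in types rather than the real spu_mfc_cmd:

#include <cstdint>
#include <vector>

// Illustrative stand-in; the real code walks spu.mfc_queue and uses
// spu.ch_tag_mask / spu.ch_tag_upd (1 = notify on any tag, 2 = notify on all).
struct pending_cmd
{
    std::uint32_t tag;
    std::uint32_t size; // non-zero while the command is still pending
};

// Mask of tag groups with no incomplete command left in the queue.
std::uint32_t completed_tags(std::uint32_t tag_mask, const std::vector<pending_cmd>& queue)
{
    std::uint32_t completed = tag_mask;

    for (const auto& cmd : queue)
    {
        if (cmd.size != 0) // sync commands now count here too, since they carry size = 1
        {
            completed &= ~(1u << cmd.tag);
        }
    }

    return completed;
}
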

View File

@@ -815,7 +815,7 @@ void SPUThread::process_mfc_cmd()
case MFC_GETLB_CMD:
case MFC_GETLF_CMD:
{
if (ch_mfc_cmd.size <= max_imm_dma_size && mfc_queue.size() == 0 && (ch_stall_mask & (1u << ch_mfc_cmd.tag)) == 0)
if (ch_mfc_cmd.size <= max_imm_dma_size && mfc_queue.size() == 0)
{
vm::reader_lock lock(vm::try_to_lock);
@@ -890,7 +890,7 @@ void SPUThread::process_mfc_cmd()
case MFC_EIEIO_CMD:
case MFC_SYNC_CMD:
{
ch_mfc_cmd.size = 0;
ch_mfc_cmd.size = 1;
if (mfc_queue.size() == 0)
{

View File

@@ -283,7 +283,7 @@ struct cfg_root : cfg::node
cfg::_bool bind_spu_cores{this, "Bind SPU threads to secondary cores"};
cfg::_bool lower_spu_priority{this, "Lower SPU thread priority"};
cfg::_bool spu_debug{this, "SPU Debug"};
cfg::_int<32, 16384> max_spu_immediate_write_size{this, "Maximum immediate DMA write size", 16384}; // Maximum size that an SPU thread can write directly without posting to MFC
cfg::_int<0, 16384> max_spu_immediate_write_size{this, "Maximum immediate DMA write size", 16384}; // Maximum size that an SPU thread can write directly without posting to MFC
cfg::_int<0, 6> preferred_spu_threads{this, "Preferred SPU Threads", 0}; //Number of hardware threads dedicated to heavy simultaneous SPU tasks
cfg::_int<0, 16> spu_delay_penalty{this, "SPU delay penalty", 3}; //Number of milliseconds to block a thread if a virtual 'core' isn't free
cfg::_bool spu_loop_detection{this, "SPU loop detection", true}; //Try to detect wait loops and trigger thread yield
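
Widening the lower bound of "Maximum immediate DMA write size" from 32 to 0 lets the setting disable immediate writes entirely, so every transfer is posted to the MFC queue instead. A hedged sketch of the decision this setting drives (the real gate is the ch_mfc_cmd.size <= max_imm_dma_size && mfc_queue.size() == 0 check shown in the second file; should_transfer_immediately is a hypothetical helper):

#include <cstddef>
#include <cstdint>

// Hypothetical helper for illustration; max_imm_dma_size is the value of the
// "Maximum immediate DMA write size" setting (0..16384, 0 = never immediate).
bool should_transfer_immediately(std::uint32_t cmd_size,
                                 std::size_t queue_depth,
                                 std::uint32_t max_imm_dma_size)
{
    // Immediate DMA only when the transfer fits under the limit and nothing is
    // already queued; with the setting at 0, every command goes through the queue.
    return cmd_size <= max_imm_dma_size && queue_depth == 0;
}
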