From ec8c5d4bb6e23cb464d002047f4db2b1266113ca Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Sun, 13 Oct 2019 16:48:11 +1000 Subject: [PATCH] DMA: Batch multi-word transfers together --- src/core/bus.cpp | 54 +++++++++++- src/core/bus.h | 10 ++- src/core/cdrom.cpp | 27 ++---- src/core/cdrom.h | 2 +- src/core/dma.cpp | 203 ++++++++++++++++++++++++--------------------- src/core/dma.h | 6 +- src/core/gpu.cpp | 54 +++++++++--- src/core/gpu.h | 5 +- src/core/mdec.cpp | 11 ++- src/core/mdec.h | 4 +- src/core/spu.cpp | 42 ++++++++-- src/core/spu.h | 4 +- 12 files changed, 273 insertions(+), 149 deletions(-) diff --git a/src/core/bus.cpp b/src/core/bus.cpp index a1cb790d6..56e9d5f6f 100644 --- a/src/core/bus.cpp +++ b/src/core/bus.cpp @@ -17,7 +17,7 @@ Log_SetChannel(Bus); #define FIXUP_WORD_READ_OFFSET(offset) ((offset) & ~u32(3)) -#define FIXUP_WORD_READ_VALUE(offset, value) ((value) >> (((offset) & u32(3)) * 8)) +#define FIXUP_WORD_READ_VALUE(offset, value) ((value) >> (((offset)&u32(3)) * 8)) // Offset and value remapping for (w32) registers from nocash docs. void FixupUnalignedWordAccessW32(u32& offset, u32& value) @@ -118,6 +118,58 @@ bool Bus::WriteWord(PhysicalMemoryAddress address, u32 value) return DispatchAccess(address, value); } +TickCount Bus::ReadWords(PhysicalMemoryAddress address, u32* words, u32 word_count) +{ + if (address + (word_count * sizeof(u32)) > (RAM_BASE + RAM_SIZE)) + { + // Not RAM, or RAM mirrors. + TickCount total_ticks = 0; + for (u32 i = 0; i < word_count; i++) + { + const TickCount ticks = DispatchAccess(address, words[i]); + if (ticks < 0) + return -1; + + total_ticks += ticks; + address += sizeof(u32); + } + + return total_ticks; + } + + + // DMA is using DRAM Hyper Page mode, allowing it to access DRAM rows at 1 clock cycle per word (effectively around 17 + // clks per 16 words, due to required row address loading, probably plus some further minimal overload due to refresh + // cycles). 
This is making DMA much faster than CPU memory accesses (CPU DRAM access takes 1 opcode cycle plus 6 + // waitstates, ie. 7 cycles in total). + std::memcpy(words, &m_ram[address], sizeof(u32) * word_count); + return static_cast(word_count + ((word_count + 15) / 16)); +} + +TickCount Bus::WriteWords(PhysicalMemoryAddress address, const u32* words, u32 word_count) +{ + if (address + (word_count * sizeof(u32)) > (RAM_BASE + RAM_SIZE)) + { + // Not RAM, or RAM mirrors. + TickCount total_ticks = 0; + for (u32 i = 0; i < word_count; i++) + { + u32 value = words[i]; + const TickCount ticks = DispatchAccess(address, value); + if (ticks < 0) + return -1; + + total_ticks += ticks; + address += sizeof(u32); + } + + return total_ticks; + } + + std::memcpy(&m_ram[address], words, sizeof(u32) * word_count); + return static_cast(word_count + ((word_count + 15) / 16)); +} + void Bus::PatchBIOS(u32 address, u32 value, u32 mask /*= UINT32_C(0xFFFFFFFF)*/) { const u32 phys_address = address & UINT32_C(0x1FFFFFFF); diff --git a/src/core/bus.h b/src/core/bus.h index 6456a8df8..8c918876c 100644 --- a/src/core/bus.h +++ b/src/core/bus.h @@ -38,16 +38,24 @@ public: bool WriteByte(PhysicalMemoryAddress address, u8 value); bool WriteHalfWord(PhysicalMemoryAddress address, u16 value); bool WriteWord(PhysicalMemoryAddress address, u32 value); - + template TickCount DispatchAccess(PhysicalMemoryAddress address, u32& value); + // Optimized variant for burst/multi-word read/writing. 
+ TickCount ReadWords(PhysicalMemoryAddress address, u32* words, u32 word_count); + TickCount WriteWords(PhysicalMemoryAddress address, const u32* words, u32 word_count); + void PatchBIOS(u32 address, u32 value, u32 mask = UINT32_C(0xFFFFFFFF)); void SetExpansionROM(std::vector data); private: enum : u32 { + RAM_BASE = 0x00000000, + RAM_SIZE = 0x200000, + RAM_MASK = RAM_SIZE - 1, + RAM_MIRROR_END = 0x800000, EXP1_BASE = 0x1F000000, EXP1_SIZE = 0x800000, EXP1_MASK = EXP1_SIZE - 1, diff --git a/src/core/cdrom.cpp b/src/core/cdrom.cpp index 2c8a9a06f..052ae4cf9 100644 --- a/src/core/cdrom.cpp +++ b/src/core/cdrom.cpp @@ -348,30 +348,17 @@ void CDROM::WriteRegister(u32 offset, u8 value) ZeroExtend32(m_status.index.GetValue()), ZeroExtend32(value)); } -u32 CDROM::DMARead() +void CDROM::DMARead(u32* words, u32 word_count) { - if (m_data_fifo.IsEmpty()) + const u32 words_in_fifo = m_data_fifo.GetSize() / 4; + if (words_in_fifo < word_count) { - Log_ErrorPrintf("DMA read on empty data FIFO"); - return UINT32_C(0xFFFFFFFF); + Log_ErrorPrintf("DMA read on empty/near-empty data FIFO"); + std::memset(words + words_in_fifo, 0, sizeof(u32) * (word_count - words_in_fifo)); } - u32 data; - if (m_data_fifo.GetSize() >= sizeof(data)) - { - std::memcpy(&data, m_data_fifo.GetFrontPointer(), sizeof(data)); - m_data_fifo.Remove(sizeof(data)); - } - else - { - Log_WarningPrintf("Unaligned DMA read on FIFO(%u)", m_data_fifo.GetSize()); - data = 0; - std::memcpy(&data, m_data_fifo.GetFrontPointer(), m_data_fifo.GetSize()); - m_data_fifo.Clear(); - } - - // Log_DebugPrintf("DMA Read -> 0x%08X (%u remaining)", data, m_data_fifo.GetSize()); - return data; + const u32 bytes_to_read = std::min(word_count * sizeof(u32), m_data_fifo.GetSize()); + m_data_fifo.PopRange(reinterpret_cast(words), bytes_to_read); } void CDROM::SetInterrupt(Interrupt interrupt) diff --git a/src/core/cdrom.h b/src/core/cdrom.h index 021323ab8..c700fd405 100644 --- a/src/core/cdrom.h +++ b/src/core/cdrom.h @@ -29,7 +29,7 
@@ public: // I/O u8 ReadRegister(u32 offset); void WriteRegister(u32 offset, u8 value); - u32 DMARead(); + void DMARead(u32* words, u32 word_count); void Execute(TickCount ticks); diff --git a/src/core/dma.cpp b/src/core/dma.cpp index daf2549a0..0c2564936 100644 --- a/src/core/dma.cpp +++ b/src/core/dma.cpp @@ -24,6 +24,7 @@ bool DMA::Initialize(System* system, Bus* bus, InterruptController* interrupt_co m_cdrom = cdrom; m_spu = spu; m_mdec = mdec; + m_transfer_buffer.resize(32); return true; } @@ -223,7 +224,7 @@ void DMA::TransferChannel(Channel channel) // start/trigger bit is cleared on beginning of transfer cs.channel_control.start_trigger = false; - PhysicalMemoryAddress current_address = cs.base_address & ~UINT32_C(3); + PhysicalMemoryAddress current_address = (cs.base_address & ~UINT32_C(3)) & ADDRESS_MASK; const PhysicalMemoryAddress increment = cs.channel_control.address_step_reverse ? static_cast(-4) : UINT32_C(4); switch (cs.channel_control.sync_mode) { @@ -233,32 +234,9 @@ void DMA::TransferChannel(Channel channel) Log_DebugPrintf("DMA%u: Copying %u words %s 0x%08X", static_cast(channel), word_count, copy_to_device ? 
"from" : "to", current_address); if (copy_to_device) - { - u32 words_remaining = word_count; - do - { - words_remaining--; - - u32 value = 0; - m_bus->DispatchAccess(current_address, value); - DMAWrite(channel, value, current_address, words_remaining); - - current_address = (current_address + increment) & ADDRESS_MASK; - } while (words_remaining > 0); - } + TransferMemoryToDevice(channel, current_address, increment, word_count); else - { - u32 words_remaining = word_count; - do - { - words_remaining--; - - u32 value = DMARead(channel, current_address, words_remaining); - m_bus->DispatchAccess(current_address, value); - - current_address = (current_address + increment) & ADDRESS_MASK; - } while (words_remaining > 0); - } + TransferDeviceToMemory(channel, current_address, increment, word_count); } break; @@ -285,18 +263,7 @@ void DMA::TransferChannel(Channel channel) current_address += sizeof(header); if (word_count > 0) - { - u32 words_remaining = word_count; - do - { - words_remaining--; - - u32 memory_value = 0; - m_bus->DispatchAccess(current_address, memory_value); - DMAWrite(channel, memory_value, current_address, words_remaining); - current_address = (current_address + UINT32_C(4)) & ADDRESS_MASK; - } while (words_remaining > 0); - } + TransferMemoryToDevice(channel, current_address, 4, word_count); if (next_address & UINT32_C(0x800000)) break; @@ -313,6 +280,7 @@ void DMA::TransferChannel(Channel channel) cs.block_control.request.GetBlockCount(), cs.block_control.request.GetBlockSize(), copy_to_device ? 
"from" : "to", current_address); + const u32 block_size = cs.block_control.request.GetBlockSize(); u32 blocks_remaining = cs.block_control.request.block_count; if (copy_to_device) @@ -320,18 +288,8 @@ void DMA::TransferChannel(Channel channel) do { blocks_remaining--; - - u32 words_remaining = cs.block_control.request.block_size; - do - { - words_remaining--; - - u32 value = 0; - m_bus->DispatchAccess(current_address, value); - DMAWrite(channel, value, current_address, words_remaining); - - current_address = (current_address + increment) & ADDRESS_MASK; - } while (words_remaining > 0); + TransferMemoryToDevice(channel, current_address, increment, block_size); + current_address = (current_address + (increment * block_size)) & ADDRESS_MASK; } while (cs.request && blocks_remaining > 0); } else @@ -339,17 +297,8 @@ void DMA::TransferChannel(Channel channel) do { blocks_remaining--; - - u32 words_remaining = cs.block_control.request.block_size; - do - { - words_remaining--; - - u32 value = DMARead(channel, current_address, words_remaining); - m_bus->DispatchAccess(current_address, value); - - current_address = (current_address + increment) & ADDRESS_MASK; - } while (words_remaining > 0); + TransferDeviceToMemory(channel, current_address, increment, block_size); + current_address = (current_address + (increment * block_size)) & ADDRESS_MASK; } while (cs.request && blocks_remaining > 0); } @@ -382,56 +331,122 @@ void DMA::TransferChannel(Channel channel) } } -u32 DMA::DMARead(Channel channel, PhysicalMemoryAddress dst_address, u32 remaining_words) +void DMA::TransferMemoryToDevice(Channel channel, u32 address, u32 increment, u32 word_count) { + // Read from memory. Wrap-around? 
+ if (m_transfer_buffer.size() < word_count) + m_transfer_buffer.resize(word_count); + + if (increment > 0 && ((address + (increment * word_count)) & ADDRESS_MASK) > address) + { + m_bus->ReadWords(address, m_transfer_buffer.data(), word_count); + } + else + { + for (u32 i = 0; i < word_count; i++) + { + m_bus->DispatchAccess(address, m_transfer_buffer[i]); + address = (address + increment) & ADDRESS_MASK; + } + } + + switch (channel) + { + case Channel::GPU: + m_gpu->DMAWrite(m_transfer_buffer.data(), word_count); + break; + + case Channel::SPU: + m_spu->DMAWrite(m_transfer_buffer.data(), word_count); + break; + + case Channel::MDECin: + m_mdec->DMAWrite(m_transfer_buffer.data(), word_count); + break; + + case Channel::CDROM: + case Channel::MDECout: + case Channel::PIO: + default: + Panic("Unhandled DMA channel for device write"); + break; + } +} + +void DMA::TransferDeviceToMemory(Channel channel, u32 address, u32 increment, u32 word_count) +{ + if (m_transfer_buffer.size() < word_count) + m_transfer_buffer.resize(word_count); + + // Read from device. switch (channel) { case Channel::OTC: + { // clear ordering table - return (remaining_words == 0) ? UINT32_C(0xFFFFFF) : ((dst_address - UINT32_C(4)) & ADDRESS_MASK); + // this always goes in reverse, so we can generate values in reverse order and write it forwards + if (((address - (4 * word_count)) & ADDRESS_MASK) < address) + { + const u32 end_address = (address - (4 * (word_count - 1))) & ADDRESS_MASK; - case Channel::GPU: - return m_gpu->DMARead(); + u32 value = end_address; + m_transfer_buffer[0] = UINT32_C(0xFFFFFF); + for (u32 i = 1; i < word_count; i++) + { + m_transfer_buffer[i] = value; + value = (value + 4) & ADDRESS_MASK; + } - case Channel::CDROM: - return m_cdrom->DMARead(); + m_bus->WriteWords(end_address, m_transfer_buffer.data(), word_count); + } + else + { + for (u32 i = 0; i < word_count; i++) + { + u32 value = (i == word_count - 1) ? 
UINT32_C(0xFFFFFF) : ((address - 4) & ADDRESS_MASK); // last entry is the 0xFFFFFF terminator, same value the batched path writes
+          m_bus->DispatchAccess(address, value);
+          address = (address - 4) & ADDRESS_MASK;
+        }
+      }
 
-    case Channel::SPU:
-      return m_spu->DMARead();
-
-    case Channel::MDECout:
-      return m_mdec->DMARead();
-
-    case Channel::MDECin:
-    case Channel::PIO:
-    default:
-      Panic("Unhandled DMA channel read");
-      return UINT32_C(0xFFFFFFFF);
-  }
-}
-
-void DMA::DMAWrite(Channel channel, u32 value, PhysicalMemoryAddress src_address, u32 remaining_words)
-{
-  switch (channel)
-  {
-    case Channel::GPU:
-      m_gpu->DMAWrite(value);
       return;
+    }
+    break;
 
-    case Channel::SPU:
-      m_spu->DMAWrite(value);
+    case Channel::GPU:
+      m_gpu->DMARead(m_transfer_buffer.data(), word_count);
       break;
 
-    case Channel::MDECin:
-      m_mdec->DMAWrite(value);
+    case Channel::CDROM:
+      m_cdrom->DMARead(m_transfer_buffer.data(), word_count);
+      break;
+
+    case Channel::SPU:
+      m_spu->DMARead(m_transfer_buffer.data(), word_count);
       break;
 
     case Channel::MDECout:
-    case Channel::CDROM:
+      m_mdec->DMARead(m_transfer_buffer.data(), word_count);
+      break;
+
+    case Channel::MDECin:
     case Channel::PIO:
-    case Channel::OTC:
     default:
-      Panic("Unhandled DMA channel write");
+      Panic("Unhandled DMA channel for device read");
+      std::fill_n(m_transfer_buffer.begin(), word_count, UINT32_C(0xFFFFFFFF));
       break;
   }
+
+  if (increment > 0 && ((address + (increment * word_count)) & ADDRESS_MASK) > address) // no wrap: one batched write
+  {
+    m_bus->WriteWords(address, m_transfer_buffer.data(), word_count);
+  }
+  else
+  {
+    for (u32 i = 0; i < word_count; i++) // wrapping transfer: write word by word, masking each address
+    {
+      m_bus->DispatchAccess(address, m_transfer_buffer[i]);
+      address = (address + increment) & ADDRESS_MASK;
+    }
+  }
 }
diff --git a/src/core/dma.h b/src/core/dma.h
index f356add6a..42e4d3601 100644
--- a/src/core/dma.h
+++ b/src/core/dma.h
@@ -65,10 +65,10 @@ private:
   void TransferChannel(Channel channel);
 
   // from device -> memory
-  u32 DMARead(Channel channel, PhysicalMemoryAddress dst_address, u32 remaining_words);
+  void TransferDeviceToMemory(Channel channel, u32 
address, u32 increment, u32 word_count);
 
   // from memory -> device
-  void DMAWrite(Channel channel, u32 value, PhysicalMemoryAddress src_address, u32 remaining_words);
+  void TransferMemoryToDevice(Channel channel, u32 address, u32 increment, u32 word_count);
 
   System* m_system = nullptr;
   Bus* m_bus = nullptr;
@@ -81,6 +81,8 @@ private:
   TickCount m_transfer_ticks = 0;
   bool m_transfer_in_progress = false;
 
+  std::vector<u32> m_transfer_buffer; // staging buffer handed to the devices' multi-word DMARead/DMAWrite
+
   struct ChannelState
   {
     u32 base_address;
diff --git a/src/core/gpu.cpp b/src/core/gpu.cpp
index 0169b3411..e1674003f 100644
--- a/src/core/gpu.cpp
+++ b/src/core/gpu.cpp
@@ -217,29 +217,56 @@ void GPU::WriteRegister(u32 offset, u32 value)
   }
 }
 
-u32 GPU::DMARead()
+void GPU::DMARead(u32* words, u32 word_count)
 {
   if (m_GPUSTAT.dma_direction != DMADirection::GPUREADtoCPU)
   {
     Log_ErrorPrintf("Invalid DMA direction from GPU DMA read");
-    return UINT32_C(0xFFFFFFFF);
+    std::fill_n(words, word_count, UINT32_C(0xFFFFFFFF));
+    return;
   }
 
-  return ReadGPUREAD();
+  const u32 words_to_copy = std::min(word_count, static_cast<u32>(m_GPUREAD_buffer.size())); // never more than is buffered
+  if (!m_GPUREAD_buffer.empty())
+  {
+    auto it = m_GPUREAD_buffer.begin();
+    for (u32 i = 0; i < words_to_copy; i++) // bounded by the buffer size so the iterator cannot run past end()
+      words[i] = *(it++);
+
+    m_GPUREAD_buffer.erase(m_GPUREAD_buffer.begin(), it);
+  }
+  if (words_to_copy < word_count)
+  {
+    Log_WarningPrintf("Partially-empty GPUREAD buffer on GPU DMA read");
+    std::fill_n(words + words_to_copy, word_count - words_to_copy, u32(0));
+  }
+
+  UpdateGPUSTAT();
 }
 
-void GPU::DMAWrite(u32 value)
+void GPU::DMAWrite(const u32* words, u32 word_count)
 {
   switch (m_GPUSTAT.dma_direction)
   {
     case DMADirection::CPUtoGP0:
-      WriteGP0(value);
-      break;
+    {
+      m_GP0_command.reserve(m_GP0_command.size() + word_count);
+      for (u32 i = 0; i < word_count; i++)
+      {
+        m_GP0_command.push_back(*(words++));
+        HandleGP0Command();
+      }
+
+      UpdateGPUSTAT();
+    }
+    break;
 
     default:
-      Log_ErrorPrintf("Unhandled GPU DMA write mode %u for value %08X",
-                      
static_cast(m_GPUSTAT.dma_direction.GetValue()), value); - break; + { + Log_ErrorPrintf("Unhandled GPU DMA write mode %u for %u words", + static_cast(m_GPUSTAT.dma_direction.GetValue()), word_count); + } + break; } } @@ -369,10 +396,14 @@ void GPU::WriteGP0(u32 value) { m_GP0_command.push_back(value); Assert(m_GP0_command.size() <= 1048576); + HandleGP0Command(); + UpdateGPUSTAT(); +} +void GPU::HandleGP0Command() +{ const u8 command = Truncate8(m_GP0_command[0] >> 24); const u32 param = m_GP0_command[0] & UINT32_C(0x00FFFFFF); - UpdateGPUSTAT(); if (command >= 0x20 && command <= 0x7F) { @@ -432,7 +463,7 @@ void GPU::WriteGP0(u32 value) case 0xE2: // set texture window { - m_render_state.SetTextureWindow(value); + m_render_state.SetTextureWindow(param); Log_DebugPrintf("Set texture window %02X %02X %02X %02X", m_render_state.texture_window_mask_x, m_render_state.texture_window_mask_y, m_render_state.texture_window_offset_x, m_render_state.texture_window_offset_y); @@ -504,7 +535,6 @@ void GPU::WriteGP0(u32 value) } m_GP0_command.clear(); - UpdateGPUSTAT(); } void GPU::WriteGP1(u32 value) diff --git a/src/core/gpu.h b/src/core/gpu.h index 48017adfa..0c8f99bf4 100644 --- a/src/core/gpu.h +++ b/src/core/gpu.h @@ -52,8 +52,8 @@ public: void WriteRegister(u32 offset, u32 value); // DMA access - u32 DMARead(); - void DMAWrite(u32 value); + void DMARead(u32* words, u32 word_count); + void DMAWrite(const u32* words, u32 word_count); // gpu_hw_opengl.cpp static std::unique_ptr CreateHardwareOpenGLRenderer(); @@ -185,6 +185,7 @@ protected: void HandleGetGPUInfoCommand(u32 value); // Rendering commands, returns false if not enough data is provided + void HandleGP0Command(); bool HandleRenderCommand(); bool HandleFillRectangleCommand(); bool HandleCopyRectangleCPUToVRAMCommand(); diff --git a/src/core/mdec.cpp b/src/core/mdec.cpp index a362c4e04..885c1c9a7 100644 --- a/src/core/mdec.cpp +++ b/src/core/mdec.cpp @@ -97,14 +97,17 @@ void MDEC::WriteRegister(u32 offset, u32 value) 
} } -u32 MDEC::DMARead() +void MDEC::DMARead(u32* words, u32 word_count) { - return ReadDataRegister(); + // TODO: Make faster + for (u32 i= 0; i < word_count; i++) + words[i] = ReadDataRegister(); } -void MDEC::DMAWrite(u32 value) +void MDEC::DMAWrite(const u32* words, u32 word_count) { - WriteCommandRegister(value); + for (u32 i = 0; i < word_count; i++) + WriteCommandRegister(words[i]); } void MDEC::SoftReset() diff --git a/src/core/mdec.h b/src/core/mdec.h index e41748664..f9bdba191 100644 --- a/src/core/mdec.h +++ b/src/core/mdec.h @@ -23,8 +23,8 @@ public: u32 ReadRegister(u32 offset); void WriteRegister(u32 offset, u32 value); - u32 DMARead(); - void DMAWrite(u32 value); + void DMARead(u32* words, u32 word_count); + void DMAWrite(const u32* words, u32 word_count); void DrawDebugMenu(); void DrawDebugWindow(); diff --git a/src/core/spu.cpp b/src/core/spu.cpp index 0d599ecc7..bbabe76b3 100644 --- a/src/core/spu.cpp +++ b/src/core/spu.cpp @@ -412,18 +412,44 @@ void SPU::WriteVoiceRegister(u32 offset, u16 value) } } -u32 SPU::DMARead() +void SPU::DMARead(u32* words, u32 word_count) { - const u16 lsb = RAMTransferRead(); - const u16 msb = RAMTransferRead(); - return ZeroExtend32(lsb) | (ZeroExtend32(msb) << 16); + // test for wrap-around + if ((m_transfer_address & ~RAM_MASK) != ((m_transfer_address + (word_count * sizeof(u32))) & ~RAM_MASK)) + { + // this could still be optimized to copy in two parts - end/start, but is unlikely. 
+ for (u32 i = 0; i < word_count; i++) + { + const u16 lsb = RAMTransferRead(); + const u16 msb = RAMTransferRead(); + words[i] = ZeroExtend32(lsb) | (ZeroExtend32(msb) << 16); + } + } + else + { + std::memcpy(words, &m_ram[m_transfer_address], sizeof(u32) * word_count); + m_transfer_address = (m_transfer_address + (sizeof(u32) * word_count)) & RAM_MASK; + } } -void SPU::DMAWrite(u32 value) +void SPU::DMAWrite(const u32* words, u32 word_count) { - // two 16-bit writes to prevent out-of-bounds - RAMTransferWrite(Truncate16(value)); - RAMTransferWrite(Truncate16(value >> 16)); + // test for wrap-around + if ((m_transfer_address & ~RAM_MASK) != ((m_transfer_address + (word_count * sizeof(u32))) & ~RAM_MASK)) + { + // this could still be optimized to copy in two parts - end/start, but is unlikely. + for (u32 i = 0; i < word_count; i++) + { + const u32 value = words[i]; + RAMTransferWrite(Truncate16(value)); + RAMTransferWrite(Truncate16(value >> 16)); + } + } + else + { + std::memcpy(&m_ram[m_transfer_address], words, sizeof(u32) * word_count); + m_transfer_address = (m_transfer_address + (sizeof(u32) * word_count)) & RAM_MASK; + } } void SPU::UpdateDMARequest() diff --git a/src/core/spu.h b/src/core/spu.h index 32e62134e..65c6c735e 100644 --- a/src/core/spu.h +++ b/src/core/spu.h @@ -25,8 +25,8 @@ public: u16 ReadRegister(u32 offset); void WriteRegister(u32 offset, u16 value); - u32 DMARead(); - void DMAWrite(u32 value); + void DMARead(u32* words, u32 word_count); + void DMAWrite(const u32* words, u32 word_count); void Execute(TickCount ticks);