EE Rec/IOP Rec: Rewrite large portions

- Add fastmem
- Add delay slot swapping
 - Add COP2 sync elision
 - Add block analysis and use analysis
 - Add GPR register caching and renaming
This commit is contained in:
Connor McLaughlin 2022-10-29 13:39:19 +10:00 committed by refractionpcsx2
parent 56501e0811
commit 1ccddb92d4
52 changed files with 9111 additions and 5575 deletions

View File

@ -145,6 +145,41 @@ namespace HostSys
extern void UnmapSharedMemory(void* baseaddr, size_t size);
}
// Manages a contiguous reservation of virtual address space into which views
// of a shared-memory file can be mapped and unmapped at page-aligned offsets
// (used as the backing for the recompiler fastmem area, see s_fastmem_area).
class SharedMemoryMappingArea
{
public:
// Reserves 'size' bytes of address space; returns nullptr on failure.
static std::unique_ptr<SharedMemoryMappingArea> Create(size_t size);
~SharedMemoryMappingArea();
// Size of the reserved region in bytes.
__fi size_t GetSize() const { return m_size; }
// Size of the reserved region in host pages.
__fi size_t GetNumPages() const { return m_num_pages; }
// Start of the reserved region.
__fi u8* BasePointer() const { return m_base_ptr; }
// Pointer 'offset' bytes into the reserved region.
__fi u8* OffsetPointer(size_t offset) const { return m_base_ptr + offset; }
// Pointer to the start of host page 'page' within the region.
__fi u8* PagePointer(size_t page) const { return m_base_ptr + __pagesize * page; }
// Maps 'map_size' bytes of 'file_handle' starting at 'file_offset' into the
// region at 'map_base' with the given protection. 'map_base' must lie inside
// the reserved region. Returns the mapped pointer, or nullptr on failure.
u8* Map(void* file_handle, size_t file_offset, void* map_base, size_t map_size, const PageProtectionMode& mode);
// Releases a mapping previously established with Map(); the address range
// remains reserved. Returns false on failure.
bool Unmap(void* map_base, size_t map_size);
private:
SharedMemoryMappingArea(u8* base_ptr, size_t size, size_t num_pages);
u8* m_base_ptr; // base of the reserved address range
size_t m_size; // reservation size in bytes
size_t m_num_pages; // reservation size in host pages
size_t m_num_mappings = 0; // number of currently-active Map() views
#ifdef _WIN32
// Windows tracks the unmapped "placeholder" sub-ranges (begin offset -> end
// offset) so they can be split/coalesced via VirtualFree/MapViewOfFile3.
using PlaceholderMap = std::map<size_t, size_t>;
// NOTE(review): parameter is a byte offset in the definition despite the
// name 'page' -- confirm and rename for consistency.
PlaceholderMap::iterator FindPlaceholder(size_t page);
PlaceholderMap m_placeholder_ranges;
#endif
};
// Safe version of Munmap -- NULLs the pointer variable immediately after free'ing it.
#define SafeSysMunmap(ptr, size) \
((void)(HostSys::Munmap(ptr, size), (ptr) = 0))

View File

@ -23,6 +23,7 @@
#include "fmt/core.h"
#include "common/Align.h"
#include "common/PageFaultSource.h"
#include "common/Assertions.h"
#include "common/Console.h"
@ -34,12 +35,26 @@
#define MAP_ANONYMOUS MAP_ANON
#endif
#include <cerrno>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#ifndef __APPLE__
#include <ucontext.h>
#endif
extern void SignalExit(int sig);
static const uptr m_pagemask = getpagesize() - 1;
static struct sigaction s_old_sigsegv_action;
#if defined(__APPLE__)
static struct sigaction s_old_sigbus_action;
#endif
// Linux implementation of SIGSEGV handler. Bind it using sigaction().
static void SysPageFaultSignalFilter(int signal, siginfo_t* siginfo, void*)
static void SysPageFaultSignalFilter(int signal, siginfo_t* siginfo, void* ctx)
{
// [TODO] : Add a thread ID filter to the Linux Signal handler here.
// Rationale: On windows, the __try/__except model allows per-thread specific behavior
@ -57,13 +72,20 @@ static void SysPageFaultSignalFilter(int signal, siginfo_t* siginfo, void*)
// Note: Use of stdio functions isn't safe here. Avoid console logs,
// assertions, file logs, or just about anything else useful.
#if defined(__APPLE__) && defined(__x86_64__)
void* const exception_pc = reinterpret_cast<void*>(static_cast<ucontext_t*>(ctx)->uc_mcontext->__ss.__rip);
#elif defined(__x86_64__)
void* const exception_pc = reinterpret_cast<void*>(static_cast<ucontext_t*>(ctx)->uc_mcontext.gregs[REG_RIP]);
#else
void* const exception_pc = nullptr;
#endif
// Note: This signal can be accessed by the EE or MTVU thread
// Source_PageFault is a global variable with its own state information
// so for now we lock this exception code unless someone can fix this better...
std::unique_lock lock(PageFault_Mutex);
Source_PageFault->Dispatch(PageFaultInfo((uptr)siginfo->si_addr & ~m_pagemask));
Source_PageFault->Dispatch(PageFaultInfo((uptr)exception_pc, (uptr)siginfo->si_addr & ~m_pagemask));
// resumes execution right where we left off (re-executes instruction that
// caused the SIGSEGV).
@ -89,11 +111,11 @@ void _platform_InstallSignalHandler()
sigemptyset(&sa.sa_mask);
sa.sa_flags = SA_SIGINFO;
sa.sa_sigaction = SysPageFaultSignalFilter;
#ifdef __APPLE__
#if defined(__APPLE__)
// MacOS uses SIGBUS for memory permission violations
sigaction(SIGBUS, &sa, NULL);
sigaction(SIGBUS, &sa, &s_old_sigbus_action);
#else
sigaction(SIGSEGV, &sa, NULL);
sigaction(SIGSEGV, &sa, &s_old_sigsegv_action);
#endif
}
@ -210,4 +232,56 @@ void HostSys::UnmapSharedMemory(void* baseaddr, size_t size)
pxFailRel("Failed to unmap shared memory");
}
// Takes ownership of an address range previously reserved by Create().
SharedMemoryMappingArea::SharedMemoryMappingArea(u8* base_ptr, size_t size, size_t num_pages)
: m_base_ptr(base_ptr)
, m_size(size)
, m_num_pages(num_pages)
{
}
// Releases the entire reserved region. All Map() views must have been
// unmapped beforehand.
SharedMemoryMappingArea::~SharedMemoryMappingArea()
{
pxAssertRel(m_num_mappings == 0, "No mappings left");
// A single munmap() over the whole range drops the reservation.
if (munmap(m_base_ptr, m_size) != 0)
pxFailRel("Failed to release shared memory area");
}
// Reserves a page-aligned, inaccessible region of virtual address space of
// the requested size, wrapping it in a SharedMemoryMappingArea so views can
// later be mapped into it piecewise. Returns nullptr if reservation fails.
std::unique_ptr<SharedMemoryMappingArea> SharedMemoryMappingArea::Create(size_t size)
{
	pxAssertRel(Common::IsAlignedPow2(size, __pagesize), "Size is page aligned");

	// Reserve address space only (PROT_NONE); actual views arrive via Map().
	void* const base = mmap(nullptr, size, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (base == MAP_FAILED)
		return nullptr;

	const size_t page_count = size / __pagesize;
	return std::unique_ptr<SharedMemoryMappingArea>(
		new SharedMemoryMappingArea(static_cast<u8*>(base), size, page_count));
}
// Maps a view of the shared memory file over [map_base, map_base + map_size)
// inside the reserved area, with protection converted from 'mode'.
// Returns the mapped pointer, or nullptr if mmap() fails.
u8* SharedMemoryMappingArea::Map(void* file_handle, size_t file_offset, void* map_base, size_t map_size, const PageProtectionMode& mode)
{
	u8* const base = static_cast<u8*>(map_base);
	pxAssert(base >= m_base_ptr && base < (m_base_ptr + m_size));

	// The "handle" is a POSIX file descriptor smuggled through a void*.
	const int fd = static_cast<int>(reinterpret_cast<intptr_t>(file_handle));
	const uint prot_flags = LinuxProt(mode);

	// MAP_FIXED is safe: the target range lies within our own reservation.
	void* const result = mmap(map_base, map_size, prot_flags, MAP_SHARED | MAP_FIXED,
		fd, static_cast<off_t>(file_offset));
	if (result == MAP_FAILED)
		return nullptr;

	m_num_mappings++;
	return static_cast<u8*>(result);
}
bool SharedMemoryMappingArea::Unmap(void* map_base, size_t map_size)
{
pxAssert(static_cast<u8*>(map_base) >= m_base_ptr && static_cast<u8*>(map_base) < (m_base_ptr + m_size));
if (mmap(map_base, map_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == MAP_FAILED)
return false;
m_num_mappings--;
return true;
}
#endif

View File

@ -34,10 +34,12 @@
struct PageFaultInfo
{
uptr pc;
uptr addr;
PageFaultInfo(uptr address)
PageFaultInfo(uptr pc_, uptr address)
{
pc = pc_;
addr = address;
}
};

View File

@ -24,14 +24,8 @@
#define NOMINMAX
#endif
// Qt build requires Windows 10+, WX Windows 8.1+.
#ifndef _WIN32_WINNT
#ifdef PCSX2_CORE
// We require Windows 10+.
#define _WIN32_WINNT 0x0A00 // Windows 10
#else
#define _WIN32_WINNT 0x0603 // Windows 8.1
#endif
#endif
#include <windows.h>
#include <VersionHelpers.h>

View File

@ -24,16 +24,24 @@
#include "common/AlignedMalloc.h"
#include "fmt/core.h"
#include "fmt/format.h"
// Structured-exception filter for access violations. Extracts the faulting
// instruction pointer (needed by fastmem backpatching) and forwards the fault
// to the global Source_PageFault dispatcher.
//
// Fix: the rendered diff contained both the pre-change and post-change
// Dispatch() calls back to back; only the updated two-argument call is kept.
static long DoSysPageFaultExceptionFilter(EXCEPTION_POINTERS* eps)
{
	if (eps->ExceptionRecord->ExceptionCode != EXCEPTION_ACCESS_VIOLATION)
		return EXCEPTION_CONTINUE_SEARCH;

#if defined(_M_AMD64)
	void* const exception_pc = reinterpret_cast<void*>(eps->ContextRecord->Rip);
#else
	void* const exception_pc = nullptr;
#endif

	// Note: This exception can be accessed by the EE or MTVU thread
	// Source_PageFault is a global variable with its own state information
	// so for now we lock this exception code unless someone can fix this better...
	std::unique_lock lock(PageFault_Mutex);
	// ExceptionInformation[1] carries the virtual address that was accessed.
	Source_PageFault->Dispatch(PageFaultInfo((uptr)exception_pc, (uptr)eps->ExceptionRecord->ExceptionInformation[1]));
	return Source_PageFault->WasHandled() ? EXCEPTION_CONTINUE_EXECUTION : EXCEPTION_CONTINUE_SEARCH;
}
@ -148,4 +156,185 @@ void HostSys::UnmapSharedMemory(void* baseaddr, size_t size)
pxFail("Failed to unmap shared memory");
}
// Takes ownership of a placeholder region previously reserved by Create().
SharedMemoryMappingArea::SharedMemoryMappingArea(u8* base_ptr, size_t size, size_t num_pages)
: m_base_ptr(base_ptr)
, m_size(size)
, m_num_pages(num_pages)
{
// Initially the entire region is one unmapped placeholder range [0, size).
m_placeholder_ranges.emplace(0, size);
}
// Releases the entire reserved region back to the OS. All Map() views must
// have been returned to placeholders (via Unmap()) beforehand.
SharedMemoryMappingArea::~SharedMemoryMappingArea()
{
pxAssertRel(m_num_mappings == 0, "No mappings left");
// hopefully this will be okay, and we don't need to coalesce all the placeholders...
if (!VirtualFreeEx(GetCurrentProcess(), m_base_ptr, 0, MEM_RELEASE))
pxFailRel("Failed to release shared memory area");
}
// Returns an iterator to the placeholder range containing byte 'offset', or
// end() if that byte is currently mapped (not part of any placeholder).
// Ranges are stored as [start, end) pairs keyed by their start offset.
SharedMemoryMappingArea::PlaceholderMap::iterator SharedMemoryMappingArea::FindPlaceholder(size_t offset)
{
if (m_placeholder_ranges.empty())
return m_placeholder_ranges.end();
// this will give us an iterator equal or after page
auto it = m_placeholder_ranges.lower_bound(offset);
if (it == m_placeholder_ranges.end())
{
// check the last page
// (++rbegin()).base() yields a forward iterator to the map's last element
it = (++m_placeholder_ranges.rbegin()).base();
}
// it's the one we found?
if (offset >= it->first && offset < it->second)
return it;
// otherwise try the one before
if (it == m_placeholder_ranges.begin())
return m_placeholder_ranges.end();
--it;
if (offset >= it->first && offset < it->second)
return it;
else
return m_placeholder_ranges.end();
}
// Reserves 'size' bytes of address space as a single inaccessible placeholder
// region, ready for MapViewOfFile3() to replace pieces of it later.
// Returns nullptr if the reservation fails.
std::unique_ptr<SharedMemoryMappingArea> SharedMemoryMappingArea::Create(size_t size)
{
	pxAssertRel(Common::IsAlignedPow2(size, __pagesize), "Size is page aligned");

	void* const base = VirtualAlloc2(GetCurrentProcess(), nullptr, size,
		MEM_RESERVE | MEM_RESERVE_PLACEHOLDER, PAGE_NOACCESS, nullptr, 0);
	if (!base)
		return nullptr;

	const size_t page_count = size / __pagesize;
	return std::unique_ptr<SharedMemoryMappingArea>(
		new SharedMemoryMappingArea(static_cast<u8*>(base), size, page_count));
}
// Maps 'map_size' bytes of the shared memory file at 'map_base' within the
// reserved area. The placeholder containing the target range is split on the
// left and/or right as needed so that exactly the target range is replaced by
// the file view, which is then protected according to 'mode'.
// Returns the mapped pointer, or nullptr on failure.
u8* SharedMemoryMappingArea::Map(void* file_handle, size_t file_offset, void* map_base, size_t map_size, const PageProtectionMode& mode)
{
pxAssert(static_cast<u8*>(map_base) >= m_base_ptr && static_cast<u8*>(map_base) < (m_base_ptr + m_size));
const size_t map_offset = static_cast<u8*>(map_base) - m_base_ptr;
pxAssert(Common::IsAlignedPow2(map_offset, __pagesize));
pxAssert(Common::IsAlignedPow2(map_size, __pagesize));
// should be a placeholder. unless there's some other mapping we didn't free.
PlaceholderMap::iterator phit = FindPlaceholder(map_offset);
pxAssertMsg(phit != m_placeholder_ranges.end(), "Page we're mapping is a placeholder");
pxAssertMsg(map_offset >= phit->first && map_offset < phit->second, "Page is in returned placeholder range");
pxAssertMsg((map_offset + map_size) <= phit->second, "Page range is in returned placeholder range");
// do we need to split to the left? (i.e. is there a placeholder before this range)
const size_t old_ph_end = phit->second;
if (map_offset != phit->first)
{
// shrink the existing bookkeeping range to end where our mapping begins
phit->second = map_offset;
// split it (i.e. left..start and start..end are now separated)
if (!VirtualFreeEx(GetCurrentProcess(), OffsetPointer(phit->first),
(map_offset - phit->first), MEM_RELEASE | MEM_PRESERVE_PLACEHOLDER))
{
pxFailRel("Failed to left split placeholder for map");
}
}
else
{
// start of the placeholder is getting used, we'll split it right below if there's anything left over
m_placeholder_ranges.erase(phit);
}
// do we need to split to the right? (i.e. is there a placeholder after this range)
if ((map_offset + map_size) != old_ph_end)
{
// split out end..ph_end
m_placeholder_ranges.emplace(map_offset + map_size, old_ph_end);
if (!VirtualFreeEx(GetCurrentProcess(), OffsetPointer(map_offset), map_size,
MEM_RELEASE | MEM_PRESERVE_PLACEHOLDER))
{
pxFailRel("Failed to right split placeholder for map");
}
}
// actually do the mapping, replacing the placeholder on the range
if (!MapViewOfFile3(static_cast<HANDLE>(file_handle), GetCurrentProcess(),
map_base, file_offset, map_size, MEM_REPLACE_PLACEHOLDER, PAGE_READWRITE, nullptr, 0))
{
Console.Error("(SharedMemoryMappingArea) MapViewOfFile3() failed: %u", GetLastError());
return nullptr;
}
// the view is mapped RW first; downgrade to the requested protection if needed
const DWORD prot = ConvertToWinApi(mode);
if (prot != PAGE_READWRITE)
{
DWORD old_prot;
if (!VirtualProtect(map_base, map_size, prot, &old_prot))
pxFail("Failed to protect memory mapping");
}
m_num_mappings++;
return static_cast<u8*>(map_base);
}
// Unmaps a view previously created by Map(), converting the range back into a
// placeholder and coalescing it with any adjacent placeholder ranges so that
// each free run is tracked (and held by the OS) as a single placeholder.
// Returns false if the view could not be unmapped.
//
// Fixes: removed the unused local 'page'; corrected the "coalescae" typo in
// the right-coalesce failure message.
bool SharedMemoryMappingArea::Unmap(void* map_base, size_t map_size)
{
	pxAssert(static_cast<u8*>(map_base) >= m_base_ptr && static_cast<u8*>(map_base) < (m_base_ptr + m_size));

	const size_t map_offset = static_cast<u8*>(map_base) - m_base_ptr;
	pxAssert(Common::IsAlignedPow2(map_offset, __pagesize));
	pxAssert(Common::IsAlignedPow2(map_size, __pagesize));

	// unmap the specified range, leaving a placeholder behind
	if (!UnmapViewOfFile2(GetCurrentProcess(), map_base, MEM_PRESERVE_PLACEHOLDER))
	{
		Console.Error("(SharedMemoryMappingArea) UnmapViewOfFile2() failed: %u", GetLastError());
		return false;
	}

	// can we coalesce to the left?
	PlaceholderMap::iterator left_it = (map_offset > 0) ? FindPlaceholder(map_offset - 1) : m_placeholder_ranges.end();
	if (left_it != m_placeholder_ranges.end())
	{
		// the left placeholder should end at our start
		pxAssert(map_offset == left_it->second);
		left_it->second = map_offset + map_size;

		// combine placeholders before and the range we're unmapping, i.e. to the left
		if (!VirtualFreeEx(GetCurrentProcess(), OffsetPointer(left_it->first),
				left_it->second - left_it->first, MEM_RELEASE | MEM_COALESCE_PLACEHOLDERS))
		{
			pxFail("Failed to coalesce placeholders left for unmap");
		}
	}
	else
	{
		// this is a new placeholder
		left_it = m_placeholder_ranges.emplace(map_offset, map_offset + map_size).first;
	}

	// can we coalesce to the right?
	PlaceholderMap::iterator right_it = ((map_offset + map_size) < m_size) ? FindPlaceholder(map_offset + map_size) : m_placeholder_ranges.end();
	if (right_it != m_placeholder_ranges.end())
	{
		// should start at our end
		pxAssert(right_it->first == (map_offset + map_size));
		left_it->second = right_it->second;
		m_placeholder_ranges.erase(right_it);

		// combine our placeholder and the next, i.e. to the right
		if (!VirtualFreeEx(GetCurrentProcess(), OffsetPointer(left_it->first),
				left_it->second - left_it->first, MEM_RELEASE | MEM_COALESCE_PLACEHOLDERS))
		{
			pxFail("Failed to coalesce placeholders right for unmap");
		}
	}

	m_num_mappings--;
	return true;
}
#endif

View File

@ -35,6 +35,7 @@ AdvancedSystemSettingsWidget::AdvancedSystemSettingsWidget(SettingsDialog* dialo
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.eeCache, "EmuCore/CPU/Recompiler", "EnableEECache", false);
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.eeINTCSpinDetection, "EmuCore/Speedhacks", "IntcStat", true);
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.eeWaitLoopDetection, "EmuCore/Speedhacks", "WaitLoop", true);
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.eeFastmem, "EmuCore/CPU/Recompiler", "EnableFastmem", true);
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.vu0Recompiler, "EmuCore/CPU/Recompiler", "EnableVU0", true);
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.vu1Recompiler, "EmuCore/CPU/Recompiler", "EnableVU1", true);
@ -60,6 +61,9 @@ AdvancedSystemSettingsWidget::AdvancedSystemSettingsWidget(SettingsDialog* dialo
dialog->registerWidgetHelp(m_ui.eeINTCSpinDetection, tr("INTC Spin Detection"), tr("Checked"),
tr("Huge speedup for some games, with almost no compatibility side effects."));
dialog->registerWidgetHelp(m_ui.eeFastmem, tr("Enable Fast Memory Access"), tr("Checked"),
tr("Uses backpatching to avoid register flushing on every memory access."));
dialog->registerWidgetHelp(m_ui.vu0Recompiler, tr("Enable VU0 Recompiler"), tr("Checked"),
tr("Enables VU0 Recompiler."));

View File

@ -32,13 +32,6 @@
<string>EmotionEngine (MIPS-IV)</string>
</property>
<layout class="QGridLayout" name="gridLayout_4">
<item row="0" column="0">
<widget class="QCheckBox" name="eeRecompiler">
<property name="text">
<string>Enable Recompiler</string>
</property>
</widget>
</item>
<item row="2" column="0">
<widget class="QCheckBox" name="eeWaitLoopDetection">
<property name="text">
@ -46,6 +39,20 @@
</property>
</widget>
</item>
<item row="2" column="1">
<widget class="QCheckBox" name="eeINTCSpinDetection">
<property name="text">
<string>INTC Spin Detection</string>
</property>
</widget>
</item>
<item row="0" column="0">
<widget class="QCheckBox" name="eeRecompiler">
<property name="text">
<string>Enable Recompiler</string>
</property>
</widget>
</item>
<item row="0" column="1">
<widget class="QCheckBox" name="eeCache">
<property name="text">
@ -53,10 +60,10 @@
</property>
</widget>
</item>
<item row="2" column="1">
<widget class="QCheckBox" name="eeINTCSpinDetection">
<item row="3" column="0">
<widget class="QCheckBox" name="eeFastmem">
<property name="text">
<string>INTC Spin Detection</string>
<string>Enable Fast Memory Access</string>
</property>
</widget>
</item>

View File

@ -45,6 +45,7 @@ GameFixSettingsWidget::GameFixSettingsWidget(SettingsDialog* dialog, QWidget* pa
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.VIF1StallHack, "EmuCore/Gamefixes", "VIF1StallHack", false);
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.VuAddSubHack, "EmuCore/Gamefixes", "VuAddSubHack", false);
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.IbitHack, "EmuCore/Gamefixes", "IbitHack", false);
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.FullVU0SyncHack, "EmuCore/Gamefixes", "FullVU0SyncHack", false);
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.VUSyncHack, "EmuCore/Gamefixes", "VUSyncHack", false);
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.VUOverflowHack, "EmuCore/Gamefixes", "VUOverflowHack", false);
SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.XgKickHack, "EmuCore/Gamefixes", "XgKickHack", false);

View File

@ -113,6 +113,13 @@
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="FullVU0SyncHack">
<property name="text">
<string>Full VU0 Synchronization (Correct But Slower)</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="IbitHack">
<property name="text">

View File

@ -1698,12 +1698,9 @@ if(WIN32)
strmiids.lib
opengl32.lib
comsuppw.lib
OneCore.lib
)
if(PCSX2_CORE)
target_link_libraries(PCSX2_FLAGS INTERFACE
OneCore.lib
)
else()
if(NOT PCSX2_CORE)
target_link_libraries(PCSX2_FLAGS INTERFACE
pthreads4w
)

View File

@ -46,6 +46,7 @@ enum GamefixId
Fix_VUOverflow,
Fix_XGKick,
Fix_BlitInternalFPS,
Fix_FullVU0Sync,
GamefixId_COUNT
};
@ -382,6 +383,8 @@ struct Pcsx2Config
bool
EnableEECache : 1;
bool
EnableFastmem : 1;
BITFIELD_END
RecompilerOptions();
@ -845,7 +848,8 @@ struct Pcsx2Config
VUSyncHack : 1, // Makes microVU run behind the EE to avoid VU register reading/writing sync issues. Useful for M-Bit games
VUOverflowHack : 1, // Tries to simulate overflow flag checks (not really possible on x86 without soft floats)
XgKickHack : 1, // Erementar Gerad, adds more delay to VU XGkick instructions. Corrects the color of some graphics, but breaks Tri-ace games and others.
BlitInternalFPSHack : 1; // Disables privileged register write-based FPS detection.
BlitInternalFPSHack : 1, // Disables privileged register write-based FPS detection.
FullVU0SyncHack : 1; // Forces tight VU0 sync on every COP2 instruction.
BITFIELD_END
GamefixOptions();
@ -1146,6 +1150,7 @@ namespace EmuFolders
#define CHECK_EEREC (EmuConfig.Cpu.Recompiler.EnableEE)
#define CHECK_CACHE (EmuConfig.Cpu.Recompiler.EnableEECache)
#define CHECK_IOPREC (EmuConfig.Cpu.Recompiler.EnableIOP)
#define CHECK_FASTMEM (EmuConfig.Cpu.Recompiler.EnableEE && EmuConfig.Cpu.Recompiler.EnableFastmem)
//------------ SPECIAL GAME FIXES!!! ---------------
#define CHECK_VUADDSUBHACK (EmuConfig.Gamefixes.VuAddSubHack) // Special Fix for Tri-ace games, they use an encryption algorithm that requires VU addi opcode to be bit-accurate.
@ -1161,6 +1166,7 @@ namespace EmuFolders
#define CHECK_VIF1STALLHACK (EmuConfig.Gamefixes.VIF1StallHack) // Like above, processes FIFO data before the stall is allowed (to make sure data goes over).
#define CHECK_GIFFIFOHACK (EmuConfig.Gamefixes.GIFFIFOHack) // Enabled the GIF FIFO (more correct but slower)
#define CHECK_VUOVERFLOWHACK (EmuConfig.Gamefixes.VUOverflowHack) // Special Fix for Superman Returns, they check for overflows on PS2 floats which we can't do without soft floats.
#define CHECK_FULLVU0SYNCHACK (EmuConfig.Gamefixes.FullVU0SyncHack)
//------------ Advanced Options!!! ---------------
#define CHECK_VU_OVERFLOW (EmuConfig.Cpu.Recompiler.vuOverflow)

View File

@ -298,8 +298,8 @@ void iDumpBlock( int startpc, u8 * ptr )
// write the instruction info
std::fprintf(eff, "\n\nlive0 - %x, live2 - %x, lastuse - %x\nxmm - %x, used - %x\n",
EEINST_LIVE0, EEINST_LIVE2, EEINST_LASTUSE, EEINST_XMM, EEINST_USED
std::fprintf(eff, "\n\nlive0 - %x, lastuse - %x\nxmm - %x, used - %x\n",
EEINST_LIVE, EEINST_LASTUSE, EEINST_XMM, EEINST_USED
);
memzero(used);

View File

@ -3801,6 +3801,8 @@ void FullscreenUI::DrawAdvancedSettingsPage()
"EmuCore/Speedhacks", "IntcStat", true);
DrawToggleSetting(bsi, "Enable Wait Loop Detection", "Moderate speedup for some games, with no known side effects.",
"EmuCore/Speedhacks", "WaitLoop", true);
DrawToggleSetting(bsi, "Enable Fast Memory Access", "Uses backpatching to avoid register flushing on every memory access.",
"EmuCore/CPU/Recompiler", "EnableFastmem", true);
DrawToggleSetting(bsi, "Enable VU0 Recompiler (Micro Mode)",
"New Vector Unit recompiler with much improved compatibility. Recommended.", "EmuCore/CPU/Recompiler", "EnableVU0", true);
DrawToggleSetting(bsi, "Enable VU1 Recompiler", "New Vector Unit recompiler with much improved compatibility. Recommended.",
@ -3857,6 +3859,8 @@ void FullscreenUI::DrawGameFixesSettingsPage()
"EmuCore/Gamefixes", "VuAddSubHack", false);
DrawToggleSetting(bsi, "VU I bit Hack avoid constant recompilation in some games",
"Scarface The World Is Yours, Crash Tag Team Racing.", "EmuCore/Gamefixes", "IbitHack", false);
DrawToggleSetting(
bsi, "Full VU0 Synchronization", "Forces tight VU0 sync on every COP2 instruction.", "EmuCore/Gamefixes", "FullVU0SyncHack", false);
DrawToggleSetting(bsi, "VU Sync (Run behind)", "To avoid sync problems when reading or writing VU registers.", "EmuCore/Gamefixes",
"VUSyncHack", false);
DrawToggleSetting(

View File

@ -404,6 +404,10 @@ void CommonHost::UpdateLogging(SettingsInterface& si)
DevConWriterEnabled = any_logging_sinks && (IsDevBuild || si.GetBoolValue("Logging", "EnableVerbose", false));
SysConsole.eeConsole.Enabled = any_logging_sinks && si.GetBoolValue("Logging", "EnableEEConsole", false);
SysConsole.iopConsole.Enabled = any_logging_sinks && si.GetBoolValue("Logging", "EnableIOPConsole", false);
SysTrace.IOP.R3000A.Enabled = true;
SysTrace.IOP.COP2.Enabled = true;
SysTrace.IOP.Memory.Enabled = true;
SysTrace.SIF.Enabled = true;
// Input Recording Logs
SysConsole.recordingConsole.Enabled = any_logging_sinks && si.GetBoolValue("Logging", "EnableInputRecordingLogs", true);

View File

@ -963,6 +963,7 @@ void mmap_MarkCountedRamPage( u32 paddr )
m_PageProtectInfo[rampage].Mode = ProtMode_Write;
HostSys::MemProtect( &eeMem->Main[rampage<<__pageshift], __pagesize, PageAccess_ReadOnly() );
vtlb_UpdateFastmemProtection(rampage << __pageshift, __pagesize, PageAccess_ReadOnly());
}
// offset - offset of address relative to psM.
@ -980,6 +981,7 @@ static __fi void mmap_ClearCpuBlock( uint offset )
"Attempted to clear a block that is already under manual protection." );
HostSys::MemProtect( &eeMem->Main[rampage<<__pageshift], __pagesize, PageAccess_ReadWrite() );
vtlb_UpdateFastmemProtection(rampage << __pageshift, __pagesize, PageAccess_ReadWrite());
m_PageProtectInfo[rampage].Mode = ProtMode_Manual;
Cpu->Clear( m_PageProtectInfo[rampage].ReverseRamMap, __pagesize );
}
@ -988,12 +990,37 @@ void mmap_PageFaultHandler::OnPageFaultEvent( const PageFaultInfo& info, bool& h
{
pxAssert( eeMem );
// get bad virtual address
uptr offset = info.addr - (uptr)eeMem->Main;
if( offset >= Ps2MemSize::MainRam ) return;
u32 vaddr;
if (CHECK_FASTMEM && vtlb_GetGuestAddress(info.addr, &vaddr))
{
// this was inside the fastmem area. check if it's a code page
// fprintf(stderr, "Fault on fastmem %p vaddr %08X\n", info.addr, vaddr);
mmap_ClearCpuBlock( offset );
handled = true;
uptr ptr = (uptr)PSM(vaddr);
uptr offset = (ptr - (uptr)eeMem->Main);
if (ptr && m_PageProtectInfo[offset >> __pageshift].Mode == ProtMode_Write)
{
// fprintf(stderr, "Not backpatching code write at %08X\n", vaddr);
mmap_ClearCpuBlock(offset);
handled = true;
}
else
{
// fprintf(stderr, "Trying backpatching vaddr %08X\n", vaddr);
if (vtlb_BackpatchLoadStore(info.pc, info.addr))
handled = true;
}
}
else
{
// get bad virtual address
uptr offset = info.addr - (uptr)eeMem->Main;
if (offset >= Ps2MemSize::MainRam)
return;
mmap_ClearCpuBlock(offset);
handled = true;
}
}
// Clears all block tracking statuses, manual protection flags, and write protection.
@ -1005,4 +1032,5 @@ void mmap_ResetBlockTracking()
//DbgCon.WriteLn( "vtlb/mmap: Block Tracking reset..." );
memzero( m_PageProtectInfo );
if (eeMem) HostSys::MemProtect( eeMem->Main, Ps2MemSize::MainRam, PageAccess_ReadWrite() );
vtlb_UpdateFastmemProtection(0, Ps2MemSize::MainRam, PageAccess_ReadWrite());
}

View File

@ -155,6 +155,7 @@ Pcsx2Config::RecompilerOptions::RecompilerOptions()
EnableIOP = true;
EnableVU0 = true;
EnableVU1 = true;
EnableFastmem = true;
// vu and fpu clamping default to standard overflow.
vuOverflow = true;
@ -211,6 +212,7 @@ void Pcsx2Config::RecompilerOptions::LoadSave(SettingsWrapper& wrap)
SettingsWrapBitBool(EnableEECache);
SettingsWrapBitBool(EnableVU0);
SettingsWrapBitBool(EnableVU1);
SettingsWrapBitBool(EnableFastmem);
SettingsWrapBitBool(vuOverflow);
SettingsWrapBitBool(vuExtraOverflow);
@ -864,7 +866,8 @@ static const char* const tbl_GamefixNames[] =
"VUSync",
"VUOverflow",
"XGKick",
"BlitInternalFPS"
"BlitInternalFPS",
"FullVU0Sync",
};
const char* EnumToString(GamefixId id)
@ -907,6 +910,7 @@ void Pcsx2Config::GamefixOptions::Set(GamefixId id, bool enabled)
case Fix_VUSync: VUSyncHack = enabled; break;
case Fix_VUOverflow: VUOverflowHack = enabled; break;
case Fix_BlitInternalFPS: BlitInternalFPSHack = enabled; break;
case Fix_FullVU0Sync: FullVU0SyncHack = enabled; break;
jNO_DEFAULT;
}
}
@ -934,6 +938,7 @@ bool Pcsx2Config::GamefixOptions::Get(GamefixId id) const
case Fix_VUSync: return VUSyncHack;
case Fix_VUOverflow: return VUOverflowHack;
case Fix_BlitInternalFPS: return BlitInternalFPSHack;
case Fix_FullVU0Sync: return FullVU0SyncHack;
jNO_DEFAULT;
}
return false; // unreachable, but we still need to suppress warnings >_<
@ -961,6 +966,7 @@ void Pcsx2Config::GamefixOptions::LoadSave(SettingsWrapper& wrap)
SettingsWrapBitBool(VUSyncHack);
SettingsWrapBitBool(VUOverflowHack);
SettingsWrapBitBool(BlitInternalFPSHack);
SettingsWrapBitBool(FullVU0SyncHack);
}

View File

@ -46,12 +46,6 @@ namespace Exception
public:
explicit CancelInstruction() { }
};
class FailedToAllocateRegister
{
public:
explicit FailedToAllocateRegister() { }
};
}
// --------------------------------------------------------------------------------------

View File

@ -109,12 +109,18 @@ void RecompiledCodeReserve::Reset()
void RecompiledCodeReserve::AllowModification()
{
// Apple Silicon enforces write protection in hardware.
#if !defined(__APPLE__) || !defined(_M_ARM64)
HostSys::MemProtect(m_baseptr, m_size, PageAccess_Any());
#endif
}
void RecompiledCodeReserve::ForbidModification()
{
// Apple Silicon enforces write protection in hardware.
#if !defined(__APPLE__) || !defined(_M_ARM64)
HostSys::MemProtect(m_baseptr, m_size, PageProtectionMode().Read().Execute());
#endif
}
// Sets the abbreviated name used by the profiler. Name should be under 10 characters long.

View File

@ -113,6 +113,10 @@ public:
VirtualMemoryBumpAllocator& BumpAllocator() { return m_bumpAllocator; }
const eeMemoryReserve& EEMemory() const { return m_ee; }
const iopMemoryReserve& IOPMemory() const { return m_iop; }
const vuMemoryReserve& VUMemory() const { return m_vu; }
bool Allocate();
void Reset();
void Release();

View File

@ -1475,6 +1475,7 @@ void VMManager::Execute()
// We need to switch the cpus out, and reset the new ones if so.
s_cpu_provider_pack->ApplyConfig();
SysClearExecutionCache();
vtlb_ResetFastmem();
}
// Execute until we're asked to stop.
@ -1553,6 +1554,9 @@ void VMManager::CheckForCPUConfigChanges(const Pcsx2Config& old_config)
SysClearExecutionCache();
memBindConditionalHandlers();
if (EmuConfig.Cpu.Recompiler.EnableFastmem != old_config.Cpu.Recompiler.EnableFastmem)
vtlb_ResetFastmem();
// did we toggle recompilers?
if (EmuConfig.Cpu.CpusChanged(old_config.Cpu))
{

View File

@ -71,7 +71,7 @@
<Link>
<LargeAddressAware>Yes</LargeAddressAware>
<AdditionalDependencies>comctl32.lib;ws2_32.lib;shlwapi.lib;winmm.lib;rpcrt4.lib;iphlpapi.lib;dsound.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>dxguid.lib;dinput8.lib;hid.lib;PowrProf.lib;d3dcompiler.lib;d3d11.lib;dxgi.lib;strmiids.lib;opengl32.lib;comsuppw.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>dxguid.lib;dinput8.lib;hid.lib;PowrProf.lib;d3dcompiler.lib;d3d11.lib;dxgi.lib;strmiids.lib;opengl32.lib;comsuppw.lib;OneCore.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemGroup>

View File

@ -42,6 +42,13 @@
#include "fmt/core.h"
#include <map>
#include <unordered_set>
#include <unordered_map>
#define FASTMEM_LOG(...)
//#define FASTMEM_LOG(...) Console.WriteLn(__VA_ARGS__)
using namespace R5900;
using namespace vtlb_private;
@ -60,6 +67,36 @@ static vtlbHandler UnmappedVirtHandler1;
static vtlbHandler UnmappedPhyHandler0;
static vtlbHandler UnmappedPhyHandler1;
// A contiguous run of fastmem pages backed by the same main-memory region.
struct FastmemVirtualMapping
{
u32 offset; // offset into main memory (see s_fastmem_virtual_mapping)
u32 size; // run length in bytes
};
// Recorded per fastmem load/store so the fault handler can rewrite the
// faulting host instruction into a slow-path call.
// NOTE(review): field semantics inferred from names -- confirm against the
// recompiler's backpatch emitter before relying on them.
struct LoadstoreBackpatchInfo
{
u32 guest_pc;
u32 gpr_bitmask;
u32 fpr_bitmask;
u8 code_size;
u8 address_register;
u8 data_register;
u8 size_in_bits;
bool is_signed;
bool is_load;
bool is_fpr;
};
// The fastmem area covers a full 4GB of guest address space.
static constexpr size_t FASTMEM_AREA_SIZE = 0x100000000ULL;
static constexpr u32 FASTMEM_PAGE_COUNT = FASTMEM_AREA_SIZE / VTLB_PAGE_SIZE;
// Sentinel stored in s_fastmem_virtual_mapping for "page not mapped".
static constexpr u32 NO_FASTMEM_MAPPING = 0xFFFFFFFFu;
static std::unique_ptr<SharedMemoryMappingArea> s_fastmem_area;
static std::vector<u32> s_fastmem_virtual_mapping; // maps vaddr -> mainmem offset
static std::unordered_multimap<u32, u32> s_fastmem_physical_mapping; // maps mainmem offset -> vaddr
static std::unordered_map<uptr, LoadstoreBackpatchInfo> s_fastmem_backpatch_info;
static std::unordered_set<u32> s_fastmem_faulting_pcs;
vtlb_private::VTLBPhysical vtlb_private::VTLBPhysical::fromPointer(sptr ptr) {
pxAssertMsg(ptr >= 0, "Address too high");
return VTLBPhysical(ptr);
@ -659,6 +696,341 @@ __fi u32 vtlb_V2P(u32 vaddr)
return paddr;
}
// True when the host OS page size differs from the VTLB page size, in which
// case fastmem bookkeeping must align/coalesce VTLB pages to host pages.
static constexpr bool vtlb_MismatchedHostPageSize()
{
return (__pagesize != VTLB_PAGE_SIZE);
}
// Returns true if 'paddr' falls on a host page boundary. Trivially true when
// the host and VTLB page sizes match.
static bool vtlb_IsHostAligned(u32 paddr)
{
	if constexpr (vtlb_MismatchedHostPageSize())
		return (paddr & __pagemask) == 0;
	else
		return true;
}
// Converts a VTLB page number to the host page number that contains it.
// Identity when host and VTLB page sizes match.
static u32 vtlb_HostPage(u32 page)
{
	if constexpr (vtlb_MismatchedHostPageSize())
		return page >> (__pageshift - VTLB_PAGE_BITS);
	else
		return page;
}
// Rounds 'offset' down to the start of the host page containing it.
// Identity when host and VTLB page sizes match.
static u32 vtlb_HostAlignOffset(u32 offset)
{
	if constexpr (vtlb_MismatchedHostPageSize())
		return offset & ~__pagemask;
	else
		return offset;
}
// Returns true if every VTLB page in the host page containing 'page' maps to
// consecutive main-memory offsets starting on a host-page boundary -- i.e.
// the whole host page can be represented by a single host mapping.
// Trivially true when host and VTLB page sizes match.
static bool vtlb_IsHostCoalesced(u32 page)
{
if constexpr (__pagesize == VTLB_PAGE_SIZE)
{
return true;
}
else
{
// number of VTLB pages per host page
static constexpr u32 shift = __pageshift - VTLB_PAGE_BITS;
static constexpr u32 count = (1u << shift);
static constexpr u32 mask = count - 1;
// first VTLB page of the host page containing 'page'
const u32 base = page & ~mask;
const u32 base_offset = s_fastmem_virtual_mapping[base];
// the run must start on a host-page boundary in main memory
if ((base_offset & __pagemask) != 0)
return false;
// and every subsequent VTLB page must continue the run contiguously
for (u32 i = 0, expected_offset = base_offset; i < count; i++, expected_offset += VTLB_PAGE_SIZE)
{
if (s_fastmem_virtual_mapping[base + i] != expected_offset)
return false;
}
return true;
}
}
// Given a host pointer into one of the emulated memory areas, computes the
// corresponding offset and remaining size within the shared main-memory
// region, plus the protection it should be mapped with. Returns false if the
// pointer does not belong to a shareable area (callers fall back to slowmem).
static bool vtlb_GetMainMemoryOffsetFromPtr(uptr ptr, u32* mainmem_offset, u32* mainmem_size, PageProtectionMode* prot)
{
const uptr page_end = ptr + VTLB_PAGE_SIZE;
SysMainMemory& vmmem = GetVmMemory();
// EE memory and ROMs.
if (ptr >= (uptr)eeMem->Main && page_end <= (uptr)eeMem->ZeroRead)
{
const u32 eemem_offset = static_cast<u32>(ptr - (uptr)eeMem->Main);
// RAM pages currently in ProtMode_Write map read-only; anything past
// MainRam (ROMs etc.) is treated as writeable here.
const bool writeable = ((eemem_offset < Ps2MemSize::MainRam) ? (mmap_GetRamPageInfo(eemem_offset) != ProtMode_Write) : true);
*mainmem_offset = (eemem_offset + HostMemoryMap::EEmemOffset);
*mainmem_size = (offsetof(EEVM_MemoryAllocMess, ZeroRead) - eemem_offset);
*prot = PageProtectionMode().Read().Write(writeable);
return true;
}
// IOP memory.
if (ptr >= (uptr)iopMem->Main && page_end <= (uptr)iopMem->P)
{
const u32 iopmem_offset = static_cast<u32>(ptr - (uptr)iopMem->Main);
*mainmem_offset = iopmem_offset + HostMemoryMap::IOPmemOffset;
*mainmem_size = (offsetof(IopVM_MemoryAllocMess, P) - iopmem_offset);
*prot = PageProtectionMode().Read().Write();
return true;
}
// VU memory - this includes both data and code for VU0/VU1.
// Practically speaking, this is only data, because the code goes through a handler.
if (ptr >= (uptr)vmmem.VUMemory().GetPtr() && page_end <= (uptr)vmmem.VUMemory().GetPtrEnd())
{
const u32 vumem_offset = static_cast<u32>(ptr - (uptr)vmmem.VUMemory().GetPtr());
*mainmem_offset = vumem_offset + HostMemoryMap::VUmemOffset;
*mainmem_size = vmmem.VUMemory().GetSize() - vumem_offset;
*prot = PageProtectionMode().Read().Write();
return true;
}
// We end up with some unknown mappings here; currently the IOP memory, instead of being physically mapped
// as 2MB, ends up being mapped as 8MB. But this shouldn't be virtual mapped anyway, so fallback to slowmem
// in such cases.
return false;
}
// Resolves a guest physical address to an offset within the shared main
// memory file. Fails for out-of-range addresses and handler-backed pages.
static bool vtlb_GetMainMemoryOffset(u32 paddr, u32* mainmem_offset, u32* mainmem_size, PageProtectionMode* prot)
{
	// Out-of-range physical addresses cannot be part of main memory.
	if (paddr >= VTLB_PMAP_SZ)
		return false;

	// Handlers aren't in our shared memory, obviously.
	const VTLBPhysical& phys = vtlbdata.pmap[paddr >> VTLB_PAGE_BITS];
	if (phys.isHandler())
		return false;

	return vtlb_GetMainMemoryOffsetFromPtr(phys.raw(), mainmem_offset, mainmem_size, prot);
}
// Establishes a fastmem mapping from one guest virtual page to the given
// offset in the shared main memory file, replacing any existing mapping for
// that page. Maintains both the forward (virtual->offset) table and the
// reverse (offset->virtual) multimap used by vtlb_UpdateFastmemProtection.
//   vaddr          - guest virtual address (VTLB-page aligned)
//   mainmem_offset - offset into the main memory file to back it with
//   mode           - protection for the new host mapping
static void vtlb_CreateFastmemMapping(u32 vaddr, u32 mainmem_offset, const PageProtectionMode& mode)
{
	FASTMEM_LOG("Create fastmem mapping @ vaddr %08X mainmem %08X", vaddr, mainmem_offset);

	const u32 page = vaddr / VTLB_PAGE_SIZE;

	if (s_fastmem_virtual_mapping[page] == mainmem_offset)
	{
		// current mapping is fine
		return;
	}

	if (s_fastmem_virtual_mapping[page] != NO_FASTMEM_MAPPING)
	{
		// current mapping needs to be removed
		// BUGFIX: remember the offset being replaced *before* clearing it;
		// the reverse map must be scrubbed by the old key, not the new one,
		// otherwise a stale old-offset -> vaddr entry is left behind.
		const u32 old_mainmem_offset = s_fastmem_virtual_mapping[page];
		const bool was_coalesced = vtlb_IsHostCoalesced(page);

		s_fastmem_virtual_mapping[page] = NO_FASTMEM_MAPPING;
		if (was_coalesced && !s_fastmem_area->Unmap(s_fastmem_area->PagePointer(vtlb_HostPage(page)), __pagesize))
			Console.Error("Failed to unmap vaddr %08X", vaddr);

		// remove reverse mapping for the offset we just unmapped
		auto range = s_fastmem_physical_mapping.equal_range(old_mainmem_offset);
		for (auto it = range.first; it != range.second; )
		{
			auto this_it = it++;
			if (this_it->second == vaddr)
				s_fastmem_physical_mapping.erase(this_it);
		}
	}

	s_fastmem_virtual_mapping[page] = mainmem_offset;
	if (vtlb_IsHostCoalesced(page))
	{
		const u32 host_page = vtlb_HostPage(page);
		const u32 host_offset = vtlb_HostAlignOffset(mainmem_offset);
		if (!s_fastmem_area->Map(GetVmMemory().MainMemory()->GetFileHandle(), host_offset,
				s_fastmem_area->PagePointer(host_page), __pagesize, mode))
		{
			Console.Error("Failed to map vaddr %08X to mainmem offset %08X", vtlb_HostAlignOffset(vaddr), host_offset);
			s_fastmem_virtual_mapping[page] = NO_FASTMEM_MAPPING;
			return;
		}
	}

	s_fastmem_physical_mapping.emplace(mainmem_offset, vaddr);
}
// Drops the fastmem mapping (if any) backing a single guest page: unmaps
// the host page when it actually carried a mapping, and scrubs the matching
// entry from the reverse (offset->virtual) multimap.
static void vtlb_RemoveFastmemMapping(u32 vaddr)
{
	const u32 page = vaddr / VTLB_PAGE_SIZE;
	const u32 mainmem_offset = s_fastmem_virtual_mapping[page];
	if (mainmem_offset == NO_FASTMEM_MAPPING)
		return;

	const bool was_coalesced = vtlb_IsHostCoalesced(page);
	FASTMEM_LOG("Remove fastmem mapping @ vaddr %08X mainmem %08X", vaddr, mainmem_offset);
	s_fastmem_virtual_mapping[page] = NO_FASTMEM_MAPPING;

	if (was_coalesced && !s_fastmem_area->Unmap(s_fastmem_area->PagePointer(vtlb_HostPage(page)), __pagesize))
		Console.Error("Failed to unmap vaddr %08X", vtlb_HostAlignOffset(vaddr));

	// remove from reverse map
	const auto range = s_fastmem_physical_mapping.equal_range(mainmem_offset);
	for (auto it = range.first; it != range.second;)
	{
		if (it->second == vaddr)
			it = s_fastmem_physical_mapping.erase(it);
		else
			++it;
	}
}
// Removes fastmem mappings for a page-aligned guest virtual range.
static void vtlb_RemoveFastmemMappings(u32 vaddr, u32 size)
{
	pxAssert((vaddr & VTLB_PAGE_MASK) == 0);
	pxAssert(size > 0 && (size & VTLB_PAGE_MASK) == 0);

	// iterate by page count rather than end address, so a range touching the
	// top of the 32-bit space cannot wrap
	u32 pages_left = size / VTLB_PAGE_SIZE;
	u32 addr = vaddr;
	while (pages_left-- > 0)
	{
		vtlb_RemoveFastmemMapping(addr);
		addr += VTLB_PAGE_SIZE;
	}
}
static void vtlb_RemoveFastmemMappings()
{
if (s_fastmem_virtual_mapping.empty())
{
// not initialized yet
return;
}
for (u32 page = 0; page < FASTMEM_PAGE_COUNT; page++)
{
if (s_fastmem_virtual_mapping[page] == NO_FASTMEM_MAPPING)
continue;
s_fastmem_virtual_mapping[page] = NO_FASTMEM_MAPPING;
if (!vtlb_IsHostAligned(page << VTLB_PAGE_BITS))
continue;
if (!s_fastmem_area->Unmap(s_fastmem_area->PagePointer(vtlb_HostPage(page)), __pagesize))
Console.Error("Failed to unmap vaddr %08X", page * __pagesize);
}
s_fastmem_physical_mapping.clear();
}
// Converts a host address inside the fastmem arena back to the equivalent
// host address in the shared main memory view. Used by the page fault
// handler to decide whether a fault belongs to fastmem.
//   addr - in/out: host address; rewritten to the mainmem alias on success
// Returns false when the address lies outside the fastmem arena, or the
// guest page it corresponds to has no fastmem mapping.
bool vtlb_ResolveFastmemMapping(uptr* addr)
{
	uptr uaddr = *addr;
	uptr fastmem_start = (uptr)vtlbdata.fastmem_base;
	uptr fastmem_end = fastmem_start + 0xFFFFFFFFu;
	if (uaddr < fastmem_start || uaddr > fastmem_end)
		return false;

	const u32 vaddr = static_cast<u32>(uaddr - fastmem_start);
	FASTMEM_LOG("Trying to resolve %p (vaddr %08X)", (void*)uaddr, vaddr);

	const u32 vpage = vaddr / VTLB_PAGE_SIZE;
	if (s_fastmem_virtual_mapping[vpage] == NO_FASTMEM_MAPPING)
	{
		FASTMEM_LOG("%08X is not virtual mapped", vaddr);
		return false;
	}

	const u32 mainmem_offset = s_fastmem_virtual_mapping[vpage] + (vaddr & VTLB_PAGE_MASK);
	// BUGFIX: cast to void* for the %p conversion — passing a raw uptr
	// through varargs for %p is formally undefined (the first log above
	// already casts correctly).
	FASTMEM_LOG("Resolved %p (vaddr %08X) to mainmem offset %08X", (void*)uaddr, vaddr, mainmem_offset);
	*addr = ((uptr)GetVmMemory().MainMemory()->GetBase()) + mainmem_offset;
	return true;
}
// Maps a host address inside the fastmem arena to its guest virtual
// address. Returns false when the host address is outside the arena.
bool vtlb_GetGuestAddress(uptr host_addr, u32* guest_addr)
{
	const uptr arena_base = (uptr)vtlbdata.fastmem_base;
	const uptr arena_last = arena_base + 0xFFFFFFFFu;
	const bool inside = (host_addr >= arena_base) && (host_addr <= arena_last);
	if (!inside)
		return false;

	*guest_addr = static_cast<u32>(host_addr - arena_base);
	return true;
}
// Applies a new page protection to every fastmem virtual alias of a guest
// physical range (e.g. when RAM pages gain/lose write tracking).
//   paddr - guest physical address (VTLB-page aligned)
//   size  - range length in bytes (VTLB-page multiple)
//   prot  - protection to apply to each aliased host page
void vtlb_UpdateFastmemProtection(u32 paddr, u32 size, const PageProtectionMode& prot)
{
	if (!CHECK_FASTMEM)
		return;
	pxAssert((paddr & VTLB_PAGE_MASK) == 0);
	pxAssert(size > 0 && (size & VTLB_PAGE_MASK) == 0);
	u32 mainmem_start, mainmem_size;
	PageProtectionMode old_prot;
	// physical range not backed by main memory -> nothing to reprotect
	if (!vtlb_GetMainMemoryOffset(paddr, &mainmem_start, &mainmem_size, &old_prot))
		return;
	FASTMEM_LOG("UpdateFastmemProtection %08X mmoffset %08X %08X", paddr, mainmem_start, size);
	u32 current_mainmem = mainmem_start;
	const u32 num_pages = std::min(size, mainmem_size) / VTLB_PAGE_SIZE;
	for (u32 i = 0; i < num_pages; i++, current_mainmem += VTLB_PAGE_SIZE)
	{
		// reprotect every virtual alias of this physical page
		auto range = s_fastmem_physical_mapping.equal_range(current_mainmem);
		for (auto it = range.first; it != range.second; ++it)
		{
			FASTMEM_LOG(" valias %08X (size %u)", it->second, VTLB_PAGE_SIZE);
			// non-host-aligned aliases carry no host mapping of their own
			if (vtlb_IsHostAligned(it->second))
				HostSys::MemProtect(s_fastmem_area->OffsetPointer(it->second), __pagesize, prot);
		}
	}
}
// Forgets all recorded fastmem load/store backpatch records and the set of
// guest PCs whose fastmem accesses previously faulted.
void vtlb_ClearLoadStoreInfo()
{
	s_fastmem_faulting_pcs.clear();
	s_fastmem_backpatch_info.clear();
}
// Records backpatch metadata for a generated fastmem load/store, so the
// fault handler can later rewrite it to the slow path. Any previous record
// for the same code address is replaced.
//   code_address     - host address of the generated access
//   code_size        - byte length of the patchable sequence (must fit u8)
//   guest_pc         - PC of the guest instruction that produced it
//   gpr/fpr_bitmask  - live host registers that must be preserved
//   address_register / data_register / size_in_bits / is_signed / is_load /
//   is_fpr           - operand description for re-emission
void vtlb_AddLoadStoreInfo(uptr code_address, u32 code_size, u32 guest_pc, u32 gpr_bitmask, u32 fpr_bitmask, u8 address_register, u8 data_register, u8 size_in_bits, bool is_signed, bool is_load, bool is_fpr)
{
	pxAssert(code_size < std::numeric_limits<u8>::max());

	LoadstoreBackpatchInfo info{guest_pc, gpr_bitmask, fpr_bitmask, static_cast<u8>(code_size), address_register, data_register, size_in_bits, is_signed, is_load, is_fpr};
	// single lookup replaces the old find + erase + emplace (two lookups)
	s_fastmem_backpatch_info.insert_or_assign(code_address, info);
}
// Invoked from the page fault handler when generated code faults: if the
// fault landed inside the fastmem arena and we have a backpatch record for
// the faulting code address, rewrites that load/store to use the slow path.
//   code_address  - host address of the faulting generated code
//   fault_address - host address the access faulted on
// Returns false when the fault is unrelated to fastmem or no record exists.
bool vtlb_BackpatchLoadStore(uptr code_address, uptr fault_address)
{
	uptr fastmem_start = (uptr)vtlbdata.fastmem_base;
	uptr fastmem_end = fastmem_start + 0xFFFFFFFFu;
	if (fault_address < fastmem_start || fault_address > fastmem_end)
		return false;
	auto iter = s_fastmem_backpatch_info.find(code_address);
	if (iter == s_fastmem_backpatch_info.end())
		return false;
	const LoadstoreBackpatchInfo& info = iter->second;
	const u32 guest_addr = static_cast<u32>(fault_address - fastmem_start);
	vtlb_DynBackpatchLoadStore(code_address, info.code_size, info.guest_pc, guest_addr,
		info.gpr_bitmask, info.fpr_bitmask, info.address_register, info.data_register,
		info.size_in_bits, info.is_signed, info.is_load, info.is_fpr);
	// queue block for recompilation later
	Cpu->Clear(info.guest_pc, 1);
	// and store the pc in the faulting list, so that we don't emit another fastmem loadstore
	s_fastmem_faulting_pcs.insert(info.guest_pc);
	s_fastmem_backpatch_info.erase(iter);
	return true;
}
// Returns true when a fastmem access at this guest PC previously faulted,
// so the recompiler should emit a slow-path access for it instead.
bool vtlb_IsFaultingPC(u32 guest_pc)
{
	return s_fastmem_faulting_pcs.count(guest_pc) != 0;
}
//virtual mappings
//TODO: Add invalid paddr checks
void vtlb_VMap(u32 vaddr,u32 paddr,u32 size)
@ -667,6 +1039,23 @@ void vtlb_VMap(u32 vaddr,u32 paddr,u32 size)
verify(0==(paddr&VTLB_PAGE_MASK));
verify(0==(size&VTLB_PAGE_MASK) && size>0);
if (CHECK_FASTMEM)
{
const u32 num_pages = size / VTLB_PAGE_SIZE;
u32 current_vaddr = vaddr;
u32 current_paddr = paddr;
for (u32 i = 0; i < num_pages; i++, current_vaddr += VTLB_PAGE_SIZE, current_paddr += VTLB_PAGE_SIZE)
{
u32 hoffset, hsize;
PageProtectionMode mode;
if (vtlb_GetMainMemoryOffset(current_paddr, &hoffset, &hsize, &mode))
vtlb_CreateFastmemMapping(current_vaddr, hoffset, mode);
else
vtlb_RemoveFastmemMapping(current_vaddr);
}
}
while (size > 0)
{
VTLBVirtual vmv;
@ -696,6 +1085,22 @@ void vtlb_VMapBuffer(u32 vaddr,void* buffer,u32 size)
verify(0==(vaddr&VTLB_PAGE_MASK));
verify(0==(size&VTLB_PAGE_MASK) && size>0);
if (CHECK_FASTMEM)
{
if (buffer == eeMem->Scratch && size == Ps2MemSize::Scratch)
{
u32 fm_vaddr = vaddr;
u32 fm_hostoffset = HostMemoryMap::EEmemOffset + offsetof(EEVM_MemoryAllocMess, Scratch);
PageProtectionMode mode = PageProtectionMode().Read().Write();
for (u32 i = 0; i < (Ps2MemSize::Scratch / VTLB_PAGE_SIZE); i++, fm_vaddr += VTLB_PAGE_SIZE, fm_hostoffset += VTLB_PAGE_SIZE)
vtlb_CreateFastmemMapping(fm_vaddr, fm_hostoffset, mode);
}
else
{
vtlb_RemoveFastmemMappings(vaddr, size);
}
}
uptr bu8 = (uptr)buffer;
while (size > 0)
{
@ -711,6 +1116,8 @@ void vtlb_VMapUnmap(u32 vaddr,u32 size)
verify(0==(vaddr&VTLB_PAGE_MASK));
verify(0==(size&VTLB_PAGE_MASK) && size>0);
vtlb_RemoveFastmemMappings(vaddr, size);
while (size > 0)
{
@ -775,11 +1182,45 @@ void vtlb_Init()
// This function should probably be part of the COP0 rather than here in VTLB.
void vtlb_Reset()
{
	vtlb_RemoveFastmemMappings();

	// invalidate all 48 EE TLB entries
	for (int entry = 0; entry < 48; entry++)
		UnmapTLB(entry);
}
// Drops all fastmem mappings and the associated backpatch bookkeeping.
void vtlb_Shutdown()
{
	vtlb_RemoveFastmemMappings();
	s_fastmem_faulting_pcs.clear();
	s_fastmem_backpatch_info.clear();
}
// Rebuilds the fastmem mappings from the current virtual map contents,
// after dropping all existing mappings and backpatch state. Nothing is
// re-mapped when fastmem/the EE recompiler is disabled or the vmap does
// not exist yet.
void vtlb_ResetFastmem()
{
	DevCon.WriteLn("Resetting fastmem mappings...");
	vtlb_RemoveFastmemMappings();
	s_fastmem_backpatch_info.clear();
	s_fastmem_faulting_pcs.clear();
	if (!CHECK_FASTMEM || !CHECK_EEREC || !vtlbdata.vmap)
		return;
	// we need to go through and look at the vtlb pointers, to remap the host area
	for (size_t i = 0; i < VTLB_VMAP_ITEMS; i++)
	{
		const VTLBVirtual& vm = vtlbdata.vmap[i];
		const u32 vaddr = static_cast<u32>(i) << VTLB_PAGE_BITS;
		if (vm.isHandler(vaddr))
		{
			// Handlers should be unmapped.
			continue;
		}
		// Check if it's a physical mapping to our main memory area.
		u32 mainmem_offset, mainmem_size;
		PageProtectionMode prot;
		if (vtlb_GetMainMemoryOffsetFromPtr(vm.assumePtr(vaddr), &mainmem_offset, &mainmem_size, &prot))
			vtlb_CreateFastmemMapping(vaddr, mainmem_offset, prot);
	}
}
static constexpr size_t VMAP_SIZE = sizeof(VTLBVirtual) * VTLB_VMAP_ITEMS;
@ -804,6 +1245,19 @@ void vtlb_Core_Alloc()
HostSys::MemProtect(vmap, VMAP_SIZE, PageProtectionMode().Read().Write());
vtlbdata.vmap = vmap;
}
if (!vtlbdata.fastmem_base)
{
pxAssert(!s_fastmem_area);
s_fastmem_area = SharedMemoryMappingArea::Create(FASTMEM_AREA_SIZE);
if (!s_fastmem_area)
pxFailRel("Failed to allocate fastmem area");
s_fastmem_virtual_mapping.resize(FASTMEM_PAGE_COUNT, NO_FASTMEM_MAPPING);
vtlbdata.fastmem_base = (uptr)s_fastmem_area->BasePointer();
Console.WriteLn(Color_StrongGreen, "Fastmem area: %p - %p",
vtlbdata.fastmem_base, vtlbdata.fastmem_base + (FASTMEM_AREA_SIZE - 1));
}
}
static constexpr size_t PPMAP_SIZE = sizeof(*vtlbdata.ppmap) * VTLB_VMAP_ITEMS;
@ -840,6 +1294,14 @@ void vtlb_Core_Free()
HostSys::MemProtect(vtlbdata.ppmap, PPMAP_SIZE, PageProtectionMode());
vtlbdata.ppmap = nullptr;
}
vtlb_RemoveFastmemMappings();
vtlb_ClearLoadStoreInfo();
vtlbdata.fastmem_base = 0;
decltype(s_fastmem_physical_mapping)().swap(s_fastmem_physical_mapping);
decltype(s_fastmem_virtual_mapping)().swap(s_fastmem_virtual_mapping);
s_fastmem_area.reset();
}
static std::string GetHostVmErrorMsg()

View File

@ -57,6 +57,7 @@ extern void vtlb_Alloc_Ppmap();
extern void vtlb_Init();
extern void vtlb_Shutdown();
extern void vtlb_Reset();
extern void vtlb_ResetFastmem();
extern vtlbHandler vtlb_NewHandler();
@ -82,6 +83,15 @@ extern void vtlb_DynV2P();
extern void vtlb_VMap(u32 vaddr,u32 paddr,u32 sz);
extern void vtlb_VMapBuffer(u32 vaddr,void* buffer,u32 sz);
extern void vtlb_VMapUnmap(u32 vaddr,u32 sz);
extern bool vtlb_ResolveFastmemMapping(uptr* addr);
extern bool vtlb_GetGuestAddress(uptr host_addr, u32* guest_addr);
extern void vtlb_UpdateFastmemProtection(u32 paddr, u32 size, const PageProtectionMode& prot);
extern bool vtlb_BackpatchLoadStore(uptr code_address, uptr fault_address);
extern void vtlb_ClearLoadStoreInfo();
extern void vtlb_AddLoadStoreInfo(uptr code_address, u32 code_size, u32 guest_pc, u32 gpr_bitmask, u32 fpr_bitmask, u8 address_register, u8 data_register, u8 size_in_bits, bool is_signed, bool is_load, bool is_fpr);
extern void vtlb_DynBackpatchLoadStore(uptr code_address, u32 code_size, u32 guest_pc, u32 guest_addr, u32 gpr_bitmask, u32 fpr_bitmask, u8 address_register, u8 data_register, u8 size_in_bits, bool is_signed, bool is_load, bool is_fpr);
extern bool vtlb_IsFaultingPC(u32 guest_pc);
//Memory functions
@ -101,13 +111,14 @@ extern DataType vtlb_ramRead(u32 mem);
template <typename DataType>
extern bool vtlb_ramWrite(u32 mem, const DataType& value);
extern void vtlb_DynGenWrite(u32 sz);
extern void vtlb_DynGenReadNonQuad(u32 bits, bool sign);
extern int vtlb_DynGenReadQuad(u32 sz, int gpr);
using vtlb_ReadRegAllocCallback = int(*)();
extern int vtlb_DynGenReadNonQuad(u32 bits, bool sign, bool xmm, int addr_reg, vtlb_ReadRegAllocCallback dest_reg_alloc = nullptr);
extern int vtlb_DynGenReadNonQuad_Const(u32 bits, bool sign, bool xmm, u32 addr_const, vtlb_ReadRegAllocCallback dest_reg_alloc = nullptr);
extern int vtlb_DynGenReadQuad(u32 bits, int addr_reg, vtlb_ReadRegAllocCallback dest_reg_alloc = nullptr);
extern int vtlb_DynGenReadQuad_Const(u32 bits, u32 addr_const, vtlb_ReadRegAllocCallback dest_reg_alloc = nullptr);
extern void vtlb_DynGenWrite_Const( u32 bits, u32 addr_const );
extern int vtlb_DynGenReadQuad_Const( u32 bits, u32 addr_const, int gpr );
extern void vtlb_DynGenReadNonQuad_Const( u32 bits, bool sign, u32 addr_const );
extern void vtlb_DynGenWrite(u32 sz, bool xmm, int addr_reg, int value_reg);
extern void vtlb_DynGenWrite_Const(u32 bits, bool xmm, u32 addr_const, int value_reg);
// --------------------------------------------------------------------------------------
// VtlbMemoryReserve
@ -125,7 +136,7 @@ public:
// --------------------------------------------------------------------------------------
// eeMemoryReserve
// --------------------------------------------------------------------------------------
class eeMemoryReserve : private VtlbMemoryReserve
class eeMemoryReserve : public VtlbMemoryReserve
{
typedef VtlbMemoryReserve _parent;
@ -142,7 +153,7 @@ public:
// --------------------------------------------------------------------------------------
// iopMemoryReserve
// --------------------------------------------------------------------------------------
class iopMemoryReserve : private VtlbMemoryReserve
class iopMemoryReserve : public VtlbMemoryReserve
{
typedef VtlbMemoryReserve _parent;
@ -159,7 +170,7 @@ public:
// --------------------------------------------------------------------------------------
// vuMemoryReserve
// --------------------------------------------------------------------------------------
class vuMemoryReserve : private VtlbMemoryReserve
class vuMemoryReserve : public VtlbMemoryReserve
{
typedef VtlbMemoryReserve _parent;
@ -253,10 +264,13 @@ namespace vtlb_private
u32* ppmap; //4MB (allocated by vtlb_init) // PS2 virtual to PS2 physical
uptr fastmem_base;
MapData()
{
vmap = NULL;
ppmap = NULL;
fastmem_base = 0;
}
};

View File

@ -44,7 +44,7 @@ namespace COP0 {
// this should be a conditional Jump -- JZ or JNZ normally.
static void _setupBranchTest()
{
_eeFlushAllUnused();
_eeFlushAllDirty();
// COP0 branch conditionals are based on the following equation:
// (((psHu16(DMAC_STAT) | ~psHu16(DMAC_PCR)) & 0x3ff) == 0x3ff)
@ -64,26 +64,32 @@ static void _setupBranchTest()
void recBC0F()
{
const u32 branchTo = ((s32)_Imm_ * 4) + pc;
const bool swap = TrySwapDelaySlot(0, 0, 0);
_setupBranchTest();
recDoBranchImm(JE32(0));
recDoBranchImm(branchTo, JE32(0), false, swap);
}
void recBC0T()
{
const u32 branchTo = ((s32)_Imm_ * 4) + pc;
const bool swap = TrySwapDelaySlot(0, 0, 0);
_setupBranchTest();
recDoBranchImm(JNE32(0));
recDoBranchImm(branchTo, JNE32(0), false, swap);
}
void recBC0FL()
{
const u32 branchTo = ((s32)_Imm_ * 4) + pc;
_setupBranchTest();
recDoBranchImm_Likely(JE32(0));
recDoBranchImm(branchTo, JE32(0), true, false);
}
void recBC0TL()
{
const u32 branchTo = ((s32)_Imm_ * 4) + pc;
_setupBranchTest();
recDoBranchImm_Likely(JNE32(0));
recDoBranchImm(branchTo, JNE32(0), true, false);
}
void recTLBR() { recCall(Interp::TLBR); }
@ -118,7 +124,7 @@ void recDI()
// Jak X, Namco 50th anniversary, Spongebob the Movie, Spongebob Battle for Bikini Bottom,
// The Incredibles, The Incredibles rize of the underminer, Soukou kihei armodyne, Garfield Saving Arlene, Tales of Fandom Vol. 2.
if (!g_recompilingDelaySlot)
recompileNextInstruction(0); // DI execution is delayed by one instruction
recompileNextInstruction(false, false); // DI execution is delayed by one instruction
xMOV(eax, ptr[&cpuRegs.CP0.n.Status]);
xTEST(eax, 0x20006); // EXL | ERL | EDI
@ -152,13 +158,12 @@ void recMFC0()
x86SetJ8(skipInc);
xADD(ptr[&cpuRegs.CP0.n.Count], eax);
xMOV(ptr[&cpuRegs.lastCOP0Cycle], ecx);
xMOV(eax, ptr[&cpuRegs.CP0.r[_Rd_]]);
if (!_Rt_)
return;
_deleteEEreg(_Rt_, 0);
eeSignExtendTo(_Rt_);
const int regt = _Rt_ ? _allocX86reg(X86TYPE_GPR, _Rt_, MODE_WRITE) : -1;
xMOVSX(xRegister64(regt), ptr32[&cpuRegs.CP0.r[_Rd_]]);
return;
}
@ -169,22 +174,25 @@ void recMFC0()
{
if (0 == (_Imm_ & 1)) // MFPS, register value ignored
{
xMOV(eax, ptr[&cpuRegs.PERF.n.pccr]);
const int regt = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_WRITE);
xMOVSX(xRegister64(regt), ptr32[&cpuRegs.PERF.n.pccr]);
}
else if (0 == (_Imm_ & 2)) // MFPC 0, only LSB of register matters
{
iFlushCall(FLUSH_INTERPRETER);
xFastCall((void*)COP0_UpdatePCCR);
xMOV(eax, ptr[&cpuRegs.PERF.n.pcr0]);
const int regt = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_WRITE);
xMOVSX(xRegister64(regt), ptr32[&cpuRegs.PERF.n.pcr0]);
}
else // MFPC 1
{
iFlushCall(FLUSH_INTERPRETER);
xFastCall((void*)COP0_UpdatePCCR);
xMOV(eax, ptr[&cpuRegs.PERF.n.pcr1]);
const int regt = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_WRITE);
xMOVSX(xRegister64(regt), ptr32[&cpuRegs.PERF.n.pcr1]);
}
_deleteEEreg(_Rt_, 0);
eeSignExtendTo(_Rt_);
return;
}
@ -193,10 +201,9 @@ void recMFC0()
COP0_LOG("MFC0 Breakpoint debug Registers code = %x\n", cpuRegs.code & 0x3FF);
return;
}
_eeOnWriteReg(_Rt_, 1);
_deleteEEreg(_Rt_, 0);
xMOV(eax, ptr[&cpuRegs.CP0.r[_Rd_]]);
eeSignExtendTo(_Rt_);
const int regt = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_WRITE);
xMOVSX(xRegister64(regt), ptr32[&cpuRegs.CP0.r[_Rd_]]);
}
void recMTC0()
@ -260,15 +267,15 @@ void recMTC0()
switch (_Rd_)
{
case 12:
_eeMoveGPRtoR(arg1reg, _Rt_);
iFlushCall(FLUSH_INTERPRETER);
_eeMoveGPRtoR(ecx, _Rt_);
xFastCall((void*)WriteCP0Status, ecx);
xFastCall((void*)WriteCP0Status);
break;
case 16:
_eeMoveGPRtoR(arg1reg, _Rt_);
iFlushCall(FLUSH_INTERPRETER);
_eeMoveGPRtoR(ecx, _Rt_);
xFastCall((void*)WriteCP0Config, ecx);
xFastCall((void*)WriteCP0Config);
break;
case 9:

File diff suppressed because it is too large Load Diff

View File

@ -22,86 +22,72 @@
// Namespace Note : iCore32 contains all of the Register Allocation logic, in addition to a handful
// of utility functions for emitting frequent code.
//#define RALOG(...) fprintf(stderr, __VA_ARGS__)
#define RALOG(...)
////////////////////////////////////////////////////////////////////////////////
// Shared Register allocation flags (apply to X86, XMM, MMX, etc).
#define MODE_READ 1
#define MODE_WRITE 2
#define MODE_READHALF 4 // read only low 64 bits
#define MODE_VUXY 8 // vector only has xy valid (real zw are in mem), not the same as MODE_READHALF
#define MODE_VUZ 0x10 // z only doesn't work for now
#define MODE_VUXYZ (MODE_VUZ | MODE_VUXY) // vector only has xyz valid (real w is in memory)
#define MODE_NOFLUSH 0x20 // can't flush reg to mem
#define MODE_NOFRAME 0x40 // when allocating x86regs, don't use ebp reg
#define MODE_8BITREG 0x80 // when allocating x86regs, use only eax, ecx, edx, and ebx
#define MODE_CALLEESAVED 0x20 // can't flush reg to mem
#define PROCESS_EE_XMM 0x02
// currently only used in FPU
#define PROCESS_EE_S 0x04 // S is valid, otherwise take from mem
#define PROCESS_EE_T 0x08 // T is valid, otherwise take from mem
#define PROCESS_EE_D 0x10 // D is valid, otherwise take from mem
// not used in VU recs
#define PROCESS_EE_MODEWRITES 0x10 // if s is a reg, set if not in cpuRegs
#define PROCESS_EE_MODEWRITET 0x20 // if t is a reg, set if not in cpuRegs
#define PROCESS_EE_LO 0x40 // lo reg is valid
#define PROCESS_EE_HI 0x80 // hi reg is valid
#define PROCESS_EE_ACC 0x40 // acc reg is valid
// used in VU recs
#define PROCESS_VU_UPDATEFLAGS 0x10
#define PROCESS_VU_COP2 0x80 // simple cop2
#define EEREC_S (((info) >> 8) & 0xf)
#define EEREC_T (((info) >> 12) & 0xf)
#define EEREC_D (((info) >> 16) & 0xf)
#define EEREC_LO (((info) >> 20) & 0xf)
#define EEREC_HI (((info) >> 24) & 0xf)
#define EEREC_ACC (((info) >> 20) & 0xf)
#define EEREC_TEMP (((info) >> 24) & 0xf)
#define VUREC_FMAC ((info)&0x80000000)
#define PROCESS_EE_SET_S(reg) ((reg) << 8)
#define PROCESS_EE_SET_T(reg) ((reg) << 12)
#define PROCESS_EE_SET_D(reg) ((reg) << 16)
#define PROCESS_EE_SET_LO(reg) ((reg) << 20)
#define PROCESS_EE_SET_HI(reg) ((reg) << 24)
#define PROCESS_EE_SET_ACC(reg) ((reg) << 20)
#define PROCESS_VU_SET_ACC(reg) PROCESS_EE_SET_ACC(reg)
#define PROCESS_VU_SET_TEMP(reg) ((reg) << 24)
#define PROCESS_VU_SET_FMAC() 0x80000000
#define PROCESS_EE_SET_S(reg) (((reg) << 8) | PROCESS_EE_S)
#define PROCESS_EE_SET_T(reg) (((reg) << 12) | PROCESS_EE_T)
#define PROCESS_EE_SET_D(reg) (((reg) << 16) | PROCESS_EE_D)
#define PROCESS_EE_SET_LO(reg) (((reg) << 20) | PROCESS_EE_LO)
#define PROCESS_EE_SET_HI(reg) (((reg) << 24) | PROCESS_EE_HI)
#define PROCESS_EE_SET_ACC(reg) (((reg) << 20) | PROCESS_EE_ACC)
// special info not related to above flags
#define PROCESS_CONSTS 1
#define PROCESS_CONSTT 2
// XMM caching helpers
#define XMMINFO_READLO 0x001
#define XMMINFO_READHI 0x002
#define XMMINFO_WRITELO 0x004
#define XMMINFO_WRITEHI 0x008
#define XMMINFO_WRITED 0x010
#define XMMINFO_READD 0x020
#define XMMINFO_READS 0x040
#define XMMINFO_READT 0x080
#define XMMINFO_READACC 0x200
#define XMMINFO_WRITEACC 0x400
#define XMMINFO_WRITET 0x800
#define XMMINFO_64BITOP 0x1000
#define XMMINFO_FORCEREGS 0x2000
#define XMMINFO_FORCEREGT 0x4000
#define XMMINFO_NORENAME 0x8000 // disables renaming of Rs to Rt in Rt = Rs op imm
////////////////////////////////////////////////////////////////////////////////
// X86 (32-bit) Register Allocation Tools
#define X86TYPE_TEMP 0
#define X86TYPE_GPR 1
#define X86TYPE_VI 2
#define X86TYPE_MEMOFFSET 3
#define X86TYPE_VIMEMOFFSET 4
#define X86TYPE_VUQREAD 5
#define X86TYPE_VUPREAD 6
#define X86TYPE_VUQWRITE 7
#define X86TYPE_VUPWRITE 8
#define X86TYPE_PSX 9
#define X86TYPE_PCWRITEBACK 10
#define X86TYPE_PSX_PCWRITEBACK 12
#define X86TYPE_VITEMP 13
#define X86TYPE_FNARG 14 // function parameter, max is 4
#define X86TYPE_VU1 0x80
//#define X86_ISVI(type) ((type&~X86TYPE_VU1) == X86TYPE_VI)
static __fi int X86_ISVI(int type)
{
return ((type & ~X86TYPE_VU1) == X86TYPE_VI);
}
#define X86TYPE_FPRC 2
#define X86TYPE_VIREG 3
#define X86TYPE_PCWRITEBACK 4
#define X86TYPE_PSX 5
#define X86TYPE_PSX_PCWRITEBACK 6
struct _x86regs
{
@ -116,79 +102,83 @@ struct _x86regs
extern _x86regs x86regs[iREGCNT_GPR], s_saveX86regs[iREGCNT_GPR];
uptr _x86GetAddr(int type, int reg);
bool _isAllocatableX86reg(int x86reg);
void _initX86regs();
int _getFreeX86reg(int mode);
int _allocX86reg(x86Emitter::xRegister32 x86reg, int type, int reg, int mode);
void _deleteX86reg(int type, int reg, int flush);
int _allocX86reg(int type, int reg, int mode);
int _checkX86reg(int type, int reg, int mode);
bool _hasX86reg(int type, int reg, int required_mode = 0);
void _addNeededX86reg(int type, int reg);
void _clearNeededX86regs();
void _freeX86reg(const x86Emitter::xRegister32& x86reg);
void _freeX86reg(int x86reg);
void _freeX86regWithoutWriteback(int x86reg);
void _freeX86regs();
void _flushCachedRegs();
void _flushX86regs();
void _flushConstRegs();
void _flushConstReg(int reg);
void _validateRegs();
void _writebackX86Reg(int x86reg);
////////////////////////////////////////////////////////////////////////////////
// XMM (128-bit) Register Allocation Tools
#define XMM_CONV_VU(VU) (VU == &VU1)
#define XMMTYPE_TEMP 0 // has to be 0
#define XMMTYPE_VFREG 1
#define XMMTYPE_ACC 2
#define XMMTYPE_FPREG 3
#define XMMTYPE_FPACC 4
#define XMMTYPE_GPRREG 5
#define XMMTYPE_GPRREG X86TYPE_GPR
#define XMMTYPE_FPREG 6
#define XMMTYPE_FPACC 7
#define XMMTYPE_VFREG 8
// lo and hi regs
#define XMMGPR_LO 33
#define XMMGPR_HI 32
#define XMMFPU_ACC 32
enum : int
{
DELETE_REG_FREE = 0,
DELETE_REG_FLUSH = 1,
DELETE_REG_FLUSH_AND_FREE = 2,
DELETE_REG_FREE_NO_WRITEBACK = 3
};
struct _xmmregs
{
u8 inuse;
u8 reg;
s8 reg;
u8 type;
u8 mode;
u8 needed;
u8 VU; // 0 = VU0, 1 = VU1
u16 counter;
};
void _cop2BackupRegs();
void _cop2RestoreRegs();
void _initXMMregs();
int _getFreeXMMreg();
int _allocTempXMMreg(XMMSSEType type, int xmmreg);
int _allocFPtoXMMreg(int xmmreg, int fpreg, int mode);
int _allocGPRtoXMMreg(int xmmreg, int gprreg, int mode);
int _allocFPACCtoXMMreg(int xmmreg, int mode);
int _getFreeXMMreg(u32 maxreg = iREGCNT_XMM);
int _allocTempXMMreg(XMMSSEType type);
int _allocFPtoXMMreg(int fpreg, int mode);
int _allocGPRtoXMMreg(int gprreg, int mode);
int _allocFPACCtoXMMreg(int mode);
void _reallocateXMMreg(int xmmreg, int newtype, int newreg, int newmode, bool writeback = true);
int _checkXMMreg(int type, int reg, int mode);
bool _hasXMMreg(int type, int reg, int required_mode = 0);
void _addNeededFPtoXMMreg(int fpreg);
void _addNeededFPACCtoXMMreg();
void _addNeededGPRtoX86reg(int gprreg);
void _addNeededPSXtoX86reg(int gprreg);
void _addNeededGPRtoXMMreg(int gprreg);
void _clearNeededXMMregs();
//void _deleteACCtoXMMreg(int vu, int flush);
void _deleteGPRtoX86reg(int reg, int flush);
void _deletePSXtoX86reg(int reg, int flush);
void _deleteGPRtoXMMreg(int reg, int flush);
void _deleteFPtoXMMreg(int reg, int flush);
void _freeXMMreg(u32 xmmreg);
void _clearNeededCOP2Regs();
u16 _freeXMMregsCOP2();
//void _moveXMMreg(int xmmreg); // instead of freeing, moves it to a diff location
void _freeXMMreg(int xmmreg);
void _freeXMMregWithoutWriteback(int xmmreg);
void _writebackXMMreg(int xmmreg);
int _allocVFtoXMMreg(int vfreg, int mode);
void mVUFreeCOP2XMMreg(int hostreg);
void _flushCOP2regs();
void _flushXMMreg(int xmmreg);
void _flushXMMregs();
u8 _hasFreeXMMreg();
void _freeXMMregs();
int _getNumXMMwrite();
void _signExtendSFtoM(uptr mem);
// returns new index of reg, lower 32 bits already in mmx
// shift is used when the data is in the top bits of the mmx reg to begin with
// a negative shift is for sign extension
int _signExtendXMMtoM(uptr to, x86SSERegType from, int candestroy); // returns true if reg destroyed
//////////////////////
// Instruction Info //
@ -205,54 +195,99 @@ int _signExtendXMMtoM(uptr to, x86SSERegType from, int candestroy); // returns t
// 3/ EEINST_LIVE* is cleared when register is written. And set again when register is read.
// My guess: the purpose is to detect the usage hole in the flow
#define EEINST_LIVE0 1 // if var is ever used (read or write)
#define EEINST_LIVE2 4 // if cur var's next 64 bits are needed
#define EEINST_LIVE 1 // if var is ever used (read or write)
#define EEINST_LASTUSE 8 // if var isn't written/read anymore
//#define EEINST_MMX 0x10 // removed
#define EEINST_XMM 0x20 // var will be used in xmm ops
#define EEINST_USED 0x40
#define EEINSTINFO_COP1 1
#define EEINSTINFO_COP2 2
#define EEINST_COP2_DENORMALIZE_STATUS_FLAG 0x100
#define EEINST_COP2_NORMALIZE_STATUS_FLAG 0x200
#define EEINST_COP2_STATUS_FLAG 0x400
#define EEINST_COP2_MAC_FLAG 0x800
#define EEINST_COP2_CLIP_FLAG 0x1000
#define EEINST_COP2_FINISH_VU0_MICRO 0x2000
#define EEINST_COP2_SYNC_VU0 0x2000
#define EEINST_COP2_FINISH_VU0 0x4000
#define EEINST_COP2_FLUSH_VU0_REGISTERS 0x8000
struct EEINST
{
u16 info; // extra info, if 1 inst is COP1, 2 inst is COP2. Also uses EEINST_XMM
u8 regs[34]; // includes HI/LO (HI=32, LO=33)
u8 fpuregs[33]; // ACC=32
u8 vfregs[33]; // ACC=32
u8 viregs[16];
// uses XMMTYPE_ flags; if type == XMMTYPE_TEMP, not used
u8 writeType[3], writeReg[3]; // reg written in this inst, 0 if no reg
u8 readType[4], readReg[4];
// valid if info & EEINSTINFO_COP2
int cycle; // cycle of inst (at offset from block)
_VURegsNum vuregs;
};
extern EEINST* g_pCurInstInfo; // info for the cur instruction
extern void _recClearInst(EEINST* pinst);
// returns the number of insts + 1 until written (0 if not written)
extern u32 _recIsRegWritten(EEINST* pinst, int size, u8 xmmtype, u8 reg);
// returns the number of insts + 1 until used (0 if not used)
//extern u32 _recIsRegUsed(EEINST* pinst, int size, u8 xmmtype, u8 reg);
extern u32 _recIsRegReadOrWritten(EEINST* pinst, int size, u8 xmmtype, u8 reg);
extern void _recFillRegister(EEINST& pinst, int type, int reg, int write);
static __fi bool EEINST_ISLIVE64(u32 reg) { return !!(g_pCurInstInfo->regs[reg] & (EEINST_LIVE0)); }
static __fi bool EEINST_ISLIVEXMM(u32 reg) { return !!(g_pCurInstInfo->regs[reg] & (EEINST_LIVE0 | EEINST_LIVE2)); }
static __fi bool EEINST_ISLIVE2(u32 reg) { return !!(g_pCurInstInfo->regs[reg] & EEINST_LIVE2); }
// If unset, values which are not live will not be written back to memory.
// Tends to break stuff at the moment.
#define EE_WRITE_DEAD_VALUES 1
static __fi bool FPUINST_ISLIVE(u32 reg) { return !!(g_pCurInstInfo->fpuregs[reg] & EEINST_LIVE0); }
/// Returns true if the register is used later in the block, and this isn't the last instruction to use it.
/// In other words, the register is worth keeping in a host register/caching it.
static __fi bool EEINST_USEDTEST(u32 reg)
{
return (g_pCurInstInfo->regs[reg] & (EEINST_USED | EEINST_LASTUSE)) == EEINST_USED;
}
/// Returns true if the register is used later in the block as an XMM/128-bit value.
static __fi bool EEINST_XMMUSEDTEST(u32 reg)
{
return (g_pCurInstInfo->regs[reg] & (EEINST_USED | EEINST_XMM | EEINST_LASTUSE)) == (EEINST_USED | EEINST_XMM);
}
/// Returns true if the specified VF register is used later in the block.
static __fi bool COP2INST_USEDTEST(u32 reg)
{
return (g_pCurInstInfo->vfregs[reg] & (EEINST_USED | EEINST_LASTUSE)) == EEINST_USED;
}
/// Returns true if the value should be computed/written back.
/// Basically, this means it's either used before it's overwritten, or not overwritten by the end of the block.
static __fi bool EEINST_LIVETEST(u32 reg)
{
return EE_WRITE_DEAD_VALUES || ((g_pCurInstInfo->regs[reg] & EEINST_LIVE) != 0);
}
/// Returns true if the register can be renamed into another.
static __fi bool EEINST_RENAMETEST(u32 reg)
{
return (reg == 0 || !EEINST_USEDTEST(reg) || !EEINST_LIVETEST(reg));
}
static __fi bool FPUINST_ISLIVE(u32 reg) { return !!(g_pCurInstInfo->fpuregs[reg] & EEINST_LIVE); }
static __fi bool FPUINST_LASTUSE(u32 reg) { return !!(g_pCurInstInfo->fpuregs[reg] & EEINST_LASTUSE); }
/// Returns true if the register is used later in the block, and this isn't the last instruction to use it.
/// In other words, the register is worth keeping in a host register/caching it.
static __fi bool FPUINST_USEDTEST(u32 reg)
{
	const u32 flags = g_pCurInstInfo->fpuregs[reg];
	return (flags & EEINST_USED) && !(flags & EEINST_LASTUSE);
}
/// Returns true if the value should be computed/written back.
static __fi bool FPUINST_LIVETEST(u32 reg)
{
	// When EE_WRITE_DEAD_VALUES is enabled, everything is treated as live.
	if (EE_WRITE_DEAD_VALUES)
		return true;

	return FPUINST_ISLIVE(reg);
}
/// Returns true if the FPU register can be renamed into another.
/// A register is renameable when keeping its current value has no benefit:
/// it is either not read again later in the block, or never needs writing back.
static __fi bool FPUINST_RENAMETEST(u32 reg)
{
	// Fix: the original queried the GPR tables (EEINST_USEDTEST/EEINST_LIVETEST
	// index g_pCurInstInfo->regs[]) for an FPU register index. Use the
	// fpuregs-based tests so this matches FPUINST_LIVETEST and the other
	// FPUINST_* helpers above.
	return (!FPUINST_USEDTEST(reg) || !FPUINST_LIVETEST(reg));
}
extern _xmmregs xmmregs[iREGCNT_XMM], s_saveXMMregs[iREGCNT_XMM];
extern thread_local u8* j8Ptr[32]; // deprecated item. use local u8* vars instead.
@ -261,47 +296,32 @@ extern thread_local u32* j32Ptr[32]; // depreciated item. use local u32* vars i
extern u16 g_x86AllocCounter;
extern u16 g_xmmAllocCounter;
// allocates only if later insts use XMM, otherwise checks
int _allocCheckGPRtoXMM(EEINST* pinst, int gprreg, int mode);
int _allocCheckFPUtoXMM(EEINST* pinst, int fpureg, int mode);
// allocates only if later insts use this register
int _allocCheckGPRtoX86(EEINST* pinst, int gprreg, int mode);
int _allocIfUsedGPRtoX86(int gprreg, int mode);
int _allocIfUsedGPRtoXMM(int gprreg, int mode);
int _allocIfUsedFPUtoXMM(int fpureg, int mode);
//////////////////////////////////////////////////////////////////////////
// iFlushCall / _psxFlushCall Parameters
// Flushing vs. Freeing, as understood by Air (I could be wrong still....)
// "Freeing" registers means that the contents of the registers are flushed to memory.
// This is good for any sort of C code function that plans to modify the actual
// registers. When the Recs resume, they'll reload the registers with values saved
// as needed. (similar to a "FreezeXMMRegs")
// "Flushing" means that in addition to the standard free (which is actually a flush)
// the register allocations are additionally wiped. This should only be necessary if
// the code being called is going to modify register allocations -- ie, be doing
// some kind of recompiling of its own.
#define FLUSH_CACHED_REGS 0x001
#define FLUSH_FLUSH_XMM 0x002
#define FLUSH_FREE_XMM 0x004 // both flushes and frees
#define FLUSH_FLUSH_ALLX86 0x020 // flush x86
#define FLUSH_FREE_TEMPX86 0x040 // flush and free temporary x86 regs
#define FLUSH_FREE_ALLX86 0x080 // free all x86 regs
#define FLUSH_FREE_VU0 0x100 // free all vu0 related regs
#define FLUSH_PC 0x200 // program counter
#define FLUSH_CAUSE 0x000 // disabled for now: cause register, only the branch delay bit
#define FLUSH_CODE 0x800 // opcode for interpreter
#define FLUSH_NONE 0x000 // frees caller saved registers
#define FLUSH_CONSTANT_REGS 0x001
#define FLUSH_FLUSH_XMM 0x002
#define FLUSH_FREE_XMM 0x004 // both flushes and frees
#define FLUSH_ALL_X86 0x020 // flush x86
#define FLUSH_FREE_TEMP_X86 0x040 // flush and free temporary x86 regs
#define FLUSH_FREE_NONTEMP_X86 0x080 // free all x86 regs, except temporary
#define FLUSH_FREE_VU0 0x100 // free all vu0 related regs
#define FLUSH_PC 0x200 // program counter
//#define FLUSH_CAUSE 0x000 // disabled for now: cause register, only the branch delay bit
#define FLUSH_CODE 0x800 // opcode for interpreter
#define FLUSH_EVERYTHING 0x1ff
//#define FLUSH_EXCEPTION 0x1ff // will probably do this totally differently actually
#define FLUSH_INTERPRETER 0xfff
#define FLUSH_FULLVTLB FLUSH_NOCONST
#define FLUSH_FULLVTLB 0x000
// no freeing, used when callee won't destroy xmm regs
#define FLUSH_NODESTROY (FLUSH_CACHED_REGS | FLUSH_FLUSH_XMM | FLUSH_FLUSH_ALLX86)
// used when regs aren't going to be changed be callee
#define FLUSH_NOCONST (FLUSH_FREE_XMM | FLUSH_FREE_TEMPX86)
#define FLUSH_NODESTROY (FLUSH_CONSTANT_REGS | FLUSH_FLUSH_XMM | FLUSH_ALL_X86)
#endif

View File

@ -126,23 +126,18 @@ void recCFC1(void)
return;
EE::Profiler.EmitOp(eeOpcode::CFC1);
_eeOnWriteReg(_Rt_, 1);
if (_Fs_ >= 16)
xMOV(eax, ptr[&fpuRegs.fprc[31]]);
else
xMOV(eax, ptr[&fpuRegs.fprc[0]]);
_deleteEEreg(_Rt_, 0);
const int regt = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_WRITE);
if (_Fs_ >= 16)
{
xAND(eax, 0x0083c078); //remove always-zero bits
xOR(eax, 0x01000001); //set always-one bits
xMOV(xRegister32(regt), ptr32[&fpuRegs.fprc[31]]);
xAND(xRegister32(regt), 0x0083c078); //remove always-zero bits
xOR(xRegister32(regt), 0x01000001); //set always-one bits
xMOVSX(xRegister64(regt), xRegister32(regt));
}
else
{
xMOVSX(xRegister64(regt), ptr32[&fpuRegs.fprc[0]]);
}
xCDQ();
xMOV(ptr[&cpuRegs.GPR.r[_Rt_].UL[0]], eax);
xMOV(ptr[&cpuRegs.GPR.r[_Rt_].UL[1]], edx);
}
void recCTC1()
@ -163,7 +158,10 @@ void recCTC1()
{
xMOVSS(ptr[&fpuRegs.fprc[_Fs_]], xRegisterSSE(mmreg));
}
else if ((mmreg = _checkX86reg(X86TYPE_GPR, _Rt_, MODE_READ)) >= 0)
{
xMOV(ptr32[&fpuRegs.fprc[_Fs_]], xRegister32(mmreg));
}
else
{
_deleteGPRtoXMMreg(_Rt_, 1);
@ -184,36 +182,42 @@ void recMFC1()
{
if (!_Rt_)
return;
EE::Profiler.EmitOp(eeOpcode::MFC1);
_eeOnWriteReg(_Rt_, 1);
const int xmmregt = _allocIfUsedGPRtoXMM(_Rt_, MODE_READ | MODE_WRITE);
const int regs = _allocIfUsedFPUtoXMM(_Fs_, MODE_READ);
if (regs >= 0 && xmmregt >= 0)
{
// if we're in xmm, we shouldn't be const
pxAssert(!GPR_IS_CONST1(_Rt_));
const int regs = _checkXMMreg(XMMTYPE_FPREG, _Fs_, MODE_READ);
// both in xmm, sign extend and insert lower bits
const int temp = _allocTempXMMreg(XMMT_FPS);
xMOVAPS(xRegisterSSE(temp), xRegisterSSE(regs));
xPSRA.D(xRegisterSSE(temp), 31);
xMOVSS(xRegisterSSE(xmmregt), xRegisterSSE(regs));
xINSERTPS(xRegisterSSE(xmmregt), xRegisterSSE(temp), _MM_MK_INSERTPS_NDX(0, 1, 0));
_freeXMMreg(temp);
return;
}
// storing to a gpr..
const int regt = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_WRITE);
// shouldn't be const after we're writing.
pxAssert(!GPR_IS_CONST1(_Rt_));
if (regs >= 0)
{
_deleteGPRtoXMMreg(_Rt_, 2);
_signExtendXMMtoM((uptr)&cpuRegs.GPR.r[_Rt_].UL[0], regs, 0);
// xmm -> gpr
xMOVD(xRegister32(regt), xRegisterSSE(regs));
xMOVSX(xRegister64(regt), xRegister32(regt));
}
else
{
const int regt = _checkXMMreg(XMMTYPE_GPRREG, _Rt_, MODE_READ);
if (regt >= 0)
{
if (xmmregs[regt].mode & MODE_WRITE)
{
xMOVH.PS(ptr[&cpuRegs.GPR.r[_Rt_].UL[2]], xRegisterSSE(regt));
}
xmmregs[regt].inuse = 0;
}
_deleteEEreg(_Rt_, 0);
xMOV(eax, ptr[&fpuRegs.fpr[_Fs_].UL]);
xCDQ();
xMOV(ptr[&cpuRegs.GPR.r[_Rt_].UL[0]], eax);
xMOV(ptr[&cpuRegs.GPR.r[_Rt_].UL[1]], edx);
// mem -> gpr
xMOVSX(xRegister64(regt), ptr32[&fpuRegs.fpr[_Fs_].UL]);
}
}
@ -228,44 +232,60 @@ void recMTC1()
EE::Profiler.EmitOp(eeOpcode::MTC1);
if (GPR_IS_CONST1(_Rt_))
{
_deleteFPtoXMMreg(_Fs_, 0);
xMOV(ptr32[&fpuRegs.fpr[_Fs_].UL], g_cpuConstRegs[_Rt_].UL[0]);
}
else
{
int mmreg = _checkXMMreg(XMMTYPE_GPRREG, _Rt_, MODE_READ);
if (mmreg >= 0)
const int xmmreg = _allocIfUsedFPUtoXMM(_Fs_, MODE_WRITE);
if (xmmreg >= 0)
{
if (g_pCurInstInfo->regs[_Rt_] & EEINST_LASTUSE)
// common case: mtc1 zero, fnn
if (g_cpuConstRegs[_Rt_].UL[0] == 0)
{
// transfer the reg directly
_deleteGPRtoXMMreg(_Rt_, 2);
_deleteFPtoXMMreg(_Fs_, 2);
_allocFPtoXMMreg(mmreg, _Fs_, MODE_WRITE);
xPXOR(xRegisterSSE(xmmreg), xRegisterSSE(xmmreg));
}
else
{
int mmreg2 = _allocCheckFPUtoXMM(g_pCurInstInfo, _Fs_, MODE_WRITE);
if (mmreg2 >= 0)
xMOVSS(xRegisterSSE(mmreg2), xRegisterSSE(mmreg));
else
xMOVSS(ptr[&fpuRegs.fpr[_Fs_].UL], xRegisterSSE(mmreg));
// may as well flush the constant register, since we're needing it in a gpr anyway
const int x86reg = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_READ);
xMOVDZX(xRegisterSSE(xmmreg), xRegister32(x86reg));
}
}
else
{
int mmreg2 = _allocCheckFPUtoXMM(g_pCurInstInfo, _Fs_, MODE_WRITE);
if (mmreg2 >= 0)
pxAssert(!_hasXMMreg(XMMTYPE_FPREG, _Fs_));
xMOV(ptr32[&fpuRegs.fpr[_Fs_].UL], g_cpuConstRegs[_Rt_].UL[0]);
}
}
else
{
const int xmmgpr = _checkXMMreg(XMMTYPE_GPRREG, _Rt_, MODE_READ);
if (xmmgpr >= 0)
{
if (g_pCurInstInfo->regs[_Rt_] & EEINST_LASTUSE)
{
xMOVSSZX(xRegisterSSE(mmreg2), ptr[&cpuRegs.GPR.r[_Rt_].UL[0]]);
// transfer the reg directly
_deleteFPtoXMMreg(_Fs_, DELETE_REG_FREE_NO_WRITEBACK);
_reallocateXMMreg(xmmgpr, XMMTYPE_FPREG, _Fs_, MODE_WRITE);
}
else
{
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rt_].UL[0]]);
xMOV(ptr[&fpuRegs.fpr[_Fs_].UL], eax);
const int xmmreg2 = _allocIfUsedFPUtoXMM(_Fs_, MODE_WRITE);
if (xmmreg2 >= 0)
xMOVSS(xRegisterSSE(xmmreg2), xRegisterSSE(xmmgpr));
else
xMOVSS(ptr[&fpuRegs.fpr[_Fs_].UL], xRegisterSSE(xmmgpr));
}
}
else
{
// may as well cache it..
const int regt = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_READ);
const int mmreg2 = _allocIfUsedFPUtoXMM(_Fs_, MODE_WRITE);
if (mmreg2 >= 0)
{
xMOVDZX(xRegisterSSE(mmreg2), xRegister32(regt));
}
else
{
xMOV(ptr32[&fpuRegs.fpr[_Fs_].UL], xRegister32(regt));
}
}
}
@ -311,31 +331,39 @@ REC_FPUFUNC(RSQRT_S);
// Clamp Functions (Converts NaN's and Infinities to Normal Numbers)
//------------------------------------------------------------------
alignas(16) static u64 FPU_FLOAT_TEMP[2];
// Produces an XMM register that is safe to clamp destructively for fpureg
// (currently held in xmmreg). Either copies the value into a fresh temporary,
// or converts xmmreg itself into a temporary when the value isn't needed again.
static int fpuCopyToTempForClamp(int fpureg, int xmmreg)
{
	if (!FPUINST_USEDTEST(fpureg))
	{
		// Value isn't read again in this block. Write it back first if it's
		// live, then demote the register to a temporary so a wrong liveness
		// guess can't cause the clamped value to be reused afterwards.
		if (FPUINST_LIVETEST(fpureg))
			_flushXMMreg(xmmreg);

		_reallocateXMMreg(xmmreg, XMMTYPE_TEMP, 0, 0, true);
		return xmmreg;
	}

	// Still used later: clamp a copy instead of the cached value.
	const int copyreg = _allocTempXMMreg(XMMT_FPS);
	xMOVSS(xRegisterSSE(copyreg), xRegisterSSE(xmmreg));
	return copyreg;
}
// Releases xmmreg, but only when it is an in-use temporary (i.e. one that
// fpuCopyToTempForClamp allocated or demoted); cached registers are left alone.
static void fpuFreeIfTemp(int xmmreg)
{
	const bool is_live_temp = xmmregs[xmmreg].inuse && (xmmregs[xmmreg].type == XMMTYPE_TEMP);
	if (is_live_temp)
		_freeXMMreg(xmmreg);
}
__fi void fpuFloat3(int regd) // +NaN -> +fMax, -NaN -> -fMax, +Inf -> +fMax, -Inf -> -fMax
{
int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
if (t1reg >= 0)
{
xMOVSS(xRegisterSSE(t1reg), xRegisterSSE(regd));
xAND.PS(xRegisterSSE(t1reg), ptr[&s_neg[0]]);
xMIN.SS(xRegisterSSE(regd), ptr[&g_maxvals[0]]);
xMAX.SS(xRegisterSSE(regd), ptr[&g_minvals[0]]);
xOR.PS(xRegisterSSE(regd), xRegisterSSE(t1reg));
_freeXMMreg(t1reg);
}
else
{
Console.Error("fpuFloat2() allocation error");
t1reg = (regd == 0) ? 1 : 0; // get a temp reg thats not regd
xMOVAPS(ptr[&FPU_FLOAT_TEMP[0]], xRegisterSSE(t1reg)); // backup data in t1reg to a temp address
xMOVSS(xRegisterSSE(t1reg), xRegisterSSE(regd));
xAND.PS(xRegisterSSE(t1reg), ptr[&s_neg[0]]);
xMIN.SS(xRegisterSSE(regd), ptr[&g_maxvals[0]]);
xMAX.SS(xRegisterSSE(regd), ptr[&g_minvals[0]]);
xOR.PS(xRegisterSSE(regd), xRegisterSSE(t1reg));
xMOVAPS(xRegisterSSE(t1reg), ptr[&FPU_FLOAT_TEMP[0]]); // restore t1reg data
}
const int t1reg = _allocTempXMMreg(XMMT_FPS);
xMOVSS(xRegisterSSE(t1reg), xRegisterSSE(regd));
xAND.PS(xRegisterSSE(t1reg), ptr[&s_neg[0]]);
xMIN.SS(xRegisterSSE(regd), ptr[&g_maxvals[0]]);
xMAX.SS(xRegisterSSE(regd), ptr[&g_minvals[0]]);
xOR.PS(xRegisterSSE(regd), xRegisterSSE(t1reg));
_freeXMMreg(t1reg);
}
__fi void fpuFloat(int regd) // +/-NaN -> +fMax, +Inf -> +fMax, -Inf -> -fMax
@ -396,34 +424,31 @@ FPURECOMPILE_CONSTCODE(ABS_S, XMMINFO_WRITED | XMMINFO_READS);
//------------------------------------------------------------------
void FPU_ADD_SUB(int regd, int regt, int issub)
{
int tempecx = _allocX86reg(ecx, X86TYPE_TEMP, 0, 0); //receives regd
int temp2 = _allocX86reg(xEmptyReg, X86TYPE_TEMP, 0, 0); //receives regt
int xmmtemp = _allocTempXMMreg(XMMT_FPS, -1); //temporary for anding with regd/regt
xMOVD(xRegister32(tempecx), xRegisterSSE(regd));
xMOVD(xRegister32(temp2), xRegisterSSE(regt));
const int xmmtemp = _allocTempXMMreg(XMMT_FPS); //temporary for anding with regd/regt
xMOVD(ecx, xRegisterSSE(regd)); // ecx receives regd
xMOVD(eax, xRegisterSSE(regt)); // eax receives regt
//mask the exponents
xSHR(xRegister32(tempecx), 23);
xSHR(xRegister32(temp2), 23);
xAND(xRegister32(tempecx), 0xff);
xAND(xRegister32(temp2), 0xff);
xSHR(ecx, 23);
xSHR(eax, 23);
xAND(ecx, 0xff);
xAND(eax, 0xff);
xSUB(xRegister32(tempecx), xRegister32(temp2)); //tempecx = exponent difference
xCMP(xRegister32(tempecx), 25);
xSUB(ecx, eax); //tempecx = exponent difference
xCMP(ecx, 25);
j8Ptr[0] = JGE8(0);
xCMP(xRegister32(tempecx), 0);
xCMP(ecx, 0);
j8Ptr[1] = JG8(0);
j8Ptr[2] = JE8(0);
xCMP(xRegister32(tempecx), -25);
xCMP(ecx, -25);
j8Ptr[3] = JLE8(0);
//diff = -24 .. -1 , expd < expt
xNEG(xRegister32(tempecx));
xDEC(xRegister32(tempecx));
xMOV(xRegister32(temp2), 0xffffffff);
xSHL(xRegister32(temp2), cl); //temp2 = 0xffffffff << tempecx
xMOVDZX(xRegisterSSE(xmmtemp), xRegister32(temp2));
xNEG(ecx);
xDEC(ecx);
xMOV(eax, 0xffffffff);
xSHL(eax, cl); //temp2 = 0xffffffff << tempecx
xMOVDZX(xRegisterSSE(xmmtemp), eax);
xAND.PS(xRegisterSSE(regd), xRegisterSSE(xmmtemp));
if (issub)
xSUB.SS(xRegisterSSE(regd), xRegisterSSE(regt));
@ -443,10 +468,10 @@ void FPU_ADD_SUB(int regd, int regt, int issub)
x86SetJ8(j8Ptr[1]);
//diff = 1 .. 24, expt < expd
xDEC(xRegister32(tempecx));
xMOV(xRegister32(temp2), 0xffffffff);
xSHL(xRegister32(temp2), cl); //temp2 = 0xffffffff << tempecx
xMOVDZX(xRegisterSSE(xmmtemp), xRegister32(temp2));
xDEC(ecx);
xMOV(eax, 0xffffffff);
xSHL(eax, cl); //temp2 = 0xffffffff << tempecx
xMOVDZX(xRegisterSSE(xmmtemp), eax);
xAND.PS(xRegisterSSE(xmmtemp), xRegisterSSE(regt));
if (issub)
xSUB.SS(xRegisterSSE(regd), xRegisterSSE(xmmtemp));
@ -476,8 +501,6 @@ void FPU_ADD_SUB(int regd, int regt, int issub)
x86SetJ8(j8Ptr[7]);
_freeXMMreg(xmmtemp);
_freeX86reg(temp2);
_freeX86reg(tempecx);
}
void FPU_ADD(int regd, int regt)
@ -550,7 +573,7 @@ static void (*recComOpXMM_to_XMM_REV[])(x86SSERegType, x86SSERegType) = { //reve
int recCommutativeOp(int info, int regd, int op)
{
int t0reg = _allocTempXMMreg(XMMT_FPS, -1);
int t0reg = _allocTempXMMreg(XMMT_FPS);
switch (info & (PROCESS_EE_S | PROCESS_EE_T))
{
@ -667,7 +690,7 @@ FPURECOMPILE_CONSTCODE(ADDA_S, XMMINFO_WRITEACC | XMMINFO_READS | XMMINFO_READT)
static void _setupBranchTest()
{
_eeFlushAllUnused();
_eeFlushAllDirty();
// COP1 branch conditionals are based on the following equation:
// (fpuRegs.fprc[31] & 0x00800000)
@ -680,29 +703,35 @@ static void _setupBranchTest()
void recBC1F()
{
EE::Profiler.EmitOp(eeOpcode::BC1F);
const u32 branchTo = ((s32)_Imm_ * 4) + pc;
const bool swap = TrySwapDelaySlot(0, 0, 0);
_setupBranchTest();
recDoBranchImm(JNZ32(0));
recDoBranchImm(branchTo, JNZ32(0), false, swap);
}
void recBC1T()
{
EE::Profiler.EmitOp(eeOpcode::BC1T);
const u32 branchTo = ((s32)_Imm_ * 4) + pc;
const bool swap = TrySwapDelaySlot(0, 0, 0);
_setupBranchTest();
recDoBranchImm(JZ32(0));
recDoBranchImm(branchTo, JZ32(0), false, swap);
}
void recBC1FL()
{
EE::Profiler.EmitOp(eeOpcode::BC1FL);
const u32 branchTo = ((s32)_Imm_ * 4) + pc;
_setupBranchTest();
recDoBranchImm_Likely(JNZ32(0));
recDoBranchImm(branchTo, JNZ32(0), true, false);
}
void recBC1TL()
{
EE::Profiler.EmitOp(eeOpcode::BC1TL);
const u32 branchTo = ((s32)_Imm_ * 4) + pc;
_setupBranchTest();
recDoBranchImm_Likely(JZ32(0));
recDoBranchImm(branchTo, JZ32(0), true, false);
}
//------------------------------------------------------------------
@ -713,49 +742,62 @@ void recBC1TL()
void recC_EQ_xmm(int info)
{
EE::Profiler.EmitOp(eeOpcode::CEQ_F);
int tempReg;
int t0reg;
//Console.WriteLn("recC_EQ_xmm()");
switch (info & (PROCESS_EE_S | PROCESS_EE_T))
{
case PROCESS_EE_S:
fpuFloat3(EEREC_S);
t0reg = _allocTempXMMreg(XMMT_FPS, -1);
if (t0reg >= 0)
{
const int regs = fpuCopyToTempForClamp(_Fs_, EEREC_S);
fpuFloat3(regs);
const int t0reg = _allocTempXMMreg(XMMT_FPS);
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
fpuFloat3(t0reg);
xUCOMI.SS(xRegisterSSE(EEREC_S), xRegisterSSE(t0reg));
xUCOMI.SS(xRegisterSSE(regs), xRegisterSSE(t0reg));
_freeXMMreg(t0reg);
fpuFreeIfTemp(regs);
}
else
xUCOMI.SS(xRegisterSSE(EEREC_S), ptr[&fpuRegs.fpr[_Ft_]]);
break;
case PROCESS_EE_T:
fpuFloat3(EEREC_T);
t0reg = _allocTempXMMreg(XMMT_FPS, -1);
if (t0reg >= 0)
{
const int regt = fpuCopyToTempForClamp(_Ft_, EEREC_T);
fpuFloat3(regt);
const int t0reg = _allocTempXMMreg(XMMT_FPS);
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Fs_]]);
fpuFloat3(t0reg);
xUCOMI.SS(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
xUCOMI.SS(xRegisterSSE(t0reg), xRegisterSSE(regt));
_freeXMMreg(t0reg);
fpuFreeIfTemp(regt);
}
else
xUCOMI.SS(xRegisterSSE(EEREC_T), ptr[&fpuRegs.fpr[_Fs_]]);
break;
case (PROCESS_EE_S | PROCESS_EE_T):
fpuFloat3(EEREC_S);
fpuFloat3(EEREC_T);
xUCOMI.SS(xRegisterSSE(EEREC_S), xRegisterSSE(EEREC_T));
{
const int regs = fpuCopyToTempForClamp(_Fs_, EEREC_S);
fpuFloat3(regs);
const int regt = fpuCopyToTempForClamp(_Ft_, EEREC_T);
fpuFloat3(regt);
xUCOMI.SS(xRegisterSSE(regs), xRegisterSSE(regt));
fpuFreeIfTemp(regs);
fpuFreeIfTemp(regt);
}
break;
default:
Console.WriteLn(Color_Magenta, "recC_EQ_xmm: Default");
tempReg = _allocX86reg(xEmptyReg, X86TYPE_TEMP, 0, 0);
xMOV(xRegister32(tempReg), ptr[&fpuRegs.fpr[_Fs_]]);
xCMP(xRegister32(tempReg), ptr[&fpuRegs.fpr[_Ft_]]);
xMOV(eax, ptr[&fpuRegs.fpr[_Fs_]]);
xCMP(eax, ptr[&fpuRegs.fpr[_Ft_]]);
j8Ptr[0] = JZ8(0);
xAND(ptr32[&fpuRegs.fprc[31]], ~FPUflagC);
@ -763,9 +805,6 @@ void recC_EQ_xmm(int info)
x86SetJ8(j8Ptr[0]);
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagC);
x86SetJ8(j8Ptr[1]);
if (tempReg >= 0)
_freeX86reg(tempReg);
return;
}
@ -790,59 +829,62 @@ void recC_F()
void recC_LE_xmm(int info)
{
EE::Profiler.EmitOp(eeOpcode::CLE_F);
int tempReg; //tempX86reg
int t0reg; //tempXMMreg
//Console.WriteLn("recC_LE_xmm()");
switch (info & (PROCESS_EE_S | PROCESS_EE_T))
{
case PROCESS_EE_S:
fpuFloat3(EEREC_S);
t0reg = _allocTempXMMreg(XMMT_FPS, -1);
if (t0reg >= 0)
{
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
fpuFloat3(t0reg);
xUCOMI.SS(xRegisterSSE(EEREC_S), xRegisterSSE(t0reg));
_freeXMMreg(t0reg);
}
else
xUCOMI.SS(xRegisterSSE(EEREC_S), ptr[&fpuRegs.fpr[_Ft_]]);
break;
case PROCESS_EE_T:
fpuFloat3(EEREC_T);
t0reg = _allocTempXMMreg(XMMT_FPS, -1);
if (t0reg >= 0)
{
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Fs_]]);
fpuFloat3(t0reg);
xUCOMI.SS(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
_freeXMMreg(t0reg);
}
else
{
xUCOMI.SS(xRegisterSSE(EEREC_T), ptr[&fpuRegs.fpr[_Fs_]]);
{
const int regs = fpuCopyToTempForClamp(_Fs_, EEREC_S);
fpuFloat3(regs);
const int t0reg = _allocTempXMMreg(XMMT_FPS);
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
fpuFloat3(t0reg);
xUCOMI.SS(xRegisterSSE(regs), xRegisterSSE(t0reg));
_freeXMMreg(t0reg);
fpuFreeIfTemp(regs);
}
break;
case PROCESS_EE_T:
{
const int regt = fpuCopyToTempForClamp(_Ft_, EEREC_T);
fpuFloat3(regt);
const int t0reg = _allocTempXMMreg(XMMT_FPS);
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Fs_]]);
fpuFloat3(t0reg);
xUCOMI.SS(xRegisterSSE(t0reg), xRegisterSSE(regt));
_freeXMMreg(t0reg);
fpuFreeIfTemp(regt);
}
break;
j8Ptr[0] = JAE8(0);
xAND(ptr32[&fpuRegs.fprc[31]], ~FPUflagC);
j8Ptr[1] = JMP8(0);
x86SetJ8(j8Ptr[0]);
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagC);
x86SetJ8(j8Ptr[1]);
return;
}
break;
case (PROCESS_EE_S | PROCESS_EE_T):
fpuFloat3(EEREC_S);
fpuFloat3(EEREC_T);
xUCOMI.SS(xRegisterSSE(EEREC_S), xRegisterSSE(EEREC_T));
break;
{
const int regs = fpuCopyToTempForClamp(_Fs_, EEREC_S);
fpuFloat3(regs);
const int regt = fpuCopyToTempForClamp(_Ft_, EEREC_T);
fpuFloat3(regt);
xUCOMI.SS(xRegisterSSE(regs), xRegisterSSE(regt));
fpuFreeIfTemp(regs);
fpuFreeIfTemp(regt);
}
break;
default: // Untested and incorrect, but this case is never reached AFAIK (cottonvibes)
Console.WriteLn(Color_Magenta, "recC_LE_xmm: Default");
tempReg = _allocX86reg(xEmptyReg, X86TYPE_TEMP, 0, 0);
xMOV(xRegister32(tempReg), ptr[&fpuRegs.fpr[_Fs_]]);
xCMP(xRegister32(tempReg), ptr[&fpuRegs.fpr[_Ft_]]);
xMOV(eax, ptr[&fpuRegs.fpr[_Fs_]]);
xCMP(eax, ptr[&fpuRegs.fpr[_Ft_]]);
j8Ptr[0] = JLE8(0);
xAND(ptr32[&fpuRegs.fprc[31]], ~FPUflagC);
@ -850,9 +892,6 @@ void recC_LE_xmm(int info)
x86SetJ8(j8Ptr[0]);
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagC);
x86SetJ8(j8Ptr[1]);
if (tempReg >= 0)
_freeX86reg(tempReg);
return;
}
@ -870,61 +909,62 @@ FPURECOMPILE_CONSTCODE(C_LE, XMMINFO_READS | XMMINFO_READT);
void recC_LT_xmm(int info)
{
EE::Profiler.EmitOp(eeOpcode::CLT_F);
int tempReg;
int t0reg;
//Console.WriteLn("recC_LT_xmm()");
switch (info & (PROCESS_EE_S | PROCESS_EE_T))
{
case PROCESS_EE_S:
fpuFloat3(EEREC_S);
t0reg = _allocTempXMMreg(XMMT_FPS, -1);
if (t0reg >= 0)
{
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
fpuFloat3(t0reg);
xUCOMI.SS(xRegisterSSE(EEREC_S), xRegisterSSE(t0reg));
_freeXMMreg(t0reg);
}
else
xUCOMI.SS(xRegisterSSE(EEREC_S), ptr[&fpuRegs.fpr[_Ft_]]);
break;
case PROCESS_EE_T:
fpuFloat3(EEREC_T);
t0reg = _allocTempXMMreg(XMMT_FPS, -1);
if (t0reg >= 0)
{
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Fs_]]);
fpuFloat3(t0reg);
xUCOMI.SS(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
_freeXMMreg(t0reg);
}
else
{
xUCOMI.SS(xRegisterSSE(EEREC_T), ptr[&fpuRegs.fpr[_Fs_]]);
{
const int regs = fpuCopyToTempForClamp(_Fs_, EEREC_S);
fpuFloat3(regs);
const int t0reg = _allocTempXMMreg(XMMT_FPS);
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
fpuFloat3(t0reg);
xUCOMI.SS(xRegisterSSE(regs), xRegisterSSE(t0reg));
_freeXMMreg(t0reg);
fpuFreeIfTemp(regs);
}
break;
case PROCESS_EE_T:
{
const int regt = fpuCopyToTempForClamp(_Ft_, EEREC_T);
fpuFloat3(regt);
const int t0reg = _allocTempXMMreg(XMMT_FPS);
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Fs_]]);
fpuFloat3(t0reg);
xUCOMI.SS(xRegisterSSE(t0reg), xRegisterSSE(regt));
_freeXMMreg(t0reg);
fpuFreeIfTemp(regt);
}
break;
j8Ptr[0] = JA8(0);
xAND(ptr32[&fpuRegs.fprc[31]], ~FPUflagC);
j8Ptr[1] = JMP8(0);
x86SetJ8(j8Ptr[0]);
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagC);
x86SetJ8(j8Ptr[1]);
return;
}
break;
case (PROCESS_EE_S | PROCESS_EE_T):
// Clamp NaNs
// Note: This fixes a crash in Rule of Rose.
fpuFloat3(EEREC_S);
fpuFloat3(EEREC_T);
xUCOMI.SS(xRegisterSSE(EEREC_S), xRegisterSSE(EEREC_T));
break;
{
const int regs = fpuCopyToTempForClamp(_Fs_, EEREC_S);
fpuFloat3(regs);
const int regt = fpuCopyToTempForClamp(_Ft_, EEREC_T);
fpuFloat3(regt);
xUCOMI.SS(xRegisterSSE(regs), xRegisterSSE(regt));
fpuFreeIfTemp(regs);
fpuFreeIfTemp(regt);
}
break;
default:
Console.WriteLn(Color_Magenta, "recC_LT_xmm: Default");
tempReg = _allocX86reg(xEmptyReg, X86TYPE_TEMP, 0, 0);
xMOV(xRegister32(tempReg), ptr[&fpuRegs.fpr[_Fs_]]);
xCMP(xRegister32(tempReg), ptr[&fpuRegs.fpr[_Ft_]]);
xMOV(eax, ptr[&fpuRegs.fpr[_Fs_]]);
xCMP(eax, ptr[&fpuRegs.fpr[_Ft_]]);
j8Ptr[0] = JL8(0);
xAND(ptr32[&fpuRegs.fprc[31]], ~FPUflagC);
@ -932,9 +972,6 @@ void recC_LT_xmm(int info)
x86SetJ8(j8Ptr[0]);
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagC);
x86SetJ8(j8Ptr[1]);
if (tempReg >= 0)
_freeX86reg(tempReg);
return;
}
@ -957,13 +994,19 @@ FPURECOMPILE_CONSTCODE(C_LT, XMMINFO_READS | XMMINFO_READT);
void recCVT_S_xmm(int info)
{
EE::Profiler.EmitOp(eeOpcode::CVTS_F);
if (!(info & PROCESS_EE_S) || (EEREC_D != EEREC_S && !(info & PROCESS_EE_MODEWRITES)))
if (info & PROCESS_EE_D)
{
xCVTSI2SS(xRegisterSSE(EEREC_D), ptr32[&fpuRegs.fpr[_Fs_]]);
if (info & PROCESS_EE_S)
xCVTDQ2PS(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
else
xCVTSI2SS(xRegisterSSE(EEREC_D), ptr32[&fpuRegs.fpr[_Fs_]]);
}
else
{
xCVTDQ2PS(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
const int temp = _allocTempXMMreg(XMMT_FPS);
xCVTSI2SS(xRegisterSSE(temp), ptr32[&fpuRegs.fpr[_Fs_]]);
xMOVSS(ptr32[&fpuRegs.fpr[_Fd_]], xRegisterSSE(temp));
_freeXMMreg(temp);
}
}
@ -998,7 +1041,7 @@ void recCVT_W()
}
//kill register allocation for dst because we write directly to fpuRegs.fpr[_Fd_]
_deleteFPtoXMMreg(_Fd_, 2);
_deleteFPtoXMMreg(_Fd_, DELETE_REG_FREE_NO_WRITEBACK);
xADD(edx, 0x7FFFFFFF); // 0x7FFFFFFF if positive, 0x8000 0000 if negative
@ -1018,23 +1061,22 @@ void recDIVhelper1(int regd, int regt) // Sets flags
{
u8 *pjmp1, *pjmp2;
u32 *ajmp32, *bjmp32;
int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
int tempReg = _allocX86reg(xEmptyReg, X86TYPE_TEMP, 0, 0);
const int t1reg = _allocTempXMMreg(XMMT_FPS);
xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagI | FPUflagD)); // Clear I and D flags
/*--- Check for divide by zero ---*/
xXOR.PS(xRegisterSSE(t1reg), xRegisterSSE(t1reg));
xCMPEQ.SS(xRegisterSSE(t1reg), xRegisterSSE(regt));
xMOVMSKPS(xRegister32(tempReg), xRegisterSSE(t1reg));
xAND(xRegister32(tempReg), 1); //Check sign (if regt == zero, sign will be set)
xMOVMSKPS(eax, xRegisterSSE(t1reg));
xAND(eax, 1); //Check sign (if regt == zero, sign will be set)
ajmp32 = JZ32(0); //Skip if not set
/*--- Check for 0/0 ---*/
xXOR.PS(xRegisterSSE(t1reg), xRegisterSSE(t1reg));
xCMPEQ.SS(xRegisterSSE(t1reg), xRegisterSSE(regd));
xMOVMSKPS(xRegister32(tempReg), xRegisterSSE(t1reg));
xAND(xRegister32(tempReg), 1); //Check sign (if regd == zero, sign will be set)
xMOVMSKPS(eax, xRegisterSSE(t1reg));
xAND(eax, 1); //Check sign (if regd == zero, sign will be set)
pjmp1 = JZ8(0); //Skip if not set
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagI | FPUflagSI); // Set I and SI flags ( 0/0 )
pjmp2 = JMP8(0);
@ -1059,7 +1101,6 @@ void recDIVhelper1(int regd, int regt) // Sets flags
x86SetJ32(bjmp32);
_freeXMMreg(t1reg);
_freeX86reg(tempReg);
}
void recDIVhelper2(int regd, int regt) // Doesn't sets flags
@ -1075,7 +1116,7 @@ void recDIV_S_xmm(int info)
{
EE::Profiler.EmitOp(eeOpcode::DIV_F);
bool roundmodeFlag = false;
int t0reg = _allocTempXMMreg(XMMT_FPS, -1);
int t0reg = _allocTempXMMreg(XMMT_FPS);
//Console.WriteLn("DIV");
if (CHECK_FPUNEGDIVHACK)
@ -1181,7 +1222,7 @@ FPURECOMPILE_CONSTCODE(DIV_S, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
//------------------------------------------------------------------
void recMADDtemp(int info, int regd)
{
const int t0reg = _allocTempXMMreg(XMMT_FPS, -1);
const int t0reg = _allocTempXMMreg(XMMT_FPS);
switch (info & (PROCESS_EE_S | PROCESS_EE_T))
{
@ -1203,7 +1244,7 @@ void recMADDtemp(int info, int regd)
FPU_ADD(regd, t0reg);
}
}
else if (regd == EEREC_ACC)
else if ((info & PROCESS_EE_ACC) && regd == EEREC_ACC)
{
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(EEREC_S); fpuFloat2(t0reg); }
@ -1306,7 +1347,7 @@ void recMADDtemp(int info, int regd)
FPU_ADD(regd, t0reg);
}
}
else if (regd == EEREC_ACC)
else if ((info & PROCESS_EE_ACC) && regd == EEREC_ACC)
{
xMOVSS(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(t0reg); fpuFloat2(EEREC_T); }
@ -1335,7 +1376,7 @@ void recMADDtemp(int info, int regd)
default:
if (regd == EEREC_ACC)
{
const int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
const int t1reg = _allocTempXMMreg(XMMT_FPS);
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Fs_]]);
xMOVSSZX(xRegisterSSE(t1reg), ptr[&fpuRegs.fpr[_Ft_]]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(t0reg); fpuFloat2(t1reg); }
@ -1433,7 +1474,7 @@ FPURECOMPILE_CONSTCODE(MOV_S, XMMINFO_WRITED | XMMINFO_READS);
//------------------------------------------------------------------
void recMSUBtemp(int info, int regd)
{
int t0reg = _allocTempXMMreg(XMMT_FPS, -1);
int t0reg = _allocTempXMMreg(XMMT_FPS);
switch (info & (PROCESS_EE_S | PROCESS_EE_T))
{
@ -1559,7 +1600,7 @@ void recMSUBtemp(int info, int regd)
default:
if (regd == EEREC_ACC)
{
const int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
const int t1reg = _allocTempXMMreg(XMMT_FPS);
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Fs_]]);
xMOVSSZX(xRegisterSSE(t1reg), ptr[&fpuRegs.fpr[_Ft_]]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(t0reg); fpuFloat2(t1reg); }
@ -1663,7 +1704,7 @@ void recSUBhelper(int regd, int regt)
void recSUBop(int info, int regd)
{
int t0reg = _allocTempXMMreg(XMMT_FPS, -1);
int t0reg = _allocTempXMMreg(XMMT_FPS);
//xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagO|FPUflagU)); // Clear O and U flags
@ -1761,19 +1802,15 @@ void recSQRT_S_xmm(int info)
if (CHECK_FPU_EXTRA_FLAGS)
{
int tempReg = _allocX86reg(xEmptyReg, X86TYPE_TEMP, 0, 0);
xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagI | FPUflagD)); // Clear I and D flags
/*--- Check for negative SQRT ---*/
xMOVMSKPS(xRegister32(tempReg), xRegisterSSE(EEREC_D));
xAND(xRegister32(tempReg), 1); //Check sign
xMOVMSKPS(eax, xRegisterSSE(EEREC_D));
xAND(eax, 1); //Check sign
u8* pjmp = JZ8(0); //Skip if none are
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagI | FPUflagSI); // Set I and SI flags
xAND.PS(xRegisterSSE(EEREC_D), ptr[&s_pos[0]]); // Make EEREC_D Positive
x86SetJ8(pjmp);
_freeX86reg(tempReg);
}
else
xAND.PS(xRegisterSSE(EEREC_D), ptr[&s_pos[0]]); // Make EEREC_D Positive
@ -1800,14 +1837,13 @@ void recRSQRThelper1(int regd, int t0reg) // Preforms the RSQRT function when re
u8 *pjmp1, *pjmp2;
u32 *pjmp32;
u8 *qjmp1, *qjmp2;
int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
int tempReg = _allocX86reg(xEmptyReg, X86TYPE_TEMP, 0, 0);
int t1reg = _allocTempXMMreg(XMMT_FPS);
xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagI | FPUflagD)); // Clear I and D flags
/*--- (first) Check for negative SQRT ---*/
xMOVMSKPS(xRegister32(tempReg), xRegisterSSE(t0reg));
xAND(xRegister32(tempReg), 1); //Check sign
xMOVMSKPS(eax, xRegisterSSE(t0reg));
xAND(eax, 1); //Check sign
pjmp2 = JZ8(0); //Skip if not set
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagI | FPUflagSI); // Set I and SI flags
xAND.PS(xRegisterSSE(t0reg), ptr[&s_pos[0]]); // Make t0reg Positive
@ -1816,14 +1852,14 @@ void recRSQRThelper1(int regd, int t0reg) // Preforms the RSQRT function when re
/*--- Check for zero ---*/
xXOR.PS(xRegisterSSE(t1reg), xRegisterSSE(t1reg));
xCMPEQ.SS(xRegisterSSE(t1reg), xRegisterSSE(t0reg));
xMOVMSKPS(xRegister32(tempReg), xRegisterSSE(t1reg));
xAND(xRegister32(tempReg), 1); //Check sign (if t0reg == zero, sign will be set)
xMOVMSKPS(eax, xRegisterSSE(t1reg));
xAND(eax, 1); //Check sign (if t0reg == zero, sign will be set)
pjmp1 = JZ8(0); //Skip if not set
/*--- Check for 0/0 ---*/
xXOR.PS(xRegisterSSE(t1reg), xRegisterSSE(t1reg));
xCMPEQ.SS(xRegisterSSE(t1reg), xRegisterSSE(regd));
xMOVMSKPS(xRegister32(tempReg), xRegisterSSE(t1reg));
xAND(xRegister32(tempReg), 1); //Check sign (if regd == zero, sign will be set)
xMOVMSKPS(eax, xRegisterSSE(t1reg));
xAND(eax, 1); //Check sign (if regd == zero, sign will be set)
qjmp1 = JZ8(0); //Skip if not set
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagI | FPUflagSI); // Set I and SI flags ( 0/0 )
qjmp2 = JMP8(0);
@ -1850,7 +1886,6 @@ void recRSQRThelper1(int regd, int t0reg) // Preforms the RSQRT function when re
x86SetJ32(pjmp32);
_freeXMMreg(t1reg);
_freeX86reg(tempReg);
}
void recRSQRThelper2(int regd, int t0reg) // Performs the RSQRT function when regd <- Fs and t0reg <- Ft (Doesn't set flags)
@ -1872,7 +1907,7 @@ void recRSQRT_S_xmm(int info)
// iFPUd (Full mode) sets roundmode to nearest for rSQRT.
// Should this do the same, or should Full mode leave roundmode alone? --air
int t0reg = _allocTempXMMreg(XMMT_FPS, -1);
int t0reg = _allocTempXMMreg(XMMT_FPS);
//Console.WriteLn("FPU: RSQRT");
switch (info & (PROCESS_EE_S | PROCESS_EE_T))

View File

@ -288,7 +288,7 @@ void SetMaxValue(int regd)
#define ALLOC_S(sreg) \
do { \
(sreg) = _allocTempXMMreg(XMMT_FPS, -1); \
(sreg) = _allocTempXMMreg(XMMT_FPS); \
GET_S(sreg); \
} while (0)
@ -302,7 +302,7 @@ void SetMaxValue(int regd)
#define ALLOC_T(treg) \
do { \
(treg) = _allocTempXMMreg(XMMT_FPS, -1); \
(treg) = _allocTempXMMreg(XMMT_FPS); \
GET_T(treg); \
} while (0)
@ -316,7 +316,7 @@ void SetMaxValue(int regd)
#define ALLOC_ACC(areg) \
do { \
(areg) = _allocTempXMMreg(XMMT_FPS, -1); \
(areg) = _allocTempXMMreg(XMMT_FPS); \
GET_ACC(areg); \
} while (0)
@ -355,34 +355,31 @@ FPURECOMPILE_CONSTCODE(ABS_S, XMMINFO_WRITED | XMMINFO_READS);
//------------------------------------------------------------------
void FPU_ADD_SUB(int tempd, int tempt) //tempd and tempt are overwritten, they are floats
{
int tempecx = _allocX86reg(ecx, X86TYPE_TEMP, 0, 0); //receives regd
int temp2 = _allocX86reg(xEmptyReg, X86TYPE_TEMP, 0, 0); //receives regt
int xmmtemp = _allocTempXMMreg(XMMT_FPS, -1); //temporary for anding with regd/regt
xMOVD(xRegister32(tempecx), xRegisterSSE(tempd));
xMOVD(xRegister32(temp2), xRegisterSSE(tempt));
const int xmmtemp = _allocTempXMMreg(XMMT_FPS); //temporary for anding with regd/regt
xMOVD(ecx, xRegisterSSE(tempd)); //receives regd
xMOVD(eax, xRegisterSSE(tempt)); //receives regt
//mask the exponents
xSHR(xRegister32(tempecx), 23);
xSHR(xRegister32(temp2), 23);
xAND(xRegister32(tempecx), 0xff);
xAND(xRegister32(temp2), 0xff);
xSHR(ecx, 23);
xSHR(eax, 23);
xAND(ecx, 0xff);
xAND(eax, 0xff);
xSUB(xRegister32(tempecx), xRegister32(temp2)); //tempecx = exponent difference
xCMP(xRegister32(tempecx), 25);
xSUB(ecx, eax); //tempecx = exponent difference
xCMP(ecx, 25);
j8Ptr[0] = JGE8(0);
xCMP(xRegister32(tempecx), 0);
xCMP(ecx, 0);
j8Ptr[1] = JG8(0);
j8Ptr[2] = JE8(0);
xCMP(xRegister32(tempecx), -25);
xCMP(ecx, -25);
j8Ptr[3] = JLE8(0);
//diff = -24 .. -1 , expd < expt
xNEG(xRegister32(tempecx));
xDEC(xRegister32(tempecx));
xMOV(xRegister32(temp2), 0xffffffff);
xSHL(xRegister32(temp2), cl); //temp2 = 0xffffffff << tempecx
xMOVDZX(xRegisterSSE(xmmtemp), xRegister32(temp2));
xNEG(ecx);
xDEC(ecx);
xMOV(eax, 0xffffffff);
xSHL(eax, cl); //temp2 = 0xffffffff << tempecx
xMOVDZX(xRegisterSSE(xmmtemp), eax);
xAND.PS(xRegisterSSE(tempd), xRegisterSSE(xmmtemp));
j8Ptr[4] = JMP8(0);
@ -393,10 +390,10 @@ void FPU_ADD_SUB(int tempd, int tempt) //tempd and tempt are overwritten, they a
x86SetJ8(j8Ptr[1]);
//diff = 1 .. 24, expt < expd
xDEC(xRegister32(tempecx));
xMOV(xRegister32(temp2), 0xffffffff);
xSHL(xRegister32(temp2), cl); //temp2 = 0xffffffff << tempecx
xMOVDZX(xRegisterSSE(xmmtemp), xRegister32(temp2));
xDEC(ecx);
xMOV(eax, 0xffffffff);
xSHL(eax, cl); //temp2 = 0xffffffff << tempecx
xMOVDZX(xRegisterSSE(xmmtemp), eax);
xAND.PS(xRegisterSSE(tempt), xRegisterSSE(xmmtemp));
j8Ptr[6] = JMP8(0);
@ -412,8 +409,6 @@ void FPU_ADD_SUB(int tempd, int tempt) //tempd and tempt are overwritten, they a
x86SetJ8(j8Ptr[6]);
_freeXMMreg(xmmtemp);
_freeX86reg(temp2);
_freeX86reg(tempecx);
}
void FPU_MUL(int info, int regd, int sreg, int treg, bool acc)
@ -554,10 +549,21 @@ FPURECOMPILE_CONSTCODE(C_LT, XMMINFO_READS | XMMINFO_READT);
void recCVT_S_xmm(int info)
{
EE::Profiler.EmitOp(eeOpcode::CVTS_F);
if (!(info & PROCESS_EE_S) || (EEREC_D != EEREC_S && !(info & PROCESS_EE_MODEWRITES)))
xCVTSI2SS(xRegisterSSE(EEREC_D), ptr32[&fpuRegs.fpr[_Fs_]]);
if (info & PROCESS_EE_D)
{
if (info & PROCESS_EE_S)
xCVTDQ2PS(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
else
xCVTSI2SS(xRegisterSSE(EEREC_D), ptr32[&fpuRegs.fpr[_Fs_]]);
}
else
xCVTDQ2PS(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
{
const int temp = _allocTempXMMreg(XMMT_FPS);
xCVTSI2SS(xRegisterSSE(temp), ptr32[&fpuRegs.fpr[_Fs_]]);
xMOVSS(ptr32[&fpuRegs.fpr[_Fd_]], xRegisterSSE(temp));
_freeXMMreg(temp);
}
}
FPURECOMPILE_CONSTCODE(CVT_S, XMMINFO_WRITED | XMMINFO_READS);
@ -581,7 +587,7 @@ void recCVT_W() //called from iFPU.cpp's recCVT_W
}
//kill register allocation for dst because we write directly to fpuRegs.fpr[_Fd_]
_deleteFPtoXMMreg(_Fd_, 2);
_deleteFPtoXMMreg(_Fd_, DELETE_REG_FREE_NO_WRITEBACK);
xADD(edx, 0x7FFFFFFF); // 0x7FFFFFFF if positive, 0x8000 0000 if negative
@ -601,23 +607,22 @@ void recDIVhelper1(int regd, int regt) // Sets flags
{
u8 *pjmp1, *pjmp2;
u32 *ajmp32, *bjmp32;
int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
int tempReg = _allocX86reg(xEmptyReg, X86TYPE_TEMP, 0, 0);
const int t1reg = _allocTempXMMreg(XMMT_FPS);
xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagI | FPUflagD)); // Clear I and D flags
//--- Check for divide by zero ---
xXOR.PS(xRegisterSSE(t1reg), xRegisterSSE(t1reg));
xCMPEQ.SS(xRegisterSSE(t1reg), xRegisterSSE(regt));
xMOVMSKPS(xRegister32(tempReg), xRegisterSSE(t1reg));
xAND(xRegister32(tempReg), 1); //Check sign (if regt == zero, sign will be set)
xMOVMSKPS(eax, xRegisterSSE(t1reg));
xAND(eax, 1); //Check sign (if regt == zero, sign will be set)
ajmp32 = JZ32(0); //Skip if not set
//--- Check for 0/0 ---
xXOR.PS(xRegisterSSE(t1reg), xRegisterSSE(t1reg));
xCMPEQ.SS(xRegisterSSE(t1reg), xRegisterSSE(regd));
xMOVMSKPS(xRegister32(tempReg), xRegisterSSE(t1reg));
xAND(xRegister32(tempReg), 1); //Check sign (if regd == zero, sign will be set)
xMOVMSKPS(eax, xRegisterSSE(t1reg));
xAND(eax, 1); //Check sign (if regd == zero, sign will be set)
pjmp1 = JZ8(0); //Skip if not set
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagI | FPUflagSI); // Set I and SI flags ( 0/0 )
pjmp2 = JMP8(0);
@ -642,7 +647,6 @@ void recDIVhelper1(int regd, int regt) // Sets flags
x86SetJ32(bjmp32);
_freeXMMreg(t1reg);
_freeX86reg(tempReg);
}
void recDIVhelper2(int regd, int regt) // Doesn't sets flags
@ -951,8 +955,7 @@ void recSQRT_S_xmm(int info)
{
EE::Profiler.EmitOp(eeOpcode::SQRT_F);
int roundmodeFlag = 0;
const int tempReg = _allocX86reg(xEmptyReg, X86TYPE_TEMP, 0, 0);
const int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
const int t1reg = _allocTempXMMreg(XMMT_FPS);
//Console.WriteLn("FPU: SQRT");
if (g_sseMXCSR.GetRoundMode() != SSEround_Nearest)
@ -972,8 +975,8 @@ void recSQRT_S_xmm(int info)
xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagI | FPUflagD)); // Clear I and D flags
//--- Check for negative SQRT --- (sqrt(-0) = 0, unlike what the docs say)
xMOVMSKPS(xRegister32(tempReg), xRegisterSSE(EEREC_D));
xAND(xRegister32(tempReg), 1); //Check sign
xMOVMSKPS(eax, xRegisterSSE(EEREC_D));
xAND(eax, 1); //Check sign
u8* pjmp = JZ8(0); //Skip if none are
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagI | FPUflagSI); // Set I and SI flags
xAND.PS(xRegisterSSE(EEREC_D), ptr[&s_const.pos[0]]); // Make EEREC_D Positive
@ -994,7 +997,6 @@ void recSQRT_S_xmm(int info)
if (roundmodeFlag == 1)
xLDMXCSR(g_sseMXCSR);
_freeX86reg(tempReg);
_freeXMMreg(t1reg);
}
@ -1010,14 +1012,13 @@ void recRSQRThelper1(int regd, int regt) // Preforms the RSQRT function when reg
u8 *pjmp1, *pjmp2;
u8 *qjmp1, *qjmp2;
u32* pjmp32;
int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
int tempReg = _allocX86reg(xEmptyReg, X86TYPE_TEMP, 0, 0);
int t1reg = _allocTempXMMreg(XMMT_FPS);
xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagI | FPUflagD)); // Clear I and D flags
//--- (first) Check for negative SQRT ---
xMOVMSKPS(xRegister32(tempReg), xRegisterSSE(regt));
xAND(xRegister32(tempReg), 1); //Check sign
xMOVMSKPS(eax, xRegisterSSE(regt));
xAND(eax, 1); //Check sign
pjmp2 = JZ8(0); //Skip if not set
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagI | FPUflagSI); // Set I and SI flags
xAND.PS(xRegisterSSE(regt), ptr[&s_const.pos[0]]); // Make regt Positive
@ -1026,15 +1027,15 @@ void recRSQRThelper1(int regd, int regt) // Preforms the RSQRT function when reg
//--- Check for zero ---
xXOR.PS(xRegisterSSE(t1reg), xRegisterSSE(t1reg));
xCMPEQ.SS(xRegisterSSE(t1reg), xRegisterSSE(regt));
xMOVMSKPS(xRegister32(tempReg), xRegisterSSE(t1reg));
xAND(xRegister32(tempReg), 1); //Check sign (if regt == zero, sign will be set)
xMOVMSKPS(eax, xRegisterSSE(t1reg));
xAND(eax, 1); //Check sign (if regt == zero, sign will be set)
pjmp1 = JZ8(0); //Skip if not set
//--- Check for 0/0 ---
xXOR.PS(xRegisterSSE(t1reg), xRegisterSSE(t1reg));
xCMPEQ.SS(xRegisterSSE(t1reg), xRegisterSSE(regd));
xMOVMSKPS(xRegister32(tempReg), xRegisterSSE(t1reg));
xAND(xRegister32(tempReg), 1); //Check sign (if regd == zero, sign will be set)
xMOVMSKPS(eax, xRegisterSSE(t1reg));
xAND(eax, 1); //Check sign (if regd == zero, sign will be set)
qjmp1 = JZ8(0); //Skip if not set
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagI | FPUflagSI); // Set I and SI flags ( 0/0 )
qjmp2 = JMP8(0);
@ -1055,7 +1056,6 @@ void recRSQRThelper1(int regd, int regt) // Preforms the RSQRT function when reg
x86SetJ32(pjmp32);
_freeXMMreg(t1reg);
_freeX86reg(tempReg);
}
void recRSQRThelper2(int regd, int regt) // Preforms the RSQRT function when regd <- Fs and regt <- Ft (Doesn't set flags)

View File

@ -56,11 +56,14 @@ REC_FUNC_DEL(PSLLW, _Rd_);
void recPLZCW()
{
int regs = -1;
int x86regs = -1;
int xmmregs = -1;
if (!_Rd_)
return;
// TODO(Stenzek): Don't flush to memory at the end here. Careful of Rs == Rd.
EE::Profiler.EmitOp(eeOpcode::PLZCW);
if (GPR_IS_CONST1(_Rs_))
@ -78,16 +81,20 @@ void recPLZCW()
_eeOnWriteReg(_Rd_, 0);
if ((regs = _checkXMMreg(XMMTYPE_GPRREG, _Rs_, MODE_READ)) >= 0)
if ((xmmregs = _checkXMMreg(XMMTYPE_GPRREG, _Rs_, MODE_READ)) >= 0)
{
xMOVD(eax, xRegisterSSE(regs));
xMOVD(eax, xRegisterSSE(xmmregs));
}
else if ((x86regs = _checkX86reg(X86TYPE_GPR, _Rs_, MODE_READ)) >= 0)
{
xMOV(eax, xRegister32(x86regs));
}
else
{
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);
}
_deleteEEreg(_Rd_, 0);
_deleteEEreg(_Rd_, DELETE_REG_FREE_NO_WRITEBACK);
// Count the number of leading bits (MSB) that match the sign bit, excluding the sign
// bit itself.
@ -115,11 +122,14 @@ void recPLZCW()
// second word
if (regs >= 0)
if (xmmregs >= 0)
{
xPSHUF.D(xRegisterSSE(regs & 0xf), xRegisterSSE(regs & 0xf), 0xe1);
xMOVD(eax, xRegisterSSE(regs & 0xf));
xPSHUF.D(xRegisterSSE(regs & 0xf), xRegisterSSE(regs & 0xf), 0xe1);
xPEXTR.D(eax, xRegisterSSE(xmmregs), 1);
}
else if (x86regs >= 0)
{
xMOV(rax, xRegister64(x86regs));
xSHR(rax, 32);
}
else
{
@ -158,7 +168,7 @@ void recPMFHL()
{
case 0x00: // LW
t0reg = _allocTempXMMreg(XMMT_INT, -1);
t0reg = _allocTempXMMreg(XMMT_INT);
xPSHUF.D(xRegisterSSE(t0reg), xRegisterSSE(EEREC_HI), 0x88);
xPSHUF.D(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_LO), 0x88);
xPUNPCK.LDQ(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -167,7 +177,7 @@ void recPMFHL()
break;
case 0x01: // UW
t0reg = _allocTempXMMreg(XMMT_INT, -1);
t0reg = _allocTempXMMreg(XMMT_INT);
xPSHUF.D(xRegisterSSE(t0reg), xRegisterSSE(EEREC_HI), 0xdd);
xPSHUF.D(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_LO), 0xdd);
xPUNPCK.LDQ(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -182,7 +192,7 @@ void recPMFHL()
break;
case 0x03: // LH
t0reg = _allocTempXMMreg(XMMT_INT, -1);
t0reg = _allocTempXMMreg(XMMT_INT);
xPSHUF.LW(xRegisterSSE(t0reg), xRegisterSSE(EEREC_HI), 0x88);
xPSHUF.LW(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_LO), 0x88);
xPSHUF.HW(xRegisterSSE(t0reg), xRegisterSSE(t0reg), 0x88);
@ -452,7 +462,7 @@ void recPPACW()
}
else
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
if (EEREC_D == EEREC_T)
{
xPSHUF.D(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S), 0x88);
@ -492,7 +502,7 @@ void recPPACH()
}
else
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xPSHUF.LW(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S), 0x88);
xPSHUF.LW(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T), 0x88);
xPSHUF.HW(xRegisterSSE(t0reg), xRegisterSSE(t0reg), 0x88);
@ -518,28 +528,19 @@ void recPPACB()
int info = eeRecompileCodeXMM((_Rs_ != 0 ? XMMINFO_READS : 0) | XMMINFO_READT | XMMINFO_WRITED);
if (_Rs_ == 0)
{
if (_hasFreeXMMreg())
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
xPSLL.W(xRegisterSSE(EEREC_D), 8);
xPXOR(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
xPSRL.W(xRegisterSSE(EEREC_D), 8);
xPACK.USWB(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
_freeXMMreg(t0reg);
}
else
{
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
xPSLL.W(xRegisterSSE(EEREC_D), 8);
xPSRL.W(xRegisterSSE(EEREC_D), 8);
xPACK.USWB(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_D));
xPSRL.DQ(xRegisterSSE(EEREC_D), 8);
}
const int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
xPSLL.W(xRegisterSSE(EEREC_D), 8);
xPXOR(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
xPSRL.W(xRegisterSSE(EEREC_D), 8);
xPACK.USWB(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
_freeXMMreg(t0reg);
}
else
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
const int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
@ -563,8 +564,8 @@ void recPEXT5()
EE::Profiler.EmitOp(eeOpcode::PEXT5);
int info = eeRecompileCodeXMM(XMMINFO_READT | XMMINFO_WRITED);
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
int t1reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T)); // for bit 5..9
xMOVDQA(xRegisterSSE(t1reg), xRegisterSSE(EEREC_T)); // for bit 15
@ -602,8 +603,8 @@ void recPPAC5()
EE::Profiler.EmitOp(eeOpcode::PPAC5);
int info = eeRecompileCodeXMM(XMMINFO_READT | XMMINFO_WRITED);
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
int t1reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T)); // for bit 10..14
xMOVDQA(xRegisterSSE(t1reg), xRegisterSSE(EEREC_T)); // for bit 15
@ -671,7 +672,7 @@ void recPCGTB()
}
else
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
xPCMP.GTB(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -696,7 +697,7 @@ void recPCGTH()
}
else
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
xPCMP.GTW(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -722,7 +723,7 @@ void recPCGTW()
}
else
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
xPCMP.GTD(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -783,9 +784,9 @@ void recPADDSW()
EE::Profiler.EmitOp(eeOpcode::PADDSW);
int info = eeRecompileCodeXMM(XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
int t2reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
int t1reg = _allocTempXMMreg(XMMT_INT);
int t2reg = _allocTempXMMreg(XMMT_INT);
// The idea is:
// s = x + y; (wrap-arounded)
@ -843,7 +844,7 @@ void recPSUBSB()
xPSUB.SB(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
else if (EEREC_D == EEREC_T)
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
xPSUB.SB(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -870,7 +871,7 @@ void recPSUBSH()
xPSUB.SW(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
else if (EEREC_D == EEREC_T)
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
xPSUB.SW(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -894,9 +895,9 @@ void recPSUBSW()
EE::Profiler.EmitOp(eeOpcode::PSUBSW);
int info = eeRecompileCodeXMM(XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
int t2reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
int t1reg = _allocTempXMMreg(XMMT_INT);
int t2reg = _allocTempXMMreg(XMMT_INT);
// The idea is:
// s = x - y; (wrap-arounded)
@ -1050,7 +1051,7 @@ void recPSUBB()
xPSUB.B(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
else if (EEREC_D == EEREC_T)
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
xPSUB.B(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -1077,7 +1078,7 @@ void recPSUBH()
xPSUB.W(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
else if (EEREC_D == EEREC_T)
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
xPSUB.W(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -1104,7 +1105,7 @@ void recPSUBW()
xPSUB.D(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
else if (EEREC_D == EEREC_T)
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
xPSUB.D(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -1138,7 +1139,7 @@ void recPEXTLW()
xPUNPCK.LDQ(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
else if (EEREC_D == EEREC_S)
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
xPUNPCK.LDQ(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -1172,7 +1173,7 @@ void recPEXTLB()
xPUNPCK.LBW(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
else if (EEREC_D == EEREC_S)
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
xPUNPCK.LBW(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -1206,7 +1207,7 @@ void recPEXTLH()
xPUNPCK.LWD(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
else if (EEREC_D == EEREC_S)
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
xPUNPCK.LWD(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -1264,7 +1265,7 @@ void recPABSW() //needs clamping
EE::Profiler.EmitOp(eeOpcode::PABSW);
int info = eeRecompileCodeXMM(XMMINFO_READT | XMMINFO_WRITED);
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xPCMP.EQD(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
xPSLL.D(xRegisterSSE(t0reg), 31);
xPCMP.EQD(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T)); //0xffffffff if equal to 0x80000000
@ -1284,7 +1285,7 @@ void recPABSH()
EE::Profiler.EmitOp(eeOpcode::PABSH);
int info = eeRecompileCodeXMM(XMMINFO_READT | XMMINFO_WRITED);
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xPCMP.EQW(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
xPSLL.W(xRegisterSSE(t0reg), 15);
xPCMP.EQW(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T)); //0xffff if equal to 0x8000
@ -1337,7 +1338,7 @@ void recPADSBH()
}
else
{
const int t0reg = _allocTempXMMreg(XMMT_INT, -1);
const int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
@ -1387,8 +1388,8 @@ void recPADDUW()
}
else
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
int t1reg = _allocTempXMMreg(XMMT_INT);
xPCMP.EQB(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
xPSLL.D(xRegisterSSE(t0reg), 31); // 0x80000000
@ -1432,7 +1433,7 @@ void recPSUBUB()
xPSUB.USB(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
else if (EEREC_D == EEREC_T)
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
xPSUB.USB(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -1459,7 +1460,7 @@ void recPSUBUH()
xPSUB.USW(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
else if (EEREC_D == EEREC_T)
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
xPSUB.USW(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -1482,8 +1483,8 @@ void recPSUBUW()
EE::Profiler.EmitOp(eeOpcode::PSUBUW);
int info = eeRecompileCodeXMM(XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
int t1reg = _allocTempXMMreg(XMMT_INT);
xPCMP.EQB(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
xPSLL.D(xRegisterSSE(t0reg), 31); // 0x80000000
@ -1545,7 +1546,7 @@ void recPEXTUH()
xPUNPCK.HWD(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
else if (EEREC_D == EEREC_S)
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
xPUNPCK.HWD(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -1614,7 +1615,7 @@ void recPEXTUB()
xPUNPCK.HBW(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
else if (EEREC_D == EEREC_S)
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
xPUNPCK.HBW(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -1649,7 +1650,7 @@ void recPEXTUW()
xPUNPCK.HDQ(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
else if (EEREC_D == EEREC_S)
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
xPUNPCK.HDQ(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
@ -1910,8 +1911,8 @@ void recPSLLVW()
}
else
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
int t1reg = _allocTempXMMreg(XMMT_INT);
// shamt is 5-bit
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
@ -1967,8 +1968,8 @@ void recPSRLVW()
}
else
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
int t1reg = _allocTempXMMreg(XMMT_INT);
// shamt is 5-bit
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
@ -2134,7 +2135,7 @@ void recPHMADH()
EE::Profiler.EmitOp(eeOpcode::PHMADH);
int info = eeRecompileCodeXMM((_Rd_ ? XMMINFO_WRITED : 0) | XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITELO | XMMINFO_WRITEHI);
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
xPSRL.D(xRegisterSSE(t0reg), 16);
@ -2181,8 +2182,8 @@ void recPMSUBH()
EE::Profiler.EmitOp(eeOpcode::PMSUBH);
int info = eeRecompileCodeXMM((_Rd_ ? XMMINFO_WRITED : 0) | XMMINFO_READS | XMMINFO_READT | XMMINFO_READLO | XMMINFO_READHI | XMMINFO_WRITELO | XMMINFO_WRITEHI);
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
int t1reg = _allocTempXMMreg(XMMT_INT);
if (!_Rd_)
{
@ -2247,7 +2248,7 @@ void recPHMSBH()
EE::Profiler.EmitOp(eeOpcode::PHMSBH);
int info = eeRecompileCodeXMM((_Rd_ ? XMMINFO_WRITED : 0) | XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITELO | XMMINFO_WRITEHI);
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xPCMP.EQD(xRegisterSSE(EEREC_LO), xRegisterSSE(EEREC_LO));
xPSRL.D(xRegisterSSE(EEREC_LO), 16);
@ -2316,7 +2317,7 @@ void recPINTH()
int info = eeRecompileCodeXMM(XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
if (EEREC_D == EEREC_S)
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVHL.PS(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
if (EEREC_D != EEREC_T)
xMOVQZX(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
@ -2360,7 +2361,7 @@ void recPMULTH()
EE::Profiler.EmitOp(eeOpcode::PMULTH);
int info = eeRecompileCodeXMM(XMMINFO_READS | XMMINFO_READT | (_Rd_ ? XMMINFO_WRITED : 0) | XMMINFO_WRITELO | XMMINFO_WRITEHI);
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(EEREC_LO), xRegisterSSE(EEREC_S));
xMOVDQA(xRegisterSSE(EEREC_HI), xRegisterSSE(EEREC_S));
@ -2506,8 +2507,8 @@ void recPMADDH()
EE::Profiler.EmitOp(eeOpcode::PMADDH);
int info = eeRecompileCodeXMM((_Rd_ ? XMMINFO_WRITED : 0) | XMMINFO_READS | XMMINFO_READT | XMMINFO_READLO | XMMINFO_READHI | XMMINFO_WRITELO | XMMINFO_WRITEHI);
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
int t1reg = _allocTempXMMreg(XMMT_INT);
if (!_Rd_)
{
@ -2616,8 +2617,8 @@ void recPSRAVW()
}
else
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
int t1reg = _allocTempXMMreg(XMMT_INT);
// shamt is 5-bit
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
@ -2699,7 +2700,7 @@ void recPINTEH()
else if (EEREC_D == EEREC_T)
{
pxAssert(EEREC_D != EEREC_S);
t0reg = _allocTempXMMreg(XMMT_INT, -1);
t0reg = _allocTempXMMreg(XMMT_INT);
xPSLL.D(xRegisterSSE(EEREC_D), 16);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
xPSRL.D(xRegisterSSE(EEREC_D), 16);
@ -2708,7 +2709,7 @@ void recPINTEH()
}
else
{
t0reg = _allocTempXMMreg(XMMT_INT, -1);
t0reg = _allocTempXMMreg(XMMT_INT);
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
xPSLL.D(xRegisterSSE(t0reg), 16);
@ -2767,7 +2768,7 @@ void recPMULTUW()
}
else
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xPSHUF.D(xRegisterSSE(t0reg), xRegisterSSE(EEREC_HI), 0xd8);
xMOVDQA(xRegisterSSE(EEREC_LO), xRegisterSSE(t0reg));
xMOVDQA(xRegisterSSE(EEREC_HI), xRegisterSSE(t0reg));
@ -2833,7 +2834,7 @@ void recPMADDUW()
}
else
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xPSHUF.D(xRegisterSSE(t0reg), xRegisterSSE(EEREC_HI), 0xd8);
xMOVDQA(xRegisterSSE(EEREC_LO), xRegisterSSE(t0reg));
xMOVDQA(xRegisterSSE(EEREC_HI), xRegisterSSE(t0reg));
@ -2902,7 +2903,7 @@ void recPNOR()
{
if (EEREC_D == EEREC_T)
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xPCMP.EQD(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
xPXOR(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
_freeXMMreg(t0reg);
@ -2919,7 +2920,7 @@ void recPNOR()
{
if (EEREC_D == EEREC_S)
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
xPCMP.EQD(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
xPXOR(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
_freeXMMreg(t0reg);
@ -2932,7 +2933,7 @@ void recPNOR()
}
else
{
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t0reg = _allocTempXMMreg(XMMT_INT);
if (EEREC_D == EEREC_S)
xPOR(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));

File diff suppressed because it is too large Load Diff

View File

@ -34,25 +34,17 @@ static const int psxInstCycles_Load = 0;
extern uptr psxRecLUT[];
u8 _psxLoadWritesRs(u32 tempcode);
u8 _psxIsLoadStore(u32 tempcode);
void _psxFlushAllUnused();
int _psxFlushUnusedConstReg();
void _psxFlushCachedRegs();
void _psxFlushConstReg(int reg);
void _psxFlushConstRegs();
void _psxDeleteReg(int reg, int flush);
void _psxFlushCall(int flushtype);
void _psxFlushAllDirty();
void _psxOnWriteReg(int reg);
void _psxMoveGPRtoR(const x86Emitter::xRegister32& to, int fromgpr);
#if 0
void _psxMoveGPRtoM(uptr to, int fromgpr);
void _psxMoveGPRtoRm(x86IntRegType to, int fromgpr);
#endif
extern u32 psxpc; // recompiler pc
extern int psxbranch; // set for branch
@ -63,13 +55,14 @@ void psxLoadBranchState();
extern void psxSetBranchReg(u32 reg);
extern void psxSetBranchImm(u32 imm);
extern void psxRecompileNextInstruction(int delayslot);
extern void psxRecompileNextInstruction(bool delayslot, bool swapped_delayslot);
////////////////////////////////////////////////////////////////////
// IOP Constant Propagation Defines, Vars, and API - From here down!
#define PSX_IS_CONST1(reg) ((reg) < 32 && (g_psxHasConstReg & (1 << (reg))))
#define PSX_IS_CONST2(reg1, reg2) ((g_psxHasConstReg & (1 << (reg1))) && (g_psxHasConstReg & (1 << (reg2))))
#define PSX_IS_DIRTY_CONST(reg) ((reg) < 32 && (g_psxHasConstReg & (1 << (reg))) && (!(g_psxFlushedConstReg & (1 << (reg)))))
#define PSX_SET_CONST(reg) \
{ \
if ((reg) < 32) \
@ -91,28 +84,31 @@ extern u32 g_psxHasConstReg, g_psxFlushedConstReg;
typedef void (*R3000AFNPTR)();
typedef void (*R3000AFNPTR_INFO)(int info);
bool psxTrySwapDelaySlot(u32 rs, u32 rt, u32 rd);
int psxTryRenameReg(int to, int from, int fromx86, int other, int xmminfo);
//
// non mmx/xmm version, slower
//
// rd = rs op rt
#define PSXRECOMPILE_CONSTCODE0(fn) \
#define PSXRECOMPILE_CONSTCODE0(fn, info) \
void rpsx##fn(void) \
{ \
psxRecompileCodeConst0(rpsx##fn##_const, rpsx##fn##_consts, rpsx##fn##_constt, rpsx##fn##_); \
psxRecompileCodeConst0(rpsx##fn##_const, rpsx##fn##_consts, rpsx##fn##_constt, rpsx##fn##_, info); \
}
// rt = rs op imm16
#define PSXRECOMPILE_CONSTCODE1(fn) \
#define PSXRECOMPILE_CONSTCODE1(fn, info) \
void rpsx##fn(void) \
{ \
psxRecompileCodeConst1(rpsx##fn##_const, rpsx##fn##_); \
psxRecompileCodeConst1(rpsx##fn##_const, rpsx##fn##_, info); \
}
// rd = rt op sa
#define PSXRECOMPILE_CONSTCODE2(fn) \
#define PSXRECOMPILE_CONSTCODE2(fn, info) \
void rpsx##fn(void) \
{ \
psxRecompileCodeConst2(rpsx##fn##_const, rpsx##fn##_); \
psxRecompileCodeConst2(rpsx##fn##_const, rpsx##fn##_, info); \
}
// [lo,hi] = rt op rs
@ -130,11 +126,11 @@ typedef void (*R3000AFNPTR_INFO)(int info);
}
// rd = rs op rt
void psxRecompileCodeConst0(R3000AFNPTR constcode, R3000AFNPTR_INFO constscode, R3000AFNPTR_INFO consttcode, R3000AFNPTR_INFO noconstcode);
void psxRecompileCodeConst0(R3000AFNPTR constcode, R3000AFNPTR_INFO constscode, R3000AFNPTR_INFO consttcode, R3000AFNPTR_INFO noconstcode, int xmminfo);
// rt = rs op imm16
void psxRecompileCodeConst1(R3000AFNPTR constcode, R3000AFNPTR_INFO noconstcode);
void psxRecompileCodeConst1(R3000AFNPTR constcode, R3000AFNPTR_INFO noconstcode, int xmminfo);
// rd = rt op sa
void psxRecompileCodeConst2(R3000AFNPTR constcode, R3000AFNPTR_INFO noconstcode);
void psxRecompileCodeConst2(R3000AFNPTR constcode, R3000AFNPTR_INFO noconstcode, int xmminfo);
// [lo,hi] = rt op rs
void psxRecompileCodeConst3(R3000AFNPTR constcode, R3000AFNPTR_INFO constscode, R3000AFNPTR_INFO consttcode, R3000AFNPTR_INFO noconstcode, int LOHI);

File diff suppressed because it is too large Load Diff

View File

@ -21,6 +21,9 @@
#include "iCore.h"
#include "R5900_Profiler.h"
// Register containing a pointer to our fastmem (4GB) area
#define RFASTMEMBASE x86Emitter::rbp
extern u32 maxrecmem;
extern u32 pc; // recompiler pc
extern int g_branch; // set for branch
@ -61,11 +64,16 @@ extern bool s_nBlockInterlocked; // Current block has VU0 interlocking
extern bool g_recompilingDelaySlot;
// Used for generating backpatch thunks for fastmem.
u8* recBeginThunk();
u8* recEndThunk();
// used when processing branches
bool TrySwapDelaySlot(u32 rs, u32 rt, u32 rd);
void SaveBranchState();
void LoadBranchState();
void recompileNextInstruction(int delayslot);
void recompileNextInstruction(bool delayslot, bool swapped_delay_slot);
void SetBranchReg(u32 reg);
void SetBranchImm(u32 imm);
@ -78,8 +86,7 @@ namespace R5900
{
namespace Dynarec
{
extern void recDoBranchImm(u32* jmpSkip, bool isLikely = false);
extern void recDoBranchImm_Likely(u32* jmpSkip);
extern void recDoBranchImm(u32 branchTo, u32* jmpSkip, bool isLikely = false, bool swappedDelaySlot = false);
} // namespace Dynarec
} // namespace R5900
@ -88,6 +95,7 @@ namespace R5900
#define GPR_IS_CONST1(reg) (EE_CONST_PROP && (reg) < 32 && (g_cpuHasConstReg & (1 << (reg))))
#define GPR_IS_CONST2(reg1, reg2) (EE_CONST_PROP && (g_cpuHasConstReg & (1 << (reg1))) && (g_cpuHasConstReg & (1 << (reg2))))
#define GPR_IS_DIRTY_CONST(reg) (EE_CONST_PROP && (reg) < 32 && (g_cpuHasConstReg & (1 << (reg))) && (!(g_cpuFlushedConstReg & (1 << (reg)))))
#define GPR_SET_CONST(reg) \
{ \
if ((reg) < 32) \
@ -106,29 +114,23 @@ namespace R5900
alignas(16) extern GPR_reg64 g_cpuConstRegs[32];
extern u32 g_cpuHasConstReg, g_cpuFlushedConstReg;
// gets a memory pointer to the constant reg
u32* _eeGetConstReg(int reg);
// finds where the GPR is stored and moves lower 32 bits to EAX
void _eeMoveGPRtoR(const x86Emitter::xRegister32& to, int fromgpr);
void _eeMoveGPRtoR(const x86Emitter::xRegister64& to, int fromgpr);
void _eeMoveGPRtoM(uptr to, int fromgpr);
void _eeMoveGPRtoRm(x86IntRegType to, int fromgpr);
void _signExtendToMem(void* mem);
void eeSignExtendTo(int gpr, bool onlyupper = false);
void _eeMoveGPRtoR(const x86Emitter::xRegister32& to, int fromgpr, bool allow_preload = true);
void _eeMoveGPRtoR(const x86Emitter::xRegister64& to, int fromgpr, bool allow_preload = true);
void _eeMoveGPRtoM(uptr to, int fromgpr); // 32-bit only
void _eeFlushAllUnused();
void _eeFlushAllDirty();
void _eeOnWriteReg(int reg, int signext);
// totally deletes from const, xmm, and mmx entries
// if flush is 1, also flushes to memory
// if 0, only flushes if not an xmm reg (used when overwriting lower 64bits of reg)
void _deleteEEreg(int reg, int flush);
void _deleteEEreg128(int reg);
void _flushEEreg(int reg, bool clear = false);
// allocates memory on the instruction size and returns the pointer
u32* recGetImm64(u32 hi, u32 lo);
int _eeTryRenameReg(int to, int from, int fromx86, int other, int xmminfo);
//////////////////////////////////////
// Templates for code recompilation //
@ -141,14 +143,27 @@ typedef void (*R5900FNPTR_INFO)(int info);
void rec##fn(void) \
{ \
EE::Profiler.EmitOp(eeOpcode::fn); \
eeRecompileCode0(rec##fn##_const, rec##fn##_consts, rec##fn##_constt, rec##fn##_, xmminfo); \
eeRecompileCode0(rec##fn##_const, rec##fn##_consts, rec##fn##_constt, rec##fn##_, (xmminfo)); \
}
#define EERECOMPILE_CODEX(codename, fn) \
#define EERECOMPILE_CODERC0(fn, xmminfo) \
void rec##fn(void) \
{ \
EE::Profiler.EmitOp(eeOpcode::fn); \
codename(rec##fn##_const, rec##fn##_); \
eeRecompileCodeRC0(rec##fn##_const, rec##fn##_consts, rec##fn##_constt, rec##fn##_, (xmminfo)); \
}
#define EERECOMPILE_CODEX(codename, fn, xmminfo) \
void rec##fn(void) \
{ \
EE::Profiler.EmitOp(eeOpcode::fn); \
codename(rec##fn##_const, rec##fn##_, (xmminfo)); \
}
#define EERECOMPILE_CODEI(codename, fn, xmminfo) \
void rec##fn(void) \
{ \
EE::Profiler.EmitOp(eeOpcode::fn); \
codename(rec##fn##_const, rec##fn##_, (xmminfo)); \
}
//
@ -156,66 +171,11 @@ typedef void (*R5900FNPTR_INFO)(int info);
//
// rd = rs op rt
void eeRecompileCode0(R5900FNPTR constcode, R5900FNPTR_INFO constscode, R5900FNPTR_INFO consttcode, R5900FNPTR_INFO noconstcode, int xmminfo);
void eeRecompileCodeRC0(R5900FNPTR constcode, R5900FNPTR_INFO constscode, R5900FNPTR_INFO consttcode, R5900FNPTR_INFO noconstcode, int xmminfo);
// rt = rs op imm16
void eeRecompileCode1(R5900FNPTR constcode, R5900FNPTR_INFO noconstcode);
void eeRecompileCodeRC1(R5900FNPTR constcode, R5900FNPTR_INFO noconstcode, int xmminfo);
// rd = rt op sa
void eeRecompileCode2(R5900FNPTR constcode, R5900FNPTR_INFO noconstcode);
// rt op rs (SPECIAL)
void eeRecompileCode3(R5900FNPTR constcode, R5900FNPTR_INFO multicode);
//
// non mmx/xmm version, slower
//
// rd = rs op rt
#define EERECOMPILE_CONSTCODE0(fn) \
void rec##fn(void) \
{ \
eeRecompileCodeConst0(rec##fn##_const, rec##fn##_consts, rec##fn##_constt, rec##fn##_); \
}
// rt = rs op imm16
#define EERECOMPILE_CONSTCODE1(fn) \
void rec##fn(void) \
{ \
eeRecompileCodeConst1(rec##fn##_const, rec##fn##_); \
}
// rd = rt op sa
#define EERECOMPILE_CONSTCODE2(fn) \
void rec##fn(void) \
{ \
eeRecompileCodeConst2(rec##fn##_const, rec##fn##_); \
}
// rd = rt op rs
#define EERECOMPILE_CONSTCODESPECIAL(fn, mult) \
void rec##fn(void) \
{ \
eeRecompileCodeConstSPECIAL(rec##fn##_const, rec##fn##_, mult); \
}
// rd = rs op rt
void eeRecompileCodeConst0(R5900FNPTR constcode, R5900FNPTR_INFO constscode, R5900FNPTR_INFO consttcode, R5900FNPTR_INFO noconstcode);
// rt = rs op imm16
void eeRecompileCodeConst1(R5900FNPTR constcode, R5900FNPTR_INFO noconstcode);
// rd = rt op sa
void eeRecompileCodeConst2(R5900FNPTR constcode, R5900FNPTR_INFO noconstcode);
// rd = rt MULT rs (SPECIAL)
void eeRecompileCodeConstSPECIAL(R5900FNPTR constcode, R5900FNPTR_INFO multicode, int MULT);
// XMM caching helpers
#define XMMINFO_READLO 0x001
#define XMMINFO_READHI 0x002
#define XMMINFO_WRITELO 0x004
#define XMMINFO_WRITEHI 0x008
#define XMMINFO_WRITED 0x010
#define XMMINFO_READD 0x020
#define XMMINFO_READS 0x040
#define XMMINFO_READT 0x080
#define XMMINFO_READD_LO 0x100 // if set and XMMINFO_READD is set, reads only low 64 bits of D
#define XMMINFO_READACC 0x200
#define XMMINFO_WRITEACC 0x400
void eeRecompileCodeRC2(R5900FNPTR constcode, R5900FNPTR_INFO noconstcode, int xmminfo);
#define FPURECOMPILE_CONSTCODE(fn, xmminfo) \
void rec##fn(void) \

File diff suppressed because it is too large Load Diff

View File

@ -71,4 +71,6 @@ namespace R5900
void Run(u32 start, u32 end, EEINST* inst_cache) override;
};
} // namespace R5900
} // namespace R5900
void recBackpropBSC(u32 code, EEINST* prev, EEINST* pinst);

View File

@ -31,17 +31,18 @@ namespace Dynarec {
// Parameters:
// jmpSkip - This parameter is the result of the appropriate J32 instruction
// (usually JZ32 or JNZ32).
void recDoBranchImm(u32* jmpSkip, bool isLikely)
void recDoBranchImm(u32 branchTo, u32* jmpSkip, bool isLikely, bool swappedDelaySlot)
{
// All R5900 branches use this format:
const u32 branchTo = ((s32)_Imm_ * 4) + pc;
// First up is the Branch Taken Path : Save the recompiler's state, compile the
// DelaySlot, and issue a BranchTest insertion. The state is reloaded below for
// the "did not branch" path (maintains consts, register allocations, and other optimizations).
SaveBranchState();
recompileNextInstruction(1);
if (!swappedDelaySlot)
{
SaveBranchState();
recompileNextInstruction(true, false);
}
SetBranchImm(branchTo);
// Jump target when the branch is *not* taken, skips the branchtest code
@ -50,18 +51,17 @@ void recDoBranchImm(u32* jmpSkip, bool isLikely)
// if it's a likely branch then we'll need to skip the delay slot here, since
// MIPS cancels the delay slot instruction when branches aren't taken.
LoadBranchState();
if (!isLikely)
if (!swappedDelaySlot)
{
pc -= 4; // instruction rewinder for delay slot, if non-likely.
recompileNextInstruction(1);
LoadBranchState();
if (!isLikely)
{
pc -= 4; // instruction rewinder for delay slot, if non-likely.
recompileNextInstruction(true, false);
}
}
SetBranchImm(pc); // start a new recompiled block.
}
void recDoBranchImm_Likely(u32* jmpSkip)
{
recDoBranchImm(jmpSkip, true);
SetBranchImm(pc); // start a new recompiled block.
}
namespace OpcodeImpl {
@ -95,6 +95,7 @@ void recMFSA()
if (!_Rd_)
return;
// TODO(Stenzek): Make these less rubbish
mmreg = _checkXMMreg(XMMTYPE_GPRREG, _Rd_, MODE_WRITE);
if (mmreg >= 0)
{
@ -102,10 +103,9 @@ void recMFSA()
}
else
{
xMOV(eax, ptr[&cpuRegs.sa]);
xMOV(rax, ptr32[&cpuRegs.sa]);
_deleteEEreg(_Rd_, 0);
xMOV(ptr[&cpuRegs.GPR.r[_Rd_].UL[0]], eax);
xMOV(ptr32[&cpuRegs.GPR.r[_Rd_].UL[1]], 0);
xMOV(ptr64[&cpuRegs.GPR.r[_Rd_].UD[0]], rax);
}
}
@ -124,6 +124,10 @@ void recMTSA()
{
xMOVSS(ptr[&cpuRegs.sa], xRegisterSSE(mmreg));
}
else if ((mmreg = _checkX86reg(X86TYPE_GPR, _Rs_, MODE_READ)) >= 0)
{
xMOV(ptr[&cpuRegs.sa], xRegister32(mmreg));
}
else
{
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);

View File

@ -21,6 +21,7 @@
#include "VU.h"
#include "common/emitter/x86emitter.h"
#include "R3000A.h"
#include "x86/iR3000A.h"
using namespace x86Emitter;
@ -29,7 +30,7 @@ using namespace x86Emitter;
extern u32 g_psxConstRegs[32];
// X86 caching
static int g_x86checknext;
static uint g_x86checknext;
// use special x86 register allocation for ia32
@ -40,92 +41,19 @@ void _initX86regs()
g_x86checknext = 0;
}
uptr _x86GetAddr(int type, int reg)
{
uptr ret = 0;
switch (type & ~X86TYPE_VU1)
{
case X86TYPE_GPR:
ret = (uptr)&cpuRegs.GPR.r[reg];
break;
case X86TYPE_VI:
if (type & X86TYPE_VU1)
ret = (uptr)&VU1.VI[reg];
else
ret = (uptr)&VU0.VI[reg];
break;
case X86TYPE_MEMOFFSET:
ret = 0;
break;
case X86TYPE_VIMEMOFFSET:
ret = 0;
break;
case X86TYPE_VUQREAD:
if (type & X86TYPE_VU1)
ret = (uptr)&VU1.VI[REG_Q];
else
ret = (uptr)&VU0.VI[REG_Q];
break;
case X86TYPE_VUPREAD:
if (type & X86TYPE_VU1)
ret = (uptr)&VU1.VI[REG_P];
else
ret = (uptr)&VU0.VI[REG_P];
break;
case X86TYPE_VUQWRITE:
if (type & X86TYPE_VU1)
ret = (uptr)&VU1.q;
else
ret = (uptr)&VU0.q;
break;
case X86TYPE_VUPWRITE:
if (type & X86TYPE_VU1)
ret = (uptr)&VU1.p;
else
ret = (uptr)&VU0.p;
break;
case X86TYPE_PSX:
ret = (uptr)&psxRegs.GPR.r[reg];
break;
case X86TYPE_PCWRITEBACK:
ret = (uptr)&cpuRegs.pcWriteback;
break;
case X86TYPE_PSX_PCWRITEBACK:
ret = (uptr)&psxRegs.pcWriteback;
break;
jNO_DEFAULT;
}
return ret;
}
int _getFreeX86reg(int mode)
{
int tempi = -1;
u32 bestcount = 0x10000;
int maxreg = (mode & MODE_8BITREG) ? 4 : iREGCNT_GPR;
for (uint i = 0; i < iREGCNT_GPR; i++)
{
int reg = (g_x86checknext + i) % iREGCNT_GPR;
if (reg == 0 || reg == esp.GetId() || reg == ebp.GetId())
const int reg = (g_x86checknext + i) % iREGCNT_GPR;
if (x86regs[reg].inuse || !_isAllocatableX86reg(reg))
continue;
if (reg >= maxreg)
if ((mode & MODE_CALLEESAVED) && xRegister32::IsCallerSaved(reg))
continue;
//if( (mode&MODE_NOFRAME) && reg==EBP ) continue;
if (x86regs[reg].inuse == 0)
{
@ -134,20 +62,26 @@ int _getFreeX86reg(int mode)
}
}
for (int i = 1; i < maxreg; i++)
for (uint i = 0; i < iREGCNT_GPR; i++)
{
if (i == esp.GetId() || i == ebp.GetId())
if (!_isAllocatableX86reg(i))
continue;
//if( (mode&MODE_NOFRAME) && i==EBP ) continue;
if ((mode & MODE_CALLEESAVED) && xRegister32::IsCallerSaved(i))
continue;
// should have checked inuse in the previous loop.
pxAssert(x86regs[i].inuse);
if (x86regs[i].needed)
continue;
if (x86regs[i].type != X86TYPE_TEMP)
{
if (x86regs[i].counter < bestcount)
{
tempi = i;
tempi = static_cast<int>(i);
bestcount = x86regs[i].counter;
}
continue;
@ -163,22 +97,15 @@ int _getFreeX86reg(int mode)
return tempi;
}
pxFailDev("x86 register allocation error");
throw Exception::FailedToAllocateRegister();
}
void _flushCachedRegs()
{
_flushConstRegs();
_flushXMMregs();
pxFailRel("x86 register allocation error");
return -1;
}
void _flushConstReg(int reg)
{
if (GPR_IS_CONST1(reg) && !(g_cpuFlushedConstReg & (1 << reg)))
{
xMOV(ptr32[&cpuRegs.GPR.r[reg].UL[0]], g_cpuConstRegs[reg].UL[0]);
xMOV(ptr32[&cpuRegs.GPR.r[reg].UL[1]], g_cpuConstRegs[reg].UL[1]);
xWriteImm64ToMem(&cpuRegs.GPR.r[reg].UD[0], rax, g_cpuConstRegs[reg].SD[0]);
g_cpuFlushedConstReg |= (1 << reg);
if (reg == 0)
DevCon.Warning("Flushing r0!");
@ -187,243 +114,367 @@ void _flushConstReg(int reg)
void _flushConstRegs()
{
s32 zero_cnt = 0, minusone_cnt = 0;
s32 eaxval = 1; // 0, -1
u32 done[4] = {0, 0, 0, 0};
u8* rewindPtr;
// flush constants
// flush 0 and -1 first
// ignore r0
for (int i = 1, j = 0; i < 32; j++ && ++i, j %= 2)
int zero_reg_count = 0;
int minusone_reg_count = 0;
for (u32 i = 0; i < 32; i++)
{
if (!GPR_IS_CONST1(i) || g_cpuFlushedConstReg & (1 << i))
continue;
if (g_cpuConstRegs[i].SL[j] != 0)
if (!GPR_IS_CONST1(i) || g_cpuFlushedConstReg & (1u << i))
continue;
if (eaxval != 0)
{
xXOR(eax, eax);
eaxval = 0;
}
xMOV(ptr[&cpuRegs.GPR.r[i].SL[j]], eax);
done[j] |= 1 << i;
zero_cnt++;
if (g_cpuConstRegs[i].SD[0] == 0)
zero_reg_count++;
else if (g_cpuConstRegs[i].SD[0] == -1)
minusone_reg_count++;
}
rewindPtr = x86Ptr;
for (int i = 1, j = 0; i < 32; j++ && ++i, j %= 2)
// if we have more than one of zero/minus-one, precompute
bool rax_is_zero = false;
if (zero_reg_count > 1)
{
if (!GPR_IS_CONST1(i) || g_cpuFlushedConstReg & (1 << i))
continue;
if (g_cpuConstRegs[i].SL[j] != -1)
continue;
if (eaxval > 0)
xXOR(eax, eax);
for (u32 i = 0; i < 32; i++)
{
xXOR(eax, eax);
eaxval = 0;
}
if (eaxval == 0)
{
xNOT(eax);
eaxval = -1;
}
if (!GPR_IS_CONST1(i) || g_cpuFlushedConstReg & (1u << i))
continue;
xMOV(ptr[&cpuRegs.GPR.r[i].SL[j]], eax);
done[j + 2] |= 1 << i;
minusone_cnt++;
}
if (minusone_cnt == 1 && !zero_cnt) // not worth it for one byte
{
x86SetPtr(rewindPtr);
}
else
{
done[0] |= done[2];
done[1] |= done[3];
}
for (int i = 1; i < 32; ++i)
{
if (GPR_IS_CONST1(i))
{
if (!(g_cpuFlushedConstReg & (1 << i)))
if (g_cpuConstRegs[i].SD[0] == 0)
{
if (!(done[0] & (1 << i)))
xMOV(ptr32[&cpuRegs.GPR.r[i].UL[0]], g_cpuConstRegs[i].UL[0]);
if (!(done[1] & (1 << i)))
xMOV(ptr32[&cpuRegs.GPR.r[i].UL[1]], g_cpuConstRegs[i].UL[1]);
g_cpuFlushedConstReg |= 1 << i;
xMOV(ptr64[&cpuRegs.GPR.r[i].UD[0]], rax);
g_cpuFlushedConstReg |= 1u << i;
}
if (g_cpuHasConstReg == g_cpuFlushedConstReg)
break;
}
rax_is_zero = true;
}
if (minusone_reg_count > 1)
{
if (!rax_is_zero)
xMOV(rax, -1);
else
xNOT(rax);
for (u32 i = 0; i < 32; i++)
{
if (!GPR_IS_CONST1(i) || g_cpuFlushedConstReg & (1u << i))
continue;
if (g_cpuConstRegs[i].SD[0] == -1)
{
xMOV(ptr64[&cpuRegs.GPR.r[i].UD[0]], rax);
g_cpuFlushedConstReg |= 1u << i;
}
}
}
// and whatever's left over..
for (u32 i = 0; i < 32; i++)
{
if (!GPR_IS_CONST1(i) || g_cpuFlushedConstReg & (1u << i))
continue;
xWriteImm64ToMem(&cpuRegs.GPR.r[i].UD[0], rax, g_cpuConstRegs[i].UD[0]);
g_cpuFlushedConstReg |= 1u << i;
}
}
int _allocX86reg(xRegister32 x86reg, int type, int reg, int mode)
static const char* GetModeString(int mode)
{
uint i;
pxAssertDev(reg >= 0 && reg < 32, "Register index out of bounds.");
pxAssertDev(x86reg != esp && x86reg != ebp, "Allocation of ESP/EBP is not allowed!");
return ((mode & MODE_READ)) ? ((mode & MODE_WRITE) ? "readwrite" : "read") : "write";
}
// don't alloc EAX and ESP,EBP if MODE_NOFRAME
int oldmode = mode;
//int noframe = mode & MODE_NOFRAME;
uint maxreg = (mode & MODE_8BITREG) ? 4 : iREGCNT_GPR;
mode &= ~(MODE_NOFRAME | MODE_8BITREG);
int readfromreg = -1;
if (type != X86TYPE_TEMP)
void _validateRegs()
{
#ifdef PCSX2_DEVBUILD
// check that no two registers are in write mode in both fprs and gprs
for (s8 guestreg = 0; guestreg < 32; guestreg++)
{
if (maxreg < iREGCNT_GPR)
u32 gprreg = 0, gprmode = 0;
u32 fprreg = 0, fprmode = 0;
for (u32 hostreg = 0; hostreg < iREGCNT_GPR; hostreg++)
{
// make sure reg isn't in the higher regs
for (i = maxreg; i < iREGCNT_GPR; ++i)
if (x86regs[hostreg].inuse && x86regs[hostreg].type == X86TYPE_GPR && x86regs[hostreg].reg == guestreg)
{
if (!x86regs[i].inuse || x86regs[i].type != type || x86regs[i].reg != reg)
continue;
if (mode & MODE_READ)
{
readfromreg = i;
x86regs[i].inuse = 0;
break;
}
else if (mode & MODE_WRITE)
{
x86regs[i].inuse = 0;
break;
}
pxAssertMsg(gprreg == 0 && gprmode == 0, "register is not already allocated in a GPR");
gprreg = hostreg;
gprmode = x86regs[hostreg].mode;
}
}
for (u32 hostreg = 0; hostreg < iREGCNT_XMM; hostreg++)
{
if (xmmregs[hostreg].inuse && xmmregs[hostreg].type == XMMTYPE_GPRREG && xmmregs[hostreg].reg == guestreg)
{
pxAssertMsg(fprreg == 0 && fprmode == 0, "register is not already allocated in a XMM");
fprreg = hostreg;
fprmode = xmmregs[hostreg].mode;
}
}
for (i = 1; i < maxreg; i++)
if ((gprmode | fprmode) & MODE_WRITE)
pxAssertMsg((gprmode & MODE_WRITE) != (fprmode & MODE_WRITE), "only one of gpr or fps is in write state");
if (gprmode & MODE_WRITE)
pxAssertMsg(fprmode == 0, "when writing to the gpr, fpr is invalid");
if (fprmode & MODE_WRITE)
pxAssertMsg(gprmode == 0, "when writing to the fpr, gpr is invalid");
}
#endif
}
int _allocX86reg(int type, int reg, int mode)
{
if (type == X86TYPE_GPR || type == X86TYPE_PSX)
{
pxAssertDev(reg >= 0 && reg < 34, "Register index out of bounds.");
}
int hostXMMreg = (type == X86TYPE_GPR) ? _checkXMMreg(XMMTYPE_GPRREG, reg, 0) : -1;
if (type != X86TYPE_TEMP)
{
for (int i = 0; i < static_cast<int>(iREGCNT_GPR); i++)
{
if ((int)i == esp.GetId() || (int)i == ebp.GetId())
continue;
if (!x86regs[i].inuse || x86regs[i].type != type || x86regs[i].reg != reg)
continue;
// We're in a for loop until i<maxreg. This will never happen.
/*if( i >= maxreg ) {
if (x86regs[i].mode & MODE_READ) readfromreg = i;
pxAssert(type != X86TYPE_GPR || !GPR_IS_CONST1(reg) || (GPR_IS_CONST1(reg) && g_cpuFlushedConstReg & (1u << reg)));
mode |= x86regs[i].mode&MODE_WRITE;
x86regs[i].inuse = 0;
break;
}*/
// can't go from write to read
pxAssert(!((x86regs[i].mode & (MODE_READ | MODE_WRITE)) == MODE_WRITE && (mode & (MODE_READ | MODE_WRITE)) == MODE_READ));
// if (type != X86TYPE_TEMP && !(x86regs[i].mode & MODE_READ) && (mode & MODE_READ))
if (!x86reg.IsEmpty())
if (type == X86TYPE_GPR)
{
// requested specific reg, so return that instead
if (i != (uint)x86reg.GetId())
RALOG("Changing host reg %d for guest reg %d from %s to %s mode\n", i, reg, GetModeString(x86regs[i].mode), GetModeString(x86regs[i].mode | mode));
if (mode & MODE_WRITE)
{
if (x86regs[i].mode & MODE_READ)
readfromreg = i;
mode |= x86regs[i].mode & MODE_WRITE;
x86regs[i].inuse = 0;
break;
if (GPR_IS_CONST1(reg))
{
RALOG("Clearing constant value for guest reg %d on change to write mode\n", reg);
GPR_DEL_CONST(reg);
}
if (hostXMMreg >= 0)
{
// ensure upper bits get written
RALOG("Invalidating host XMM reg %d for guest reg %d due to GPR write transition\n", hostXMMreg, reg);
pxAssert(!(xmmregs[hostXMMreg].mode & MODE_WRITE));
_freeXMMreg(hostXMMreg);
}
}
}
if (type != X86TYPE_TEMP && !(x86regs[i].mode & MODE_READ) && (mode & MODE_READ))
else if (type == X86TYPE_PSX)
{
RALOG("Changing host reg %d for guest PSX reg %d from %s to %s mode\n", i, reg, GetModeString(x86regs[i].mode), GetModeString(x86regs[i].mode | mode));
if (type == X86TYPE_GPR)
_flushConstReg(reg);
if (X86_ISVI(type) && reg < 16)
xMOVZX(xRegister32(i), ptr16[(u16*)(_x86GetAddr(type, reg))]);
else
xMOV(xRegister32(i), ptr[(void*)(_x86GetAddr(type, reg))]);
x86regs[i].mode |= MODE_READ;
if (mode & MODE_WRITE)
{
if (PSX_IS_CONST1(reg))
{
RALOG("Clearing constant value for guest PSX reg %d on change to write mode\n", reg);
PSX_DEL_CONST(reg);
}
}
}
else if (type == X86TYPE_VIREG)
{
// keep VI temporaries separate
if (reg < 0)
continue;
}
x86regs[i].needed = 1;
x86regs[i].mode |= mode;
x86regs[i].counter = g_x86AllocCounter++;
x86regs[i].mode |= mode & ~MODE_CALLEESAVED;
x86regs[i].needed = true;
return i;
}
}
if (x86reg.IsEmpty())
x86reg = xRegister32(_getFreeX86reg(oldmode));
else
_freeX86reg(x86reg);
const int regnum = _getFreeX86reg(mode);
xRegister64 new_reg(regnum);
x86regs[regnum].type = type;
x86regs[regnum].reg = reg;
x86regs[regnum].mode = mode & ~MODE_CALLEESAVED;
x86regs[regnum].counter = g_x86AllocCounter++;
x86regs[regnum].needed = true;
x86regs[regnum].inuse = true;
x86regs[x86reg.GetId()].type = type;
x86regs[x86reg.GetId()].reg = reg;
x86regs[x86reg.GetId()].mode = mode;
x86regs[x86reg.GetId()].needed = 1;
x86regs[x86reg.GetId()].inuse = 1;
if (type == X86TYPE_GPR)
{
RALOG("Allocating host reg %d to guest reg %d in %s mode\n", regnum, reg, GetModeString(mode));
}
if (mode & MODE_READ)
{
if (readfromreg >= 0)
xMOV(x86reg, xRegister32(readfromreg));
else
switch (type)
{
if (type == X86TYPE_GPR)
case X86TYPE_GPR:
{
if (reg == 0)
{
xXOR(x86reg, x86reg);
xXOR(xRegister32(new_reg), xRegister32(new_reg)); // 32-bit is smaller and zexts anyway
}
else
{
_flushConstReg(reg);
_deleteGPRtoXMMreg(reg, 1);
if (hostXMMreg >= 0)
{
// is in a XMM. we don't need to free the XMM since we're not writing, and it's still valid
RALOG("Copying %d from XMM %d to GPR %d on read\n", reg, hostXMMreg, regnum);
xMOVD(new_reg, xRegisterSSE(hostXMMreg)); // actually MOVQ
_eeMoveGPRtoR(x86reg, reg);
// if the XMM was dirty, just get rid of it, we don't want to try to sync the values up...
if (xmmregs[hostXMMreg].mode & MODE_WRITE)
{
RALOG("Freeing dirty XMM %d for GPR %d\n", hostXMMreg, reg);
_freeXMMreg(hostXMMreg);
}
}
else if (GPR_IS_CONST1(reg))
{
xMOV64(new_reg, g_cpuConstRegs[reg].SD[0]);
g_cpuFlushedConstReg |= (1u << reg);
x86regs[regnum].mode |= MODE_WRITE; // reg is dirty
_deleteGPRtoXMMreg(reg, 0);
}
}
else
{
if (X86_ISVI(type) && reg < 16)
{
if (reg == 0)
xXOR(x86reg, x86reg);
RALOG("Writing constant value %lld from guest reg %d to host reg %d\n", g_cpuConstRegs[reg].SD[0], reg, regnum);
}
else
xMOVZX(x86reg, ptr16[(u16*)(_x86GetAddr(type, reg))]);
{
// not loaded
RALOG("Loading guest reg %d to GPR %d\n", reg, regnum);
xMOV(new_reg, ptr64[&cpuRegs.GPR.r[reg].UD[0]]);
}
}
}
break;
case X86TYPE_FPRC:
RALOG("Loading guest reg FPCR %d to GPR %d\n", reg, regnum);
xMOV(xRegister32(regnum), ptr32[&fpuRegs.fprc[reg]]);
break;
case X86TYPE_PSX:
{
const xRegister32 new_reg32(regnum);
if (reg == 0)
{
xXOR(new_reg32, new_reg32);
}
else
xMOV(x86reg, ptr[(void*)(_x86GetAddr(type, reg))]);
{
if (PSX_IS_CONST1(reg))
{
xMOV(new_reg32, g_psxConstRegs[reg]);
g_psxFlushedConstReg |= (1u << reg);
x86regs[regnum].mode |= MODE_WRITE; // reg is dirty
RALOG("Writing constant value %d from guest PSX reg %d to host reg %d\n", g_psxConstRegs[reg], reg, regnum);
}
else
{
RALOG("Loading guest PSX reg %d to GPR %d\n", reg, regnum);
xMOV(new_reg32, ptr32[&psxRegs.GPR.r[reg]]);
}
}
}
break;
default:
abort();
break;
}
}
// Need to port all the code
// return x86reg;
return x86reg.GetId();
if (type == X86TYPE_GPR && (mode & MODE_WRITE))
{
if (reg < 32 && GPR_IS_CONST1(reg))
{
RALOG("Clearing constant value for guest reg %d on write allocation\n", reg);
GPR_DEL_CONST(reg);
}
if (hostXMMreg >= 0)
{
// writing, so kill the xmm allocation. gotta ensure the upper bits gets stored first.
RALOG("Invalidating %d from XMM %d because of GPR %d write\n", reg, hostXMMreg, regnum);
_freeXMMreg(hostXMMreg);
}
}
else if (type == X86TYPE_PSX && (mode & MODE_WRITE))
{
if (reg < 32 && PSX_IS_CONST1(reg))
{
RALOG("Clearing constant value for guest PSX reg %d on write allocation\n", reg);
PSX_DEL_CONST(reg);
}
}
// Console.WriteLn("Allocating reg %d", regnum);
return regnum;
}
void _writebackX86Reg(int x86reg)
{
switch (x86regs[x86reg].type)
{
case X86TYPE_GPR:
RALOG("Writing back GPR reg %d for guest reg %d P2\n", x86reg, x86regs[x86reg].reg);
xMOV(ptr64[&cpuRegs.GPR.r[x86regs[x86reg].reg].UD[0]], xRegister64(x86reg));
break;
case X86TYPE_FPRC:
RALOG("Writing back GPR reg %d for guest reg FPCR %d P2\n", x86reg, x86regs[x86reg].reg);
xMOV(ptr32[&fpuRegs.fprc[x86regs[x86reg].reg]], xRegister32(x86reg));
break;
case X86TYPE_VIREG:
RALOG("Writing back VI reg %d for guest reg %d P2\n", x86reg, x86regs[x86reg].reg);
xMOV(ptr16[&VU0.VI[x86regs[x86reg].reg].UL], xRegister16(x86reg));
break;
case X86TYPE_PCWRITEBACK:
RALOG("Writing back PC writeback in host reg %d\n", x86reg);
xMOV(ptr32[&cpuRegs.pcWriteback], xRegister32(x86reg));
break;
case X86TYPE_PSX:
RALOG("Writing back PSX GPR reg %d for guest reg %d P2\n", x86reg, x86regs[x86reg].reg);
xMOV(ptr32[&psxRegs.GPR.r[x86regs[x86reg].reg]], xRegister32(x86reg));
break;
case X86TYPE_PSX_PCWRITEBACK:
RALOG("Writing back PSX PC writeback in host reg %d\n", x86reg);
xMOV(ptr32[&psxRegs.pcWriteback], xRegister32(x86reg));
break;
default:
abort();
break;
}
}
int _checkX86reg(int type, int reg, int mode)
{
uint i;
for (i = 0; i < iREGCNT_GPR; i++)
for (uint i = 0; i < iREGCNT_GPR; i++)
{
if (x86regs[i].inuse && x86regs[i].reg == reg && x86regs[i].type == type)
{
// shouldn't have dirty constants...
pxAssert((type != X86TYPE_GPR || !GPR_IS_DIRTY_CONST(reg)) &&
(type != X86TYPE_PSX || !PSX_IS_DIRTY_CONST(reg)));
if (!(x86regs[i].mode & MODE_READ) && (mode & MODE_READ))
if ((type == X86TYPE_GPR || type == X86TYPE_PSX) && !(x86regs[i].mode & MODE_READ) && (mode & MODE_READ))
pxFailRel("Somehow ended up with an allocated x86 without mode");
// ensure constants get deleted once we alloc as write
if (mode & MODE_WRITE)
{
if (X86_ISVI(type))
xMOVZX(xRegister32(i), ptr16[(u16*)(_x86GetAddr(type, reg))]);
else
xMOV(xRegister32(i), ptr[(void*)(_x86GetAddr(type, reg))]);
if (type == X86TYPE_GPR)
{
// go through the alloc path instead, because we might need to invalidate an xmm.
return _allocX86reg(X86TYPE_GPR, reg, mode);
}
else if (type == X86TYPE_PSX)
{
pxAssert(!PSX_IS_DIRTY_CONST(reg));
PSX_DEL_CONST(reg);
}
}
x86regs[i].mode |= mode;
@ -438,9 +489,7 @@ int _checkX86reg(int type, int reg, int mode)
void _addNeededX86reg(int type, int reg)
{
uint i;
for (i = 0; i < iREGCNT_GPR; i++)
for (uint i = 0; i < iREGCNT_GPR; i++)
{
if (!x86regs[i].inuse || x86regs[i].reg != reg || x86regs[i].type != type)
continue;
@ -452,9 +501,7 @@ void _addNeededX86reg(int type, int reg)
void _clearNeededX86regs()
{
uint i;
for (i = 0; i < iREGCNT_GPR; i++)
for (uint i = 0; i < iREGCNT_GPR; i++)
{
if (x86regs[i].needed)
{
@ -465,44 +512,6 @@ void _clearNeededX86regs()
}
}
void _deleteX86reg(int type, int reg, int flush)
{
uint i;
for (i = 0; i < iREGCNT_GPR; i++)
{
if (x86regs[i].inuse && x86regs[i].reg == reg && x86regs[i].type == type)
{
switch (flush)
{
case 0:
_freeX86reg(i);
break;
case 1:
if (x86regs[i].mode & MODE_WRITE)
{
if (X86_ISVI(type) && x86regs[i].reg < 16)
xMOV(ptr[(void*)(_x86GetAddr(type, x86regs[i].reg))], xRegister16(i));
else
xMOV(ptr[(void*)(_x86GetAddr(type, x86regs[i].reg))], xRegister32(i));
// get rid of MODE_WRITE since don't want to flush again
x86regs[i].mode &= ~MODE_WRITE;
x86regs[i].mode |= MODE_READ;
}
return;
case 2:
x86regs[i].inuse = 0;
break;
}
}
}
}
// Temporary solution to support eax/ebx... type
void _freeX86reg(const x86Emitter::xRegister32& x86reg)
{
_freeX86reg(x86reg.GetId());
@ -514,17 +523,33 @@ void _freeX86reg(int x86reg)
if (x86regs[x86reg].inuse && (x86regs[x86reg].mode & MODE_WRITE))
{
_writebackX86Reg(x86reg);
x86regs[x86reg].mode &= ~MODE_WRITE;
if (X86_ISVI(x86regs[x86reg].type) && x86regs[x86reg].reg < 16)
{
xMOV(ptr[(void*)(_x86GetAddr(x86regs[x86reg].type, x86regs[x86reg].reg))], xRegister16(x86reg));
}
else
xMOV(ptr[(void*)(_x86GetAddr(x86regs[x86reg].type, x86regs[x86reg].reg))], xRegister32(x86reg));
}
_freeX86regWithoutWriteback(x86reg);
}
void _freeX86regWithoutWriteback(int x86reg)
{
pxAssert(x86reg >= 0 && x86reg < (int)iREGCNT_GPR);
x86regs[x86reg].inuse = 0;
if (x86regs[x86reg].type == X86TYPE_VIREG)
{
RALOG("Freeing VI reg %d in host GPR %d\n", x86regs[x86reg].reg, x86reg);
//mVUFreeCOP2GPR(x86reg);
abort();
}
else if (x86regs[x86reg].inuse && x86regs[x86reg].type == X86TYPE_GPR)
{
RALOG("Freeing X86 register %d (was guest %d)...\n", x86reg, x86regs[x86reg].reg);
}
else if (x86regs[x86reg].inuse)
{
RALOG("Freeing X86 register %d...\n", x86reg);
}
}
void _freeX86regs()
@ -533,12 +558,18 @@ void _freeX86regs()
_freeX86reg(i);
}
// Misc
void _signExtendSFtoM(uptr mem)
void _flushX86regs()
{
xLAHF();
xSAR(ax, 15);
xCWDE();
xMOV(ptr[(void*)(mem)], eax);
for (u32 i = 0; i < iREGCNT_GPR; ++i)
{
if (x86regs[i].inuse && x86regs[i].mode & MODE_WRITE)
{
// shouldn't be const, because if we got to write mode, we should've flushed then
pxAssert(x86regs[i].type != X86TYPE_GPR || !GPR_IS_DIRTY_CONST(x86regs[i].reg));
RALOG("Flushing x86 reg %u in _eeFlushAllDirty()\n", i);
_writebackX86Reg(i);
x86regs[i].mode = (x86regs[i].mode & ~MODE_WRITE) | MODE_READ;
}
}
}

File diff suppressed because it is too large Load Diff

View File

@ -22,10 +22,8 @@
using namespace x86Emitter;
namespace R5900 {
namespace Dynarec {
namespace OpcodeImpl {
namespace R5900::Dynarec::OpcodeImpl
{
/*********************************************************
* Register arithmetic *
* Format: OP rd, rs, rt *
@ -37,67 +35,126 @@ namespace OpcodeImpl {
namespace Interp = R5900::Interpreter::OpcodeImpl;
REC_FUNC_DEL(ADD, _Rd_);
REC_FUNC_DEL(ADDU, _Rd_);
REC_FUNC_DEL(DADD, _Rd_);
REC_FUNC_DEL(ADD, _Rd_);
REC_FUNC_DEL(ADDU, _Rd_);
REC_FUNC_DEL(DADD, _Rd_);
REC_FUNC_DEL(DADDU, _Rd_);
REC_FUNC_DEL(SUB, _Rd_);
REC_FUNC_DEL(SUBU, _Rd_);
REC_FUNC_DEL(DSUB, _Rd_);
REC_FUNC_DEL(SUB, _Rd_);
REC_FUNC_DEL(SUBU, _Rd_);
REC_FUNC_DEL(DSUB, _Rd_);
REC_FUNC_DEL(DSUBU, _Rd_);
REC_FUNC_DEL(AND, _Rd_);
REC_FUNC_DEL(OR, _Rd_);
REC_FUNC_DEL(XOR, _Rd_);
REC_FUNC_DEL(NOR, _Rd_);
REC_FUNC_DEL(SLT, _Rd_);
REC_FUNC_DEL(SLTU, _Rd_);
REC_FUNC_DEL(AND, _Rd_);
REC_FUNC_DEL(OR, _Rd_);
REC_FUNC_DEL(XOR, _Rd_);
REC_FUNC_DEL(NOR, _Rd_);
REC_FUNC_DEL(SLT, _Rd_);
REC_FUNC_DEL(SLTU, _Rd_);
#else
static void recMoveStoD(int info)
{
if (info & PROCESS_EE_S)
xMOV(xRegister32(EEREC_D), xRegister32(EEREC_S));
else
xMOV(xRegister32(EEREC_D), ptr32[&cpuRegs.GPR.r[_Rs_].UL[0]]);
}
static void recMoveStoD64(int info)
{
if (info & PROCESS_EE_S)
xMOV(xRegister64(EEREC_D), xRegister64(EEREC_S));
else
xMOV(xRegister64(EEREC_D), ptr64[&cpuRegs.GPR.r[_Rs_].UD[0]]);
}
static void recMoveTtoD(int info)
{
if (info & PROCESS_EE_T)
xMOV(xRegister32(EEREC_D), xRegister32(EEREC_T));
else
xMOV(xRegister32(EEREC_D), ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
}
static void recMoveTtoD64(int info)
{
if (info & PROCESS_EE_T)
xMOV(xRegister64(EEREC_D), xRegister64(EEREC_T));
else
xMOV(xRegister64(EEREC_D), ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]]);
}
//// ADD
void recADD_const()
static void recADD_const()
{
g_cpuConstRegs[_Rd_].SD[0] = s64(s32(g_cpuConstRegs[_Rs_].UL[0] + g_cpuConstRegs[_Rt_].UL[0]));
}
void recADD_constv(int info, int creg, u32 vreg)
// s is constant
static void recADD_consts(int info)
{
pxAssert(!(info & PROCESS_EE_XMM));
s32 cval = g_cpuConstRegs[creg].SL[0];
xMOV(eax, ptr32[&cpuRegs.GPR.r[vreg].SL[0]]);
if (cval)
xADD(eax, cval);
eeSignExtendTo(_Rd_, _Rd_ == vreg && !cval);
}
// s is constant
void recADD_consts(int info)
{
recADD_constv(info, _Rs_, _Rt_);
const s32 cval = g_cpuConstRegs[_Rs_].SL[0];
recMoveTtoD(info);
if (cval != 0)
xADD(xRegister32(EEREC_D), cval);
xMOVSX(xRegister64(EEREC_D), xRegister32(EEREC_D));
}
// t is constant
void recADD_constt(int info)
{
recADD_constv(info, _Rt_, _Rs_);
}
// nothing is constant
void recADD_(int info)
static void recADD_constt(int info)
{
pxAssert(!(info & PROCESS_EE_XMM));
xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rs_].SL[0]]);
if (_Rs_ == _Rt_)
xADD(eax, eax);
else
xADD(eax, ptr32[&cpuRegs.GPR.r[_Rt_].SL[0]]);
eeSignExtendTo(_Rd_);
const s32 cval = g_cpuConstRegs[_Rt_].SL[0];
recMoveStoD(info);
if (cval != 0)
xADD(xRegister32(EEREC_D), cval);
xMOVSX(xRegister64(EEREC_D), xRegister32(EEREC_D));
}
EERECOMPILE_CODE0(ADD, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
// nothing is constant
static void recADD_(int info)
{
pxAssert(!(info & PROCESS_EE_XMM));
if ((info & PROCESS_EE_S) && (info & PROCESS_EE_T))
{
if (EEREC_D == EEREC_S)
{
xADD(xRegister32(EEREC_D), xRegister32(EEREC_T));
}
else if (EEREC_D == EEREC_T)
{
xADD(xRegister32(EEREC_D), xRegister32(EEREC_S));
}
else
{
xMOV(xRegister32(EEREC_D), xRegister32(EEREC_S));
xADD(xRegister32(EEREC_D), xRegister32(EEREC_T));
}
}
else if (info & PROCESS_EE_S)
{
xMOV(xRegister32(EEREC_D), xRegister32(EEREC_S));
xADD(xRegister32(EEREC_D), ptr32[&cpuRegs.GPR.r[_Rt_].UD[0]]);
}
else if (info & PROCESS_EE_T)
{
xMOV(xRegister32(EEREC_D), xRegister32(EEREC_T));
xADD(xRegister32(EEREC_D), ptr32[&cpuRegs.GPR.r[_Rs_].UD[0]]);
}
else
{
xMOV(xRegister32(EEREC_D), ptr32[&cpuRegs.GPR.r[_Rs_].UD[0]]);
xADD(xRegister32(EEREC_D), ptr32[&cpuRegs.GPR.r[_Rt_].UD[0]]);
}
xMOVSX(xRegister64(EEREC_D), xRegister32(EEREC_D));
}
EERECOMPILE_CODERC0(ADD, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
//// ADDU
void recADDU(void)
@ -111,77 +168,67 @@ void recDADD_const(void)
g_cpuConstRegs[_Rd_].UD[0] = g_cpuConstRegs[_Rs_].UD[0] + g_cpuConstRegs[_Rt_].UD[0];
}
void recDADD_constv(int info, int creg, u32 vreg)
// s is constant
static void recDADD_consts(int info)
{
pxAssert(!(info & PROCESS_EE_XMM));
GPR_reg64 cval = g_cpuConstRegs[creg];
const s64 cval = g_cpuConstRegs[_Rs_].SD[0];
recMoveTtoD64(info);
if (cval != 0)
xImm64Op(xADD, xRegister64(EEREC_D), rax, cval);
}
if (_Rd_ == vreg)
// t is constant
static void recDADD_constt(int info)
{
pxAssert(!(info & PROCESS_EE_XMM));
const s64 cval = g_cpuConstRegs[_Rt_].SD[0];
recMoveStoD64(info);
if (cval != 0)
xImm64Op(xADD, xRegister64(EEREC_D), rax, cval);
}
// nothing is constant
static void recDADD_(int info)
{
pxAssert(!(info & PROCESS_EE_XMM));
if ((info & PROCESS_EE_S) && (info & PROCESS_EE_T))
{
if (!cval.SD[0])
return; // no-op
xImm64Op(xADD, ptr64[&cpuRegs.GPR.r[_Rd_].SD[0]], rax, cval.SD[0]);
}
else
{
if (cval.SD[0])
if (EEREC_D == EEREC_S)
{
xMOV64(rax, cval.SD[0]);
xADD(rax, ptr64[&cpuRegs.GPR.r[vreg].SD[0]]);
xADD(xRegister64(EEREC_D), xRegister64(EEREC_T));
}
else if (EEREC_D == EEREC_T)
{
xADD(xRegister64(EEREC_D), xRegister64(EEREC_S));
}
else
{
xMOV(rax, ptr64[&cpuRegs.GPR.r[vreg].SD[0]]);
xMOV(xRegister64(EEREC_D), xRegister64(EEREC_S));
xADD(xRegister64(EEREC_D), xRegister64(EEREC_T));
}
xMOV(ptr64[&cpuRegs.GPR.r[_Rd_].SD[0]], rax);
}
}
void recDADD_consts(int info)
{
recDADD_constv(info, _Rs_, _Rt_);
}
void recDADD_constt(int info)
{
recDADD_constv(info, _Rt_, _Rs_);
}
void recDADD_(int info)
{
pxAssert(!(info & PROCESS_EE_XMM));
u32 rs = _Rs_, rt = _Rt_;
if (_Rd_ == _Rt_)
rs = _Rt_, rt = _Rs_;
if (_Rd_ == _Rs_ && _Rs_ == _Rt_)
else if (info & PROCESS_EE_S)
{
xSHL(ptr64[&cpuRegs.GPR.r[_Rd_].SD[0]], 1);
return;
xMOV(xRegister64(EEREC_D), xRegister64(EEREC_S));
xADD(xRegister64(EEREC_D), ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]]);
}
xMOV(rax, ptr64[&cpuRegs.GPR.r[rt].SD[0]]);
if (_Rd_ == rs)
else if (info & PROCESS_EE_T)
{
xADD(ptr64[&cpuRegs.GPR.r[_Rd_].SD[0]], rax);
return;
}
else if (rs == rt)
{
xADD(rax, rax);
xMOV(xRegister64(EEREC_D), xRegister64(EEREC_T));
xADD(xRegister64(EEREC_D), ptr64[&cpuRegs.GPR.r[_Rs_].UD[0]]);
}
else
{
xADD(rax, ptr32[&cpuRegs.GPR.r[rs].SD[0]]);
xMOV(xRegister64(EEREC_D), ptr64[&cpuRegs.GPR.r[_Rs_].UD[0]]);
xADD(xRegister64(EEREC_D), ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]]);
}
xMOV(ptr64[&cpuRegs.GPR.r[_Rd_].SD[0]], rax);
}
EERECOMPILE_CODE0(DADD, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
EERECOMPILE_CODERC0(DADD, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT | XMMINFO_64BITOP);
//// DADDU
void recDADDU(void)
@ -191,50 +238,92 @@ void recDADDU(void)
//// SUB
void recSUB_const()
static void recSUB_const()
{
g_cpuConstRegs[_Rd_].SD[0] = s64(s32(g_cpuConstRegs[_Rs_].UL[0] - g_cpuConstRegs[_Rt_].UL[0]));
}
void recSUB_consts(int info)
static void recSUB_consts(int info)
{
pxAssert(!(info & PROCESS_EE_XMM));
s32 sval = g_cpuConstRegs[_Rs_].SL[0];
const s32 sval = g_cpuConstRegs[_Rs_].SL[0];
xMOV(eax, sval);
xSUB(eax, ptr32[&cpuRegs.GPR.r[_Rt_].SL[0]]);
eeSignExtendTo(_Rd_);
if (info & PROCESS_EE_T)
xSUB(eax, xRegister32(EEREC_T));
else
xSUB(eax, ptr32[&cpuRegs.GPR.r[_Rt_].SL[0]]);
xMOVSX(xRegister64(EEREC_D), eax);
}
void recSUB_constt(int info)
static void recSUB_constt(int info)
{
pxAssert(!(info & PROCESS_EE_XMM));
s32 tval = g_cpuConstRegs[_Rt_].SL[0];
const s32 tval = g_cpuConstRegs[_Rt_].SL[0];
recMoveStoD(info);
if (tval != 0)
xSUB(xRegister32(EEREC_D), tval);
xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rs_].SL[0]]);
if (tval)
xSUB(eax, tval);
eeSignExtendTo(_Rd_, _Rd_ == _Rs_ && !tval);
xMOVSX(xRegister64(EEREC_D), xRegister32(EEREC_D));
}
void recSUB_(int info)
static void recSUB_(int info)
{
pxAssert(!(info & PROCESS_EE_XMM));
if (_Rs_ == _Rt_)
{
xMOV(ptr64[&cpuRegs.GPR.r[_Rd_].SD[0]], 0);
xXOR(xRegister32(EEREC_D), xRegister32(EEREC_D));
return;
}
xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rs_].SL[0]]);
xSUB(eax, ptr32[&cpuRegs.GPR.r[_Rt_].SL[0]]);
eeSignExtendTo(_Rd_);
// a bit messier here because it's not commutative..
if ((info & PROCESS_EE_S) && (info & PROCESS_EE_T))
{
if (EEREC_D == EEREC_S)
{
xSUB(xRegister32(EEREC_D), xRegister32(EEREC_T));
xMOVSX(xRegister64(EEREC_D), xRegister32(EEREC_D));
}
else if (EEREC_D == EEREC_T)
{
// D might equal T
xMOV(eax, xRegister32(EEREC_S));
xSUB(eax, xRegister32(EEREC_T));
xMOVSX(xRegister64(EEREC_D), eax);
}
else
{
xMOV(xRegister32(EEREC_D), xRegister32(EEREC_S));
xSUB(xRegister32(EEREC_D), xRegister32(EEREC_T));
xMOVSX(xRegister64(EEREC_D), xRegister32(EEREC_D));
}
}
else if (info & PROCESS_EE_S)
{
xMOV(xRegister32(EEREC_D), xRegister32(EEREC_S));
xSUB(xRegister32(EEREC_D), ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
xMOVSX(xRegister64(EEREC_D), xRegister32(EEREC_D));
}
else if (info & PROCESS_EE_T)
{
// D might equal T
xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rs_].UL[0]]);
xSUB(eax, xRegister32(EEREC_T));
xMOVSX(xRegister64(EEREC_D), eax);
}
else
{
xMOV(xRegister32(EEREC_D), ptr32[&cpuRegs.GPR.r[_Rs_].UL[0]]);
xSUB(xRegister32(EEREC_D), ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
xMOVSX(xRegister64(EEREC_D), xRegister32(EEREC_D));
}
}
EERECOMPILE_CODE0(SUB, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
EERECOMPILE_CODERC0(SUB, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
//// SUBU
void recSUBU(void)
@ -243,74 +332,79 @@ void recSUBU(void)
}
//// DSUB
void recDSUB_const()
static void recDSUB_const()
{
g_cpuConstRegs[_Rd_].UD[0] = g_cpuConstRegs[_Rs_].UD[0] - g_cpuConstRegs[_Rt_].UD[0];
}
void recDSUB_consts(int info)
static void recDSUB_consts(int info)
{
pxAssert(!(info & PROCESS_EE_XMM));
GPR_reg64 sval = g_cpuConstRegs[_Rs_];
// gross, because if d == t, we can't destroy t
const s64 sval = g_cpuConstRegs[_Rs_].SD[0];
const xRegister64 regd((info & PROCESS_EE_T && EEREC_D == EEREC_T) ? rax.GetId() : EEREC_D);
xMOV64(regd, sval);
if (!sval.SD[0] && _Rd_ == _Rt_)
{
xNEG(ptr64[&cpuRegs.GPR.r[_Rd_].SD[0]]);
return;
}
if (info & PROCESS_EE_T)
xSUB(regd, xRegister64(EEREC_T));
else
{
xMOV64(rax, sval.SD[0]);
}
xSUB(regd, ptr64[&cpuRegs.GPR.r[_Rt_].SD[0]]);
xSUB(rax, ptr32[&cpuRegs.GPR.r[_Rt_].SD[0]]);
xMOV(ptr64[&cpuRegs.GPR.r[_Rd_].SL[0]], rax);
// emitter will eliminate redundant moves.
xMOV(xRegister64(EEREC_D), regd);
}
void recDSUB_constt(int info)
static void recDSUB_constt(int info)
{
pxAssert(!(info & PROCESS_EE_XMM));
GPR_reg64 tval = g_cpuConstRegs[_Rt_];
if (_Rd_ == _Rs_)
{
xImm64Op(xSUB, ptr64[&cpuRegs.GPR.r[_Rd_].SD[0]], rax, tval.SD[0]);
}
else
{
xMOV(rax, ptr64[&cpuRegs.GPR.r[_Rs_].SD[0]]);
if (tval.SD[0])
{
xImm64Op(xSUB, rax, rdx, tval.SD[0]);
}
xMOV(ptr64[&cpuRegs.GPR.r[_Rd_].SL[0]], rax);
}
const s64 tval = g_cpuConstRegs[_Rt_].SD[0];
recMoveStoD64(info);
if (tval != 0)
xImm64Op(xSUB, xRegister64(EEREC_D), rax, tval);
}
void recDSUB_(int info)
static void recDSUB_(int info)
{
pxAssert(!(info & PROCESS_EE_XMM));
if (_Rs_ == _Rt_)
{
xMOV(ptr64[&cpuRegs.GPR.r[_Rd_].SD[0]], 0);
xXOR(xRegister32(EEREC_D), xRegister32(EEREC_D));
return;
}
else if (_Rd_ == _Rs_)
// a bit messier here because it's not commutative..
if ((info & PROCESS_EE_S) && (info & PROCESS_EE_T))
{
xMOV(rax, ptr64[&cpuRegs.GPR.r[_Rt_].SD[0]]);
xSUB(ptr64[&cpuRegs.GPR.r[_Rd_].SD[0]], rax);
// D might equal T
const xRegister64 regd(EEREC_D == EEREC_T ? rax.GetId() : EEREC_D);
xMOV(regd, xRegister64(EEREC_S));
xSUB(regd, xRegister64(EEREC_T));
xMOV(xRegister64(EEREC_D), regd);
}
else if (info & PROCESS_EE_S)
{
xMOV(xRegister64(EEREC_D), xRegister64(EEREC_S));
xSUB(xRegister64(EEREC_D), ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]]);
}
else if (info & PROCESS_EE_T)
{
// D might equal T
const xRegister64 regd(EEREC_D == EEREC_T ? rax.GetId() : EEREC_D);
xMOV(regd, ptr64[&cpuRegs.GPR.r[_Rs_].UD[0]]);
xSUB(regd, xRegister64(EEREC_T));
xMOV(xRegister64(EEREC_D), regd);
}
else
{
xMOV(rax, ptr64[&cpuRegs.GPR.r[_Rs_].SD[0]]);
xSUB(rax, ptr64[&cpuRegs.GPR.r[_Rt_].SD[0]]);
xMOV(ptr64[&cpuRegs.GPR.r[_Rd_].SL[0]], rax);
xMOV(xRegister64(EEREC_D), ptr64[&cpuRegs.GPR.r[_Rs_].UD[0]]);
xSUB(xRegister64(EEREC_D), ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]]);
}
}
EERECOMPILE_CODE0(DSUB, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
EERECOMPILE_CODERC0(DSUB, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED | XMMINFO_64BITOP);
//// DSUBU
void recDSUBU(void)
@ -320,24 +414,24 @@ void recDSUBU(void)
namespace
{
enum class LogicalOp
{
AND,
OR,
XOR,
NOR
};
enum class LogicalOp
{
AND,
OR,
XOR,
NOR
};
} // namespace
static void recLogicalOp_constv(LogicalOp op, int info, int creg, u32 vreg)
static void recLogicalOp_constv(LogicalOp op, int info, int creg, u32 vreg, int regv)
{
pxAssert(!(info & PROCESS_EE_XMM));
xImpl_G1Logic bad{};
const xImpl_G1Logic& xOP = op == LogicalOp::AND ? xAND
: op == LogicalOp::OR ? xOR
: op == LogicalOp::XOR ? xXOR
: op == LogicalOp::NOR ? xOR : bad;
const xImpl_G1Logic& xOP = op == LogicalOp::AND ? xAND : op == LogicalOp::OR ? xOR :
op == LogicalOp::XOR ? xXOR :
op == LogicalOp::NOR ? xOR :
bad;
s64 fixedInput, fixedOutput, identityInput;
bool hasFixed = true;
switch (op)
@ -369,29 +463,18 @@ static void recLogicalOp_constv(LogicalOp op, int info, int creg, u32 vreg)
if (hasFixed && cval.SD[0] == fixedInput)
{
xMOV(ptr64[&cpuRegs.GPR.r[_Rd_].UD[0]], fixedOutput);
}
else if (_Rd_ == vreg)
{
if (cval.SD[0] != identityInput)
xImm64Op(xOP, ptr64[&cpuRegs.GPR.r[_Rd_].UD[0]], rax, cval.UD[0]);
if (op == LogicalOp::NOR)
xNOT(ptr64[&cpuRegs.GPR.r[_Rd_].UD[0]]);
xMOV64(xRegister64(EEREC_D), fixedOutput);
}
else
{
if (cval.SD[0] != identityInput)
{
xMOV64(rax, cval.SD[0]);
xOP(rax, ptr32[&cpuRegs.GPR.r[vreg].UD[0]]);
}
if (regv >= 0)
xMOV(xRegister64(EEREC_D), xRegister64(regv));
else
{
xMOV(rax, ptr32[&cpuRegs.GPR.r[vreg].UD[0]]);
}
xMOV(xRegister64(EEREC_D), ptr64[&cpuRegs.GPR.r[vreg].UD[0]]);
if (cval.SD[0] != identityInput)
xImm64Op(xOP, xRegister64(EEREC_D), rax, cval.UD[0]);
if (op == LogicalOp::NOR)
xNOT(rax);
xMOV(ptr64[&cpuRegs.GPR.r[_Rd_].UD[0]], rax);
xNOT(xRegister64(EEREC_D));
}
}
@ -400,208 +483,234 @@ static void recLogicalOp(LogicalOp op, int info)
pxAssert(!(info & PROCESS_EE_XMM));
xImpl_G1Logic bad{};
const xImpl_G1Logic& xOP = op == LogicalOp::AND ? xAND
: op == LogicalOp::OR ? xOR
: op == LogicalOp::XOR ? xXOR
: op == LogicalOp::NOR ? xOR : bad;
const xImpl_G1Logic& xOP = op == LogicalOp::AND ? xAND : op == LogicalOp::OR ? xOR :
op == LogicalOp::XOR ? xXOR :
op == LogicalOp::NOR ? xOR :
bad;
pxAssert(&xOP != &bad);
// swap because it's commutative and Rd might be Rt
u32 rs = _Rs_, rt = _Rt_;
int regs = (info & PROCESS_EE_S) ? EEREC_S : -1, regt = (info & PROCESS_EE_T) ? EEREC_T : -1;
if (_Rd_ == _Rt_)
rs = _Rt_, rt = _Rs_;
{
std::swap(rs, rt);
std::swap(regs, regt);
}
if (op == LogicalOp::XOR && rs == rt)
{
xMOV(ptr64[&cpuRegs.GPR.r[_Rd_].UD[0]], 0);
}
else if (_Rd_ == rs)
{
if (rs != rt)
{
xMOV(rax, ptr64[&cpuRegs.GPR.r[rt].UD[0]]);
xOP(ptr64[&cpuRegs.GPR.r[_Rd_].UD[0]], rax);
}
if (op == LogicalOp::NOR)
xNOT(ptr64[&cpuRegs.GPR.r[_Rd_].UD[0]]);
xXOR(xRegister32(EEREC_D), xRegister32(EEREC_D));
}
else
{
xMOV(rax, ptr64[&cpuRegs.GPR.r[rs].UD[0]]);
if (rs != rt)
xOP(rax, ptr64[&cpuRegs.GPR.r[rt].UD[0]]);
if (regs >= 0)
xMOV(xRegister64(EEREC_D), xRegister64(regs));
else
xMOV(xRegister64(EEREC_D), ptr64[&cpuRegs.GPR.r[rs].UD[0]]);
if (regt >= 0)
xOP(xRegister64(EEREC_D), xRegister64(regt));
else
xOP(xRegister64(EEREC_D), ptr64[&cpuRegs.GPR.r[rt].UD[0]]);
if (op == LogicalOp::NOR)
xNOT(rax);
xMOV(ptr64[&cpuRegs.GPR.r[_Rd_].UD[0]], rax);
xNOT(xRegister64(EEREC_D));
}
}
//// AND
void recAND_const()
static void recAND_const()
{
g_cpuConstRegs[_Rd_].UD[0] = g_cpuConstRegs[_Rs_].UD[0] & g_cpuConstRegs[_Rt_].UD[0];
}
void recAND_consts(int info)
static void recAND_consts(int info)
{
recLogicalOp_constv(LogicalOp::AND, info, _Rs_, _Rt_);
recLogicalOp_constv(LogicalOp::AND, info, _Rs_, _Rt_, (info & PROCESS_EE_T) ? EEREC_T : -1);
}
void recAND_constt(int info)
static void recAND_constt(int info)
{
recLogicalOp_constv(LogicalOp::AND, info, _Rt_, _Rs_);
recLogicalOp_constv(LogicalOp::AND, info, _Rt_, _Rs_, (info & PROCESS_EE_S) ? EEREC_S : -1);
}
void recAND_(int info)
static void recAND_(int info)
{
recLogicalOp(LogicalOp::AND, info);
}
EERECOMPILE_CODE0(AND, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
EERECOMPILE_CODERC0(AND, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED | XMMINFO_64BITOP);
//// OR
void recOR_const()
static void recOR_const()
{
g_cpuConstRegs[_Rd_].UD[0] = g_cpuConstRegs[_Rs_].UD[0] | g_cpuConstRegs[_Rt_].UD[0];
}
void recOR_consts(int info)
static void recOR_consts(int info)
{
recLogicalOp_constv(LogicalOp::OR, info, _Rs_, _Rt_);
recLogicalOp_constv(LogicalOp::OR, info, _Rs_, _Rt_, (info & PROCESS_EE_T) ? EEREC_T : -1);
}
void recOR_constt(int info)
static void recOR_constt(int info)
{
recLogicalOp_constv(LogicalOp::OR, info, _Rt_, _Rs_);
recLogicalOp_constv(LogicalOp::OR, info, _Rt_, _Rs_, (info & PROCESS_EE_S) ? EEREC_S : -1);
}
void recOR_(int info)
static void recOR_(int info)
{
recLogicalOp(LogicalOp::OR, info);
}
EERECOMPILE_CODE0(OR, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
EERECOMPILE_CODERC0(OR, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED | XMMINFO_64BITOP);
//// XOR
void recXOR_const()
static void recXOR_const()
{
g_cpuConstRegs[_Rd_].UD[0] = g_cpuConstRegs[_Rs_].UD[0] ^ g_cpuConstRegs[_Rt_].UD[0];
}
void recXOR_consts(int info)
static void recXOR_consts(int info)
{
recLogicalOp_constv(LogicalOp::XOR, info, _Rs_, _Rt_);
recLogicalOp_constv(LogicalOp::XOR, info, _Rs_, _Rt_, (info & PROCESS_EE_T) ? EEREC_T : -1);
}
void recXOR_constt(int info)
static void recXOR_constt(int info)
{
recLogicalOp_constv(LogicalOp::XOR, info, _Rt_, _Rs_);
recLogicalOp_constv(LogicalOp::XOR, info, _Rt_, _Rs_, (info & PROCESS_EE_S) ? EEREC_S : -1);
}
void recXOR_(int info)
static void recXOR_(int info)
{
recLogicalOp(LogicalOp::XOR, info);
}
EERECOMPILE_CODE0(XOR, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
EERECOMPILE_CODERC0(XOR, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED | XMMINFO_64BITOP);
//// NOR
void recNOR_const()
static void recNOR_const()
{
g_cpuConstRegs[_Rd_].UD[0] = ~(g_cpuConstRegs[_Rs_].UD[0] | g_cpuConstRegs[_Rt_].UD[0]);
}
void recNOR_consts(int info)
static void recNOR_consts(int info)
{
recLogicalOp_constv(LogicalOp::NOR, info, _Rs_, _Rt_);
recLogicalOp_constv(LogicalOp::NOR, info, _Rs_, _Rt_, (info & PROCESS_EE_T) ? EEREC_T : -1);
}
void recNOR_constt(int info)
static void recNOR_constt(int info)
{
recLogicalOp_constv(LogicalOp::NOR, info, _Rt_, _Rs_);
recLogicalOp_constv(LogicalOp::NOR, info, _Rt_, _Rs_, (info & PROCESS_EE_S) ? EEREC_S : -1);
}
void recNOR_(int info)
static void recNOR_(int info)
{
recLogicalOp(LogicalOp::NOR, info);
}
EERECOMPILE_CODE0(NOR, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
EERECOMPILE_CODERC0(NOR, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED | XMMINFO_64BITOP);
//// SLT - test with silent hill, lemans
void recSLT_const()
static void recSLT_const()
{
g_cpuConstRegs[_Rd_].UD[0] = g_cpuConstRegs[_Rs_].SD[0] < g_cpuConstRegs[_Rt_].SD[0];
}
void recSLTs_const(int info, int sign, int st)
static void recSLTs_const(int info, int sign, int st)
{
pxAssert(!(info & PROCESS_EE_XMM));
GPR_reg64 cval = g_cpuConstRegs[st ? _Rt_ : _Rs_];
const s64 cval = g_cpuConstRegs[st ? _Rt_ : _Rs_].SD[0];
const xImpl_Set& SET = st ? (sign ? xSETL : xSETB) : (sign ? xSETG : xSETA);
xXOR(eax, eax);
xImm64Op(xCMP, ptr64[&cpuRegs.GPR.r[st ? _Rs_ : _Rt_].UD[0]], rdx, cval.UD[0]);
SET(al);
xMOV(ptr64[&cpuRegs.GPR.r[_Rd_].UD[0]], rax);
// If Rd == Rs or Rt, we can't xor it before it's used.
// So, allocate a temporary register first, and then reallocate it to Rd.
const xRegister32 dreg((_Rd_ == (st ? _Rs_ : _Rt_)) ? _allocX86reg(X86TYPE_TEMP, 0, 0) : EEREC_D);
const int regs = st ? ((info & PROCESS_EE_S) ? EEREC_S : -1) : ((info & PROCESS_EE_T) ? EEREC_T : -1);
xXOR(dreg, dreg);
if (regs >= 0)
xImm64Op(xCMP, xRegister64(regs), rcx, cval);
else
xImm64Op(xCMP, ptr64[&cpuRegs.GPR.r[st ? _Rs_ : _Rt_].UD[0]], rcx, cval);
SET(xRegister8(dreg));
if (dreg.GetId() != EEREC_D)
{
std::swap(x86regs[dreg.GetId()], x86regs[EEREC_D]);
_freeX86reg(EEREC_D);
}
}
void recSLTs_(int info, int sign)
static void recSLTs_(int info, int sign)
{
pxAssert(!(info & PROCESS_EE_XMM));
const xImpl_Set& SET = sign ? xSETL : xSETB;
xXOR(eax, eax);
xMOV(rdx, ptr64[&cpuRegs.GPR.r[_Rs_].UD[0]]);
xCMP(rdx, ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]]);
SET(al);
xMOV(ptr64[&cpuRegs.GPR.r[_Rd_].UD[0]], rax);
// need to keep Rs/Rt around.
const xRegister32 dreg((_Rd_ == _Rt_ || _Rd_ == _Rs_) ? _allocX86reg(X86TYPE_TEMP, 0, 0) : EEREC_D);
// force Rs into a register, may as well cache it since we're loading anyway.
const int regs = (info & PROCESS_EE_S) ? EEREC_S : _allocX86reg(X86TYPE_GPR, _Rs_, MODE_READ);
xXOR(dreg, dreg);
if (info & PROCESS_EE_T)
xCMP(xRegister64(regs), xRegister64(EEREC_T));
else
xCMP(xRegister64(regs), ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]]);
SET(xRegister8(dreg));
if (dreg.GetId() != EEREC_D)
{
std::swap(x86regs[dreg.GetId()], x86regs[EEREC_D]);
_freeX86reg(EEREC_D);
}
}
void recSLT_consts(int info)
static void recSLT_consts(int info)
{
recSLTs_const(info, 1, 0);
}
void recSLT_constt(int info)
static void recSLT_constt(int info)
{
recSLTs_const(info, 1, 1);
}
void recSLT_(int info)
static void recSLT_(int info)
{
recSLTs_(info, 1);
}
EERECOMPILE_CODE0(SLT, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
EERECOMPILE_CODERC0(SLT, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED | XMMINFO_NORENAME);
// SLTU - test with silent hill, lemans
void recSLTU_const()
static void recSLTU_const()
{
g_cpuConstRegs[_Rd_].UD[0] = g_cpuConstRegs[_Rs_].UD[0] < g_cpuConstRegs[_Rt_].UD[0];
}
void recSLTU_consts(int info)
static void recSLTU_consts(int info)
{
recSLTs_const(info, 0, 0);
}
void recSLTU_constt(int info)
static void recSLTU_constt(int info)
{
recSLTs_const(info, 0, 1);
}
void recSLTU_(int info)
static void recSLTU_(int info)
{
recSLTs_(info, 0);
}
EERECOMPILE_CODE0(SLTU, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
EERECOMPILE_CODERC0(SLTU, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED | XMMINFO_NORENAME);
#endif
} // namespace OpcodeImpl
} // namespace Dynarec
} // namespace R5900
} // namespace R5900::Dynarec::OpcodeImpl

View File

@ -22,10 +22,8 @@
using namespace x86Emitter;
namespace R5900 {
namespace Dynarec {
namespace OpcodeImpl {
namespace R5900::Dynarec::OpcodeImpl
{
/*********************************************************
* Arithmetic with immediate operand *
* Format: OP rt, rs, immediate *
@ -35,47 +33,50 @@ namespace OpcodeImpl {
namespace Interp = R5900::Interpreter::OpcodeImpl;
REC_FUNC_DEL(ADDI, _Rt_);
REC_FUNC_DEL(ADDIU, _Rt_);
REC_FUNC_DEL(DADDI, _Rt_);
REC_FUNC_DEL(ADDI, _Rt_);
REC_FUNC_DEL(ADDIU, _Rt_);
REC_FUNC_DEL(DADDI, _Rt_);
REC_FUNC_DEL(DADDIU, _Rt_);
REC_FUNC_DEL(ANDI, _Rt_);
REC_FUNC_DEL(ORI, _Rt_);
REC_FUNC_DEL(XORI, _Rt_);
REC_FUNC_DEL(ANDI, _Rt_);
REC_FUNC_DEL(ORI, _Rt_);
REC_FUNC_DEL(XORI, _Rt_);
REC_FUNC_DEL(SLTI, _Rt_);
REC_FUNC_DEL(SLTIU, _Rt_);
REC_FUNC_DEL(SLTI, _Rt_);
REC_FUNC_DEL(SLTIU, _Rt_);
#else
static void recMoveStoT(int info)
{
if (info & PROCESS_EE_S)
xMOV(xRegister32(EEREC_T), xRegister32(EEREC_S));
else
xMOV(xRegister32(EEREC_T), ptr32[&cpuRegs.GPR.r[_Rs_].UL[0]]);
}
static void recMoveStoT64(int info)
{
if (info & PROCESS_EE_S)
xMOV(xRegister64(EEREC_T), xRegister64(EEREC_S));
else
xMOV(xRegister64(EEREC_T), ptr64[&cpuRegs.GPR.r[_Rs_].UD[0]]);
}
//// ADDI
void recADDI_const(void)
static void recADDI_const(void)
{
g_cpuConstRegs[_Rt_].SD[0] = s64(s32(g_cpuConstRegs[_Rs_].UL[0] + u32(s32(_Imm_))));
}
void recADDI_(int info)
static void recADDI_(int info)
{
pxAssert(!(info & PROCESS_EE_XMM));
if (_Rt_ == _Rs_)
{
// must perform the ADD unconditionally, to maintain flags status:
xADD(ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]], _Imm_);
_signExtendSFtoM((uptr)&cpuRegs.GPR.r[_Rt_].UL[1]);
}
else
{
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);
if (_Imm_ != 0)
xADD(eax, _Imm_);
eeSignExtendTo(_Rt_);
}
recMoveStoT(info);
xADD(xRegister32(EEREC_T), _Imm_);
xMOVSX(xRegister64(EEREC_T), xRegister32(EEREC_T));
}
EERECOMPILE_CODEX(eeRecompileCode1, ADDI);
EERECOMPILE_CODEX(eeRecompileCodeRC1, ADDI, XMMINFO_WRITET | XMMINFO_READS);
////////////////////////////////////////////////////
void recADDIU()
@ -84,33 +85,19 @@ void recADDIU()
}
////////////////////////////////////////////////////
void recDADDI_const()
static void recDADDI_const()
{
g_cpuConstRegs[_Rt_].UD[0] = g_cpuConstRegs[_Rs_].UD[0] + u64(s64(_Imm_));
}
void recDADDI_(int info)
static void recDADDI_(int info)
{
pxAssert(!(info & PROCESS_EE_XMM));
if (_Rt_ == _Rs_)
{
xADD(ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]], _Imm_);
}
else
{
xMOV(rax, ptr[&cpuRegs.GPR.r[_Rs_].UD[0]]);
if (_Imm_ != 0)
{
xADD(rax, _Imm_);
}
xMOV(ptr[&cpuRegs.GPR.r[_Rt_].UD[0]], rax);
}
recMoveStoT64(info);
xADD(xRegister64(EEREC_T), _Imm_);
}
EERECOMPILE_CODEX(eeRecompileCode1, DADDI);
EERECOMPILE_CODEX(eeRecompileCodeRC1, DADDI, XMMINFO_WRITET | XMMINFO_READS | XMMINFO_64BITOP);
//// DADDIU
void recDADDIU()
@ -119,133 +106,137 @@ void recDADDIU()
}
//// SLTIU
void recSLTIU_const()
static void recSLTIU_const()
{
g_cpuConstRegs[_Rt_].UD[0] = g_cpuConstRegs[_Rs_].UD[0] < (u64)(_Imm_);
}
extern void recSLTmemconstt(int regd, int regs, u32 mem, int sign);
extern u32 s_sltone;
void recSLTIU_(int info)
static void recSLTIU_(int info)
{
xXOR(eax, eax);
xCMP(ptr64[&cpuRegs.GPR.r[_Rs_].UD[0]], _Imm_);
xSETB(al);
xMOV(ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]], rax);
pxAssert(!(info & PROCESS_EE_XMM));
// TODO(Stenzek): this can be made to suck less by turning Rs into a temp and reallocating Rt.
const xRegister32 dreg((_Rt_ == _Rs_) ? _allocX86reg(X86TYPE_TEMP, 0, 0) : EEREC_T);
xXOR(dreg, dreg);
if (info & PROCESS_EE_S)
xCMP(xRegister64(EEREC_S), _Imm_);
else
xCMP(ptr64[&cpuRegs.GPR.r[_Rs_].UD[0]], _Imm_);
xSETB(xRegister8(dreg));
if (dreg.GetId() != EEREC_T)
{
std::swap(x86regs[dreg.GetId()], x86regs[EEREC_T]);
_freeX86reg(EEREC_T);
}
}
EERECOMPILE_CODEX(eeRecompileCode1, SLTIU);
EERECOMPILE_CODEX(eeRecompileCodeRC1, SLTIU, XMMINFO_WRITET | XMMINFO_READS | XMMINFO_64BITOP | XMMINFO_NORENAME);
//// SLTI
void recSLTI_const()
static void recSLTI_const()
{
g_cpuConstRegs[_Rt_].UD[0] = g_cpuConstRegs[_Rs_].SD[0] < (s64)(_Imm_);
}
void recSLTI_(int info)
static void recSLTI_(int info)
{
// test silent hill if modding
xXOR(eax, eax);
xCMP(ptr64[&cpuRegs.GPR.r[_Rs_].UD[0]], _Imm_);
xSETL(al);
xMOV(ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]], rax);
const xRegister32 dreg((_Rt_ == _Rs_) ? _allocX86reg(X86TYPE_TEMP, 0, 0) : EEREC_T);
xXOR(dreg, dreg);
if (info & PROCESS_EE_S)
xCMP(xRegister64(EEREC_S), _Imm_);
else
xCMP(ptr64[&cpuRegs.GPR.r[_Rs_].UD[0]], _Imm_);
xSETL(xRegister8(dreg));
if (dreg.GetId() != EEREC_T)
{
std::swap(x86regs[dreg.GetId()], x86regs[EEREC_T]);
_freeX86reg(EEREC_T);
}
}
EERECOMPILE_CODEX(eeRecompileCode1, SLTI);
EERECOMPILE_CODEX(eeRecompileCodeRC1, SLTI, XMMINFO_WRITET | XMMINFO_READS | XMMINFO_64BITOP | XMMINFO_NORENAME);
//// ANDI
void recANDI_const()
static void recANDI_const()
{
g_cpuConstRegs[_Rt_].UD[0] = g_cpuConstRegs[_Rs_].UD[0] & (u64)_ImmU_; // Zero-extended Immediate
}
namespace
{
enum class LogicalOp
{
AND,
OR,
XOR
};
enum class LogicalOp
{
AND,
OR,
XOR
};
} // namespace
static void recLogicalOpI(int info, LogicalOp op)
{
xImpl_G1Logic bad{};
const xImpl_G1Logic& xOP = op == LogicalOp::AND ? xAND
: op == LogicalOp::OR ? xOR
: op == LogicalOp::XOR ? xXOR : bad;
const xImpl_G1Logic& xOP = op == LogicalOp::AND ? xAND : op == LogicalOp::OR ? xOR :
op == LogicalOp::XOR ? xXOR :
bad;
pxAssert(&xOP != &bad);
if (_ImmU_ != 0)
{
if (_Rt_ == _Rs_)
{
if (op == LogicalOp::AND)
xOP(ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]], _ImmU_);
else
xOP(ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]], _ImmU_);
}
else
{
xMOV(rax, ptr[&cpuRegs.GPR.r[_Rs_].UD[0]]);
xOP(rax, _ImmU_);
xMOV(ptr[&cpuRegs.GPR.r[_Rt_].UD[0]], rax);
}
recMoveStoT64(info);
xOP(xRegister64(EEREC_T), _ImmU_);
}
else
{
if (op == LogicalOp::AND)
{
xMOV(ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]], 0);
xXOR(xRegister32(EEREC_T), xRegister32(EEREC_T));
}
else
{
if (_Rt_ != _Rs_)
{
xMOV(rax, ptr[&cpuRegs.GPR.r[_Rs_].UD[0]]);
xMOV(ptr[&cpuRegs.GPR.r[_Rt_].UD[0]], rax);
}
recMoveStoT64(info);
}
}
}
void recANDI_(int info)
static void recANDI_(int info)
{
recLogicalOpI(info, LogicalOp::AND);
}
EERECOMPILE_CODEX(eeRecompileCode1, ANDI);
EERECOMPILE_CODEX(eeRecompileCodeRC1, ANDI, XMMINFO_WRITET | XMMINFO_READS | XMMINFO_64BITOP);
////////////////////////////////////////////////////
void recORI_const()
static void recORI_const()
{
g_cpuConstRegs[_Rt_].UD[0] = g_cpuConstRegs[_Rs_].UD[0] | (u64)_ImmU_; // Zero-extended Immediate
}
void recORI_(int info)
static void recORI_(int info)
{
recLogicalOpI(info, LogicalOp::OR);
}
EERECOMPILE_CODEX(eeRecompileCode1, ORI);
EERECOMPILE_CODEX(eeRecompileCodeRC1, ORI, XMMINFO_WRITET | XMMINFO_READS | XMMINFO_64BITOP);
////////////////////////////////////////////////////
void recXORI_const()
static void recXORI_const()
{
g_cpuConstRegs[_Rt_].UD[0] = g_cpuConstRegs[_Rs_].UD[0] ^ (u64)_ImmU_; // Zero-extended Immediate
}
void recXORI_(int info)
static void recXORI_(int info)
{
recLogicalOpI(info, LogicalOp::XOR);
}
EERECOMPILE_CODEX(eeRecompileCode1, XORI);
EERECOMPILE_CODEX(eeRecompileCodeRC1, XORI, XMMINFO_WRITET | XMMINFO_READS | XMMINFO_64BITOP);
#endif
} // namespace OpcodeImpl
} // namespace Dynarec
} // namespace R5900
} // namespace R5900::Dynarec::OpcodeImpl

File diff suppressed because it is too large Load Diff

View File

@ -14,8 +14,6 @@
*/
// recompiler reworked to add dynamic linking zerofrog(@gmail.com) Jan06
#include "PrecompiledHeader.h"
#include "Common.h"
@ -24,9 +22,8 @@
using namespace x86Emitter;
namespace R5900 {
namespace Dynarec {
namespace OpcodeImpl {
namespace R5900::Dynarec::OpcodeImpl
{
/*********************************************************
* Jump to target *
@ -50,7 +47,7 @@ void recJ()
// SET_FPUSTATE;
u32 newpc = (_InstrucTarget_ << 2) + (pc & 0xf0000000);
recompileNextInstruction(1);
recompileNextInstruction(true, false);
if (EmuConfig.Gamefixes.GoemonTlbHack)
SetBranchImm(vtlb_V2P(newpc));
else
@ -76,7 +73,7 @@ void recJAL()
xMOV(ptr32[&cpuRegs.GPR.r[31].UL[1]], 0);
}
recompileNextInstruction(1);
recompileNextInstruction(true, false);
if (EmuConfig.Gamefixes.GoemonTlbHack)
SetBranchImm(vtlb_V2P(newpc));
else
@ -101,34 +98,40 @@ void recJALR()
{
EE::Profiler.EmitOp(eeOpcode::JALR);
int newpc = pc + 4;
_allocX86reg(calleeSavedReg2d, X86TYPE_PCWRITEBACK, 0, MODE_WRITE);
_eeMoveGPRtoR(calleeSavedReg2d, _Rs_);
const u32 newpc = pc + 4;
const bool swap = (EmuConfig.Gamefixes.GoemonTlbHack || _Rd_ == _Rs_) ? false : TrySwapDelaySlot(_Rs_, 0, _Rd_);
if (EmuConfig.Gamefixes.GoemonTlbHack)
{
xMOV(ecx, calleeSavedReg2d);
vtlb_DynV2P();
xMOV(calleeSavedReg2d, eax);
}
// uncomment when there are NO instructions that need to call interpreter
// int mmreg;
// if (GPR_IS_CONST1(_Rs_))
// xMOV(ptr32[&cpuRegs.pc], g_cpuConstRegs[_Rs_].UL[0]);
// else
// {
// int mmreg;
//
// if ((mmreg = _checkXMMreg(XMMTYPE_GPRREG, _Rs_, MODE_READ)) >= 0)
// {
// xMOVSS(ptr[&cpuRegs.pc], xRegisterSSE(mmreg));
// }
// else {
// xMOV(eax, ptr[(void*)((int)&cpuRegs.GPR.r[_Rs_].UL[0])]);
// xMOV(ptr[&cpuRegs.pc], eax);
// }
// }
// int mmreg;
// if (GPR_IS_CONST1(_Rs_))
// xMOV(ptr32[&cpuRegs.pc], g_cpuConstRegs[_Rs_].UL[0]);
// else
// {
// int mmreg;
//
// if ((mmreg = _checkXMMreg(XMMTYPE_GPRREG, _Rs_, MODE_READ)) >= 0)
// {
// xMOVSS(ptr[&cpuRegs.pc], xRegisterSSE(mmreg));
// }
// else {
// xMOV(eax, ptr[(void*)((int)&cpuRegs.GPR.r[_Rs_].UL[0])]);
// xMOV(ptr[&cpuRegs.pc], eax);
// }
// }
int wbreg = -1;
if (!swap)
{
wbreg = _allocX86reg(X86TYPE_PCWRITEBACK, 0, MODE_WRITE | MODE_CALLEESAVED);
_eeMoveGPRtoR(xRegister32(wbreg), _Rs_);
if (EmuConfig.Gamefixes.GoemonTlbHack)
{
xMOV(ecx, xRegister32(wbreg));
vtlb_DynV2P();
xMOV(xRegister32(wbreg), eax);
}
}
if (_Rd_)
{
@ -136,29 +139,41 @@ void recJALR()
if (EE_CONST_PROP)
{
GPR_SET_CONST(_Rd_);
g_cpuConstRegs[_Rd_].UL[0] = newpc;
g_cpuConstRegs[_Rd_].UL[1] = 0;
g_cpuConstRegs[_Rd_].UD[0] = newpc;
}
else
{
xMOV(ptr32[&cpuRegs.GPR.r[_Rd_].UL[0]], newpc);
xMOV(ptr32[&cpuRegs.GPR.r[_Rd_].UL[1]], 0);
xWriteImm64ToMem(&cpuRegs.GPR.r[_Rd_].UD[0], rax, newpc);
}
}
_clearNeededXMMregs();
recompileNextInstruction(1);
if (x86regs[calleeSavedReg2d.GetId()].inuse)
if (!swap)
{
pxAssert(x86regs[calleeSavedReg2d.GetId()].type == X86TYPE_PCWRITEBACK);
xMOV(ptr[&cpuRegs.pc], calleeSavedReg2d);
x86regs[calleeSavedReg2d.GetId()].inuse = 0;
recompileNextInstruction(true, false);
// the next instruction may have flushed the register.. so reload it if so.
if (x86regs[wbreg].inuse && x86regs[wbreg].type == X86TYPE_PCWRITEBACK)
{
xMOV(ptr[&cpuRegs.pc], xRegister32(wbreg));
x86regs[wbreg].inuse = 0;
}
else
{
xMOV(eax, ptr[&cpuRegs.pcWriteback]);
xMOV(ptr[&cpuRegs.pc], eax);
}
}
else
{
xMOV(eax, ptr[&cpuRegs.pcWriteback]);
xMOV(ptr[&cpuRegs.pc], eax);
if (GPR_IS_DIRTY_CONST(_Rs_) || _hasX86reg(X86TYPE_GPR, _Rs_, 0))
{
const int x86reg = _allocX86reg(X86TYPE_GPR, _Rs_, MODE_READ);
xMOV(ptr32[&cpuRegs.pc], xRegister32(x86reg));
}
else
{
_eeMoveGPRtoM((uptr)&cpuRegs.pc, _Rs_);
}
}
SetBranchReg(0xffffffff);
@ -166,6 +181,4 @@ void recJALR()
#endif
} // namespace OpcodeImpl
} // namespace Dynarec
} // namespace R5900
} // namespace R5900::Dynarec::OpcodeImpl

File diff suppressed because it is too large Load Diff

View File

@ -22,9 +22,8 @@
using namespace x86Emitter;
namespace R5900 {
namespace Dynarec {
namespace OpcodeImpl {
namespace R5900::Dynarec::OpcodeImpl
{
/*********************************************************
* Shift arithmetic with constant shift *
@ -34,7 +33,7 @@ namespace OpcodeImpl {
namespace Interp = R5900::Interpreter::OpcodeImpl;
REC_FUNC_DEL(LUI, _Rt_);
REC_FUNC_DEL(LUI, _Rt_);
REC_FUNC_DEL(MFLO, _Rd_);
REC_FUNC_DEL(MFHI, _Rd_);
REC_FUNC(MTLO);
@ -56,11 +55,6 @@ static void xCopy64(u64* dst, u64* src)
xMOV(ptr64[dst], rax);
}
static void xCMPToZero64(u64* mem)
{
xCMP(ptr64[mem], 0);
}
/*********************************************************
* Load higher 16 bits of the first word in GPR with imm *
* Format: OP rt, immediate *
@ -69,22 +63,13 @@ static void xCMPToZero64(u64* mem)
//// LUI
void recLUI()
{
int mmreg;
if (!_Rt_)
return;
_eeOnWriteReg(_Rt_, 1);
if ((mmreg = _checkXMMreg(XMMTYPE_GPRREG, _Rt_, MODE_WRITE)) >= 0)
{
if (xmmregs[mmreg].mode & MODE_WRITE)
{
xMOVH.PS(ptr[&cpuRegs.GPR.r[_Rt_].UL[2]], xRegisterSSE(mmreg));
}
xmmregs[mmreg].inuse = 0;
}
_deleteEEreg(_Rt_, 0);
// need to flush the upper 64 bits for xmm
GPR_DEL_CONST(_Rt_);
_deleteGPRtoX86reg(_Rt_, DELETE_REG_FREE_NO_WRITEBACK);
_deleteGPRtoXMMreg(_Rt_, DELETE_REG_FLUSH_AND_FREE);
if (EE_CONST_PROP)
{
@ -93,363 +78,300 @@ void recLUI()
}
else
{
xMOV(eax, (s32)(cpuRegs.code << 16));
eeSignExtendTo(_Rt_);
const int regt = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_WRITE);
xMOV64(xRegister64(regt), (s64)(s32)(cpuRegs.code << 16));
}
EE::Profiler.EmitOp(eeOpcode::LUI);
}
////////////////////////////////////////////////////
void recMFHILO(int hi)
static void recMFHILO(bool hi, bool upper)
{
int reghi, regd, xmmhilo;
if (!_Rd_)
return;
xmmhilo = hi ? XMMGPR_HI : XMMGPR_LO;
reghi = _checkXMMreg(XMMTYPE_GPRREG, xmmhilo, MODE_READ);
// kill any constants on rd, lower 64 bits get written regardless of upper
_eeOnWriteReg(_Rd_, 0);
regd = _checkXMMreg(XMMTYPE_GPRREG, _Rd_, MODE_READ | MODE_WRITE);
if (reghi >= 0)
const int reg = hi ? XMMGPR_HI : XMMGPR_LO;
const int xmmd = EEINST_XMMUSEDTEST(_Rd_) ? _allocGPRtoXMMreg(_Rd_, MODE_READ | MODE_WRITE) : _checkXMMreg(XMMTYPE_GPRREG, _Rd_, MODE_READ | MODE_WRITE);
const int xmmhilo = EEINST_XMMUSEDTEST(reg) ? _allocGPRtoXMMreg(reg, MODE_READ) : _checkXMMreg(XMMTYPE_GPRREG, reg, MODE_READ);
if (xmmd >= 0)
{
if (regd >= 0)
if (xmmhilo >= 0)
{
pxAssert(regd != reghi);
xmmregs[regd].inuse = 0;
xMOVQ(ptr[&cpuRegs.GPR.r[_Rd_].UL[0]], xRegisterSSE(reghi));
if (xmmregs[regd].mode & MODE_WRITE)
{
xMOVH.PS(ptr[&cpuRegs.GPR.r[_Rd_].UL[2]], xRegisterSSE(regd));
}
if (upper)
xMOVHL.PS(xRegisterSSE(xmmd), xRegisterSSE(xmmhilo));
else
xMOVSD(xRegisterSSE(xmmd), xRegisterSSE(xmmhilo));
}
else
{
_deleteEEreg(_Rd_, 0);
xMOVQ(ptr[&cpuRegs.GPR.r[_Rd_].UD[0]], xRegisterSSE(reghi));
const int gprhilo = upper ? -1 : _allocIfUsedGPRtoX86(reg, MODE_READ);
if (gprhilo >= 0)
xPINSR.Q(xRegisterSSE(xmmd), xRegister64(gprhilo), 0);
else
xPINSR.Q(xRegisterSSE(xmmd), ptr64[hi ? &cpuRegs.HI.UD[static_cast<u8>(upper)] : &cpuRegs.LO.UD[static_cast<u8>(upper)]], 0);
}
}
else
{
if (regd >= 0)
// try rename {hi,lo} -> rd
const int gprreg = upper ? -1 : _checkX86reg(X86TYPE_GPR, reg, MODE_READ);
if (gprreg >= 0 && _eeTryRenameReg(_Rd_, reg, gprreg, -1, 0) >= 0)
return;
const int gprd = _allocIfUsedGPRtoX86(_Rd_, MODE_WRITE);
if (gprd >= 0 && xmmhilo >= 0)
{
if (EEINST_ISLIVE2(_Rd_))
xMOVL.PS(xRegisterSSE(regd), ptr[(void*)(hi ? (uptr)&cpuRegs.HI.UD[0] : (uptr)&cpuRegs.LO.UD[0])]);
pxAssert(gprreg < 0);
if (upper)
xPEXTR.Q(xRegister64(gprd), xRegisterSSE(xmmhilo), 1);
else
xMOVQZX(xRegisterSSE(regd), ptr[(void*)(hi ? (uptr)&cpuRegs.HI.UD[0] : (uptr)&cpuRegs.LO.UD[0])]);
xMOVD(xRegister64(gprd), xRegisterSSE(xmmhilo));
}
else if (gprd < 0 && xmmhilo >= 0)
{
pxAssert(gprreg < 0);
if (upper)
xPEXTR.Q(ptr64[&cpuRegs.GPR.r[_Rd_].UD[0]], xRegisterSSE(xmmhilo), 1);
else
xMOVQ(ptr64[&cpuRegs.GPR.r[_Rd_].UD[0]], xRegisterSSE(xmmhilo));
}
else if (gprd >= 0)
{
if (gprreg >= 0)
xMOV(xRegister64(gprd), xRegister64(gprreg));
else
xMOV(xRegister64(gprd), ptr64[hi ? &cpuRegs.HI.UD[static_cast<u8>(upper)] : &cpuRegs.LO.UD[static_cast<u8>(upper)]]);
}
else if (gprreg >= 0)
{
xMOV(ptr64[&cpuRegs.GPR.r[_Rd_].UD[0]], xRegister64(gprreg));
}
else
{
_deleteEEreg(_Rd_, 0);
xCopy64(&cpuRegs.GPR.r[_Rd_].UD[0], hi ? &cpuRegs.HI.UD[0] : &cpuRegs.LO.UD[0]);
xMOV(rax, ptr64[hi ? &cpuRegs.HI.UD[static_cast<u8>(upper)] : &cpuRegs.LO.UD[static_cast<u8>(upper)]]);
xMOV(ptr64[&cpuRegs.GPR.r[_Rd_].UD[0]], rax);
}
}
}
void recMTHILO(int hi)
static void recMTHILO(bool hi, bool upper)
{
int reghi, regs, xmmhilo;
uptr addrhilo;
const int reg = hi ? XMMGPR_HI : XMMGPR_LO;
_eeOnWriteReg(reg, 0);
xmmhilo = hi ? XMMGPR_HI : XMMGPR_LO;
addrhilo = hi ? (uptr)&cpuRegs.HI.UD[0] : (uptr)&cpuRegs.LO.UD[0];
regs = _checkXMMreg(XMMTYPE_GPRREG, _Rs_, MODE_READ);
reghi = _checkXMMreg(XMMTYPE_GPRREG, xmmhilo, MODE_READ | MODE_WRITE);
if (reghi >= 0)
const int xmms = EEINST_XMMUSEDTEST(_Rs_) ? _allocGPRtoXMMreg(_Rs_, MODE_READ) : _checkXMMreg(XMMTYPE_GPRREG, _Rs_, MODE_READ);
const int xmmhilo = EEINST_XMMUSEDTEST(reg) ? _allocGPRtoXMMreg(reg, MODE_READ | MODE_WRITE) : _checkXMMreg(XMMTYPE_GPRREG, reg, MODE_READ | MODE_WRITE);
if (xmms >= 0)
{
if (regs >= 0)
if (xmmhilo >= 0)
{
pxAssert(reghi != regs);
_deleteGPRtoXMMreg(_Rs_, 0);
xPUNPCK.HQDQ(xRegisterSSE(reghi), xRegisterSSE(reghi));
xPUNPCK.LQDQ(xRegisterSSE(regs), xRegisterSSE(reghi));
// swap regs
xmmregs[regs] = xmmregs[reghi];
xmmregs[reghi].inuse = 0;
xmmregs[regs].mode |= MODE_WRITE;
if (upper)
xMOVLH.PS(xRegisterSSE(xmmhilo), xRegisterSSE(xmms));
else
xMOVSD(xRegisterSSE(xmmhilo), xRegisterSSE(xmms));
}
else
{
_flushConstReg(_Rs_);
xMOVL.PS(xRegisterSSE(reghi), ptr[&cpuRegs.GPR.r[_Rs_].UD[0]]);
xmmregs[reghi].mode |= MODE_WRITE;
const int gprhilo = upper ? -1 : _allocIfUsedGPRtoX86(reg, MODE_WRITE);
if (gprhilo >= 0)
xMOVD(xRegister64(gprhilo), xRegisterSSE(xmms)); // actually movq
else
xMOVQ(ptr64[hi ? &cpuRegs.HI.UD[static_cast<u8>(upper)] : &cpuRegs.LO.UD[static_cast<u8>(upper)]], xRegisterSSE(xmms));
}
}
else
{
if (regs >= 0)
// try rename rs -> {hi,lo}
const int gprs = _checkX86reg(X86TYPE_GPR, _Rs_, MODE_READ);
if (gprs >= 0 && !upper && _eeTryRenameReg(reg, _Rs_, gprs, -1, 0) >= 0)
return;
if (xmmhilo >= 0)
{
xMOVQ(ptr[(void*)(addrhilo)], xRegisterSSE(regs));
}
else
{
if (GPR_IS_CONST1(_Rs_))
if (gprs >= 0)
{
xWriteImm64ToMem((u64*)addrhilo, rax, g_cpuConstRegs[_Rs_].UD[0]);
xPINSR.Q(xRegisterSSE(xmmhilo), xRegister64(gprs), static_cast<u8>(upper));
}
else if (GPR_IS_CONST1(_Rs_))
{
_eeMoveGPRtoR(rax, _Rs_);
xPINSR.Q(xRegisterSSE(xmmhilo), rax, static_cast<u8>(upper));
}
else
{
_eeMoveGPRtoR(ecx, _Rs_);
_flushEEreg(_Rs_);
xCopy64((u64*)addrhilo, &cpuRegs.GPR.r[_Rs_].UD[0]);
xPINSR.Q(xRegisterSSE(xmmhilo), ptr64[&cpuRegs.GPR.r[_Rs_].UD[0]], static_cast<u8>(upper));
}
}
else
{
const int gprreg = upper ? -1 : _allocIfUsedGPRtoX86(reg, MODE_WRITE);
if (gprreg >= 0)
_eeMoveGPRtoR(xRegister64(gprreg), _Rs_);
else
_eeMoveGPRtoM((uptr)(hi ? &cpuRegs.HI.UD[static_cast<u8>(upper)] : &cpuRegs.LO.UD[static_cast<u8>(upper)]), _Rs_);
}
}
}
void recMFHI()
{
recMFHILO(1);
recMFHILO(true, false);
EE::Profiler.EmitOp(eeOpcode::MFHI);
}
void recMFLO()
{
recMFHILO(0);
recMFHILO(false, false);
EE::Profiler.EmitOp(eeOpcode::MFLO);
}
void recMTHI()
{
recMTHILO(1);
recMTHILO(true, false);
EE::Profiler.EmitOp(eeOpcode::MTHI);
}
void recMTLO()
{
recMTHILO(0);
recMTHILO(false, false);
EE::Profiler.EmitOp(eeOpcode::MTLO);
}
////////////////////////////////////////////////////
void recMFHILO1(int hi)
{
int reghi, regd, xmmhilo;
if (!_Rd_)
return;
xmmhilo = hi ? XMMGPR_HI : XMMGPR_LO;
reghi = _checkXMMreg(XMMTYPE_GPRREG, xmmhilo, MODE_READ);
_eeOnWriteReg(_Rd_, 0);
regd = _checkXMMreg(XMMTYPE_GPRREG, _Rd_, MODE_READ | MODE_WRITE);
if (reghi >= 0)
{
if (regd >= 0)
{
xMOVHL.PS(xRegisterSSE(regd), xRegisterSSE(reghi));
xmmregs[regd].mode |= MODE_WRITE;
}
else
{
_deleteEEreg(_Rd_, 0);
xMOVH.PS(ptr[&cpuRegs.GPR.r[_Rd_].UD[0]], xRegisterSSE(reghi));
}
}
else
{
if (regd >= 0)
{
if (EEINST_ISLIVE2(_Rd_))
{
xPUNPCK.HQDQ(xRegisterSSE(regd), ptr[(void*)(hi ? (uptr)&cpuRegs.HI.UD[0] : (uptr)&cpuRegs.LO.UD[0])]);
xPSHUF.D(xRegisterSSE(regd), xRegisterSSE(regd), 0x4e);
}
else
{
xMOVQZX(xRegisterSSE(regd), ptr[(void*)(hi ? (uptr)&cpuRegs.HI.UD[1] : (uptr)&cpuRegs.LO.UD[1])]);
}
xmmregs[regd].mode |= MODE_WRITE;
}
else
{
_deleteEEreg(_Rd_, 0);
xCopy64(&cpuRegs.GPR.r[_Rd_].UD[0], hi ? &cpuRegs.HI.UD[1] : &cpuRegs.LO.UD[1]);
}
}
}
void recMTHILO1(int hi)
{
int reghi, regs, xmmhilo;
uptr addrhilo;
xmmhilo = hi ? XMMGPR_HI : XMMGPR_LO;
addrhilo = hi ? (uptr)&cpuRegs.HI.UD[0] : (uptr)&cpuRegs.LO.UD[0];
regs = _checkXMMreg(XMMTYPE_GPRREG, _Rs_, MODE_READ);
reghi = _allocCheckGPRtoXMM(g_pCurInstInfo, xmmhilo, MODE_WRITE | MODE_READ);
if (reghi >= 0)
{
if (regs >= 0)
{
xPUNPCK.LQDQ(xRegisterSSE(reghi), xRegisterSSE(regs));
}
else
{
_flushEEreg(_Rs_);
xPUNPCK.LQDQ(xRegisterSSE(reghi), ptr[&cpuRegs.GPR.r[_Rs_].UD[0]]);
}
}
else
{
if (regs >= 0)
{
xMOVQ(ptr[(void*)(addrhilo + 8)], xRegisterSSE(regs));
}
else
{
if (GPR_IS_CONST1(_Rs_))
{
xWriteImm64ToMem((u64*)(addrhilo + 8), rax, g_cpuConstRegs[_Rs_].UD[0]);
}
else
{
_flushEEreg(_Rs_);
xCopy64((u64*)(addrhilo + 8), &cpuRegs.GPR.r[_Rs_].UD[0]);
}
}
}
}
void recMFHI1()
{
recMFHILO1(1);
recMFHILO(true, true);
EE::Profiler.EmitOp(eeOpcode::MFHI1);
}
void recMFLO1()
{
recMFHILO1(0);
recMFHILO(false, true);
EE::Profiler.EmitOp(eeOpcode::MFLO1);
}
void recMTHI1()
{
recMTHILO1(1);
recMTHILO(true, true);
EE::Profiler.EmitOp(eeOpcode::MTHI1);
}
void recMTLO1()
{
recMTHILO1(0);
recMTHILO(false, true);
EE::Profiler.EmitOp(eeOpcode::MTLO1);
}
//// MOVZ
void recMOVZtemp_const()
// if (rt == 0) then rd <- rs
static void recMOVZtemp_const()
{
g_cpuConstRegs[_Rd_].UD[0] = g_cpuConstRegs[_Rs_].UD[0];
}
void recMOVZtemp_consts(int info)
static void recMOVZtemp_consts(int info)
{
xCMPToZero64(&cpuRegs.GPR.r[_Rt_].UD[0]);
j8Ptr[0] = JNZ8(0);
// we need the constant anyway, so just force it into a register
const int regs = (info & PROCESS_EE_S) ? EEREC_S : _allocX86reg(X86TYPE_GPR, _Rs_, MODE_READ);
if (info & PROCESS_EE_T)
xTEST(xRegister64(EEREC_T), xRegister64(EEREC_T));
else
xCMP(ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]], 0);
xWriteImm64ToMem(&cpuRegs.GPR.r[_Rd_].UD[0], rax, g_cpuConstRegs[_Rs_].UD[0]);
x86SetJ8(j8Ptr[0]);
xCMOVE(xRegister64(EEREC_D), xRegister64(regs));
}
void recMOVZtemp_constt(int info)
static void recMOVZtemp_constt(int info)
{
xCopy64(&cpuRegs.GPR.r[_Rd_].UD[0], &cpuRegs.GPR.r[_Rs_].UD[0]);
if (info & PROCESS_EE_S)
xMOV(xRegister64(EEREC_D), xRegister64(EEREC_S));
else
xMOV(xRegister64(EEREC_D), ptr64[&cpuRegs.GPR.r[_Rs_].UD[0]]);
}
void recMOVZtemp_(int info)
static void recMOVZtemp_(int info)
{
xCMPToZero64(&cpuRegs.GPR.r[_Rt_].UD[0]);
j8Ptr[0] = JNZ8(0);
if (info & PROCESS_EE_T)
xTEST(xRegister64(EEREC_T), xRegister64(EEREC_T));
else
xCMP(ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]], 0);
xCopy64(&cpuRegs.GPR.r[_Rd_].UD[0], &cpuRegs.GPR.r[_Rs_].UD[0]);
x86SetJ8(j8Ptr[0]);
if (info & PROCESS_EE_S)
xCMOVE(xRegister64(EEREC_D), xRegister64(EEREC_S));
else
xCMOVE(xRegister64(EEREC_D), ptr64[&cpuRegs.GPR.r[_Rs_].UD[0]]);
}
EERECOMPILE_CODE0(MOVZtemp, XMMINFO_READS | XMMINFO_READD | XMMINFO_READD | XMMINFO_WRITED);
// Specify READD here, because we might not write to it, and want to preserve the value.
static EERECOMPILE_CODERC0(MOVZtemp, XMMINFO_READS | XMMINFO_READT | XMMINFO_READD | XMMINFO_WRITED | XMMINFO_NORENAME);
void recMOVZ()
{
if (_Rs_ == _Rd_)
return;
if (GPR_IS_CONST1(_Rt_))
{
if (g_cpuConstRegs[_Rt_].UD[0] != 0)
return;
}
else
_deleteEEreg(_Rd_, 1);
if (GPR_IS_CONST1(_Rt_) && g_cpuConstRegs[_Rt_].UD[0] != 0)
return;
recMOVZtemp();
}
//// MOVN
void recMOVNtemp_const()
static void recMOVNtemp_const()
{
g_cpuConstRegs[_Rd_].UD[0] = g_cpuConstRegs[_Rs_].UD[0];
}
void recMOVNtemp_consts(int info)
static void recMOVNtemp_consts(int info)
{
xCMPToZero64(&cpuRegs.GPR.r[_Rt_].UD[0]);
j8Ptr[0] = JZ8(0);
// we need the constant anyway, so just force it into a register
const int regs = (info & PROCESS_EE_S) ? EEREC_S : _allocX86reg(X86TYPE_GPR, _Rs_, MODE_READ);
if (info & PROCESS_EE_T)
xTEST(xRegister64(EEREC_T), xRegister64(EEREC_T));
else
xCMP(ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]], 0);
xWriteImm64ToMem(&cpuRegs.GPR.r[_Rd_].UD[0], rax, g_cpuConstRegs[_Rs_].UD[0]);
x86SetJ8(j8Ptr[0]);
xCMOVNE(xRegister64(EEREC_D), xRegister64(regs));
}
void recMOVNtemp_constt(int info)
static void recMOVNtemp_constt(int info)
{
xCopy64(&cpuRegs.GPR.r[_Rd_].UD[0], &cpuRegs.GPR.r[_Rs_].UD[0]);
if (info & PROCESS_EE_S)
xMOV(xRegister64(EEREC_D), xRegister64(EEREC_S));
else
xMOV(xRegister64(EEREC_D), ptr64[&cpuRegs.GPR.r[_Rs_].UD[0]]);
}
void recMOVNtemp_(int info)
static void recMOVNtemp_(int info)
{
xCMPToZero64(&cpuRegs.GPR.r[_Rt_].UD[0]);
j8Ptr[0] = JZ8(0);
if (info & PROCESS_EE_T)
xTEST(xRegister64(EEREC_T), xRegister64(EEREC_T));
else
xCMP(ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]], 0);
xCopy64(&cpuRegs.GPR.r[_Rd_].UD[0], &cpuRegs.GPR.r[_Rs_].UD[0]);
x86SetJ8(j8Ptr[0]);
if (info & PROCESS_EE_S)
xCMOVNE(xRegister64(EEREC_D), xRegister64(EEREC_S));
else
xCMOVNE(xRegister64(EEREC_D), ptr64[&cpuRegs.GPR.r[_Rs_].UD[0]]);
}
EERECOMPILE_CODE0(MOVNtemp, XMMINFO_READS | XMMINFO_READD | XMMINFO_READD | XMMINFO_WRITED);
static EERECOMPILE_CODERC0(MOVNtemp, XMMINFO_READS | XMMINFO_READT | XMMINFO_READD | XMMINFO_WRITED | XMMINFO_NORENAME);
void recMOVN()
{
if (_Rs_ == _Rd_)
return;
if (GPR_IS_CONST1(_Rt_))
{
if (g_cpuConstRegs[_Rt_].UD[0] == 0)
return;
}
else
_deleteEEreg(_Rd_, 1);
if (GPR_IS_CONST1(_Rt_) && g_cpuConstRegs[_Rt_].UD[0] == 0)
return;
recMOVNtemp();
}
#endif
} // namespace OpcodeImpl
} // namespace Dynarec
} // namespace R5900
} // namespace R5900::Dynarec::OpcodeImpl

View File

@ -24,9 +24,8 @@ using namespace x86Emitter;
namespace Interp = R5900::Interpreter::OpcodeImpl;
namespace R5900 {
namespace Dynarec {
namespace OpcodeImpl {
namespace R5900::Dynarec::OpcodeImpl
{
/*********************************************************
* Register mult/div & Register trap logic *
@ -34,9 +33,9 @@ namespace OpcodeImpl {
*********************************************************/
#ifndef MULTDIV_RECOMPILE
REC_FUNC_DEL(MULT, _Rd_);
REC_FUNC_DEL(MULTU, _Rd_);
REC_FUNC_DEL(MULT1, _Rd_);
REC_FUNC_DEL(MULT, _Rd_);
REC_FUNC_DEL(MULTU, _Rd_);
REC_FUNC_DEL(MULT1, _Rd_);
REC_FUNC_DEL(MULTU1, _Rd_);
REC_FUNC(DIV);
@ -44,290 +43,300 @@ REC_FUNC(DIVU);
REC_FUNC(DIV1);
REC_FUNC(DIVU1);
REC_FUNC_DEL(MADD, _Rd_);
REC_FUNC_DEL(MADDU, _Rd_);
REC_FUNC_DEL(MADD1, _Rd_);
REC_FUNC_DEL(MADD, _Rd_);
REC_FUNC_DEL(MADDU, _Rd_);
REC_FUNC_DEL(MADD1, _Rd_);
REC_FUNC_DEL(MADDU1, _Rd_);
#else
// if upper is 1, write in upper 64 bits of LO/HI
void recWritebackHILO(int info, int writed, int upper)
static void recWritebackHILO(int info, bool writed, bool upper)
{
int savedlo = 0;
uptr loaddr = (uptr)&cpuRegs.LO.UL[upper ? 2 : 0];
const uptr hiaddr = (uptr)&cpuRegs.HI.UL[upper ? 2 : 0];
const u8 testlive = upper ? EEINST_LIVE2 : EEINST_LIVE0;
// writeback low 32 bits, sign extended to 64 bits
bool eax_sign_extended = false;
if (g_pCurInstInfo->regs[XMMGPR_HI] & testlive)
xMOVSX(rcx, edx);
// case 1: LO is already in an XMM - use the xmm
// case 2: LO is used as an XMM later in the block - use or allocate the XMM
// case 3: LO is used as a GPR later in the block - use XMM if upper, otherwise use GPR, so it can be renamed
// case 4: LO is already in a GPR - write to the GPR, or write to memory if upper
// case 4: LO is not used - writeback to memory
if (g_pCurInstInfo->regs[XMMGPR_LO] & testlive)
if (EEINST_LIVETEST(XMMGPR_LO))
{
int reglo = 0;
if ((reglo = _checkXMMreg(XMMTYPE_GPRREG, XMMGPR_LO, MODE_READ)) >= 0)
const bool loused = EEINST_USEDTEST(XMMGPR_LO);
const bool lousedxmm = loused && (upper || EEINST_XMMUSEDTEST(XMMGPR_LO));
const int xmmlo = lousedxmm ? _allocGPRtoXMMreg(XMMGPR_LO, MODE_READ | MODE_WRITE) : _checkXMMreg(XMMTYPE_GPRREG, XMMGPR_LO, MODE_WRITE);
if (xmmlo >= 0)
{
if (xmmregs[reglo].mode & MODE_WRITE)
{
if (upper)
xMOVQ(ptr[(void*)(loaddr - 8)], xRegisterSSE(reglo));
else
xMOVH.PS(ptr[(void*)(loaddr + 8)], xRegisterSSE(reglo));
}
xmmregs[reglo].inuse = 0;
reglo = -1;
// we use CDQE over MOVSX because it's shorter.
xCDQE();
xPINSR.Q(xRegisterSSE(xmmlo), rax, static_cast<u8>(upper));
}
_signExtendToMem((void*)loaddr);
savedlo = 1;
}
if (writed && _Rd_)
{
_eeOnWriteReg(_Rd_, 1);
int regd = -1;
if (g_pCurInstInfo->regs[_Rd_] & EEINST_XMM)
else
{
if (savedlo)
const int gprlo = upper ? -1 : (loused ? _allocX86reg(X86TYPE_GPR, XMMGPR_LO, MODE_WRITE) : _checkX86reg(X86TYPE_GPR, XMMGPR_LO, MODE_WRITE));
if (gprlo >= 0)
{
regd = _checkXMMreg(XMMTYPE_GPRREG, _Rd_, MODE_WRITE | MODE_READ);
if (regd >= 0)
{
xMOVL.PS(xRegisterSSE(regd), ptr[(void*)(loaddr)]);
}
xMOVSX(xRegister64(gprlo), eax);
}
}
if (regd < 0)
{
_deleteEEreg(_Rd_, 0);
if (!savedlo)
else
{
xCDQE();
xMOV(ptr[&cpuRegs.GPR.r[_Rd_].UD[0]], rax);
eax_sign_extended = true;
xMOV(ptr64[&cpuRegs.LO.UD[upper]], rax);
}
}
}
if (g_pCurInstInfo->regs[XMMGPR_HI] & testlive)
if (EEINST_LIVETEST(XMMGPR_HI))
{
int reghi = 0;
if ((reghi = _checkXMMreg(XMMTYPE_GPRREG, XMMGPR_HI, MODE_READ)) >= 0)
const bool hiused = EEINST_USEDTEST(XMMGPR_HI);
const bool hiusedxmm = hiused && (upper || EEINST_XMMUSEDTEST(XMMGPR_HI));
const int xmmhi = hiusedxmm ? _allocGPRtoXMMreg(XMMGPR_HI, MODE_READ | MODE_WRITE) : _checkXMMreg(XMMTYPE_GPRREG, XMMGPR_HI, MODE_WRITE);
if (xmmhi >= 0)
{
if (xmmregs[reghi].mode & MODE_WRITE)
{
if (upper)
xMOVQ(ptr[(void*)(hiaddr - 8)], xRegisterSSE(reghi));
else
xMOVH.PS(ptr[(void*)(hiaddr + 8)], xRegisterSSE(reghi));
}
xmmregs[reghi].inuse = 0;
reghi = -1;
xMOVSX(rdx, edx);
xPINSR.Q(xRegisterSSE(xmmhi), rdx, static_cast<u8>(upper));
}
else
{
const int gprhi = upper ? -1 : (hiused ? _allocX86reg(X86TYPE_GPR, XMMGPR_HI, MODE_WRITE) : _checkX86reg(X86TYPE_GPR, XMMGPR_HI, MODE_WRITE));
if (gprhi >= 0)
{
xMOVSX(xRegister64(gprhi), edx);
}
else
{
xMOVSX(rdx, edx);
xMOV(ptr64[&cpuRegs.HI.UD[upper]], rdx);
}
}
}
xMOV(ptr[(void*)(hiaddr)], rcx);
// writeback lo to Rd if present
if (writed && _Rd_ && EEINST_LIVETEST(_Rd_))
{
// TODO: This can be made optimal by keeping it in an xmm.
// But currently the templates aren't hooked up for that - we'd need a "allow xmm" flag.
if (info & PROCESS_EE_D)
{
if (eax_sign_extended)
xMOV(xRegister64(EEREC_D), rax);
else
xMOVSX(xRegister64(EEREC_D), eax);
}
else
{
if (!eax_sign_extended)
xCDQE();
xMOV(ptr64[&cpuRegs.GPR.r[_Rd_].UD[0]], rax);
}
}
}
void recWritebackConstHILO(u64 res, int writed, int upper)
static void recWritebackConstHILO(u64 res, bool writed, int upper)
{
uptr loaddr = (uptr)&cpuRegs.LO.UL[upper ? 2 : 0];
uptr hiaddr = (uptr)&cpuRegs.HI.UL[upper ? 2 : 0];
u8 testlive = upper ? EEINST_LIVE2 : EEINST_LIVE0;
// It's not often that MULT/DIV are entirely constant. So while the MOV64s here are not optimal
// by any means, it's not something that's going to be hit often enough to worry about a cache.
// Except for apparently when it's getting set to all-zeros, but that'll be fine with immediates.
const s64 loval = static_cast<s64>(static_cast<s32>(static_cast<u32>(res)));
const s64 hival = static_cast<s64>(static_cast<s32>(static_cast<u32>(res >> 32)));
if (g_pCurInstInfo->regs[XMMGPR_LO] & testlive)
if (EEINST_LIVETEST(XMMGPR_LO))
{
int reglo = _allocCheckGPRtoXMM(g_pCurInstInfo, XMMGPR_LO, MODE_WRITE | MODE_READ);
if (reglo >= 0)
const bool lolive = EEINST_USEDTEST(XMMGPR_LO);
const bool lolivexmm = lolive && (upper || EEINST_XMMUSEDTEST(XMMGPR_LO));
const int xmmlo = lolivexmm ? _allocGPRtoXMMreg(XMMGPR_LO, MODE_READ | MODE_WRITE) : _checkXMMreg(XMMTYPE_GPRREG, XMMGPR_LO, MODE_WRITE);
if (xmmlo >= 0)
{
u32* mem_ptr = recGetImm64(res & 0x80000000 ? -1 : 0, (u32)res);
if (upper)
xMOVH.PS(xRegisterSSE(reglo), ptr[mem_ptr]);
else
xMOVL.PS(xRegisterSSE(reglo), ptr[mem_ptr]);
xMOV64(rax, loval);
xPINSR.Q(xRegisterSSE(xmmlo), rax, static_cast<u8>(upper));
}
else
{
xWriteImm64ToMem((u64*)loaddr, rax, (s64)(s32)(res & 0xffffffff));
const int gprlo = upper ? -1 : (lolive ? _allocX86reg(X86TYPE_GPR, XMMGPR_LO, MODE_WRITE) : _checkX86reg(X86TYPE_GPR, XMMGPR_LO, MODE_WRITE));
if (gprlo >= 0)
xImm64Op(xMOV, xRegister64(gprlo), rax, loval);
else
xImm64Op(xMOV, ptr64[&cpuRegs.LO.UD[upper]], rax, loval);
}
}
if (g_pCurInstInfo->regs[XMMGPR_HI] & testlive)
if (EEINST_LIVETEST(XMMGPR_HI))
{
int reghi = _allocCheckGPRtoXMM(g_pCurInstInfo, XMMGPR_HI, MODE_WRITE | MODE_READ);
if (reghi >= 0)
const bool hilive = EEINST_USEDTEST(XMMGPR_HI);
const bool hilivexmm = hilive && (upper || EEINST_XMMUSEDTEST(XMMGPR_HI));
const int xmmhi = hilivexmm ? _allocGPRtoXMMreg(XMMGPR_HI, MODE_READ | MODE_WRITE) : _checkXMMreg(XMMTYPE_GPRREG, XMMGPR_HI, MODE_WRITE);
if (xmmhi >= 0)
{
u32* mem_ptr = recGetImm64((res >> 63) ? -1 : 0, res >> 32);
if (upper)
xMOVH.PS(xRegisterSSE(reghi), ptr[mem_ptr]);
else
xMOVL.PS(xRegisterSSE(reghi), ptr[mem_ptr]);
xMOV64(rax, hival);
xPINSR.Q(xRegisterSSE(xmmhi), rax, static_cast<u8>(upper));
}
else
{
_deleteEEreg(XMMGPR_HI, 0);
xWriteImm64ToMem((u64*)hiaddr, rax, (s64)res >> 32);
const int gprhi = upper ? -1 : (hilive ? _allocX86reg(X86TYPE_GPR, XMMGPR_HI, MODE_WRITE) : _checkX86reg(X86TYPE_GPR, XMMGPR_HI, MODE_WRITE));
if (gprhi >= 0)
xImm64Op(xMOV, xRegister64(gprhi), rax, hival);
else
xImm64Op(xMOV, ptr64[&cpuRegs.HI.UD[upper]], rax, hival);
}
}
if (!writed || !_Rd_)
return;
g_cpuConstRegs[_Rd_].SD[0] = (s32)(res & 0xffffffffULL); //that is the difference
// writeback lo to Rd if present
if (writed && _Rd_ && EEINST_LIVETEST(_Rd_))
{
_eeOnWriteReg(_Rd_, 0);
const int regd = _checkX86reg(X86TYPE_GPR, _Rd_, MODE_WRITE);
if (regd >= 0)
xImm64Op(xMOV, xRegister64(regd), rax, loval);
else
xImm64Op(xMOV, ptr64[&cpuRegs.GPR.r[_Rd_].UD[0]], rax, loval);
}
}
//// MULT
void recMULT_const()
static void recMULT_const()
{
s64 res = (s64)g_cpuConstRegs[_Rs_].SL[0] * (s64)g_cpuConstRegs[_Rt_].SL[0];
recWritebackConstHILO(res, 1, 0);
}
void recMULTUsuper(int info, int upper, int process);
void recMULTsuper(int info, int upper, int process)
static void recMULTsuper(int info, bool sign, bool upper, int process)
{
// TODO(Stenzek): Use MULX where available.
if (process & PROCESS_CONSTS)
{
xMOV(eax, g_cpuConstRegs[_Rs_].UL[0]);
xMUL(ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
if (info & PROCESS_EE_T)
sign ? xMUL(xRegister32(EEREC_T)) : xUMUL(xRegister32(EEREC_T));
else
sign ? xMUL(ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]) : xUMUL(ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
}
else if (process & PROCESS_CONSTT)
{
xMOV(eax, g_cpuConstRegs[_Rt_].UL[0]);
xMUL(ptr32[&cpuRegs.GPR.r[_Rs_].UL[0]]);
if (info & PROCESS_EE_S)
sign ? xMUL(xRegister32(EEREC_S)) : xUMUL(xRegister32(EEREC_S));
else
sign ? xMUL(ptr32[&cpuRegs.GPR.r[_Rs_].UL[0]]) : xUMUL(ptr32[&cpuRegs.GPR.r[_Rs_].UL[0]]);
}
else
{
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);
xMUL(ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
// S is more likely to be in a register than T (so put T in eax).
if (info & PROCESS_EE_T)
xMOV(eax, xRegister32(EEREC_T));
else
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rt_].UL[0]]);
if (info & PROCESS_EE_S)
sign ? xMUL(xRegister32(EEREC_S)) : xUMUL(xRegister32(EEREC_S));
else
sign ? xMUL(ptr32[&cpuRegs.GPR.r[_Rs_].UL[0]]) : xUMUL(ptr32[&cpuRegs.GPR.r[_Rs_].UL[0]]);
}
recWritebackHILO(info, 1, upper);
}
void recMULT_(int info)
static void recMULT_(int info)
{
recMULTsuper(info, 0, 0);
recMULTsuper(info, true, false, 0);
}
void recMULT_consts(int info)
static void recMULT_consts(int info)
{
recMULTsuper(info, 0, PROCESS_CONSTS);
recMULTsuper(info, true, false, PROCESS_CONSTS);
}
void recMULT_constt(int info)
static void recMULT_constt(int info)
{
recMULTsuper(info, 0, PROCESS_CONSTT);
recMULTsuper(info, true, false, PROCESS_CONSTT);
}
// don't set XMMINFO_WRITED|XMMINFO_WRITELO|XMMINFO_WRITEHI
EERECOMPILE_CODE0(MULT, XMMINFO_READS | XMMINFO_READT | (_Rd_ ? XMMINFO_WRITED : 0));
// lo/hi allocation are taken care of in recWritebackHILO().
EERECOMPILE_CODERC0(MULT, XMMINFO_READS | XMMINFO_READT | (_Rd_ ? XMMINFO_WRITED : 0));
//// MULTU
void recMULTU_const()
static void recMULTU_const()
{
u64 res = (u64)g_cpuConstRegs[_Rs_].UL[0] * (u64)g_cpuConstRegs[_Rt_].UL[0];
const u64 res = (u64)g_cpuConstRegs[_Rs_].UL[0] * (u64)g_cpuConstRegs[_Rt_].UL[0];
recWritebackConstHILO(res, 1, 0);
}
void recMULTUsuper(int info, int upper, int process)
static void recMULTU_(int info)
{
if (process & PROCESS_CONSTS)
{
xMOV(eax, g_cpuConstRegs[_Rs_].UL[0]);
xUMUL(ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
}
else if (process & PROCESS_CONSTT)
{
xMOV(eax, g_cpuConstRegs[_Rt_].UL[0]);
xUMUL(ptr32[&cpuRegs.GPR.r[_Rs_].UL[0]]);
}
else
{
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);
xUMUL(ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
}
recWritebackHILO(info, 1, upper);
recMULTsuper(info, false, false, 0);
}
void recMULTU_(int info)
static void recMULTU_consts(int info)
{
recMULTUsuper(info, 0, 0);
recMULTsuper(info, false, false, PROCESS_CONSTS);
}
void recMULTU_consts(int info)
static void recMULTU_constt(int info)
{
recMULTUsuper(info, 0, PROCESS_CONSTS);
}
void recMULTU_constt(int info)
{
recMULTUsuper(info, 0, PROCESS_CONSTT);
recMULTsuper(info, false, false, PROCESS_CONSTT);
}
// don't specify XMMINFO_WRITELO or XMMINFO_WRITEHI, that is taken care of
EERECOMPILE_CODE0(MULTU, XMMINFO_READS | XMMINFO_READT | (_Rd_ ? XMMINFO_WRITED : 0));
EERECOMPILE_CODERC0(MULTU, XMMINFO_READS | XMMINFO_READT | (_Rd_ ? XMMINFO_WRITED : 0));
////////////////////////////////////////////////////
void recMULT1_const()
static void recMULT1_const()
{
s64 res = (s64)g_cpuConstRegs[_Rs_].SL[0] * (s64)g_cpuConstRegs[_Rt_].SL[0];
recWritebackConstHILO((u64)res, 1, 1);
}
void recMULT1_(int info)
static void recMULT1_(int info)
{
recMULTsuper(info, 1, 0);
recMULTsuper(info, true, true, 0);
}
void recMULT1_consts(int info)
static void recMULT1_consts(int info)
{
recMULTsuper(info, 1, PROCESS_CONSTS);
recMULTsuper(info, true, true, PROCESS_CONSTS);
}
void recMULT1_constt(int info)
static void recMULT1_constt(int info)
{
recMULTsuper(info, 1, PROCESS_CONSTT);
recMULTsuper(info, true, true, PROCESS_CONSTT);
}
EERECOMPILE_CODE0(MULT1, XMMINFO_READS | XMMINFO_READT | (_Rd_ ? XMMINFO_WRITED : 0));
EERECOMPILE_CODERC0(MULT1, XMMINFO_READS | XMMINFO_READT | (_Rd_ ? XMMINFO_WRITED : 0));
////////////////////////////////////////////////////
void recMULTU1_const()
static void recMULTU1_const()
{
u64 res = (u64)g_cpuConstRegs[_Rs_].UL[0] * (u64)g_cpuConstRegs[_Rt_].UL[0];
recWritebackConstHILO(res, 1, 1);
}
void recMULTU1_(int info)
static void recMULTU1_(int info)
{
recMULTUsuper(info, 1, 0);
recMULTsuper(info, false, true, 0);
}
void recMULTU1_consts(int info)
static void recMULTU1_consts(int info)
{
recMULTUsuper(info, 1, PROCESS_CONSTS);
recMULTsuper(info, false, true, PROCESS_CONSTS);
}
void recMULTU1_constt(int info)
static void recMULTU1_constt(int info)
{
recMULTUsuper(info, 1, PROCESS_CONSTT);
recMULTsuper(info, false, true, PROCESS_CONSTT);
}
EERECOMPILE_CODE0(MULTU1, XMMINFO_READS | XMMINFO_READT | (_Rd_ ? XMMINFO_WRITED : 0));
EERECOMPILE_CODERC0(MULTU1, XMMINFO_READS | XMMINFO_READT | (_Rd_ ? XMMINFO_WRITED : 0));
//// DIV
void recDIVconst(int upper)
static void recDIVconst(int upper)
{
s32 quot, rem;
if (g_cpuConstRegs[_Rs_].UL[0] == 0x80000000 && g_cpuConstRegs[_Rt_].SL[0] == -1)
@ -348,29 +357,36 @@ void recDIVconst(int upper)
recWritebackConstHILO((u64)quot | ((u64)rem << 32), 0, upper);
}
void recDIV_const()
static void recDIV_const()
{
recDIVconst(0);
}
void recDIVsuper(int info, int sign, int upper, int process)
static void recDIVsuper(int info, bool sign, bool upper, int process)
{
if (process & PROCESS_CONSTT)
xMOV(ecx, g_cpuConstRegs[_Rt_].UL[0]);
else
xMOV(ecx, ptr[&cpuRegs.GPR.r[_Rt_].UL[0]]);
const xRegister32 divisor((info & PROCESS_EE_T) ? EEREC_T : ecx.GetId());
if (!(info & PROCESS_EE_T))
{
if (process & PROCESS_CONSTT)
xMOV(divisor, g_cpuConstRegs[_Rt_].UL[0]);
else
xMOV(divisor, ptr[&cpuRegs.GPR.r[_Rt_].UL[0]]);
}
// can't use edx, it's part of the dividend
pxAssert(divisor.GetId() != edx.GetId());
if (process & PROCESS_CONSTS)
xMOV(eax, g_cpuConstRegs[_Rs_].UL[0]);
else
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);
_eeMoveGPRtoR(rax, _Rs_);
u8* end1;
if (sign) //test for overflow (x86 will just throw an exception)
{
xCMP(eax, 0x80000000);
u8* cont1 = JNE8(0);
xCMP(ecx, 0xffffffff);
xCMP(divisor, 0xffffffff);
u8* cont2 = JNE8(0);
//overflow case:
xXOR(edx, edx); //EAX remains 0x80000000
@ -380,7 +396,7 @@ void recDIVsuper(int info, int sign, int upper, int process)
x86SetJ8(cont2);
}
xCMP(ecx, 0);
xCMP(divisor, 0);
u8* cont3 = JNE8(0);
//divide by zero
xMOV(edx, eax);
@ -398,12 +414,12 @@ void recDIVsuper(int info, int sign, int upper, int process)
if (sign)
{
xCDQ();
xDIV(ecx);
xDIV(divisor);
}
else
{
xXOR(edx, edx);
xUDIV(ecx);
xUDIV(divisor);
}
if (sign)
@ -411,28 +427,29 @@ void recDIVsuper(int info, int sign, int upper, int process)
x86SetJ8(end2);
// need to execute regardless of bad divide
recWritebackHILO(info, 0, upper);
recWritebackHILO(info, false, upper);
}
void recDIV_(int info)
static void recDIV_(int info)
{
recDIVsuper(info, 1, 0, 0);
}
void recDIV_consts(int info)
static void recDIV_consts(int info)
{
recDIVsuper(info, 1, 0, PROCESS_CONSTS);
}
void recDIV_constt(int info)
static void recDIV_constt(int info)
{
recDIVsuper(info, 1, 0, PROCESS_CONSTT);
}
EERECOMPILE_CODE0(DIV, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITELO | XMMINFO_WRITEHI);
// We handle S reading in the routine itself, since it needs to go into eax.
EERECOMPILE_CODERC0(DIV, /*XMMINFO_READS |*/ XMMINFO_READT);
//// DIVU
void recDIVUconst(int upper)
static void recDIVUconst(int upper)
{
u32 quot, rem;
if (g_cpuConstRegs[_Rt_].UL[0] != 0)
@ -449,71 +466,73 @@ void recDIVUconst(int upper)
recWritebackConstHILO((u64)quot | ((u64)rem << 32), 0, upper);
}
void recDIVU_const()
static void recDIVU_const()
{
recDIVUconst(0);
}
void recDIVU_(int info)
static void recDIVU_(int info)
{
recDIVsuper(info, 0, 0, 0);
recDIVsuper(info, false, false, 0);
}
void recDIVU_consts(int info)
static void recDIVU_consts(int info)
{
recDIVsuper(info, 0, 0, PROCESS_CONSTS);
recDIVsuper(info, false, false, PROCESS_CONSTS);
}
void recDIVU_constt(int info)
static void recDIVU_constt(int info)
{
recDIVsuper(info, 0, 0, PROCESS_CONSTT);
recDIVsuper(info, false, false, PROCESS_CONSTT);
}
EERECOMPILE_CODE0(DIVU, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITELO | XMMINFO_WRITEHI);
EERECOMPILE_CODERC0(DIVU, /*XMMINFO_READS |*/ XMMINFO_READT);
void recDIV1_const()
static void recDIV1_const()
{
recDIVconst(1);
}
void recDIV1_(int info)
static void recDIV1_(int info)
{
recDIVsuper(info, 1, 1, 0);
recDIVsuper(info, true, true, 0);
}
void recDIV1_consts(int info)
static void recDIV1_consts(int info)
{
recDIVsuper(info, 1, 1, PROCESS_CONSTS);
recDIVsuper(info, true, true, PROCESS_CONSTS);
}
void recDIV1_constt(int info)
static void recDIV1_constt(int info)
{
recDIVsuper(info, 1, 1, PROCESS_CONSTT);
recDIVsuper(info, true, true, PROCESS_CONSTT);
}
EERECOMPILE_CODE0(DIV1, XMMINFO_READS | XMMINFO_READT);
EERECOMPILE_CODERC0(DIV1, /*XMMINFO_READS |*/ XMMINFO_READT);
void recDIVU1_const()
static void recDIVU1_const()
{
recDIVUconst(1);
}
void recDIVU1_(int info)
static void recDIVU1_(int info)
{
recDIVsuper(info, 0, 1, 0);
recDIVsuper(info, false, true, 0);
}
void recDIVU1_consts(int info)
static void recDIVU1_consts(int info)
{
recDIVsuper(info, 0, 1, PROCESS_CONSTS);
recDIVsuper(info, false, true, PROCESS_CONSTS);
}
void recDIVU1_constt(int info)
static void recDIVU1_constt(int info)
{
recDIVsuper(info, 0, 1, PROCESS_CONSTT);
recDIVsuper(info, false, true, PROCESS_CONSTT);
}
EERECOMPILE_CODE0(DIVU1, XMMINFO_READS | XMMINFO_READT);
EERECOMPILE_CODERC0(DIVU1, /*XMMINFO_READS |*/ XMMINFO_READT);
// TODO(Stenzek): All of these :(
static void writeBackMAddToHiLoRd(int hiloID)
{
@ -564,8 +583,10 @@ void recMADD()
_deleteEEreg(XMMGPR_LO, 1);
_deleteEEreg(XMMGPR_HI, 1);
_deleteGPRtoXMMreg(_Rs_, 1);
_deleteGPRtoXMMreg(_Rt_, 1);
_deleteGPRtoX86reg(_Rs_, DELETE_REG_FLUSH);
_deleteGPRtoX86reg(_Rt_, DELETE_REG_FLUSH);
_deleteGPRtoXMMreg(_Rs_, DELETE_REG_FLUSH);
_deleteGPRtoXMMreg(_Rt_, DELETE_REG_FLUSH);
if (GPR_IS_CONST1(_Rs_))
{
@ -597,8 +618,10 @@ void recMADDU()
_deleteEEreg(XMMGPR_LO, 1);
_deleteEEreg(XMMGPR_HI, 1);
_deleteGPRtoXMMreg(_Rs_, 1);
_deleteGPRtoXMMreg(_Rt_, 1);
_deleteGPRtoX86reg(_Rs_, DELETE_REG_FLUSH);
_deleteGPRtoX86reg(_Rt_, DELETE_REG_FLUSH);
_deleteGPRtoXMMreg(_Rs_, DELETE_REG_FLUSH);
_deleteGPRtoXMMreg(_Rt_, DELETE_REG_FLUSH);
if (GPR_IS_CONST1(_Rs_))
{
@ -630,8 +653,10 @@ void recMADD1()
_deleteEEreg(XMMGPR_LO, 1);
_deleteEEreg(XMMGPR_HI, 1);
_deleteGPRtoXMMreg(_Rs_, 1);
_deleteGPRtoXMMreg(_Rt_, 1);
_deleteGPRtoX86reg(_Rs_, DELETE_REG_FLUSH);
_deleteGPRtoX86reg(_Rt_, DELETE_REG_FLUSH);
_deleteGPRtoXMMreg(_Rs_, DELETE_REG_FLUSH);
_deleteGPRtoXMMreg(_Rt_, DELETE_REG_FLUSH);
if (GPR_IS_CONST1(_Rs_))
{
@ -663,8 +688,10 @@ void recMADDU1()
_deleteEEreg(XMMGPR_LO, 1);
_deleteEEreg(XMMGPR_HI, 1);
_deleteGPRtoXMMreg(_Rs_, 1);
_deleteGPRtoXMMreg(_Rt_, 1);
_deleteGPRtoX86reg(_Rs_, DELETE_REG_FLUSH);
_deleteGPRtoX86reg(_Rt_, DELETE_REG_FLUSH);
_deleteGPRtoXMMreg(_Rs_, DELETE_REG_FLUSH);
_deleteGPRtoXMMreg(_Rt_, DELETE_REG_FLUSH);
if (GPR_IS_CONST1(_Rs_))
{
@ -688,6 +715,4 @@ void recMADDU1()
#endif
} // namespace OpcodeImpl
} // namespace Dynarec
} // namespace R5900
} // namespace R5900::Dynarec::OpcodeImpl

View File

@ -22,9 +22,8 @@
using namespace x86Emitter;
namespace R5900 {
namespace Dynarec {
namespace OpcodeImpl {
namespace R5900::Dynarec::OpcodeImpl
{
/*********************************************************
* Shift arithmetic with constant shift *
@ -34,431 +33,406 @@ namespace OpcodeImpl {
namespace Interp = R5900::Interpreter::OpcodeImpl;
REC_FUNC_DEL(SLL, _Rd_);
REC_FUNC_DEL(SRL, _Rd_);
REC_FUNC_DEL(SRA, _Rd_);
REC_FUNC_DEL(DSLL, _Rd_);
REC_FUNC_DEL(DSRL, _Rd_);
REC_FUNC_DEL(DSRA, _Rd_);
REC_FUNC_DEL(SLL, _Rd_);
REC_FUNC_DEL(SRL, _Rd_);
REC_FUNC_DEL(SRA, _Rd_);
REC_FUNC_DEL(DSLL, _Rd_);
REC_FUNC_DEL(DSRL, _Rd_);
REC_FUNC_DEL(DSRA, _Rd_);
REC_FUNC_DEL(DSLL32, _Rd_);
REC_FUNC_DEL(DSRL32, _Rd_);
REC_FUNC_DEL(DSRA32, _Rd_);
REC_FUNC_DEL(SLLV, _Rd_);
REC_FUNC_DEL(SRLV, _Rd_);
REC_FUNC_DEL(SRAV, _Rd_);
REC_FUNC_DEL(DSLLV, _Rd_);
REC_FUNC_DEL(DSRLV, _Rd_);
REC_FUNC_DEL(DSRAV, _Rd_);
REC_FUNC_DEL(SLLV, _Rd_);
REC_FUNC_DEL(SRLV, _Rd_);
REC_FUNC_DEL(SRAV, _Rd_);
REC_FUNC_DEL(DSLLV, _Rd_);
REC_FUNC_DEL(DSRLV, _Rd_);
REC_FUNC_DEL(DSRAV, _Rd_);
#else
static void recMoveTtoD(int info)
{
if (info & PROCESS_EE_T)
xMOV(xRegister32(EEREC_D), xRegister32(EEREC_T));
else
xMOV(xRegister32(EEREC_D), ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
}
static void recMoveTtoD64(int info)
{
if (info & PROCESS_EE_T)
xMOV(xRegister64(EEREC_D), xRegister64(EEREC_T));
else
xMOV(xRegister64(EEREC_D), ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]]);
}
static void recMoveSToRCX(int info)
{
// load full 64-bits for store->load forwarding, since we always store >=64.
if (info & PROCESS_EE_S)
xMOV(rcx, xRegister64(EEREC_S));
else
xMOV(rcx, ptr64[&cpuRegs.GPR.r[_Rs_].UL[0]]);
}
//// SLL
void recSLL_const()
static void recSLL_const()
{
g_cpuConstRegs[_Rd_].SD[0] = (s32)(g_cpuConstRegs[_Rt_].UL[0] << _Sa_);
}
void recSLLs_(int info, int sa)
static void recSLLs_(int info, int sa)
{
// TODO: Use BMI
pxAssert(!(info & PROCESS_EE_XMM));
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rt_].UL[0]]);
recMoveTtoD(info);
if (sa != 0)
{
xSHL(eax, sa);
}
eeSignExtendTo(_Rd_);
xSHL(xRegister32(EEREC_D), sa);
xMOVSX(xRegister64(EEREC_D), xRegister32(EEREC_D));
}
void recSLL_(int info)
static void recSLL_(int info)
{
recSLLs_(info, _Sa_);
}
EERECOMPILE_CODEX(eeRecompileCode2, SLL);
EERECOMPILE_CODEX(eeRecompileCodeRC2, SLL, XMMINFO_WRITED | XMMINFO_READT);
//// SRL
void recSRL_const()
static void recSRL_const()
{
g_cpuConstRegs[_Rd_].SD[0] = (s32)(g_cpuConstRegs[_Rt_].UL[0] >> _Sa_);
}
void recSRLs_(int info, int sa)
static void recSRLs_(int info, int sa)
{
pxAssert(!(info & PROCESS_EE_XMM));
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rt_].UL[0]]);
recMoveTtoD(info);
if (sa != 0)
xSHR(eax, sa);
eeSignExtendTo(_Rd_);
xSHR(xRegister32(EEREC_D), sa);
xMOVSX(xRegister64(EEREC_D), xRegister32(EEREC_D));
}
void recSRL_(int info)
static void recSRL_(int info)
{
recSRLs_(info, _Sa_);
}
EERECOMPILE_CODEX(eeRecompileCode2, SRL);
EERECOMPILE_CODEX(eeRecompileCodeRC2, SRL, XMMINFO_WRITED | XMMINFO_READT);
//// SRA
void recSRA_const()
static void recSRA_const()
{
g_cpuConstRegs[_Rd_].SD[0] = (s32)(g_cpuConstRegs[_Rt_].SL[0] >> _Sa_);
}
void recSRAs_(int info, int sa)
static void recSRAs_(int info, int sa)
{
pxAssert(!(info & PROCESS_EE_XMM));
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rt_].UL[0]]);
recMoveTtoD(info);
if (sa != 0)
xSAR(eax, sa);
eeSignExtendTo(_Rd_);
xSAR(xRegister32(EEREC_D), sa);
xMOVSX(xRegister64(EEREC_D), xRegister32(EEREC_D));
}
void recSRA_(int info)
static void recSRA_(int info)
{
recSRAs_(info, _Sa_);
}
EERECOMPILE_CODEX(eeRecompileCode2, SRA);
EERECOMPILE_CODEX(eeRecompileCodeRC2, SRA, XMMINFO_WRITED | XMMINFO_READT);
////////////////////////////////////////////////////
void recDSLL_const()
static void recDSLL_const()
{
g_cpuConstRegs[_Rd_].UD[0] = (u64)(g_cpuConstRegs[_Rt_].UD[0] << _Sa_);
}
void recDSLLs_(int info, int sa)
static void recDSLLs_(int info, int sa)
{
pxAssert(!(info & PROCESS_EE_XMM));
xMOV(rax, ptr[&cpuRegs.GPR.r[_Rt_].UD[0]]);
recMoveTtoD64(info);
if (sa != 0)
xSHL(rax, sa);
xMOV(ptr[&cpuRegs.GPR.r[_Rd_].UD[0]], rax);
xSHL(xRegister64(EEREC_D), sa);
}
void recDSLL_(int info)
static void recDSLL_(int info)
{
recDSLLs_(info, _Sa_);
}
EERECOMPILE_CODEX(eeRecompileCode2, DSLL);
EERECOMPILE_CODEX(eeRecompileCodeRC2, DSLL, XMMINFO_WRITED | XMMINFO_READT | XMMINFO_64BITOP);
////////////////////////////////////////////////////
void recDSRL_const()
static void recDSRL_const()
{
g_cpuConstRegs[_Rd_].UD[0] = (u64)(g_cpuConstRegs[_Rt_].UD[0] >> _Sa_);
}
void recDSRLs_(int info, int sa)
static void recDSRLs_(int info, int sa)
{
pxAssert(!(info & PROCESS_EE_XMM));
xMOV(rax, ptr[&cpuRegs.GPR.r[_Rt_].UD[0]]);
recMoveTtoD64(info);
if (sa != 0)
xSHR(rax, sa);
xMOV(ptr[&cpuRegs.GPR.r[_Rd_].UD[0]], rax);
xSHR(xRegister64(EEREC_D), sa);
}
void recDSRL_(int info)
static void recDSRL_(int info)
{
recDSRLs_(info, _Sa_);
}
EERECOMPILE_CODEX(eeRecompileCode2, DSRL);
EERECOMPILE_CODEX(eeRecompileCodeRC2, DSRL, XMMINFO_WRITED | XMMINFO_READT | XMMINFO_64BITOP);
//// DSRA
void recDSRA_const()
static void recDSRA_const()
{
g_cpuConstRegs[_Rd_].SD[0] = (u64)(g_cpuConstRegs[_Rt_].SD[0] >> _Sa_);
}
void recDSRAs_(int info, int sa)
static void recDSRAs_(int info, int sa)
{
pxAssert(!(info & PROCESS_EE_XMM));
xMOV(rax, ptr[&cpuRegs.GPR.r[_Rt_].UD[0]]);
recMoveTtoD64(info);
if (sa != 0)
xSAR(rax, sa);
xMOV(ptr[&cpuRegs.GPR.r[_Rd_].UD[0]], rax);
xSAR(xRegister64(EEREC_D), sa);
}
void recDSRA_(int info)
static void recDSRA_(int info)
{
recDSRAs_(info, _Sa_);
}
EERECOMPILE_CODEX(eeRecompileCode2, DSRA);
EERECOMPILE_CODEX(eeRecompileCodeRC2, DSRA, XMMINFO_WRITED | XMMINFO_READT | XMMINFO_64BITOP);
///// DSLL32
void recDSLL32_const()
static void recDSLL32_const()
{
g_cpuConstRegs[_Rd_].UD[0] = (u64)(g_cpuConstRegs[_Rt_].UD[0] << (_Sa_ + 32));
}
void recDSLL32s_(int info, int sa)
static void recDSLL32_(int info)
{
pxAssert(!(info & PROCESS_EE_XMM));
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rt_].UL[0]]);
xSHL(rax, sa + 32);
xMOV(ptr[&cpuRegs.GPR.r[_Rd_].UD[0]], rax);
recDSLLs_(info, _Sa_ + 32);
}
void recDSLL32_(int info)
{
recDSLL32s_(info, _Sa_);
}
EERECOMPILE_CODEX(eeRecompileCode2, DSLL32);
EERECOMPILE_CODEX(eeRecompileCodeRC2, DSLL32, XMMINFO_WRITED | XMMINFO_READT | XMMINFO_64BITOP);
//// DSRL32
void recDSRL32_const()
static void recDSRL32_const()
{
g_cpuConstRegs[_Rd_].UD[0] = (u64)(g_cpuConstRegs[_Rt_].UD[0] >> (_Sa_ + 32));
}
void recDSRL32s_(int info, int sa)
static void recDSRL32_(int info)
{
pxAssert(!(info & PROCESS_EE_XMM));
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rt_].UL[1]]);
if (sa != 0)
xSHR(eax, sa);
xMOV(ptr[&cpuRegs.GPR.r[_Rd_].UD[0]], rax);
recDSRLs_(info, _Sa_ + 32);
}
void recDSRL32_(int info)
{
recDSRL32s_(info, _Sa_);
}
EERECOMPILE_CODEX(eeRecompileCode2, DSRL32);
EERECOMPILE_CODEX(eeRecompileCodeRC2, DSRL32, XMMINFO_WRITED | XMMINFO_READT);
//// DSRA32
void recDSRA32_const()
static void recDSRA32_const()
{
g_cpuConstRegs[_Rd_].SD[0] = (u64)(g_cpuConstRegs[_Rt_].SD[0] >> (_Sa_ + 32));
}
void recDSRA32s_(int info, int sa)
static void recDSRA32_(int info)
{
recDSRAs_(info, sa + 32);
recDSRAs_(info, _Sa_ + 32);
}
void recDSRA32_(int info)
{
recDSRA32s_(info, _Sa_);
}
EERECOMPILE_CODEX(eeRecompileCode2, DSRA32);
EERECOMPILE_CODEX(eeRecompileCodeRC2, DSRA32, XMMINFO_WRITED | XMMINFO_READT | XMMINFO_64BITOP);
/*********************************************************
* Shift arithmetic with variant register shift *
* Format: OP rd, rt, rs *
*********************************************************/
static void recShiftV_constt(const xImpl_Group2& shift)
static void recShiftV_constt(int info, const xImpl_Group2& shift)
{
xMOV(ecx, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);
xMOV(eax, g_cpuConstRegs[_Rt_].UL[0]);
shift(eax, cl);
eeSignExtendTo(_Rd_);
pxAssert(_Rs_ != 0);
recMoveSToRCX(info);
xMOV(xRegister32(EEREC_D), g_cpuConstRegs[_Rt_].UL[0]);
shift(xRegister32(EEREC_D), cl);
xMOVSX(xRegister64(EEREC_D), xRegister32(EEREC_D));
}
static void recShiftV(const xImpl_Group2& shift)
static void recShiftV(int info, const xImpl_Group2& shift)
{
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rt_].UL[0]]);
if (_Rs_ != 0)
{
xMOV(ecx, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);
shift(eax, cl);
}
eeSignExtendTo(_Rd_);
pxAssert(_Rs_ != 0);
recMoveSToRCX(info);
recMoveTtoD(info);
shift(xRegister32(EEREC_D), cl);
xMOVSX(xRegister64(EEREC_D), xRegister32(EEREC_D));
}
static void recDShiftV_constt(const xImpl_Group2& shift)
static void recDShiftV_constt(int info, const xImpl_Group2& shift)
{
xMOV(ecx, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);
xMOV64(rax, g_cpuConstRegs[_Rt_].UD[0]);
shift(rax, cl);
xMOV(ptr[&cpuRegs.GPR.r[_Rd_].UD[0]], rax);
pxAssert(_Rs_ != 0);
recMoveSToRCX(info);
xMOV64(xRegister64(EEREC_D), g_cpuConstRegs[_Rt_].SD[0]);
shift(xRegister64(EEREC_D), cl);
}
static void recDShiftV(const xImpl_Group2& shift)
static void recDShiftV(int info, const xImpl_Group2& shift)
{
xMOV(rax, ptr[&cpuRegs.GPR.r[_Rt_].UD[0]]);
if (_Rs_ != 0)
{
xMOV(ecx, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);
shift(rax, cl);
}
xMOV(ptr[&cpuRegs.GPR.r[_Rd_].UD[0]], rax);
pxAssert(_Rs_ != 0);
recMoveSToRCX(info);
recMoveTtoD64(info);
shift(xRegister64(EEREC_D), cl);
}
//// SLLV
void recSLLV_const()
static void recSLLV_const()
{
g_cpuConstRegs[_Rd_].SD[0] = (s32)(g_cpuConstRegs[_Rt_].UL[0] << (g_cpuConstRegs[_Rs_].UL[0] & 0x1f));
}
void recSLLV_consts(int info)
static void recSLLV_consts(int info)
{
recSLLs_(info, g_cpuConstRegs[_Rs_].UL[0] & 0x1f);
}
void recSLLV_constt(int info)
static void recSLLV_constt(int info)
{
recShiftV_constt(xSHL);
recShiftV_constt(info, xSHL);
}
void recSLLV_(int info)
static void recSLLV_(int info)
{
recShiftV(xSHL);
recShiftV(info, xSHL);
}
EERECOMPILE_CODE0(SLLV, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
EERECOMPILE_CODERC0(SLLV, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
//// SRLV
void recSRLV_const()
static void recSRLV_const()
{
g_cpuConstRegs[_Rd_].SD[0] = (s32)(g_cpuConstRegs[_Rt_].UL[0] >> (g_cpuConstRegs[_Rs_].UL[0] & 0x1f));
}
void recSRLV_consts(int info)
static void recSRLV_consts(int info)
{
recSRLs_(info, g_cpuConstRegs[_Rs_].UL[0] & 0x1f);
}
void recSRLV_constt(int info)
static void recSRLV_constt(int info)
{
recShiftV_constt(xSHR);
recShiftV_constt(info, xSHR);
}
void recSRLV_(int info)
static void recSRLV_(int info)
{
recShiftV(xSHR);
recShiftV(info, xSHR);
}
EERECOMPILE_CODE0(SRLV, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
EERECOMPILE_CODERC0(SRLV, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
//// SRAV
void recSRAV_const()
static void recSRAV_const()
{
g_cpuConstRegs[_Rd_].SD[0] = (s32)(g_cpuConstRegs[_Rt_].SL[0] >> (g_cpuConstRegs[_Rs_].UL[0] & 0x1f));
}
void recSRAV_consts(int info)
static void recSRAV_consts(int info)
{
recSRAs_(info, g_cpuConstRegs[_Rs_].UL[0] & 0x1f);
}
void recSRAV_constt(int info)
static void recSRAV_constt(int info)
{
recShiftV_constt(xSAR);
recShiftV_constt(info, xSAR);
}
void recSRAV_(int info)
static void recSRAV_(int info)
{
recShiftV(xSAR);
recShiftV(info, xSAR);
}
EERECOMPILE_CODE0(SRAV, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
EERECOMPILE_CODERC0(SRAV, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
//// DSLLV
void recDSLLV_const()
static void recDSLLV_const()
{
g_cpuConstRegs[_Rd_].UD[0] = (u64)(g_cpuConstRegs[_Rt_].UD[0] << (g_cpuConstRegs[_Rs_].UL[0] & 0x3f));
}
void recDSLLV_consts(int info)
static void recDSLLV_consts(int info)
{
int sa = g_cpuConstRegs[_Rs_].UL[0] & 0x3f;
if (sa < 32)
recDSLLs_(info, sa);
else
recDSLL32s_(info, sa - 32);
recDSLLs_(info, sa);
}
void recDSLLV_constt(int info)
static void recDSLLV_constt(int info)
{
recDShiftV_constt(xSHL);
recDShiftV_constt(info, xSHL);
}
void recDSLLV_(int info)
static void recDSLLV_(int info)
{
recDShiftV(xSHL);
recDShiftV(info, xSHL);
}
EERECOMPILE_CODE0(DSLLV, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
EERECOMPILE_CODERC0(DSLLV, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED | XMMINFO_64BITOP);
//// DSRLV
void recDSRLV_const()
static void recDSRLV_const()
{
g_cpuConstRegs[_Rd_].UD[0] = (u64)(g_cpuConstRegs[_Rt_].UD[0] >> (g_cpuConstRegs[_Rs_].UL[0] & 0x3f));
}
void recDSRLV_consts(int info)
static void recDSRLV_consts(int info)
{
int sa = g_cpuConstRegs[_Rs_].UL[0] & 0x3f;
if (sa < 32)
recDSRLs_(info, sa);
else
recDSRL32s_(info, sa - 32);
recDSRLs_(info, sa);
}
void recDSRLV_constt(int info)
static void recDSRLV_constt(int info)
{
recDShiftV_constt(xSHR);
recDShiftV_constt(info, xSHR);
}
void recDSRLV_(int info)
static void recDSRLV_(int info)
{
recDShiftV(xSHR);
recDShiftV(info, xSHR);
}
EERECOMPILE_CODE0(DSRLV, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
EERECOMPILE_CODERC0(DSRLV, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED | XMMINFO_64BITOP);
//// DSRAV
void recDSRAV_const()
static void recDSRAV_const()
{
g_cpuConstRegs[_Rd_].SD[0] = (s64)(g_cpuConstRegs[_Rt_].SD[0] >> (g_cpuConstRegs[_Rs_].UL[0] & 0x3f));
}
void recDSRAV_consts(int info)
static void recDSRAV_consts(int info)
{
int sa = g_cpuConstRegs[_Rs_].UL[0] & 0x3f;
if (sa < 32)
recDSRAs_(info, sa);
else
recDSRA32s_(info, sa - 32);
recDSRAs_(info, sa);
}
void recDSRAV_constt(int info)
static void recDSRAV_constt(int info)
{
recDShiftV_constt(xSAR);
recDShiftV_constt(info, xSAR);
}
void recDSRAV_(int info)
static void recDSRAV_(int info)
{
recDShiftV(xSAR);
recDShiftV(info, xSAR);
}
EERECOMPILE_CODE0(DSRAV, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED);
EERECOMPILE_CODERC0(DSRAV, XMMINFO_READS | XMMINFO_READT | XMMINFO_WRITED | XMMINFO_64BITOP);
#endif
} // namespace OpcodeImpl
} // namespace Dynarec
} // namespace R5900
} // namespace R5900::Dynarec::OpcodeImpl

View File

@ -47,527 +47,240 @@ void _deleteEEreg(int reg, int flush)
_flushConstReg(reg);
}
GPR_DEL_CONST(reg);
_deleteGPRtoXMMreg(reg, flush ? 0 : 2);
_deleteGPRtoXMMreg(reg, flush ? DELETE_REG_FREE : DELETE_REG_FLUSH_AND_FREE);
_deleteGPRtoX86reg(reg, flush ? DELETE_REG_FREE : DELETE_REG_FLUSH_AND_FREE);
}
void _deleteEEreg128(int reg)
{
if (!reg)
return;
GPR_DEL_CONST(reg);
_deleteGPRtoXMMreg(reg, DELETE_REG_FREE_NO_WRITEBACK);
_deleteGPRtoX86reg(reg, DELETE_REG_FREE_NO_WRITEBACK);
}
void _flushEEreg(int reg, bool clear)
{
if (!reg)
return;
if (GPR_IS_CONST1(reg))
{
if (GPR_IS_DIRTY_CONST(reg))
_flushConstReg(reg);
return;
}
_deleteGPRtoXMMreg(reg, clear ? 2 : 1);
if (clear)
GPR_DEL_CONST(reg);
_deleteGPRtoXMMreg(reg, clear ? DELETE_REG_FLUSH_AND_FREE : DELETE_REG_FLUSH);
_deleteGPRtoX86reg(reg, clear ? DELETE_REG_FLUSH_AND_FREE : DELETE_REG_FLUSH);
}
int eeProcessHILO(int reg, int mode, int mmx)
int _eeTryRenameReg(int to, int from, int fromx86, int other, int xmminfo)
{
if (_hasFreeXMMreg() || !(g_pCurInstInfo->regs[reg] & EEINST_LASTUSE))
{
return _allocGPRtoXMMreg(-1, reg, mode);
}
// can't rename when in form Rd = Rs op Rt and Rd == Rs or Rd == Rt
if ((xmminfo & XMMINFO_NORENAME) || fromx86 < 0 || to == from || to == other || !EEINST_RENAMETEST(from))
return -1;
return -1;
RALOG("Renaming %s to %s\n", R3000A::disRNameGPR[from], R3000A::disRNameGPR[to]);
// flush back when it's been modified
if (x86regs[fromx86].mode & MODE_WRITE && EEINST_LIVETEST(from))
_writebackX86Reg(fromx86);
// remove all references to renamed-to register
_deleteGPRtoX86reg(to, DELETE_REG_FREE_NO_WRITEBACK);
_deleteGPRtoXMMreg(to, DELETE_REG_FLUSH_AND_FREE);
GPR_DEL_CONST(to);
// and do the actual rename, new register has been modified.
x86regs[fromx86].reg = to;
x86regs[fromx86].mode |= MODE_READ | MODE_WRITE;
return fromx86;
}
// Strangely this code is used on NOT-MMX path ...
#define PROCESS_EE_SETMODES(mmreg) (/*(mmxregs[mmreg].mode&MODE_WRITE)*/ false ? PROCESS_EE_MODEWRITES : 0)
#define PROCESS_EE_SETMODET(mmreg) (/*(mmxregs[mmreg].mode&MODE_WRITE)*/ false ? PROCESS_EE_MODEWRITET : 0)
// ignores XMMINFO_READS, XMMINFO_READT, and XMMINFO_READD_LO from xmminfo
// core of reg caching
void eeRecompileCode0(R5900FNPTR constcode, R5900FNPTR_INFO constscode, R5900FNPTR_INFO consttcode, R5900FNPTR_INFO noconstcode, int xmminfo)
static bool FitsInImmediate(int reg, int fprinfo)
{
if (fprinfo & XMMINFO_64BITOP)
return (s32)g_cpuConstRegs[reg].SD[0] == g_cpuConstRegs[reg].SD[0];
else
return true; // all 32bit ops fit
}
void eeRecompileCodeRC0(R5900FNPTR constcode, R5900FNPTR_INFO constscode, R5900FNPTR_INFO consttcode, R5900FNPTR_INFO noconstcode, int xmminfo)
{
if (!_Rd_ && (xmminfo & XMMINFO_WRITED))
return;
if (GPR_IS_CONST2(_Rs_, _Rt_))
{
if (xmminfo & XMMINFO_WRITED)
if (_Rd_ && (xmminfo & XMMINFO_WRITED))
{
_deleteGPRtoXMMreg(_Rd_, 2);
}
if (xmminfo & XMMINFO_WRITED)
_deleteGPRtoX86reg(_Rd_, DELETE_REG_FREE_NO_WRITEBACK);
_deleteGPRtoXMMreg(_Rd_, DELETE_REG_FLUSH_AND_FREE);
GPR_SET_CONST(_Rd_);
}
constcode();
return;
}
const int moded = MODE_WRITE | ((xmminfo & XMMINFO_READD) ? MODE_READ : 0);
// test if should write xmm, mirror to mmx code
if (g_pCurInstInfo->info & EEINST_XMM)
{
int mmreg1, mmreg3, mmtemp;
pxAssert(0);
// this function should not be used for lo/hi.
pxAssert(!(xmminfo & (XMMINFO_READLO | XMMINFO_READHI | XMMINFO_WRITELO | XMMINFO_WRITEHI)));
if (xmminfo & (XMMINFO_READLO | XMMINFO_WRITELO))
_addNeededGPRtoXMMreg(XMMGPR_LO);
if (xmminfo & (XMMINFO_READHI | XMMINFO_WRITEHI))
_addNeededGPRtoXMMreg(XMMGPR_HI);
_addNeededGPRtoXMMreg(_Rs_);
_addNeededGPRtoXMMreg(_Rt_);
if (GPR_IS_CONST1(_Rs_) || GPR_IS_CONST1(_Rt_))
{
u32 creg = GPR_IS_CONST1(_Rs_) ? _Rs_ : _Rt_;
int vreg = creg == _Rs_ ? _Rt_ : _Rs_;
// if (g_pCurInstInfo->regs[vreg] & EEINST_XMM)
// {
// mmreg1 = _allocGPRtoXMMreg(-1, vreg, MODE_READ);
// _addNeededGPRtoXMMreg(vreg);
// }
mmreg1 = _allocCheckGPRtoXMM(g_pCurInstInfo, vreg, MODE_READ);
if (mmreg1 >= 0)
{
int info = PROCESS_EE_XMM;
if (GPR_IS_CONST1(_Rs_))
info |= PROCESS_EE_SETMODET(mmreg1);
else
info |= PROCESS_EE_SETMODES(mmreg1);
if (xmminfo & XMMINFO_WRITED)
{
_addNeededGPRtoXMMreg(_Rd_);
mmreg3 = _checkXMMreg(XMMTYPE_GPRREG, _Rd_, MODE_WRITE);
if (!(xmminfo & XMMINFO_READD) && mmreg3 < 0 && ((g_pCurInstInfo->regs[vreg] & EEINST_LASTUSE) || !EEINST_ISLIVEXMM(vreg)))
{
_freeXMMreg(mmreg1);
if (GPR_IS_CONST1(_Rs_))
info &= ~PROCESS_EE_MODEWRITET;
else
info &= ~PROCESS_EE_MODEWRITES;
xmmregs[mmreg1].inuse = 1;
xmmregs[mmreg1].reg = _Rd_;
xmmregs[mmreg1].mode = moded;
mmreg3 = mmreg1;
}
else if (mmreg3 < 0)
mmreg3 = _allocGPRtoXMMreg(-1, _Rd_, moded);
info |= PROCESS_EE_SET_D(mmreg3);
}
if (xmminfo & (XMMINFO_READLO | XMMINFO_WRITELO))
{
mmtemp = eeProcessHILO(XMMGPR_LO, ((xmminfo & XMMINFO_READLO) ? MODE_READ : 0) | ((xmminfo & XMMINFO_WRITELO) ? MODE_WRITE : 0), 0);
if (mmtemp >= 0)
info |= PROCESS_EE_SET_LO(mmtemp);
}
if (xmminfo & (XMMINFO_READHI | XMMINFO_WRITEHI))
{
mmtemp = eeProcessHILO(XMMGPR_HI, ((xmminfo & XMMINFO_READLO) ? MODE_READ : 0) | ((xmminfo & XMMINFO_WRITELO) ? MODE_WRITE : 0), 0);
if (mmtemp >= 0)
info |= PROCESS_EE_SET_HI(mmtemp);
}
if (creg == _Rs_)
constscode(info | PROCESS_EE_SET_T(mmreg1));
else
consttcode(info | PROCESS_EE_SET_S(mmreg1));
_clearNeededXMMregs();
if (xmminfo & XMMINFO_WRITED)
GPR_DEL_CONST(_Rd_);
return;
}
}
else
{
// no const regs
mmreg1 = _allocCheckGPRtoXMM(g_pCurInstInfo, _Rs_, MODE_READ);
int mmreg2 = _allocCheckGPRtoXMM(g_pCurInstInfo, _Rt_, MODE_READ);
if (mmreg1 >= 0 || mmreg2 >= 0)
{
int info = PROCESS_EE_XMM;
// do it all in xmm
if (mmreg1 < 0)
mmreg1 = _allocGPRtoXMMreg(-1, _Rs_, MODE_READ);
if (mmreg2 < 0)
mmreg2 = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ);
info |= PROCESS_EE_SETMODES(mmreg1) | PROCESS_EE_SETMODET(mmreg2);
if (xmminfo & XMMINFO_WRITED)
{
// check for last used, if so don't alloc a new XMM reg
_addNeededGPRtoXMMreg(_Rd_);
mmreg3 = _checkXMMreg(XMMTYPE_GPRREG, _Rd_, moded);
if (mmreg3 < 0)
{
if (!(xmminfo & XMMINFO_READD) && ((g_pCurInstInfo->regs[_Rt_] & EEINST_LASTUSE) || !EEINST_ISLIVEXMM(_Rt_)))
{
_freeXMMreg(mmreg2);
info &= ~PROCESS_EE_MODEWRITET;
xmmregs[mmreg2].inuse = 1;
xmmregs[mmreg2].reg = _Rd_;
xmmregs[mmreg2].mode = moded;
mmreg3 = mmreg2;
}
else if (!(xmminfo & XMMINFO_READD) && ((g_pCurInstInfo->regs[_Rs_] & EEINST_LASTUSE) || !EEINST_ISLIVEXMM(_Rs_)))
{
_freeXMMreg(mmreg1);
info &= ~PROCESS_EE_MODEWRITES;
xmmregs[mmreg1].inuse = 1;
xmmregs[mmreg1].reg = _Rd_;
xmmregs[mmreg1].mode = moded;
mmreg3 = mmreg1;
}
else
mmreg3 = _allocGPRtoXMMreg(-1, _Rd_, moded);
}
info |= PROCESS_EE_SET_D(mmreg3);
}
if (xmminfo & (XMMINFO_READLO | XMMINFO_WRITELO))
{
mmtemp = eeProcessHILO(XMMGPR_LO, ((xmminfo & XMMINFO_READLO) ? MODE_READ : 0) | ((xmminfo & XMMINFO_WRITELO) ? MODE_WRITE : 0), 0);
if (mmtemp >= 0)
info |= PROCESS_EE_SET_LO(mmtemp);
}
if (xmminfo & (XMMINFO_READHI | XMMINFO_WRITEHI))
{
mmtemp = eeProcessHILO(XMMGPR_HI, ((xmminfo & XMMINFO_READLO) ? MODE_READ : 0) | ((xmminfo & XMMINFO_WRITELO) ? MODE_WRITE : 0), 0);
if (mmtemp >= 0)
info |= PROCESS_EE_SET_HI(mmtemp);
}
noconstcode(info | PROCESS_EE_SET_S(mmreg1) | PROCESS_EE_SET_T(mmreg2));
_clearNeededXMMregs();
if (xmminfo & XMMINFO_WRITED)
GPR_DEL_CONST(_Rd_);
return;
}
}
_clearNeededXMMregs();
}
// we have to put these up here, because the register allocator below will wipe out const flags
// for the destination register when/if it switches it to write mode.
const bool s_is_const = GPR_IS_CONST1(_Rs_);
const bool t_is_const = GPR_IS_CONST1(_Rt_);
const bool d_is_const = GPR_IS_CONST1(_Rd_);
const bool s_is_used = EEINST_USEDTEST(_Rs_);
const bool t_is_used = EEINST_USEDTEST(_Rt_);
const bool s_in_xmm = _hasXMMreg(XMMTYPE_GPRREG, _Rs_);
const bool t_in_xmm = _hasXMMreg(XMMTYPE_GPRREG, _Rt_);
// regular x86
_deleteGPRtoXMMreg(_Rs_, 1);
_deleteGPRtoXMMreg(_Rt_, 1);
if (xmminfo & XMMINFO_WRITED)
_deleteGPRtoXMMreg(_Rd_, (xmminfo & XMMINFO_READD) ? 0 : 2);
if ((xmminfo & XMMINFO_READS) && !s_is_const)
_addNeededGPRtoX86reg(_Rs_);
if ((xmminfo & XMMINFO_READT) && !t_is_const)
_addNeededGPRtoX86reg(_Rt_);
if ((xmminfo & XMMINFO_READD) && !d_is_const)
_addNeededGPRtoX86reg(_Rd_);
// don't delete, fn will take care of them
// if (xmminfo & (XMMINFO_READLO|XMMINFO_WRITELO))
// {
// _deleteGPRtoXMMreg(XMMGPR_LO, (xmminfo & XMMINFO_READLO) ? 1 : 0);
// }
// if (xmminfo & (XMMINFO_READHI|XMMINFO_WRITEHI))
// {
// _deleteGPRtoXMMreg(XMMGPR_HI, (xmminfo & XMMINFO_READHI) ? 1 : 0);
// }
if (GPR_IS_CONST1(_Rs_))
// when it doesn't fit in an immediate, we'll flush it to a reg early to save code
u32 info = 0;
int regs = -1, regt = -1, regd = -1;
if (xmminfo & XMMINFO_READS)
{
constscode(0);
if (xmminfo & XMMINFO_WRITED)
GPR_DEL_CONST(_Rd_);
return;
regs = _checkX86reg(X86TYPE_GPR, _Rs_, MODE_READ);
if (regs < 0 && (!s_is_const || !FitsInImmediate(_Rs_, xmminfo)) && (s_is_used || s_in_xmm || ((xmminfo & XMMINFO_WRITED) && _Rd_ == _Rs_) || (xmminfo & XMMINFO_FORCEREGS)))
{
regs = _allocX86reg(X86TYPE_GPR, _Rs_, MODE_READ);
}
if (regs >= 0)
info |= PROCESS_EE_SET_S(regs);
}
if (GPR_IS_CONST1(_Rt_))
if (xmminfo & XMMINFO_READT)
{
consttcode(0);
if (xmminfo & XMMINFO_WRITED)
GPR_DEL_CONST(_Rd_);
return;
regt = _checkX86reg(X86TYPE_GPR, _Rt_, MODE_READ);
if (regt < 0 && (!t_is_const || !FitsInImmediate(_Rt_, xmminfo)) && (t_is_used || t_in_xmm || ((xmminfo & XMMINFO_WRITED) && _Rd_ == _Rt_) || (xmminfo & XMMINFO_FORCEREGT)))
{
regt = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_READ);
}
if (regt >= 0)
info |= PROCESS_EE_SET_T(regt);
}
if (xmminfo & (XMMINFO_WRITED | XMMINFO_READD))
{
// _eeTryRenameReg() sets READ | WRITE already, so this is only needed when allocating.
const int moded = ((xmminfo & XMMINFO_WRITED) ? MODE_WRITE : 0) | ((xmminfo & XMMINFO_READD) ? MODE_READ : 0);
// If S is no longer live, swap D for S. Saves the move.
int regd = (_Rd_ && xmminfo & XMMINFO_WRITED) ? _eeTryRenameReg(_Rd_, (xmminfo & XMMINFO_READS) ? _Rs_ : 0, regs, (xmminfo & XMMINFO_READT) ? _Rt_ : 0, xmminfo) : 0;
if (regd < 0)
regd = _allocX86reg(X86TYPE_GPR, _Rd_, moded);
pxAssert(regd >= 0);
info |= PROCESS_EE_SET_D(regd);
}
noconstcode(0);
if (xmminfo & XMMINFO_WRITED)
GPR_DEL_CONST(_Rd_);
_validateRegs();
if (s_is_const && regs < 0)
{
constscode(info /*| PROCESS_CONSTS*/);
return;
}
if (t_is_const && regt < 0)
{
consttcode(info /*| PROCESS_CONSTT*/);
return;
}
noconstcode(info);
}
// rt = rs op imm16
void eeRecompileCode1(R5900FNPTR constcode, R5900FNPTR_INFO noconstcode)
void eeRecompileCodeRC1(R5900FNPTR constcode, R5900FNPTR_INFO noconstcode, int xmminfo)
{
pxAssert((xmminfo & (XMMINFO_READS | XMMINFO_WRITET)) == (XMMINFO_READS | XMMINFO_WRITET));
if (!_Rt_)
return;
if (GPR_IS_CONST1(_Rs_))
{
_deleteGPRtoXMMreg(_Rt_, 2);
_deleteGPRtoXMMreg(_Rt_, DELETE_REG_FLUSH_AND_FREE);
_deleteGPRtoX86reg(_Rt_, DELETE_REG_FREE_NO_WRITEBACK);
GPR_SET_CONST(_Rt_);
constcode();
return;
}
// test if should write xmm, mirror to mmx code
if (g_pCurInstInfo->info & EEINST_XMM)
{
pxAssert(0);
const bool s_is_used = EEINST_USEDTEST(_Rs_);
const bool s_in_xmm = _hasXMMreg(XMMTYPE_GPRREG, _Rs_);
// no const regs
const int mmreg1 = _allocCheckGPRtoXMM(g_pCurInstInfo, _Rs_, MODE_READ);
u32 info = 0;
int regs = _checkX86reg(X86TYPE_GPR, _Rs_, MODE_READ);
if (regs < 0 && (s_is_used || s_in_xmm || _Rt_ == _Rs_ || (xmminfo & XMMINFO_FORCEREGS)))
regs = _allocX86reg(X86TYPE_GPR, _Rs_, MODE_READ);
if (regs >= 0)
info |= PROCESS_EE_SET_S(regs);
if (mmreg1 >= 0)
{
int info = PROCESS_EE_XMM | PROCESS_EE_SETMODES(mmreg1);
// If S is no longer live, swap D for S. Saves the move.
int regt = _eeTryRenameReg(_Rt_, _Rs_, regs, 0, xmminfo);
if (regt < 0)
regt = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_WRITE);
// check for last used, if so don't alloc a new XMM reg
_addNeededGPRtoXMMreg(_Rt_);
int mmreg2 = _checkXMMreg(XMMTYPE_GPRREG, _Rt_, MODE_WRITE);
info |= PROCESS_EE_SET_T(regt);
_validateRegs();
if (mmreg2 < 0)
{
if ((g_pCurInstInfo->regs[_Rs_] & EEINST_LASTUSE) || !EEINST_ISLIVEXMM(_Rs_))
{
_freeXMMreg(mmreg1);
info &= ~PROCESS_EE_MODEWRITES;
xmmregs[mmreg1].inuse = 1;
xmmregs[mmreg1].reg = _Rt_;
xmmregs[mmreg1].mode = MODE_WRITE | MODE_READ;
mmreg2 = mmreg1;
}
else
mmreg2 = _allocGPRtoXMMreg(-1, _Rt_, MODE_WRITE);
}
noconstcode(info | PROCESS_EE_SET_S(mmreg1) | PROCESS_EE_SET_T(mmreg2));
_clearNeededXMMregs();
GPR_DEL_CONST(_Rt_);
return;
}
_clearNeededXMMregs();
}
// regular x86
_deleteGPRtoXMMreg(_Rs_, 1);
_deleteGPRtoXMMreg(_Rt_, 2);
noconstcode(0);
GPR_DEL_CONST(_Rt_);
noconstcode(info);
}
// rd = rt op sa
void eeRecompileCode2(R5900FNPTR constcode, R5900FNPTR_INFO noconstcode)
void eeRecompileCodeRC2(R5900FNPTR constcode, R5900FNPTR_INFO noconstcode, int xmminfo)
{
pxAssert((xmminfo & (XMMINFO_READT | XMMINFO_WRITED)) == (XMMINFO_READT | XMMINFO_WRITED));
if (!_Rd_)
return;
if (GPR_IS_CONST1(_Rt_))
{
_deleteGPRtoXMMreg(_Rd_, 2);
_deleteGPRtoXMMreg(_Rd_, DELETE_REG_FLUSH_AND_FREE);
_deleteGPRtoX86reg(_Rd_, DELETE_REG_FREE_NO_WRITEBACK);
GPR_SET_CONST(_Rd_);
constcode();
return;
}
// test if should write xmm, mirror to mmx code
if (g_pCurInstInfo->info & EEINST_XMM)
{
pxAssert(0);
const bool t_is_used = EEINST_USEDTEST(_Rt_);
const bool t_in_xmm = _hasXMMreg(XMMTYPE_GPRREG, _Rt_);
// no const regs
const int mmreg1 = _allocCheckGPRtoXMM(g_pCurInstInfo, _Rt_, MODE_READ);
u32 info = 0;
int regt = _checkX86reg(X86TYPE_GPR, _Rt_, MODE_READ);
if (regt < 0 && (t_is_used || t_in_xmm || (_Rd_ == _Rt_) || (xmminfo & XMMINFO_FORCEREGT)))
regt = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_READ);
if (regt >= 0)
info |= PROCESS_EE_SET_T(regt);
if (mmreg1 >= 0)
{
int info = PROCESS_EE_XMM | PROCESS_EE_SETMODET(mmreg1);
// If S is no longer live, swap D for T. Saves the move.
int regd = _eeTryRenameReg(_Rd_, _Rt_, regt, 0, xmminfo);
if (regd < 0)
regd = _allocX86reg(X86TYPE_GPR, _Rd_, MODE_WRITE);
// check for last used, if so don't alloc a new XMM reg
_addNeededGPRtoXMMreg(_Rd_);
int mmreg2 = _checkXMMreg(XMMTYPE_GPRREG, _Rd_, MODE_WRITE);
info |= PROCESS_EE_SET_D(regd);
_validateRegs();
if (mmreg2 < 0)
{
if ((g_pCurInstInfo->regs[_Rt_] & EEINST_LASTUSE) || !EEINST_ISLIVE64(_Rt_))
{
_freeXMMreg(mmreg1);
info &= ~PROCESS_EE_MODEWRITET;
xmmregs[mmreg1].inuse = 1;
xmmregs[mmreg1].reg = _Rd_;
xmmregs[mmreg1].mode = MODE_WRITE | MODE_READ;
mmreg2 = mmreg1;
}
else
mmreg2 = _allocGPRtoXMMreg(-1, _Rd_, MODE_WRITE);
}
noconstcode(info | PROCESS_EE_SET_T(mmreg1) | PROCESS_EE_SET_D(mmreg2));
_clearNeededXMMregs();
GPR_DEL_CONST(_Rd_);
return;
}
_clearNeededXMMregs();
}
// regular x86
_deleteGPRtoXMMreg(_Rt_, 1);
_deleteGPRtoXMMreg(_Rd_, 2);
noconstcode(0);
GPR_DEL_CONST(_Rd_);
}
// rt op rs
void eeRecompileCode3(R5900FNPTR constcode, R5900FNPTR_INFO multicode)
{
pxFail("Unfinished code reached.");
// for now, don't support xmm
_deleteEEreg(_Rs_, 0);
_deleteEEreg(_Rt_, 1);
if (GPR_IS_CONST2(_Rs_, _Rt_))
{
constcode();
return;
}
if (GPR_IS_CONST1(_Rs_))
{
//multicode(PROCESS_EE_CONSTT);
return;
}
if (GPR_IS_CONST1(_Rt_))
{
//multicode(PROCESS_EE_CONSTT);
return;
}
multicode(0);
}
// Simple Code Templates //
// rd = rs op rt
void eeRecompileCodeConst0(R5900FNPTR constcode, R5900FNPTR_INFO constscode, R5900FNPTR_INFO consttcode, R5900FNPTR_INFO noconstcode)
{
if (!_Rd_)
return;
// for now, don't support xmm
_deleteGPRtoXMMreg(_Rs_, 1);
_deleteGPRtoXMMreg(_Rt_, 1);
_deleteGPRtoXMMreg(_Rd_, 0);
if (GPR_IS_CONST2(_Rs_, _Rt_))
{
GPR_SET_CONST(_Rd_);
constcode();
return;
}
if (GPR_IS_CONST1(_Rs_))
{
constscode(0);
GPR_DEL_CONST(_Rd_);
return;
}
if (GPR_IS_CONST1(_Rt_))
{
consttcode(0);
GPR_DEL_CONST(_Rd_);
return;
}
noconstcode(0);
GPR_DEL_CONST(_Rd_);
}
// rt = rs op imm16
void eeRecompileCodeConst1(R5900FNPTR constcode, R5900FNPTR_INFO noconstcode)
{
if (!_Rt_)
return;
// for now, don't support xmm
_deleteGPRtoXMMreg(_Rs_, 1);
_deleteGPRtoXMMreg(_Rt_, 0);
if (GPR_IS_CONST1(_Rs_))
{
GPR_SET_CONST(_Rt_);
constcode();
return;
}
noconstcode(0);
GPR_DEL_CONST(_Rt_);
}
// rd = rt op sa
void eeRecompileCodeConst2(R5900FNPTR constcode, R5900FNPTR_INFO noconstcode)
{
if (!_Rd_)
return;
// for now, don't support xmm
_deleteGPRtoXMMreg(_Rt_, 1);
_deleteGPRtoXMMreg(_Rd_, 0);
if (GPR_IS_CONST1(_Rt_))
{
GPR_SET_CONST(_Rd_);
constcode();
return;
}
noconstcode(0);
GPR_DEL_CONST(_Rd_);
}
// rd = rt MULT rs (SPECIAL)
void eeRecompileCodeConstSPECIAL(R5900FNPTR constcode, R5900FNPTR_INFO multicode, int MULT)
{
pxFail("Unfinished code reached.");
// for now, don't support xmm
if (MULT)
{
_deleteGPRtoXMMreg(_Rd_, 0);
}
_deleteGPRtoXMMreg(_Rs_, 1);
_deleteGPRtoXMMreg(_Rt_, 1);
if (GPR_IS_CONST2(_Rs_, _Rt_))
{
if (MULT && _Rd_)
GPR_SET_CONST(_Rd_);
constcode();
return;
}
if (GPR_IS_CONST1(_Rs_))
{
//multicode(PROCESS_EE_CONSTS);
if (MULT && _Rd_)
GPR_DEL_CONST(_Rd_);
return;
}
if (GPR_IS_CONST1(_Rt_))
{
//multicode(PROCESS_EE_CONSTT);
if (MULT && _Rd_)
GPR_DEL_CONST(_Rd_);
return;
}
multicode(0);
if (MULT && _Rd_)
GPR_DEL_CONST(_Rd_);
noconstcode(info);
}
// EE XMM allocation code
@ -575,40 +288,11 @@ int eeRecompileCodeXMM(int xmminfo)
{
int info = PROCESS_EE_XMM;
// flush consts
if (xmminfo & XMMINFO_READT)
{
if (GPR_IS_CONST1(_Rt_) && !(g_cpuFlushedConstReg & (1 << _Rt_)))
{
xMOV(ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]], g_cpuConstRegs[_Rt_].UL[0]);
xMOV(ptr32[&cpuRegs.GPR.r[_Rt_].UL[1]], g_cpuConstRegs[_Rt_].UL[1]);
g_cpuFlushedConstReg |= (1 << _Rt_);
}
}
if (xmminfo & XMMINFO_READS)
{
if (GPR_IS_CONST1(_Rs_) && !(g_cpuFlushedConstReg & (1 << _Rs_)))
{
xMOV(ptr32[&cpuRegs.GPR.r[_Rs_].UL[0]], g_cpuConstRegs[_Rs_].UL[0]);
xMOV(ptr32[&cpuRegs.GPR.r[_Rs_].UL[1]], g_cpuConstRegs[_Rs_].UL[1]);
g_cpuFlushedConstReg |= (1 << _Rs_);
}
}
if (xmminfo & XMMINFO_WRITED)
{
GPR_DEL_CONST(_Rd_);
}
// add needed
if (xmminfo & (XMMINFO_READLO | XMMINFO_WRITELO))
{
_addNeededGPRtoXMMreg(XMMGPR_LO);
}
if (xmminfo & (XMMINFO_READHI | XMMINFO_WRITEHI))
{
_addNeededGPRtoXMMreg(XMMGPR_HI);
}
if (xmminfo & XMMINFO_READS)
_addNeededGPRtoXMMreg(_Rs_);
if (xmminfo & XMMINFO_READT)
@ -616,58 +300,59 @@ int eeRecompileCodeXMM(int xmminfo)
if (xmminfo & XMMINFO_WRITED)
_addNeededGPRtoXMMreg(_Rd_);
// allocate
// TODO: we could do memory operands here if not live. but the MMI implementations aren't hooked up to that at the moment.
if (xmminfo & XMMINFO_READS)
{
int reg = _allocGPRtoXMMreg(-1, _Rs_, MODE_READ);
info |= PROCESS_EE_SET_S(reg) | PROCESS_EE_SETMODES(reg);
const int reg = _allocGPRtoXMMreg(_Rs_, MODE_READ);
info |= PROCESS_EE_SET_S(reg);
}
if (xmminfo & XMMINFO_READT)
{
int reg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ);
info |= PROCESS_EE_SET_T(reg) | PROCESS_EE_SETMODET(reg);
const int reg = _allocGPRtoXMMreg(_Rt_, MODE_READ);
info |= PROCESS_EE_SET_T(reg);
}
if (xmminfo & XMMINFO_WRITED)
{
int readd = MODE_WRITE | ((xmminfo & XMMINFO_READD) ? ((xmminfo & XMMINFO_READD_LO) ? (MODE_READ | MODE_READHALF) : MODE_READ) : 0);
int readd = MODE_WRITE | ((xmminfo & XMMINFO_READD) ? MODE_READ : 0);
int regd = _checkXMMreg(XMMTYPE_GPRREG, _Rd_, readd);
if (regd < 0)
{
if (!(xmminfo & XMMINFO_READD) && (xmminfo & XMMINFO_READT) && (_Rt_ == 0 || (g_pCurInstInfo->regs[_Rt_] & EEINST_LASTUSE) || !EEINST_ISLIVEXMM(_Rt_)))
if (!(xmminfo & XMMINFO_READD) && (xmminfo & XMMINFO_READT) && EEINST_RENAMETEST(_Rt_))
{
_freeXMMreg(EEREC_T);
xmmregs[EEREC_T].inuse = 1;
xmmregs[EEREC_T].reg = _Rd_;
xmmregs[EEREC_T].mode = readd;
_deleteEEreg128(_Rd_);
_reallocateXMMreg(EEREC_T, XMMTYPE_GPRREG, _Rd_, readd, EEINST_LIVETEST(_Rt_));
regd = EEREC_T;
}
else if (!(xmminfo & XMMINFO_READD) && (xmminfo & XMMINFO_READS) && (_Rs_ == 0 || (g_pCurInstInfo->regs[_Rs_] & EEINST_LASTUSE) || !EEINST_ISLIVEXMM(_Rs_)))
else if (!(xmminfo & XMMINFO_READD) && (xmminfo & XMMINFO_READS) && EEINST_RENAMETEST(_Rs_))
{
_freeXMMreg(EEREC_S);
xmmregs[EEREC_S].inuse = 1;
xmmregs[EEREC_S].reg = _Rd_;
xmmregs[EEREC_S].mode = readd;
_deleteEEreg128(_Rd_);
_reallocateXMMreg(EEREC_S, XMMTYPE_GPRREG, _Rd_, readd, EEINST_LIVETEST(_Rs_));
regd = EEREC_S;
}
else
regd = _allocGPRtoXMMreg(-1, _Rd_, readd);
{
regd = _allocGPRtoXMMreg(_Rd_, readd);
}
}
info |= PROCESS_EE_SET_D(regd);
}
if (xmminfo & (XMMINFO_READLO | XMMINFO_WRITELO))
{
info |= PROCESS_EE_SET_LO(_allocGPRtoXMMreg(-1, XMMGPR_LO, ((xmminfo & XMMINFO_READLO) ? MODE_READ : 0) | ((xmminfo & XMMINFO_WRITELO) ? MODE_WRITE : 0)));
info |= PROCESS_EE_LO;
info |= PROCESS_EE_SET_LO(_allocGPRtoXMMreg(XMMGPR_LO, ((xmminfo & XMMINFO_READLO) ? MODE_READ : 0) | ((xmminfo & XMMINFO_WRITELO) ? MODE_WRITE : 0)));
}
if (xmminfo & (XMMINFO_READHI | XMMINFO_WRITEHI))
{
info |= PROCESS_EE_SET_HI(_allocGPRtoXMMreg(-1, XMMGPR_HI, ((xmminfo & XMMINFO_READHI) ? MODE_READ : 0) | ((xmminfo & XMMINFO_WRITEHI) ? MODE_WRITE : 0)));
info |= PROCESS_EE_HI;
info |= PROCESS_EE_SET_HI(_allocGPRtoXMMreg(XMMGPR_HI, ((xmminfo & XMMINFO_READHI) ? MODE_READ : 0) | ((xmminfo & XMMINFO_WRITEHI) ? MODE_WRITE : 0)));
}
if (xmminfo & XMMINFO_WRITED)
GPR_DEL_CONST(_Rd_);
_validateRegs();
return info;
}
@ -676,9 +361,6 @@ int eeRecompileCodeXMM(int xmminfo)
#define _Fs_ _Rd_
#define _Fd_ _Sa_
#define PROCESS_EE_SETMODES_XMM(mmreg) ((xmmregs[mmreg].mode & MODE_WRITE) ? PROCESS_EE_MODEWRITES : 0)
#define PROCESS_EE_SETMODET_XMM(mmreg) ((xmmregs[mmreg].mode & MODE_WRITE) ? PROCESS_EE_MODEWRITET : 0)
// rd = rs op rt
void eeFPURecompileCode(R5900FNPTR_INFO xmmcode, R5900FNPTR fpucode, int xmminfo)
{
@ -699,7 +381,7 @@ void eeFPURecompileCode(R5900FNPTR_INFO xmmcode, R5900FNPTR fpucode, int xmminfo
if (g_pCurInstInfo->fpuregs[_Ft_] & EEINST_LASTUSE)
mmregt = _checkXMMreg(XMMTYPE_FPREG, _Ft_, MODE_READ);
else
mmregt = _allocFPtoXMMreg(-1, _Ft_, MODE_READ);
mmregt = _allocFPtoXMMreg(_Ft_, MODE_READ);
}
if (xmminfo & XMMINFO_READS)
@ -709,26 +391,27 @@ void eeFPURecompileCode(R5900FNPTR_INFO xmmcode, R5900FNPTR fpucode, int xmminfo
mmregs = _checkXMMreg(XMMTYPE_FPREG, _Fs_, MODE_READ);
}
else
mmregs = _allocFPtoXMMreg(-1, _Fs_, MODE_READ);
}
{
mmregs = _allocFPtoXMMreg(_Fs_, MODE_READ);
if (mmregs >= 0)
info |= PROCESS_EE_SETMODES_XMM(mmregs);
if (mmregt >= 0)
info |= PROCESS_EE_SETMODET_XMM(mmregt);
// if we just allocated S and Fs == Ft, share it
if ((xmminfo & XMMINFO_READT) && _Fs_ == _Ft_)
mmregt = mmregs;
}
}
if (xmminfo & XMMINFO_READD)
{
pxAssert(xmminfo & XMMINFO_WRITED);
mmregd = _allocFPtoXMMreg(-1, _Fd_, MODE_READ);
mmregd = _allocFPtoXMMreg(_Fd_, MODE_READ);
}
if (xmminfo & XMMINFO_READACC)
{
if (!(xmminfo & XMMINFO_WRITEACC) && (g_pCurInstInfo->fpuregs[_Ft_] & EEINST_LASTUSE))
if (!(xmminfo & XMMINFO_WRITEACC) && (g_pCurInstInfo->fpuregs[XMMFPU_ACC] & EEINST_LASTUSE))
mmregacc = _checkXMMreg(XMMTYPE_FPACC, 0, MODE_READ);
else
mmregacc = _allocFPACCtoXMMreg(-1, MODE_READ);
mmregacc = _allocFPACCtoXMMreg(MODE_READ);
}
if (xmminfo & XMMINFO_WRITEACC)
@ -741,34 +424,28 @@ void eeFPURecompileCode(R5900FNPTR_INFO xmmcode, R5900FNPTR fpucode, int xmminfo
if (mmregacc < 0)
{
if ((xmminfo & XMMINFO_READT) && mmregt >= 0 && (FPUINST_LASTUSE(_Ft_) || !FPUINST_ISLIVE(_Ft_)))
if ((xmminfo & XMMINFO_READT) && mmregt >= 0 && FPUINST_RENAMETEST(_Ft_))
{
if (FPUINST_ISLIVE(_Ft_))
{
_freeXMMreg(mmregt);
info &= ~PROCESS_EE_MODEWRITET;
}
xmmregs[mmregt].inuse = 1;
if (EE_WRITE_DEAD_VALUES && xmmregs[mmregt].mode & MODE_WRITE)
_writebackXMMreg(mmregt);
xmmregs[mmregt].reg = 0;
xmmregs[mmregt].mode = readacc;
xmmregs[mmregt].type = XMMTYPE_FPACC;
mmregacc = mmregt;
}
else if ((xmminfo & XMMINFO_READS) && mmregs >= 0 && (FPUINST_LASTUSE(_Fs_) || !FPUINST_ISLIVE(_Fs_)))
else if ((xmminfo & XMMINFO_READS) && mmregs >= 0 && FPUINST_RENAMETEST(_Fs_))
{
if (FPUINST_ISLIVE(_Fs_))
{
_freeXMMreg(mmregs);
info &= ~PROCESS_EE_MODEWRITES;
}
xmmregs[mmregs].inuse = 1;
if (EE_WRITE_DEAD_VALUES && xmmregs[mmregs].mode & MODE_WRITE)
_writebackXMMreg(mmregs);
xmmregs[mmregs].reg = 0;
xmmregs[mmregs].mode = readacc;
xmmregs[mmregs].type = XMMTYPE_FPACC;
mmregacc = mmregs;
}
else
mmregacc = _allocFPACCtoXMMreg(-1, readacc);
mmregacc = _allocFPACCtoXMMreg(readacc);
}
xmmregs[mmregacc].mode |= MODE_WRITE;
@ -778,48 +455,43 @@ void eeFPURecompileCode(R5900FNPTR_INFO xmmcode, R5900FNPTR fpucode, int xmminfo
// check for last used, if so don't alloc a new XMM reg
int readd = MODE_WRITE | ((xmminfo & XMMINFO_READD) ? MODE_READ : 0);
if (xmminfo & XMMINFO_READD)
mmregd = _allocFPtoXMMreg(-1, _Fd_, readd);
mmregd = _allocFPtoXMMreg(_Fd_, readd);
else
mmregd = _checkXMMreg(XMMTYPE_FPREG, _Fd_, readd);
if (mmregd < 0)
{
if ((xmminfo & XMMINFO_READT) && mmregt >= 0 && (FPUINST_LASTUSE(_Ft_) || !FPUINST_ISLIVE(_Ft_)))
if ((xmminfo & XMMINFO_READT) && mmregt >= 0 && FPUINST_RENAMETEST(_Ft_))
{
if (FPUINST_ISLIVE(_Ft_))
{
_freeXMMreg(mmregt);
info &= ~PROCESS_EE_MODEWRITET;
}
xmmregs[mmregt].inuse = 1;
if (EE_WRITE_DEAD_VALUES && xmmregs[mmregt].mode & MODE_WRITE)
_writebackXMMreg(mmregt);
xmmregs[mmregt].reg = _Fd_;
xmmregs[mmregt].mode = readd;
mmregd = mmregt;
}
else if ((xmminfo & XMMINFO_READS) && mmregs >= 0 && (FPUINST_LASTUSE(_Fs_) || !FPUINST_ISLIVE(_Fs_)))
else if ((xmminfo & XMMINFO_READS) && mmregs >= 0 && FPUINST_RENAMETEST(_Fs_))
{
if (FPUINST_ISLIVE(_Fs_))
{
_freeXMMreg(mmregs);
info &= ~PROCESS_EE_MODEWRITES;
}
if (EE_WRITE_DEAD_VALUES && xmmregs[mmregs].mode & MODE_WRITE)
_writebackXMMreg(mmregs);
xmmregs[mmregs].inuse = 1;
xmmregs[mmregs].reg = _Fd_;
xmmregs[mmregs].mode = readd;
mmregd = mmregs;
}
else if ((xmminfo & XMMINFO_READACC) && mmregacc >= 0 && (FPUINST_LASTUSE(XMMFPU_ACC) || !FPUINST_ISLIVE(XMMFPU_ACC)))
else if ((xmminfo & XMMINFO_READACC) && mmregacc >= 0 && FPUINST_RENAMETEST(XMMFPU_ACC))
{
if (FPUINST_ISLIVE(XMMFPU_ACC))
_freeXMMreg(mmregacc);
xmmregs[mmregacc].inuse = 1;
if (EE_WRITE_DEAD_VALUES && xmmregs[mmregacc].mode & MODE_WRITE)
_writebackXMMreg(mmregacc);
xmmregs[mmregacc].reg = _Fd_;
xmmregs[mmregacc].mode = readd;
xmmregs[mmregacc].type = XMMTYPE_FPREG;
mmregd = mmregacc;
}
else
mmregd = _allocFPtoXMMreg(-1, _Fd_, readd);
mmregd = _allocFPtoXMMreg(_Fd_, readd);
}
}
@ -841,12 +513,12 @@ void eeFPURecompileCode(R5900FNPTR_INFO xmmcode, R5900FNPTR fpucode, int xmminfo
if (xmminfo & XMMINFO_READS)
{
if (mmregs >= 0)
info |= PROCESS_EE_SET_S(mmregs) | PROCESS_EE_S;
info |= PROCESS_EE_SET_S(mmregs);
}
if (xmminfo & XMMINFO_READT)
{
if (mmregt >= 0)
info |= PROCESS_EE_SET_T(mmregt) | PROCESS_EE_T;
info |= PROCESS_EE_SET_T(mmregt);
}
// at least one must be in xmm
@ -856,5 +528,4 @@ void eeFPURecompileCode(R5900FNPTR_INFO xmmcode, R5900FNPTR fpucode, int xmminfo
}
xmmcode(info);
_clearNeededXMMregs();
}

View File

@ -23,11 +23,36 @@
#include "iR5900.h"
#include "common/Perf.h"
//#define LOG_STORES
using namespace vtlb_private;
using namespace x86Emitter;
// we need enough for a 32-bit jump forwards (5 bytes)
static constexpr u32 LOADSTORE_PADDING = 5;
//#define LOG_STORES
// Builds a bitmask with one bit set for every host GPR currently holding a
// cached guest value, for recording alongside fastmem load/store sites.
static u32 GetAllocatedGPRBitmask()
{
	u32 inuse_mask = 0;
	for (u32 reg = 0; reg < iREGCNT_GPR; reg++)
		inuse_mask |= x86regs[reg].inuse ? (1u << reg) : 0u;
	return inuse_mask;
}
// Builds a bitmask with one bit set for every host XMM register currently in
// use by the register cache, for recording alongside fastmem load/store sites.
static u32 GetAllocatedXMMBitmask()
{
	u32 inuse_mask = 0;
	for (u32 reg = 0; reg < iREGCNT_XMM; reg++)
		inuse_mask |= xmmregs[reg].inuse ? (1u << reg) : 0u;
	return inuse_mask;
}
/*
// Pseudo-Code For the following Dynarec Implementations -->
@ -112,18 +137,39 @@ namespace vtlb_private
// Prepares eax, ecx, and, ebx for Direct or Indirect operations.
// Returns the writeback pointer for ebx (return address from indirect handling)
//
static u32* DynGen_PrepRegs()
static void DynGen_PrepRegs(int addr_reg, int value_reg, u32 sz, bool xmm)
{
// Warning dirty ebx (in case someone got the very bad idea to move this code)
EE::Profiler.EmitMem();
_freeX86reg(arg1regd);
xMOV(arg1regd, xRegister32(addr_reg));
if (value_reg >= 0)
{
if (sz == 128)
{
pxAssert(xmm);
_freeXMMreg(xRegisterSSE::GetArgRegister(1, 0).GetId());
xMOVAPS(xRegisterSSE::GetArgRegister(1, 0), xRegisterSSE::GetInstance(value_reg));
}
else if (xmm)
{
// 32bit xmms are passed in GPRs
pxAssert(sz == 32);
_freeX86reg(arg2regd);
xMOVD(arg2regd, xRegisterSSE(value_reg));
}
else
{
_freeX86reg(arg2regd);
xMOV(arg2reg, xRegister64(value_reg));
}
}
xMOV(eax, arg1regd);
xSHR(eax, VTLB_PAGE_BITS);
xMOV(rax, ptrNative[xComplexAddress(rbx, vtlbdata.vmap, rax * wordsize)]);
u32* writeback = xLEA_Writeback(rbx);
xMOV(rax, ptrNative[xComplexAddress(arg3reg, vtlbdata.vmap, rax * wordsize)]);
xADD(arg1reg, rax);
return writeback;
}
// ------------------------------------------------------------------------
@ -169,17 +215,14 @@ namespace vtlb_private
// ------------------------------------------------------------------------
static void DynGen_DirectWrite(u32 bits)
{
// TODO: x86Emitter can't use dil
switch (bits)
{
//8 , 16, 32 : data on EDX
case 8:
xMOV(edx, arg2regd);
xMOV(ptr[arg1reg], dl);
xMOV(ptr[arg1reg], xRegister8(arg2regd));
break;
case 16:
xMOV(ptr[arg1reg], xRegister16(arg2reg));
xMOV(ptr[arg1reg], xRegister16(arg2regd));
break;
case 32:
@ -229,7 +272,9 @@ static u8* GetIndirectDispatcherPtr(int mode, int operandsize, int sign = 0)
// Generates a JS instruction that targets the appropriate templated instance of
// the vtlb Indirect Dispatcher.
//
static void DynGen_IndirectDispatch(int mode, int bits, bool sign = false)
template <typename GenDirectFn>
static void DynGen_HandlerTest(const GenDirectFn& gen_direct, int mode, int bits, bool sign = false)
{
int szidx = 0;
switch (bits)
@ -241,7 +286,12 @@ static void DynGen_IndirectDispatch(int mode, int bits, bool sign = false)
case 128: szidx = 4; break;
jNO_DEFAULT;
}
xJS(GetIndirectDispatcherPtr(mode, szidx, sign));
xForwardJS8 to_handler;
gen_direct();
xForwardJump8 done;
to_handler.SetTarget();
xFastCall(GetIndirectDispatcherPtr(mode, szidx, sign));
done.SetTarget();
}
// ------------------------------------------------------------------------
@ -250,6 +300,13 @@ static void DynGen_IndirectDispatch(int mode, int bits, bool sign = false)
// Out: eax: result (if mode < 64)
static void DynGen_IndirectTlbDispatcher(int mode, int bits, bool sign)
{
// fixup stack
#ifdef _WIN32
xSUB(rsp, 32 + 8);
#else
xSUB(rsp, 8);
#endif
xMOVZX(eax, al);
if (wordsize != 8)
xSUB(arg1regd, 0x80000000);
@ -291,7 +348,13 @@ static void DynGen_IndirectTlbDispatcher(int mode, int bits, bool sign)
}
}
xJMP(rbx);
#ifdef _WIN32
xADD(rsp, 32 + 8);
#else
xADD(rsp, 8);
#endif
xRET();
}
// One-time initialization procedure. Multiple subsequent calls during the lifespan of the
@ -342,65 +405,83 @@ static void vtlb_SetWriteback(u32* writeback)
//////////////////////////////////////////////////////////////////////////////////////////
// Dynarec Load Implementations
int vtlb_DynGenReadQuad(u32 bits, int gpr)
{
pxAssume(bits == 128);
u32* writeback = DynGen_PrepRegs();
const int reg = gpr == -1 ? _allocTempXMMreg(XMMT_INT, 0) : _allocGPRtoXMMreg(0, gpr, MODE_WRITE); // Handler returns in xmm0
DynGen_IndirectDispatch(0, bits);
DynGen_DirectRead(bits, false);
vtlb_SetWriteback(writeback); // return target for indirect's call/ret
return reg;
}
// ------------------------------------------------------------------------
// Recompiled input registers:
// ecx - source address to read from
// Returns read value in eax.
void vtlb_DynGenReadNonQuad(u32 bits, bool sign)
int vtlb_DynGenReadNonQuad(u32 bits, bool sign, bool xmm, int addr_reg, vtlb_ReadRegAllocCallback dest_reg_alloc)
{
pxAssume(bits <= 64);
u32* writeback = DynGen_PrepRegs();
DynGen_IndirectDispatch(0, bits, sign && bits < 64);
DynGen_DirectRead(bits, sign);
vtlb_SetWriteback(writeback);
}
// ------------------------------------------------------------------------
// TLB lookup is performed in const, with the assumption that the COP0/TLB will clear the
// recompiler if the TLB is changed.
int vtlb_DynGenReadQuad_Const(u32 bits, u32 addr_const, int gpr)
{
pxAssert(bits == 128);
EE::Profiler.EmitConstMem(addr_const);
int reg;
auto vmv = vtlbdata.vmap[addr_const >> VTLB_PAGE_BITS];
if (!vmv.isHandler(addr_const))
int x86_dest_reg;
if (!CHECK_FASTMEM || vtlb_IsFaultingPC(pc))
{
void* ppf = reinterpret_cast<void*>(vmv.assumePtr(addr_const));
reg = gpr == -1 ? _allocTempXMMreg(XMMT_INT, -1) : _allocGPRtoXMMreg(-1, gpr, MODE_WRITE);
xMOVAPS(xRegisterSSE(reg), ptr128[ppf]);
iFlushCall(FLUSH_FULLVTLB);
DynGen_PrepRegs(addr_reg, -1, bits, xmm);
DynGen_HandlerTest([bits, sign]() { DynGen_DirectRead(bits, sign); }, 0, bits, sign && bits < 64);
if (!xmm)
{
x86_dest_reg = dest_reg_alloc ? dest_reg_alloc() : (_freeX86reg(eax), eax.GetId());
xMOV(xRegister64(x86_dest_reg), rax);
}
else
{
// we shouldn't be loading any FPRs which aren't 32bit..
// we use MOVD here despite it being floating-point data, because we're going int->float reinterpret.
pxAssert(bits == 32);
x86_dest_reg = dest_reg_alloc ? dest_reg_alloc() : (_freeXMMreg(0), 0);
xMOVDZX(xRegisterSSE(x86_dest_reg), eax);
}
return x86_dest_reg;
}
const u8* codeStart;
const xAddressReg x86addr(addr_reg);
if (!xmm)
{
x86_dest_reg = dest_reg_alloc ? dest_reg_alloc() : (_freeX86reg(eax), eax.GetId());
codeStart = x86Ptr;
const xRegister64 x86reg(x86_dest_reg);
switch (bits)
{
case 8:
sign ? xMOVSX(x86reg, ptr8[RFASTMEMBASE + x86addr]) : xMOVZX(xRegister32(x86reg), ptr8[RFASTMEMBASE + x86addr]);
break;
case 16:
sign ? xMOVSX(x86reg, ptr16[RFASTMEMBASE + x86addr]) : xMOVZX(xRegister32(x86reg), ptr16[RFASTMEMBASE + x86addr]);
break;
case 32:
sign ? xMOVSX(x86reg, ptr32[RFASTMEMBASE + x86addr]) : xMOV(xRegister32(x86reg), ptr32[RFASTMEMBASE + x86addr]);
break;
case 64:
xMOV(x86reg, ptr64[RFASTMEMBASE + x86addr]);
break;
jNO_DEFAULT
}
}
else
{
// has to: translate, find function, call function
u32 paddr = vmv.assumeHandlerGetPAddr(addr_const);
const int szidx = 4;
iFlushCall(FLUSH_FULLVTLB);
reg = gpr == -1 ? _allocTempXMMreg(XMMT_INT, 0) : _allocGPRtoXMMreg(0, gpr, MODE_WRITE); // Handler returns in xmm0
xFastCall(vmv.assumeHandlerGetRaw(szidx, 0), paddr, arg2reg);
pxAssert(bits == 32);
x86_dest_reg = dest_reg_alloc ? dest_reg_alloc() : (_freeXMMreg(0), 0);
codeStart = x86Ptr;
const xRegisterSSE xmmreg(x86_dest_reg);
xMOVSSZX(xmmreg, ptr32[RFASTMEMBASE + x86addr]);
}
return reg;
const u32 padding = LOADSTORE_PADDING - std::min<u32>(static_cast<u32>(x86Ptr - codeStart), 5);
for (u32 i = 0; i < padding; i++)
xNOP();
vtlb_AddLoadStoreInfo((uptr)codeStart, static_cast<u32>(x86Ptr - codeStart),
pc, GetAllocatedGPRBitmask(), GetAllocatedXMMBitmask(),
static_cast<u8>(addr_reg), static_cast<u8>(x86_dest_reg),
static_cast<u8>(bits), sign, true, xmm);
return x86_dest_reg;
}
// ------------------------------------------------------------------------
@ -411,40 +492,41 @@ int vtlb_DynGenReadQuad_Const(u32 bits, u32 addr_const, int gpr)
// TLB lookup is performed in const, with the assumption that the COP0/TLB will clear the
// recompiler if the TLB is changed.
//
void vtlb_DynGenReadNonQuad_Const(u32 bits, bool sign, u32 addr_const)
int vtlb_DynGenReadNonQuad_Const(u32 bits, bool sign, bool xmm, u32 addr_const, vtlb_ReadRegAllocCallback dest_reg_alloc)
{
EE::Profiler.EmitConstMem(addr_const);
int x86_dest_reg;
auto vmv = vtlbdata.vmap[addr_const >> VTLB_PAGE_BITS];
if (!vmv.isHandler(addr_const))
{
auto ppf = vmv.assumePtr(addr_const);
switch (bits)
if (!xmm)
{
x86_dest_reg = dest_reg_alloc ? dest_reg_alloc() : (_freeX86reg(eax), eax.GetId());
switch (bits)
{
case 8:
if (sign)
xMOVSX(rax, ptr8[(u8*)ppf]);
else
xMOVZX(rax, ptr8[(u8*)ppf]);
sign ? xMOVSX(xRegister64(x86_dest_reg), ptr8[(u8*)ppf]) : xMOVZX(xRegister32(x86_dest_reg), ptr8[(u8*)ppf]);
break;
case 16:
if (sign)
xMOVSX(rax, ptr16[(u16*)ppf]);
else
xMOVZX(rax, ptr16[(u16*)ppf]);
sign ? xMOVSX(xRegister64(x86_dest_reg), ptr16[(u16*)ppf]) : xMOVZX(xRegister32(x86_dest_reg), ptr16[(u16*)ppf]);
break;
case 32:
if (sign)
xMOVSX(rax, ptr32[(u32*)ppf]);
else
xMOV(eax, ptr32[(u32*)ppf]);
sign ? xMOVSX(xRegister64(x86_dest_reg), ptr32[(u32*)ppf]) : xMOV(xRegister32(x86_dest_reg), ptr32[(u32*)ppf]);
break;
case 64:
xMOV(rax, ptr64[(u64*)ppf]);
xMOV(xRegister64(x86_dest_reg), ptr64[(u64*)ppf]);
break;
}
}
else
{
x86_dest_reg = dest_reg_alloc ? dest_reg_alloc() : (_freeXMMreg(0), 0);
xMOVSSZX(xRegisterSSE(x86_dest_reg), ptr32[(float*)ppf]);
}
}
else
@ -464,60 +546,157 @@ void vtlb_DynGenReadNonQuad_Const(u32 bits, bool sign, u32 addr_const)
// Shortcut for the INTC_STAT register, which many games like to spin on heavily.
if ((bits == 32) && !EmuConfig.Speedhacks.IntcStat && (paddr == INTC_STAT))
{
xMOV(eax, ptr[&psHu32(INTC_STAT)]);
x86_dest_reg = dest_reg_alloc ? dest_reg_alloc() : (_freeX86reg(eax), eax.GetId());
if (!xmm)
{
if (sign)
xMOVSX(xRegister64(x86_dest_reg), ptr32[&psHu32(INTC_STAT)]);
else
xMOV(xRegister32(x86_dest_reg), ptr32[&psHu32(INTC_STAT)]);
}
else
{
xMOVDZX(xRegisterSSE(x86_dest_reg), ptr32[&psHu32(INTC_STAT)]);
}
}
else
{
iFlushCall(FLUSH_FULLVTLB);
xFastCall(vmv.assumeHandlerGetRaw(szidx, false), paddr);
// perform sign extension on the result:
if (!xmm)
{
x86_dest_reg = dest_reg_alloc ? dest_reg_alloc() : (_freeX86reg(eax), eax.GetId());
switch (bits)
{
// save REX prefix by using 32bit dest for zext
case 8:
sign ? xMOVSX(xRegister64(x86_dest_reg), al) : xMOVZX(xRegister32(x86_dest_reg), al);
break;
if (bits == 8)
{
if (sign)
xMOVSX(rax, al);
else
xMOVZX(rax, al);
case 16:
sign ? xMOVSX(xRegister64(x86_dest_reg), ax) : xMOVZX(xRegister32(x86_dest_reg), ax);
break;
case 32:
sign ? xMOVSX(xRegister64(x86_dest_reg), eax) : xMOV(xRegister32(x86_dest_reg), eax);
break;
case 64:
xMOV(xRegister64(x86_dest_reg), rax);
break;
}
}
else if (bits == 16)
else
{
if (sign)
xMOVSX(rax, ax);
else
xMOVZX(rax, ax);
}
else if (bits == 32)
{
if (sign)
xCDQE();
x86_dest_reg = dest_reg_alloc ? dest_reg_alloc() : (_freeXMMreg(0), 0);
xMOVDZX(xRegisterSSE(x86_dest_reg), eax);
}
}
}
return x86_dest_reg;
}
// Emits code for a 128-bit guest memory load. The guest virtual address is
// expected in arg1reg (note: the addr_reg parameter is not referenced by this
// body — the recorded address register is arg1reg; TODO confirm callers always
// load the address there). dest_reg_alloc, if provided, picks the destination
// xmm register; otherwise xmm0 is freed and used. Returns the host xmm index
// holding the loaded quadword.
int vtlb_DynGenReadQuad(u32 bits, int addr_reg, vtlb_ReadRegAllocCallback dest_reg_alloc)
{
	pxAssume(bits == 128);

	// Slow path: fastmem disabled, or this PC previously faulted and was
	// backpatched, so emit the full vtlb handler-test sequence instead.
	if (!CHECK_FASTMEM || vtlb_IsFaultingPC(pc))
	{
		iFlushCall(FLUSH_FULLVTLB);
		DynGen_PrepRegs(arg1regd.GetId(), -1, bits, true);
		DynGen_HandlerTest([bits]() {DynGen_DirectRead(bits, false); }, 0, bits);

		const int reg = dest_reg_alloc ? dest_reg_alloc() : (_freeXMMreg(0), 0); // Handler returns in xmm0
		if (reg >= 0)
			xMOVAPS(xRegisterSSE(reg), xmm0);

		return reg;
	}

	// Fastmem path: a single MOVAPS through the fastmem base register.
	const int reg = dest_reg_alloc ? dest_reg_alloc() : (_freeXMMreg(0), 0); // Load directly into the allocated (or freed xmm0) register
	const u8* codeStart = x86Ptr;

	xMOVAPS(xRegisterSSE(reg), ptr128[RFASTMEMBASE + arg1reg]);

	// Pad the emitted sequence up to LOADSTORE_PADDING bytes so that a 5-byte
	// jump to a backpatch thunk can later overwrite it if this access faults.
	const u32 padding = LOADSTORE_PADDING - std::min<u32>(static_cast<u32>(x86Ptr - codeStart), 5);
	for (u32 i = 0; i < padding; i++)
		xNOP();

	// Record everything the backpatcher needs to regenerate this access as a
	// slowmem call: location/size, live register bitmasks, address/data regs.
	vtlb_AddLoadStoreInfo((uptr)codeStart, static_cast<u32>(x86Ptr - codeStart),
		pc, GetAllocatedGPRBitmask(), GetAllocatedXMMBitmask(),
		static_cast<u8>(arg1reg.GetId()), static_cast<u8>(reg),
		static_cast<u8>(bits), false, true, true);

	return reg;
}
// ------------------------------------------------------------------------
// TLB lookup is performed in const, with the assumption that the COP0/TLB will clear the
// recompiler if the TLB is changed.
// Emits code for a 128-bit load from a compile-time-constant guest address.
// The vtlb mapping is resolved now, with the assumption that the COP0/TLB will
// clear the recompiler if the TLB is changed. Returns the host xmm index
// holding the loaded quadword.
int vtlb_DynGenReadQuad_Const(u32 bits, u32 addr_const, vtlb_ReadRegAllocCallback dest_reg_alloc)
{
	pxAssert(bits == 128);

	EE::Profiler.EmitConstMem(addr_const);

	int reg;
	auto vmv = vtlbdata.vmap[addr_const >> VTLB_PAGE_BITS];
	if (!vmv.isHandler(addr_const))
	{
		// Direct mapping: load straight from the translated host pointer.
		void* ppf = reinterpret_cast<void*>(vmv.assumePtr(addr_const));
		reg = dest_reg_alloc ? dest_reg_alloc() : (_freeXMMreg(0), 0);
		if (reg >= 0)
			xMOVAPS(xRegisterSSE(reg), ptr128[ppf]);
	}
	else
	{
		// Handler mapping: translate to the physical address now, then call
		// the 128-bit (szidx == 4) read handler; its result comes back in xmm0.
		u32 paddr = vmv.assumeHandlerGetPAddr(addr_const);
		const int szidx = 4;
		iFlushCall(FLUSH_FULLVTLB);
		xFastCall(vmv.assumeHandlerGetRaw(szidx, 0), paddr);
		reg = dest_reg_alloc ? dest_reg_alloc() : (_freeXMMreg(0), 0);
		xMOVAPS(xRegisterSSE(reg), xmm0);
	}

	return reg;
}
//////////////////////////////////////////////////////////////////////////////////////////
// Dynarec Store Implementations
void vtlb_DynGenWrite(u32 sz)
void vtlb_DynGenWrite(u32 sz, bool xmm, int addr_reg, int value_reg)
{
#ifdef LOG_STORES
//if (sz != 128)
//if (!xmm)
{
iFlushCall(FLUSH_FULLVTLB);
xPUSH(xRegister64(addr_reg));
xPUSH(xRegister64(value_reg));
xPUSH(arg1reg);
xPUSH(arg2reg);
if (sz == 128)
xMOV(arg1regd, xRegister32(addr_reg));
if (xmm)
{
xSUB(rsp, 32 + 32);
xMOVAPS(ptr[rsp + 32], xRegisterSSE::GetArgRegister(1, 0));
xMOVAPS(ptr[rsp + 32], xRegisterSSE::GetInstance(value_reg));
xMOVAPS(ptr[rsp + 48], xRegisterSSE::GetArgRegister(1, 0));
xMOVAPS(xRegisterSSE::GetArgRegister(1, 0), xRegisterSSE::GetInstance(value_reg));
xFastCall((void*)LogWriteQuad);
xMOVAPS(xRegisterSSE::GetArgRegister(1, 0), ptr[rsp + 32]);
xMOVAPS(xRegisterSSE::GetArgRegister(1, 0), ptr[rsp + 48]);
xMOVAPS(xRegisterSSE::GetInstance(value_reg), ptr[rsp + 32]);
xADD(rsp, 32 + 32);
}
else
{
xMOV(arg2reg, xRegister64(value_reg));
if (sz == 8)
xAND(arg2regd, 0xFF);
else if (sz == 16)
@ -530,15 +709,67 @@ void vtlb_DynGenWrite(u32 sz)
}
xPOP(arg2reg);
xPOP(arg1reg);
xPOP(xRegister64(value_reg));
xPOP(xRegister64(addr_reg));
}
#endif
u32* writeback = DynGen_PrepRegs();
if (!CHECK_FASTMEM || vtlb_IsFaultingPC(pc))
{
iFlushCall(FLUSH_FULLVTLB);
DynGen_IndirectDispatch(1, sz);
DynGen_DirectWrite(sz);
DynGen_PrepRegs(addr_reg, value_reg, sz, xmm);
DynGen_HandlerTest([sz]() { DynGen_DirectWrite(sz); }, 1, sz);
return;
}
vtlb_SetWriteback(writeback);
const u8* codeStart = x86Ptr;
const xAddressReg vaddr_reg(addr_reg);
if (!xmm)
{
switch (sz)
{
case 8:
xMOV(ptr8[RFASTMEMBASE + vaddr_reg], xRegister8(xRegister32(value_reg)));
break;
case 16:
xMOV(ptr16[RFASTMEMBASE + vaddr_reg], xRegister16(value_reg));
break;
case 32:
xMOV(ptr32[RFASTMEMBASE + vaddr_reg], xRegister32(value_reg));
break;
case 64:
xMOV(ptr64[RFASTMEMBASE + vaddr_reg], xRegister64(value_reg));
break;
jNO_DEFAULT
}
}
else
{
pxAssert(sz == 32 || sz == 128);
switch (sz)
{
case 32:
xMOVSS(ptr32[RFASTMEMBASE + vaddr_reg], xRegisterSSE(value_reg));
break;
case 128:
xMOVAPS(ptr128[RFASTMEMBASE + vaddr_reg], xRegisterSSE(value_reg));
break;
jNO_DEFAULT
}
}
const u32 padding = LOADSTORE_PADDING - std::min<u32>(static_cast<u32>(x86Ptr - codeStart), 5);
for (u32 i = 0; i < padding; i++)
xNOP();
vtlb_AddLoadStoreInfo((uptr)codeStart, static_cast<u32>(x86Ptr - codeStart),
pc, GetAllocatedGPRBitmask(), GetAllocatedXMMBitmask(),
static_cast<u8>(addr_reg), static_cast<u8>(value_reg),
static_cast<u8>(sz), false, false, xmm);
}
@ -546,28 +777,34 @@ void vtlb_DynGenWrite(u32 sz)
// Generates code for a store instruction, where the address is a known constant.
// TLB lookup is performed in const, with the assumption that the COP0/TLB will clear the
// recompiler if the TLB is changed.
void vtlb_DynGenWrite_Const(u32 bits, u32 addr_const)
void vtlb_DynGenWrite_Const(u32 bits, bool xmm, u32 addr_const, int value_reg)
{
EE::Profiler.EmitConstMem(addr_const);
#ifdef LOG_STORES
iFlushCall(FLUSH_FULLVTLB);
//if (bits != 128)
//if (!xmm)
{
xPUSH(xRegister64(value_reg));
xPUSH(xRegister64(value_reg));
xPUSH(arg1reg);
xPUSH(arg2reg);
xMOV(arg1reg, addr_const);
if (bits == 128)
if (xmm)
{
xSUB(rsp, 32 + 32);
xMOVAPS(ptr[rsp + 32], xRegisterSSE::GetArgRegister(1, 0));
xMOVAPS(ptr[rsp + 32], xRegisterSSE::GetInstance(value_reg));
xMOVAPS(ptr[rsp + 48], xRegisterSSE::GetArgRegister(1, 0));
xMOVAPS(xRegisterSSE::GetArgRegister(1, 0), xRegisterSSE::GetInstance(value_reg));
xFastCall((void*)LogWriteQuad);
xMOVAPS(xRegisterSSE::GetArgRegister(1, 0), ptr[rsp + 32]);
xMOVAPS(xRegisterSSE::GetArgRegister(1, 0), ptr[rsp + 48]);
xMOVAPS(xRegisterSSE::GetInstance(value_reg), ptr[rsp + 32]);
xADD(rsp, 32 + 32);
}
else
{
xMOV(arg2reg, xRegister64(value_reg));
if (bits == 8)
xAND(arg2regd, 0xFF);
else if (bits == 16)
@ -580,37 +817,52 @@ void vtlb_DynGenWrite_Const(u32 bits, u32 addr_const)
}
xPOP(arg2reg);
xPOP(arg1reg);
xPOP(xRegister64(value_reg));
xPOP(xRegister64(value_reg));
}
#endif
auto vmv = vtlbdata.vmap[addr_const >> VTLB_PAGE_BITS];
if (!vmv.isHandler(addr_const))
{
// TODO: x86Emitter can't use dil
auto ppf = vmv.assumePtr(addr_const);
switch (bits)
if (!xmm)
{
//8 , 16, 32 : data on arg2
case 8:
xMOV(edx, arg2regd);
xMOV(ptr[(void*)ppf], dl);
break;
switch (bits)
{
case 8:
xMOV(ptr[(void*)ppf], xRegister8(xRegister32(value_reg)));
break;
case 16:
xMOV(ptr[(void*)ppf], xRegister16(arg2reg));
break;
case 16:
xMOV(ptr[(void*)ppf], xRegister16(value_reg));
break;
case 32:
xMOV(ptr[(void*)ppf], arg2regd);
break;
case 32:
xMOV(ptr[(void*)ppf], xRegister32(value_reg));
break;
case 64:
xMOV(ptr64[(void*)ppf], arg2reg);
break;
case 64:
xMOV(ptr64[(void*)ppf], xRegister64(value_reg));
break;
case 128:
xMOVAPS(ptr128[(void*)ppf], xRegisterSSE::GetArgRegister(1, 0));
break;
jNO_DEFAULT
}
}
else
{
switch (bits)
{
case 32:
xMOVSS(ptr[(void*)ppf], xRegisterSSE(value_reg));
break;
case 128:
xMOVAPS(ptr128[(void*)ppf], xRegisterSSE(value_reg));
break;
jNO_DEFAULT
}
}
}
else
@ -621,15 +873,47 @@ void vtlb_DynGenWrite_Const(u32 bits, u32 addr_const)
int szidx = 0;
switch (bits)
{
case 8: szidx=0; break;
case 16: szidx=1; break;
case 32: szidx=2; break;
case 64: szidx=3; break;
case 128: szidx=4; break;
case 8:
szidx = 0;
break;
case 16:
szidx = 1;
break;
case 32:
szidx = 2;
break;
case 64:
szidx = 3;
break;
case 128:
szidx = 4;
break;
}
iFlushCall(FLUSH_FULLVTLB);
xFastCall(vmv.assumeHandlerGetRaw(szidx, true), paddr);
_freeX86reg(arg1regd);
xMOV(arg1regd, paddr);
if (bits == 128)
{
pxAssert(xmm);
const xRegisterSSE argreg(xRegisterSSE::GetArgRegister(1, 0));
_freeXMMreg(argreg.GetId());
xMOVAPS(argreg, xRegisterSSE(value_reg));
}
else if (xmm)
{
pxAssert(bits == 32);
_freeX86reg(arg2regd);
xMOVD(arg2regd, xRegisterSSE(value_reg));
}
else
{
_freeX86reg(arg2regd);
xMOV(arg2reg, xRegister64(value_reg));
}
xFastCall(vmv.assumeHandlerGetRaw(szidx, true));
}
}
@ -649,3 +933,156 @@ void vtlb_DynV2P()
xOR(eax, ecx);
}
// Called when a fastmem access at code_address faults. Generates an
// out-of-line thunk that performs the same access through the slowmem
// (handler-test) path, then overwrites the original fastmem instruction
// sequence with a jump to that thunk. The thunk saves/restores every live
// caller-saved (and argument) register recorded at emission time, so the
// recompiled block's register state is preserved across the C handler call.
void vtlb_DynBackpatchLoadStore(uptr code_address, u32 code_size, u32 guest_pc, u32 guest_addr,
	u32 gpr_bitmask, u32 fpr_bitmask, u8 address_register, u8 data_register,
	u8 size_in_bits, bool is_signed, bool is_load, bool is_xmm)
{
	static constexpr u32 GPR_SIZE = 8;
	static constexpr u32 XMM_SIZE = 16;

	// on win32, we need to reserve an additional 32 bytes shadow space when calling out to C
#ifdef _WIN32
	static constexpr u32 SHADOW_SIZE = 32;
#else
	static constexpr u32 SHADOW_SIZE = 0;
#endif

	DevCon.WriteLn("Backpatching %s at %p[%u] (pc %08X vaddr %08X): Bitmask %08X %08X Addr %u Data %u Size %u Flags %02X %02X",
		is_load ? "load" : "store", (void*)code_address, code_size, guest_pc, guest_addr, gpr_bitmask, fpr_bitmask,
		address_register, data_register, size_in_bits, is_signed, is_load);

	u8* thunk = recBeginThunk();

	// save regs
	// Count how many live GPRs/XMMs must be preserved. A load's destination
	// register is deliberately excluded — it is overwritten by the load anyway.
	// NOTE(review): this count loop tests rbx/arg1reg/arg2reg, while the
	// save/restore loops below test arg1/arg2/arg3 — verify these sets agree,
	// otherwise stack_size could mismatch the bytes actually stored.
	u32 num_gprs = 0;
	u32 num_fprs = 0;
	for (u32 i = 0; i < iREGCNT_GPR; i++)
	{
		if ((gpr_bitmask & (1u << i)) && (i == rbx.GetId() || i == arg1reg.GetId() || i == arg2reg.GetId() || xRegisterBase::IsCallerSaved(i)) && (!is_load || is_xmm || data_register != i))
			num_gprs++;
	}
	for (u32 i = 0; i < iREGCNT_XMM; i++)
	{
		if (fpr_bitmask & (1u << i) && xRegisterSSE::IsCallerSaved(i) && (!is_load || !is_xmm || data_register != i))
			num_fprs++;
	}

	// Round the GPR count up to an even number so the frame stays 16-byte
	// aligned (each GPR slot is 8 bytes, each XMM slot 16).
	const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE) + (num_fprs * XMM_SIZE) + SHADOW_SIZE;

	const u32 arg1id = static_cast<u32>(arg1reg.GetId());
	const u32 arg2id = static_cast<u32>(arg2reg.GetId());
	const u32 arg3id = static_cast<u32>(arg3reg.GetId());

	if (stack_size > 0)
	{
		// Spill live XMMs first, then GPRs, above the Win64 shadow space.
		xSUB(rsp, stack_size);

		u32 stack_offset = SHADOW_SIZE;
		for (u32 i = 0; i < iREGCNT_XMM; i++)
		{
			if (fpr_bitmask & (1u << i) && xRegisterSSE::IsCallerSaved(i) && (!is_load || !is_xmm || data_register != i))
			{
				xMOVAPS(ptr128[rsp + stack_offset], xRegisterSSE(i));
				stack_offset += XMM_SIZE;
			}
		}

		for (u32 i = 0; i < iREGCNT_GPR; i++)
		{
			if ((gpr_bitmask & (1u << i)) && (i == arg1id || i == arg2id || i == arg3id || xRegisterBase::IsCallerSaved(i)) && (!is_load || is_xmm || data_register != i))
			{
				xMOV(ptr64[rsp + stack_offset], xRegister64(i));
				stack_offset += GPR_SIZE;
			}
		}
	}

	if (is_load)
	{
		// Re-emit the access through the handler-test (slowmem) path, then
		// move the result from the handler's return register into the block's
		// expected destination register.
		DynGen_PrepRegs(address_register, -1, size_in_bits, is_xmm);
		DynGen_HandlerTest([size_in_bits, is_signed]() {DynGen_DirectRead(size_in_bits, is_signed); }, 0, size_in_bits, is_signed && size_in_bits <= 32);

		if (size_in_bits == 128)
		{
			if (data_register != xmm0.GetId())
				xMOVAPS(xRegisterSSE(data_register), xmm0);
		}
		else
		{
			if (is_xmm)
			{
				xMOVDZX(xRegisterSSE(data_register), rax);
			}
			else
			{
				if (data_register != eax.GetId())
					xMOV(xRegister64(data_register), rax);
			}
		}
	}
	else
	{
		// Store: marshal the address and value into the argument registers the
		// slowmem write path expects, then re-emit via the handler test.
		if (address_register != arg1reg.GetId())
			xMOV(arg1regd, xRegister32(address_register));

		if (size_in_bits == 128)
		{
			const xRegisterSSE argreg(xRegisterSSE::GetArgRegister(1, 0));
			if (data_register != argreg.GetId())
				xMOVAPS(argreg, xRegisterSSE(data_register));
		}
		else
		{
			if (is_xmm)
			{
				xMOVD(arg2reg, xRegisterSSE(data_register));
			}
			else
			{
				if (data_register != arg2reg.GetId())
					xMOV(arg2reg, xRegister64(data_register));
			}
		}

		DynGen_PrepRegs(address_register, data_register, size_in_bits, is_xmm);
		DynGen_HandlerTest([size_in_bits]() { DynGen_DirectWrite(size_in_bits); }, 1, size_in_bits);
	}

	// restore regs
	// Mirror of the save sequence above: XMMs first, then GPRs, same offsets.
	if (stack_size > 0)
	{
		u32 stack_offset = SHADOW_SIZE;
		for (u32 i = 0; i < iREGCNT_XMM; i++)
		{
			if (fpr_bitmask & (1u << i) && xRegisterSSE::IsCallerSaved(i) && (!is_load || !is_xmm || data_register != i))
			{
				xMOVAPS(xRegisterSSE(i), ptr128[rsp + stack_offset]);
				stack_offset += XMM_SIZE;
			}
		}

		for (u32 i = 0; i < iREGCNT_GPR; i++)
		{
			if ((gpr_bitmask & (1u << i)) && (i == arg1id || i == arg2id || i == arg3id || xRegisterBase::IsCallerSaved(i)) && (!is_load || is_xmm || data_register != i))
			{
				xMOV(xRegister64(i), ptr64[rsp + stack_offset]);
				stack_offset += GPR_SIZE;
			}
		}

		xADD(rsp, stack_size);
	}

	// Resume execution immediately after the original (patched-over) access.
	xJMP((void*)(code_address + code_size));

	recEndThunk();

	// backpatch to a jump to the slowmem handler
	x86Ptr = (u8*)code_address;
	xJMP(thunk);

	// fill the rest of it with nops, if any
	pxAssertRel(static_cast<u32>((uptr)x86Ptr - code_address) <= code_size, "Overflowed when backpatching");
	for (u32 i = static_cast<u32>((uptr)x86Ptr - code_address); i < code_size; i++)
		xNOP();
}

View File

@ -125,7 +125,8 @@ void mVUDTendProgram(mV, microFlagCycles* mFC, int isEbit)
xMOVAPS(ptr128[&mVU.regs().micro_statusflags], xmmT1);
}
xMOV(ptr32[&mVU.regs().nextBlockCycles], 0);
if (EmuConfig.Gamefixes.VUSyncHack || EmuConfig.Gamefixes.FullVU0SyncHack)
xMOV(ptr32[&mVU.regs().nextBlockCycles], 0);
xMOV(ptr32[&mVU.regs().VI[REG_TPC].UL], xPC);
@ -251,7 +252,8 @@ void mVUendProgram(mV, microFlagCycles* mFC, int isEbit)
if ((isEbit && isEbit != 3)) // Clear 'is busy' Flags
{
xMOV(ptr32[&mVU.regs().nextBlockCycles], 0);
if (EmuConfig.Gamefixes.VUSyncHack || EmuConfig.Gamefixes.FullVU0SyncHack)
xMOV(ptr32[&mVU.regs().nextBlockCycles], 0);
if (!mVU.index || !THREAD_VU1)
{
xAND(ptr32[&VU0.VI[REG_VPU_STAT].UL], (isVU1 ? ~0x100 : ~0x001)); // VBS0/VBS1 flag
@ -259,7 +261,8 @@ void mVUendProgram(mV, microFlagCycles* mFC, int isEbit)
}
else if(isEbit)
{
xMOV(ptr32[&mVU.regs().nextBlockCycles], 0);
if (EmuConfig.Gamefixes.VUSyncHack || EmuConfig.Gamefixes.FullVU0SyncHack)
xMOV(ptr32[&mVU.regs().nextBlockCycles], 0);
}
if (isEbit != 2 && isEbit != 3) // Save PC, and Jump to Exit Point

View File

@ -484,7 +484,9 @@ void mVUtestCycles(microVU& mVU, microFlagCycles& mFC)
xForwardJGE32 skip;
mVUsavePipelineState(mVU);
xMOV(ptr32[&mVU.regs().nextBlockCycles], mVUcycles);
if (EmuConfig.Gamefixes.VUSyncHack || EmuConfig.Gamefixes.FullVU0SyncHack)
xMOV(ptr32[&mVU.regs().nextBlockCycles], mVUcycles);
mVUendProgram(mVU, &mFC, 0);
skip.SetTarget();
@ -801,7 +803,8 @@ void* mVUcompile(microVU& mVU, u32 startPC, uptr pState)
}
incPC(2);
mVUsetupRange(mVU, xPC, false);
xMOV(ptr32[&mVU.regs().nextBlockCycles], 0);
if (EmuConfig.Gamefixes.VUSyncHack || EmuConfig.Gamefixes.FullVU0SyncHack)
xMOV(ptr32[&mVU.regs().nextBlockCycles], 0);
mVUendProgram(mVU, &mFC, 0);
normBranchCompile(mVU, xPC);
incPC(-2);

View File

@ -215,6 +215,9 @@ struct microIR
// Reg Alloc
//------------------------------------------------------------------
//#define MVURALOG(...) fprintf(stderr, __VA_ARGS__)
#define MVURALOG(...)
struct microMapXMM
{
int VFreg; // VF Reg Number Stored (-1 = Temp; 0 = vf0 and will not be written back; 32 = ACC; 33 = I reg)
@ -231,6 +234,13 @@ protected:
microMapXMM xmmMap[xmmTotal];
int counter; // Current allocation count
int index; // VU0 or VU1
// DO NOT REMOVE THIS.
// This is here for a reason. MSVC likes to turn global writes into a load+conditional move+store.
// That creates a race with the EE thread when we're compiling on the VU thread, even though
// regAllocCOP2 is false. By adding another level of indirection, it emits a branch instead.
_xmmregs* pxmmregs;
bool regAllocCOP2; // Local COP2 check
// Helper functions to get VU regs
@ -260,11 +270,11 @@ protected:
return -1;
}
int findFreeReg()
int findFreeReg(int vfreg)
{
if (regAllocCOP2)
{
return _freeXMMregsCOP2();
return _allocVFtoXMMreg(vfreg, 0);
}
for (int i = 0; i < xmmTotal; i++)
@ -289,12 +299,38 @@ public:
// Fully resets the regalloc by clearing all cached data
// Fully resets the allocator, clearing all cached mappings. When cop2mode is
// set, re-seeds the map from the EE-side xmmregs state so VF registers already
// cached by the EE recompiler survive into the COP2 micro-op sequence.
// NOTE: regAllocCOP2 is forced false before clearing so clearReg() does not
// touch pxmmregs while tearing down the previous state (COP2 runs this at the
// end, so the EE-side registers must not be freed here).
void reset(bool cop2mode)
{
	// we run this at the end of cop2, so don't free the EE-side registers
	regAllocCOP2 = false;

	for (int i = 0; i < xmmTotal; i++)
	{
		clearReg(i);
	}
	counter = 0;

	regAllocCOP2 = cop2mode;
	pxmmregs = cop2mode ? xmmregs : nullptr;

	if (cop2mode)
	{
		// Import every EE-cached VF register into our map, marking dirty
		// (xyzw = 0xf) anything the EE side had open for write.
		for (int i = 0; i < xmmTotal; i++)
		{
			if (!pxmmregs[i].inuse || pxmmregs[i].type != XMMTYPE_VFREG)
				continue;

			// we shouldn't have any temp registers in here.. except for PQ, which
			// isn't allocated here yet.
			// pxAssertRel(fprregs[i].reg >= 0, "Valid full register preserved");
			if (pxmmregs[i].reg >= 0)
			{
				MVURALOG("Preserving VF reg %d in host reg %d across instruction\n", pxmmregs[i].reg, i);
				pxAssert(pxmmregs[i].reg != 255);
				pxmmregs[i].needed = false;
				xmmMap[i].isNeeded = false;
				xmmMap[i].VFreg = pxmmregs[i].reg;
				xmmMap[i].xyzw = ((pxmmregs[i].mode & MODE_WRITE) != 0) ? 0xf : 0x0;
			}
		}
	}
}
int getXmmCount()
@ -314,6 +350,35 @@ public:
}
}
// Flushes the allocator state at a COP2 boundary without giving the cached
// registers back: partially-written VF registers are written back to memory,
// temporaries are freed on the EE side, and the local map is wiped so the EE
// recompiler's view (pxmmregs) remains the single source of truth.
void flushPartialForCOP2()
{
	for (int i = 0; i < xmmTotal; i++)
	{
		microMapXMM& clear = xmmMap[i];

		// toss away anything which is not a full cached register
		if (pxmmregs[i].inuse && pxmmregs[i].type == XMMTYPE_VFREG)
		{
			// Should've been done in clearNeeded()
			if (clear.xyzw != 0 && clear.xyzw != 0xf)
				writeBackReg(xRegisterSSE::GetInstance(i), false);

			if (clear.VFreg <= 0)
			{
				// temps really shouldn't be here..
				_freeXMMreg(i);
			}
		}

		// needed gets cleared in iCore.
		clear.VFreg = -1;
		clear.count = 0;
		clear.xyzw = 0;
		clear.isNeeded = 0;
		clear.isZero = 0;
	}
}
void TDwritebackAll(bool clearState = false)
{
for (int i = 0; i < xmmTotal; i++)
@ -352,6 +417,12 @@ public:
void clearReg(int regId)
{
microMapXMM& clear = xmmMap[regId];
if (regAllocCOP2)
{
pxAssert(pxmmregs[regId].type == XMMTYPE_VFREG);
pxmmregs[regId].inuse = false;
}
clear.VFreg = -1;
clear.count = 0;
clear.xyzw = 0;
@ -368,6 +439,24 @@ public:
}
}
// Invalidates the given xmm register's mapping, but only when this allocator
// is running in COP2 mode; outside COP2 mode it is a no-op.
void clearRegCOP2(int xmmReg)
{
	if (!regAllocCOP2)
		return;

	clearReg(xmmReg);
}
// Mirrors this allocator's view of host register rn back into the EE-side
// xmmregs table (via pxmmregs), so the EE recompiler sees the VF register
// number, read/write mode, and needed flag. No-op outside COP2 mode.
void updateCOP2AllocState(int rn)
{
	if (regAllocCOP2)
	{
		pxAssert(pxmmregs[rn].type == XMMTYPE_VFREG);

		// A register counts as dirty when it caches a real VF reg (not vf0 or
		// a temp) and at least one vector component has been written.
		const bool modified = (xmmMap[rn].VFreg > 0) && (xmmMap[rn].xyzw != 0);
		pxmmregs[rn].reg = xmmMap[rn].VFreg;
		pxmmregs[rn].mode = modified ? (MODE_READ | MODE_WRITE) : MODE_READ;
		pxmmregs[rn].needed = xmmMap[rn].isNeeded;
	}
}
// Writes back modified reg to memory.
// If all vectors modified, then keeps the VF reg cached in the xmm register.
// If reg was not modified, then keeps the VF reg cached in the xmm register.
@ -406,6 +495,7 @@ public:
mapX.count = counter;
mapX.xyzw = 0;
mapX.isNeeded = false;
updateCOP2AllocState(reg.Id);
return;
}
clearReg(reg);
@ -453,6 +543,7 @@ public:
mapI.xyzw = 0xf;
mapI.count = counter;
mergeRegs = 2;
updateCOP2AllocState(i);
}
else
clearReg(i); // Clears when mergeRegs is 0 or 2
@ -466,6 +557,12 @@ public:
else
clearReg(reg); // If Reg was temp or vf0, then invalidate itself
}
else if (regAllocCOP2 && clear.VFreg < 0)
{
// free on the EE side
pxAssert(pxmmregs[reg.Id].type == XMMTYPE_VFREG);
pxmmregs[reg.Id].inuse = false;
}
}
// vfLoadReg = VF reg to be loaded to the xmm register
@ -495,7 +592,7 @@ public:
{
if (cloneWrite) // Clone Reg so as not to use the same Cached Reg
{
z = findFreeReg();
z = findFreeReg(vfWriteReg);
const xmm& xmmZ = xmm::GetInstance(z);
writeBackReg(xmmZ);
@ -528,11 +625,13 @@ public:
}
xmmMap[z].count = counter;
xmmMap[z].isNeeded = true;
updateCOP2AllocState(z);
return xmm::GetInstance(z);
}
}
}
int x = findFreeReg();
int x = findFreeReg((vfWriteReg >= 0) ? vfWriteReg : vfLoadReg);
const xmm& xmmX = xmm::GetInstance(x);
writeBackReg(xmmX);
@ -565,6 +664,7 @@ public:
xmmMap[x].isZero = (vfLoadReg == 0);
xmmMap[x].count = counter;
xmmMap[x].isNeeded = true;
updateCOP2AllocState(x);
return xmmX;
}
};

View File

@ -28,6 +28,10 @@ using namespace R5900::Dynarec;
#define printCOP2(...) (void)0
//#define printCOP2 DevCon.Status
// For now, we need to free all XMMs. Because we're not saving the nonvolatile registers when
// we enter micro mode, they will get overridden otherwise...
#define FLUSH_FOR_POSSIBLE_MICRO_EXEC (FLUSH_FREE_XMM | FLUSH_FREE_VU0)
void setupMacroOp(int mode, const char* opName)
{
// Set up reg allocation
@ -96,8 +100,7 @@ void endMacroOp(int mode)
xMOVSS(ptr32[&vu0Regs.VI[REG_Q].UL], xmmPQ);
}
microVU0.regAlloc->flushAll();
_clearNeededCOP2Regs();
microVU0.regAlloc->flushPartialForCOP2();
if (mode & 0x10)
{
@ -119,6 +122,11 @@ void endMacroOp(int mode)
microVU0.regAlloc->reset(false);
}
// Hook for the EE recompiler: releases microVU0's cached mapping for the given
// host xmm register (no-op unless macro-mode COP2 allocation is active).
void mVUFreeCOP2XMMreg(int hostreg)
{
microVU0.regAlloc->clearRegCOP2(hostreg);
}
#define REC_COP2_mVU0(f, opName, mode) \
void recV##f() \
{ \
@ -142,13 +150,9 @@ void endMacroOp(int mode)
// Falls back to the interpreter for a COP2 op: adds the block's scaled cycle
// count to cpuRegs.cycle, then recCall()s the interpreter handler V##f.
// NOTE(review): this span appears to interleave pre- and post-refactor lines
// of a diff (both the eax-based cycle update with _cop2BackupRegs /
// _cop2RestoreRegs and the newer iFlushCall/xADD form are present) -- confirm
// against the applied version of this file. Comments are kept outside the
// macro body because '//' before a line-continuation backslash would swallow
// the following line.
#define INTERPRETATE_COP2_FUNC(f) \
void recV##f() \
{ \
_freeX86reg(eax); \
xMOV(eax, ptr32[&cpuRegs.cycle]); \
xADD(eax, scaleblockcycles_clear()); \
xMOV(ptr32[&cpuRegs.cycle], eax); \
_cop2BackupRegs(); \
iFlushCall(FLUSH_FOR_POSSIBLE_MICRO_EXEC); \
xADD(ptr32[&cpuRegs.cycle], scaleblockcycles_clear()); \
recCall(V##f); \
_cop2RestoreRegs(); \
}
//------------------------------------------------------------------
@ -303,13 +307,15 @@ INTERPRETATE_COP2_FUNC(CALLMSR);
// Macro VU - Branches
//------------------------------------------------------------------
void _setupBranchTest(u32*(jmpType)(u32), bool isLikely)
// Shared emitter for the BC2x conditional branches: tests bit 0x100 of VU0's
// VPU_STAT register and emits the branch via the supplied jmpType emitter.
static void _setupBranchTest(u32*(jmpType)(u32), bool isLikely)
{
printCOP2("COP2 Branch");
// NOTE(review): this span interleaves old and new diff lines --
// _eeFlushAllUnused() and the two-argument recDoBranchImm() call below look
// like the pre-refactor code; confirm against the applied version.
_eeFlushAllUnused();
const u32 branchTo = ((s32)_Imm_ * 4) + pc;
// non-likely branches may try to hoist the delay slot above the compare
const bool swap = isLikely ? false : TrySwapDelaySlot(0, 0, 0);
_eeFlushAllDirty();
//xTEST(ptr32[&vif1Regs.stat._u32], 0x4);
xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x100);
recDoBranchImm(jmpType(0), isLikely);
recDoBranchImm(branchTo, jmpType(0), isLikely, swap);
}
void recBC2F() { _setupBranchTest(JNZ32, false); }
@ -321,7 +327,7 @@ void recBC2TL() { _setupBranchTest(JZ32, true); }
// Macro VU - COP2 Transfer Instructions
//------------------------------------------------------------------
void COP2_Interlock(bool mBitSync)
static void COP2_Interlock(bool mBitSync)
{
if (cpuRegs.code & 1)
{
@ -329,8 +335,9 @@ void COP2_Interlock(bool mBitSync)
// We can safely skip the _vu0FinishMicro() call, when there's nothing
// that can trigger a VU0 program between CFC2/CTC2/COP2 instructions.
if ((g_pCurInstInfo->info & EEINST_COP2_FINISH_VU0_MICRO) || mBitSync)
if (g_pCurInstInfo->info & EEINST_COP2_SYNC_VU0)
{
iFlushCall(FLUSH_FOR_POSSIBLE_MICRO_EXEC);
_freeX86reg(eax);
xMOV(eax, ptr32[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
@ -338,11 +345,15 @@ void COP2_Interlock(bool mBitSync)
xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
xForwardJZ32 skipvuidle;
_cop2BackupRegs();
if (mBitSync)
{
xSUB(eax, ptr32[&VU0.cycle]);
xSUB(eax, ptr32[&VU0.nextBlockCycles]);
// Why do we check this here? Ratchet games, maybe others end up with flickering polygons
// when we use lazy COP2 sync, otherwise. The micro resumption getting deferred an extra
// EE block is apparently enough to cause issues.
if (EmuConfig.Gamefixes.VUSyncHack || EmuConfig.Gamefixes.FullVU0SyncHack)
xSUB(eax, ptr32[&VU0.nextBlockCycles]);
xCMP(eax, 4);
xForwardJL32 skip;
xLoadFarAddr(arg1reg, CpuVU0);
@ -354,18 +365,47 @@ void COP2_Interlock(bool mBitSync)
}
else
xFastCall((void*)_vu0FinishMicro);
_cop2RestoreRegs();
skipvuidle.SetTarget();
}
}
}
void TEST_FBRST_RESET(FnType_Void* resetFunct, int vuIndex)
// Emits code which synchronizes VU0 with the EE: updates cpuRegs.cycle and,
// when a VU0 microprogram is running and the EE is at least 4 cycles ahead,
// runs the microprogram forward via BaseVUmicroCPU::ExecuteBlockJIT.
static void mVUSyncVU0()
{
// NOTE(review): the next xTEST references 'vuIndex', which is not a
// parameter of this function -- it appears to be a stray line from the old
// TEST_FBRST_RESET() body interleaved by the diff; confirm against the
// applied version of this file.
xTEST(eax, (vuIndex) ? 0x200 : 0x002);
iFlushCall(FLUSH_FOR_POSSIBLE_MICRO_EXEC);
_freeX86reg(eax);
xMOV(eax, ptr32[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
// skip everything when VU0 is idle (VPU_STAT bit 0 clear)
xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
xForwardJZ32 skipvuidle;
// eax = how far the EE has run ahead of VU0
xSUB(eax, ptr32[&VU0.cycle]);
if (EmuConfig.Gamefixes.VUSyncHack || EmuConfig.Gamefixes.FullVU0SyncHack)
xSUB(eax, ptr32[&VU0.nextBlockCycles]);
xCMP(eax, 4);
xForwardJL32 skip;
xLoadFarAddr(arg1reg, CpuVU0);
xMOV(arg2reg, s_nBlockInterlocked);
xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg, arg2reg);
skip.SetTarget();
skipvuidle.SetTarget();
}
static void mVUFinishVU0()
{
iFlushCall(FLUSH_FOR_POSSIBLE_MICRO_EXEC);
xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
xForwardJZ32 skipvuidle;
xFastCall((void*)_vu0FinishMicro);
skipvuidle.SetTarget();
}
// Emits code which tests the per-unit reset bit held in 'flagreg' (0x002 for
// VU0, 0x200 for VU1) and calls the matching reset function when it is set.
static void TEST_FBRST_RESET(int flagreg, FnType_Void* resetFunct, int vuIndex)
{
xTEST(xRegister32(flagreg), (vuIndex) ? 0x200 : 0x002);
xForwardJZ8 skip;
xFastCall((void*)resetFunct);
// NOTE(review): this xMOV looks like residue from the pre-refactor version
// of this helper (the refactored caller keeps the value in 'flagreg');
// confirm against the applied version of this file.
xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
skip.SetTarget();
}
@ -380,43 +420,20 @@ static void recCFC2()
if (!(cpuRegs.code & 1))
{
_freeX86reg(eax);
xMOV(eax, ptr32[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
xForwardJZ32 skipvuidle;
xSUB(eax, ptr32[&VU0.cycle]);
xSUB(eax, ptr32[&VU0.nextBlockCycles]);
xCMP(eax, 4);
xForwardJL32 skip;
_cop2BackupRegs();
xLoadFarAddr(arg1reg, CpuVU0);
xMOV(arg2reg, s_nBlockInterlocked);
xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg, arg2reg);
_cop2RestoreRegs();
skip.SetTarget();
skipvuidle.SetTarget();
if (g_pCurInstInfo->info & EEINST_COP2_SYNC_VU0)
mVUSyncVU0();
else if (g_pCurInstInfo->info & EEINST_COP2_FINISH_VU0)
mVUFinishVU0();
}
_flushEEreg(_Rt_, true);
if (_Rd_ == REG_STATUS_FLAG) // Normalize Status Flag
xMOV(eax, ptr32[&vu0Regs.VI[REG_STATUS_FLAG].UL]);
else
xMOV(eax, ptr32[&vu0Regs.VI[_Rd_].UL]);
const int regt = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_WRITE);
pxAssert(!GPR_IS_CONST1(_Rt_));
// FixMe: Should R-Reg have upper 9 bits 0?
if (_Rd_ >= 16)
xCDQE(); // Sign Extend
xMOV(ptr64[&cpuRegs.GPR.r[_Rt_].UD[0]], rax);
// FixMe: I think this is needed, but not sure how it works
// Update Refraction 20/09/2021: This is needed because Const Prop is broken
// the Flushed flag isn't being cleared when it's not flushed. TODO I guess
_eeOnWriteReg(_Rt_, 0);
if (_Rd_ >= REG_STATUS_FLAG)
xMOVSX(xRegister64(regt), ptr32[&vu0Regs.VI[_Rd_].UL]);
else
xMOV(xRegister64(regt), ptr32[&vu0Regs.VI[_Rd_].UL]);
}
static void recCTC2()
@ -430,28 +447,12 @@ static void recCTC2()
if (!(cpuRegs.code & 1))
{
_freeX86reg(eax);
xMOV(eax, ptr32[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
xForwardJZ32 skipvuidle;
xSUB(eax, ptr32[&VU0.cycle]);
xSUB(eax, ptr32[&VU0.nextBlockCycles]);
xCMP(eax, 4);
xForwardJL32 skip;
_cop2BackupRegs();
xLoadFarAddr(arg1reg, CpuVU0);
xMOV(arg2reg, s_nBlockInterlocked);
xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg, arg2reg);
_cop2RestoreRegs();
skip.SetTarget();
skipvuidle.SetTarget();
if (g_pCurInstInfo->info & EEINST_COP2_SYNC_VU0)
mVUSyncVU0();
else if (g_pCurInstInfo->info & EEINST_COP2_FINISH_VU0)
mVUFinishVU0();
}
_flushEEreg(_Rt_);
switch (_Rd_)
{
case REG_MAC_FLAG:
@ -459,7 +460,7 @@ static void recCTC2()
case REG_VPU_STAT:
break; // Read Only Regs
case REG_R:
xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
_eeMoveGPRtoR(eax, _Rt_);
xAND(eax, 0x7FFFFF);
xOR(eax, 0x3f800000);
xMOV(ptr32[&vu0Regs.VI[REG_R].UL], eax);
@ -468,7 +469,7 @@ static void recCTC2()
{
if (_Rt_)
{
xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
_eeMoveGPRtoR(eax, _Rt_);
xAND(eax, 0xFC0);
xAND(ptr32[&vu0Regs.VI[REG_STATUS_FLAG].UL], 0x3F);
xOR(ptr32[&vu0Regs.VI[REG_STATUS_FLAG].UL], eax);
@ -476,42 +477,44 @@ static void recCTC2()
else
xAND(ptr32[&vu0Regs.VI[REG_STATUS_FLAG].UL], 0x3F);
_freeXMMreg(xmmT1.Id);
const int xmmtemp = _allocTempXMMreg(XMMT_INT);
//Need to update the sticky flags for microVU
mVUallocSFLAGd(&vu0Regs.VI[REG_STATUS_FLAG].UL);
xMOVDZX(xmmT1, eax);
xSHUF.PS(xmmT1, xmmT1, 0);
xMOVDZX(xRegisterSSE(xmmtemp), eax); // TODO(Stenzek): This can be a broadcast.
xSHUF.PS(xRegisterSSE(xmmtemp), xRegisterSSE(xmmtemp), 0);
// Make sure the values are everywhere the need to be
xMOVAPS(ptr128[&vu0Regs.micro_statusflags], xmmT1);
xMOVAPS(ptr128[&vu0Regs.micro_statusflags], xRegisterSSE(xmmtemp));
_freeXMMreg(xmmtemp);
break;
}
case REG_CMSAR1: // Execute VU1 Micro SubRoutine
_cop2BackupRegs();
xMOV(ecx, 1);
xFastCall((void*)vu1Finish, ecx);
if (_Rt_)
{
xMOV(ecx, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
}
else
xXOR(ecx, ecx);
xFastCall((void*)vu1ExecMicro, ecx);
_cop2RestoreRegs();
iFlushCall(FLUSH_NONE);
xMOV(arg1regd, 1);
xFastCall((void*)vu1Finish);
_eeMoveGPRtoR(arg1regd, _Rt_);
iFlushCall(FLUSH_NONE);
xFastCall((void*)vu1ExecMicro);
break;
case REG_FBRST:
if (!_Rt_)
{
xMOV(ptr32[&vu0Regs.VI[REG_FBRST].UL], 0);
return;
if (!_Rt_)
{
xMOV(ptr32[&vu0Regs.VI[REG_FBRST].UL], 0);
return;
}
const int flagreg = _allocX86reg(X86TYPE_TEMP, 0, MODE_CALLEESAVED);
_eeMoveGPRtoR(xRegister32(flagreg), _Rt_);
iFlushCall(FLUSH_FREE_VU0);
TEST_FBRST_RESET(flagreg, vu0ResetRegs, 0);
TEST_FBRST_RESET(flagreg, vu1ResetRegs, 1);
xAND(xRegister32(flagreg), 0x0C0C);
xMOV(ptr32[&vu0Regs.VI[REG_FBRST].UL], xRegister32(flagreg));
_freeX86reg(flagreg);
}
else
xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
_cop2BackupRegs();
TEST_FBRST_RESET(vu0ResetRegs, 0);
TEST_FBRST_RESET(vu1ResetRegs, 1);
_cop2RestoreRegs();
xAND(eax, 0x0C0C);
xMOV(ptr32[&vu0Regs.VI[REG_FBRST].UL], eax);
break;
case 0:
// Ignore writes to vi00.
@ -521,6 +524,14 @@ static void recCTC2()
// sVU's COP2 has a comment that "Donald Duck" needs this too...
if (_Rd_ < REG_STATUS_FLAG)
{
// I isn't invalidated correctly yet, ideally we would move this to the XMM directly.
if (_Rd_ == REG_I)
{
const int xmmreg = _checkXMMreg(XMMTYPE_VFREG, 33, 0);
if (xmmreg >= 0)
_freeXMMregWithoutWriteback(xmmreg);
}
// Need to expand this out, because we want to write as 16 bits.
_eeMoveGPRtoR(eax, _Rt_);
xMOV(ptr16[&vu0Regs.VI[_Rd_].US[0]], ax);
@ -542,35 +553,39 @@ static void recQMFC2()
if (!_Rt_)
return;
if (!(cpuRegs.code & 1))
{
_freeX86reg(eax);
xMOV(eax, ptr32[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
xForwardJZ32 skipvuidle;
xSUB(eax, ptr32[&VU0.cycle]);
xSUB(eax, ptr32[&VU0.nextBlockCycles]);
xCMP(eax, 4);
xForwardJL32 skip;
_cop2BackupRegs();
xLoadFarAddr(arg1reg, CpuVU0);
xMOV(arg2reg, s_nBlockInterlocked);
xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg, arg2reg);
_cop2RestoreRegs();
skip.SetTarget();
skipvuidle.SetTarget();
if (g_pCurInstInfo->info & EEINST_COP2_SYNC_VU0)
mVUSyncVU0();
else if (g_pCurInstInfo->info & EEINST_COP2_FINISH_VU0)
mVUFinishVU0();
}
int rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_WRITE);
// Update Refraction 20/09/2021: This is needed because Const Prop is broken
// the Flushed flag isn't being cleared when it's not flushed. TODO I guess
_eeOnWriteReg(_Rt_, 0); // This is needed because Const Prop is broken
const bool vf_used = COP2INST_USEDTEST(_Rd_);
const int ftreg = _allocVFtoXMMreg(_Rd_, MODE_READ);
_deleteEEreg128(_Rt_);
xMOVAPS(xRegisterSSE(rtreg), ptr128[&vu0Regs.VF[_Rd_]]);
// const flag should've been cleared, but sanity check..
pxAssert(!GPR_IS_CONST1(_Rt_));
if (vf_used)
{
// store direct to state if rt is not used
const int rtreg = _allocIfUsedGPRtoXMM(_Rt_, MODE_WRITE);
if (rtreg >= 0)
xMOVAPS(xRegisterSSE(rtreg), xRegisterSSE(ftreg));
else
xMOVAPS(ptr128[&cpuRegs.GPR.r[_Rt_].UQ], xRegisterSSE(ftreg));
// don't cache vf00, microvu doesn't like it
if (_Rd_ == 0)
_freeXMMreg(ftreg);
}
else
{
_reallocateXMMreg(ftreg, XMMTYPE_GPRREG, _Rt_, MODE_WRITE, true);
}
}
static void recQMTC2()
@ -580,32 +595,49 @@ static void recQMTC2()
if (!_Rd_)
return;
if (!(cpuRegs.code & 1))
{
_freeX86reg(eax);
xMOV(eax, ptr32[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
xForwardJZ32 skipvuidle;
xSUB(eax, ptr32[&VU0.cycle]);
xSUB(eax, ptr32[&VU0.nextBlockCycles]);
xCMP(eax, 4);
xForwardJL32 skip;
_cop2BackupRegs();
xLoadFarAddr(arg1reg, CpuVU0);
xMOV(arg2reg, s_nBlockInterlocked);
xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg, arg2reg);
_cop2RestoreRegs();
skip.SetTarget();
skipvuidle.SetTarget();
if (g_pCurInstInfo->info & EEINST_COP2_SYNC_VU0)
mVUSyncVU0();
else if (g_pCurInstInfo->info & EEINST_COP2_FINISH_VU0)
mVUFinishVU0();
}
int rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ);
xMOVAPS(ptr128[&vu0Regs.VF[_Rd_]], xRegisterSSE(rtreg));
if (_Rt_)
{
// if we have to flush to memory anyway (has a constant or is x86), force load.
const bool vf_used = COP2INST_USEDTEST(_Rd_);
const bool can_rename = EEINST_RENAMETEST(_Rt_);
const int rtreg = (GPR_IS_DIRTY_CONST(_Rt_) || _hasX86reg(X86TYPE_GPR, _Rt_, MODE_WRITE)) ?
_allocGPRtoXMMreg(_Rt_, MODE_READ) :
_checkXMMreg(XMMTYPE_GPRREG, _Rt_, MODE_READ);
// NOTE: can't transfer xmm15 to VF, it's reserved for PQ.
int vfreg = _checkXMMreg(XMMTYPE_VFREG, _Rd_, MODE_WRITE);
if (can_rename && rtreg >= 0 && rtreg != xmmPQ.GetId())
{
// rt is no longer needed, so transfer to VF.
if (vfreg >= 0)
_freeXMMregWithoutWriteback(vfreg);
_reallocateXMMreg(rtreg, XMMTYPE_VFREG, _Rd_, MODE_WRITE, true);
}
else
{
// copy to VF.
if (vfreg < 0)
vfreg = _allocVFtoXMMreg(_Rd_, MODE_WRITE);
if (rtreg >= 0)
xMOVAPS(xRegisterSSE(vfreg), xRegisterSSE(rtreg));
else
xMOVAPS(xRegisterSSE(vfreg), ptr128[&cpuRegs.GPR.r[_Rt_].UQ]);
}
}
else
{
const int vfreg = _allocVFtoXMMreg(_Rd_, MODE_WRITE);
xPXOR(xRegisterSSE(vfreg), xRegisterSSE(vfreg));
}
}
//------------------------------------------------------------------
@ -669,22 +701,102 @@ void (*recCOP2SPECIAL2t[128])() = {
namespace R5900 {
namespace Dynarec {
namespace OpcodeImpl {
void recCOP2() { recCOP2t[_Rs_](); }
void recCOP2() { recCOP2t[_Rs_](); }
#if defined(LOADSTORE_RECOMPILE) && defined(CP2_RECOMPILE)
/*********************************************************
* Load and store for COP2 (VU0 unit) *
* Format: OP rt, offset(base) *
*********************************************************/
// LQC2: loads a quadword from memory at (rs + imm), masked to 16-byte
// alignment, into VF[rt]. A load to vf00 still performs the memory access but
// discards the result.
void recLQC2()
{
// sync or finish any running VU0 microprogram first, as flagged by the
// block analysis pass
if (g_pCurInstInfo->info & EEINST_COP2_SYNC_VU0)
mVUSyncVU0();
else if (g_pCurInstInfo->info & EEINST_COP2_FINISH_VU0)
mVUFinishVU0();
// the callback lets the vtlb emitter allocate the destination xmm itself,
// after it has done its own flushing
vtlb_ReadRegAllocCallback alloc_cb = nullptr;
if (_Rt_)
{
// init regalloc after flush
alloc_cb = []() { return _allocVFtoXMMreg(_Rt_, MODE_WRITE); };
}
int xmmreg;
if (GPR_IS_CONST1(_Rs_))
{
// base address known at compile time; fold the offset and align
const u32 addr = (g_cpuConstRegs[_Rs_].UL[0] + _Imm_) & ~0xFu;
xmmreg = vtlb_DynGenReadQuad_Const(128, addr, alloc_cb);
}
else
{
_eeMoveGPRtoR(arg1regd, _Rs_);
if (_Imm_ != 0)
xADD(arg1regd, _Imm_);
xAND(arg1regd, ~0xF);
xmmreg = vtlb_DynGenReadQuad(128, arg1regd.GetId(), alloc_cb);
}
// toss away if loading to vf00
if (!_Rt_)
_freeXMMreg(xmmreg);
EE::Profiler.EmitOp(eeOpcode::LQC2);
}
////////////////////////////////////////////////////
// SQC2: stores VF[rt] as a quadword to memory at (rs + imm), masked to
// 16-byte alignment.
void recSQC2()
{
// sync or finish any running VU0 microprogram first, as flagged by the
// block analysis pass
if (g_pCurInstInfo->info & EEINST_COP2_SYNC_VU0)
mVUSyncVU0();
else if (g_pCurInstInfo->info & EEINST_COP2_FINISH_VU0)
mVUFinishVU0();
// vf00 has to be special cased here, because of the microvu temps...
const int ftreg = _Rt_ ? _allocVFtoXMMreg(_Rt_, MODE_READ) : _allocTempXMMreg(XMMT_FPS);
if (!_Rt_)
xMOVAPS(xRegisterSSE(ftreg), ptr128[&vu0Regs.VF[0].F]);
if (GPR_IS_CONST1(_Rs_))
{
// base address known at compile time; fold the offset and align
const u32 addr = (g_cpuConstRegs[_Rs_].UL[0] + _Imm_) & ~0xFu;
vtlb_DynGenWrite_Const(128, true, addr, ftreg);
}
else
{
_eeMoveGPRtoR(arg1regd, _Rs_);
if (_Imm_ != 0)
xADD(arg1regd, _Imm_);
xAND(arg1regd, ~0xF);
vtlb_DynGenWrite(128, true, arg1regd.GetId(), ftreg);
}
// release the temp used for the vf00 special case
if (!_Rt_)
_freeXMMreg(ftreg);
EE::Profiler.EmitOp(eeOpcode::SQC2);
}
#else
REC_FUNC(LQC2);
REC_FUNC(SQC2);
#endif
} // namespace OpcodeImpl
} // namespace Dynarec
} // namespace R5900
void recCOP2_BC2() { recCOP2_BC2t[_Rt_](); }
void recCOP2_SPEC1()
{
if (g_pCurInstInfo->info & EEINST_COP2_FINISH_VU0_MICRO)
{
xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
xForwardJZ32 skipvuidle;
_cop2BackupRegs();
xFastCall((void*)_vu0FinishMicro);
_cop2RestoreRegs();
skipvuidle.SetTarget();
}
if (g_pCurInstInfo->info & (EEINST_COP2_SYNC_VU0 | EEINST_COP2_FINISH_VU0))
mVUFinishVU0();
recCOP2SPECIAL1t[_Funct_]();