From 230bce078b3f107e4d4e5eb73811cec50609a2e4 Mon Sep 17 00:00:00 2001 From: Prism Tutaj Date: Fri, 22 Dec 2017 18:48:32 -0600 Subject: [PATCH 01/18] Allow computers with non-Latin codepages to build the project --- premake5.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/premake5.lua b/premake5.lua index 593f0503d..e6eac9193 100644 --- a/premake5.lua +++ b/premake5.lua @@ -119,6 +119,7 @@ filter("platforms:Windows") "/wd4127", -- 'conditional expression is constant'. "/wd4324", -- 'structure was padded due to alignment specifier'. "/wd4189", -- 'local variable is initialized but not referenced'. + "/utf-8", -- 'build correctly on systems with non-Latin codepages'. }) flags({ "NoMinimalRebuild", -- Required for /MP above. From 53ed82db716b2cc260f6882dc85f046902f9d744 Mon Sep 17 00:00:00 2001 From: Parker Lamb Date: Mon, 8 Jan 2018 16:42:05 +0000 Subject: [PATCH 02/18] UI/Vulkan: Fixed graphics context recreation on HiDPI systems. --- src/xenia/ui/vulkan/vulkan_context.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/xenia/ui/vulkan/vulkan_context.cc b/src/xenia/ui/vulkan/vulkan_context.cc index e91c892ec..0df97b4b3 100644 --- a/src/xenia/ui/vulkan/vulkan_context.cc +++ b/src/xenia/ui/vulkan/vulkan_context.cc @@ -150,8 +150,8 @@ void VulkanContext::BeginSwap() { // If it has been, we'll need to reinitialize the swap chain before we // start touching it. if (target_window_) { - if (target_window_->width() != swap_chain_->surface_width() || - target_window_->height() != swap_chain_->surface_height()) { + if (target_window_->scaled_width() != swap_chain_->surface_width() || + target_window_->scaled_height() != swap_chain_->surface_height()) { // Resized! swap_chain_->Reinitialize(); } From 26212bffb0468a6dfe91ec759f901aff5df71779 Mon Sep 17 00:00:00 2001 From: DrChat Date: Wed, 24 Jan 2018 20:57:11 -0600 Subject: [PATCH 03/18] [x64 seq] Reuse the source variable rather than re-specifying the destination (to make things less confusing) --- src/xenia/cpu/backend/x64/x64_sequences.cc | 50 +++++++++++----------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index ec27b16da..9831eedef 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -2779,13 +2779,13 @@ struct SELECT_F32 Xmm src2 = i.src2.is_constant ? e.xmm2 : i.src2; if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm2, i.src2.constant()); + e.LoadConstantXmm(src2, i.src2.constant()); } e.vpandn(e.xmm1, e.xmm0, src2); Xmm src3 = i.src3.is_constant ? e.xmm2 : i.src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm2, i.src3.constant()); + e.LoadConstantXmm(src3, i.src3.constant()); } e.vpand(i.dest, e.xmm0, src3); e.vpor(i.dest, e.xmm1); @@ -2802,13 +2802,13 @@ struct SELECT_F64 Xmm src2 = i.src2.is_constant ? e.xmm2 : i.src2; if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm2, i.src2.constant()); + e.LoadConstantXmm(src2, i.src2.constant()); } e.vpandn(e.xmm1, e.xmm0, src2); Xmm src3 = i.src3.is_constant ? e.xmm2 : i.src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm2, i.src3.constant()); + e.LoadConstantXmm(src3, i.src3.constant()); } e.vpand(i.dest, e.xmm0, src3); e.vpor(i.dest, e.xmm1); @@ -2827,13 +2827,13 @@ struct SELECT_V128_I8 Xmm src2 = i.src2.is_constant ? 
e.xmm2 : i.src2; if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm2, i.src2.constant()); + e.LoadConstantXmm(src2, i.src2.constant()); } e.vpandn(e.xmm1, e.xmm0, src2); Xmm src3 = i.src3.is_constant ? e.xmm2 : i.src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm2, i.src3.constant()); + e.LoadConstantXmm(src3, i.src3.constant()); } e.vpand(i.dest, e.xmm0, src3); e.vpor(i.dest, e.xmm1); @@ -2845,18 +2845,18 @@ struct SELECT_V128_V128 static void Emit(X64Emitter& e, const EmitArgType& i) { Xmm src1 = i.src1.is_constant ? e.xmm1 : i.src1; if (i.src1.is_constant) { - e.LoadConstantXmm(e.xmm1, i.src1.constant()); + e.LoadConstantXmm(src1, i.src1.constant()); } Xmm src2 = i.src2.is_constant ? e.xmm0 : i.src2; if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.LoadConstantXmm(src2, i.src2.constant()); } e.vpandn(e.xmm0, src1, src2); Xmm src3 = i.src3.is_constant ? i.dest : i.src3; if (i.src3.is_constant) { - e.LoadConstantXmm(i.dest, i.src3.constant()); + e.LoadConstantXmm(src3, i.src3.constant()); } e.vpand(i.dest, src1, src3); @@ -4499,7 +4499,7 @@ struct MUL_ADD_F32 const Xmm& src2) { Xmm src3 = i.src3.is_constant ? e.xmm1 : i.src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm1, i.src3.constant()); + e.LoadConstantXmm(src3, i.src3.constant()); } if (i.dest == src1) { e.vfmadd213ss(i.dest, src2, src3); @@ -4516,8 +4516,8 @@ struct MUL_ADD_F32 } else { Xmm src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm1, i.src3.constant()); src3 = e.xmm1; + e.LoadConstantXmm(src3, i.src3.constant()); } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. src3 = i.src3; @@ -4558,7 +4558,7 @@ struct MUL_ADD_F64 const Xmm& src2) { Xmm src3 = i.src3.is_constant ? e.xmm1 : i.src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm1, i.src3.constant()); + e.LoadConstantXmm(src3, i.src3.constant()); } if (i.dest == src1) { e.vfmadd213sd(i.dest, src2, src3); @@ -4575,8 +4575,8 @@ struct MUL_ADD_F64 } else { Xmm src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm1, i.src3.constant()); src3 = e.xmm1; + e.LoadConstantXmm(src3, i.src3.constant()); } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. src3 = i.src3; @@ -4623,7 +4623,7 @@ struct MUL_ADD_V128 const Xmm& src2) { Xmm src3 = i.src3.is_constant ? e.xmm1 : i.src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm1, i.src3.constant()); + e.LoadConstantXmm(src3, i.src3.constant()); } if (i.dest == src1) { e.vfmadd213ps(i.dest, src2, src3); @@ -4640,8 +4640,8 @@ struct MUL_ADD_V128 } else { Xmm src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm1, i.src3.constant()); src3 = e.xmm1; + e.LoadConstantXmm(src3, i.src3.constant()); } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. src3 = i.src3; @@ -4696,7 +4696,7 @@ struct MUL_SUB_F32 const Xmm& src2) { Xmm src3 = i.src3.is_constant ? e.xmm1 : i.src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm1, i.src3.constant()); + e.LoadConstantXmm(src3, i.src3.constant()); } if (i.dest == src1) { e.vfmsub213ss(i.dest, src2, src3); @@ -4713,8 +4713,8 @@ struct MUL_SUB_F32 } else { Xmm src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm1, i.src3.constant()); src3 = e.xmm1; + e.LoadConstantXmm(src3, i.src3.constant()); } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. src3 = i.src3; @@ -4755,7 +4755,7 @@ struct MUL_SUB_F64 const Xmm& src2) { Xmm src3 = i.src3.is_constant ? 
e.xmm1 : i.src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm1, i.src3.constant()); + e.LoadConstantXmm(src3, i.src3.constant()); } if (i.dest == src1) { e.vfmsub213sd(i.dest, src2, src3); @@ -4772,8 +4772,8 @@ struct MUL_SUB_F64 } else { Xmm src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm1, i.src3.constant()); src3 = e.xmm1; + e.LoadConstantXmm(src3, i.src3.constant()); } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. src3 = i.src3; @@ -4818,7 +4818,7 @@ struct MUL_SUB_V128 const Xmm& src2) { Xmm src3 = i.src3.is_constant ? e.xmm1 : i.src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm1, i.src3.constant()); + e.LoadConstantXmm(src3, i.src3.constant()); } if (i.dest == src1) { e.vfmsub213ps(i.dest, src2, src3); @@ -4835,8 +4835,8 @@ struct MUL_SUB_V128 } else { Xmm src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm1, i.src3.constant()); src3 = e.xmm1; + e.LoadConstantXmm(src3, i.src3.constant()); } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. src3 = i.src3; @@ -6877,8 +6877,8 @@ struct SWIZZLE uint8_t swizzle_mask = static_cast(i.src2.value); Xmm src1; if (i.src1.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src1.constant()); src1 = e.xmm0; + e.LoadConstantXmm(src1, i.src1.constant()); } else { src1 = i.src1; } @@ -7135,7 +7135,7 @@ struct PACK : Sequence> { // PACKUSWB / SaturateSignedWordToUnsignedByte Xbyak::Xmm src2 = i.src2.is_constant ? e.xmm0 : i.src2; if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.LoadConstantXmm(src2, i.src2.constant()); } e.vpackuswb(i.dest, i.src1, src2); @@ -7241,8 +7241,8 @@ struct PACK : Sequence> { src2 = i.src2; } else { assert_false(i.src1 == e.xmm0); - e.LoadConstantXmm(e.xmm0, i.src2.constant()); src2 = e.xmm0; + e.LoadConstantXmm(src2, i.src2.constant()); } e.vpackssdw(i.dest, i.src1, src2); e.vpshuflw(i.dest, i.dest, 0b10110001); @@ -7352,8 +7352,8 @@ struct UNPACK : Sequence> { } else { Xmm src; if (i.src1.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src1.constant()); src = e.xmm0; + e.LoadConstantXmm(src, i.src1.constant()); } else { src = i.src1; } From 7ea33816999e3f217c3a0d1663162d39712aade1 Mon Sep 17 00:00:00 2001 From: DrChat Date: Wed, 7 Feb 2018 18:40:12 -0600 Subject: [PATCH 04/18] [Vulkan] texture_bindings_ -> texture_sets_ --- src/xenia/gpu/vulkan/texture_cache.cc | 7 +++---- src/xenia/gpu/vulkan/texture_cache.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc index 3a684bb5e..e8fe61181 100644 --- a/src/xenia/gpu/vulkan/texture_cache.cc +++ b/src/xenia/gpu/vulkan/texture_cache.cc @@ -1329,8 +1329,7 @@ VkDescriptorSet TextureCache::PrepareTextureSet( HashTextureBindings(&hash_state, fetch_mask, vertex_bindings); HashTextureBindings(&hash_state, fetch_mask, pixel_bindings); uint64_t hash = XXH64_digest(&hash_state); - for (auto it = texture_bindings_.find(hash); it != texture_bindings_.end(); - ++it) { + for (auto it = texture_sets_.find(hash); it != texture_sets_.end(); ++it) { // TODO(DrChat): We need to compare the bindings and ensure they're equal. 
return it->second; } @@ -1378,7 +1377,7 @@ VkDescriptorSet TextureCache::PrepareTextureSet( update_set_info->image_writes, 0, nullptr); } - texture_bindings_[hash] = descriptor_set; + texture_sets_[hash] = descriptor_set; return descriptor_set; } @@ -1515,7 +1514,7 @@ void TextureCache::Scavenge() { // Free unused descriptor sets // TODO(DrChat): These sets could persist across frames, we just need a smart // way to detect if they're unused and free them. - texture_bindings_.clear(); + texture_sets_.clear(); descriptor_pool_->Scavenge(); staging_buffer_.Scavenge(); diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h index 483b88bd2..dcc9894ed 100644 --- a/src/xenia/gpu/vulkan/texture_cache.h +++ b/src/xenia/gpu/vulkan/texture_cache.h @@ -188,7 +188,7 @@ class TextureCache { std::unique_ptr wb_command_pool_ = nullptr; std::unique_ptr descriptor_pool_ = nullptr; - std::unordered_map texture_bindings_; + std::unordered_map texture_sets_; VkDescriptorSetLayout texture_descriptor_set_layout_ = nullptr; VmaAllocator mem_allocator_ = nullptr; From b38b9e43ea1b52e151864e7445144134bc7996f9 Mon Sep 17 00:00:00 2001 From: DrChat Date: Wed, 7 Feb 2018 18:40:41 -0600 Subject: [PATCH 05/18] [Vulkan] Don't bother re-hashing previously covered bindings --- src/xenia/gpu/vulkan/texture_cache.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc index e8fe61181..aa049788f 100644 --- a/src/xenia/gpu/vulkan/texture_cache.cc +++ b/src/xenia/gpu/vulkan/texture_cache.cc @@ -1306,6 +1306,7 @@ void TextureCache::HashTextureBindings( // We've covered this binding. continue; } + fetch_mask |= fetch_bit; auto& regs = *register_file_; int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + binding.fetch_constant * 6; From 9e79babde0cbe8042b6689f2998eaf8bd3cae113 Mon Sep 17 00:00:00 2001 From: DrChat Date: Wed, 7 Feb 2018 18:55:26 -0600 Subject: [PATCH 06/18] Formatting. --- src/xenia/cpu/backend/x64/x64_sequences.cc | 240 ++++++++++----------- 1 file changed, 120 insertions(+), 120 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 9831eedef..0bd483caf 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -4493,26 +4493,26 @@ struct MUL_ADD_F32 // FMA extension if (e.IsFeatureEnabled(kX64EmitFMA)) { - EmitCommutativeBinaryXmmOp( - e, i, - [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, - const Xmm& src2) { - Xmm src3 = i.src3.is_constant ? e.xmm1 : i.src3; - if (i.src3.is_constant) { - e.LoadConstantXmm(src3, i.src3.constant()); - } - if (i.dest == src1) { - e.vfmadd213ss(i.dest, src2, src3); - } else if (i.dest == src2) { - e.vfmadd213ss(i.dest, src1, src3); - } else if (i.dest == i.src3) { - e.vfmadd231ss(i.dest, src1, src2); - } else { - // Dest not equal to anything - e.vmovss(i.dest, src1); - e.vfmadd213ss(i.dest, src2, src3); - } - }); + EmitCommutativeBinaryXmmOp(e, i, + [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + Xmm src3 = + i.src3.is_constant ? 
e.xmm1 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(src3, i.src3.constant()); + } + if (i.dest == src1) { + e.vfmadd213ss(i.dest, src2, src3); + } else if (i.dest == src2) { + e.vfmadd213ss(i.dest, src1, src3); + } else if (i.dest == i.src3) { + e.vfmadd231ss(i.dest, src1, src2); + } else { + // Dest not equal to anything + e.vmovss(i.dest, src1); + e.vfmadd213ss(i.dest, src2, src3); + } + }); } else { Xmm src3; if (i.src3.is_constant) { @@ -4552,26 +4552,26 @@ struct MUL_ADD_F64 // FMA extension if (e.IsFeatureEnabled(kX64EmitFMA)) { - EmitCommutativeBinaryXmmOp( - e, i, - [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, - const Xmm& src2) { - Xmm src3 = i.src3.is_constant ? e.xmm1 : i.src3; - if (i.src3.is_constant) { - e.LoadConstantXmm(src3, i.src3.constant()); - } - if (i.dest == src1) { - e.vfmadd213sd(i.dest, src2, src3); - } else if (i.dest == src2) { - e.vfmadd213sd(i.dest, src1, src3); - } else if (i.dest == i.src3) { - e.vfmadd231sd(i.dest, src1, src2); - } else { - // Dest not equal to anything - e.vmovsd(i.dest, src1); - e.vfmadd213sd(i.dest, src2, src3); - } - }); + EmitCommutativeBinaryXmmOp(e, i, + [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + Xmm src3 = + i.src3.is_constant ? e.xmm1 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(src3, i.src3.constant()); + } + if (i.dest == src1) { + e.vfmadd213sd(i.dest, src2, src3); + } else if (i.dest == src2) { + e.vfmadd213sd(i.dest, src1, src3); + } else if (i.dest == i.src3) { + e.vfmadd231sd(i.dest, src1, src2); + } else { + // Dest not equal to anything + e.vmovsd(i.dest, src1); + e.vfmadd213sd(i.dest, src2, src3); + } + }); } else { Xmm src3; if (i.src3.is_constant) { @@ -4617,26 +4617,26 @@ struct MUL_ADD_V128 // than vmul+vadd and it'd be nice to know why. Until we know, it's // disabled so tests pass. if (false && e.IsFeatureEnabled(kX64EmitFMA)) { - EmitCommutativeBinaryXmmOp( - e, i, - [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, - const Xmm& src2) { - Xmm src3 = i.src3.is_constant ? e.xmm1 : i.src3; - if (i.src3.is_constant) { - e.LoadConstantXmm(src3, i.src3.constant()); - } - if (i.dest == src1) { - e.vfmadd213ps(i.dest, src2, src3); - } else if (i.dest == src2) { - e.vfmadd213ps(i.dest, src1, src3); - } else if (i.dest == i.src3) { - e.vfmadd231ps(i.dest, src1, src2); - } else { - // Dest not equal to anything - e.vmovdqa(i.dest, src1); - e.vfmadd213ps(i.dest, src2, src3); - } - }); + EmitCommutativeBinaryXmmOp(e, i, + [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + Xmm src3 = + i.src3.is_constant ? e.xmm1 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(src3, i.src3.constant()); + } + if (i.dest == src1) { + e.vfmadd213ps(i.dest, src2, src3); + } else if (i.dest == src2) { + e.vfmadd213ps(i.dest, src1, src3); + } else if (i.dest == i.src3) { + e.vfmadd231ps(i.dest, src1, src2); + } else { + // Dest not equal to anything + e.vmovdqa(i.dest, src1); + e.vfmadd213ps(i.dest, src2, src3); + } + }); } else { Xmm src3; if (i.src3.is_constant) { @@ -4690,26 +4690,26 @@ struct MUL_SUB_F32 // FMA extension if (e.IsFeatureEnabled(kX64EmitFMA)) { - EmitCommutativeBinaryXmmOp( - e, i, - [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, - const Xmm& src2) { - Xmm src3 = i.src3.is_constant ? 
e.xmm1 : i.src3; - if (i.src3.is_constant) { - e.LoadConstantXmm(src3, i.src3.constant()); - } - if (i.dest == src1) { - e.vfmsub213ss(i.dest, src2, src3); - } else if (i.dest == src2) { - e.vfmsub213ss(i.dest, src1, src3); - } else if (i.dest == i.src3) { - e.vfmsub231ss(i.dest, src1, src2); - } else { - // Dest not equal to anything - e.vmovss(i.dest, src1); - e.vfmsub213ss(i.dest, src2, src3); - } - }); + EmitCommutativeBinaryXmmOp(e, i, + [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + Xmm src3 = + i.src3.is_constant ? e.xmm1 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(src3, i.src3.constant()); + } + if (i.dest == src1) { + e.vfmsub213ss(i.dest, src2, src3); + } else if (i.dest == src2) { + e.vfmsub213ss(i.dest, src1, src3); + } else if (i.dest == i.src3) { + e.vfmsub231ss(i.dest, src1, src2); + } else { + // Dest not equal to anything + e.vmovss(i.dest, src1); + e.vfmsub213ss(i.dest, src2, src3); + } + }); } else { Xmm src3; if (i.src3.is_constant) { @@ -4749,26 +4749,26 @@ struct MUL_SUB_F64 // FMA extension if (e.IsFeatureEnabled(kX64EmitFMA)) { - EmitCommutativeBinaryXmmOp( - e, i, - [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, - const Xmm& src2) { - Xmm src3 = i.src3.is_constant ? e.xmm1 : i.src3; - if (i.src3.is_constant) { - e.LoadConstantXmm(src3, i.src3.constant()); - } - if (i.dest == src1) { - e.vfmsub213sd(i.dest, src2, src3); - } else if (i.dest == src2) { - e.vfmsub213sd(i.dest, src1, src3); - } else if (i.dest == i.src3) { - e.vfmsub231sd(i.dest, src1, src2); - } else { - // Dest not equal to anything - e.vmovsd(i.dest, src1); - e.vfmsub213sd(i.dest, src2, src3); - } - }); + EmitCommutativeBinaryXmmOp(e, i, + [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + Xmm src3 = + i.src3.is_constant ? e.xmm1 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(src3, i.src3.constant()); + } + if (i.dest == src1) { + e.vfmsub213sd(i.dest, src2, src3); + } else if (i.dest == src2) { + e.vfmsub213sd(i.dest, src1, src3); + } else if (i.dest == i.src3) { + e.vfmsub231sd(i.dest, src1, src2); + } else { + // Dest not equal to anything + e.vmovsd(i.dest, src1); + e.vfmsub213sd(i.dest, src2, src3); + } + }); } else { Xmm src3; if (i.src3.is_constant) { @@ -4812,26 +4812,26 @@ struct MUL_SUB_V128 // FMA extension if (e.IsFeatureEnabled(kX64EmitFMA)) { - EmitCommutativeBinaryXmmOp( - e, i, - [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, - const Xmm& src2) { - Xmm src3 = i.src3.is_constant ? e.xmm1 : i.src3; - if (i.src3.is_constant) { - e.LoadConstantXmm(src3, i.src3.constant()); - } - if (i.dest == src1) { - e.vfmsub213ps(i.dest, src2, src3); - } else if (i.dest == src2) { - e.vfmsub213ps(i.dest, src1, src3); - } else if (i.dest == i.src3) { - e.vfmsub231ps(i.dest, src1, src2); - } else { - // Dest not equal to anything - e.vmovdqa(i.dest, src1); - e.vfmsub213ps(i.dest, src2, src3); - } - }); + EmitCommutativeBinaryXmmOp(e, i, + [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + Xmm src3 = + i.src3.is_constant ? 
e.xmm1 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(src3, i.src3.constant()); + } + if (i.dest == src1) { + e.vfmsub213ps(i.dest, src2, src3); + } else if (i.dest == src2) { + e.vfmsub213ps(i.dest, src1, src3); + } else if (i.dest == i.src3) { + e.vfmsub231ps(i.dest, src1, src2); + } else { + // Dest not equal to anything + e.vmovdqa(i.dest, src1); + e.vfmsub213ps(i.dest, src2, src3); + } + }); } else { Xmm src3; if (i.src3.is_constant) { From 455dfeb39f15ea191d9000cc5960d5673bff0d66 Mon Sep 17 00:00:00 2001 From: DrChat Date: Fri, 9 Feb 2018 16:58:56 -0600 Subject: [PATCH 07/18] [Vulkan] Use a static function for texture invalidation callbacks --- src/xenia/gpu/vulkan/texture_cache.cc | 49 ++++++++++----------------- src/xenia/gpu/vulkan/texture_cache.h | 3 ++ 2 files changed, 21 insertions(+), 31 deletions(-) diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc index aa049788f..2577b6774 100644 --- a/src/xenia/gpu/vulkan/texture_cache.cc +++ b/src/xenia/gpu/vulkan/texture_cache.cc @@ -364,6 +364,22 @@ bool TextureCache::FreeTexture(Texture* texture) { return true; } +void TextureCache::WatchCallback(void* context_ptr, void* data_ptr, + uint32_t address) { + auto self = reinterpret_cast(context_ptr); + auto touched_texture = reinterpret_cast(data_ptr); + // Clear watch handle first so we don't redundantly + // remove. + assert_not_zero(touched_texture->access_watch_handle); + touched_texture->access_watch_handle = 0; + touched_texture->pending_invalidation = true; + + // Add to pending list so Scavenge will clean it up. + self->invalidated_textures_mutex_.lock(); + self->invalidated_textures_->push_back(touched_texture); + self->invalidated_textures_mutex_.unlock(); +} + TextureCache::Texture* TextureCache::DemandResolveTexture( const TextureInfo& texture_info) { auto texture_hash = texture_info.hash(); @@ -411,22 +427,7 @@ TextureCache::Texture* TextureCache::DemandResolveTexture( // Setup an access watch. If this texture is touched, it is destroyed. texture->access_watch_handle = memory_->AddPhysicalAccessWatch( texture_info.guest_address, texture_info.input_length, - cpu::MMIOHandler::kWatchWrite, - [](void* context_ptr, void* data_ptr, uint32_t address) { - auto self = reinterpret_cast(context_ptr); - auto touched_texture = reinterpret_cast(data_ptr); - // Clear watch handle first so we don't redundantly - // remove. - assert_not_zero(touched_texture->access_watch_handle); - touched_texture->access_watch_handle = 0; - touched_texture->pending_invalidation = true; - - // Add to pending list so Scavenge will clean it up. - self->invalidated_textures_mutex_.lock(); - self->invalidated_textures_->push_back(touched_texture); - self->invalidated_textures_mutex_.unlock(); - }, - this, texture); + cpu::MMIOHandler::kWatchWrite, &WatchCallback, this, texture); textures_[texture_hash] = texture; return texture; @@ -486,21 +487,7 @@ TextureCache::Texture* TextureCache::Demand(const TextureInfo& texture_info, // guest. texture->access_watch_handle = memory_->AddPhysicalAccessWatch( texture_info.guest_address, texture_info.input_length, - cpu::MMIOHandler::kWatchWrite, - [](void* context_ptr, void* data_ptr, uint32_t address) { - auto self = reinterpret_cast(context_ptr); - auto touched_texture = reinterpret_cast(data_ptr); - // Clear watch handle first so we don't redundantly - // remove. 
- assert_not_zero(touched_texture->access_watch_handle); - touched_texture->access_watch_handle = 0; - touched_texture->pending_invalidation = true; - // Add to pending list so Scavenge will clean it up. - self->invalidated_textures_mutex_.lock(); - self->invalidated_textures_->push_back(touched_texture); - self->invalidated_textures_mutex_.unlock(); - }, - this, texture); + cpu::MMIOHandler::kWatchWrite, &WatchCallback, this, texture); if (!UploadTexture(command_buffer, completion_fence, texture, texture_info)) { FreeTexture(texture); diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h index dcc9894ed..488924bb4 100644 --- a/src/xenia/gpu/vulkan/texture_cache.h +++ b/src/xenia/gpu/vulkan/texture_cache.h @@ -134,6 +134,9 @@ class TextureCache { VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT); bool FreeTexture(Texture* texture); + static void WatchCallback(void* context_ptr, void* data_ptr, + uint32_t address); + // Demands a texture. If command_buffer is null and the texture hasn't been // uploaded to graphics memory already, we will return null and bail. Texture* Demand(const TextureInfo& texture_info, From 7da973d9e6d1e029f21007e3883fe4abb20196b7 Mon Sep 17 00:00:00 2001 From: DrChat Date: Fri, 9 Feb 2018 17:21:35 -0600 Subject: [PATCH 08/18] [CPU] Handle cases in access watches where base addresses overlap --- src/xenia/cpu/mmio_handler.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/xenia/cpu/mmio_handler.cc b/src/xenia/cpu/mmio_handler.cc index a72545a9c..f12cb65d9 100644 --- a/src/xenia/cpu/mmio_handler.cc +++ b/src/xenia/cpu/mmio_handler.cc @@ -118,16 +118,16 @@ uintptr_t MMIOHandler::AddPhysicalAccessWatch(uint32_t guest_address, bool hit = false; auto entry = *it; - if (base_address < (*it)->address && + if (base_address <= (*it)->address && base_address + length > (*it)->address) { hit = true; - } else if ((*it)->address < base_address && + } else if ((*it)->address <= base_address && (*it)->address + (*it)->length > base_address) { hit = true; - } else if ((*it)->address < base_address && + } else if ((*it)->address <= base_address && (*it)->address + (*it)->length > base_address + length) { hit = true; - } else if ((*it)->address > base_address && + } else if ((*it)->address >= base_address && (*it)->address + (*it)->length < base_address + length) { hit = true; } From 190108dab6732d9db5c8595e1358b7b034cb5a14 Mon Sep 17 00:00:00 2001 From: DrChat Date: Fri, 9 Feb 2018 18:26:12 -0600 Subject: [PATCH 09/18] [CPU] Add some more docs to MMIO handler functions --- src/xenia/cpu/mmio_handler.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/xenia/cpu/mmio_handler.h b/src/xenia/cpu/mmio_handler.h index 1e2f433cc..e68a2e276 100644 --- a/src/xenia/cpu/mmio_handler.h +++ b/src/xenia/cpu/mmio_handler.h @@ -73,7 +73,11 @@ class MMIOHandler { WatchType type, AccessWatchCallback callback, void* callback_context, void* callback_data); void CancelAccessWatch(uintptr_t watch_handle); + + // Fires and clears any access watches that overlap this range. void InvalidateRange(uint32_t physical_address, size_t length); + + // Returns true if /any/ part of this range is watched. 
bool IsRangeWatched(uint32_t physical_address, size_t length); protected: From e2bbae3896a9c4c0b8635f4c24720b314163443d Mon Sep 17 00:00:00 2001 From: DrChat Date: Tue, 13 Feb 2018 12:50:33 -0600 Subject: [PATCH 10/18] [JIT] Don't bother using a temp for constant addresses < 0x80000000 --- src/xenia/cpu/backend/x64/x64_sequences.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 0bd483caf..03a609ccb 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -2139,8 +2139,13 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { // TODO(benvanik): figure out how to do this without a temp. // Since the constant is often 0x8... if we tried to use that as a // displacement it would be sign extended and mess things up. - e.mov(e.eax, static_cast(guest.constant())); - return e.GetMembaseReg() + e.rax; + uint32_t address = static_cast(guest.constant()); + if (address < 0x80000000) { + return e.GetMembaseReg() + address; + } else { + e.mov(e.eax, address); + return e.GetMembaseReg() + e.rax; + } } else { // Clear the top 32 bits, as they are likely garbage. // TODO(benvanik): find a way to avoid doing this. From 1d0b290c3de5b1ea3e79bd96d3ce72d08e91161f Mon Sep 17 00:00:00 2001 From: DrChat Date: Tue, 13 Feb 2018 13:31:43 -0600 Subject: [PATCH 11/18] [JIT] Remove all calls to ReloadMembase / ReloadContext (rcx/rdx free now) --- src/xenia/cpu/backend/x64/x64_sequences.cc | 51 ---------------------- 1 file changed, 51 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 03a609ccb..80da6a7eb 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -1663,7 +1663,6 @@ struct LOAD_VECTOR_SHL_I8 e.shl(e.dx, 4); e.mov(e.rax, (uintptr_t)lvsl_table); e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); - e.ReloadMembase(); } } }; @@ -1705,7 +1704,6 @@ struct LOAD_VECTOR_SHR_I8 e.shl(e.dx, 4); e.mov(e.rax, (uintptr_t)lvsr_table); e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); - e.ReloadMembase(); } } }; @@ -3868,8 +3866,6 @@ struct MUL_I8 : Sequence> { e.mov(i.dest, e.al); } } - - e.ReloadMembase(); } }; struct MUL_I16 : Sequence> { @@ -3911,8 +3907,6 @@ struct MUL_I16 : Sequence> { e.movzx(i.dest, e.ax); } } - - e.ReloadMembase(); } }; struct MUL_I32 : Sequence> { @@ -3955,8 +3949,6 @@ struct MUL_I32 : Sequence> { e.mov(i.dest, e.eax); } } - - e.ReloadMembase(); } }; struct MUL_I64 : Sequence> { @@ -3998,8 +3990,6 @@ struct MUL_I64 : Sequence> { e.mov(i.dest, e.rax); } } - - e.ReloadMembase(); } }; struct MUL_F32 : Sequence> { @@ -4077,7 +4067,6 @@ struct MUL_HI_I8 : Sequence> { } e.mov(i.dest, e.ah); } - e.ReloadMembase(); } }; struct MUL_HI_I16 @@ -4121,7 +4110,6 @@ struct MUL_HI_I16 } e.mov(i.dest, e.dx); } - e.ReloadMembase(); } }; struct MUL_HI_I32 @@ -4170,7 +4158,6 @@ struct MUL_HI_I32 } e.mov(i.dest, e.edx); } - e.ReloadMembase(); } }; struct MUL_HI_I64 @@ -4219,7 +4206,6 @@ struct MUL_HI_I64 } e.mov(i.dest, e.rdx); } - e.ReloadMembase(); } }; EMITTER_OPCODE_TABLE(OPCODE_MUL_HI, MUL_HI_I8, MUL_HI_I16, MUL_HI_I32, @@ -4235,11 +4221,8 @@ struct DIV_I8 : Sequence> { Xbyak::Label skip; e.inLocalLabel(); - // NOTE: RDX clobbered. 
- bool clobbered_rcx = false; if (i.src2.is_constant) { assert_true(!i.src1.is_constant); - clobbered_rcx = true; e.mov(e.cl, i.src2.constant()); if (i.instr->flags & ARITHMETIC_UNSIGNED) { e.movzx(e.ax, i.src1); @@ -4273,10 +4256,6 @@ struct DIV_I8 : Sequence> { e.L(skip); e.outLocalLabel(); e.mov(i.dest, e.al); - if (clobbered_rcx) { - e.ReloadContext(); - } - e.ReloadMembase(); } }; struct DIV_I16 : Sequence> { @@ -4284,11 +4263,8 @@ struct DIV_I16 : Sequence> { Xbyak::Label skip; e.inLocalLabel(); - // NOTE: RDX clobbered. - bool clobbered_rcx = false; if (i.src2.is_constant) { assert_true(!i.src1.is_constant); - clobbered_rcx = true; e.mov(e.cx, i.src2.constant()); if (i.instr->flags & ARITHMETIC_UNSIGNED) { e.mov(e.ax, i.src1); @@ -4328,10 +4304,6 @@ struct DIV_I16 : Sequence> { e.L(skip); e.outLocalLabel(); e.mov(i.dest, e.ax); - if (clobbered_rcx) { - e.ReloadContext(); - } - e.ReloadMembase(); } }; struct DIV_I32 : Sequence> { @@ -4339,11 +4311,8 @@ struct DIV_I32 : Sequence> { Xbyak::Label skip; e.inLocalLabel(); - // NOTE: RDX clobbered. - bool clobbered_rcx = false; if (i.src2.is_constant) { assert_true(!i.src1.is_constant); - clobbered_rcx = true; e.mov(e.ecx, i.src2.constant()); if (i.instr->flags & ARITHMETIC_UNSIGNED) { e.mov(e.eax, i.src1); @@ -4383,10 +4352,6 @@ struct DIV_I32 : Sequence> { e.L(skip); e.outLocalLabel(); e.mov(i.dest, e.eax); - if (clobbered_rcx) { - e.ReloadContext(); - } - e.ReloadMembase(); } }; struct DIV_I64 : Sequence> { @@ -4394,11 +4359,8 @@ struct DIV_I64 : Sequence> { Xbyak::Label skip; e.inLocalLabel(); - // NOTE: RDX clobbered. - bool clobbered_rcx = false; if (i.src2.is_constant) { assert_true(!i.src1.is_constant); - clobbered_rcx = true; e.mov(e.rcx, i.src2.constant()); if (i.instr->flags & ARITHMETIC_UNSIGNED) { e.mov(e.rax, i.src1); @@ -4438,10 +4400,6 @@ struct DIV_I64 : Sequence> { e.L(skip); e.outLocalLabel(); e.mov(i.dest, e.rax); - if (clobbered_rcx) { - e.ReloadContext(); - } - e.ReloadMembase(); } }; struct DIV_F32 : Sequence> { @@ -5324,7 +5282,6 @@ void EmitShlXX(X64Emitter& e, const ARGS& i) { } else { e.mov(e.cl, src); e.shl(dest_src, e.cl); - e.ReloadContext(); } }, [](X64Emitter& e, const REG& dest_src, int8_t constant) { @@ -5402,7 +5359,6 @@ void EmitShrXX(X64Emitter& e, const ARGS& i) { } else { e.mov(e.cl, src); e.shr(dest_src, e.cl); - e.ReloadContext(); } }, [](X64Emitter& e, const REG& dest_src, int8_t constant) { @@ -5478,7 +5434,6 @@ void EmitSarXX(X64Emitter& e, const ARGS& i) { } else { e.mov(e.cl, src); e.sar(dest_src, e.cl); - e.ReloadContext(); } }, [](X64Emitter& e, const REG& dest_src, int8_t constant) { @@ -6093,7 +6048,6 @@ void EmitRotateLeftXX(X64Emitter& e, const ARGS& i) { } } e.rol(i.dest, e.cl); - e.ReloadContext(); } } struct ROTATE_LEFT_I8 @@ -6584,7 +6538,6 @@ struct EXTRACT_I32 e.vmovaps(e.xmm0, e.ptr[e.rdx + e.rax]); e.vpshufb(e.xmm0, src1, e.xmm0); e.vpextrd(i.dest, e.xmm0, 0); - e.ReloadMembase(); } } }; @@ -7624,8 +7577,6 @@ struct ATOMIC_COMPARE_EXCHANGE_I32 e.lock(); e.cmpxchg(e.dword[e.GetMembaseReg() + e.rcx], i.src3); e.sete(i.dest); - - e.ReloadContext(); } }; struct ATOMIC_COMPARE_EXCHANGE_I64 @@ -7637,8 +7588,6 @@ struct ATOMIC_COMPARE_EXCHANGE_I64 e.lock(); e.cmpxchg(e.qword[e.GetMembaseReg() + e.rcx], i.src3); e.sete(i.dest); - - e.ReloadContext(); } }; EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_COMPARE_EXCHANGE, From f20338004041cc8c29c9c02add7121d7811d323a Mon Sep 17 00:00:00 2001 From: DrChat Date: Tue, 13 Feb 2018 21:03:53 -0600 Subject: [PATCH 12/18] [Travis] Update libvulkan version --- 
.travis.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 1003f5409..87b692bfb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -29,6 +29,9 @@ addons: #- libvulkan-dev - libx11-dev - liblz4-dev + +env: + - LIBVULKAN_VERSION=1.0.61.1 matrix: include: @@ -48,9 +51,9 @@ before_script: - $CXX --version - python3 --version # Add Vulkan dependencies - - travis_retry wget http://mirrors.kernel.org/ubuntu/pool/universe/v/vulkan/libvulkan1_1.0.42.0+dfsg1-1ubuntu1~16.04.1_amd64.deb - - travis_retry wget http://mirrors.kernel.org/ubuntu/pool/universe/v/vulkan/libvulkan-dev_1.0.42.0+dfsg1-1ubuntu1~16.04.1_amd64.deb - - if [[ $BUILD == true ]]; then sudo dpkg -i libvulkan1_1.0.42.0+dfsg1-1ubuntu1~16.04.1_amd64.deb libvulkan-dev_1.0.42.0+dfsg1-1ubuntu1~16.04.1_amd64.deb; fi + - travis_retry wget http://mirrors.kernel.org/ubuntu/pool/universe/v/vulkan/libvulkan1_$LIBVULKAN_VERSION+dfsg1-1ubuntu1~16.04.1_amd64.deb + - travis_retry wget http://mirrors.kernel.org/ubuntu/pool/universe/v/vulkan/libvulkan-dev_$LIBVULKAN_VERSION+dfsg1-1ubuntu1~16.04.1_amd64.deb + - if [[ $BUILD == true ]]; then sudo dpkg -i libvulkan1_$LIBVULKAN_VERSION+dfsg1-1ubuntu1~16.04.1_amd64.deb libvulkan-dev_$LIBVULKAN_VERSION+dfsg1-1ubuntu1~16.04.1_amd64.deb; fi # Prepare environment (pull dependencies, build tools). - travis_retry ./xenia-build setup From bbafcc089db7d89def739e3faceac4d669d74649 Mon Sep 17 00:00:00 2001 From: DrChat Date: Wed, 14 Feb 2018 07:44:21 -0600 Subject: [PATCH 13/18] [Travis] Fix Travis --- .travis.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 87b692bfb..430b2f08a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -29,9 +29,6 @@ addons: #- libvulkan-dev - libx11-dev - liblz4-dev - -env: - - LIBVULKAN_VERSION=1.0.61.1 matrix: include: @@ -45,6 +42,7 @@ git: submodules: false before_script: + - export LIBVULKAN_VERSION=1.0.61.1 - export CXX=$CXX_COMPILER - export CC=$C_COMPILER # Dump useful info. From 02b5a07bc9e2df713a42198ab09506669977454d Mon Sep 17 00:00:00 2001 From: DrChat Date: Wed, 14 Feb 2018 13:50:57 -0600 Subject: [PATCH 14/18] [JIT] rlwinmx: Use Truncate/ZeroExtend instead of And 0xFFFFFFFF --- src/xenia/cpu/ppc/ppc_emit_alu.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/xenia/cpu/ppc/ppc_emit_alu.cc b/src/xenia/cpu/ppc/ppc_emit_alu.cc index 265fb1267..9dbe68286 100644 --- a/src/xenia/cpu/ppc/ppc_emit_alu.cc +++ b/src/xenia/cpu/ppc/ppc_emit_alu.cc @@ -984,8 +984,10 @@ int InstrEmit_rlwinmx(PPCHIRBuilder& f, const InstrData& i) { // m <- MASK(MB+32, ME+32) // RA <- r & m Value* v = f.LoadGPR(i.M.RT); + // (x||x) - v = f.Or(f.Shl(v, 32), f.And(v, f.LoadConstantUint64(0xFFFFFFFF))); + v = f.Or(f.Shl(v, 32), f.ZeroExtend(f.Truncate(v, INT32_TYPE), INT64_TYPE)); + // TODO(benvanik): optimize srwi // TODO(benvanik): optimize slwi // The compiler will generate a bunch of these for the special case of SH=0. 
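
Both forms of the (x||x) duplication above compute the same value; the
difference is what the x64 backend can emit for them. A 64-bit AND cannot
encode 0xFFFFFFFF as a sign-extended 32-bit immediate (it would sign-extend
to all ones), so the old And-with-constant form makes the JIT materialize
the mask in a scratch register first, while Truncate followed by ZeroExtend
maps onto a single 32-bit register move, which implicitly clears the upper
32 bits. A rough sketch of the two lowerings in emitter style -- the
register choices are illustrative only, not what the register allocator
actually picks:

    // And(v, 0xFFFFFFFF): the mask has to be loaded before the AND.
    e.mov(e.rcx, 0xFFFFFFFFull);
    e.and_(e.rax, e.rcx);

    // ZeroExtend(Truncate(v, INT32_TYPE)): a single move; writing eax
    // zeroes bits 63:32 of rax on x64.
    e.mov(e.eax, e.eax);
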
From 7818cdab60c571f162b8bb752a1e817b1db108f2 Mon Sep 17 00:00:00 2001 From: DrChat Date: Wed, 14 Feb 2018 16:21:16 -0600 Subject: [PATCH 15/18] [CPU] Check for RawModule memory allocation success --- src/xenia/cpu/raw_module.cc | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/xenia/cpu/raw_module.cc b/src/xenia/cpu/raw_module.cc index 583a66922..ae51067c7 100644 --- a/src/xenia/cpu/raw_module.cc +++ b/src/xenia/cpu/raw_module.cc @@ -33,15 +33,18 @@ bool RawModule::LoadFile(uint32_t base_address, const std::wstring& path) { // Allocate memory. // Since we have no real heap just load it wherever. base_address_ = base_address; - memory_->LookupHeap(base_address_) - ->AllocFixed(base_address_, file_length, 0, - kMemoryAllocationReserve | kMemoryAllocationCommit, - kMemoryProtectRead | kMemoryProtectWrite); + auto heap = memory_->LookupHeap(base_address_); + if (!heap || + !heap->AllocFixed(base_address_, file_length, 0, + kMemoryAllocationReserve | kMemoryAllocationCommit, + kMemoryProtectRead | kMemoryProtectWrite)) { + return false; + } + uint8_t* p = memory_->TranslateVirtual(base_address_); // Read into memory. fread(p, file_length, 1, file); - fclose(file); // Setup debug info. From 1de598e4cea07c4c933df4a20a072eeb5275c2d2 Mon Sep 17 00:00:00 2001 From: DrChat Date: Wed, 14 Feb 2018 16:22:14 -0600 Subject: [PATCH 16/18] [JIT] Comment the offsets in PPCContext --- src/xenia/cpu/ppc/ppc_context.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/xenia/cpu/ppc/ppc_context.h b/src/xenia/cpu/ppc/ppc_context.h index 9c96daa6b..2732fdc41 100644 --- a/src/xenia/cpu/ppc/ppc_context.h +++ b/src/xenia/cpu/ppc/ppc_context.h @@ -249,22 +249,22 @@ enum class PPCRegister { typedef struct PPCContext_s { // Must be stored at 0x0 for now. // TODO(benvanik): find a nice way to describe this to the JIT. - ThreadState* thread_state; + ThreadState* thread_state; // 0x0 // TODO(benvanik): this is getting nasty. Must be here. - uint8_t* virtual_membase; + uint8_t* virtual_membase; // 0x8 // Most frequently used registers first. - uint64_t lr; // Link register - uint64_t ctr; // Count register - uint64_t r[32]; // General purpose registers - double f[32]; // Floating-point registers - vec128_t v[128]; // VMX128 vector registers + uint64_t lr; // 0x10 Link register + uint64_t ctr; // 0x18 Count register + uint64_t r[32]; // 0x20 General purpose registers + double f[32]; // 0x120 Floating-point registers + vec128_t v[128]; // 0x220 VMX128 vector registers // XER register: // Split to make it easier to do individual updates. - uint8_t xer_ca; - uint8_t xer_ov; - uint8_t xer_so; + uint8_t xer_ca; // 0xA20 + uint8_t xer_ov; // 0xA21 + uint8_t xer_so; // 0xA22 // Condition registers: // These are split to make it easier to do DCE on unused stores. @@ -279,7 +279,7 @@ typedef struct PPCContext_s { // successfully uint8_t cr0_so; // Summary Overflow (SO) - copy of XER[SO] }; - } cr0; + } cr0; // 0xA24 union { uint32_t value; struct { From e54c24e1505e597ed29a133f162686e13d7294c5 Mon Sep 17 00:00:00 2001 From: DrChat Date: Wed, 14 Feb 2018 16:26:49 -0600 Subject: [PATCH 17/18] [JIT] New opcodes: OPCODE_LOAD_OFFSET and OPCODE_STORE_OFFSET These take full advantage of x86 addressing, and eliminate extra add operations. 
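
For example, the lwz change below replaces the explicit effective-address
add

    Value* ea = CalculateEA_0_i(f, i.D.RA, XEEXTS16(i.D.DS));
    Value* rt = f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE);

with a load that carries the displacement itself (b is (RA), or zero when
RA == 0):

    Value* offset = f.LoadConstantInt64(XEEXTS16(i.D.DS));
    Value* rt = f.ZeroExtend(f.ByteSwap(f.LoadOffset(b, offset, INT32_TYPE)),
                             INT64_TYPE);

ComputeMemoryAddressOffset in the backend then folds base, index, and
displacement into one effective address (membase + guest base + constant
offset), so the whole load becomes a single, optionally byte-swapped, mov.
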
--- src/xenia/cpu/backend/x64/x64_sequences.cc | 172 ++++++++++++++++++ .../passes/constant_propagation_pass.cc | 14 ++ .../memory_sequence_combination_pass.cc | 16 +- src/xenia/cpu/hir/hir_builder.cc | 19 ++ src/xenia/cpu/hir/hir_builder.h | 5 + src/xenia/cpu/hir/opcodes.h | 2 + src/xenia/cpu/hir/opcodes.inl | 12 ++ src/xenia/cpu/ppc/ppc_emit_memory.cc | 147 +++++++++++---- 8 files changed, 352 insertions(+), 35 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 80da6a7eb..ac2a158c0 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -2127,6 +2127,176 @@ struct STORE_MMIO_I32 }; EMITTER_OPCODE_TABLE(OPCODE_STORE_MMIO, STORE_MMIO_I32); +// ============================================================================ +// OPCODE_LOAD_OFFSET +// ============================================================================ +template +RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest, + const T& offset) { + int32_t offset_const = static_cast(offset.constant()); + + if (guest.is_constant) { + uint32_t address = static_cast(guest.constant()); + address += static_cast(offset.constant()); + if (address < 0x80000000) { + return e.GetMembaseReg() + address; + } else { + e.mov(e.eax, address); + return e.GetMembaseReg() + e.rax; + } + } else { + // Clear the top 32 bits, as they are likely garbage. + // TODO(benvanik): find a way to avoid doing this. + e.mov(e.eax, guest.reg().cvt32()); + return e.GetMembaseReg() + e.rax + offset_const; + } +} + +struct LOAD_OFFSET_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + e.mov(i.dest, e.byte[addr]); + } +}; + +struct LOAD_OFFSET_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.word[addr]); + } else { + e.mov(i.dest, e.word[addr]); + e.ror(i.dest, 8); + } + } else { + e.mov(i.dest, e.word[addr]); + } + } +}; + +struct LOAD_OFFSET_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.dword[addr]); + } else { + e.mov(i.dest, e.dword[addr]); + e.bswap(i.dest); + } + } else { + e.mov(i.dest, e.dword[addr]); + } + } +}; + +struct LOAD_OFFSET_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.qword[addr]); + } else { + e.mov(i.dest, e.qword[addr]); + e.bswap(i.dest); + } + } else { + e.mov(i.dest, e.qword[addr]); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16, + LOAD_OFFSET_I32, LOAD_OFFSET_I64); + +// ============================================================================ +// OPCODE_STORE_OFFSET +// ============================================================================ +struct STORE_OFFSET_I8 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.src3.is_constant) { + 
e.mov(e.byte[addr], i.src3.constant()); + } else { + e.mov(e.byte[addr], i.src3); + } + } +}; + +struct STORE_OFFSET_I16 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src3.is_constant); + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(e.word[addr], i.src3); + } else { + assert_always("not implemented"); + } + } else { + if (i.src3.is_constant) { + e.mov(e.word[addr], i.src3.constant()); + } else { + e.mov(e.word[addr], i.src3); + } + } + } +}; + +struct STORE_OFFSET_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src3.is_constant); + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(e.dword[addr], i.src3); + } else { + assert_always("not implemented"); + } + } else { + if (i.src3.is_constant) { + e.mov(e.dword[addr], i.src3.constant()); + } else { + e.mov(e.dword[addr], i.src3); + } + } + } +}; + +struct STORE_OFFSET_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src3.is_constant); + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(e.qword[addr], i.src3); + } else { + assert_always("not implemented"); + } + } else { + if (i.src3.is_constant) { + e.MovMem64(addr, i.src3.constant()); + } else { + e.mov(e.qword[addr], i.src3); + } + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE_OFFSET, STORE_OFFSET_I8, STORE_OFFSET_I16, + STORE_OFFSET_I32, STORE_OFFSET_I64); + // ============================================================================ // OPCODE_LOAD // ============================================================================ @@ -7650,6 +7820,8 @@ void RegisterSequences() { Register_OPCODE_CONTEXT_BARRIER(); Register_OPCODE_LOAD_MMIO(); Register_OPCODE_STORE_MMIO(); + Register_OPCODE_LOAD_OFFSET(); + Register_OPCODE_STORE_OFFSET(); Register_OPCODE_LOAD(); Register_OPCODE_STORE(); Register_OPCODE_MEMSET(); diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index 187ab5470..bc59c7eab 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -195,10 +195,15 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { break; case OPCODE_LOAD: + case OPCODE_LOAD_OFFSET: if (i->src1.value->IsConstant()) { assert_false(i->flags & LOAD_STORE_BYTE_SWAP); auto memory = processor_->memory(); auto address = i->src1.value->constant.i32; + if (i->opcode->num == OPCODE_LOAD_OFFSET) { + address += i->src2.value->constant.i32; + } + auto mmio_range = processor_->memory()->LookupVirtualMappedRange(address); if (FLAGS_inline_mmio_access && mmio_range) { @@ -246,12 +251,21 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { } break; case OPCODE_STORE: + case OPCODE_STORE_OFFSET: if (FLAGS_inline_mmio_access && i->src1.value->IsConstant()) { auto address = i->src1.value->constant.i32; + if (i->opcode->num == OPCODE_STORE_OFFSET) { + address += i->src2.value->constant.i32; + } + auto mmio_range = processor_->memory()->LookupVirtualMappedRange(address); if (mmio_range) { auto value = i->src2.value; + if (i->opcode->num == 
OPCODE_STORE_OFFSET) { + value = i->src3.value; + } + i->Replace(&OPCODE_STORE_MMIO_info, 0); i->src1.offset = reinterpret_cast(mmio_range); i->src2.offset = address; diff --git a/src/xenia/cpu/compiler/passes/memory_sequence_combination_pass.cc b/src/xenia/cpu/compiler/passes/memory_sequence_combination_pass.cc index 051e3185f..33cebc4d1 100644 --- a/src/xenia/cpu/compiler/passes/memory_sequence_combination_pass.cc +++ b/src/xenia/cpu/compiler/passes/memory_sequence_combination_pass.cc @@ -35,9 +35,11 @@ bool MemorySequenceCombinationPass::Run(HIRBuilder* builder) { while (block) { auto i = block->instr_head; while (i) { - if (i->opcode == &OPCODE_LOAD_info) { + if (i->opcode == &OPCODE_LOAD_info || + i->opcode == &OPCODE_LOAD_OFFSET_info) { CombineLoadSequence(i); - } else if (i->opcode == &OPCODE_STORE_info) { + } else if (i->opcode == &OPCODE_STORE_info || + i->opcode == &OPCODE_STORE_OFFSET_info) { CombineStoreSequence(i); } i = i->next; @@ -112,6 +114,10 @@ void MemorySequenceCombinationPass::CombineStoreSequence(Instr* i) { // store_convert v0, v1.i64, [swap|i64->i32,trunc] auto src = i->src2.value; + if (i->opcode == &OPCODE_STORE_OFFSET_info) { + src = i->src3.value; + } + if (src->IsConstant()) { // Constant value write - ignore. return; @@ -135,7 +141,11 @@ void MemorySequenceCombinationPass::CombineStoreSequence(Instr* i) { // Pull the original value (from before the byte swap). // The byte swap itself will go away in DCE. - i->set_src2(def->src1.value); + if (i->opcode == &OPCODE_STORE_info) { + i->set_src2(def->src1.value); + } else if (i->opcode == &OPCODE_STORE_OFFSET_info) { + i->set_src3(def->src1.value); + } // TODO(benvanik): extend/truncate. } diff --git a/src/xenia/cpu/hir/hir_builder.cc b/src/xenia/cpu/hir/hir_builder.cc index a74e28377..af954ca6c 100644 --- a/src/xenia/cpu/hir/hir_builder.cc +++ b/src/xenia/cpu/hir/hir_builder.cc @@ -1232,6 +1232,25 @@ void HIRBuilder::StoreMmio(cpu::MMIORange* mmio_range, uint32_t address, i->set_src3(value); } +Value* HIRBuilder::LoadOffset(Value* address, Value* offset, TypeName type, + uint32_t load_flags) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_LOAD_OFFSET_info, load_flags, AllocValue(type)); + i->set_src1(address); + i->set_src2(offset); + i->src3.value = NULL; + return i->dest; +} + +void HIRBuilder::StoreOffset(Value* address, Value* offset, Value* value, + uint32_t store_flags) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_STORE_OFFSET_info, store_flags); + i->set_src1(address); + i->set_src2(offset); + i->set_src3(value); +} + Value* HIRBuilder::Load(Value* address, TypeName type, uint32_t load_flags) { ASSERT_ADDRESS_TYPE(address); Instr* i = AppendInstr(OPCODE_LOAD_info, load_flags, AllocValue(type)); diff --git a/src/xenia/cpu/hir/hir_builder.h b/src/xenia/cpu/hir/hir_builder.h index 41fbf7c1e..6f860249b 100644 --- a/src/xenia/cpu/hir/hir_builder.h +++ b/src/xenia/cpu/hir/hir_builder.h @@ -147,6 +147,11 @@ class HIRBuilder { Value* LoadMmio(cpu::MMIORange* mmio_range, uint32_t address, TypeName type); void StoreMmio(cpu::MMIORange* mmio_range, uint32_t address, Value* value); + Value* LoadOffset(Value* address, Value* offset, TypeName type, + uint32_t load_flags = 0); + void StoreOffset(Value* address, Value* offset, Value* value, + uint32_t store_flags = 0); + Value* Load(Value* address, TypeName type, uint32_t load_flags = 0); void Store(Value* address, Value* value, uint32_t store_flags = 0); void Memset(Value* address, Value* value, Value* length); diff --git 
a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index bfdb7fd15..ce232fd1d 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -152,6 +152,8 @@ enum Opcode { OPCODE_CONTEXT_BARRIER, OPCODE_LOAD_MMIO, OPCODE_STORE_MMIO, + OPCODE_LOAD_OFFSET, + OPCODE_STORE_OFFSET, OPCODE_LOAD, OPCODE_STORE, OPCODE_MEMSET, diff --git a/src/xenia/cpu/hir/opcodes.inl b/src/xenia/cpu/hir/opcodes.inl index 9930cfe8d..389570f50 100644 --- a/src/xenia/cpu/hir/opcodes.inl +++ b/src/xenia/cpu/hir/opcodes.inl @@ -231,6 +231,18 @@ DEFINE_OPCODE( OPCODE_SIG_X_O_O_V, OPCODE_FLAG_MEMORY) +DEFINE_OPCODE( + OPCODE_LOAD_OFFSET, + "load_offset", + OPCODE_SIG_V_V_V, + OPCODE_FLAG_MEMORY) + +DEFINE_OPCODE( + OPCODE_STORE_OFFSET, + "store_offset", + OPCODE_SIG_X_V_V_V, + OPCODE_FLAG_MEMORY) + DEFINE_OPCODE( OPCODE_LOAD, "load", diff --git a/src/xenia/cpu/ppc/ppc_emit_memory.cc b/src/xenia/cpu/ppc/ppc_emit_memory.cc index da93f403a..7cd4f19c1 100644 --- a/src/xenia/cpu/ppc/ppc_emit_memory.cc +++ b/src/xenia/cpu/ppc/ppc_emit_memory.cc @@ -63,8 +63,15 @@ int InstrEmit_lbz(PPCHIRBuilder& f, const InstrData& i) { // b <- (RA) // EA <- b + EXTS(D) // RT <- i56.0 || MEM(EA, 1) - Value* ea = CalculateEA_0_i(f, i.D.RA, XEEXTS16(i.D.DS)); - Value* rt = f.ZeroExtend(f.Load(ea, INT8_TYPE), INT64_TYPE); + Value* b; + if (i.D.RA == 0) { + b = f.LoadZeroInt64(); + } else { + b = f.LoadGPR(i.D.RA); + } + + Value* offset = f.LoadConstantInt64(XEEXTS16(i.D.DS)); + Value* rt = f.ZeroExtend(f.LoadOffset(b, offset, INT8_TYPE), INT64_TYPE); f.StoreGPR(i.D.RT, rt); return 0; } @@ -73,10 +80,11 @@ int InstrEmit_lbzu(PPCHIRBuilder& f, const InstrData& i) { // EA <- (RA) + EXTS(D) // RT <- i56.0 || MEM(EA, 1) // RA <- EA - Value* ea = CalculateEA_i(f, i.D.RA, XEEXTS16(i.D.DS)); - Value* rt = f.ZeroExtend(f.Load(ea, INT8_TYPE), INT64_TYPE); + Value* ra = f.LoadGPR(i.D.RA); + Value* offset = f.LoadConstantInt64(XEEXTS16(i.D.DS)); + Value* rt = f.ZeroExtend(f.LoadOffset(ra, offset, INT8_TYPE), INT64_TYPE); f.StoreGPR(i.D.RT, rt); - StoreEA(f, i.D.RA, ea); + StoreEA(f, i.D.RA, f.Add(ra, offset)); return 0; } @@ -111,8 +119,16 @@ int InstrEmit_lha(PPCHIRBuilder& f, const InstrData& i) { // b <- (RA) // EA <- b + EXTS(D) // RT <- EXTS(MEM(EA, 2)) - Value* ea = CalculateEA_0_i(f, i.D.RA, XEEXTS16(i.D.DS)); - Value* rt = f.SignExtend(f.ByteSwap(f.Load(ea, INT16_TYPE)), INT64_TYPE); + Value* b; + if (i.D.RA == 0) { + b = f.LoadZeroInt64(); + } else { + b = f.LoadGPR(i.D.RA); + } + + Value* offset = f.LoadConstantInt64(XEEXTS16(i.D.DS)); + Value* rt = + f.SignExtend(f.ByteSwap(f.LoadOffset(b, offset, INT16_TYPE)), INT64_TYPE); f.StoreGPR(i.D.RT, rt); return 0; } @@ -121,10 +137,12 @@ int InstrEmit_lhau(PPCHIRBuilder& f, const InstrData& i) { // EA <- (RA) + EXTS(D) // RT <- EXTS(MEM(EA, 2)) // RA <- EA - Value* ea = CalculateEA_i(f, i.D.RA, XEEXTS16(i.D.DS)); - Value* rt = f.SignExtend(f.ByteSwap(f.Load(ea, INT16_TYPE)), INT64_TYPE); + Value* ra = f.LoadGPR(i.D.RA); + Value* offset = f.LoadConstantInt64(XEEXTS16(i.D.DS)); + Value* rt = f.SignExtend(f.ByteSwap(f.LoadOffset(ra, offset, INT16_TYPE)), + INT64_TYPE); f.StoreGPR(i.D.RT, rt); - StoreEA(f, i.D.RA, ea); + StoreEA(f, i.D.RA, f.Add(ra, offset)); return 0; } @@ -159,8 +177,16 @@ int InstrEmit_lhz(PPCHIRBuilder& f, const InstrData& i) { // b <- (RA) // EA <- b + EXTS(D) // RT <- i48.0 || MEM(EA, 2) - Value* ea = CalculateEA_0_i(f, i.D.RA, XEEXTS16(i.D.DS)); - Value* rt = f.ZeroExtend(f.ByteSwap(f.Load(ea, INT16_TYPE)), INT64_TYPE); + Value* b; + if (i.D.RA == 0) { + b = 
f.LoadZeroInt64(); + } else { + b = f.LoadGPR(i.D.RA); + } + + Value* offset = f.LoadConstantInt64(XEEXTS16(i.D.DS)); + Value* rt = + f.ZeroExtend(f.ByteSwap(f.LoadOffset(b, offset, INT16_TYPE)), INT64_TYPE); f.StoreGPR(i.D.RT, rt); return 0; } @@ -169,10 +195,12 @@ int InstrEmit_lhzu(PPCHIRBuilder& f, const InstrData& i) { // EA <- (RA) + EXTS(D) // RT <- i48.0 || MEM(EA, 2) // RA <- EA - Value* ea = CalculateEA_i(f, i.D.RA, XEEXTS16(i.D.DS)); - Value* rt = f.ZeroExtend(f.ByteSwap(f.Load(ea, INT16_TYPE)), INT64_TYPE); + Value* ra = f.LoadGPR(i.D.RA); + Value* offset = f.LoadConstantInt64(XEEXTS16(i.D.DS)); + Value* rt = f.ZeroExtend(f.ByteSwap(f.LoadOffset(ra, offset, INT16_TYPE)), + INT64_TYPE); f.StoreGPR(i.D.RT, rt); - StoreEA(f, i.D.RA, ea); + StoreEA(f, i.D.RA, f.Add(ra, offset)); return 0; } @@ -207,8 +235,16 @@ int InstrEmit_lwa(PPCHIRBuilder& f, const InstrData& i) { // b <- (RA) // EA <- b + EXTS(D || 00) // RT <- EXTS(MEM(EA, 4)) - Value* ea = CalculateEA_0_i(f, i.DS.RA, XEEXTS16(i.DS.DS << 2)); - Value* rt = f.SignExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE); + Value* b; + if (i.DS.RA == 0) { + b = f.LoadZeroInt64(); + } else { + b = f.LoadGPR(i.DS.RA); + } + + Value* offset = f.LoadConstantInt64(XEEXTS16(i.DS.DS << 2)); + Value* rt = + f.SignExtend(f.ByteSwap(f.LoadOffset(b, offset, INT32_TYPE)), INT64_TYPE); f.StoreGPR(i.DS.RT, rt); return 0; } @@ -244,8 +280,16 @@ int InstrEmit_lwz(PPCHIRBuilder& f, const InstrData& i) { // b <- (RA) // EA <- b + EXTS(D) // RT <- i32.0 || MEM(EA, 4) - Value* ea = CalculateEA_0_i(f, i.D.RA, XEEXTS16(i.D.DS)); - Value* rt = f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE); + Value* b; + if (i.D.RA == 0) { + b = f.LoadZeroInt64(); + } else { + b = f.LoadGPR(i.D.RA); + } + + Value* offset = f.LoadConstantInt64(XEEXTS16(i.D.DS)); + Value* rt = + f.ZeroExtend(f.ByteSwap(f.LoadOffset(b, offset, INT32_TYPE)), INT64_TYPE); f.StoreGPR(i.D.RT, rt); return 0; } @@ -254,10 +298,12 @@ int InstrEmit_lwzu(PPCHIRBuilder& f, const InstrData& i) { // EA <- (RA) + EXTS(D) // RT <- i32.0 || MEM(EA, 4) // RA <- EA - Value* ea = CalculateEA_i(f, i.D.RA, XEEXTS16(i.D.DS)); - Value* rt = f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE); + Value* ra = f.LoadGPR(i.D.RA); + Value* offset = f.LoadConstantInt64(XEEXTS16(i.D.DS)); + Value* rt = f.ZeroExtend(f.ByteSwap(f.LoadOffset(ra, offset, INT32_TYPE)), + INT64_TYPE); f.StoreGPR(i.D.RT, rt); - StoreEA(f, i.D.RA, ea); + StoreEA(f, i.D.RA, f.Add(ra, offset)); return 0; } @@ -292,8 +338,15 @@ int InstrEmit_ld(PPCHIRBuilder& f, const InstrData& i) { // b <- (RA) // EA <- b + EXTS(DS || 0b00) // RT <- MEM(EA, 8) - Value* ea = CalculateEA_0_i(f, i.DS.RA, XEEXTS16(i.DS.DS << 2)); - Value* rt = f.ByteSwap(f.Load(ea, INT64_TYPE)); + Value* b; + if (i.DS.RA == 0) { + b = f.LoadZeroInt64(); + } else { + b = f.LoadGPR(i.DS.RA); + } + + Value* offset = f.LoadConstantInt64(XEEXTS16(i.DS.DS << 2)); + Value* rt = f.ByteSwap(f.LoadOffset(b, offset, INT64_TYPE)); f.StoreGPR(i.DS.RT, rt); return 0; } @@ -342,8 +395,15 @@ int InstrEmit_stb(PPCHIRBuilder& f, const InstrData& i) { // b <- (RA) // EA <- b + EXTS(D) // MEM(EA, 1) <- (RS)[56:63] - Value* ea = CalculateEA_0_i(f, i.D.RA, XEEXTS16(i.D.DS)); - f.Store(ea, f.Truncate(f.LoadGPR(i.D.RT), INT8_TYPE)); + Value* b; + if (i.D.RA == 0) { + b = f.LoadZeroInt64(); + } else { + b = f.LoadGPR(i.D.RA); + } + + Value* offset = f.LoadConstantInt64(XEEXTS16(i.D.DS)); + f.StoreOffset(b, offset, f.Truncate(f.LoadGPR(i.D.RT), INT8_TYPE)); return 0; } @@ -386,8 
+446,16 @@ int InstrEmit_sth(PPCHIRBuilder& f, const InstrData& i) { // b <- (RA) // EA <- b + EXTS(D) // MEM(EA, 2) <- (RS)[48:63] - Value* ea = CalculateEA_0_i(f, i.D.RA, XEEXTS16(i.D.DS)); - f.Store(ea, f.ByteSwap(f.Truncate(f.LoadGPR(i.D.RT), INT16_TYPE))); + Value* b; + if (i.D.RA == 0) { + b = f.LoadZeroInt64(); + } else { + b = f.LoadGPR(i.D.RA); + } + + Value* offset = f.LoadConstantInt64(XEEXTS16(i.D.DS)); + f.StoreOffset(b, offset, + f.ByteSwap(f.Truncate(f.LoadGPR(i.D.RT), INT16_TYPE))); return 0; } @@ -430,8 +498,16 @@ int InstrEmit_stw(PPCHIRBuilder& f, const InstrData& i) { // b <- (RA) // EA <- b + EXTS(D) // MEM(EA, 4) <- (RS)[32:63] - Value* ea = CalculateEA_0_i(f, i.D.RA, XEEXTS16(i.D.DS)); - f.Store(ea, f.ByteSwap(f.Truncate(f.LoadGPR(i.D.RT), INT32_TYPE))); + Value* b; + if (i.D.RA == 0) { + b = f.LoadZeroInt64(); + } else { + b = f.LoadGPR(i.D.RA); + } + + Value* offset = f.LoadConstantInt64(XEEXTS16(i.D.DS)); + f.StoreOffset(b, offset, + f.ByteSwap(f.Truncate(f.LoadGPR(i.D.RT), INT32_TYPE))); return 0; } @@ -474,8 +550,15 @@ int InstrEmit_std(PPCHIRBuilder& f, const InstrData& i) { // b <- (RA) // EA <- b + EXTS(DS || 0b00) // MEM(EA, 8) <- (RS) - Value* ea = CalculateEA_0_i(f, i.DS.RA, XEEXTS16(i.DS.DS << 2)); - f.Store(ea, f.ByteSwap(f.LoadGPR(i.DS.RT))); + Value* b; + if (i.DS.RA == 0) { + b = f.LoadZeroInt64(); + } else { + b = f.LoadGPR(i.DS.RA); + } + + Value* offset = f.LoadConstantInt64(XEEXTS16(i.DS.DS << 2)); + f.StoreOffset(b, offset, f.ByteSwap(f.LoadGPR(i.DS.RT))); return 0; } From b17d6f5088976b5aed12e45d44bf5c73d91bc41d Mon Sep 17 00:00:00 2001 From: DrChat Date: Wed, 14 Feb 2018 20:28:34 -0600 Subject: [PATCH 18/18] [Base] Enable aligned copy and swap routines --- src/xenia/base/memory.cc | 88 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 80 insertions(+), 8 deletions(-) diff --git a/src/xenia/base/memory.cc b/src/xenia/base/memory.cc index 1179ecc1f..3ddef8113 100644 --- a/src/xenia/base/memory.cc +++ b/src/xenia/base/memory.cc @@ -24,8 +24,20 @@ void copy_128_aligned(void* dest, const void* src, size_t count) { } #if XE_ARCH_AMD64 -void copy_and_swap_16_aligned(void* dest, const void* src, size_t count) { - return copy_and_swap_16_unaligned(dest, src, count); +void copy_and_swap_16_aligned(void* dest_ptr, const void* src_ptr, + size_t count) { + auto dest = reinterpret_cast(dest_ptr); + auto src = reinterpret_cast(src_ptr); + size_t i; + for (i = 0; i + 8 <= count; i += 8) { + __m128i input = _mm_load_si128(reinterpret_cast(&src[i])); + __m128i output = + _mm_or_si128(_mm_slli_epi16(input, 8), _mm_srli_epi16(input, 8)); + _mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output); + } + for (; i < count; ++i) { // handle residual elements + dest[i] = byte_swap(src[i]); + } } void copy_and_swap_16_unaligned(void* dest_ptr, const void* src_ptr, @@ -44,8 +56,31 @@ void copy_and_swap_16_unaligned(void* dest_ptr, const void* src_ptr, } } -void copy_and_swap_32_aligned(void* dest, const void* src, size_t count) { - return copy_and_swap_32_unaligned(dest, src, count); +void copy_and_swap_32_aligned(void* dest_ptr, const void* src_ptr, + size_t count) { + auto dest = reinterpret_cast(dest_ptr); + auto src = reinterpret_cast(src_ptr); + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + size_t i; + for (i = 0; i + 4 <= count; i += 4) { + __m128i input = _mm_load_si128(reinterpret_cast(&src[i])); + // Do the four shifts. 
+ __m128i byte1 = _mm_slli_epi32(input, 24); + __m128i byte2 = _mm_slli_epi32(input, 8); + __m128i byte3 = _mm_srli_epi32(input, 8); + __m128i byte4 = _mm_srli_epi32(input, 24); + // OR bytes together. + __m128i output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + _mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output); + } + for (; i < count; ++i) { // handle residual elements + dest[i] = byte_swap(src[i]); + } } void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr, @@ -75,8 +110,33 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr, } } -void copy_and_swap_64_aligned(void* dest, const void* src, size_t count) { - return copy_and_swap_64_unaligned(dest, src, count); +void copy_and_swap_64_aligned(void* dest_ptr, const void* src_ptr, + size_t count) { + auto dest = reinterpret_cast(dest_ptr); + auto src = reinterpret_cast(src_ptr); + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + size_t i; + for (i = 0; i + 2 <= count; i += 2) { + __m128i input = _mm_load_si128(reinterpret_cast(&src[i])); + // Do the four shifts. + __m128i byte1 = _mm_slli_epi32(input, 24); + __m128i byte2 = _mm_slli_epi32(input, 8); + __m128i byte3 = _mm_srli_epi32(input, 8); + __m128i byte4 = _mm_srli_epi32(input, 24); + // OR bytes together. + __m128i output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + // Reorder the two words. + output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); + _mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output); + } + for (; i < count; ++i) { // handle residual elements + dest[i] = byte_swap(src[i]); + } } void copy_and_swap_64_unaligned(void* dest_ptr, const void* src_ptr, @@ -108,8 +168,20 @@ void copy_and_swap_64_unaligned(void* dest_ptr, const void* src_ptr, } } -void copy_and_swap_16_in_32_aligned(void* dest, const void* src, size_t count) { - return copy_and_swap_16_in_32_unaligned(dest, src, count); +void copy_and_swap_16_in_32_aligned(void* dest_ptr, const void* src_ptr, + size_t count) { + auto dest = reinterpret_cast(dest_ptr); + auto src = reinterpret_cast(src_ptr); + size_t i; + for (i = 0; i + 4 <= count; i += 4) { + __m128i input = _mm_load_si128(reinterpret_cast(&src[i])); + __m128i output = + _mm_or_si128(_mm_slli_epi32(input, 16), _mm_srli_epi32(input, 16)); + _mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output); + } + for (; i < count; ++i) { // handle residual elements + dest[i] = (src[i] >> 16) | (src[i] << 16); + } } void copy_and_swap_16_in_32_unaligned(void* dest_ptr, const void* src_ptr,