Alternative mutex implementation on Windows works, but I really can't tell if it helps much. Use a larger initial size in deferred_command_list to cut down on resizes in big scenes on m:durandal

2022-08-14 10:26:50 -07:00 · 2022-08-14 10:26:50 -07:00 · c9b2d10e17
parent a037bdb2e8
commit c9b2d10e17
5 changed files with 58 additions and 7 deletions
--- a/src/xenia/base/memory.h
+++ b/src/xenia/base/memory.h
@ -466,6 +466,48 @@ constexpr inline fourcc_t make_fourcc(const std::string_view fourcc) {
  }
  return make_fourcc(fourcc[0], fourcc[1], fourcc[2], fourcc[3]);
 }
+// chrispy::todo: use for command stream vector, resize happens a ton and has
+// to call memset.
+//
+// Fixed-capacity byte buffer backed by a single AllocFixed request of `sz`
+// bytes (kReserveCommit, kReadWrite) made at construction and released at
+// destruction. resize()/clear() only move the logical size: the contents are
+// not zeroed here and the allocation never grows or moves.
+//
+// NOTE(review): the destructor tolerates a null data_, so presumably
+// AllocFixed can fail; callers should check data() before use - confirm.
+template <size_t sz>
+class fixed_vmem_vector {
+  static_assert((sz & 65535) == 0,
+                "Always give fixed_vmem_vector a size divisible by 65536 to "
+                "avoid wasting memory on windows");
+
+  uint8_t* data_;   // base of the sz-byte allocation
+  size_t nbytes_;   // logical size in bytes, at most sz
+
+ public:
+  fixed_vmem_vector()
+      : data_(reinterpret_cast<uint8_t*>(
+            AllocFixed(nullptr, sz, AllocationType::kReserveCommit,
+                       PageAccess::kReadWrite))),
+        nbytes_(0) {}
+  ~fixed_vmem_vector() {
+    if (data_) {
+      DeallocFixed(data_, sz, DeallocationType::kRelease);
+      data_ = nullptr;
+    }
+    nbytes_ = 0;
+  }
+
+  // The destructor releases data_, so a copy would release the same
+  // allocation twice. Forbid copying (which also suppresses implicit moves).
+  fixed_vmem_vector(const fixed_vmem_vector&) = delete;
+  fixed_vmem_vector& operator=(const fixed_vmem_vector&) = delete;
+
+  // Base address of the buffer.
+  uint8_t* data() const { return data_; }
+  // Current logical size in bytes.
+  size_t size() const { return nbytes_; }
+
+  // Sets the logical size. No memory is touched; newsize must fit in the
+  // fixed capacity.
+  void resize(size_t newsize) {
+    // Validate before committing the new size. A size equal to the capacity
+    // is valid because the whole sz-byte allocation is usable.
+    xenia_assert(newsize <= sz);
+    nbytes_ = newsize;
+  }
+  // Fixed capacity in bytes.
+  size_t alloc() const { return sz; }
+
+  void clear() {
+    resize(0);  // todo:maybe zero out
+  }
+  // Capacity is fixed, so this only checks that the request fits.
+  void reserve(size_t size) { xenia_assert(size <= sz); }
+};
+
+
+

 }  // namespace xe

--- a/src/xenia/base/mutex.h
+++ b/src/xenia/base/mutex.h
@ -12,7 +12,7 @@
 #include <mutex>
 #include "platform.h"

-//#define		XE_ENABLE_FAST_WIN32_MUTEX 1
+#define		XE_ENABLE_FAST_WIN32_MUTEX 1
 namespace xe {

 #if XE_PLATFORM_WIN32 == 1 && XE_ENABLE_FAST_WIN32_MUTEX==1
--- a/src/xenia/gpu/command_processor.cc
+++ b/src/xenia/gpu/command_processor.cc
@ -493,13 +493,18 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
  // very unlikely. these ORS here are meant to be bitwise ors, so that we do
  // not do branching evaluation of the conditions (we will almost always take
  // all of the branches)
-  if (XE_UNLIKELY(
-          (index - XE_GPU_REG_SCRATCH_REG0 < 8) |
-          (index == XE_GPU_REG_COHER_STATUS_HOST) |
-          ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
-           (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)))) {
+
+  unsigned expr = (index - XE_GPU_REG_SCRATCH_REG0 < 8) |
+                  (index == XE_GPU_REG_COHER_STATUS_HOST) |
+                  ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
+                   (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
+  //chrispy: reordered for msvc branch probability (assumes if is taken and else is not)
+  if (XE_LIKELY(expr == 0)) {
+  
+  } else {
    HandleSpecialRegisterWrite(index, value);
  }
+
 }

 void CommandProcessor::MakeCoherent() {
--- a/src/xenia/gpu/command_processor.h
+++ b/src/xenia/gpu/command_processor.h
@ -153,6 +153,7 @@ class CommandProcessor {
  // rarely needed, most register writes have no special logic here
  XE_NOINLINE
  void HandleSpecialRegisterWrite(uint32_t index, uint32_t value);
+  XE_FORCEINLINE
  virtual void WriteRegister(uint32_t index, uint32_t value);

  const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table() const {
--- a/src/xenia/gpu/d3d12/deferred_command_list.h
+++ b/src/xenia/gpu/d3d12/deferred_command_list.h
@ -30,8 +30,11 @@ class D3D12CommandProcessor;

 class DeferredCommandList {
 public:
+  /*
+	chrispy: upped from 1_MiB to 4_MiB, m:durandal hits frequent resizes in large open maps
+  */
  DeferredCommandList(const D3D12CommandProcessor& command_processor,
-                      size_t initial_size_bytes = 1_MiB);
+                      size_t initial_size_bytes = 4_MiB);

  void Reset();
  void Execute(ID3D12GraphicsCommandList* command_list,