[D3D12] EDRAM layout of a rectangle and other resolve stuff

2018-08-22 17:33:43 +03:00 · 2018-08-22 17:33:43 +03:00 · c4f80aac0d
parent ddc8f17fa5
commit c4f80aac0d
7 changed files with 154 additions and 70 deletions
--- a/src/xenia/gpu/d3d12/render_target_cache.cc
+++ b/src/xenia/gpu/d3d12/render_target_cache.cc
@ -20,6 +20,7 @@
 #include "xenia/base/profiling.h"
 #include "xenia/gpu/d3d12/d3d12_command_processor.h"
 #include "xenia/gpu/texture_info.h"
+#include "xenia/gpu/texture_util.h"

 namespace xe {
 namespace gpu {
@ -744,11 +745,7 @@ bool RenderTargetCache::Resolve(SharedMemory* shared_memory, Memory* memory) {

  // Get the render target properties.
  uint32_t rb_surface_info = regs[XE_GPU_REG_RB_SURFACE_INFO].u32;
-  uint32_t surface_pitch = std::min(rb_surface_info & 0x3FFF, 2560u);
-  if (surface_pitch == 0) {
-    // Nothing to copy or clear.
-    return true;
-  }
+  uint32_t surface_pitch = rb_surface_info & 0x3FFF;
  MsaaSamples msaa_samples = MsaaSamples((rb_surface_info >> 16) & 0x3);
  uint32_t rb_copy_control = regs[XE_GPU_REG_RB_COPY_CONTROL].u32;
  uint32_t surface_index = rb_copy_control & 0x7;
@ -759,12 +756,10 @@ bool RenderTargetCache::Resolve(SharedMemory* shared_memory, Memory* memory) {
  bool surface_is_depth = surface_index == 4;
  uint32_t surface_edram_base;
  uint32_t surface_format;
-  bool surface_format_64bpp;
  if (surface_is_depth) {
    uint32_t rb_depth_info = regs[XE_GPU_REG_RB_DEPTH_INFO].u32;
    surface_edram_base = rb_depth_info & 0xFFF;
    surface_format = (rb_depth_info >> 16) & 0x1;
-    surface_format_64bpp = false;
  } else {
    uint32_t rb_color_info;
    switch (surface_index) {
@ -783,26 +778,7 @@ bool RenderTargetCache::Resolve(SharedMemory* shared_memory, Memory* memory) {
    }
    surface_edram_base = rb_color_info & 0xFFF;
    surface_format = (rb_color_info >> 16) & 0xF;
-    surface_format_64bpp =
-        IsColorFormat64bpp(ColorRenderTargetFormat(surface_format));
  }
-  if (surface_edram_base >= 2048) {
-    // The surface is totally outside of EDRAM - shouldn't happen.
-    return false;
-  }
-  // Calculate the maximum number of rows to clamp the source rectangle.
-  uint32_t surface_pitch_ss =
-      surface_pitch * (msaa_samples >= MsaaSamples::k4X ? 2 : 1);
-  uint32_t surface_pitch_tiles =
-      (surface_pitch_ss + 79) / 80 * (surface_format_64bpp ? 2 : 1);
-  uint32_t surface_edram_max_rows =
-      (2048 - surface_edram_base) / surface_pitch_tiles;
-  if (surface_edram_max_rows == 0) {
-    // The surface is too close to the end of EDRAM.
-    return true;
-  }
-  uint32_t surface_max_height =
-      surface_edram_max_rows * (msaa_samples >= MsaaSamples::k2X ? 8 : 16);

  // Get the resolve region since both copying and clearing need it.
  // HACK: Vertices to use are always in vf0.
@ -849,15 +825,6 @@ bool RenderTargetCache::Resolve(SharedMemory* shared_memory, Memory* memory) {
    src_rect.top += window_offset_y;
    src_rect.bottom += window_offset_y;
  }
-  src_rect.right = std::min(src_rect.right, LONG(surface_pitch));
-  src_rect.bottom = std::min(src_rect.bottom, LONG(surface_max_height));
-  if (src_rect.right <= 0 || src_rect.bottom <= 0 ||
-      src_rect.right <= src_rect.left || src_rect.bottom <= src_rect.top) {
-    // Totally off screen or empty - nothing to copy.
-    return true;
-  }
-  src_rect.left = std::max(src_rect.left, LONG(0));
-  src_rect.top = std::max(src_rect.top, LONG(0));

  XELOGGPU(
      "Resolving (%d,%d)->(%d,%d) of RT %u (pitch %u, %u sample%s, format "
@ -903,22 +870,24 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
    // Nothing to copy.
    return true;
  }
-  uint32_t src_x = uint32_t(src_rect.left);
-  uint32_t src_y = uint32_t(src_rect.top);
-  uint32_t src_width =
-      std::min(uint32_t(src_rect.right - src_rect.left), dest_pitch);
-  uint32_t src_height =
-      std::min(uint32_t(src_rect.bottom - src_rect.top), dest_height);
+  D3D12_RECT copy_rect = src_rect;
+  copy_rect.right =
+      std::min(copy_rect.right, LONG(copy_rect.left + dest_pitch));
+  copy_rect.bottom =
+      std::min(copy_rect.bottom, LONG(copy_rect.top + dest_height));

  // Get format info.
  uint32_t dest_info = regs[XE_GPU_REG_RB_COPY_DEST_INFO].u32;
  TextureFormat src_texture_format;
+  bool src_64bpp;
  if (is_depth) {
    src_texture_format =
        DepthRenderTargetToTextureFormat(DepthRenderTargetFormat(src_format));
+    src_64bpp = false;
  } else {
    src_texture_format =
        ColorRenderTargetToTextureFormat(ColorRenderTargetFormat(src_format));
+    src_64bpp = IsColorFormat64bpp(ColorRenderTargetFormat(src_format));
  }
  assert_true(src_texture_format != TextureFormat::kUnknown);
  src_texture_format = GetBaseFormat(src_texture_format);
@ -928,6 +897,17 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
      is_depth ? src_texture_format
               : GetBaseFormat(TextureFormat((dest_info >> 7) & 0x3F));

+  // See what samples we need and what we should do with them.
+  xenos::CopySampleSelect sample_select =
+      xenos::CopySampleSelect((rb_copy_control >> 4) & 0x7);
+  if (is_depth && sample_select > xenos::CopySampleSelect::k3) {
+    assert_always();
+    return false;
+  }
+  int32_t dest_exp_bias =
+      !is_depth ? (int32_t((dest_info >> 16) << 26) >> 26) : 0;
+  uint32_t dest_swap = (dest_info >> 24) & 0x1;
+
  // Get the destination location.
  uint32_t dest_address = regs[XE_GPU_REG_RB_COPY_DEST_BASE].u32 & 0x1FFFFFFF;
  if (dest_address & 0x3) {
@ -936,44 +916,59 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
    // resolve to 8bpp or 16bpp textures at very odd locations.
    return false;
  }
-  int32_t dest_exp_bias = int32_t((dest_info >> 16) << 26) >> 26;
-  uint32_t dest_swap = (dest_info >> 24) & 0x1;
-  // TODO(Triang3l): Copy to array slices.
+  uint32_t dest_size = texture_util::GetGuestMipStorageSize(
+      xe::align(dest_pitch, 32u), xe::align(dest_height, 32u), 1, true,
+      dest_format, nullptr);
+  if (dest_info & (1 << 3)) {
+    // Copying to an array slice.
+    dest_address += dest_size * ((dest_info >> 4) & 0x7);
+  }
  // TODO(Triang3l): Investigate what copy_dest_number is.
+
  XELOGGPU(
      "Copying samples %u to 0x%.8X (%ux%u), destination format %s, "
      "exponent bias %d, red and blue %sswapped",
-      (rb_copy_control >> 4) & 0x7, dest_address, dest_pitch, dest_height,
+      uint32_t(sample_select), dest_address, dest_pitch, dest_height,
      FormatInfo::Get(dest_format)->name, dest_exp_bias,
      dest_swap ? "" : "not ");

-  // There are 3 paths for resolving in this function - they don't necessarily
+  // Validate and clamp the source region, skip parts that don't need to be
+  // copied and calculate the number of threads needed for copying/loading.
+  uint32_t surface_pitch_tiles, row_tiles, rows;
+  if (!GetEDRAMLayout(surface_pitch, msaa_samples, src_64bpp, edram_base,
+                      copy_rect, surface_pitch_tiles, row_tiles, rows)) {
+    // Nothing to copy.
+    return true;
+  }
+  XELOGGPU("Pitch is %u tiles, %u rows of %u tiles", surface_pitch_tiles, rows,
+           row_tiles);
+
+  // There are 2 paths for resolving in this function - they don't necessarily
  // have to map directly to kRaw and kConvert CopyCommands.
-  // - Depth - tiling raw D24S8 or D24FS8 directly from the EDRAM buffer to the
-  //   shared memory. Only 1 sample is resolved from a depth buffer, and it
-  //   looks like format conversion can't be done when resolving depth buffers
-  //   since k_8_8_8_8 is specified as the destination format, while the texture
-  //   is being used as k_24_8 or k_24_8_FLOAT.
-  // - Raw color - when the source is single-sampled and has the same format as
-  //   the destination, and there's no need to apply exponent bias. A regular
-  //   EDRAM load is done to a buffer, and the buffer is then tiled to the
-  //   shared memory. Because swapping red and blue is very common, this path
-  //   supports swapping.
+  // - Raw - when extracting a single color to a texture of the same format as
+  //   the EDRAM surface and exponent bias is not applied, or when resolving a
+  //   depth buffer (games read only one sample of it - resolving multiple
+  //   samples of a depth buffer is meaningless anyway - and apparently there's
+  //   no format conversion as well because k_8_8_8_8 is specified in the
+  //   destination format in the register, which is obviously not true, and the
+  //   texture is then read as k_24_8 or k_24_8_FLOAT). Swapping red and blue is
+  //   possible in this mode.
  // - Conversion - when a simple copy is not enough. The EDRAM region is loaded
  //   to a render target resource, which is then used as a texture in a shader
  //   performing the resolve (by sampling the texture on or between pixels with
  //   bilinear filtering), applying exponent bias and swapping red and blue in
  //   a format-agnostic way, then the resulting color is written to a temporary
  //   RTV of the destination format.
-  if (is_depth) {
-    // Depth.
-    // TODO(Triang3l): Resolve depth.
-    return false;
-  } else if (src_texture_format == dest_format &&
-             msaa_samples == MsaaSamples::k1X && dest_exp_bias == 0) {
-    XELOGGPU("Resolving a single-sampled surface without conversion");
+  if (sample_select <= xenos::CopySampleSelect::k3 &&
+      src_texture_format == dest_format && dest_exp_bias == 0) {
+    XELOGGPU("Resolving a single sample without conversion");
+    // Make sure we have the memory to write to.
+    if (!shared_memory->MakeTilesResident(dest_address, dest_size)) {
+      return false;
+    }
    // TODO(Triang3l): Raw resolve.
-    return false;
+    // Make the texture cache refresh the data.
+    shared_memory->RangeWrittenByGPU(dest_address, dest_size);
  } else {
    XELOGGPU("Resolving with a pixel shader");
    // TODO(Triang3l): Conversion.
@ -1188,6 +1183,65 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget(
  return render_target;
 }

+bool RenderTargetCache::GetEDRAMLayout(
+    uint32_t pitch_pixels, MsaaSamples msaa_samples, bool is_64bpp,
+    uint32_t& base_in_out, D3D12_RECT& rect_in_out, uint32_t& pitch_tiles_out,
+    uint32_t& row_tiles_out, uint32_t& rows_out) {
+  if (pitch_pixels == 0 || rect_in_out.right <= 0 || rect_in_out.bottom <= 0 ||
+      rect_in_out.top >= rect_in_out.bottom) {
+    return false;
+  }
+  pitch_pixels = std::min(pitch_pixels, 2560u);
+  D3D12_RECT rect = rect_in_out;
+  rect.left = std::max(rect.left, LONG(0));
+  rect.top = std::max(rect.top, LONG(0));
+  rect.right = std::min(rect.right, LONG(pitch_pixels));
+  if (rect.left >= rect.right) {
+    return false;
+  }
+
+  uint32_t samples_x_log2 = msaa_samples >= MsaaSamples::k4X ? 1 : 0;
+  uint32_t samples_y_log2 = msaa_samples >= MsaaSamples::k2X ? 1 : 0;
+  uint32_t sample_size_log2 = is_64bpp ? 1 : 0;
+
+  uint32_t pitch_tiles = (((pitch_pixels << samples_x_log2) + 79) / 80)
+                         << sample_size_log2;
+
+  // Adjust the base and the rectangle to skip tiles to the left of the left
+  // bound of the rectangle and to the top of the top bound.
+  uint32_t base = base_in_out;
+  uint32_t skip = rect.top << samples_y_log2 >> 4;
+  base += skip * pitch_tiles;
+  skip <<= 4 - samples_y_log2;
+  rect.top -= skip;
+  rect.bottom -= skip;
+  skip = (rect.left << samples_x_log2) / 80;
+  base += skip << sample_size_log2;
+  skip *= 80 >> samples_x_log2;
+  rect.left -= skip;
+  rect.right -= skip;
+
+  // Calculate the number of 16-sample rows this rectangle spans.
+  uint32_t rows = ((rect.bottom << samples_y_log2) + 15) >> 4;
+  uint32_t rows_max = (2048 - base) / pitch_tiles;
+  if (rows_max == 0) {
+    return false;
+  }
+  if (rows > rows_max) {
+    // Clamp the rectangle if it's partially outside of EDRAM.
+    rows = rows_max;
+    rect.bottom = rows_max << (4 - samples_y_log2);
+  }
+
+  base_in_out = base;
+  rect_in_out = rect;
+  pitch_tiles_out = pitch_tiles;
+  row_tiles_out = (((rect.right << samples_x_log2) + 79) / 80)
+                  << sample_size_log2;
+  rows_out = rows;
+  return true;
+}
+
 RenderTargetCache::EDRAMLoadStoreMode RenderTargetCache::GetLoadStoreMode(
    bool is_depth, uint32_t format) {
  if (is_depth) {
--- a/src/xenia/gpu/d3d12/render_target_cache.h
+++ b/src/xenia/gpu/d3d12/render_target_cache.h
@ -324,7 +324,24 @@ class RenderTargetCache {
  RenderTarget* FindOrCreateRenderTarget(RenderTargetKey key,
                                         uint32_t heap_page_first);

+  // Calculates the tile layout for a rectangle on a render target of the given
+  // configuration. The base is adjusted so it points to the tile containing the
+  // top-left pixel of the rectangle, the rectangle is also adjusted so it's
+  // relative to that tile (because its coordinates don't have to be multiples
+  // of the tile size) and so it's not larger than the pitch and the available
+  // memory space. EDRAM row pitch in tiles (for memory access) and actual width
+  // and height of the region containing the rectangle in tiles (for thread
+  // group count) are also written. This function returns true if the requested
+  // rectangle is within the bounds of EDRAM and is not empty, but if it returns
+  // false, the output values may not be written, so the return value must be
+  // checked.
+  static bool GetEDRAMLayout(uint32_t pitch_pixels, MsaaSamples msaa_samples,
+                             bool is_64bpp, uint32_t& base_in_out,
+                             D3D12_RECT& rect_in_out, uint32_t& pitch_tiles_out,
+                             uint32_t& row_tiles_out, uint32_t& rows_out);
+
  static EDRAMLoadStoreMode GetLoadStoreMode(bool is_depth, uint32_t format);
+
  // Must be in a frame to call. Stores the dirty areas of the currently bound
  // render targets and marks them as clean.
  void StoreRenderTargetsToEDRAM();
--- a/src/xenia/gpu/d3d12/texture_cache.cc
+++ b/src/xenia/gpu/d3d12/texture_cache.cc
@ -661,7 +661,7 @@ TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) {
                                    height_blocks, depth_blocks);
    texture->base_slice_size = texture_util::GetGuestMipStorageSize(
        width_blocks, height_blocks, depth_blocks, key.tiled, key.format,
-        texture->mip_pitches[0]);
+        &texture->mip_pitches[0]);
    texture->base_in_sync = false;
  } else {
    texture->base_slice_size = 0;
@ -684,7 +684,7 @@ TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) {
      texture->mip_offsets[i] = texture->mip_slice_size;
      texture->mip_slice_size += texture_util::GetGuestMipStorageSize(
          width_blocks, height_blocks, depth_blocks, key.tiled, key.format,
-          texture->mip_pitches[i]);
+          &texture->mip_pitches[i]);
    }
    // The rest are either packed levels or don't exist at all.
    for (uint32_t i = mip_max_storage_level + 1;
--- a/src/xenia/gpu/registers.h
+++ b/src/xenia/gpu/registers.h
@ -253,7 +253,7 @@ union RB_DEPTH_INFO {

 union RB_COPY_CONTROL {
  xe::bf<uint32_t, 0, 3> copy_src_select;
-  xe::bf<uint32_t, 4, 3> copy_sample_select;
+  xe::bf<xenos::CopySampleSelect, 4, 3> copy_sample_select;
  xe::bf<uint32_t, 8, 1> color_clear_enable;
  xe::bf<uint32_t, 9, 1> depth_clear_enable;

--- a/src/xenia/gpu/texture_util.cc
+++ b/src/xenia/gpu/texture_util.cc
@ -56,7 +56,7 @@ void GetGuestMipBlocks(Dimension dimension, uint32_t width, uint32_t height,

 uint32_t GetGuestMipStorageSize(uint32_t width_blocks, uint32_t height_blocks,
                                uint32_t depth_blocks, bool is_tiled,
-                                TextureFormat format, uint32_t& row_pitch_out) {
+                                TextureFormat format, uint32_t* row_pitch_out) {
  const FormatInfo* format_info = FormatInfo::Get(format);
  uint32_t row_pitch = width_blocks * format_info->block_width *
                       format_info->block_height * format_info->bits_per_pixel /
@ -64,7 +64,9 @@ uint32_t GetGuestMipStorageSize(uint32_t width_blocks, uint32_t height_blocks,
  if (!is_tiled) {
    row_pitch = xe::align(row_pitch, 256u);
  }
-  row_pitch_out = row_pitch;
+  if (row_pitch_out != nullptr) {
+    *row_pitch_out = row_pitch;
+  }
  return xe::align(row_pitch * height_blocks * depth_blocks, 4096u);
 }

--- a/src/xenia/gpu/texture_util.h
+++ b/src/xenia/gpu/texture_util.h
@ -33,7 +33,7 @@ void GetGuestMipBlocks(Dimension dimension, uint32_t width, uint32_t height,
 // height and depth must be obtained via GetGuestMipExtent.
 uint32_t GetGuestMipStorageSize(uint32_t width_blocks, uint32_t height_blocks,
                                uint32_t depth_blocks, bool is_tiled,
-                                TextureFormat format, uint32_t& row_pitch_out);
+                                TextureFormat format, uint32_t* row_pitch_out);

 // Gets the number of the mipmap level where the packed mips are stored.
 inline uint32_t GetPackedMipLevel(uint32_t width, uint32_t height) {
--- a/src/xenia/gpu/xenos.h
+++ b/src/xenia/gpu/xenos.h
@ -285,6 +285,17 @@ enum class CopyCommand : uint32_t {
  kNull = 3,  // ?
 };

+// a2xx_rb_copy_sample_select
+enum class CopySampleSelect : uint32_t {
+  k0,
+  k1,
+  k2,
+  k3,
+  k01,
+  k23,
+  k0123,
+};
+
 #define XE_GPU_MAKE_SWIZZLE(x, y, z, w)                        \
  (((XE_GPU_SWIZZLE_##x) << 0) | ((XE_GPU_SWIZZLE_##y) << 3) | \
   ((XE_GPU_SWIZZLE_##z) << 6) | ((XE_GPU_SWIZZLE_##w) << 9))