Merge branch 'master' into vulkan

2022-05-24 22:34:40 +03:00 · 2022-05-24 22:34:40 +03:00 · aac28f19d1
parent f994d3ebb3 a4840e1992
commit aac28f19d1
8 changed files with 653 additions and 521 deletions
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.h
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h
@@ -562,7 +562,7 @@ class D3D12CommandProcessor : public CommandProcessor {
  // Unsubmitted barrier batch.
  std::vector<D3D12_RESOURCE_BARRIER> barriers_;

-  // <Resource, submission where requested>, sorted by the submission number.
+  // <Submission where requested, resource>, sorted by the submission number.
  std::deque<std::pair<uint64_t, ID3D12Resource*>> resources_for_deletion_;

  static constexpr uint32_t kScratchBufferSizeIncrement = 16 * 1024 * 1024;
--- a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc
+++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc
--- a/src/xenia/gpu/d3d12/d3d12_texture_cache.h
+++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.h
@@ -179,85 +179,23 @@ class D3D12TextureCache final : public TextureCache {
  static constexpr uint32_t kLoadGuestXThreadsPerGroupLog2 = 2;
  static constexpr uint32_t kLoadGuestYBlocksPerGroupLog2 = 5;

-  enum class LoadMode {
-    k8bpb,
-    k16bpb,
-    k32bpb,
-    k64bpb,
-    k128bpb,
-    kR5G5B5A1ToB5G5R5A1,
-    kR5G6B5ToB5G6R5,
-    kR5G5B6ToB5G6R5WithRBGASwizzle,
-    kR4G4B4A4ToB4G4R4A4,
-    kGBGR8ToGRGB8,
-    kGBGR8ToRGB8,
-    kBGRG8ToRGBG8,
-    kBGRG8ToRGB8,
-    kR10G11B11ToRGBA16,
-    kR10G11B11ToRGBA16SNorm,
-    kR11G11B10ToRGBA16,
-    kR11G11B10ToRGBA16SNorm,
-    kDXT1ToRGBA8,
-    kDXT3ToRGBA8,
-    kDXT5ToRGBA8,
-    kDXNToRG8,
-    kDXT3A,
-    kDXT3AAs1111ToBGRA4,
-    kDXT5AToR8,
-    kCTX1,
-    kDepthUnorm,
-    kDepthFloat,
-
-    kCount,
-
-    kUnknown = kCount
-  };
-
-  struct LoadModeInfo {
-    // Shader without resolution scaling.
-    const void* shader;
-    size_t shader_size;
-    // Shader with resolution scaling, if available. These shaders are separate
-    // so the majority of the textures are not affected by the code needed for
-    // resolution scale support, and also to check if the format allows
-    // resolution scaling.
-    const void* shader_scaled;
-    size_t shader_scaled_size;
-    // Log2 of the sizes, in bytes, of the source (guest) SRV and the
-    // destination (host) UAV accessed by the copying shader, since the shader
-    // may copy multiple blocks per one invocation.
-    uint32_t srv_bpe_log2;
-    uint32_t uav_bpe_log2;
-    // Number of bytes in a host resolution-scaled block (corresponding to a
-    // guest block if not decompressing, or a host texel if decompressing)
-    // written by the shader.
-    uint32_t bytes_per_host_block;
-    // Log2 of the number of guest resolution-scaled blocks along the X axis
-    // loaded by a single thread shader group.
-    uint32_t guest_x_blocks_per_thread_log2;
-
-    uint32_t GetGuestXBlocksPerGroupLog2() const {
-      return kLoadGuestXThreadsPerGroupLog2 + guest_x_blocks_per_thread_log2;
-    }
-  };
-
  struct HostFormat {
    // Format info for the regular case.
    // DXGI format (typeless when different signedness or number representation
    // is used) for the texture resource.
    DXGI_FORMAT dxgi_format_resource;
    // DXGI format for unsigned normalized or unsigned/signed float SRV.
-    DXGI_FORMAT dxgi_format_unorm;
-    // The regular load mode, used when special modes (like signed-specific or
-    // decompressing) aren't needed.
-    LoadMode load_mode;
+    DXGI_FORMAT dxgi_format_unsigned;
+    // The regular load shader, used when special load shaders (like
+    // signed-specific or decompressing) aren't needed.
+    LoadShaderIndex load_shader;
    // DXGI format for signed normalized or unsigned/signed float SRV.
-    DXGI_FORMAT dxgi_format_snorm;
+    DXGI_FORMAT dxgi_format_signed;
    // If the signed version needs a different bit representation on the host,
-    // this is the load mode for the signed version. Otherwise the regular
-    // load_mode will be used for the signed version, and a single copy will be
-    // created if both unsigned and signed are used.
-    LoadMode load_mode_snorm;
+    // this is the load shader for the signed version. Otherwise the regular
+    // load_shader will be used for the signed version, and a single copy will
+    // be created if both unsigned and signed are used.
+    LoadShaderIndex load_shader_signed;

    // Do NOT add integer DXGI formats to this - they are not filterable, can
    // only be read with Load, not Sample! If any game is seen using num_format
@@ -276,7 +214,7 @@ class D3D12TextureCache final : public TextureCache {
    // supports unsigned normalized formats - let's hope GPUSIGN_SIGNED was not
    // used for DXN and DXT5A.
    DXGI_FORMAT dxgi_format_uncompressed;
-    LoadMode decompress_mode;
+    LoadShaderIndex load_shader_decompress;

    // Mapping of Xenos swizzle components to DXGI format components.
    uint32_t swizzle;
@@ -440,13 +378,13 @@ class D3D12TextureCache final : public TextureCache {
    const HostFormat& host_format = host_formats_[uint32_t(format)];
    return IsDecompressionNeeded(format, width, height)
               ? host_format.dxgi_format_uncompressed
-               : host_format.dxgi_format_unorm;
+               : host_format.dxgi_format_unsigned;
  }
  static DXGI_FORMAT GetDXGIUnormFormat(TextureKey key) {
    return GetDXGIUnormFormat(key.format, key.GetWidth(), key.GetHeight());
  }

-  static LoadMode GetLoadMode(TextureKey key);
+  static LoadShaderIndex GetLoadShaderIndex(TextureKey key);

  static constexpr bool AreDimensionsCompatible(
      xenos::FetchOpDimension binding_dimension,
@@ -528,14 +466,11 @@ class D3D12TextureCache final : public TextureCache {
  D3D12CommandProcessor& command_processor_;
  bool bindless_resources_used_;

-  static const LoadModeInfo load_mode_info_[];
  Microsoft::WRL::ComPtr<ID3D12RootSignature> load_root_signature_;
-  std::array<Microsoft::WRL::ComPtr<ID3D12PipelineState>,
-             size_t(LoadMode::kCount)>
+  std::array<Microsoft::WRL::ComPtr<ID3D12PipelineState>, kLoadShaderCount>
      load_pipelines_;
  // Load pipelines for resolution-scaled resolve targets.
-  std::array<Microsoft::WRL::ComPtr<ID3D12PipelineState>,
-             size_t(LoadMode::kCount)>
+  std::array<Microsoft::WRL::ComPtr<ID3D12PipelineState>, kLoadShaderCount>
      load_pipelines_scaled_;

  std::vector<SRVDescriptorCachePage> srv_descriptor_cache_;
--- a/src/xenia/gpu/texture_cache.cc
+++ b/src/xenia/gpu/texture_cache.cc
@@ -73,6 +73,80 @@ DEFINE_uint32(
 namespace xe {
 namespace gpu {

+const TextureCache::LoadShaderInfo
+    TextureCache::load_shader_info_[kLoadShaderCount] = {
+        // k8bpb
+        {3, 4, 1, 4},
+        // k16bpb
+        {4, 4, 2, 4},
+        // k32bpb
+        {4, 4, 4, 3},
+        // k64bpb
+        {4, 4, 8, 2},
+        // k128bpb
+        {4, 4, 16, 1},
+        // kR5G5B5A1ToB5G5R5A1
+        {4, 4, 2, 4},
+        // kR5G6B5ToB5G6R5
+        {4, 4, 2, 4},
+        // kR5G5B6ToB5G6R5WithRBGASwizzle
+        {4, 4, 2, 4},
+        // kRGBA4ToBGRA4
+        {4, 4, 2, 4},
+        // kRGBA4ToARGB4
+        {4, 4, 2, 4},
+        // kGBGR8ToGRGB8
+        {4, 4, 4, 3},
+        // kGBGR8ToRGB8
+        {4, 4, 8, 3},
+        // kBGRG8ToRGBG8
+        {4, 4, 4, 3},
+        // kBGRG8ToRGB8
+        {4, 4, 8, 3},
+        // kR10G11B11ToRGBA16
+        {4, 4, 8, 3},
+        // kR10G11B11ToRGBA16SNorm
+        {4, 4, 8, 3},
+        // kR11G11B10ToRGBA16
+        {4, 4, 8, 3},
+        // kR11G11B10ToRGBA16SNorm
+        {4, 4, 8, 3},
+        // kR16UNormToFloat
+        {4, 4, 2, 4},
+        // kR16SNormToFloat
+        {4, 4, 2, 4},
+        // kRG16UNormToFloat
+        {4, 4, 4, 3},
+        // kRG16SNormToFloat
+        {4, 4, 4, 3},
+        // kRGBA16UNormToFloat
+        {4, 4, 8, 2},
+        // kRGBA16SNormToFloat
+        {4, 4, 8, 2},
+        // kDXT1ToRGBA8
+        {4, 4, 4, 2},
+        // kDXT3ToRGBA8
+        {4, 4, 4, 1},
+        // kDXT5ToRGBA8
+        {4, 4, 4, 1},
+        // kDXNToRG8
+        {4, 4, 2, 1},
+        // kDXT3A
+        {4, 4, 1, 2},
+        // kDXT3AAs1111ToBGRA4
+        {4, 4, 2, 2},
+        // kDXT3AAs1111ToARGB4
+        {4, 4, 2, 2},
+        // kDXT5AToR8
+        {4, 4, 1, 2},
+        // kCTX1
+        {4, 4, 2, 2},
+        // kDepthUnorm
+        {4, 4, 4, 3},
+        // kDepthFloat
+        {4, 4, 4, 3},
+};
+
 TextureCache::TextureCache(const RegisterFile& register_file,
                           SharedMemory& shared_memory,
                           uint32_t draw_resolution_scale_x,
--- a/src/xenia/gpu/texture_cache.h
+++ b/src/xenia/gpu/texture_cache.h
@@ -395,6 +395,69 @@ class TextureCache {
    uint32_t height_texels;
  };

+  static constexpr uint32_t kLoadGuestXThreadsPerGroupLog2 = 2;
+  static constexpr uint32_t kLoadGuestYBlocksPerGroupLog2 = 5;
+
+  enum LoadShaderIndex {
+    kLoadShaderIndex8bpb,
+    kLoadShaderIndex16bpb,
+    kLoadShaderIndex32bpb,
+    kLoadShaderIndex64bpb,
+    kLoadShaderIndex128bpb,
+    kLoadShaderIndexR5G5B5A1ToB5G5R5A1,
+    kLoadShaderIndexR5G6B5ToB5G6R5,
+    kLoadShaderIndexR5G5B6ToB5G6R5WithRBGASwizzle,
+    kLoadShaderIndexRGBA4ToBGRA4,
+    kLoadShaderIndexRGBA4ToARGB4,
+    kLoadShaderIndexGBGR8ToGRGB8,
+    kLoadShaderIndexGBGR8ToRGB8,
+    kLoadShaderIndexBGRG8ToRGBG8,
+    kLoadShaderIndexBGRG8ToRGB8,
+    kLoadShaderIndexR10G11B11ToRGBA16,
+    kLoadShaderIndexR10G11B11ToRGBA16SNorm,
+    kLoadShaderIndexR11G11B10ToRGBA16,
+    kLoadShaderIndexR11G11B10ToRGBA16SNorm,
+    kLoadShaderIndexR16UNormToFloat,
+    kLoadShaderIndexR16SNormToFloat,
+    kLoadShaderIndexRG16UNormToFloat,
+    kLoadShaderIndexRG16SNormToFloat,
+    kLoadShaderIndexRGBA16UNormToFloat,
+    kLoadShaderIndexRGBA16SNormToFloat,
+    kLoadShaderIndexDXT1ToRGBA8,
+    kLoadShaderIndexDXT3ToRGBA8,
+    kLoadShaderIndexDXT5ToRGBA8,
+    kLoadShaderIndexDXNToRG8,
+    kLoadShaderIndexDXT3A,
+    kLoadShaderIndexDXT3AAs1111ToBGRA4,
+    kLoadShaderIndexDXT3AAs1111ToARGB4,
+    kLoadShaderIndexDXT5AToR8,
+    kLoadShaderIndexCTX1,
+    kLoadShaderIndexDepthUnorm,
+    kLoadShaderIndexDepthFloat,
+
+    kLoadShaderCount,
+    kLoadShaderIndexUnknown = kLoadShaderCount,
+  };
+
+  struct LoadShaderInfo {
+    // Log2 of the sizes, in bytes, of the elements in the source (guest) and
+    // the destination (host) buffer bindings accessed by the copying shader,
+    // since the shader may copy multiple blocks per one invocation.
+    uint32_t source_bpe_log2;
+    uint32_t dest_bpe_log2;
+    // Number of bytes in a host resolution-scaled block (corresponding to a
+    // guest block if not decompressing, or a host texel if decompressing)
+    // written by the shader.
+    uint32_t bytes_per_host_block;
+    // Log2 of the number of guest resolution-scaled blocks along the X axis
+    // loaded by a single thread of a shader thread group.
+    uint32_t guest_x_blocks_per_thread_log2;
+
+    uint32_t GetGuestXBlocksPerGroupLog2() const {
+      return kLoadGuestXThreadsPerGroupLog2 + guest_x_blocks_per_thread_log2;
+    }
+  };
+
  static constexpr uint8_t kSwizzledSignsUnsigned =
      uint8_t(xenos::TextureSign::kUnsigned) * uint8_t(0b01010101);

@@ -472,6 +535,11 @@ class TextureCache {
  // should be made.
  Texture* FindOrCreateTexture(TextureKey key);

+  static const LoadShaderInfo& GetLoadShaderInfo(
+      LoadShaderIndex load_shader_index) {
+    assert_true(load_shader_index < kLoadShaderCount);
+    return load_shader_info_[load_shader_index];
+  }
  bool LoadTextureData(Texture& texture);
  // Writes the texture data (for base, mips or both - but not neither) from the
  // shared memory or the scaled resolve memory. The shared memory management is
@@ -527,6 +595,8 @@ class TextureCache {
  uint32_t draw_resolution_scale_x_;
  uint32_t draw_resolution_scale_y_;

+  static const LoadShaderInfo load_shader_info_[kLoadShaderCount];
+
  xe::global_critical_region global_critical_region_;
  // Bit vector storing whether each 4 KB physical memory page contains scaled
  // resolve data. uint32_t rather than uint64_t because parts of it can be sent
--- a/src/xenia/gpu/texture_util.cc
+++ b/src/xenia/gpu/texture_util.cc
@@ -391,6 +391,12 @@ TextureGuestLayout GetGuestTextureLayout(
        // 2D 32x32-block tiles are laid out linearly in the texture.
        // Calculate the extent as ((all rows except for the last * pitch in
        // tiles + last row length in tiles) * bytes per tile).
+        // FIXME(Triang3l): This is wrong for 1bpb and 2bpb. At 1bpb (32x32 is
+        // 1024 bytes), offset for X + 32 minus offset for X is 512, not 1024,
+        // but offset for X + 128 minus offset for X + 96 is 2560. Also, for
+        // XY = 0...31, the extent of the addresses is 2560, not 1024. At 2bpb,
+        // addressing repeats every 64x64, and the extent for XY = 0...31 is
+        // 3072, not 2048.
        level_layout.array_slice_data_extent_bytes =
            (level_layout.y_extent_blocks - xenos::kTextureTileWidthHeight) *
                level_layout.row_pitch_bytes +
--- a/src/xenia/gpu/texture_util.h
+++ b/src/xenia/gpu/texture_util.h
@@ -173,8 +173,8 @@ struct TextureGuestLayout {
  // If mip_max_level specified at calculation time is at least 1, the stored
  // mips are min(1, packed_mip_level) through min(mip_max_level,
  // packed_mip_level).
-  Level mips[xenos::kTexture2DCubeMaxWidthHeightLog2 + 1];
-  uint32_t mip_offsets_bytes[xenos::kTexture2DCubeMaxWidthHeightLog2 + 1];
+  Level mips[xenos::kTextureMaxMips];
+  uint32_t mip_offsets_bytes[xenos::kTextureMaxMips];
  uint32_t mips_total_extent_bytes;
  uint32_t max_level;
  // UINT32_MAX if there's no packed mip tail.
@@ -207,6 +207,11 @@ void GetTextureTotalSize(xenos::DataDimension dimension,
 //       Offset3D(X * 32, Y * 32, Z * 8) + Offset3D(x, y, z)
 //   (true for negative offsets too).
 // - 2D 32x32 tiles are laid out linearly.
+// FIXME(Triang3l): This is wrong for 1bpb and 2bpb. At 1bpb (32x32 is 1024
+// bytes), offset for X + 32 minus offset for X is 512, not 1024, but offset for
+// X + 128 minus offset for X + 96 is 2560. Also, for XY = 0...31, the extent of
+// the addresses is 2560, not 1024. At 2bpb, addressing repeats every 64x64, and
+// the extent for XY = 0...31 is 3072, not 2048.
 // - 3D tiled texture slices 0:3 and 4:7 are stored separately in memory, in
 //   non-overlapping ranges, but addressing in 4:7 is different than in 0:3.
 // - Addressing of blocks that are contiguous along X (for tiling/untiling of
--- a/src/xenia/gpu/xenos.h
+++ b/src/xenia/gpu/xenos.h
@@ -1045,6 +1045,10 @@ constexpr uint32_t kTexture3DMaxWidthHeight = 1 << kTexture3DMaxWidthHeightLog2;
 constexpr uint32_t kTexture3DMaxDepthLog2 = 10;
 constexpr uint32_t kTexture3DMaxDepth = 1 << kTexture3DMaxDepthLog2;

+constexpr uint32_t kTextureMaxMips =
+    std::max(kTexture2DCubeMaxWidthHeightLog2, kTexture3DMaxWidthHeightLog2) +
+    1;
+
 // Tiled texture sizes are in 32x32 increments for 2D, 32x32x4 for 3D.
 // 2DTiledOffset(X * 32 + x, Y * 32 + y) ==
 //     2DTiledOffset(X * 32, Y * 32) + 2DTiledOffset(x, y)