Merge branch 'master' into vulkan

2022-05-24 22:34:40 +03:00 · 2022-05-24 22:34:40 +03:00 · aac28f19d1
parent f994d3ebb3 a4840e1992
commit aac28f19d1
8 changed files with 653 additions and 521 deletions
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.h
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h
@ -562,7 +562,7 @@ class D3D12CommandProcessor : public CommandProcessor {
  // Unsubmitted barrier batch.
  std::vector<D3D12_RESOURCE_BARRIER> barriers_;
-  // <Resource, submission where requested>, sorted by the submission number.
+  // <Submission where requested, resource>, sorted by the submission number.
  std::deque<std::pair<uint64_t, ID3D12Resource*>> resources_for_deletion_;
  static constexpr uint32_t kScratchBufferSizeIncrement = 16 * 1024 * 1024;
--- a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc
+++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc
--- a/src/xenia/gpu/d3d12/d3d12_texture_cache.h
+++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.h
@ -179,85 +179,23 @@ class D3D12TextureCache final : public TextureCache {
  static constexpr uint32_t kLoadGuestXThreadsPerGroupLog2 = 2;
  static constexpr uint32_t kLoadGuestYBlocksPerGroupLog2 = 5;
  // Identifies a texture load mode. The enumerators are sequential starting at
  // zero, so their order determines their numeric values.
  enum class LoadMode {
    k8bpb,
    k16bpb,
    k32bpb,
    k64bpb,
    k128bpb,
    kR5G5B5A1ToB5G5R5A1,
    kR5G6B5ToB5G6R5,
    kR5G5B6ToB5G6R5WithRBGASwizzle,
    kR4G4B4A4ToB4G4R4A4,
    kGBGR8ToGRGB8,
    kGBGR8ToRGB8,
    kBGRG8ToRGBG8,
    kBGRG8ToRGB8,
    kR10G11B11ToRGBA16,
    kR10G11B11ToRGBA16SNorm,
    kR11G11B10ToRGBA16,
    kR11G11B10ToRGBA16SNorm,
    kDXT1ToRGBA8,
    kDXT3ToRGBA8,
    kDXT5ToRGBA8,
    kDXNToRG8,
    kDXT3A,
    kDXT3AAs1111ToBGRA4,
    kDXT5AToR8,
    kCTX1,
    kDepthUnorm,
    kDepthFloat,
    // Follows the last real mode, so it equals the number of modes. Not a mode
    // itself.
    kCount,
    // Shares the value of kCount, so it never matches a real mode.
    kUnknown = kCount
  };
  // Description of one load mode: its shaders and the sizes they operate on.
  struct LoadModeInfo {
    // Shader used when resolution scaling is not involved, and its size.
    const void* shader;
    size_t shader_size;
    // Resolution-scaled variant of the shader, if available. It is kept
    // separate so that most textures are not affected by the code needed for
    // resolution scale support, and so that its presence can be used to check
    // whether the format allows resolution scaling.
    const void* shader_scaled;
    size_t shader_scaled_size;
    // Log2 of the sizes, in bytes, of the source (guest) SRV and of the
    // destination (host) UAV that the copying shader accesses. A single
    // invocation may copy multiple blocks.
    uint32_t srv_bpe_log2;
    uint32_t uav_bpe_log2;
    // Number of bytes the shader writes per host resolution-scaled block (a
    // guest block when not decompressing, a host texel when decompressing).
    uint32_t bytes_per_host_block;
    // Log2 of the number of guest resolution-scaled blocks along the X axis
    // loaded by a single shader thread. GetGuestXBlocksPerGroupLog2 gives the
    // corresponding value for a whole thread group.
    uint32_t guest_x_blocks_per_thread_log2;

    // Log2 of the number of guest blocks along X covered by one thread group:
    // the per-thread value plus the log2 of the X thread count in a group.
    uint32_t GetGuestXBlocksPerGroupLog2() const {
      const uint32_t blocks_per_thread_log2 = guest_x_blocks_per_thread_log2;
      return blocks_per_thread_log2 + kLoadGuestXThreadsPerGroupLog2;
    }
  };
  struct HostFormat {
    // Format info for the regular case.
    // DXGI format (typeless when different signedness or number representation
    // is used) for the texture resource.
    DXGI_FORMAT dxgi_format_resource;
    // DXGI format for unsigned normalized or unsigned/signed float SRV.
-    DXGI_FORMAT dxgi_format_unorm;
+    DXGI_FORMAT dxgi_format_unsigned;
-    // The regular load mode, used when special modes (like signed-specific or
+    // The regular load shader, used when special load shaders (like
-    // decompressing) aren't needed.
+    // signed-specific or decompressing) aren't needed.
-    LoadMode load_mode;
+    LoadShaderIndex load_shader;
    // DXGI format for signed normalized or unsigned/signed float SRV.
-    DXGI_FORMAT dxgi_format_snorm;
+    DXGI_FORMAT dxgi_format_signed;
    // If the signed version needs a different bit representation on the host,
-    // this is the load mode for the signed version. Otherwise the regular
+    // this is the load shader for the signed version. Otherwise the regular
-    // load_mode will be used for the signed version, and a single copy will be
+    // load_shader will be used for the signed version, and a single copy will
-    // created if both unsigned and signed are used.
+    // be created if both unsigned and signed are used.
-    LoadMode load_mode_snorm;
+    LoadShaderIndex load_shader_signed;
    // Do NOT add integer DXGI formats to this - they are not filterable, can
    // only be read with Load, not Sample! If any game is seen using num_format
@ -276,7 +214,7 @@ class D3D12TextureCache final : public TextureCache {
    // supports unsigned normalized formats - let's hope GPUSIGN_SIGNED was not
    // used for DXN and DXT5A.
    DXGI_FORMAT dxgi_format_uncompressed;
-    LoadMode decompress_mode;
+    LoadShaderIndex load_shader_decompress;
    // Mapping of Xenos swizzle components to DXGI format components.
    uint32_t swizzle;
@ -440,13 +378,13 @@ class D3D12TextureCache final : public TextureCache {
    const HostFormat& host_format = host_formats_[uint32_t(format)];
    return IsDecompressionNeeded(format, width, height)
               ? host_format.dxgi_format_uncompressed
-               : host_format.dxgi_format_unorm;
+               : host_format.dxgi_format_unsigned;
  }
  // Convenience overload: forwards the format, width and height taken from the
  // texture key to the three-argument GetDXGIUnormFormat.
  static DXGI_FORMAT GetDXGIUnormFormat(TextureKey key) {
    return GetDXGIUnormFormat(key.format, key.GetWidth(), key.GetHeight());
  }
-  static LoadMode GetLoadMode(TextureKey key);
+  static LoadShaderIndex GetLoadShaderIndex(TextureKey key);
  static constexpr bool AreDimensionsCompatible(
      xenos::FetchOpDimension binding_dimension,
@ -528,14 +466,11 @@ class D3D12TextureCache final : public TextureCache {
  D3D12CommandProcessor& command_processor_;
  bool bindless_resources_used_;
  static const LoadModeInfo load_mode_info_[];
  Microsoft::WRL::ComPtr<ID3D12RootSignature> load_root_signature_;
-  std::array<Microsoft::WRL::ComPtr<ID3D12PipelineState>,
+  std::array<Microsoft::WRL::ComPtr<ID3D12PipelineState>, kLoadShaderCount>
             size_t(LoadMode::kCount)>
      load_pipelines_;
  // Load pipelines for resolution-scaled resolve targets.
-  std::array<Microsoft::WRL::ComPtr<ID3D12PipelineState>,
+  std::array<Microsoft::WRL::ComPtr<ID3D12PipelineState>, kLoadShaderCount>
             size_t(LoadMode::kCount)>
      load_pipelines_scaled_;
  std::vector<SRVDescriptorCachePage> srv_descriptor_cache_;
--- a/src/xenia/gpu/texture_cache.cc
+++ b/src/xenia/gpu/texture_cache.cc
@ -73,6 +73,80 @@ DEFINE_uint32(
 namespace xe {
 namespace gpu {
 // Parameters of every load shader, one entry per LoadShaderIndex value in
 // enumerator order (the comment above each entry is the enumerator name
 // without its kLoadShaderIndex prefix). GetLoadShaderInfo indexes this array
 // directly with the enum value, so the order here must match the enum.
 // Each initializer lists the LoadShaderInfo members in declaration order:
 // {source_bpe_log2, dest_bpe_log2, bytes_per_host_block,
 //  guest_x_blocks_per_thread_log2}.
 const TextureCache::LoadShaderInfo
    TextureCache::load_shader_info_[kLoadShaderCount] = {
        // k8bpb
        {3, 4, 1, 4},
        // k16bpb
        {4, 4, 2, 4},
        // k32bpb
        {4, 4, 4, 3},
        // k64bpb
        {4, 4, 8, 2},
        // k128bpb
        {4, 4, 16, 1},
        // kR5G5B5A1ToB5G5R5A1
        {4, 4, 2, 4},
        // kR5G6B5ToB5G6R5
        {4, 4, 2, 4},
        // kR5G5B6ToB5G6R5WithRBGASwizzle
        {4, 4, 2, 4},
        // kRGBA4ToBGRA4
        {4, 4, 2, 4},
        // kRGBA4ToARGB4
        {4, 4, 2, 4},
        // kGBGR8ToGRGB8
        {4, 4, 4, 3},
        // kGBGR8ToRGB8
        {4, 4, 8, 3},
        // kBGRG8ToRGBG8
        {4, 4, 4, 3},
        // kBGRG8ToRGB8
        {4, 4, 8, 3},
        // kR10G11B11ToRGBA16
        {4, 4, 8, 3},
        // kR10G11B11ToRGBA16SNorm
        {4, 4, 8, 3},
        // kR11G11B10ToRGBA16
        {4, 4, 8, 3},
        // kR11G11B10ToRGBA16SNorm
        {4, 4, 8, 3},
        // kR16UNormToFloat
        {4, 4, 2, 4},
        // kR16SNormToFloat
        {4, 4, 2, 4},
        // kRG16UNormToFloat
        {4, 4, 4, 3},
        // kRG16SNormToFloat
        {4, 4, 4, 3},
        // kRGBA16UNormToFloat
        {4, 4, 8, 2},
        // kRGBA16SNormToFloat
        {4, 4, 8, 2},
        // kDXT1ToRGBA8
        {4, 4, 4, 2},
        // kDXT3ToRGBA8
        {4, 4, 4, 1},
        // kDXT5ToRGBA8
        {4, 4, 4, 1},
        // kDXNToRG8
        {4, 4, 2, 1},
        // kDXT3A
        {4, 4, 1, 2},
        // kDXT3AAs1111ToBGRA4
        {4, 4, 2, 2},
        // kDXT3AAs1111ToARGB4
        {4, 4, 2, 2},
        // kDXT5AToR8
        {4, 4, 1, 2},
        // kCTX1
        {4, 4, 2, 2},
        // kDepthUnorm
        {4, 4, 4, 3},
        // kDepthFloat
        {4, 4, 4, 3},
 };
 TextureCache::TextureCache(const RegisterFile& register_file,
                           SharedMemory& shared_memory,
                           uint32_t draw_resolution_scale_x,
--- a/src/xenia/gpu/texture_cache.h
+++ b/src/xenia/gpu/texture_cache.h
@ -395,6 +395,69 @@ class TextureCache {
    uint32_t height_texels;
  };
  static constexpr uint32_t kLoadGuestXThreadsPerGroupLog2 = 2;
  static constexpr uint32_t kLoadGuestYBlocksPerGroupLog2 = 5;
  // Index of a texture load shader. The values are used directly as indices
  // into load_shader_info_ (see GetLoadShaderInfo), so the enumerator order
  // must stay in sync with the entries of that table.
  enum LoadShaderIndex {
    kLoadShaderIndex8bpb,
    kLoadShaderIndex16bpb,
    kLoadShaderIndex32bpb,
    kLoadShaderIndex64bpb,
    kLoadShaderIndex128bpb,
    kLoadShaderIndexR5G5B5A1ToB5G5R5A1,
    kLoadShaderIndexR5G6B5ToB5G6R5,
    kLoadShaderIndexR5G5B6ToB5G6R5WithRBGASwizzle,
    kLoadShaderIndexRGBA4ToBGRA4,
    kLoadShaderIndexRGBA4ToARGB4,
    kLoadShaderIndexGBGR8ToGRGB8,
    kLoadShaderIndexGBGR8ToRGB8,
    kLoadShaderIndexBGRG8ToRGBG8,
    kLoadShaderIndexBGRG8ToRGB8,
    kLoadShaderIndexR10G11B11ToRGBA16,
    kLoadShaderIndexR10G11B11ToRGBA16SNorm,
    kLoadShaderIndexR11G11B10ToRGBA16,
    kLoadShaderIndexR11G11B10ToRGBA16SNorm,
    kLoadShaderIndexR16UNormToFloat,
    kLoadShaderIndexR16SNormToFloat,
    kLoadShaderIndexRG16UNormToFloat,
    kLoadShaderIndexRG16SNormToFloat,
    kLoadShaderIndexRGBA16UNormToFloat,
    kLoadShaderIndexRGBA16SNormToFloat,
    kLoadShaderIndexDXT1ToRGBA8,
    kLoadShaderIndexDXT3ToRGBA8,
    kLoadShaderIndexDXT5ToRGBA8,
    kLoadShaderIndexDXNToRG8,
    kLoadShaderIndexDXT3A,
    kLoadShaderIndexDXT3AAs1111ToBGRA4,
    kLoadShaderIndexDXT3AAs1111ToARGB4,
    kLoadShaderIndexDXT5AToR8,
    kLoadShaderIndexCTX1,
    kLoadShaderIndexDepthUnorm,
    kLoadShaderIndexDepthFloat,
    // Follows the last real shader index, so it equals the number of load
    // shaders; it is the size of load_shader_info_.
    kLoadShaderCount,
    // Shares the value of kLoadShaderCount, so it is never a valid index
    // (GetLoadShaderInfo asserts on it).
    kLoadShaderIndexUnknown = kLoadShaderCount,
  };
  // Sizes that a load shader operates on. Instances are aggregate-initialized
  // in member declaration order, so the members must not be reordered.
  struct LoadShaderInfo {
    // Log2 of the element sizes, in bytes, of the source (guest) and of the
    // destination (host) buffer bindings that the copying shader accesses. A
    // single invocation may copy multiple blocks.
    uint32_t source_bpe_log2;
    uint32_t dest_bpe_log2;
    // Number of bytes the shader writes per host resolution-scaled block (a
    // guest block when not decompressing, a host texel when decompressing).
    uint32_t bytes_per_host_block;
    // Log2 of the number of guest resolution-scaled blocks along the X axis
    // loaded by a single shader thread. GetGuestXBlocksPerGroupLog2 gives the
    // corresponding value for a whole thread group.
    uint32_t guest_x_blocks_per_thread_log2;

    // Log2 of the number of guest blocks along X covered by one thread group:
    // the per-thread value plus the log2 of the X thread count in a group.
    uint32_t GetGuestXBlocksPerGroupLog2() const {
      const uint32_t blocks_per_thread_log2 = guest_x_blocks_per_thread_log2;
      return blocks_per_thread_log2 + kLoadGuestXThreadsPerGroupLog2;
    }
  };
  static constexpr uint8_t kSwizzledSignsUnsigned =
      uint8_t(xenos::TextureSign::kUnsigned) * uint8_t(0b01010101);
@ -472,6 +535,11 @@ class TextureCache {
  // should be made.
  Texture* FindOrCreateTexture(TextureKey key);
  // Returns the load_shader_info_ entry for the given load shader. The index
  // must be a real shader index: kLoadShaderCount (and therefore
  // kLoadShaderIndexUnknown) trips the assertion.
  static const LoadShaderInfo& GetLoadShaderInfo(
      LoadShaderIndex load_shader_index) {
    assert_true(load_shader_index < kLoadShaderCount);
    const LoadShaderInfo& shader_info = load_shader_info_[load_shader_index];
    return shader_info;
  }
  bool LoadTextureData(Texture& texture);
  // Writes the texture data (for base, mips or both - but not neither) from the
  // shared memory or the scaled resolve memory. The shared memory management is
@ -527,6 +595,8 @@ class TextureCache {
  uint32_t draw_resolution_scale_x_;
  uint32_t draw_resolution_scale_y_;
  static const LoadShaderInfo load_shader_info_[kLoadShaderCount];
  xe::global_critical_region global_critical_region_;
  // Bit vector storing whether each 4 KB physical memory page contains scaled
  // resolve data. uint32_t rather than uint64_t because parts of it can be sent
--- a/src/xenia/gpu/texture_util.cc
+++ b/src/xenia/gpu/texture_util.cc
@ -391,6 +391,12 @@ TextureGuestLayout GetGuestTextureLayout(
        // 2D 32x32-block tiles are laid out linearly in the texture.
        // Calculate the extent as ((all rows except for the last * pitch in
        // tiles + last row length in tiles) * bytes per tile).
        // FIXME(Triang3l): This is wrong for 1bpb and 2bpb. At 1bpb (32x32 is
        // 1024 bytes), offset for X + 32 minus offset for X is 512, not 1024,
        // but offset for X + 128 minus offset for X + 96 is 2560. Also, for
        // XY = 0...31, the extent of the addresses is 2560, not 1024. At 2bpb,
        // addressing repeats every 64x64, and the extent for XY = 0...31 is
        // 3072, not 2048.
        level_layout.array_slice_data_extent_bytes =
            (level_layout.y_extent_blocks - xenos::kTextureTileWidthHeight) *
                level_layout.row_pitch_bytes +
--- a/src/xenia/gpu/texture_util.h
+++ b/src/xenia/gpu/texture_util.h
@ -173,8 +173,8 @@ struct TextureGuestLayout {
  // If mip_max_level specified at calculation time is at least 1, the stored
  // mips are min(1, packed_mip_level) through min(mip_max_level,
  // packed_mip_level).
-  Level mips[xenos::kTexture2DCubeMaxWidthHeightLog2 + 1];
+  Level mips[xenos::kTextureMaxMips];
-  uint32_t mip_offsets_bytes[xenos::kTexture2DCubeMaxWidthHeightLog2 + 1];
+  uint32_t mip_offsets_bytes[xenos::kTextureMaxMips];
  uint32_t mips_total_extent_bytes;
  uint32_t max_level;
  // UINT32_MAX if there's no packed mip tail.
@ -207,6 +207,11 @@ void GetTextureTotalSize(xenos::DataDimension dimension,
 //       Offset3D(X * 32, Y * 32, Z * 8) + Offset3D(x, y, z)
 //   (true for negative offsets too).
 // - 2D 32x32 tiles are laid out linearly.
 // FIXME(Triang3l): This is wrong for 1bpb and 2bpb. At 1bpb (32x32 is 1024
 // bytes), offset for X + 32 minus offset for X is 512, not 1024, but offset for
 // X + 128 minus offset for X + 96 is 2560. Also, for XY = 0...31, the extent of
 // the addresses is 2560, not 1024. At 2bpb, addressing repeats every 64x64, and
 // the extent for XY = 0...31 is 3072, not 2048.
 // - 3D tiled texture slices 0:3 and 4:7 are stored separately in memory, in
 //   non-overlapping ranges, but addressing in 4:7 is different than in 0:3.
 // - Addressing of blocks that are contiguous along X (for tiling/untiling of
--- a/src/xenia/gpu/xenos.h
+++ b/src/xenia/gpu/xenos.h
@ -1045,6 +1045,10 @@ constexpr uint32_t kTexture3DMaxWidthHeight = 1 << kTexture3DMaxWidthHeightLog2;
 constexpr uint32_t kTexture3DMaxDepthLog2 = 10;
 constexpr uint32_t kTexture3DMaxDepth = 1 << kTexture3DMaxDepthLog2;
 // One more than the larger of the 2D/cube and the 3D maximum width/height
 // log2 values (a texture 2^N texels wide has N + 1 levels, counting the
 // base).
 constexpr uint32_t kTextureMaxMips =
    1 + std::max(kTexture2DCubeMaxWidthHeightLog2,
                 kTexture3DMaxWidthHeightLog2);
 // Tiled texture sizes are in 32x32 increments for 2D, 32x32x4 for 3D.
 // 2DTiledOffset(X * 32 + x, Y * 32 + y) ==
 //     2DTiledOffset(X * 32, Y * 32) + 2DTiledOffset(x, y)