[GPU] Document memexport/resolve formats with more details

2021-10-22 20:00:41 +03:00 · 2021-10-22 20:00:41 +03:00 · 28fec845d5
parent d6660ac391
commit 28fec845d5
4 changed files with 141 additions and 26 deletions
--- a/src/xenia/gpu/draw_util.cc
+++ b/src/xenia/gpu/draw_util.cc
@ -876,8 +876,9 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
  }
  info_out.address.copy_sample_select = sample_select;
  // Get the format to pass to the shader in a unified way - for depth (for
-  // which Direct3D 9 specifies the k_8_8_8_8 destination format), make sure the
-  // shader won't try to do conversion - pass proper k_24_8 or k_24_8_FLOAT.
+  // which Direct3D 9 specifies the k_8_8_8_8 uint destination format), make
+  // sure the shader won't try to do conversion - pass proper k_24_8 or
+  // k_24_8_FLOAT.
  auto rb_copy_dest_info = regs.Get<reg::RB_COPY_DEST_INFO>();
  xenos::TextureFormat dest_format;
  auto rb_depth_info = regs.Get<reg::RB_DEPTH_INFO>();
--- a/src/xenia/gpu/dxbc_shader_translator_memexport.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_memexport.cc
@ -15,6 +15,14 @@ namespace xe {
 namespace gpu {
 using namespace ucode;

+// TODO(Triang3l): Support sub-dword memexports (like k_8 in 58410B86). This
+// would require four 128 MB R8_UINT UAVs due to the Nvidia addressing limit.
+// Need to be careful with resource binding tiers, however. Resource binding
+// tier 1 on feature level 11_0 allows only 8 UAVs _across all stages_.
+// RWByteAddressBuffer + 4 typed buffers is 5 per stage already, would need 10
+// for both VS and PS, or even 11 with the eDRAM ROV. Need to drop draw commands
+// doing memexport in both VS and PS on FL 11_0 resource binding tier 1.
+
 void DxbcShaderTranslator::ExportToMemory_PackFixed32(
    const uint32_t* eM_temps, uint32_t eM_count, const uint32_t bits[4],
    const dxbc::Src& is_integer, const dxbc::Src& is_signed) {
--- a/src/xenia/gpu/registers.h
+++ b/src/xenia/gpu/registers.h
@ -712,14 +712,14 @@ static_assert_size(RB_COPY_CONTROL, sizeof(uint32_t));

 union alignas(uint32_t) RB_COPY_DEST_INFO {
  struct {
-    xenos::Endian128 copy_dest_endian : 3;    // +0
-    uint32_t copy_dest_array : 1;             // +3
-    uint32_t copy_dest_slice : 3;             // +4
-    xenos::ColorFormat copy_dest_format : 6;  // +7
-    uint32_t copy_dest_number : 3;            // +13
-    int32_t copy_dest_exp_bias : 6;           // +16
-    uint32_t : 2;                             // +22
-    uint32_t copy_dest_swap : 1;              // +24
+    xenos::Endian128 copy_dest_endian : 3;            // +0
+    uint32_t copy_dest_array : 1;                     // +3
+    uint32_t copy_dest_slice : 3;                     // +4
+    xenos::ColorFormat copy_dest_format : 6;          // +7
+    xenos::SurfaceNumberFormat copy_dest_number : 3;  // +13
+    int32_t copy_dest_exp_bias : 6;                   // +16
+    uint32_t : 2;                                     // +22
+    uint32_t copy_dest_swap : 1;                      // +24
  };
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_RB_COPY_DEST_INFO;
--- a/src/xenia/gpu/xenos.h
+++ b/src/xenia/gpu/xenos.h
@ -185,7 +185,7 @@ enum class IndexFormat : uint32_t {
 };

 // SurfaceNumberX from yamato_enum.h.
-enum class SurfaceNumFormat : uint32_t {
+enum class SurfaceNumberFormat : uint32_t {
  kUnsignedRepeatingFraction = 0,
  // Microsoft-style, scale factor (2^(n-1))-1.
  kSignedRepeatingFraction = 1,
@ -1176,14 +1176,120 @@ union alignas(uint32_t) xe_gpu_fetch_group_t {
 };
 static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6);

-// GPU_MEMEXPORT_STREAM_CONSTANT from a game .pdb - float constant for memexport
-// stream configuration.
-// This is used with the floating-point ALU in shaders (written to eA using
-// mad), so the dwords have a normalized exponent when reinterpreted as floats
-// (otherwise they would be flushed to zero), but actually these are packed
-// integers. dword_1 specifically is 2^23 because
-// powf(2.0f, 23.0f) + float(i) == 0x4B000000 | i
-// so mad can pack indices as integers in the lower bits.
+// Shader memory export (memexport) allows for writing of arbitrary formatted
+// data with random access / scatter capabilities. It provides functionality
+// largely similar to resolving - format packing, supporting arbitrary color
+// formats, from sub-dword ones such as k_8 in 58410B86, to 128-bit ones, with
+// endian swap similar to how it's performed in resolves (up to 128-bit);
+// specifying the number format, swapping red and blue channels - though with no
+// exponent biasing. Unlike resolving, however, instead of writing to tiled
+// textures, it exports the data to up to 5 elements (the eM# shader registers,
+// each corresponding to `base address + element size * (offset + 0...4)`) in a
+// stream defined by a stream constant and an offset in elements written to eA -
+// a shader, however, can write to multiple streams with different or the same
+// stream constants, by performing `alloc export` multiple times. It's used
+// mostly in vertex shaders (most commonly in improvised "compute shaders" done
+// by executing a vertex shader for a number of point-type primitives covering
+// nothing), though usage in pixel shaders is also possible - an example is
+// provided in the "Advanced Screenspace Antialiasing" presentation by Arne
+// Schober.
+// https://ubm-twvideo01.s3.amazonaws.com/o1/vault/gdceurope2010/slides/A_Schober_Advanced_Screenspace_Antialiasing.pdf
+//
+// Unlike fetch constants, which are passed via special registers, a memory
+// export stream is configured by writing the stream constant and the offset to
+// a shader export register (eA) allocated by the shader - similar to more
+// conventional exports like oPos, o#, oC#. Therefore, in general, it's not
+// possible to know what its value will be without running the shader. For
+// emulation, this means that the memory range referenced by an export - that
+// needs to be validated - requires running the shader on the CPU in general.
+// Thankfully, however, the usual way of setting up eA is by executing:
+// `mad eA, r#, const0100, c#`
+// where c# is the stream float4 constant from the float constant registers, and
+// const0100 is a literal (0.0f, 1.0f, 0.0f, 0.0f) constant, also from the float
+// constant registers, used for placing the element index (r#) in the correct
+// component of eA. This allows for easy gathering of memexport stream
+// constants, which contain both the base address and the size of the
+// destination buffer for bounds checking, from the shader code and the float
+// constant registers, as long as the guest uses this instruction pattern to
+// write to eA.
+//
+// The Xenos doesn't have an integer ALU, and denormals are treated as zero and
+// are flushed. However, eA contains integers and bit fields. A stream constant
+// is thus structured in a way that allows for packing integers in normalized
+// floating-point numbers.
+//
+// X contains the base address of the stream in dwords as integer bits in the
+// lower 30 bits, and bits 0b01 in the top. The 0b01 bits make the exponent
+// nonzero, so the number is considered normalized, and therefore isn't flushed
+// to zero. With only 512 MB of physical memory on the Xbox 360, the exponent
+// can't become 0b11111111, so X also won't be NaN for any valid Xbox 360
+// physical address. (The GPU itself supports 32-bit addresses in general, but
+// memexport was originally an Xbox 360-specific feature, although it was
+// likely reused later for GL_QCOM_writeonly_rendering.)
+//
+// TODO(Triang3l): Verify whether GL_QCOM_writeonly_rendering is actually
+// memexport on the Adreno 2xx using GL_OES_get_program_binary - it's also
+// interesting to see how alphatest interacts with it, whether it's still true
+// fixed-function alphatest, as it's claimed to be supported as usual by the
+// extension specification - it's likely, however, that memory exports are
+// discarded alongside other exports such as oC# and oDepth this way.
+//
+// Y of eA contains the offset in elements - this is what shaders are supposed
+// to calculate from something like the vertex index. Again, it's specified as
+// an integer in the low bits, not as a truly floating-point number. For this
+// purpose, the Y component of a stream constant contains the value 2^23 - when
+// a non-negative whole number smaller than 2^23 is added to 2^23 in floating
+// point, its integer value ends up in the mantissa bits of a result with an
+// exponent of 23. Via multiply-add, `offset * 1.0f + exp2f(23)` is written here
+// by the shader, allowing for element offsets of up to 2^23 - 1.
+//
+// Z is a bit field with the information about the formatting of the data. It's
+// also packed as a normalized floating-point number, but in a cleaner way than
+// X because not as many bits are required - just like Y, it has an exponent of
+// 23 (possibly to let shaders build these values manually, using floating-point
+// multiply-add as an integer shift-or and adding 2^23 at the end, though that
+// case is not as easy to handle in emulation as prebuilt stream constants).
+//
+// W contains the number of elements in the stream. It's also packed with an
+// exponent of 23, just like Y and Z - since there's no way to index more than
+// 2^23 elements using packing via addition to 2^23, this field doesn't need
+// more bits than that either.
+//
+// Examples of setup in titles (Z from MSB to LSB):
+//
+// 4D5307E6 particles (different VS invocation counts, like 1, 2, 4):
+// There is a passthrough shader - useful for verification as it simply writes
+// directly what it reads via vfetch of various formats. Another shader (with
+// different c# numbers, but same formats) does complicated math to process the
+// particles.
+// c152:           Z = 010010110000|0|111|00|100110|00000|010, count = 35840
+//   8in32, 32_32_32_32_FLOAT, float, RGBA - from 32_32_32_32_FLOAT vfetch
+// c154, 162:      Z = 010010110000|0|111|00|100000|00000|001, count = 71680
+//   8in16, 16_16_16_16_FLOAT, float, RGBA - from 16_16_16_16_FLOAT vfetch
+// c156, 158, 160: Z = 010010110000|0|000|00|011010|00000|001, count = 71680
+//   8in16, 16_16_16_16, unorm, RGBA - from 16_16_16_16 unorm vfetch
+// c164:           Z = 010010110000|0|111|00|011111|00000|001, count = 143360
+//   8in16, 16_16_FLOAT, float, RGBA - from 16_16_FLOAT vfetch
+// c166:           Z = 010010110000|0|000|00|011001|00000|001, count = 143360
+//   8in16, 16_16, unorm, RGBA - from 16_16 unorm vfetch
+// c168:           Z = 010010110000|0|001|00|000111|00000|010, count = 143360
+//   8in32, 2_10_10_10, snorm, RGBA - from 2_10_10_10 snorm vfetch
+// c170, 172:      Z = 010010110000|1|000|00|000110|00000|010, count = 143360
+//   8in32, 8_8_8_8, unorm, BGRA - from 8_8_8_8 unorm vfetch with .zyxw swizzle
+//
+// 4D5307E6 water simulation (2048 VS invocations):
+// c130: Z = 010010110000|0|111|00|100110|00000|010, count = 16384
+//   8in32, 32_32_32_32_FLOAT, float, RGBA
+//   The shader has 5 memexports of this kind and 6 32_32_32_32_FLOAT vfetches.
+//
+// 4D5307E6 water tessellation factors (1 VS invocation per triangle patch):
+// c130: Z = 010010110000|0|111|11|100100|11111|010, count = patch count * 3
+//   8in32, 32_FLOAT, float, RGBA
+//
+// 41560817 texture memory copying (64 bytes per invocation, two eA, eight eM#):
+// c0: Z = 010010110000|0|010|11|011010|00011|001
+//   8in16, 16_16_16_16, uint, RGBA - from 16_16_16_16 uint vfetch
+//   (16_16_16_16 is the largest color format without special values)
 union alignas(uint32_t) xe_gpu_memexport_stream_t {
  struct {
    uint32_t base_address : 30;  // +0 dword_0 physical address >> 2
@ -1191,13 +1297,13 @@ union alignas(uint32_t) xe_gpu_memexport_stream_t {

    uint32_t const_0x4b000000;  // +0 dword_1

-    Endian128 endianness : 3;         // +0 dword_2
-    uint32_t unused_0 : 5;            // +3
-    ColorFormat format : 6;           // +8
-    uint32_t unused_1 : 2;            // +14
-    SurfaceNumFormat num_format : 3;  // +16
-    uint32_t red_blue_swap : 1;       // +19
-    uint32_t const_0x4b0 : 12;        // +20
+    Endian128 endianness : 3;            // +0 dword_2
+    uint32_t unused_0 : 5;               // +3
+    ColorFormat format : 6;              // +8
+    uint32_t unused_1 : 2;               // +14
+    SurfaceNumberFormat num_format : 3;  // +16
+    uint32_t red_blue_swap : 1;          // +19
+    uint32_t const_0x4b0 : 12;           // +20

    uint32_t index_count : 23;  // +0 dword_3
    uint32_t const_0x96 : 9;    // +23