diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 5157c1a88..a91db496c 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -876,8 +876,9 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, } info_out.address.copy_sample_select = sample_select; // Get the format to pass to the shader in a unified way - for depth (for - // which Direct3D 9 specifies the k_8_8_8_8 destination format), make sure the - // shader won't try to do conversion - pass proper k_24_8 or k_24_8_FLOAT. + // which Direct3D 9 specifies the k_8_8_8_8 uint destination format), make + // sure the shader won't try to do conversion - pass proper k_24_8 or + // k_24_8_FLOAT. auto rb_copy_dest_info = regs.Get(); xenos::TextureFormat dest_format; auto rb_depth_info = regs.Get(); diff --git a/src/xenia/gpu/dxbc_shader_translator_memexport.cc b/src/xenia/gpu/dxbc_shader_translator_memexport.cc index 87de6cce3..4e8a43b62 100644 --- a/src/xenia/gpu/dxbc_shader_translator_memexport.cc +++ b/src/xenia/gpu/dxbc_shader_translator_memexport.cc @@ -15,6 +15,14 @@ namespace xe { namespace gpu { using namespace ucode; +// TODO(Triang3l): Support sub-dword memexports (like k_8 in 58410B86). This +// would require four 128 MB R8_UINT UAVs due to the Nvidia addressing limit. +// Need to be careful with resource binding tiers, however. Resource binding +// tier 1 on feature level 11_0 allows only 8 UAVs _across all stages_. +// RWByteAddressBuffer + 4 typed buffers is 5 per stage already, would need 10 +// for both VS and PS, or even 11 with the eDRAM ROV. Need to drop draw commands +// doing memexport in both VS and PS on FL 11_0 resource binding tier 1. 
+ void DxbcShaderTranslator::ExportToMemory_PackFixed32( const uint32_t* eM_temps, uint32_t eM_count, const uint32_t bits[4], const dxbc::Src& is_integer, const dxbc::Src& is_signed) { diff --git a/src/xenia/gpu/registers.h b/src/xenia/gpu/registers.h index 69d922d8b..029a2d6d8 100644 --- a/src/xenia/gpu/registers.h +++ b/src/xenia/gpu/registers.h @@ -712,14 +712,14 @@ static_assert_size(RB_COPY_CONTROL, sizeof(uint32_t)); union alignas(uint32_t) RB_COPY_DEST_INFO { struct { - xenos::Endian128 copy_dest_endian : 3; // +0 - uint32_t copy_dest_array : 1; // +3 - uint32_t copy_dest_slice : 3; // +4 - xenos::ColorFormat copy_dest_format : 6; // +7 - uint32_t copy_dest_number : 3; // +13 - int32_t copy_dest_exp_bias : 6; // +16 - uint32_t : 2; // +22 - uint32_t copy_dest_swap : 1; // +24 + xenos::Endian128 copy_dest_endian : 3; // +0 + uint32_t copy_dest_array : 1; // +3 + uint32_t copy_dest_slice : 3; // +4 + xenos::ColorFormat copy_dest_format : 6; // +7 + xenos::SurfaceNumberFormat copy_dest_number : 3; // +13 + int32_t copy_dest_exp_bias : 6; // +16 + uint32_t : 2; // +22 + uint32_t copy_dest_swap : 1; // +24 }; uint32_t value; static constexpr Register register_index = XE_GPU_REG_RB_COPY_DEST_INFO; diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 2a0b6c938..2f0ee64e2 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -185,7 +185,7 @@ enum class IndexFormat : uint32_t { }; // SurfaceNumberX from yamato_enum.h. -enum class SurfaceNumFormat : uint32_t { +enum class SurfaceNumberFormat : uint32_t { kUnsignedRepeatingFraction = 0, // Microsoft-style, scale factor (2^(n-1))-1. kSignedRepeatingFraction = 1, @@ -1176,14 +1176,120 @@ union alignas(uint32_t) xe_gpu_fetch_group_t { }; static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6); -// GPU_MEMEXPORT_STREAM_CONSTANT from a game .pdb - float constant for memexport -// stream configuration. 
-// This is used with the floating-point ALU in shaders (written to eA using -// mad), so the dwords have a normalized exponent when reinterpreted as floats -// (otherwise they would be flushed to zero), but actually these are packed -// integers. dword_1 specifically is 2^23 because -// powf(2.0f, 23.0f) + float(i) == 0x4B000000 | i -// so mad can pack indices as integers in the lower bits. +// Shader memory export (memexport) allows for writing of arbitrarily formatted +// data with random access / scatter capabilities. It provides functionality +// largely similar to resolving - format packing, supporting arbitrary color +// formats, from sub-dword ones such as k_8 in 58410B86, to 128-bit ones, with +// endian swap similar to how it's performed in resolves (up to 128-bit); +// specifying the number format, swapping red and blue channels - though with no +// exponent biasing. Unlike resolving, however, instead of writing to tiled +// textures, it exports the data to up to 5 elements (the eM# shader registers, +// each corresponding to `base address + element size * (offset + 0...4)`) in a +// stream defined by a stream constant and an offset in elements written to eA - +// a shader, however, can write to multiple streams with different or the same +// stream constants, by performing `alloc export` multiple times. It's used +// mostly in vertex shaders (most commonly in improvised "compute shaders" done +// by executing a vertex shader for a number of point-type primitives covering +// nothing), though usage in pixel shaders is also possible - an example is +// provided in the "Advanced Screenspace Antialiasing" presentation by Arne +// Schober. 
+// https://ubm-twvideo01.s3.amazonaws.com/o1/vault/gdceurope2010/slides/A_Schober_Advanced_Screenspace_Antialiasing.pdf +// +// Unlike fetch constants, which are passed via special registers, a memory +// export stream is configured by writing the stream constant and the offset to +// a shader export register (eA) allocated by the shader - similar to more +// conventional exports like oPos, o#, oC#. Therefore, in general, it's not +// possible to know what its value will be without running the shader. For +// emulation, this means that the memory range referenced by an export - that +// needs to be validated - requires running the shader on the CPU in general. +// Thankfully, however, the usual way of setting up eA is by executing: +// `mad eA, r#, const0100, c#` +// where c# is the stream float4 constant from the float constant registers, and +// const0100 is a literal (0.0f, 1.0f, 0.0f, 0.0f) constant, also from the float +// constant registers, used for placing the element index (r#) in the correct +// component of eA. This allows for easy gathering of memexport stream +// constants, which contain both the base address and the size of the +// destination buffer for bounds checking, from the shader code and the float +// constant registers, as long as the guest uses this instruction pattern to +// write to eA. +// +// The Xenos doesn't have an integer ALU, and denormals are treated as zero and +// are flushed. However, eA contains integers and bit fields. A stream constant +// is thus structured in a way that allows for packing integers in normalized +// floating-point numbers. +// +// X contains the base address of the stream in dwords as integer bits in the +// lower 30 bits, and bits 0b01 in the top. The 0b01 bits make the exponent +// nonzero, so the number is considered normalized, and therefore isn't flushed +// to zero. 
With only 512 MB of physical memory on the Xbox 360, the +// exponent can't become 0b11111111, so X also won't be NaN for any valid Xbox +// 360 physical address (in general, the GPU supports 32-bit addresses, but +// this was originally an Xbox 360-specific feature - though one that was +// likely later reused for GL_QCOM_writeonly_rendering). +// +// TODO(Triang3l): Verify whether GL_QCOM_writeonly_rendering is actually +// memexport on the Adreno 2xx using GL_OES_get_program_binary - it's also +// interesting to see how alphatest interacts with it, whether it's still true +// fixed-function alphatest, as it's claimed to be supported as usual by the +// extension specification - it's likely, however, that memory exports are +// discarded alongside other exports such as oC# and oDepth this way. +// +// Y of eA contains the offset in elements - this is what shaders are supposed +// to calculate from something like the vertex index. Again, it's specified as +// an integer in the low bits, not as a true floating-point number. For this +// purpose, stream constants contain the value 2^23 - when a whole +// floating-point number smaller than 2^23 is added as floating-point to 2^23, +// its integer representation becomes the mantissa bits of a number with an +// exponent of 23. Via multiply-add, `offset * 1.0f + exp2f(23)` is written here +// by the shader, allowing for element offsets of up to 2^23 - 1. +// +// Z is a bit field with the information about the formatting of the data. It's +// also packed as a normalized floating-point number, but in a cleaner way than +// X because not as many bits are required - just like Y, it has an exponent of +// 23 (possibly to let shaders build these values manually using floating-point +// multiply-add like integer shift-or, and finally to add 2^23, though that's +// not an easy case to handle in emulation, unlike prebuilt stream constants). +// +// W contains the number of elements in the stream. 
It's also packed with the +// full 23 exponent just like Y and Z - there's no way to index more than 2^23 +// elements using packing via addition to 2^23, so this field also doesn't need +// more bits than that. +// +// Examples of setup in titles (Z from MSB to LSB): +// +// 4D5307E6 particles (different VS invocation counts, like 1, 2, 4): +// There is a passthrough shader - useful for verification as it simply writes +// directly what it reads via vfetch of various formats. Another shader (with +// different c# numbers, but same formats) does complicated math to process the +// particles. +// c152: Z = 010010110000|0|111|00|100110|00000|010, count = 35840 +// 8in32, 32_32_32_32_FLOAT, float, RGBA - from 32_32_32_32_FLOAT vfetch +// c154, 162: Z = 010010110000|0|111|00|100000|00000|001, count = 71680 +// 8in16, 16_16_16_16_FLOAT, float, RGBA - from 16_16_16_16_FLOAT vfetch +// c156, 158, 160: Z = 010010110000|0|000|00|011010|00000|001, count = 71680 +// 8in16, 16_16_16_16, unorm, RGBA - from 16_16_16_16 unorm vfetch +// c164: Z = 010010110000|0|111|00|011111|00000|001, count = 143360 +// 8in16, 16_16_FLOAT, float, RGBA - from 16_16_FLOAT vfetch +// c166: Z = 010010110000|0|000|00|011001|00000|001, count = 143360 +// 8in16, 16_16, unorm, RGBA - from 16_16 unorm vfetch +// c168: Z = 010010110000|0|001|00|000111|00000|010, count = 143360 +// 8in32, 2_10_10_10, snorm, RGBA - from 2_10_10_10 snorm vfetch +// c170, c172: Z = 010010110000|1|000|00|000110|00000|010, count = 143360 +// 8in32, 8_8_8_8, unorm, BGRA - from 8_8_8_8 unorm vfetch with .zyxw swizzle +// +// 4D5307E6 water simulation (2048 VS invocations): +// c130: Z = 010010110000|0|111|00|100110|00000|010, count = 16384 +// 8in32, 32_32_32_32_FLOAT, float, RGBA +// The shader has 5 memexports of this kind and 6 32_32_32_32_FLOAT vfetches. 
+// +// 4D5307E6 water tessellation factors (1 VS invocation per triangle patch): +// c130: Z = 010010110000|0|111|11|100100|11111|010, count = patch count * 3 +// 8in32, 32_FLOAT, float, RGBA +// +// 41560817 texture memory copying (64 bytes per invocation, two eA, eight eM#): +// c0: Z = 010010110000|0|010|11|011010|00011|001 +// 8in16, 16_16_16_16, uint, RGBA - from 16_16_16_16 uint vfetch +// (16_16_16_16 is the largest color format without special values) union alignas(uint32_t) xe_gpu_memexport_stream_t { struct { uint32_t base_address : 30; // +0 dword_0 physical address >> 2 @@ -1191,13 +1297,13 @@ union alignas(uint32_t) xe_gpu_memexport_stream_t { uint32_t const_0x4b000000; // +0 dword_1 - Endian128 endianness : 3; // +0 dword_2 - uint32_t unused_0 : 5; // +3 - ColorFormat format : 6; // +8 - uint32_t unused_1 : 2; // +14 - SurfaceNumFormat num_format : 3; // +16 - uint32_t red_blue_swap : 1; // +19 - uint32_t const_0x4b0 : 12; // +20 + Endian128 endianness : 3; // +0 dword_2 + uint32_t unused_0 : 5; // +3 + ColorFormat format : 6; // +8 + uint32_t unused_1 : 2; // +14 + SurfaceNumberFormat num_format : 3; // +16 + uint32_t red_blue_swap : 1; // +19 + uint32_t const_0x4b0 : 12; // +20 uint32_t index_count : 23; // +0 dword_3 uint32_t const_0x96 : 9; // +23