[GPU] Document memexport/resolve formats with more details

This commit is contained in:
Triang3l 2021-10-22 20:00:41 +03:00
parent d6660ac391
commit 28fec845d5
4 changed files with 141 additions and 26 deletions

View File

@ -876,8 +876,9 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
}
info_out.address.copy_sample_select = sample_select;
// Get the format to pass to the shader in a unified way - for depth (for
// which Direct3D 9 specifies the k_8_8_8_8 destination format), make sure the
// shader won't try to do conversion - pass proper k_24_8 or k_24_8_FLOAT.
// which Direct3D 9 specifies the k_8_8_8_8 uint destination format), make
// sure the shader won't try to do conversion - pass proper k_24_8 or
// k_24_8_FLOAT.
auto rb_copy_dest_info = regs.Get<reg::RB_COPY_DEST_INFO>();
xenos::TextureFormat dest_format;
auto rb_depth_info = regs.Get<reg::RB_DEPTH_INFO>();

View File

@ -15,6 +15,14 @@ namespace xe {
namespace gpu {
using namespace ucode;
// TODO(Triang3l): Support sub-dword memexports (like k_8 in 58410B86). This
// would require four 128 MB R8_UINT UAVs due to the Nvidia addressing limit.
// Need to be careful with resource binding tiers, however. Resource binding
// tier 1 on feature level 11_0 allows only 8 UAVs _across all stages_.
// RWByteAddressBuffer + 4 typed buffers is 5 per stage already, would need 10
// for both VS and PS, or even 11 with the eDRAM ROV. Need to drop draw commands
// doing memexport in both VS and PS on FL 11_0 resource binding tier 1.
void DxbcShaderTranslator::ExportToMemory_PackFixed32(
const uint32_t* eM_temps, uint32_t eM_count, const uint32_t bits[4],
const dxbc::Src& is_integer, const dxbc::Src& is_signed) {

View File

@ -712,14 +712,14 @@ static_assert_size(RB_COPY_CONTROL, sizeof(uint32_t));
union alignas(uint32_t) RB_COPY_DEST_INFO {
struct {
xenos::Endian128 copy_dest_endian : 3; // +0
uint32_t copy_dest_array : 1; // +3
uint32_t copy_dest_slice : 3; // +4
xenos::ColorFormat copy_dest_format : 6; // +7
uint32_t copy_dest_number : 3; // +13
int32_t copy_dest_exp_bias : 6; // +16
uint32_t : 2; // +22
uint32_t copy_dest_swap : 1; // +24
xenos::Endian128 copy_dest_endian : 3; // +0
uint32_t copy_dest_array : 1; // +3
uint32_t copy_dest_slice : 3; // +4
xenos::ColorFormat copy_dest_format : 6; // +7
xenos::SurfaceNumberFormat copy_dest_number : 3; // +13
int32_t copy_dest_exp_bias : 6; // +16
uint32_t : 2; // +22
uint32_t copy_dest_swap : 1; // +24
};
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_RB_COPY_DEST_INFO;

View File

@ -185,7 +185,7 @@ enum class IndexFormat : uint32_t {
};
// SurfaceNumberX from yamato_enum.h.
enum class SurfaceNumFormat : uint32_t {
enum class SurfaceNumberFormat : uint32_t {
kUnsignedRepeatingFraction = 0,
// Microsoft-style, scale factor (2^(n-1))-1.
kSignedRepeatingFraction = 1,
@ -1176,14 +1176,120 @@ union alignas(uint32_t) xe_gpu_fetch_group_t {
};
static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6);
// GPU_MEMEXPORT_STREAM_CONSTANT from a game .pdb - float constant for memexport
// stream configuration.
// This is used with the floating-point ALU in shaders (written to eA using
// mad), so the dwords have a normalized exponent when reinterpreted as floats
// (otherwise they would be flushed to zero), but actually these are packed
// integers. dword_1 specifically is 2^23 because
// powf(2.0f, 23.0f) + float(i) == 0x4B000000 | i
// so mad can pack indices as integers in the lower bits.
// Shader memory export (memexport) allows for writing of arbitrary formatted
// data with random access / scatter capabilities. It provides functionality
// largely similar to resolving - format packing, supporting arbitrary color
// formats, from sub-dword ones such as k_8 in 58410B86, to 128-bit ones, with
// endian swap similar to how it's performed in resolves (up to 128-bit);
// specifying the number format, swapping red and blue channels - though with no
// exponent biasing. Unlike resolving, however, instead of writing to tiled
// textures, it exports the data to up to 5 elements (the eM# shader registers,
// each corresponding to `base address + element size * (offset + 0...4)`) in a
// stream defined by a stream constant and an offset in elements written to eA -
// a shader, however, can write to multiple streams with different or the same
// stream constants, by performing `alloc export` multiple times. It's used
// mostly in vertex shaders (most commonly in improvised "compute shaders" done
// by executing a vertex shader for a number of point-type primitives covering
// nothing), though usage in pixel shaders is also possible - an example is
// provided in the "Advanced Screenspace Antialiasing" presentation by Arne
// Schober.
// https://ubm-twvideo01.s3.amazonaws.com/o1/vault/gdceurope2010/slides/A_Schober_Advanced_Screenspace_Antialiasing.pdf
//
// Unlike fetch constants, which are passed via special registers, a memory
// export stream is configured by writing the stream constant and the offset to
// a shader export register (eA) allocated by the shader - similar to more
// conventional exports like oPos, o#, oC#. Therefore, in general, it's not
// possible to know what its value will be without running the shader. For
// emulation, this means that determining the memory range referenced by an
// export - which needs to be validated - requires running the shader on the CPU
// in general.
// Thankfully, however, the usual way of setting up eA is by executing:
// `mad eA, r#, const0100, c#`
// where c# is the stream float4 constant from the float constant registers, and
// const0100 is a literal (0.0f, 1.0f, 0.0f, 0.0f) constant, also from the float
// constant registers, used for placing the element index (r#) in the correct
// component of eA. This allows for easy gathering of memexport stream
// constants, which contain both the base address and the size of the
// destination buffer for bounds checking, from the shader code and the float
// constant registers, as long as the guest uses this instruction pattern to
// write to eA.
//
// The Xenos doesn't have an integer ALU, and denormals are treated as zero and
// are flushed. However, eA contains integers and bit fields. A stream constant
// is thus structured in a way that allows for packing integers in normalized
// floating-point numbers.
//
// X contains the base address of the stream in dwords as an integer in the
// lower 30 bits, and the bits 0b01 at the top. The 0b01 bits make the exponent
// nonzero, so the number is considered normalized, and therefore isn't flushed
// to zero. With only 512 MB of physical memory on the Xbox 360, the exponent
// can't become 0b11111111, so X also won't be NaN for any valid Xbox 360
// physical address. In general, the GPU supports 32-bit addresses, but
// memexport was originally an Xbox 360-specific feature - though it was likely
// reused later for GL_QCOM_writeonly_rendering.
//
// TODO(Triang3l): Verify whether GL_QCOM_writeonly_rendering is actually
// memexport on the Adreno 2xx using GL_OES_get_program_binary - it's also
// interesting to see how alphatest interacts with it, whether it's still true
// fixed-function alphatest, as it's claimed to be supported as usual by the
// extension specification - it's likely, however, that memory exports are
// discarded alongside other exports such as oC# and oDepth this way.
//
// Y of eA contains the offset in elements - this is what shaders are supposed
// to calculate from something like the vertex index. Again, it's specified as
// an integer in the low bits, not as a truly floating-point number. For this
// purpose, stream constants contain the value 2^23 - when a whole
// floating-point number smaller than 2^23 is added as floating-point to 2^23,
// its integer representation becomes the mantissa bits of a number with an
// exponent of 23. Via multiply-add, `offset * 1.0f + exp2f(23)` is written here
// by the shader, allowing for element offsets of up to 2^23 - 1.
//
// Z is a bit field with the information about the formatting of the data. It's
// also packed as a normalized floating-point number, but in a cleaner way than
// X because not as many bits are required - just like Y, it has an exponent of
// 23 (possibly to let shaders build these values manually using floating-point
// multiply-add like integer shift-or, and finally to add 2^23, though that case
// isn't easy to handle in emulation, unlike prebuilt stream constants).
//
// W contains the number of elements in the stream. It's also packed with an
// exponent of 23, just like Y and Z - there's no way to index more than 2^23
// elements using packing via addition to 2^23, so this field also doesn't need
// more bits than that.
//
// Examples of setup in titles (Z from MSB to LSB):
//
// 4D5307E6 particles (different VS invocation counts, like 1, 2, 4):
// There is a passthrough shader - useful for verification as it simply writes
// directly what it reads via vfetch of various formats. Another shader (with
// different c# numbers, but same formats) does complicated math to process the
// particles.
// c152: Z = 010010110000|0|111|00|100110|00000|010, count = 35840
// 8in32, 32_32_32_32_FLOAT, float, RGBA - from 32_32_32_32_FLOAT vfetch
// c154, c162: Z = 010010110000|0|111|00|100000|00000|001, count = 71680
// 8in16, 16_16_16_16_FLOAT, float, RGBA - from 16_16_16_16_FLOAT vfetch
// c156, c158, c160: Z = 010010110000|0|000|00|011010|00000|001, count = 71680
// 8in16, 16_16_16_16, unorm, RGBA - from 16_16_16_16 unorm vfetch
// c164: Z = 010010110000|0|111|00|011111|00000|001, count = 143360
// 8in16, 16_16_FLOAT, float, RGBA - from 16_16_FLOAT vfetch
// c166: Z = 010010110000|0|000|00|011001|00000|001, count = 143360
// 8in16, 16_16, unorm, RGBA - from 16_16 unorm vfetch
// c168: Z = 010010110000|0|001|00|000111|00000|010, count = 143360
// 8in32, 2_10_10_10, snorm, RGBA - from 2_10_10_10 snorm vfetch
// c170, c172: Z = 010010110000|1|000|00|000110|00000|010, count = 143360
// 8in32, 8_8_8_8, unorm, BGRA - from 8_8_8_8 unorm vfetch with .zyxw swizzle
//
// 4D5307E6 water simulation (2048 VS invocations):
// c130: Z = 010010110000|0|111|00|100110|00000|010, count = 16384
// 8in32, 32_32_32_32_FLOAT, float, RGBA
// The shader has 5 memexports of this kind and 6 32_32_32_32_FLOAT vfetches.
//
// 4D5307E6 water tessellation factors (1 VS invocation per triangle patch):
// c130: Z = 010010110000|0|111|11|100100|11111|010, count = patch count * 3
// 8in32, 32_FLOAT, float, RGBA
//
// 41560817 texture memory copying (64 bytes per invocation, two eA, eight eM#):
// c0: Z = 010010110000|0|010|11|011010|00011|001
// 8in16, 16_16_16_16, uint, RGBA - from 16_16_16_16 uint vfetch
// (16_16_16_16 is the largest color format without special values)
union alignas(uint32_t) xe_gpu_memexport_stream_t {
struct {
uint32_t base_address : 30; // +0 dword_0 physical address >> 2
@ -1191,13 +1297,13 @@ union alignas(uint32_t) xe_gpu_memexport_stream_t {
uint32_t const_0x4b000000; // +0 dword_1
Endian128 endianness : 3; // +0 dword_2
uint32_t unused_0 : 5; // +3
ColorFormat format : 6; // +8
uint32_t unused_1 : 2; // +14
SurfaceNumFormat num_format : 3; // +16
uint32_t red_blue_swap : 1; // +19
uint32_t const_0x4b0 : 12; // +20
Endian128 endianness : 3; // +0 dword_2
uint32_t unused_0 : 5; // +3
ColorFormat format : 6; // +8
uint32_t unused_1 : 2; // +14
SurfaceNumberFormat num_format : 3; // +16
uint32_t red_blue_swap : 1; // +19
uint32_t const_0x4b0 : 12; // +20
uint32_t index_count : 23; // +0 dword_3
uint32_t const_0x96 : 9; // +23