From 6c48f209a1f0be867f969dc1864a10f8c76b373b Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sun, 6 Jan 2019 15:10:55 +0300 Subject: [PATCH] [D3D12] Fix 7e3 and 20e4 denormal conversion --- .../dxbc/edram_load_depth_float_cs.cso | Bin 3296 -> 3336 bytes .../shaders/dxbc/edram_load_depth_float_cs.h | 149 ++++---- .../dxbc/edram_load_depth_float_cs.txt | 3 +- .../shaders/dxbc/edram_store_color_7e3_cs.cso | Bin 3396 -> 3476 bytes .../shaders/dxbc/edram_store_color_7e3_cs.h | 339 +++++++++--------- .../shaders/dxbc/edram_store_color_7e3_cs.txt | 4 +- .../dxbc/edram_store_depth_float_cs.cso | Bin 2456 -> 2496 bytes .../shaders/dxbc/edram_store_depth_float_cs.h | 193 +++++----- .../dxbc/edram_store_depth_float_cs.txt | 3 +- .../gpu/d3d12/shaders/pixel_formats.hlsli | 6 +- src/xenia/gpu/dxbc_shader_translator_om.cc | 44 ++- 11 files changed, 399 insertions(+), 342 deletions(-) diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.cso b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.cso index b0e0e19faf14e4e75ac009365868bfb9749f546b..b5e6668556689006a06e992df7e2c61140fbbd83 100644 GIT binary patch delta 122 zcmaDL*&$`<65-@5q`_s|>Cs=d;;Ei&%It(cj0_A69J~w+tU%fXh%=ZN7%G6c1Bi1r z8b&j5-r;0m2n5QPZ=TAum`yN*fr0B2g8%~?P_Y{yke(dKCBE5$Q-qn(d9p8$IJ+QF P0%+=FMJDOVZ9Gu`dr25C delta 108 zcmeB>dLU`&65-@LvGvKKnX?~=1*bnuX;GiHAULcCBn)1fbx;g`qi$EtYRA^mdrj;!N|bCFoluJ+{doctQXOKOW=& delta 108 zcmbOteMHL8CBn&BCMRx+rK|7f_h}X5 delta 101 zcmX>gJVV&XCBn&B^55ery2WqaeamfXkvn%%jFEwXVFo7y11pd=0pbiM28IeC?f~LH y92> - ((125u).xxx - (rgba_f32u32.rgb >> 23u)); + min((125u).xxx - (rgba_f32u32.rgb >> 23u), 24u); uint3 rgb_f10u32 = (rgba_f32u32.rgb < 0x3E800000u) ? denormalized : (rgba_f32u32.rgb + 0xC2000000u); @@ -78,8 +78,8 @@ uint4 XeFloat32To20e4(uint4 f32u32) { // Keep only positive (high bit set means negative for both float and int) and // saturate to the maximum representable value near 2 (also dropping NaNs). f32u32 = min((f32u32 <= 0x7FFFFFFFu) ? f32u32 : (0u).xxxx, 0x3FFFFFF8u); - uint4 denormalized = - ((f32u32 & 0x7FFFFFu) | 0x800000u) >> ((113u).xxxx - (f32u32 >> 23u)); + uint4 denormalized = ((f32u32 & 0x7FFFFFu) | 0x800000u) >> + min((113u).xxxx - (f32u32 >> 23u), 24u); uint4 f24u32 = (f32u32 < 0x38800000u) ? denormalized : (f32u32 + 0xC8000000u); return ((f24u32 + 3u + ((f24u32 >> 3u) & 1u)) >> 3u) & 0xFFFFFFu; } diff --git a/src/xenia/gpu/dxbc_shader_translator_om.cc b/src/xenia/gpu/dxbc_shader_translator_om.cc index c30092f5a..13217f86c 100644 --- a/src/xenia/gpu/dxbc_shader_translator_om.cc +++ b/src/xenia/gpu/dxbc_shader_translator_om.cc @@ -522,7 +522,27 @@ void DxbcShaderTranslator::CompletePixelShader_DepthTo24Bit( ++stat_.instruction_count; ++stat_.int_instruction_count; - // t1 = ((f32 & 0x7FFFFF) | 0x800000) >> (113 - (f32 >> 23)) + // Don't allow the shift to overflow, since in DXBC the lower 5 bits of the + // shift amount are used (otherwise 0 becomes 8). + // t2 = min(113 - (f32 >> 23), 24) + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_UMIN) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); + shader_code_.push_back(temp2); + shader_code_.push_back( + EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(temp2); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); + shader_code_.push_back(24); + shader_code_.push_back(24); + shader_code_.push_back(24); + shader_code_.push_back(24); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // t1 = ((f32 & 0x7FFFFF) | 0x800000) >> min(113 - (f32 >> 23), 24) shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_USHR) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); shader_code_.push_back( @@ -3789,7 +3809,27 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV_PackColor( ++stat_.instruction_count; ++stat_.int_instruction_count; - // t1 = ((f32 & 0x7FFFFF) | 0x800000) >> (125 - (f32 >> 23)) + // Don't allow the shift to overflow, since in DXBC the lower 5 bits of the + // shift amount are used. + // t2 = min(125 - (f32 >> 23), 24) + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_UMIN) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0111, 1)); + shader_code_.push_back(f10_temp2); + shader_code_.push_back( + EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(f10_temp2); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); + shader_code_.push_back(24); + shader_code_.push_back(24); + shader_code_.push_back(24); + shader_code_.push_back(24); + ++stat_.instruction_count; + ++stat_.uint_instruction_count; + + // t1 = ((f32 & 0x7FFFFF) | 0x800000) >> min(125 - (f32 >> 23), 24) shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_USHR) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); shader_code_.push_back(