From 05f6b444a01eb701b6c0c8ff40b836e20e06fee5 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sun, 29 Mar 2020 19:09:58 +0300 Subject: [PATCH] [DXBC] Move memexport to the new DXBC code and make it more straightforward --- src/xenia/gpu/dxbc_shader_translator.h | 22 + .../gpu/dxbc_shader_translator_memexport.cc | 1651 ++++------------- 2 files changed, 377 insertions(+), 1296 deletions(-) diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index f0191a401..e6c1a3e9d 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -950,6 +950,7 @@ class DxbcShaderTranslator : public ShaderTranslator { kBFI = 140, kLdUAVTyped = 163, kStoreUAVTyped = 164, + kStoreRaw = 166, kEvalSampleIndex = 204, }; @@ -1386,6 +1387,23 @@ class DxbcShaderTranslator : public ShaderTranslator { ++stat_.instruction_count; ++stat_.c_texture_store_instructions; } + void DxbcOpStoreRaw(const DxbcDest& dest, const DxbcSrc& byte_offset, + const DxbcSrc& value) { + uint32_t dest_write_mask = dest.GetMask(); + assert_true(dest_write_mask == 0b0001 || dest_write_mask == 0b0011 || + dest_write_mask == 0b0111 || dest_write_mask == 0b1111); + uint32_t operands_length = dest.GetLength() + + byte_offset.GetLength(0b0000) + + value.GetLength(dest_write_mask); + shader_code_.reserve(shader_code_.size() + 1 + operands_length); + shader_code_.push_back( + DxbcOpcodeToken(DxbcOpcode::kStoreRaw, operands_length)); + dest.Write(shader_code_); + byte_offset.Write(shader_code_, true, 0b0000); + value.Write(shader_code_, true, dest_write_mask); + ++stat_.instruction_count; + ++stat_.c_texture_store_instructions; + } void DxbcOpEvalSampleIndex(const DxbcDest& dest, const DxbcSrc& value, const DxbcSrc& sample_index) { uint32_t dest_write_mask = dest.GetMask(); @@ -1716,6 +1734,10 @@ class DxbcShaderTranslator : public ShaderTranslator { // Writing the epilogue. // ExportToMemory modifies the values of eA/eM# for simplicity, don't call // multiple times. + void ExportToMemory_PackFixed32(const uint32_t* eM_temps, uint32_t eM_count, + const uint32_t bits[4], + const DxbcSrc& is_integer, + const DxbcSrc& is_signed); void ExportToMemory(); void CompleteVertexOrDomainShader(); // Discards the SSAA sample if it fails alpha to coverage. diff --git a/src/xenia/gpu/dxbc_shader_translator_memexport.cc b/src/xenia/gpu/dxbc_shader_translator_memexport.cc index bd902ed5b..c1f7ff6c6 100644 --- a/src/xenia/gpu/dxbc_shader_translator_memexport.cc +++ b/src/xenia/gpu/dxbc_shader_translator_memexport.cc @@ -9,40 +9,85 @@ #include "xenia/gpu/dxbc_shader_translator.h" -#include - -#include "third_party/dxbc/d3d12TokenizedProgramFormat.hpp" - namespace xe { namespace gpu { using namespace ucode; -void DxbcShaderTranslator::ExportToMemory() { - static const float k32bppMaxValuesSigned[4][4] = { - {127.0f, 127.0f, 127.0f, 127.0f}, - {511.0f, 511.0f, 511.0f, 1.0f}, - {1023.0f, 1023.0f, 511.0f, 0.0f}, - {511.0f, 1023.0f, 1023.0f, 0.0f}, - }; - static const float k32bppMaxValuesUnsigned[4][4] = { - {255.0f, 255.0f, 255.0f, 255.0f}, - {1023.0f, 1023.0f, 1023.0f, 3.0f}, - {2047.0f, 2047.0f, 1023.0f, 0.0f}, - {1023.0f, 2047.0f, 2047.0f, 0.0f}, - }; - static const uint32_t k32bppMasks[4][4] = { - {255, 255, 255, 255}, - {1023, 1023, 1023, 3}, - {2047, 2047, 1023, 0}, - {1023, 2047, 2047, 0}, - }; - static const uint32_t k32bppShifts[4][3] = { - {8, 16, 24}, - {10, 20, 30}, - {11, 22, 0}, - {10, 21, 0}, - }; +void DxbcShaderTranslator::ExportToMemory_PackFixed32( + const uint32_t* eM_temps, uint32_t eM_count, const uint32_t bits[4], + const DxbcSrc& is_integer, const DxbcSrc& is_signed) { + // Will insert with BFI - sign extension of red will be overwritten, not + // truncated. + assert_not_zero(bits[0]); + assert_true(bits[0] + bits[1] + bits[2] + bits[3] == 32); + uint32_t mask = 0; + for (uint32_t i = 0; i < 4; ++i) { + if (bits[i]) { + mask |= 1 << i; + } + } + DxbcOpIf(true, is_signed); + { + float range[4]; + for (uint32_t i = 0; i < 4; ++i) { + range[i] = bits[i] ? float((uint32_t(1) << (bits[i] - 1)) - 1) : 0.0f; + } + DxbcSrc range_src(DxbcSrc::LP(range)); + DxbcOpIf(false, is_integer); + for (uint32_t i = 0; i < eM_count; ++i) { + uint32_t eM_temp = eM_temps[i]; + DxbcOpMul(DxbcDest::R(eM_temp, mask), DxbcSrc::R(eM_temp), range_src); + } + DxbcOpEndIf(); + for (uint32_t i = 0; i < eM_count; ++i) { + DxbcDest eM_dest(DxbcDest::R(eM_temps[i], mask)); + DxbcSrc eM_src(DxbcSrc::R(eM_temps[i])); + DxbcOpMax(eM_dest, eM_src, -range_src); + DxbcOpMin(eM_dest, eM_src, range_src); + } + } + DxbcOpElse(); + { + float range[4]; + for (uint32_t i = 0; i < 4; ++i) { + range[i] = float((uint32_t(1) << bits[i]) - 1); + } + DxbcSrc range_src(DxbcSrc::LP(range)); + DxbcOpIf(false, is_integer); + for (uint32_t i = 0; i < eM_count; ++i) { + uint32_t eM_temp = eM_temps[i]; + DxbcOpMul(DxbcDest::R(eM_temp, mask), DxbcSrc::R(eM_temp), range_src); + } + DxbcOpEndIf(); + for (uint32_t i = 0; i < eM_count; ++i) { + DxbcDest eM_dest(DxbcDest::R(eM_temps[i], mask)); + DxbcSrc eM_src(DxbcSrc::R(eM_temps[i])); + DxbcOpMax(eM_dest, eM_src, DxbcSrc::LF(0.0f)); + DxbcOpMin(eM_dest, eM_src, range_src); + } + } + DxbcOpEndIf(); + for (uint32_t i = 0; i < eM_count; ++i) { + uint32_t eM_temp = eM_temps[i]; + // Round to the nearest integer, according to the rules of handling integer + // formats in Direct3D. + DxbcOpRoundNE(DxbcDest::R(eM_temp, mask), DxbcSrc::R(eM_temp)); + DxbcOpFToI(DxbcDest::R(eM_temp, mask), DxbcSrc::R(eM_temp)); + DxbcDest eM_packed_dest(DxbcDest::R(eM_temp, 0b0001)); + DxbcSrc eM_packed_src(DxbcSrc::R(eM_temp, DxbcSrc::kXXXX)); + uint32_t offset = bits[0]; + for (uint32_t j = 1; j < 4; ++j) { + if (!bits[j]) { + continue; + } + DxbcOpBFI(eM_packed_dest, DxbcSrc::LU(bits[j]), DxbcSrc::LU(offset), + DxbcSrc::R(eM_temp).Select(j), eM_packed_src); + offset += bits[j]; + } + } +} +void DxbcShaderTranslator::ExportToMemory() { if (system_temp_memexport_written_ == UINT32_MAX) { // No exports in the shader. return; @@ -59,22 +104,12 @@ void DxbcShaderTranslator::ExportToMemory() { // Safety check if the shared memory is bound as UAV. system_constants_used_ |= 1ull << kSysConst_Flags_Index; - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back(EncodeVectorSelectOperand( - D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, kSysConst_Flags_Comp, 3)); - shader_code_.push_back(cbuffer_index_system_constants_); - shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants)); - shader_code_.push_back(kSysConst_Flags_Vec); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(kSysFlag_SharedMemoryIsUAV); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - + DxbcOpAnd(DxbcDest::R(control_temp, 0b0001), + DxbcSrc::CB(cbuffer_index_system_constants_, + uint32_t(CbufferRegister::kSystemConstants), + kSysConst_Flags_Vec) + .Select(kSysConst_Flags_Comp), + DxbcSrc::LU(kSysFlag_SharedMemoryIsUAV)); if (IsDxbcPixelShader()) { // Disable memexport in pixel shaders with supersampling since VPOS is // ambiguous. @@ -87,55 +122,23 @@ void DxbcShaderTranslator::ExportToMemory() { kSysConst_EDRAMResolutionSquareScale_Vec) .Select(kSysConst_EDRAMResolutionSquareScale_Comp), DxbcSrc::LU(2)); - - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(control_temp); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; + DxbcOpAnd(DxbcDest::R(control_temp, 0b0001), + DxbcSrc::R(control_temp, DxbcSrc::kXXXX), + DxbcSrc::R(control_temp, DxbcSrc::kYYYY)); } else { - system_constants_used_ |= 1ull << kSysConst_SampleCountLog2_Index; - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(11)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(control_temp); // Enough to check just Y because it's scaled for both 2x and 4x. - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, - kSysConst_SampleCountLog2_Comp + 1, 3)); - shader_code_.push_back(cbuffer_index_system_constants_); - shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants)); - shader_code_.push_back(kSysConst_SampleCountLog2_Vec); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(0); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(control_temp); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; + system_constants_used_ |= 1ull << kSysConst_SampleCountLog2_Index; + DxbcOpMovC(DxbcDest::R(control_temp, 0b0001), + DxbcSrc::CB(cbuffer_index_system_constants_, + uint32_t(CbufferRegister::kSystemConstants), + kSysConst_SampleCountLog2_Vec) + .Select(kSysConst_SampleCountLog2_Comp + 1), + DxbcSrc::LU(0), DxbcSrc::R(control_temp, DxbcSrc::kXXXX)); } } - // Check if memexport can be done. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IF) | - ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN( - D3D10_SB_INSTRUCTION_TEST_NONZERO) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(control_temp); - ++stat_.instruction_count; - ++stat_.dynamic_flow_control_count; + DxbcOpIf(true, DxbcSrc::R(control_temp, DxbcSrc::kXXXX)); + // control_temp.x is now free. for (uint32_t i = 0; i < kMaxMemExports; ++i) { uint32_t eA_temp = system_temps_memexport_address_[i]; @@ -160,1241 +163,297 @@ void DxbcShaderTranslator::ExportToMemory() { continue; } - // Extract format info to control_temp. - // X - color format, Y - is signed, Z - fractional/integer, - // W - red/blue swap. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_UBFE) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(15)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(6); - shader_code_.push_back(1); - shader_code_.push_back(1); - shader_code_.push_back(1); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(8); - shader_code_.push_back(16); - shader_code_.push_back(17); - shader_code_.push_back(19); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 2, 1)); - shader_code_.push_back(eA_temp); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - // Swap red and blue if needed. + DxbcOpAnd(DxbcDest::R(control_temp, 0b0001), + DxbcSrc::R(eA_temp, DxbcSrc::kZZZZ), + DxbcSrc::LU(uint32_t(1) << 19)); for (uint32_t j = 0; j < eM_count; ++j) { uint32_t eM_temp = eM_temps[j]; - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0101, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, 0b11000110, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(eM_temp); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; + DxbcOpMovC(DxbcDest::R(eM_temp, 0b0101), + DxbcSrc::R(control_temp, DxbcSrc::kXXXX), + DxbcSrc::R(eM_temp, 0b000010), DxbcSrc::R(eM_temp)); } - // Initialize element size to 4 since there are many 32-bit formats. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1000, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(4); - ++stat_.instruction_count; - ++stat_.mov_instruction_count; + // Initialize element size in control_temp.x to 4 bytes as this is the most + // common size. + DxbcDest element_size_dest(DxbcDest::R(control_temp, 0b0001)); + DxbcSrc element_size_src(DxbcSrc::R(control_temp, DxbcSrc::kXXXX)); + DxbcOpMov(element_size_dest, DxbcSrc::LU(4)); - // Allocate a register for checking if the format is equal to different - // values. - uint32_t format_check_temp = PushSystemTemp(); + // Each eM should get a packed value in the destination format now. - // Check if the format is float32 - in this case, it doesn't need any - // conversion. Compare the format to each float32 format. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IEQ) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0111, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(uint32_t(ColorFormat::k_32_FLOAT)); - shader_code_.push_back(uint32_t(ColorFormat::k_32_32_FLOAT)); - shader_code_.push_back(uint32_t(ColorFormat::k_32_32_32_32_FLOAT)); - shader_code_.push_back(0); - ++stat_.instruction_count; - ++stat_.int_instruction_count; - // Merge to check whether the format is any-dimensional float32 into X. - for (uint32_t j = 0; j < 2; ++j) { - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1 + j, 1)); - shader_code_.push_back(format_check_temp); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - } - - // If the format is float32, it doesn't need any conversion. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IF) | - ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN( - D3D10_SB_INSTRUCTION_TEST_NONZERO) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(format_check_temp); - ++stat_.instruction_count; - ++stat_.dynamic_flow_control_count; - - // Set element size to 8 for k_32_32_FLOAT or 16 for k_32_32_32_32_FLOAT. - for (uint32_t j = 0; j < 2; ++j) { - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1000, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1 + j, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(8 << j); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); - shader_code_.push_back(control_temp); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - } - - // If the format is not float32, do conversion and packing. Can reuse - // format_check_temp in the `else` case. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ELSE) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); - ++stat_.instruction_count; - - // Check if the format is float16 to convert and pack. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IEQ) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0011, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(uint32_t(ColorFormat::k_16_16_FLOAT)); - shader_code_.push_back(uint32_t(ColorFormat::k_16_16_16_16_FLOAT)); - shader_code_.push_back(0); - shader_code_.push_back(0); - ++stat_.instruction_count; - ++stat_.int_instruction_count; - // Merge to check whether the format is any-dimensional float16 into X. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(format_check_temp); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - - // If the format is float16, convert and pack. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IF) | - ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN( - D3D10_SB_INSTRUCTION_TEST_NONZERO) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(format_check_temp); - ++stat_.instruction_count; - ++stat_.dynamic_flow_control_count; - - // Set element size to 8 for k_16_16_16_16_FLOAT. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1000, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(8); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); - shader_code_.push_back(control_temp); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - // Convert to float16. - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_F32TOF16) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(eM_temp); - ++stat_.instruction_count; - ++stat_.conversion_instruction_count; - // Pack a float16 vector (in the little-endian way). - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_BFI) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(17)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0011, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(16); - shader_code_.push_back(16); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(16); - shader_code_.push_back(16); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, 0b00001101, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, 0b00001000, 1)); - shader_code_.push_back(eM_temp); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - } - - // If the format is not float16, do float->integer and packing. Can reuse - // format_check_temp in the `else` case. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ELSE) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); - ++stat_.instruction_count; - - // Check if the format is each of packed 32-bit formats, but not 16_16 (it - // will be handled separately for simplicity). - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IEQ) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(uint32_t(ColorFormat::k_8_8_8_8)); - shader_code_.push_back(uint32_t(ColorFormat::k_2_10_10_10)); - shader_code_.push_back(uint32_t(ColorFormat::k_10_11_11)); - shader_code_.push_back(uint32_t(ColorFormat::k_11_11_10)); - ++stat_.instruction_count; - ++stat_.int_instruction_count; - - // Check if the format is each of packed 32-bit formats blended as fixed16. - uint32_t format_as16_check_temp = PushSystemTemp(); - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IEQ) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(format_as16_check_temp); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(uint32_t(ColorFormat::k_8_8_8_8_AS_16_16_16_16)); - shader_code_.push_back(uint32_t(ColorFormat::k_2_10_10_10_AS_16_16_16_16)); - shader_code_.push_back(uint32_t(ColorFormat::k_10_11_11_AS_16_16_16_16)); - shader_code_.push_back(uint32_t(ColorFormat::k_11_11_10_AS_16_16_16_16)); - ++stat_.instruction_count; - ++stat_.int_instruction_count; - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(format_as16_check_temp); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - // Release format_as16_check_temp. - PopSystemTemp(); - - // Allocate a register for format bit representation parameters. - uint32_t format_param_temp = PushSystemTemp(); - - // Denormalize, clamp and convert to integer. - // A lot of the code is similar for both signed and unsigned. Start by - // checking the signedness. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IF) | - ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN( - D3D10_SB_INSTRUCTION_TEST_NONZERO) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(control_temp); - ++stat_.instruction_count; - ++stat_.dynamic_flow_control_count; - for (uint32_t j = 0; j < 2; ++j) { - if (j != 0) { - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ELSE) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); - ++stat_.instruction_count; + // Extract format properties to control_temp. + // Y - signedness if fixed-point. + // Z - fractional/integer if fixed-point. + // W - color format. + DxbcOpUBFE(DxbcDest::R(control_temp, 0b1110), DxbcSrc::LU(0, 1, 1, 6), + DxbcSrc::LU(0, 16, 17, 8), DxbcSrc::R(eA_temp, DxbcSrc::kZZZZ)); + DxbcSrc is_signed(DxbcSrc::R(control_temp, DxbcSrc::kYYYY)); + DxbcSrc is_integer(DxbcSrc::R(control_temp, DxbcSrc::kZZZZ)); + // Convert and pack the format. + DxbcOpSwitch(DxbcSrc::R(control_temp, DxbcSrc::kWWWW)); + // control_temp.w is now free. + { + // k_8_8_8_8 + // k_8_8_8_8_AS_16_16_16_16 + DxbcOpCase(DxbcSrc::LU(uint32_t(ColorFormat::k_8_8_8_8))); + DxbcOpCase(DxbcSrc::LU(uint32_t(ColorFormat::k_8_8_8_8_AS_16_16_16_16))); + { + uint32_t bits[4] = {8, 8, 8, 8}; + ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, + is_signed); } - // Write the maximum integer value to format_param_temp. Default to - // 16_16_16_16, but override if it's a different packed 32bpp format. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(format_param_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - float norm16_max = j ? 65535.0f : 32767.0f; - shader_code_.push_back(*reinterpret_cast(&norm16_max)); - shader_code_.push_back(*reinterpret_cast(&norm16_max)); - shader_code_.push_back(*reinterpret_cast(&norm16_max)); - shader_code_.push_back(*reinterpret_cast(&norm16_max)); - ++stat_.instruction_count; - ++stat_.mov_instruction_count; - for (uint32_t k = 0; k < 4; ++k) { - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(format_param_temp); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, k, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - const uint32_t* norm_max = reinterpret_cast( - j ? k32bppMaxValuesUnsigned[k] : k32bppMaxValuesSigned[k]); - shader_code_.push_back(norm_max[0]); - shader_code_.push_back(norm_max[1]); - shader_code_.push_back(norm_max[2]); - shader_code_.push_back(norm_max[3]); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(format_param_temp); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; + DxbcOpBreak(); + + // k_2_10_10_10 + // k_2_10_10_10_AS_16_16_16_16 + DxbcOpCase(DxbcSrc::LU(uint32_t(ColorFormat::k_2_10_10_10))); + DxbcOpCase( + DxbcSrc::LU(uint32_t(ColorFormat::k_2_10_10_10_AS_16_16_16_16))); + { + uint32_t bits[4] = {10, 10, 10, 2}; + ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, + is_signed); } - // If fractional, denormalize. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IF) | - ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN( - D3D10_SB_INSTRUCTION_TEST_ZERO) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 2, 1)); - shader_code_.push_back(control_temp); - ++stat_.instruction_count; - ++stat_.dynamic_flow_control_count; - for (uint32_t k = 0; k < eM_count; ++k) { - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MUL) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(eM_temps[k]); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(eM_temps[k]); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(format_param_temp); - ++stat_.instruction_count; - ++stat_.float_instruction_count; + DxbcOpBreak(); + + // k_10_11_11 + // k_10_11_11_AS_16_16_16_16 + DxbcOpCase(DxbcSrc::LU(uint32_t(ColorFormat::k_10_11_11))); + DxbcOpCase(DxbcSrc::LU(uint32_t(ColorFormat::k_10_11_11_AS_16_16_16_16))); + { + uint32_t bits[4] = {11, 11, 10}; + ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, + is_signed); } - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDIF) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); - ++stat_.instruction_count; - for (uint32_t k = 0; k < eM_count; ++k) { - uint32_t eM_temp = eM_temps[k]; - // Clamp to the minimum (-max for signed, 0 for unsigned). - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MAX) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(j ? 10 : 8)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(eM_temp); - if (j != 0) { - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - } else { - shader_code_.push_back( - EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, - kSwizzleXYZW, 1) | - ENCODE_D3D10_SB_OPERAND_EXTENDED(1)); - shader_code_.push_back(ENCODE_D3D10_SB_EXTENDED_OPERAND_MODIFIER( - D3D10_SB_OPERAND_MODIFIER_NEG)); - shader_code_.push_back(format_param_temp); + DxbcOpBreak(); + + // k_11_11_10 + // k_11_11_10_AS_16_16_16_16 + DxbcOpCase(DxbcSrc::LU(uint32_t(ColorFormat::k_11_11_10))); + DxbcOpCase(DxbcSrc::LU(uint32_t(ColorFormat::k_11_11_10_AS_16_16_16_16))); + { + uint32_t bits[4] = {10, 11, 11}; + ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, + is_signed); + } + DxbcOpBreak(); + + // k_16_16 + DxbcOpCase(DxbcSrc::LU(uint32_t(ColorFormat::k_16_16))); + { + uint32_t bits[4] = {16, 16}; + ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, + is_signed); + } + DxbcOpBreak(); + + // k_16_16_16_16 + DxbcOpCase(DxbcSrc::LU(uint32_t(ColorFormat::k_16_16_16_16))); + DxbcOpMov(element_size_dest, DxbcSrc::LU(8)); + DxbcOpIf(true, is_signed); + { + DxbcOpIf(false, is_integer); + for (uint32_t j = 0; j < eM_count; ++j) { + uint32_t eM_temp = eM_temps[j]; + DxbcOpMul(DxbcDest::R(eM_temp), DxbcSrc::R(eM_temp), + DxbcSrc::LF(32767.0f)); } - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // Clamp to the maximum. - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MIN) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(format_param_temp); - ++stat_.instruction_count; - ++stat_.float_instruction_count; + DxbcOpEndIf(); + for (uint32_t j = 0; j < eM_count; ++j) { + DxbcDest eM_dest(DxbcDest::R(eM_temps[j])); + DxbcSrc eM_src(DxbcSrc::R(eM_temps[j])); + DxbcOpMax(eM_dest, eM_src, DxbcSrc::LF(-32767.0f)); + DxbcOpMin(eM_dest, eM_src, DxbcSrc::LF(32767.0f)); + } + } + DxbcOpElse(); + { + DxbcOpIf(false, is_integer); + for (uint32_t j = 0; j < eM_count; ++j) { + uint32_t eM_temp = eM_temps[j]; + DxbcOpMul(DxbcDest::R(eM_temp), DxbcSrc::R(eM_temp), + DxbcSrc::LF(65535.0f)); + } + DxbcOpEndIf(); + for (uint32_t j = 0; j < eM_count; ++j) { + DxbcDest eM_dest(DxbcDest::R(eM_temps[j])); + DxbcSrc eM_src(DxbcSrc::R(eM_temps[j])); + DxbcOpMax(eM_dest, eM_src, DxbcSrc::LF(0.0f)); + DxbcOpMin(eM_dest, eM_src, DxbcSrc::LF(65535.0f)); + } + } + DxbcOpEndIf(); + for (uint32_t j = 0; j < eM_count; ++j) { + uint32_t eM_temp = eM_temps[j]; // Round to the nearest integer, according to the rules of handling // integer formats in Direct3D. - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ROUND_NE) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(eM_temp); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // Convert to integer. - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(j ? D3D10_SB_OPCODE_FTOU - : D3D10_SB_OPCODE_FTOI) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(eM_temp); - ++stat_.instruction_count; - ++stat_.conversion_instruction_count; + DxbcOpRoundNE(DxbcDest::R(eM_temp), DxbcSrc::R(eM_temp)); + DxbcOpFToI(DxbcDest::R(eM_temp), DxbcSrc::R(eM_temp)); + DxbcOpBFI(DxbcDest::R(eM_temp, 0b0011), DxbcSrc::LU(16), + DxbcSrc::LU(16), DxbcSrc::R(eM_temp, 0b1101), + DxbcSrc::R(eM_temp, 0b1000)); } - if (j == 0) { - // Drop sign extension bits. - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(format_param_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(65535); - shader_code_.push_back(65535); - shader_code_.push_back(65535); - shader_code_.push_back(65535); - ++stat_.instruction_count; - ++stat_.mov_instruction_count; - for (uint32_t k = 0; k < 4; ++k) { - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(format_param_temp); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, k, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - const uint32_t* mask = k32bppMasks[k]; - shader_code_.push_back(mask[0]); - shader_code_.push_back(mask[1]); - shader_code_.push_back(mask[2]); - shader_code_.push_back(mask[3]); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(format_param_temp); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - } + DxbcOpBreak(); + + // k_16_16_FLOAT + DxbcOpCase(DxbcSrc::LU(uint32_t(ColorFormat::k_16_16_FLOAT))); + for (uint32_t j = 0; j < eM_count; ++j) { + uint32_t eM_temp = eM_temps[j]; + DxbcOpF32ToF16(DxbcDest::R(eM_temp, 0b0011), DxbcSrc::R(eM_temp)); + DxbcOpBFI(DxbcDest::R(eM_temp, 0b0001), DxbcSrc::LU(16), + DxbcSrc::LU(16), DxbcSrc::R(eM_temp, DxbcSrc::kYYYY), + DxbcSrc::R(eM_temp, DxbcSrc::kXXXX)); + } + DxbcOpBreak(); + + // k_16_16_16_16_FLOAT + DxbcOpCase(DxbcSrc::LU(uint32_t(ColorFormat::k_16_16_16_16_FLOAT))); + DxbcOpMov(element_size_dest, DxbcSrc::LU(8)); + for (uint32_t j = 0; j < eM_count; ++j) { + uint32_t eM_temp = eM_temps[j]; + DxbcOpF32ToF16(DxbcDest::R(eM_temp), DxbcSrc::R(eM_temp)); + DxbcOpBFI(DxbcDest::R(eM_temp, 0b0011), DxbcSrc::LU(16), + DxbcSrc::LU(16), DxbcSrc::R(eM_temp, 0b1101), + DxbcSrc::R(eM_temp, 0b1000)); + } + DxbcOpBreak(); + + // k_32_FLOAT + // Already in the destination format, 4 bytes per element already + // selected. + + // k_32_32_FLOAT + DxbcOpCase(DxbcSrc::LU(uint32_t(ColorFormat::k_32_32_FLOAT))); + DxbcOpMov(element_size_dest, DxbcSrc::LU(8)); + // Already in the destination format. + DxbcOpBreak(); + + // k_32_32_32_32_FLOAT + DxbcOpCase(DxbcSrc::LU(uint32_t(ColorFormat::k_32_32_32_32_FLOAT))); + DxbcOpMov(element_size_dest, DxbcSrc::LU(16)); + // Already in the destination format. + DxbcOpBreak(); + } + DxbcOpEndSwitch(); + // control_temp.yz are now free. + + // Do endian swap. + { + DxbcDest endian_dest(DxbcDest::R(control_temp, 0b0010)); + DxbcSrc endian_src(DxbcSrc::R(control_temp, DxbcSrc::kYYYY)); + // Extract endianness into control_temp.y. + DxbcOpAnd(endian_dest, DxbcSrc::R(eA_temp, DxbcSrc::kZZZZ), + DxbcSrc::LU(0b111)); + + // Change 8-in-64 and 8-in-128 to 8-in-32. + for (uint32_t j = 0; j < 2; ++j) { + DxbcOpIEq( + DxbcDest::R(control_temp, 0b0100), endian_src, + DxbcSrc::LU(uint32_t(j ? Endian128::k8in128 : Endian128::k8in64))); for (uint32_t k = 0; k < eM_count; ++k) { - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(eM_temps[k]); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(eM_temps[k]); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(format_param_temp); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; + uint32_t eM_temp = eM_temps[k]; + DxbcOpMovC(DxbcDest::R(eM_temp), + DxbcSrc::R(control_temp, DxbcSrc::kZZZZ), + DxbcSrc::R(eM_temp, j ? 0b00011011 : 0b10110001), + DxbcSrc::R(eM_temp)); } - } - } - // Close the signedness check. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDIF) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); - ++stat_.instruction_count; - - // Shift each component into its location before packing. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1110, 1)); - shader_code_.push_back(format_param_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0); - shader_code_.push_back(16); - shader_code_.push_back(0); - shader_code_.push_back(16); - ++stat_.instruction_count; - ++stat_.mov_instruction_count; - for (uint32_t j = 0; j < 4; ++j) { - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1110, 1)); - shader_code_.push_back(format_param_temp); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, j, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - const uint32_t* shift = k32bppShifts[j]; - shader_code_.push_back(0); - shader_code_.push_back(shift[0]); - shader_code_.push_back(shift[1]); - shader_code_.push_back(shift[2]); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(format_param_temp); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - } - for (uint32_t j = 0; j < eM_count; ++j) { - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ISHL) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1110, 1)); - shader_code_.push_back(eM_temps[j]); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(eM_temps[j]); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(format_param_temp); - ++stat_.instruction_count; - ++stat_.int_instruction_count; - } - - // Release format_param_temp. - PopSystemTemp(); - - // Merge XZ and YW into XY - this is common for both 16_16/16_16_16_16 and - // other formats. - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0011, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, 0b00001000, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, 0b00001101, 1)); - shader_code_.push_back(eM_temp); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - } - - // Check if the format is norm16 since it needs its own packing. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IEQ) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0011, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(uint32_t(ColorFormat::k_16_16)); - shader_code_.push_back(uint32_t(ColorFormat::k_16_16_16_16)); - shader_code_.push_back(0); - shader_code_.push_back(0); - ++stat_.instruction_count; - ++stat_.int_instruction_count; - - // Set element size to 8 for k_16_16_16_16. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1000, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(8); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); - shader_code_.push_back(control_temp); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - - // Merge to check whether the format is any-dimensional norm16 into X. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(format_check_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(format_check_temp); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - - // If the format is not norm16, merge all components into X. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IF) | - ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN( - D3D10_SB_INSTRUCTION_TEST_ZERO) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(format_check_temp); - ++stat_.instruction_count; - ++stat_.dynamic_flow_control_count; - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(eM_temp); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - } - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDIF) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); - ++stat_.instruction_count; - - // Close the float16 check. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDIF) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); - ++stat_.instruction_count; - - // Close the float32 check. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDIF) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); - ++stat_.instruction_count; - - // Release format_check_temp. - PopSystemTemp(); - - // Extract endianness into control_temp.xyz. - // X set for 8-in-16, 16-in-32 and 8-in-128. - // Y set for 8-in-32 and 16-in-32. - // Z set for 8-in-64 and 8-in-128. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_UBFE) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(15)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0111, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(1); - shader_code_.push_back(1); - shader_code_.push_back(1); - shader_code_.push_back(0); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0); - shader_code_.push_back(1); - shader_code_.push_back(2); - shader_code_.push_back(0); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 2, 1)); - shader_code_.push_back(eA_temp); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - - // Check if need to do 8-in-64 or 8-in-128 swap. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IF) | - ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN( - D3D10_SB_INSTRUCTION_TEST_NONZERO) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 2, 1)); - shader_code_.push_back(control_temp); - ++stat_.instruction_count; - ++stat_.dynamic_flow_control_count; - - // Do 32-in-64 or 32-in-128 swap. - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, 0b00011011, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, 0b10110001, 1)); - shader_code_.push_back(eM_temp); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - } - - // Change 8-in-64 or 8-in-128 to 8-in-32. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0011, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0); - shader_code_.push_back(1); - shader_code_.push_back(0); - shader_code_.push_back(0); - ++stat_.instruction_count; - ++stat_.mov_instruction_count; - - // Close the 8-in-64 or 8-in-128 check. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDIF) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); - ++stat_.instruction_count; - - // 16-in-32 is used as intermediate swapping step here rather than 8-in-32. - // Thus 8-in-16 needs to be done for 8-in-16 (01) and 8-in-32 (10). - // And 16-in-32 needs to be done for 8-in-32 (10) and 16-in-32 (11). - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_XOR) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(control_temp); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - - // Allocate temporary registers for swapping. - uint32_t swap_temp1 = PushSystemTemp(); - uint32_t swap_temp2 = PushSystemTemp(); - - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - - // 8-in-16: Create the value being built in temp1. - // ushr temp1, eM, l(8, 8, 8, 8) - // eM: ABCD, temp1: BCD0 - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_USHR) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(swap_temp1); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(8); - shader_code_.push_back(8); - shader_code_.push_back(8); - shader_code_.push_back(8); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - - // 8-in-16: Insert A in Y of temp1. - // bfi temp1, l(8, 8, 8, 8), l(8, 8, 8, 8), eM, temp1 - // eM: ABCD, temp1: BAD0 - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_BFI) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(17)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(swap_temp1); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(8); - shader_code_.push_back(8); - shader_code_.push_back(8); - shader_code_.push_back(8); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(8); - shader_code_.push_back(8); - shader_code_.push_back(8); - shader_code_.push_back(8); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(swap_temp1); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - - // 8-in-16: Create the source for C insertion in temp2. - // ushr temp2, eM, l(16, 16, 16, 16) - // eM: ABCD, temp1: BAD0, temp2: CD00 - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_USHR) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(swap_temp2); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(16); - shader_code_.push_back(16); - shader_code_.push_back(16); - shader_code_.push_back(16); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - - // 8-in-16: Insert C in W of temp1. - // bfi temp1, l(8, 8, 8, 8), l(24, 24, 24, 24), temp2, temp1 - // eM: ABCD, temp1: BADC - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_BFI) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(17)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(swap_temp1); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(8); - shader_code_.push_back(8); - shader_code_.push_back(8); - shader_code_.push_back(8); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(24); - shader_code_.push_back(24); - shader_code_.push_back(24); - shader_code_.push_back(24); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(swap_temp2); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(swap_temp1); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - - // Write the 8-in-16 value to eM if needed. - // movc eM, control.xxxx, temp1, eM - // eM: ABCD/BADC - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(swap_temp1); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(eM_temp); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - - // 16-in-32: Write the low 16 bits to temp1. - // ushr temp1, eM, l(16, 16, 16, 16) - // eM: ABCD/BADC, temp1: CD00/DC00 - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_USHR) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(swap_temp1); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(16); - shader_code_.push_back(16); - shader_code_.push_back(16); - shader_code_.push_back(16); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - - // 16-in-32: Write the high 16 bits to temp1. - // bfi temp1, l(16, 16, 16, 16), l(16, 16, 16, 16), eM, temp1 - // eM: ABCD/BADC, temp1: CDAB/DCBA - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_BFI) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(17)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(swap_temp1); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(16); - shader_code_.push_back(16); - shader_code_.push_back(16); - shader_code_.push_back(16); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(16); - shader_code_.push_back(16); - shader_code_.push_back(16); - shader_code_.push_back(16); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(swap_temp1); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - - // Write the swapped value to eM. - // movc eM, control.yyyy, temp1, eM - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(eM_temp); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(swap_temp1); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(eM_temp); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - } - - // Release swap_temp1 and swap_temp2. - PopSystemTemp(2); - - // Multiply the base address by dword size, also dropping the 0x40000000 - // bit. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ISHL) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(eA_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(eA_temp); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(2); - ++stat_.instruction_count; - ++stat_.int_instruction_count; - - // Drop the exponent in the element index. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0010, 1)); - shader_code_.push_back(eA_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(eA_temp); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back((1 << 23) - 1); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - - // Add the offset of the first written element to the base address. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_UMAD) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(eA_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(eA_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(eA_temp); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - - // If the first written element is not eM0, add the offset to it. - if (eM_offsets[0] != 0) { - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE( - eM_offsets[0] == 1 ? D3D10_SB_OPCODE_IADD - : D3D10_SB_OPCODE_UMAD) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - eM_offsets[0] == 1 ? 7 : 9)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(eA_temp); - if (eM_offsets[0] != 1) { - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(eM_offsets[0]); - } - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(eA_temp); - ++stat_.instruction_count; - if (eM_offsets[0] == 1) { - ++stat_.int_instruction_count; - } else { - ++stat_.uint_instruction_count; - } - } - - // If there are multiple eM# written, calculate offset of each. - uint32_t other_addresses_temp; - if (eM_count > 1) { - other_addresses_temp = PushSystemTemp(); - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_UMAD) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12)); - shader_code_.push_back(EncodeVectorMaskedOperand( - D3D10_SB_OPERAND_TYPE_TEMP, (1 << (eM_count - 1)) - 1, 1)); - shader_code_.push_back(other_addresses_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - // eM_offsets[0] already added to eA.x. - shader_code_.push_back(eM_offsets[1] - eM_offsets[0]); - shader_code_.push_back(eM_offsets[2] - eM_offsets[0]); - shader_code_.push_back(eM_offsets[3] - eM_offsets[0]); - shader_code_.push_back(eM_offsets[4] - eM_offsets[0]); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(eA_temp); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - } else { - other_addresses_temp = UINT32_MAX; - } - - // Extract the mask of eM register actually written to on the execution - // path. - uint32_t eM_written_temps = PushSystemTemp(0, eM_count > 4 ? 2 : 1); - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); - shader_code_.push_back(EncodeVectorMaskedOperand( - D3D10_SB_OPERAND_TYPE_TEMP, (1 << std::min(eM_count, 4u)) - 1, 1)); - shader_code_.push_back(eM_written_temps); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, i >> 2, 1)); - shader_code_.push_back(system_temp_memexport_written_); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - uint32_t eM_written_base = 1u << ((i & 3) << 3); - shader_code_.push_back(eM_written_base << eM_offsets[0]); - shader_code_.push_back(eM_written_base << eM_offsets[1]); - shader_code_.push_back(eM_written_base << eM_offsets[2]); - shader_code_.push_back(eM_written_base << eM_offsets[3]); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - if (eM_count > 4) { - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(eM_written_temps + 1); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, i >> 2, 1)); - shader_code_.push_back(system_temp_memexport_written_); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(eM_written_base << eM_offsets[4]); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - } - - // Check which Store (store_raw write mask) should be used according to the - // element size. Compare the element size to 16 and 8 into - // control_temp.xy. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IEQ) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0011, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); - shader_code_.push_back(control_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(16); - shader_code_.push_back(8); - shader_code_.push_back(0); - shader_code_.push_back(0); - ++stat_.instruction_count; - ++stat_.int_instruction_count; - - // Actually store the data. - // if (element_size == 16) { - // Store4 (j = 0) - // } else { - // if (element_size == 8) { - // Store2 (j = 1) - // } else { - // Store (j = 2) - // } - // } - for (uint32_t j = 0; j < 3; ++j) { - if (j < 2) { - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IF) | - ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN( - D3D10_SB_INSTRUCTION_TEST_NONZERO) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, j, 1)); - shader_code_.push_back(control_temp); - ++stat_.instruction_count; - ++stat_.dynamic_flow_control_count; + DxbcOpMovC(endian_dest, DxbcSrc::R(control_temp, DxbcSrc::kZZZZ), + DxbcSrc::LU(uint32_t(Endian128::k8in32)), endian_src); } - // 0b1111 for j = 0, 0b0011 for j = 1, 0b0001 for j = 2. - uint32_t store_mask = (1 << (1 << (2 - j))) - 1; - uint32_t store_swizzle = kSwizzleXYZW & ((1 << ((1 << (2 - j)) * 2)) - 1); + uint32_t swap_temp = PushSystemTemp(); + DxbcDest swap_temp_dest(DxbcDest::R(swap_temp)); + DxbcSrc swap_temp_src(DxbcSrc::R(swap_temp)); - for (uint32_t k = 0; k < eM_count; ++k) { - // Check if the eM was actually written to on the execution path. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_IF) | - ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN( - D3D10_SB_INSTRUCTION_TEST_NONZERO) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, k & 3, 1)); - shader_code_.push_back(eM_written_temps + (k >> 2)); - ++stat_.instruction_count; - ++stat_.dynamic_flow_control_count; - - // Store. - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_STORE_RAW) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8)); - shader_code_.push_back(EncodeVectorMaskedOperand( - D3D11_SB_OPERAND_TYPE_UNORDERED_ACCESS_VIEW, store_mask, 2)); - shader_code_.push_back(0); - shader_code_.push_back(uint32_t(UAVRegister::kSharedMemory)); - shader_code_.push_back(EncodeVectorSelectOperand( - D3D10_SB_OPERAND_TYPE_TEMP, k ? (k - 1) : 0, 1)); - shader_code_.push_back(k ? other_addresses_temp : eA_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, store_swizzle, 1)); - shader_code_.push_back(eM_temps[k]); - ++stat_.instruction_count; - ++stat_.c_texture_store_instructions; - - // Close the eM write check. - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDIF) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); - ++stat_.instruction_count; + // 8-in-16 or one half of 8-in-32. + DxbcOpSwitch(endian_src); + DxbcOpCase(DxbcSrc::LU(uint32_t(Endian128::k8in16))); + DxbcOpCase(DxbcSrc::LU(uint32_t(Endian128::k8in32))); + for (uint32_t j = 0; j < eM_count; ++j) { + DxbcDest eM_dest(DxbcDest::R(eM_temps[j])); + DxbcSrc eM_src(DxbcSrc::R(eM_temps[j])); + // Temp = X0Z0. + DxbcOpAnd(swap_temp_dest, eM_src, DxbcSrc::LU(0x00FF00FF)); + // eM = YZW0. + DxbcOpUShR(eM_dest, eM_src, DxbcSrc::LU(8)); + // eM = Y0W0. + DxbcOpAnd(eM_dest, eM_src, DxbcSrc::LU(0x00FF00FF)); + // eM = YXWZ. + DxbcOpUMAd(eM_dest, swap_temp_src, DxbcSrc::LU(256), eM_src); } + DxbcOpBreak(); + DxbcOpEndSwitch(); - if (j < 2) { - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ELSE) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); - ++stat_.instruction_count; + // 16-in-32 or another half of 8-in-32. + DxbcOpSwitch(endian_src); + DxbcOpCase(DxbcSrc::LU(uint32_t(Endian128::k8in32))); + DxbcOpCase(DxbcSrc::LU(uint32_t(Endian128::k16in32))); + for (uint32_t j = 0; j < eM_count; ++j) { + DxbcDest eM_dest(DxbcDest::R(eM_temps[j])); + DxbcSrc eM_src(DxbcSrc::R(eM_temps[j])); + // Temp = ZW00. + DxbcOpUShR(swap_temp_dest, eM_src, DxbcSrc::LU(16)); + // eM = ZWXY. + DxbcOpBFI(eM_dest, DxbcSrc::LU(16), DxbcSrc::LU(16), eM_src, + swap_temp_src); } - } - for (uint32_t j = 0; j < 2; ++j) { - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDIF) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); - ++stat_.instruction_count; - } + DxbcOpBreak(); + DxbcOpEndSwitch(); - // Release eM_written_temps. - PopSystemTemp(eM_count > 4 ? 2 : 1); - - if (other_addresses_temp != UINT32_MAX) { + // Release swap_temp. PopSystemTemp(); } + // control_temp.yz are now free. + + DxbcDest address_dest(DxbcDest::R(eA_temp, 0b0001)); + DxbcSrc address_src(DxbcSrc::R(eA_temp, DxbcSrc::kXXXX)); + // Multiply the base address by dword size, also dropping the 0x40000000 + // bit. + DxbcOpIShL(address_dest, address_src, DxbcSrc::LU(2)); + // Drop the exponent in the element index. + DxbcOpAnd(DxbcDest::R(eA_temp, 0b0010), DxbcSrc::R(eA_temp, DxbcSrc::kYYYY), + DxbcSrc::LU((1 << 23) - 1)); + // Add the offset of the first written element to the base address. + DxbcOpUMAd(address_dest, DxbcSrc::R(eA_temp, DxbcSrc::kYYYY), + element_size_src, address_src); + // Do the writes. + DxbcSrc eM_written_src( + DxbcSrc::R(system_temp_memexport_written_).Select(i >> 2)); + uint32_t eM_written_base = 1u << ((i & 3) << 3); + for (uint32_t j = 0; j < eM_count; ++j) { + // Go to the next eM#. + uint32_t eM_relative_offset = eM_offsets[j] - (j ? eM_offsets[j - 1] : 0); + if (eM_relative_offset) { + if (eM_relative_offset == 1) { + DxbcOpIAdd(address_dest, element_size_src, address_src); + } else { + DxbcOpUMAd(address_dest, DxbcSrc::LU(eM_relative_offset), + element_size_src, address_src); + } + } + // Check if the eM# was actually written to on the execution path. + DxbcOpAnd(DxbcDest::R(control_temp, 0b0010), eM_written_src, + DxbcSrc::LU(eM_written_base << eM_offsets[j])); + DxbcOpIf(true, DxbcSrc::R(control_temp, DxbcSrc::kYYYY)); + // Write the element of the needed size. + DxbcSrc eM_src(DxbcSrc::R(eM_temps[j])); + DxbcOpSwitch(element_size_src); + for (uint32_t k = 1; k <= 4; k <<= 1) { + DxbcOpCase(DxbcSrc::LU(k * 4)); + DxbcOpStoreRaw( + DxbcDest::U(0, uint32_t(UAVRegister::kSharedMemory), (1 << k) - 1), + address_src, eM_src); + DxbcOpBreak(); + } + DxbcOpEndSwitch(); + DxbcOpEndIf(); + } + // control_temp.y is now free. } // Close the memexport possibility check. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ENDIF) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); - ++stat_.instruction_count; + DxbcOpEndIf(); // Release control_temp. PopSystemTemp();