diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index 3c11f4c48..915fa19c7 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -959,6 +959,8 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory, // bilinear filtering), applying exponent bias and swapping red and blue in // a format-agnostic way, then the resulting color is written to a temporary // RTV of the destination format. + auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider(); + auto device = provider->GetDevice(); if (sample_select <= xenos::CopySampleSelect::k3 && src_texture_format == dest_format && dest_exp_bias == 0) { XELOGGPU("Resolving a single sample without conversion"); @@ -1385,6 +1387,14 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() { const RenderTarget* render_target = binding.render_target; bool is_64bpp = false; + // Get the number of X thread groups. + uint32_t rt_pitch_tiles = surface_pitch_tiles; + if (!render_target->key.is_depth && + IsColorFormat64bpp( + ColorRenderTargetFormat(render_target->key.format))) { + rt_pitch_tiles *= 2; + } + // Copy from the render target planes and set up the layout. 
D3D12_TEXTURE_COPY_LOCATION location_source, location_dest; location_source.pResource = render_target->resource; @@ -1397,13 +1407,10 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() { command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source, nullptr); EDRAMLoadStoreRootConstants root_constants; - root_constants.base_tiles = binding.edram_base; - root_constants.pitch_tiles = surface_pitch_tiles; - if (!render_target->key.is_depth && - IsColorFormat64bpp( - ColorRenderTargetFormat(render_target->key.format))) { - root_constants.pitch_tiles *= 2; - } + root_constants.base_pitch_tiles = + binding.edram_base | (rt_pitch_tiles << 11); + root_constants.rt_color_depth_offset = + uint32_t(location_dest.PlacedFootprint.Offset); root_constants.rt_color_depth_pitch = location_dest.PlacedFootprint.Footprint.RowPitch; if (render_target->key.is_depth) { @@ -1411,12 +1418,10 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() { location_dest.PlacedFootprint = render_target->footprints[1]; command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source, nullptr); - root_constants.rt_stencil_offset_or_swap_red_blue = + root_constants.rt_stencil_offset = uint32_t(location_dest.PlacedFootprint.Offset); root_constants.rt_stencil_pitch = location_dest.PlacedFootprint.Footprint.RowPitch; - } else { - root_constants.rt_stencil_offset_or_swap_red_blue = 0; } // Transition the copy buffer to SRV. @@ -1437,8 +1442,7 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() { EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth, render_target->key.format); command_processor_->SetPipeline(edram_store_pipelines_[size_t(mode)]); - command_list->Dispatch(root_constants.pitch_tiles, binding.edram_dirty_rows, - 1); + command_list->Dispatch(rt_pitch_tiles, binding.edram_dirty_rows, 1); // Commit the UAV write and prepare for copying again. 
barrier_count = 1; @@ -1569,31 +1573,18 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM( } const RenderTarget* render_target = render_targets[i]; - // Set up the layout. - EDRAMLoadStoreRootConstants root_constants; - root_constants.base_tiles = edram_bases[i]; - root_constants.pitch_tiles = render_target->key.width_ss_div_80; + // Get the number of X thread groups. + uint32_t edram_pitch_tiles = render_target->key.width_ss_div_80; if (!render_target->key.is_depth && IsColorFormat64bpp( ColorRenderTargetFormat(render_target->key.format))) { - root_constants.pitch_tiles *= 2; + edram_pitch_tiles *= 2; } - root_constants.rt_color_depth_pitch = - render_target->footprints[0].Footprint.RowPitch; - if (render_target->key.is_depth) { - root_constants.rt_stencil_offset_or_swap_red_blue = - uint32_t(render_target->footprints[1].Offset); - root_constants.rt_stencil_pitch = - render_target->footprints[1].Footprint.RowPitch; - } else { - root_constants.rt_stencil_offset_or_swap_red_blue = 0; - } - // Validate the height in case the resolve is somehow too large (shouldn't // happen though, but who knows what games do). uint32_t edram_rows = std::min(render_target->key.height_ss_div_16, - (2048u - edram_bases[i]) / root_constants.pitch_tiles); + (2048u - edram_bases[i]) / edram_pitch_tiles); if (edram_rows == 0) { continue; } @@ -1612,12 +1603,25 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM( } // Load the data. 
+ EDRAMLoadStoreRootConstants root_constants; + root_constants.base_pitch_tiles = + edram_bases[i] | (edram_pitch_tiles << 11); + root_constants.rt_color_depth_offset = + uint32_t(render_target->footprints[0].Offset); + root_constants.rt_color_depth_pitch = + render_target->footprints[0].Footprint.RowPitch; + if (render_target->key.is_depth) { + root_constants.rt_stencil_offset = + uint32_t(render_target->footprints[1].Offset); + root_constants.rt_stencil_pitch = + render_target->footprints[1].Footprint.RowPitch; + } command_list->SetComputeRoot32BitConstants( 0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0); EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth, render_target->key.format); command_processor_->SetPipeline(edram_load_pipelines_[size_t(mode)]); - command_list->Dispatch(root_constants.pitch_tiles, edram_rows, 1); + command_list->Dispatch(edram_pitch_tiles, edram_rows, 1); // Commit the UAV write and transition the copy buffer to copy source. barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_UAV; diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index 16911bf2e..bdc6cf8b6 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -369,11 +369,33 @@ class RenderTargetCache { // EDRAM buffer load/store root signature. ID3D12RootSignature* edram_load_store_root_signature_ = nullptr; struct EDRAMLoadStoreRootConstants { - uint32_t base_tiles; - uint32_t pitch_tiles; - uint32_t rt_color_depth_pitch; - uint32_t rt_stencil_offset_or_swap_red_blue; - uint32_t rt_stencil_pitch; + union { + struct { + uint32_t rt_color_depth_offset; + uint32_t rt_color_depth_pitch; + uint32_t rt_stencil_offset; + uint32_t rt_stencil_pitch; + }; + struct { + // 16 bits for X, 16 bits for Y. + uint32_t tile_sample_rect_tl; + uint32_t tile_sample_rect_br; + uint32_t tile_sample_dest_base; + // 0:13 - destination pitch. 
+ // 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA. + // 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA. + // 16:17 - sample to load (16 - vertical index, 17 - horizontal index). + // 18:19 - destination endianness. + // 20:31 - BPP-specific info for swapping red/blue, 0 if not swapping. + // For 32 bits per pixel: + // 20:24 - red/blue bit depth. + // 25:29 - blue offset. + // For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47. + uint32_t tile_sample_dest_info; + }; + }; + // Base in the lower 11 bits, pitch above. + uint32_t base_pitch_tiles; }; // EDRAM buffer load/store pipelines. static const EDRAMLoadStoreModeInfo diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl index f0ca434a0..0bcdfb8e8 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl @@ -8,16 +8,7 @@ void main(uint3 xe_group_id : SV_GroupID, tile_dword_index.x *= 4u; uint4 pixels = xe_edram_load_store_source.Load4( XeEDRAMOffset(xe_group_id.xy, tile_dword_index)); - if (xe_edram_swap_red_blue != 0u) { - // Not a very long shift, just 16 or 20. 
- uint blue_shift = xe_edram_swap_red_blue >> 16u; - uint red_mask = xe_edram_swap_red_blue & 0xFFFFu; - uint blue_mask = red_mask << blue_shift; - pixels = (pixels & ~(red_mask | blue_mask)) | - ((pixels & red_mask) << blue_shift) | - ((pixels >> blue_shift) & red_mask); - } uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + - xe_thread_id.x * 16u; + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; xe_edram_load_store_dest.Store4(rt_offset, pixels); } diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl index 7ee08448a..a65aa4bf2 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl @@ -13,12 +13,7 @@ void main(uint3 xe_group_id : SV_GroupID, } uint4 pixels = xe_edram_load_store_source.Load4( XeEDRAMOffset(xe_group_id.xy, tile_dword_index)); - if (xe_edram_swap_red_blue != 0u) { - // The only 64-bit formats with a blue component are 16_16_16_16 and - // 16_16_16_16_FLOAT. 
- pixels = (pixels.yxwz & 0xFFFFu) | (pixels & 0xFFFF0000u); - } uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + - xe_thread_id.x * 16u; + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; xe_edram_load_store_dest.Store4(rt_offset, pixels); } diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl index 844b78d22..53c18687e 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl @@ -14,11 +14,7 @@ void main(uint3 xe_group_id : SV_GroupID, uint4 pixels_f16u32_packed = uint4(pixel_0_f16u32.xz, pixel_1_f16u32.xz) | (uint4(pixel_0_f16u32.yw, pixel_1_f16u32.yw) << 16u); - if (xe_edram_swap_red_blue != 0u) { - pixels_f16u32_packed = (pixels_f16u32_packed.yxwz & 0xFFFFu) | - (pixels_f16u32_packed & 0xFFFF0000u); - } uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + - xe_thread_id.x * 16u; + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; xe_edram_load_store_dest.Store4(rt_offset, pixels_f16u32_packed); } diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl index 1d3d51faa..06eeb0080 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl @@ -19,13 +19,13 @@ void main(uint3 xe_group_id : SV_GroupID, uint4 depth = depth24to32 + (depth32 - depth24to32) * uint4(XeFloat32To20e4(depth32) == depth24); uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + - xe_thread_id.x * 16u; + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; xe_edram_load_store_dest.Store4(rt_offset, depth); // Stencil. 
uint4 stencil = (depth24_stencil & 0xFFu) << uint4(0u, 8u, 16u, 24u); stencil.xy |= stencil.zw; stencil.x |= stencil.y; - rt_offset = xe_edram_rt_stencil_offset + - xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u; + rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u + + xe_edram_rt_stencil_offset; xe_edram_load_store_dest.Store(rt_offset, stencil.x); } diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl index 660ef6541..0fdbadd2b 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl @@ -10,13 +10,13 @@ void main(uint3 xe_group_id : SV_GroupID, XeEDRAMOffset(xe_group_id.xy, tile_dword_index)); // Depth. uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + - xe_thread_id.x * 16u; + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; xe_edram_load_store_dest.Store4(rt_offset, pixels >> 8u); // Stencil. uint4 stencil = (pixels & 0xFFu) << uint4(0u, 8u, 16u, 24u); stencil.xy |= stencil.zw; stencil.x |= stencil.y; - rt_offset = xe_edram_rt_stencil_offset + - xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u; + rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u + + xe_edram_rt_stencil_offset; xe_edram_load_store_dest.Store(rt_offset, stencil.x); } diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli index 22b4a9107..0314abd6d 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli +++ b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli @@ -1,25 +1,43 @@ #ifndef XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_ #define XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_ +// Root constants. 
cbuffer XeEDRAMLoadStoreConstants : register(b0) { - uint xe_edram_base_tiles; - uint xe_edram_pitch_tiles; - uint xe_edram_rt_color_depth_pitch; - uint xe_edram_rt_stencil_offset_or_swap_red_blue; - uint xe_edram_rt_stencil_pitch; + uint4 xe_edram_load_store_constants; + // Base in the lower 11 bits, pitch in the upper part, in tiles. + uint xe_edram_base_pitch_tiles; }; -#define xe_edram_rt_stencil_offset xe_edram_rt_stencil_offset_or_swap_red_blue -// For loads only. How exactly it's handled depends on the specific load shader, -// but 0 always means red and blue shouldn't be swapped. -#define xe_edram_swap_red_blue xe_edram_rt_stencil_offset_or_swap_red_blue + +// For loading and storing render targets. +#define xe_edram_rt_color_depth_offset (xe_edram_load_store_constants.x) +#define xe_edram_rt_color_depth_pitch (xe_edram_load_store_constants.y) +#define xe_edram_rt_stencil_offset (xe_edram_load_store_constants.z) +#define xe_edram_rt_stencil_pitch (xe_edram_load_store_constants.w) + +// For single sample resolving. +// Left/top of the copied region (relative to EDRAM base) in the lower 16 bits, +// right/bottom in the upper. +#define xe_edram_tile_sample_rect (xe_edram_load_store_constants.xy) +#define xe_edram_tile_sample_dest_base (xe_edram_load_store_constants.z) +// 0:13 - destination pitch. +// 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA. +// 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA. +// 16:17 - sample to load (16 - vertical index, 17 - horizontal index). +// 18:19 - destination endianness. +// 20:31 - BPP-specific info for swapping red/blue, 0 if not swapping. +// For 32 bits per pixel: +// 20:24 - red/blue bit depth. +// 25:29 - blue offset. +// For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47. 
+#define xe_edram_tile_sample_dest_info (xe_edram_load_store_constants.w) ByteAddressBuffer xe_edram_load_store_source : register(t0); RWByteAddressBuffer xe_edram_load_store_dest : register(u0); uint XeEDRAMOffset(uint2 tile_index, uint2 tile_dword_index) { - return (xe_edram_base_tiles + (tile_index.y * xe_edram_pitch_tiles) + - tile_index.x) * 5120u + tile_dword_index.y * 320u + - tile_dword_index.x * 4u; + return ((xe_edram_base_pitch_tiles & 2047u) + + tile_index.y * (xe_edram_base_pitch_tiles >> 11u) + tile_index.x) * + 5120u + tile_dword_index.y * 320u + tile_dword_index.x * 4u; } #endif // XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_ diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl index 584416fdb..db8038ae6 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl @@ -5,7 +5,7 @@ void main(uint3 xe_group_id : SV_GroupID, uint3 xe_group_thread_id : SV_GroupThreadID, uint3 xe_thread_id : SV_DispatchThreadID) { uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + - xe_thread_id.x * 16u; + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; uint4 pixels = xe_edram_load_store_source.Load4(rt_offset); uint2 tile_dword_index = xe_group_thread_id.xy; tile_dword_index.x *= 4u; diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl index ec3cab476..7a91fe1b3 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl @@ -5,7 +5,7 @@ void main(uint3 xe_group_id : SV_GroupID, uint3 xe_group_thread_id : SV_GroupThreadID, uint3 xe_thread_id : SV_DispatchThreadID) { uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + - xe_thread_id.x * 16u; + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; uint4 pixels = 
xe_edram_load_store_source.Load4(rt_offset); // One tile contains 80x8 texels, and 2 rows within a 80x16 tile contain data // from 1 render target row rather than 2. Threads with X 0-19 are for the diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl index 7b9c5cc03..2beef5b4a 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl @@ -6,7 +6,7 @@ void main(uint3 xe_group_id : SV_GroupID, uint3 xe_group_thread_id : SV_GroupThreadID, uint3 xe_thread_id : SV_DispatchThreadID) { uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + - xe_thread_id.x * 16u; + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; uint4 pixels_f16u32_packed = xe_edram_load_store_source.Load4(rt_offset); uint4 pixel_0_f16u32 = pixels_f16u32_packed.xxyy >> uint4(0u, 16u, 0u, 16u); uint4 pixel_1_f16u32 = pixels_f16u32_packed.zzww >> uint4(0u, 16u, 0u, 16u); diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl index e5376c508..4134240a4 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl @@ -7,12 +7,12 @@ void main(uint3 xe_group_id : SV_GroupID, uint3 xe_thread_id : SV_DispatchThreadID) { // Depth. uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + - xe_thread_id.x * 16u; + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; uint4 depth32 = xe_edram_load_store_source.Load4(rt_offset); uint4 depth24_stencil = XeFloat32To20e4(depth32) << 8u; // Stencil. 
- rt_offset = xe_edram_rt_stencil_offset + - xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u; + rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u + + xe_edram_rt_stencil_offset; depth24_stencil |= (xe_edram_load_store_source.Load(rt_offset).xxxx >> uint4(0u, 8u, 16u, 24u)) & 0xFFu; uint2 tile_dword_index = xe_group_thread_id.xy; diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl index ba4b6db00..010cef44b 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl @@ -6,12 +6,12 @@ void main(uint3 xe_group_id : SV_GroupID, uint3 xe_thread_id : SV_DispatchThreadID) { // Depth. uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + - xe_thread_id.x * 16u; + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; uint4 pixels = (xe_edram_load_store_source.Load4(rt_offset) & 0xFFFFFFu) << 8u; // Stencil. - rt_offset = xe_edram_rt_stencil_offset + - xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u; + rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u + + xe_edram_rt_stencil_offset; pixels |= (xe_edram_load_store_source.Load(rt_offset).xxxx >> uint4(0u, 8u, 16u, 24u)) & 0xFFu; uint2 tile_dword_index = xe_group_thread_id.xy;