diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 176728099..1bdc8257e 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -56,11 +56,9 @@ DEFINE_bool(d3d12_submit_on_primary_buffer_end, true, "possible to submit immediately to try to reduce frame latency.", "D3D12"); DEFINE_bool( - d3d12_tessellation_adaptive, false, + d3d12_tessellation_adaptive, true, "Allow games to use adaptive tessellation - may be disabled if the game " - "has issues with memexport, the maximum factor will be used in this case. " - "Temporarily disabled by default since there are visible cracks currently " - "in Halo 3.", + "has issues with memexport, the maximum factor will be used in this case.", "D3D12"); namespace xe { diff --git a/src/xenia/gpu/d3d12/shaders/adaptive_triangle.hs.hlsl b/src/xenia/gpu/d3d12/shaders/adaptive_triangle.hs.hlsl index 2409ed5ec..6b5368acc 100644 --- a/src/xenia/gpu/d3d12/shaders/adaptive_triangle.hs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/adaptive_triangle.hs.hlsl @@ -20,13 +20,29 @@ XeHSConstantDataOutput XePatchConstant( // has already been added on the CPU. // Fork phase. - // UVW are taken with ZYX swizzle (when r1.y is 0) in the vertex (domain) - // shader. Edge 0 is with U = 0, edge 1 is with V = 0, edge 2 is with W = 0. - // TODO(Triang3l): Verify this order. There are still cracks. + // It appears that on the Xbox 360: + // - [0] is the factor for the v0->v1 edge. + // - [1] is the factor for the v1->v2 edge. + // - [2] is the factor for the v2->v0 edge. + // Where v0 is the U1V0W0 vertex, v1 is the U0V1W0 vertex, and v2 is the + // U0V0W1 vertex. + // The hint at the order was provided in the Code Listing 15 of: + // http://www.uraldev.ru/files/download/21/Real-Time_Tessellation_on_GPU.pdf + // In Direct3D 12: + // - [0] is the factor for the U0 edge (v1->v2). + // - [1] is the factor for the V0 edge (v2->v0), + // - [2] is the factor for the W0 edge (v0->v1). + // Direct3D 12 provides barycentrics as X for v0, Y for v1, Z for v2. + // In Xenia's domain shaders, the barycentric coordinates are handled as: + // 1) vDomain.xyz -> r0.zyx by Xenia. + // 2) r0.zyx -> r0.zyx by the guest (because r1.y is set to 0 by Xenia, which + // apparently means identity swizzle to games). + // 3) r0.z * v0 + r0.y * v1 + r0.x * v2 by the guest. + // With this order, there are no cracks in Halo 3 water. [unroll] for (i = 0u; i < 3u; ++i) { - output.edges[i] = - clamp(asfloat(xe_input_patch[2u - i].index_or_edge_factor) + 1.0f, - xe_tessellation_factor_range.x, xe_tessellation_factor_range.y); + output.edges[i] = clamp( + asfloat(xe_input_patch[(i + 1u) % 3u].index_or_edge_factor) + 1.0f, + xe_tessellation_factor_range.x, xe_tessellation_factor_range.y); } // Join phase. vpc0, vpc1, vpc2 taken as inputs. diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/adaptive_triangle_hs.cso b/src/xenia/gpu/d3d12/shaders/dxbc/adaptive_triangle_hs.cso index b42fac2c4..7066de1d3 100644 Binary files a/src/xenia/gpu/d3d12/shaders/dxbc/adaptive_triangle_hs.cso and b/src/xenia/gpu/d3d12/shaders/dxbc/adaptive_triangle_hs.cso differ diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/adaptive_triangle_hs.h b/src/xenia/gpu/d3d12/shaders/dxbc/adaptive_triangle_hs.h index 25eea0214..6e970d083 100644 --- a/src/xenia/gpu/d3d12/shaders/dxbc/adaptive_triangle_hs.h +++ b/src/xenia/gpu/d3d12/shaders/dxbc/adaptive_triangle_hs.h @@ -1,11 +1,11 @@ // generated from `xb buildhlsl` // source: adaptive_triangle.hs.hlsl const uint8_t adaptive_triangle_hs[] = { - 0x44, 0x58, 0x42, 0x43, 0x72, 0x34, 0xB9, 0xC2, 0xEC, 0x61, 0xB3, 0x84, - 0x40, 0x92, 0xAA, 0x70, 0x58, 0xC9, 0x88, 0x7D, 0x01, 0x00, 0x00, 0x00, - 0x88, 0x0D, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, + 0x44, 0x58, 0x42, 0x43, 0x11, 0x2F, 0xB7, 0xDC, 0x3A, 0xC8, 0x6E, 0xE8, + 0x46, 0xFA, 0x34, 0x10, 0x8D, 0x43, 0xC9, 0x2E, 0x01, 0x00, 0x00, 0x00, + 0xAC, 0x0D, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x30, 0x0A, 0x00, 0x00, 0x64, 0x0A, 0x00, 0x00, 0x74, 0x0A, 0x00, 0x00, - 0x08, 0x0B, 0x00, 0x00, 0xEC, 0x0C, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, + 0x08, 0x0B, 0x00, 0x00, 0x10, 0x0D, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0xF0, 0x09, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x01, 0x05, 0x53, 0x48, 0x00, 0x05, 0x00, 0x00, 0xC6, 0x09, 0x00, 0x00, 0x13, 0x13, 0x44, 0x25, @@ -236,8 +236,8 @@ const uint8_t adaptive_triangle_hs[] = { 0x01, 0x0E, 0x00, 0x00, 0x53, 0x56, 0x5F, 0x54, 0x65, 0x73, 0x73, 0x46, 0x61, 0x63, 0x74, 0x6F, 0x72, 0x00, 0x53, 0x56, 0x5F, 0x49, 0x6E, 0x73, 0x69, 0x64, 0x65, 0x54, 0x65, 0x73, 0x73, 0x46, 0x61, 0x63, 0x74, 0x6F, - 0x72, 0x00, 0xAB, 0xAB, 0x53, 0x48, 0x45, 0x58, 0xDC, 0x01, 0x00, 0x00, - 0x51, 0x00, 0x03, 0x00, 0x77, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x01, + 0x72, 0x00, 0xAB, 0xAB, 0x53, 0x48, 0x45, 0x58, 0x00, 0x02, 0x00, 0x00, + 0x51, 0x00, 0x03, 0x00, 0x80, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x01, 0x93, 0x18, 0x00, 0x01, 0x94, 0x18, 0x00, 0x01, 0x95, 0x10, 0x00, 0x01, 0x96, 0x20, 0x00, 0x01, 0x97, 0x18, 0x00, 0x01, 0x6A, 0x08, 0x00, 0x01, 0x59, 0x00, 0x00, 0x07, 0x46, 0x8E, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -252,10 +252,13 @@ const uint8_t adaptive_triangle_hs[] = { 0x12, 0x20, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x02, 0x01, 0x00, 0x00, 0x00, 0x5B, 0x00, 0x00, 0x04, 0x12, 0x20, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x04, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x0A, 0x70, 0x01, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x12, 0x00, 0x10, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3F, - 0x0A, 0x90, 0xE1, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x1E, 0x00, 0x00, 0x06, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x70, 0x01, 0x00, 0x01, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x4E, 0x00, 0x00, 0x08, 0x00, 0xD0, 0x00, 0x00, 0x12, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, + 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x80, 0x3F, 0x0A, 0x90, 0xA1, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x09, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -277,9 +280,9 @@ const uint8_t adaptive_triangle_hs[] = { 0x33, 0x00, 0x00, 0x07, 0x12, 0x20, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0xB0, 0x11, 0x00, 0x02, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54, - 0x94, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/adaptive_triangle_hs.txt b/src/xenia/gpu/d3d12/shaders/dxbc/adaptive_triangle_hs.txt index 6bfd212ec..5b5446a59 100644 --- a/src/xenia/gpu/d3d12/shaders/dxbc/adaptive_triangle_hs.txt +++ b/src/xenia/gpu/d3d12/shaders/dxbc/adaptive_triangle_hs.txt @@ -99,8 +99,9 @@ dcl_output_siv o1.x, finalTriVeq0EdgeTessFactor dcl_output_siv o2.x, finalTriWeq0EdgeTessFactor dcl_temps 1 dcl_indexrange o0.x 3 -ineg r0.x, vForkInstanceID.x -add r0.x, l(1.000000), vicp[r0.x + 2][0].x +iadd r0.x, vForkInstanceID.x, l(1) +udiv null, r0.x, r0.x, l(3) +add r0.x, l(1.000000), vicp[r0.x + 0][0].x max r0.x, r0.x, CB0[0][14].x min r0.x, r0.x, CB0[0][14].y mov r0.y, vForkInstanceID.x @@ -115,4 +116,4 @@ dcl_temps 1 min r0.x, vpc0.x, vpc1.x min o3.x, r0.x, vpc2.x ret -// Approximately 11 instruction slots used +// Approximately 12 instruction slots used