[D3D12] DXT1 decompression shader

This commit is contained in:
Triang3l 2018-09-21 08:38:22 +03:00
parent 1f248572ac
commit 17e3f09c1e
11 changed files with 1933 additions and 72 deletions

View File

@ -1,8 +1,8 @@
// generated from `xb buildhlsl` // generated from `xb buildhlsl`
// source: texture_load_ctx1.cs.hlsl // source: texture_load_ctx1.cs.hlsl
const uint8_t texture_load_ctx1_cs[] = { const uint8_t texture_load_ctx1_cs[] = {
0x44, 0x58, 0x42, 0x43, 0x36, 0x1B, 0x8D, 0x80, 0x5E, 0x82, 0x06, 0x8F, 0x44, 0x58, 0x42, 0x43, 0x7E, 0xC2, 0x29, 0xDA, 0xF8, 0x25, 0x11, 0x52,
0xC2, 0xE4, 0xED, 0xF5, 0xC4, 0x87, 0x3F, 0xF9, 0x01, 0x00, 0x00, 0x00, 0xE1, 0xBC, 0xD4, 0xC7, 0xF9, 0x11, 0xB6, 0x3E, 0x01, 0x00, 0x00, 0x00,
0x20, 0x26, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x20, 0x26, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00,
0x54, 0x04, 0x00, 0x00, 0x64, 0x04, 0x00, 0x00, 0x74, 0x04, 0x00, 0x00, 0x54, 0x04, 0x00, 0x00, 0x64, 0x04, 0x00, 0x00, 0x74, 0x04, 0x00, 0x00,
0x84, 0x25, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0x18, 0x04, 0x00, 0x00, 0x84, 0x25, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0x18, 0x04, 0x00, 0x00,
@ -528,57 +528,57 @@ const uint8_t texture_load_ctx1_cs[] = {
0x03, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x05, 0xC2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x05, 0xC2, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x56, 0x0D, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x56, 0x0D, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x15, 0x00, 0x00, 0x01, 0x29, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x15, 0x00, 0x00, 0x01, 0x29, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00,
0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00,
0x8C, 0x00, 0x00, 0x11, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0F,
0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x8C, 0x00, 0x00, 0x11, 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0xAA, 0xAA, 0xAA, 0xAA,
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x1E, 0x00, 0x00, 0x07,
0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55,
0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x57, 0x00, 0x00, 0x07, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55,
0x29, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x57, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x0A, 0x32, 0x00, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00,
0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x8C, 0x00, 0x00, 0x11,
0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00,
0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x8C, 0x00, 0x00, 0x11,
0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
0x10, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x0A, 0x32, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07,

View File

@ -178,20 +178,20 @@ else
mov r1.zw, r3.xxxz mov r1.zw, r3.xxxz
mov r2.zw, r3.yyyw mov r2.zw, r3.yyyw
endif endif
ishl r3.xyzw, r2.xyzw, l(1, 1, 1, 1)
and r3.xyzw, r3.xyzw, l(0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa)
ushr r2.xyzw, r2.xyzw, l(1, 1, 1, 1)
and r2.xyzw, r2.xyzw, l(0x55555555, 0x55555555, 0x55555555, 0x55555555)
iadd r2.xyzw, r2.xyzw, r3.xyzw
ushr r3.xyzw, r2.xyzw, l(1, 1, 1, 1)
and r3.xyzw, r3.xyzw, l(0x55555555, 0x55555555, 0x55555555, 0x55555555)
xor r2.xyzw, r2.xyzw, r3.xyzw
ishl r3.xyzw, r1.xyzw, l(8, 8, 8, 8) ishl r3.xyzw, r1.xyzw, l(8, 8, 8, 8)
and r3.xyzw, r3.xyzw, l(0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000) and r3.xyzw, r3.xyzw, l(0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000)
bfi r3.xyzw, l(8, 8, 8, 8), l(0, 0, 0, 0), r1.xyzw, r3.xyzw bfi r3.xyzw, l(8, 8, 8, 8), l(0, 0, 0, 0), r1.xyzw, r3.xyzw
ubfe r4.xyzw, l(8, 8, 8, 8), l(16, 16, 16, 16), r1.xyzw ubfe r4.xyzw, l(8, 8, 8, 8), l(16, 16, 16, 16), r1.xyzw
ushr r1.xyzw, r1.xyzw, l(8, 8, 8, 8) ushr r1.xyzw, r1.xyzw, l(8, 8, 8, 8)
bfi r1.xyzw, l(16, 16, 16, 16), l(0, 0, 0, 0), r4.xyzw, r1.xyzw bfi r1.xyzw, l(16, 16, 16, 16), l(0, 0, 0, 0), r4.xyzw, r1.xyzw
ishl r4.xyzw, r2.xyzw, l(1, 1, 1, 1)
and r4.xyzw, r4.xyzw, l(0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa)
ushr r2.xyzw, r2.xyzw, l(1, 1, 1, 1)
and r2.xyzw, r2.xyzw, l(0x55555555, 0x55555555, 0x55555555, 0x55555555)
iadd r2.xyzw, r2.xyzw, r4.xyzw
ushr r4.xyzw, r2.xyzw, l(1, 1, 1, 1)
and r4.xyzw, r4.xyzw, l(0x55555555, 0x55555555, 0x55555555, 0x55555555)
xor r2.xyzw, r2.xyzw, r4.xyzw
ishl r0.xy, r0.xyxx, l(2, 2, 0, 0) ishl r0.xy, r0.xyxx, l(2, 2, 0, 0)
ishl r0.x, r0.x, l(1) ishl r0.x, r0.x, l(1)
imad r0.z, vThreadID.z, CB0[0][1].y, r0.y imad r0.z, vThreadID.z, CB0[0][1].y, r0.y

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,412 @@
//
// Generated by Microsoft (R) HLSL Shader Compiler 10.1
//
//
// Buffer Definitions:
//
// cbuffer XeTextureCopyConstants
// {
//
// uint xe_texture_copy_guest_base; // Offset: 0 Size: 4
// uint xe_texture_copy_guest_pitch; // Offset: 4 Size: 4
// uint xe_texture_copy_host_base; // Offset: 8 Size: 4
// uint xe_texture_copy_host_pitch; // Offset: 12 Size: 4
// uint3 xe_texture_copy_size_texels; // Offset: 16 Size: 12
// bool xe_texture_copy_is_3d; // Offset: 28 Size: 4
// uint3 xe_texture_copy_size_blocks; // Offset: 32 Size: 12
// uint xe_texture_copy_endianness; // Offset: 44 Size: 4
// uint3 xe_texture_copy_guest_mip_offset;// Offset: 48 Size: 12
//
// }
//
//
// Resource Bindings:
//
// Name Type Format Dim ID HLSL Bind Count
// ------------------------------ ---------- ------- ----------- ------- -------------- ------
// xe_texture_copy_source texture byte r/o T0 t0 1
// xe_texture_copy_dest UAV byte r/w U0 u0 1
// XeTextureCopyConstants cbuffer NA NA CB0 cb0 1
//
//
//
// Input signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// no Input
//
// Output signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// no Output
cs_5_1
dcl_globalFlags refactoringAllowed
dcl_constantbuffer CB0[0:0][4], immediateIndexed, space=0
dcl_resource_raw T0[0:0], space=0
dcl_uav_raw U0[0:0], space=0
dcl_input vThreadID.xyz
dcl_temps 22
dcl_thread_group 8, 32, 1
ishl r0.x, vThreadID.x, l(2)
mov r0.yz, vThreadID.yyzy
uge r1.xyz, r0.xyzx, CB0[0][2].xyzx
or r0.w, r1.y, r1.x
or r0.w, r1.z, r0.w
if_nz r0.w
ret
endif
iadd r1.xyz, r0.xyzx, CB0[0][3].xyzx
ieq r0.z, CB0[0][0].y, l(-1)
if_nz r0.z
if_nz CB0[0][1].w
iadd r2.xyzw, r1.xxxx, l(0, 1, 2, 3)
iadd r0.zw, CB0[0][2].yyyx, l(0, 0, 31, 31)
ushr r3.xyz, r1.zyyz, l(2, 4, 3, 0)
ushr r0.zw, r0.zzzw, l(0, 0, 4, 5)
and r0.z, r0.z, l(0x0ffffffe)
imad r0.z, r3.x, r0.z, r3.y
iadd r1.w, r3.z, r3.x
bfi r3.x, l(1), l(1), r1.w, l(0)
ushr r4.xyzw, r2.xyzw, l(3, 3, 3, 3)
iadd r3.xyzw, r3.xxxx, r4.xyzw
bfi r3.xyzw, l(2, 2, 2, 2), l(1, 1, 1, 1), r3.xyzw, l(0, 0, 0, 0)
bfi r3.xyzw, l(1, 1, 1, 1), l(0, 0, 0, 0), r1.wwww, r3.xyzw
ishl r1.w, r1.y, l(11)
and r1.w, r1.w, l(0x00003000)
bfi r4.xyzw, l(3, 3, 3, 3), l(9, 9, 9, 9), r2.xyzw, r1.wwww
ushr r4.xyzw, r4.xyzw, l(6, 6, 6, 6)
ushr r2.xyzw, r2.xyzw, l(5, 5, 5, 5)
imad r2.xyzw, r0.zzzz, r0.wwww, r2.xyzw
and r5.xyzw, r4.xyzw, l(240, 240, 240, 240)
bfi r6.xyzw, l(19, 19, 19, 19), l(11, 11, 11, 11), r2.xyzw, l(0, 0, 0, 0)
imad r6.xyzw, r5.xyzw, l(2, 2, 2, 2), r6.xyzw
bfi r6.xyzw, l(4, 4, 4, 4), l(0, 0, 0, 0), r4.xyzw, r6.xyzw
bfi r6.xyzw, l(2, 2, 2, 2), l(9, 9, 9, 9), r1.zzzz, r6.xyzw
bfi r7.xyzw, l(1, 1, 1, 1), l(4, 4, 4, 4), r1.yyyy, r6.xyzw
ubfe r6.xyzw, l(3, 3, 3, 3), l(6, 6, 6, 6), r6.xyzw
and r8.xyzw, r3.xyzw, l(6, 6, 6, 6)
bfi r3.xyzw, l(1, 1, 1, 1), l(8, 8, 8, 8), r3.xyzw, l(0, 0, 0, 0)
imad r3.xyzw, r6.xyzw, l(32, 32, 32, 32), r3.xyzw
imad r3.xyzw, r8.xyzw, l(4, 4, 4, 4), r3.xyzw
bfi r2.xyzw, l(19, 19, 19, 19), l(14, 14, 14, 14), r2.xyzw, l(0, 0, 0, 0)
imad r2.xyzw, r5.xyzw, l(16, 16, 16, 16), r2.xyzw
bfi r2.xyzw, l(4, 4, 4, 4), l(3, 3, 3, 3), r4.xyzw, r2.xyzw
bfi r2.xyzw, l(2, 2, 2, 2), l(12, 12, 12, 12), r1.zzzz, r2.xyzw
bfi r2.xyzw, l(1, 1, 1, 1), l(7, 7, 7, 7), r1.yyyy, r2.xyzw
bfi r2.xyzw, l(9, 9, 9, 9), l(3, 3, 3, 3), r3.xyzw, r2.xyzw
bfi r2.xyzw, l(6, 6, 6, 6), l(0, 0, 0, 0), r7.xyzw, r2.xyzw
else
iadd r3.xyzw, r1.xxxx, l(0, 1, 2, 3)
ushr r4.xyzw, r3.xyzw, l(5, 5, 5, 5)
ushr r0.zw, r1.yyyy, l(0, 0, 5, 2)
iadd r1.w, CB0[0][2].x, l(31)
ushr r1.w, r1.w, l(5)
imad r4.xyzw, r0.zzzz, r1.wwww, r4.xyzw
ishl r5.xy, r1.yyyy, l(5, 7, 0, 0)
and r5.xy, r5.xyxx, l(448, 2048, 0, 0)
bfi r6.xyzw, l(3, 3, 3, 3), l(3, 3, 3, 3), r3.xyzw, r5.xxxx
ishl r0.z, r5.x, l(1)
bfi r7.xyzw, l(3, 3, 3, 3), l(4, 4, 4, 4), r3.xyzw, r0.zzzz
and r7.xyzw, r7.xyzw, l(992, 992, 992, 992)
bfi r8.xyzw, l(22, 22, 22, 22), l(10, 10, 10, 10), r4.xyzw, r7.xyzw
bfi r8.xyzw, l(4, 4, 4, 4), l(0, 0, 0, 0), r6.xyzw, r8.xyzw
bfi r8.xyzw, l(1, 1, 1, 1), l(4, 4, 4, 4), r1.yyyy, r8.xyzw
ishl r9.xyzw, r7.xyzw, l(3, 3, 3, 3)
bfi r9.xyzw, l(22, 22, 22, 22), l(13, 13, 13, 13), r4.xyzw, r9.xyzw
bfi r9.xyzw, l(4, 4, 4, 4), l(3, 3, 3, 3), r6.xyzw, r9.xyzw
bfi r9.xyzw, l(1, 1, 1, 1), l(7, 7, 7, 7), r1.yyyy, r9.xyzw
bfi r5.xyzw, l(12, 12, 12, 12), l(0, 0, 0, 0), r5.yyyy, r9.xyzw
ishl r7.xyzw, r7.xyzw, l(2, 2, 2, 2)
bfi r4.xyzw, l(22, 22, 22, 22), l(12, 12, 12, 12), r4.xyzw, r7.xyzw
bfi r4.xyzw, l(4, 4, 4, 4), l(2, 2, 2, 2), r6.xyzw, r4.xyzw
bfi r4.xyzw, l(1, 1, 1, 1), l(6, 6, 6, 6), r1.yyyy, r4.xyzw
and r4.xyzw, r4.xyzw, l(1792, 1792, 1792, 1792)
iadd r4.xyzw, r5.xyzw, r4.xyzw
ushr r3.xyzw, r3.xyzw, l(3, 3, 3, 3)
and r0.z, r0.w, l(2)
iadd r3.xyzw, r0.zzzz, r3.xyzw
bfi r3.xyzw, l(2, 2, 2, 2), l(6, 6, 6, 6), r3.xyzw, l(0, 0, 0, 0)
iadd r3.xyzw, r4.xyzw, r3.xyzw
bfi r2.xyzw, l(6, 6, 6, 6), l(0, 0, 0, 0), r8.xyzw, r3.xyzw
endif
else
ishl r0.z, r1.x, l(3)
iadd r0.w, CB0[0][2].y, l(31)
and r0.w, r0.w, l(-32)
imad r0.w, r1.z, r0.w, r1.y
imad r0.z, r0.w, CB0[0][0].y, r0.z
iadd r2.xyzw, r0.zzzz, l(0, 8, 16, 24)
endif
iadd r1.xyzw, r2.xyzw, CB0[0][0].xxxx
ld_raw r2.xz, r1.x, T0[0].yxxx
ld_raw r2.yw, r1.y, T0[0].xyxx
ld_raw r3.xy, r1.z, T0[0].xyxx
ld_raw r3.zw, r1.w, T0[0].xxxy
ushr r0.z, CB0[0][2].w, l(1)
xor r0.z, r0.z, CB0[0][2].w
and r0.z, r0.z, l(1)
if_nz r0.z
ishl r1.xyzw, r2.zxwy, l(8, 8, 8, 8)
and r1.xyzw, r1.xyzw, l(0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00)
ushr r4.xyzw, r2.zxwy, l(8, 8, 8, 8)
and r4.xyzw, r4.xyzw, l(0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff)
iadd r2.xyzw, r1.ywxz, r4.ywxz
endif
and r0.w, CB0[0][2].w, l(2)
if_nz r0.w
ushr r1.xyzw, r2.zxwy, l(16, 16, 16, 16)
bfi r2.xyzw, l(16, 16, 16, 16), l(16, 16, 16, 16), r2.xyzw, r1.ywxz
mov r1.xy, r2.zwzz
else
mov r1.xy, r2.zwzz
endif
if_nz r0.z
ishl r4.xyzw, r3.xyzw, l(8, 8, 8, 8)
and r4.xyzw, r4.xyzw, l(0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00)
ushr r5.xyzw, r3.xyzw, l(8, 8, 8, 8)
and r5.xyzw, r5.xyzw, l(0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff)
iadd r3.xyzw, r4.xyzw, r5.xyzw
endif
if_nz r0.w
ushr r4.xyzw, r3.xyzw, l(16, 16, 16, 16)
bfi r4.xyzw, l(16, 16, 16, 16), l(16, 16, 16, 16), r3.xyzw, r4.xyzw
mov r1.zw, r4.xxxz
mov r2.zw, r4.yyyw
else
mov r1.zw, r3.xxxz
mov r2.zw, r3.yyyw
endif
ishl r3.xyzw, r2.xyzw, l(1, 1, 1, 1)
and r3.xyzw, r3.xyzw, l(0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa)
ushr r4.xyzw, r2.xyzw, l(1, 1, 1, 1)
and r4.xyzw, r4.xyzw, l(0x55555555, 0x55555555, 0x55555555, 0x55555555)
iadd r3.xyzw, r3.xyzw, r4.xyzw
ushr r4.xyzw, r3.xyzw, l(1, 1, 1, 1)
and r4.xyzw, r4.xyzw, l(0x55555555, 0x55555555, 0x55555555, 0x55555555)
xor r3.xyzw, r3.xyzw, r4.xyzw
not r2.xyzw, r2.xyzw
ishl r4.xyzw, r2.xyzw, l(1, 1, 1, 1)
and r4.xyzw, r4.xyzw, l(0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa)
xor r2.xyzw, r2.xyzw, r4.xyzw
bfi r4.xyzw, l(5, 5, 5, 5), l(23, 23, 23, 23), r1.xyzw, l(0, 0, 0, 0)
ishl r5.xyzw, r1.xyzw, l(18, 18, 18, 18)
and r5.xyzw, r5.xyzw, l(0x00700000, 0x00700000, 0x00700000, 0x00700000)
iadd r4.xyzw, r4.xyzw, r5.xyzw
ishl r5.xyzw, r1.xyzw, l(7, 7, 7, 7)
and r6.xyzw, r5.xyzw, l(0x0003f000, 0x0003f000, 0x0003f000, 0x0003f000)
iadd r4.xyzw, r4.xyzw, r6.xyzw
ishl r6.xyzw, r1.xyzw, l(1, 1, 1, 1)
and r6.xyzw, r6.xyzw, l(3072, 3072, 3072, 3072)
iadd r4.xyzw, r4.xyzw, r6.xyzw
ushr r6.xyzw, r1.xyzw, l(8, 8, 8, 8)
and r6.xyzw, r6.xyzw, l(248, 248, 248, 248)
iadd r4.xyzw, r4.xyzw, r6.xyzw
ubfe r6.xyzw, l(3, 3, 3, 3), l(13, 13, 13, 13), r1.xyzw
iadd r4.xyzw, r4.xyzw, r6.xyzw
and r5.xyzw, r5.xyzw, l(0x0f800000, 0x0f800000, 0x0f800000, 0x0f800000)
ishl r6.xyzw, r1.xyzw, l(2, 2, 2, 2)
and r6.xyzw, r6.xyzw, l(0x00700000, 0x00700000, 0x00700000, 0x00700000)
iadd r5.xyzw, r5.xyzw, r6.xyzw
ushr r6.xyzw, r1.xyzw, l(9, 9, 9, 9)
and r6.xyzw, r6.xyzw, l(0x0003f000, 0x0003f000, 0x0003f000, 0x0003f000)
iadd r5.xyzw, r5.xyzw, r6.xyzw
ushr r6.xyzw, r1.xyzw, l(15, 15, 15, 15)
and r6.xyzw, r6.xyzw, l(3072, 3072, 3072, 3072)
iadd r5.xyzw, r5.xyzw, r6.xyzw
ushr r6.xyzw, r1.xyzw, l(24, 24, 24, 24)
and r6.xyzw, r6.xyzw, l(248, 248, 248, 248)
iadd r5.xyzw, r5.xyzw, r6.xyzw
ushr r6.xyzw, r1.xyzw, l(29, 29, 29, 29)
iadd r5.xyzw, r5.xyzw, r6.xyzw
and r6.xyzw, r1.xyzw, l(0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff)
ushr r1.xyzw, r1.xyzw, l(16, 16, 16, 16)
uge r1.xyzw, r1.xyzw, r6.xyzw
ishl r0.xy, r0.xyxx, l(2, 2, 0, 0)
ishl r0.x, r0.x, l(2)
imad r0.z, vThreadID.z, CB0[0][1].y, r0.y
imad r0.x, r0.z, CB0[0][0].w, r0.x
iadd r0.x, r0.x, CB0[0][0].z
not r6.xyzw, r3.xyzw
ushr r7.xyzw, r2.xyzw, l(1, 1, 1, 1)
and r8.xyzw, r2.xyzw, r7.xyzw
and r8.xyzw, r8.xyzw, l(0x55555555, 0x55555555, 0x55555555, 0x55555555)
or r7.xyzw, r2.xyzw, r7.xyzw
and r7.xyzw, r7.xyzw, l(0x55555555, 0x55555555, 0x55555555, 0x55555555)
mov r0.z, CB0[0][1].y
mov r0.w, r0.y
mov r9.x, r0.x
mov r9.y, l(0)
loop
uge r9.z, r9.y, l(4)
breakc_nz r9.z
bfi r10.xyzw, l(29, 29, 29, 29), l(3, 3, 3, 3), r9.yyyy, l(0, 2, 4, 6)
ushr r11.xyzw, r6.xxxx, r10.xyzw
and r11.xyzw, r11.xyzw, l(3, 3, 3, 3)
ushr r12.xyzw, r3.xxxx, r10.xyzw
and r12.xyzw, r12.xyzw, l(3, 3, 3, 3)
imul null, r12.xyzw, r5.xxxx, r12.xyzw
imad r11.xyzw, r11.xyzw, r4.xxxx, r12.xyzw
and r12.xyzw, r11.xyzw, l(1023, 1023, 1023, 1023)
udiv r12.xyzw, null, r12.xyzw, l(3, 3, 3, 3)
ubfe r13.xyzw, l(10, 10, 10, 10), l(10, 10, 10, 10), r11.xyzw
udiv r13.xyzw, null, r13.xyzw, l(3, 3, 3, 3)
ishl r13.xyzw, r13.xyzw, l(8, 8, 8, 8)
or r12.xyzw, r12.xyzw, r13.xyzw
ushr r11.xyzw, r11.xyzw, l(20, 20, 20, 20)
udiv r11.xyzw, null, r11.xyzw, l(3, 3, 3, 3)
ishl r11.xyzw, r11.xyzw, l(16, 16, 16, 16)
or r11.xyzw, r11.xyzw, r12.xyzw
ushr r12.xyzw, r6.yyyy, r10.xyzw
and r12.xyzw, r12.xyzw, l(3, 3, 3, 3)
ushr r13.xyzw, r3.yyyy, r10.xyzw
and r13.xyzw, r13.xyzw, l(3, 3, 3, 3)
imul null, r13.xyzw, r5.yyyy, r13.xyzw
imad r12.xyzw, r12.xyzw, r4.yyyy, r13.xyzw
and r13.xyzw, r12.xyzw, l(1023, 1023, 1023, 1023)
udiv r13.xyzw, null, r13.xyzw, l(3, 3, 3, 3)
ubfe r14.xyzw, l(10, 10, 10, 10), l(10, 10, 10, 10), r12.xyzw
udiv r14.xyzw, null, r14.xyzw, l(3, 3, 3, 3)
ishl r14.xyzw, r14.xyzw, l(8, 8, 8, 8)
or r13.xyzw, r13.xyzw, r14.xyzw
ushr r12.xyzw, r12.xyzw, l(20, 20, 20, 20)
udiv r12.xyzw, null, r12.xyzw, l(3, 3, 3, 3)
ishl r12.xyzw, r12.xyzw, l(16, 16, 16, 16)
or r12.xyzw, r12.xyzw, r13.xyzw
ushr r13.xyzw, r6.zzzz, r10.xyzw
and r13.xyzw, r13.xyzw, l(3, 3, 3, 3)
ushr r14.xyzw, r3.zzzz, r10.xyzw
and r14.xyzw, r14.xyzw, l(3, 3, 3, 3)
imul null, r14.xyzw, r5.zzzz, r14.xyzw
imad r13.xyzw, r13.xyzw, r4.zzzz, r14.xyzw
and r14.xyzw, r13.xyzw, l(1023, 1023, 1023, 1023)
udiv r14.xyzw, null, r14.xyzw, l(3, 3, 3, 3)
ubfe r15.xyzw, l(10, 10, 10, 10), l(10, 10, 10, 10), r13.xyzw
udiv r15.xyzw, null, r15.xyzw, l(3, 3, 3, 3)
ishl r15.xyzw, r15.xyzw, l(8, 8, 8, 8)
or r14.xyzw, r14.xyzw, r15.xyzw
ushr r13.xyzw, r13.xyzw, l(20, 20, 20, 20)
udiv r13.xyzw, null, r13.xyzw, l(3, 3, 3, 3)
ishl r13.xyzw, r13.xyzw, l(16, 16, 16, 16)
or r13.xyzw, r13.xyzw, r14.xyzw
ushr r14.xyzw, r6.wwww, r10.xyzw
and r14.xyzw, r14.xyzw, l(3, 3, 3, 3)
ushr r15.xyzw, r3.wwww, r10.xyzw
and r15.xyzw, r15.xyzw, l(3, 3, 3, 3)
imul null, r15.xyzw, r5.wwww, r15.xyzw
imad r14.xyzw, r14.xyzw, r4.wwww, r15.xyzw
and r15.xyzw, r14.xyzw, l(1023, 1023, 1023, 1023)
udiv r15.xyzw, null, r15.xyzw, l(3, 3, 3, 3)
ubfe r16.xyzw, l(10, 10, 10, 10), l(10, 10, 10, 10), r14.xyzw
udiv r16.xyzw, null, r16.xyzw, l(3, 3, 3, 3)
ishl r16.xyzw, r16.xyzw, l(8, 8, 8, 8)
or r15.xyzw, r15.xyzw, r16.xyzw
ushr r14.xyzw, r14.xyzw, l(20, 20, 20, 20)
udiv r14.xyzw, null, r14.xyzw, l(3, 3, 3, 3)
ishl r14.xyzw, r14.xyzw, l(16, 16, 16, 16)
or r14.xyzw, r14.xyzw, r15.xyzw
or r11.xyzw, r11.xyzw, l(0xff000000, 0xff000000, 0xff000000, 0xff000000)
or r12.xyzw, r12.xyzw, l(0xff000000, 0xff000000, 0xff000000, 0xff000000)
or r13.xyzw, r13.xyzw, l(0xff000000, 0xff000000, 0xff000000, 0xff000000)
or r14.xyzw, r14.xyzw, l(0xff000000, 0xff000000, 0xff000000, 0xff000000)
iadd r15.xyzw, r10.xyzw, l(1, 1, 1, 1)
ushr r16.xyzw, r2.xxxx, r10.xyzw
and r16.xyzw, r16.xyzw, l(1, 1, 1, 1)
ushr r17.xyzw, r2.xxxx, r15.xyzw
and r17.xyzw, r17.xyzw, l(1, 1, 1, 1)
imul null, r17.xyzw, r5.xxxx, r17.xyzw
imad r16.xyzw, r16.xyzw, r4.xxxx, r17.xyzw
ushr r17.xyzw, r8.xxxx, r10.xyzw
and r18.xyzw, r17.xyzw, l(1, 1, 1, 1)
ubfe r18.xyzw, l(9, 9, 9, 9), r18.xyzw, r16.xyzw
bfi r19.xyzw, l(1, 1, 1, 1), l(0, 0, 0, 0), r17.xyzw, l(10, 10, 10, 10)
ubfe r19.xyzw, l(9, 9, 9, 9), r19.xyzw, r16.xyzw
ishl r19.xyzw, r19.xyzw, l(8, 8, 8, 8)
iadd r18.xyzw, r18.xyzw, r19.xyzw
bfi r17.xyzw, l(2, 2, 2, 2), l(0, 0, 0, 0), r17.xyzw, l(20, 20, 20, 20)
ushr r16.xyzw, r16.xyzw, r17.xyzw
ishl r16.xyzw, r16.xyzw, l(16, 16, 16, 16)
iadd r16.xyzw, r16.xyzw, r18.xyzw
ushr r17.xyzw, r7.xxxx, r10.xyzw
and r17.xyzw, r17.xyzw, l(1, 1, 1, 1)
imad r16.xyzw, r17.xyzw, l(0xff000000, 0xff000000, 0xff000000, 0xff000000), r16.xyzw
ushr r17.xyzw, r2.yyyy, r10.xyzw
and r17.xyzw, r17.xyzw, l(1, 1, 1, 1)
ushr r18.xyzw, r2.yyyy, r15.xyzw
and r18.xyzw, r18.xyzw, l(1, 1, 1, 1)
imul null, r18.xyzw, r5.yyyy, r18.xyzw
imad r17.xyzw, r17.xyzw, r4.yyyy, r18.xyzw
ushr r18.xyzw, r8.yyyy, r10.xyzw
and r19.xyzw, r18.xyzw, l(1, 1, 1, 1)
ubfe r19.xyzw, l(9, 9, 9, 9), r19.xyzw, r17.xyzw
bfi r20.xyzw, l(1, 1, 1, 1), l(0, 0, 0, 0), r18.xyzw, l(10, 10, 10, 10)
ubfe r20.xyzw, l(9, 9, 9, 9), r20.xyzw, r17.xyzw
ishl r20.xyzw, r20.xyzw, l(8, 8, 8, 8)
iadd r19.xyzw, r19.xyzw, r20.xyzw
bfi r18.xyzw, l(2, 2, 2, 2), l(0, 0, 0, 0), r18.xyzw, l(20, 20, 20, 20)
ushr r17.xyzw, r17.xyzw, r18.xyzw
ishl r17.xyzw, r17.xyzw, l(16, 16, 16, 16)
iadd r17.xyzw, r17.xyzw, r19.xyzw
ushr r18.xyzw, r7.yyyy, r10.xyzw
and r18.xyzw, r18.xyzw, l(1, 1, 1, 1)
imad r17.xyzw, r18.xyzw, l(0xff000000, 0xff000000, 0xff000000, 0xff000000), r17.xyzw
ushr r18.xyzw, r2.zzzz, r10.xyzw
and r18.xyzw, r18.xyzw, l(1, 1, 1, 1)
ushr r19.xyzw, r2.zzzz, r15.xyzw
and r19.xyzw, r19.xyzw, l(1, 1, 1, 1)
imul null, r19.xyzw, r5.zzzz, r19.xyzw
imad r18.xyzw, r18.xyzw, r4.zzzz, r19.xyzw
ushr r19.xyzw, r8.zzzz, r10.xyzw
and r20.xyzw, r19.xyzw, l(1, 1, 1, 1)
ubfe r20.xyzw, l(9, 9, 9, 9), r20.xyzw, r18.xyzw
bfi r21.xyzw, l(1, 1, 1, 1), l(0, 0, 0, 0), r19.xyzw, l(10, 10, 10, 10)
ubfe r21.xyzw, l(9, 9, 9, 9), r21.xyzw, r18.xyzw
ishl r21.xyzw, r21.xyzw, l(8, 8, 8, 8)
iadd r20.xyzw, r20.xyzw, r21.xyzw
bfi r19.xyzw, l(2, 2, 2, 2), l(0, 0, 0, 0), r19.xyzw, l(20, 20, 20, 20)
ushr r18.xyzw, r18.xyzw, r19.xyzw
ishl r18.xyzw, r18.xyzw, l(16, 16, 16, 16)
iadd r18.xyzw, r18.xyzw, r20.xyzw
ushr r19.xyzw, r7.zzzz, r10.xyzw
and r19.xyzw, r19.xyzw, l(1, 1, 1, 1)
imad r18.xyzw, r19.xyzw, l(0xff000000, 0xff000000, 0xff000000, 0xff000000), r18.xyzw
ushr r19.xyzw, r2.wwww, r10.xyzw
and r19.xyzw, r19.xyzw, l(1, 1, 1, 1)
ushr r15.xyzw, r2.wwww, r15.xyzw
and r15.xyzw, r15.xyzw, l(1, 1, 1, 1)
imul null, r15.xyzw, r5.wwww, r15.xyzw
imad r15.xyzw, r19.xyzw, r4.wwww, r15.xyzw
ushr r19.xyzw, r8.wwww, r10.xyzw
and r20.xyzw, r19.xyzw, l(1, 1, 1, 1)
ubfe r20.xyzw, l(9, 9, 9, 9), r20.xyzw, r15.xyzw
bfi r21.xyzw, l(1, 1, 1, 1), l(0, 0, 0, 0), r19.xyzw, l(10, 10, 10, 10)
ubfe r21.xyzw, l(9, 9, 9, 9), r21.xyzw, r15.xyzw
ishl r21.xyzw, r21.xyzw, l(8, 8, 8, 8)
iadd r20.xyzw, r20.xyzw, r21.xyzw
bfi r19.xyzw, l(2, 2, 2, 2), l(0, 0, 0, 0), r19.xyzw, l(20, 20, 20, 20)
ushr r15.xyzw, r15.xyzw, r19.xyzw
ishl r15.xyzw, r15.xyzw, l(16, 16, 16, 16)
iadd r15.xyzw, r15.xyzw, r20.xyzw
ushr r10.xyzw, r7.wwww, r10.xyzw
and r10.xyzw, r10.xyzw, l(1, 1, 1, 1)
imad r10.xyzw, r10.xyzw, l(0xff000000, 0xff000000, 0xff000000, 0xff000000), r15.xyzw
movc r11.xyzw, r1.xxxx, r16.xyzw, r11.xyzw
store_raw U0[0].xyzw, r9.x, r11.xyzw
iadd r11.xyz, r9.xxxx, l(16, 32, 48, 0)
movc r12.xyzw, r1.yyyy, r17.xyzw, r12.xyzw
store_raw U0[0].xyzw, r11.x, r12.xyzw
movc r12.xyzw, r1.zzzz, r18.xyzw, r13.xyzw
store_raw U0[0].xyzw, r11.y, r12.xyzw
movc r10.xyzw, r1.wwww, r10.xyzw, r14.xyzw
store_raw U0[0].xyzw, r11.z, r10.xyzw
iadd r0.w, r0.w, l(1)
uge r9.z, r0.w, r0.z
if_nz r9.z
ret
endif
iadd r9.x, r9.x, CB0[0][0].w
iadd r9.y, r9.y, l(1)
endloop
ret
// Approximately 360 instruction slots used

View File

@ -73,4 +73,74 @@ uint4 XeFloat20e4To32(uint4 f24u32) {
return (((exponent + 112u) << 23u) | (mantissa << 3u)) * uint4(f24u32 != 0u); return (((exponent + 112u) << 23u) | (mantissa << 3u)) * uint4(f24u32 != 0u);
} }
// Sorts the color indices of four DXT3/DXT5 or DXT1 opaque blocks so they can
// be used as the weights for the second endpoint, from 0 to 3. To get the
// weights for the first endpoint, apply bitwise NOT to the result.
uint4 XeDXTHighColorWeights(uint4 codes) {
// Initially 00 = 3:0, 01 = 0:3, 10 = 2:1, 11 = 1:2.
// Swap bits. 00 = 3:0, 01 = 2:1, 10 = 0:3, 11 = 1:2.
codes = ((codes & 0x55555555u) << 1u) | ((codes & 0xAAAAAAAAu) >> 1u);
// Swap 10 and 11. 00 = 3:0, 01 = 2:1, 10 = 1:2, 11 = 0:3.
return codes ^ ((codes & 0xAAAAAAAAu) >> 1u);
}
// Converts endpoint RGB (first in the low 16 bits, second in the high) of four
// DXT blocks to 8-bit, with 2 unused bits between each component to allow for
// overflow when multiplying by values up to 3 (so multiplication can be done
// for all components at once).
void XeDXTColorEndpointsTo8In10(uint4 rgb_565, out uint4 rgb_10b_low,
out uint4 rgb_10b_high) {
// Converting 5:6:5 to 8:8:8 similar to how Compressonator does that.
// https://github.com/GPUOpen-Tools/Compressonator/blob/master/Compressonator/Source/Codec/DXTC/Codec_DXTC_RGBA.cpp#L429
rgb_10b_low = ((rgb_565 & 31u) << 23u) |
((rgb_565 & (7u << 2u)) << (20u - 2u)) |
((rgb_565 & (63u << 5u)) << (12u - 5u)) |
((rgb_565 & (3u << 9u)) << (10u - 9u)) |
((rgb_565 & (31u << 11u)) >> (11u - 3u)) |
((rgb_565 & (7u << 13u)) >> 13u);
rgb_10b_high = ((rgb_565 & (31u << 16u)) << (23u - 16u)) |
((rgb_565 & (7u << 18u)) << (20u - 18u)) |
((rgb_565 & (63u << 21u)) >> (21u - 12u)) |
((rgb_565 & (3u << 25u)) >> (25u - 10u)) |
((rgb_565 & (31u << 27u)) >> (27u - 3u)) |
((rgb_565 & (7u << 29u)) >> 29u);
}
// Gets the colors of one row of four DXT opaque blocks. Endpoint colors can be
// obtained using XeDXTColorEndpointsTo8In10 (8 bits with 2 bits of free space
// between each), weights can be obtained using XeDXTHighColorWeights. Alpha is
// set to 0 in the result. weights_shift is 0 for the first row, 8 for the
// second, 16 for the third, and 24 for the fourth.
void XeDXTFourBlocksRowToRGB8(uint4 rgb_10b_low, uint4 rgb_10b_high,
uint4 weights_high, uint weights_shift,
out uint4 row_0, out uint4 row_1,
out uint4 row_2, out uint4 row_3) {
uint4 weights_low = ~weights_high;
uint4 weights_shifts = weights_shift + uint4(0u, 2u, 4u, 6u);
uint4 block_row_10b_3x =
((weights_low.xxxx >> weights_shifts) & 3u) * rgb_10b_low.x +
((weights_high.xxxx >> weights_shifts) & 3u) * rgb_10b_high.x;
row_0 = ((block_row_10b_3x & 1023u) / 3u) |
((((block_row_10b_3x >> 10u) & 1023u) / 3u) << 8u) |
(((block_row_10b_3x >> 20u) / 3u) << 16u);
block_row_10b_3x =
((weights_low.yyyy >> weights_shifts) & 3u) * rgb_10b_low.y +
((weights_high.yyyy >> weights_shifts) & 3u) * rgb_10b_high.y;
row_1 = ((block_row_10b_3x & 1023u) / 3u) |
((((block_row_10b_3x >> 10u) & 1023u) / 3u) << 8u) |
(((block_row_10b_3x >> 20u) / 3u) << 16u);
block_row_10b_3x =
((weights_low.zzzz >> weights_shifts) & 3u) * rgb_10b_low.z +
((weights_high.zzzz >> weights_shifts) & 3u) * rgb_10b_high.z;
row_2 = ((block_row_10b_3x & 1023u) / 3u) |
((((block_row_10b_3x >> 10u) & 1023u) / 3u) << 8u) |
(((block_row_10b_3x >> 20u) / 3u) << 16u);
block_row_10b_3x =
((weights_low.wwww >> weights_shifts) & 3u) * rgb_10b_low.w +
((weights_high.wwww >> weights_shifts) & 3u) * rgb_10b_high.w;
row_3 = ((block_row_10b_3x & 1023u) / 3u) |
((((block_row_10b_3x >> 10u) & 1023u) / 3u) << 8u) |
(((block_row_10b_3x >> 20u) / 3u) << 16u);
}
#endif // XENIA_GPU_D3D12_SHADERS_PIXEL_FORMATS_HLSLI_ #endif // XENIA_GPU_D3D12_SHADERS_PIXEL_FORMATS_HLSLI_

View File

@ -1,3 +1,4 @@
#include "pixel_formats.hlsli"
#include "texture_copy.hlsli" #include "texture_copy.hlsli"
// http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf // http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf
@ -13,11 +14,11 @@
// II JJ KK LL // II JJ KK LL
// MM NN OO PP // MM NN OO PP
void XeCTX1FourBlocksRowToR8G8(uint4 weights_high, uint weights_shift, void XeCTX1FourBlocksRowToR8G8(uint4 end_low_rr00gg00, uint4 end_high_rr00gg00,
uint4 end_low_rr00gg00, uint4 end_high_rr00gg00, uint4 weights_high, uint weights_shift,
out uint4 row_01, out uint4 row_23) { out uint4 row_01, out uint4 row_23) {
uint4 weights_low = ~weights_high; uint4 weights_low = ~weights_high;
uint4 weights_shifts = uint4(0u, 2u, 4u, 6u) + weights_shift; uint4 weights_shifts = weights_shift + uint4(0u, 2u, 4u, 6u);
uint4 row_3aaaa = uint4 row_3aaaa =
((weights_low >> weights_shifts.x) & 3u) * end_low_rr00gg00 + ((weights_low >> weights_shifts.x) & 3u) * end_low_rr00gg00 +
((weights_high >> weights_shifts.x) & 3u) * end_high_rr00gg00; ((weights_high >> weights_shifts.x) & 3u) * end_high_rr00gg00;
@ -63,15 +64,6 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
blocks_01 = XeByteSwap(blocks_01, xe_texture_copy_endianness); blocks_01 = XeByteSwap(blocks_01, xe_texture_copy_endianness);
blocks_23 = XeByteSwap(blocks_23, xe_texture_copy_endianness); blocks_23 = XeByteSwap(blocks_23, xe_texture_copy_endianness);
// Sort the color indices so they can be used as weights for the second
// endpoint. Initially 00 = 3:0, 01 = 0:3, 10 = 2:1, 11 = 1:2.
uint4 weights_high = uint4(blocks_01.yw, blocks_23.yw);
// Swap bits. 00 = 3:0, 01 = 2:1, 10 = 0:3, 11 = 1:2.
weights_high = ((weights_high & 0x55555555u) << 1u) |
((weights_high & 0xAAAAAAAAu) >> 1u);
// Swap 10 and 11. 00 = 3:0, 01 = 2:1, 10 = 1:2, 11 = 0:3.
weights_high ^= ((weights_high & 0xAAAAAAAAu) >> 1u);
// Unpack the endpoints as: // Unpack the endpoints as:
// 0x00g000r0 0x00g100r1 0x00g200r2 0x00g300r3 // 0x00g000r0 0x00g100r1 0x00g200r2 0x00g300r3
// 0x00G000R0 0x00G100R1 0x00G200R2 0x00G300R3 // 0x00G000R0 0x00G100R1 0x00G200R2 0x00G300R3
@ -82,6 +74,10 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
uint4 end_high_rr00gg00 = uint4 end_high_rr00gg00 =
((end_packed & 0xFF0000u) >> 16u) | ((end_packed & 0xFF000000u) >> 8u); ((end_packed & 0xFF0000u) >> 16u) | ((end_packed & 0xFF000000u) >> 8u);
// Sort the color indices so they can be used as weights for the second
// endpoint.
uint4 weights_high = XeDXTHighColorWeights(uint4(blocks_01.yw, blocks_23.yw));
// Uncompress and write the rows. // Uncompress and write the rows.
uint3 texel_index_host = block_index << uint3(2u, 2u, 0u); uint3 texel_index_host = block_index << uint3(2u, 2u, 0u);
uint texel_offset_host = XeTextureHostLinearOffset( uint texel_offset_host = XeTextureHostLinearOffset(
@ -89,8 +85,8 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) {
xe_texture_copy_host_pitch, 2u) + xe_texture_copy_host_base; xe_texture_copy_host_pitch, 2u) + xe_texture_copy_host_base;
for (uint i = 0u; i < 4u; ++i) { for (uint i = 0u; i < 4u; ++i) {
uint4 row_01, row_23; uint4 row_01, row_23;
XeCTX1FourBlocksRowToR8G8(weights_high, i * 8u, end_low_rr00gg00, XeCTX1FourBlocksRowToR8G8(end_low_rr00gg00, end_high_rr00gg00, weights_high,
end_high_rr00gg00, row_01, row_23); i * 8u, row_01, row_23);
xe_texture_copy_dest.Store4(texel_offset_host, row_01); xe_texture_copy_dest.Store4(texel_offset_host, row_01);
xe_texture_copy_dest.Store4(texel_offset_host + 16u, row_23); xe_texture_copy_dest.Store4(texel_offset_host + 16u, row_23);
if (++texel_index_host.y >= xe_texture_copy_size_texels.y) { if (++texel_index_host.y >= xe_texture_copy_size_texels.y) {

View File

@ -0,0 +1,119 @@
#include "pixel_formats.hlsli"
#include "texture_copy.hlsli"
// Decompresses one row (4 texels) of four DXT1 blocks that use the
// punchthrough (color0 <= color1) mode, producing 0xAABBGGRR texels.
// rgb_10b_low/rgb_10b_high: endpoint 0 / endpoint 1 colors of the 4 blocks,
//     with the 8-bit R, G, B components stored in 10-bit lanes so the sum of
//     two components cannot carry into the neighboring lane.
// weights: the pre-sorted 2-bit texel codes of each block, where the low bit
//     of a code is the weight of endpoint 0 and the high bit is the weight of
//     endpoint 1 (both bits set = the (RGB0+RGB1)/2 texel, neither set = the
//     transparent black texel).
// weights_shift: bit offset of this row's 8 code bits inside the 32-bit code
//     word (0/8/16/24 for rows 0-3). NOTE(review): declared uint4 though the
//     caller passes a scalar that HLSL broadcasts - presumably meant to be
//     uint as in the CTX1 shader; confirm.
// row_0..row_3: the 4 output R8G8B8A8 texels of blocks 0-3 respectively.
void XeDXT1FourTransBlocksRowToRGBA8(uint4 rgb_10b_low, uint4 rgb_10b_high,
                                     uint4 weights, uint4 weights_shift,
                                     out uint4 row_0, out uint4 row_1,
                                     out uint4 row_2, out uint4 row_3) {
  // Bit positions of the endpoint-0 and endpoint-1 weight bits of the row's
  // 4 texels.
  uint4 weights_shifts_low = weights_shift + uint4(0u, 2u, 4u, 6u);
  uint4 weights_shifts_high = weights_shifts_low + 1u;
  // Whether the texel is (RGB0+RGB1)/2 - divide the weighted sum by 2 (shift
  // right by 1) if it is.
  uint4 weights_sums_log2 = weights & ((weights & 0xAAAAAAAAu) >> 1u);
  // Whether the texel is opaque.
  uint4 weights_alpha =
      (weights & 0x55555555u) | ((weights & 0xAAAAAAAAu) >> 1u);
  // Block 0: weighted sum of the two endpoints in the 10-bit lanes.
  uint4 block_rgb_10b =
      ((weights.xxxx >> weights_shifts_low) & 1u) * rgb_10b_low.x +
      ((weights.xxxx >> weights_shifts_high) & 1u) * rgb_10b_high.x;
  uint4 block_rgb_shift = (weights_sums_log2.xxxx >> weights_shifts_low) & 1u;
  // Repack the 10-bit lanes into 8-bit R, G, B (halving 1:1 sums via the
  // per-texel shift) and set alpha to 0xFF for opaque texels only.
  row_0 = ((block_rgb_10b & 1023u) >> block_rgb_shift) +
          ((((block_rgb_10b >> 10u) & 1023u) >> block_rgb_shift) << 8u) +
          (((block_rgb_10b >> 20u) >> block_rgb_shift) << 16u) +
          (((weights_alpha.xxxx >> weights_shifts_low) & 1u) * 0xFF000000u);
  // Block 1.
  block_rgb_10b =
      ((weights.yyyy >> weights_shifts_low) & 1u) * rgb_10b_low.y +
      ((weights.yyyy >> weights_shifts_high) & 1u) * rgb_10b_high.y;
  block_rgb_shift = (weights_sums_log2.yyyy >> weights_shifts_low) & 1u;
  row_1 = ((block_rgb_10b & 1023u) >> block_rgb_shift) +
          ((((block_rgb_10b >> 10u) & 1023u) >> block_rgb_shift) << 8u) +
          (((block_rgb_10b >> 20u) >> block_rgb_shift) << 16u) +
          (((weights_alpha.yyyy >> weights_shifts_low) & 1u) * 0xFF000000u);
  // Block 2.
  block_rgb_10b =
      ((weights.zzzz >> weights_shifts_low) & 1u) * rgb_10b_low.z +
      ((weights.zzzz >> weights_shifts_high) & 1u) * rgb_10b_high.z;
  block_rgb_shift = (weights_sums_log2.zzzz >> weights_shifts_low) & 1u;
  row_2 = ((block_rgb_10b & 1023u) >> block_rgb_shift) +
          ((((block_rgb_10b >> 10u) & 1023u) >> block_rgb_shift) << 8u) +
          (((block_rgb_10b >> 20u) >> block_rgb_shift) << 16u) +
          (((weights_alpha.zzzz >> weights_shifts_low) & 1u) * 0xFF000000u);
  // Block 3.
  block_rgb_10b =
      ((weights.wwww >> weights_shifts_low) & 1u) * rgb_10b_low.w +
      ((weights.wwww >> weights_shifts_high) & 1u) * rgb_10b_high.w;
  block_rgb_shift = (weights_sums_log2.wwww >> weights_shifts_low) & 1u;
  row_3 = ((block_rgb_10b & 1023u) >> block_rgb_shift) +
          ((((block_rgb_10b >> 10u) & 1023u) >> block_rgb_shift) << 8u) +
          (((block_rgb_10b >> 20u) >> block_rgb_shift) << 16u) +
          (((weights_alpha.wwww >> weights_shifts_low) & 1u) * 0xFF000000u);
}
// Each thread decompresses four horizontally adjacent DXT1 blocks into a
// 16x4 strip of R8G8B8A8 texels, decoding every block in both the opaque
// (4-color) and the punchthrough (3-color + transparent) modes and selecting
// per block based on the endpoint ordering.
[numthreads(8, 32, 1)]
void main(uint3 xe_thread_id : SV_DispatchThreadID) {
  // 1 thread = 4 DXT1 (8bpb) blocks to 16x4 R8G8B8A8 texels.
  uint3 block_index = xe_thread_id;
  block_index.x <<= 2u;
  [branch] if (any(block_index >= xe_texture_copy_size_blocks)) {
    return;
  }
  // 8 bytes per guest block: .xz = the two 16-bit endpoint colors packed in
  // one 32-bit word, .yw = the 32-bit texel code word.
  uint4 block_offsets_guest =
      XeTextureCopyGuestBlockOffsets(block_index, 8u, 3u);
  uint4 blocks_01 = uint4(xe_texture_copy_source.Load2(block_offsets_guest.x),
                          xe_texture_copy_source.Load2(block_offsets_guest.y));
  uint4 blocks_23 = uint4(xe_texture_copy_source.Load2(block_offsets_guest.z),
                          xe_texture_copy_source.Load2(block_offsets_guest.w));
  blocks_01 = XeByteSwap(blocks_01, xe_texture_copy_endianness);
  blocks_23 = XeByteSwap(blocks_23, xe_texture_copy_endianness);
  // 32-bit code words of the 4 blocks (2 bits per texel, 8 bits per row).
  uint4 codes = uint4(blocks_01.yw, blocks_23.yw);
  // Sort the color indices so they can be used as weights for the second
  // endpoint in the opaque mode.
  uint4 weights_opaque_high = XeDXTHighColorWeights(codes);
  // Sort the color indices so bits of them can be used as endpoint weights, and
  // AND of those bits can be used as the right shift amount for mixing the two
  // colors in the punchthrough mode.
  // Initially 00 = 1:0, 01 = 0:1, 10 = 1:1, 11 = 0:0.
  // 00 = 0:0, 01 = 1:1, 10 = 0:1, 11 = 1:0.
  uint4 weights_trans = ~codes;
  // 00 = 0:0, 01 = 1:0, 10 = 0:1, 11 = 1:1.
  weights_trans ^= (weights_trans & 0x55555555u) << 1u;
  // Get endpoint RGB for mixing, as 8-bit components in 10-bit sequences.
  uint4 rgb_565 = uint4(blocks_01.xz, blocks_23.xz);
  uint4 rgb_10b_low, rgb_10b_high;
  XeDXTColorEndpointsTo8In10(rgb_565, rgb_10b_low, rgb_10b_high);
  // Get modes for each block: color0 (low half-word) <= color1 (high
  // half-word) selects the punchthrough mode, as in the BC1 specification.
  bool4 is_trans = (rgb_565 & 0xFFFFu) <= (rgb_565 >> 16u);
  // Uncompress and write the rows.
  uint3 texel_index_host = block_index << uint3(2u, 2u, 0u);
  uint texel_offset_host = XeTextureHostLinearOffset(
      texel_index_host, xe_texture_copy_size_texels.y,
      xe_texture_copy_host_pitch, 4u) + xe_texture_copy_host_base;
  for (uint i = 0u; i < 4u; ++i) {
    // Decode the row in the opaque mode (alpha forced to 0xFF).
    uint4 row_opaque_0, row_opaque_1, row_opaque_2, row_opaque_3;
    XeDXTFourBlocksRowToRGB8(rgb_10b_low, rgb_10b_high, weights_opaque_high,
                             i * 8u, row_opaque_0, row_opaque_1, row_opaque_2,
                             row_opaque_3);
    row_opaque_0 |= 0xFF000000u;
    row_opaque_1 |= 0xFF000000u;
    row_opaque_2 |= 0xFF000000u;
    row_opaque_3 |= 0xFF000000u;
    // Decode the same row in the punchthrough mode.
    uint4 row_trans_0, row_trans_1, row_trans_2, row_trans_3;
    XeDXT1FourTransBlocksRowToRGBA8(rgb_10b_low, rgb_10b_high, weights_trans,
                                    i * 8u, row_trans_0, row_trans_1,
                                    row_trans_2, row_trans_3);
    // Write 4 texels (16 bytes) per block, selecting the mode per block.
    xe_texture_copy_dest.Store4(texel_offset_host,
                                is_trans.x ? row_trans_0 : row_opaque_0);
    xe_texture_copy_dest.Store4(texel_offset_host + 16u,
                                is_trans.y ? row_trans_1 : row_opaque_1);
    xe_texture_copy_dest.Store4(texel_offset_host + 32u,
                                is_trans.z ? row_trans_2 : row_opaque_2);
    xe_texture_copy_dest.Store4(texel_offset_host + 48u,
                                is_trans.w ? row_trans_3 : row_opaque_3);
    // Stop at the bottom edge of textures whose height isn't a multiple of 4.
    if (++texel_index_host.y >= xe_texture_copy_size_texels.y) {
      return;
    }
    texel_offset_host += xe_texture_copy_host_pitch;
  }
}

View File

@ -34,6 +34,7 @@ namespace d3d12 {
#include "xenia/gpu/d3d12/shaders/dxbc/texture_load_ctx1_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/texture_load_ctx1_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/texture_load_depth_float_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/texture_load_depth_float_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/texture_load_depth_unorm_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/texture_load_depth_unorm_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/texture_load_dxt1_rgba8_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/texture_load_dxt3a_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/texture_load_dxt3a_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/texture_tile_32bpp_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/texture_tile_32bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/texture_tile_64bpp_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/texture_tile_64bpp_cs.h"
@ -180,6 +181,7 @@ const TextureCache::LoadModeInfo TextureCache::load_mode_info_[] = {
{texture_load_32bpb_cs, sizeof(texture_load_32bpb_cs)}, {texture_load_32bpb_cs, sizeof(texture_load_32bpb_cs)},
{texture_load_64bpb_cs, sizeof(texture_load_64bpb_cs)}, {texture_load_64bpb_cs, sizeof(texture_load_64bpb_cs)},
{texture_load_128bpb_cs, sizeof(texture_load_128bpb_cs)}, {texture_load_128bpb_cs, sizeof(texture_load_128bpb_cs)},
{texture_load_dxt1_rgba8_cs, sizeof(texture_load_dxt1_rgba8_cs)},
{texture_load_dxt3a_cs, sizeof(texture_load_dxt3a_cs)}, {texture_load_dxt3a_cs, sizeof(texture_load_dxt3a_cs)},
{texture_load_ctx1_cs, sizeof(texture_load_ctx1_cs)}, {texture_load_ctx1_cs, sizeof(texture_load_ctx1_cs)},
{texture_load_depth_unorm_cs, sizeof(texture_load_depth_unorm_cs)}, {texture_load_depth_unorm_cs, sizeof(texture_load_depth_unorm_cs)},

View File

@ -99,6 +99,7 @@ class TextureCache {
k32bpb, k32bpb,
k64bpb, k64bpb,
k128bpb, k128bpb,
kDXT1AsRGBA8,
kDXT3A, kDXT3A,
kCTX1, kCTX1,
kDepthUnorm, kDepthUnorm,