From bb24521c2b44a93e89126f33e93e9764555511da Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sat, 22 Sep 2018 15:52:07 +0300 Subject: [PATCH] [D3D12] DXT5 decompression shader (not used yet) --- .../shaders/dxbc/texture_load_ctx1_cs.cso | Bin 9760 -> 9632 bytes .../d3d12/shaders/dxbc/texture_load_ctx1_cs.h | 385 +++-- .../shaders/dxbc/texture_load_ctx1_cs.txt | 99 +- .../dxbc/texture_load_dxt1_rgba8_cs.cso | Bin 15080 -> 15632 bytes .../shaders/dxbc/texture_load_dxt1_rgba8_cs.h | 970 +++++------ .../dxbc/texture_load_dxt1_rgba8_cs.txt | 254 +-- .../dxbc/texture_load_dxt5_rgba8_cs.cso | Bin 0 -> 17068 bytes .../shaders/dxbc/texture_load_dxt5_rgba8_cs.h | 1427 +++++++++++++++++ .../dxbc/texture_load_dxt5_rgba8_cs.txt | 505 ++++++ .../gpu/d3d12/shaders/pixel_formats.hlsli | 74 +- .../d3d12/shaders/texture_load_ctx1.cs.hlsl | 29 +- .../shaders/texture_load_dxt1_rgba8.cs.hlsl | 23 +- .../shaders/texture_load_dxt5_rgba8.cs.hlsl | 79 + src/xenia/gpu/d3d12/texture_cache.cc | 2 + src/xenia/gpu/d3d12/texture_cache.h | 1 + 15 files changed, 2967 insertions(+), 881 deletions(-) create mode 100644 src/xenia/gpu/d3d12/shaders/dxbc/texture_load_dxt5_rgba8_cs.cso create mode 100644 src/xenia/gpu/d3d12/shaders/dxbc/texture_load_dxt5_rgba8_cs.h create mode 100644 src/xenia/gpu/d3d12/shaders/dxbc/texture_load_dxt5_rgba8_cs.txt create mode 100644 src/xenia/gpu/d3d12/shaders/texture_load_dxt5_rgba8.cs.hlsl diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/texture_load_ctx1_cs.cso b/src/xenia/gpu/d3d12/shaders/dxbc/texture_load_ctx1_cs.cso index fde1d8b74e4279585b8b4ca1fd4feacaa67fe558..1ec8254dba51faf082a44c8a0b70681d8426f559 100644 GIT binary patch delta 848 zcmZWnO>5Lp6uohhnU~DeX2@uVrfI(YLKg!fMJq`CsG!9vN=ressGut?bm6v8q`I;k z7cN|kU34J`O8+Uf3i#fT_m4h}N zVBEHbUg!7};i@@jU^#NI;kVK9Y|X$=FUD1G%6vsMDmsOgnv3tv_MeQIXIiW_CTIW2 zza6!ka;Eqf9#&Stj3JDHZ>6eg#;sd_{f#`+dL;KhLZqliRDO>%Jab}eLEqGJVG z82AoU*f1^jt15Y!87yEvQD{XrdZCSe{C`~?cY~*RoE+1vow=y4nR3PLy0p6KIo#Av z=;KZNj8|XlKKh{=5rs5i*uM9%pVbU^%Sn0nGVxOx72)Kflch0{K`X||#%AK6pXg?x zHn5)@^<;OJb@Ium$>$eihfeBb`pgM)shjPx@xw~$hP$k8WHmgrIi6>{%%LPONL{?s I6?*CXZ}G%~nE(I) delta 1030 zcmY*YJ!lj`6n=ZR+05SVnq5|N=Uq1Ydq%~=fQpzP;S7qH;yiLdXustGHX|xfGKng1>51RvRA}HGV-t5k~4m{q>_rCYdo0&H|w{|v`*Y9?&J$ZlT z^Y6CT8}EPm@W*?5RRe(N6u=Dm3uK!*z!upfvZtfv8@efCfZG)B;84G>IeTOM+q}0z z89%@UV!`!eE>!SiYvm@3)ClEqi5LmZ8PnDn0XR?7F2HQ3z?-EwD^)?So@ER@({}2gK0H0O$P+yw*2v5)8G1fEX(c}Wokh0_g6G4S1j zuL<4)zb~wxR8oq^QoOa|#YwX%h0KzWB+y#J=SdwOCoW0Z_$IggQf6YxbbM{n|capF4A8Q=ev0+QizG84YFq79QfXO$?^Jk7r zFPr;n;Th*w?h%J$hL+zxW3L#)Vd&%QXam250$)(|-FOoxH5cziB2bkpZS;==}>OgXmgWuS)9IDYHI1mwWj+ov z-BfG?A7)DdudWb}2RA>DaQYEOnJ@WL2@kTCT*;4vM_DWWN^D2wQ0oug**ajETfoH%h3Z`>p}p+(R@iVz1P6-7s}O1PcIM5@oB7^1 zv;O0~FK_jpyMB4I_0#UohqK>bQ}hcS)%;ysq$bHKWv<{M^KI@vyg* zvHy8esckm+ces~%r%FH6`=u;fI?Ch!O0BVT!0%Zy8f6-UMs|2L zo>A&FudndlU%Y4X`qSXXxPU(A}BlBBU%^04lBJwW*YyIwrQs(6iG% zg?-DgX)srF;j4a;XoH|;jL~7x<=qdOSE${rw34&f5YfkYMY*ZyG?=fsC`n>*^k&T- z6}i{0wGt_ggvbIM-T`nLIB4e|11`yD1RPR=vj(mxs6z{9lMWgk@`BtdG`JlB3bFu| zgpF;OW>LB9?HdljqK})?bYs>savCCs?zc2Vbgx;?Ydn%58V^W<2M1t0M*F}ck>n^| z1rMb5cOk|jRni9V;G~u!(&S>pbnVc>(4_6ar635p|&-`81#doK8MZ~Ds(h+ZV_6fDTqQAMDY&7AQ)^iv{&a`>Xu<@>On8?SOn@t zxSE1%F-aqilp}*9EEFvIdcNW!zW8&*W{su`9(~fTwB%%25$9q|S&3=Wc4tIT2NK_M zqcf|cT<>^`d=+{I;3_Z!PVp);KpFrze|4Ihg6^7STButsmo?HR@PdWFnvbnrI+!fe z4YtPa|5+De>m00$LJZ6{aCi(uA|VE}7!qUD==z37MmR;k&AD`BPVDgMdVb-KfCz38 z_A`pFhFc$v4Egv7fh_lkcjQ=%ZKN}xm#kzjpUMhaWaZM23$G=)LW>!lr{xuY7%sjQ zQ&D0RPK=S`$~yZ!i+*3~MAU^n?jwx^wCH*u(Xl^Os(|{&`F^UU$B~G7c!y+ zv8PMdX}FxmW&<`-E^Gh+h&KpiKG(ZNOm9rveTf_)xg7erlX~bEw$yS*sHc6 z@edsR701v5upltg4-;%vsjp4=3X9ZGHx{vvI*VPX`-U{sYGcUo!vz delta 2210 zcmZuzO=ufO6duVdX;-$SRh6)^6=hdagT0>}>v;kveS{EOBs33 zd-G<;w?04k{=`fmQZ+Ko|wL1THXf5#8fXraVW8#uh93<;o6EDZMZ(STEGTF_DE;P!w=RA`qSa-6LC$jjZQ8Ew~0@4XOm?1D0t2M3(CTU_WX^OD8 zmNdbUG{svEY348+C9@g&iGofT9ya2H3u7Ec>%IdEA}Y`yp-1aPDX;w`U>fSDAb zKq!K%ivSNBaS`+pig%q+-u3N8mkz>mQ+&w?rX1qKSZJqtwAiL|T=WD=n8W%g?!`&k zX@#`ib?Fwr{gzGWj~lJ>SfK_juXsA&NN^mJ`rIMjM;UioHlQddk7hFm zORegxQZ{OVP8Wt$xk#JyrKWh5x`K7or>zx#o$n8Wg-_#iE#5L3dw(gQpkv=&YMI>z zij!$#W%Mx4mjoHbxzeIXORt-S0vpNxa)0>qvPcRRORl^*T7u&hYpn!TiNC}e8WAlX z6ZFG^M{dWZcUEIfp&8A;R{O&@+M+3#(Y(9h^3uyj7Z5ty=o-kK7j5jJM#r^}>~8`4 zyEVQ?vP$*{b2$mp6{0IU2D_*VCaJ9R2jEKkEB^bi{r$1t2K(%>mzjvw1`jSLBOUNKX7kFTNM^&q2FO{rf6+j;})-> z{YCv|^=Y_k!c1z&i!xtYTv%ItY3=gL;=<|WOD`|1E?-_by_jn9nZ?z$bYk(?5PDF^ zgUc6J*G^tsTTNH5N#>0DE(tCg-CoMWD}^R;`HS{`T@8-MKZ*`!8txy$;rle)2f`88 zX}H^zmtDev#k5CBoSNx}rM1)N#(3wJt)iaQa3)V$UHX1g zZj0wv)8(a$YmcO>gT}5lpurwreqO9v!_w-)BWKd7<>m89e&JQWUl%9LZ_GG#e);rw zSI1;8UOcmO^5VtC)p6D^tQVFpEi9isySliRzP5CFWqEb^?AqM@&+nak`q`t;&K*5> z^33AO+`;7wmzK`!;-P(Ccqlz{^s9%dTN>kY)&0?@pE~j8S5o@AHjBlbDShYWH29{r zgF`;EVOvf|w9Hp@&zwVfmp(OK*|38(q&;r?m3GeEEq1mYBkzHh_E?Mk zOuA`{uOF9Ld(=NYMITi&C(MCX#@(-O*dkWr2WuGF=%eX+>tg|jFVZvnzWCU`O(V2i z^r6;VYebiDJKkyKyzrw9UChfxA8q@LM*4BdC0@)uJF_7dJh_BU182E!-8!^07dXQP zmS>M_*=}F$&!D9A-}UvK^C1qz8`;K?2kvEmlN)@32hG^mQ@Uk$OBebGWBr zFY&^zL$cTYBQJLJfrCa|8NoA8`tC3TQ|MyKvQG8Y-S=%Z?qcUzR_IUaMV zCg}Eqooy|*aL#5#(1|>7_Mwy4 zmIp@qo48qjjmxDQGvbFYv3V_}owhxf$vKR9 zh%%(JSr44wkn5b+c*ooP##YVoh+g*P*rPnLI?eG$p8d%sr8(WlZW%Qmdvh7?Ma0Y9 zc-!0LMYvcSTxZerxaxG~8ZTC-SwDPmt`Dj*cFS;mSY9&oz8XE zyclDy`*d=yjh(r~J9g%B-zTH417y^VxvnYfR(yH>HFbsT>phpeClF8DlFP=jS2@Op?_$*9URSrJECC2F<>EbbN>>v|7d<(PBr&EuzJ49~jXs88^HK zG2cEaCbQ4CcG2X(3Efc!v|PG=;~z9|u`?GQHpZNL|C!`_+*7!Yz9YlF#~RON zV-JN+_dIHv=S!r4?-(kheP%KaKP=;TD+6EbQ!e8%kgn0%;;cd&2*>)he8&TxWhp;X z&t%kV?4Fy87*&h9?1S;H1#HTtD--$9>c(7nmZ7{2*eBy&!g-c`3xA2B_cC3k`28U6 zWyFj9juzu}P2U~n+r1DTXJ%v@<9$n*(fS|p!oKObjIyVx1N6r=0}l)4)c2|58@5w3 z=)`B{Y^&t{TFVU#Jo~=?6VSNM*Te3yb+GrfzL%B>)n4|Q`FUF|I^NUd;iuz= zyuMr`r;fLMsdH@`t-H>fb8R1LK6cbJ$Hq301J2OSHL^@|t|P6^3v**$oa-nP`)iqz zW|@&@na~`czFf1v=kn2WjV|IcO`dU#x%B56IgYdAZeRLxZQXU=ZZy}?26Av^Law3J zxrXMvIRDU`7ii9Pl!-p;vCK%b%t*6LXvD{BnS0^}KNm4HR=s&X8^@SSO24Vv@pf-O z&SvD*xZ_pxnKbIP+*+4yVyyb;4ClGnUa2~+bI*5`udAZ0g-N$a*`92R0d@kd**Sf3|yk(fin9H~f<-dv3G~Ac8w(-mU zw=r&fHO;Z+-gx_pFS4nF_dGDQFU~!7`TIz1Qj!}6K8xAoKZz{_Q9&5Z0Y z&N{ib49m5Qm^-d*hh-Sgz4PMSQpdTB=f$`ngs{U3ZT z5xjNrp47ZtBuDqL+jgEE;K1iHU6*x&w+z!5a~YT6x}1h{o!Ngz>1gWAG{>5IM) zzuCDC@&9^qZQFQ$U)46d(fi`c9ud3cIVQHZuU?{F>J@xnTxZ0Ky;oPL73#+Re;8dn zV`A^s6>5dLxe;CbelhkwU9ncU-i$8tANPSz=|1*88O*aCmf`PH>*B@k+BBaW#-2;} z9v(J-8>Ih}T>9dp^JDd;rew(oBFyn~|+$`Cg%83$V01FFXOHwpRByq^OvT=RPz zuKn&LzB7kz*^0|dt?z;A?*vDx?+5tfSbwoyuhgsSI+yOA?RXL2*geRm1jlaYhEb|%Rv)6BFe-8$Y zTus7l5l?G-bk^k^Ir-slHq7($(q~2XF!q9fN*C7hcT0G==tGTt2i>e|`vDHyz`1^* zkpsqDy8C8pQ={WH@zH_aT=bzfNe4J|fU^#0=wQrcT*rMZfxoBpzUG19Z^qt_xSvq;4|O2 zxKD8H^A!HtZsYx2h%w4V2P0u)w_V_Dmtz%e;@WoAGP#f4c14`+bB$s%wM^~hq7Svc z`eFY8SLYNNxpZTPoz_9z&@rt>!J#A8XdU}H7O_UbvKCyU+|Rc)8gm22H44W5SeN~Y z^~tq!1kO3)I+tl2c%c~6ARCed8y}HHo0P+vCy3p;%{CqL)+ySzPUGcMjUwV@t^aZdDheN znzNsIjJfE2tQJP>uCG1fTQ0buzqz)(AD@ErciX5JxybM1A=BSOq8;|narD^AYJM+f zd)n`9uRbAJ;5^szJc1XmiH+yN%SFM|7`t(IijEJ+&83S0?^+`l#`=3H_w#KnTOV_f z=X)@R^<9Pe(7g7{pApIX4cJ`9pM{fXj=SgbzGD(@63zZb-~Yp7;_naf8n#1 z>s(Z%8e^~H)?DOzkKVGk34!3KAYu69g;A8!m#+Xa@{T_Ab z`iVGX5GU^!-lILTkGVd*r@_l*{JoO*3*#8UH5;gy9$#<&zmWNX)j*m#g2#rPlWcOF zY!lDN*lm+>&i}}m)~LR{8NB_se(Q?wYTM^=v#-`TiRid!T@iJD_xug!12zFxsFY=BNs7rG>qMKgFN(lWZqlU=C#l2 zYS@s?8NxXE+Stb&?1Puf(6+fW{Ef7Rjvw)z+x7h5%yT1t)ZbP6{IH+k9cQjN6QMVk z;Ti*1#~IwT_bq}> weights_shifts) & 3u) * rgb_10b_low.x + ((weights_high.xxxx >> weights_shifts) & 3u) * rgb_10b_high.x; @@ -145,7 +144,7 @@ void XeDXTFourBlocksRowToRGB8(uint4 rgb_10b_low, uint4 rgb_10b_high, // & 0x249249 = bits 0 of 24 bits of DXT5 alpha codes. // & 0x492492 = bits 1 of 24 bits of DXT5 alpha codes. -// & 0x6DB6DB = bits 2 of 24 bits of DXT5 alpha codes. +// & 0x924924 = bits 2 of 24 bits of DXT5 alpha codes. // Sorts half (24 bits) of the codes of four DXT5 alpha blocks so they can be // used as weights for the second endpoint, from 0 to 7, in alpha0 > alpha1 @@ -155,12 +154,11 @@ uint4 XeDXT5High8StepAlphaWeights(uint4 codes_24b) { // weights from 6:1 to 1:6. Need to make 001 111, and subtract 1 from 010 and // above. // Whether the bits are 000 (the first endpoint only). - uint4 is_first = ((codes_24b & 0x249249u) & ((codes_24b & 0x492492u) >> 1u) & - ((codes_24b & 0x6DB6DBu) >> 2u)) ^ 0x249249u; + uint4 is_first = ((codes_24b & 0x249249u) | ((codes_24b & 0x492492u) >> 1u) | + ((codes_24b & 0x924924u) >> 2u)) ^ 0x249249u; // Whether the bits are 001 (the second endpoint only). - uint4 is_second = - (codes_24b & 0x249249u) & (0x249249u ^ - (((codes_24b & 0x492492u) >> 1u) & ((codes_24b & 0x6DB6DBu) >> 2u))); + uint4 is_second = (codes_24b & 0x249249u) & ~((codes_24b & 0x492492u) >> 1u) & + ~((codes_24b & 0x924924u) >> 2u); // Change 000 to 001 so subtracting 1 will result in 0 (and there will never // be overflow), subtract 1, and if the code was originally 001 (the second // endpoint only), make it 111. @@ -183,20 +181,19 @@ uint4 XeDXT5High6StepAlphaWeights(uint4 codes_24b) { // 111 - constant 1. // Create 3-bit masks (111 or 000) of whether the codes represent 0 or 1 // constants to keep them 110 and 111 later. - uint4 is_constant = (codes_24b & 0x492492u) & ((codes_24b & 0x6DB6DBu) >> 1u); + uint4 is_constant = codes_24b & 0x492492u & ((codes_24b & 0x924924u) >> 1u); is_constant |= (is_constant << 1u) | (is_constant >> 1u); // Store the codes for the constants (110 or 111), or 0 if not a constant. uint4 constant_values = - ((codes_24b & 0x249249u) | (0x492492u | 0x6DB6DBu)) & is_constant; + ((codes_24b & 0x249249u) | (0x492492u | 0x924924u)) & is_constant; // Need to make 001 101, and subtract 1 from 010 and above (constants will be // handled separately later). // Whether the bits are 000 (the first endpoint only). - uint4 is_first = ((codes_24b & 0x249249u) & ((codes_24b & 0x492492u) >> 1u) & - ((codes_24b & 0x6DB6DBu) >> 2u)) ^ 0x249249u; + uint4 is_first = ((codes_24b & 0x249249u) | ((codes_24b & 0x492492u) >> 1u) | + ((codes_24b & 0x924924u) >> 2u)) ^ 0x249249u; // Whether the bits are 001 (the second endpoint only). - uint4 is_second = - (codes_24b & 0x249249u) & (0x249249u ^ - (((codes_24b & 0x492492u) >> 1u) & ((codes_24b & 0x6DB6DBu) >> 2u))); + uint4 is_second = (codes_24b & 0x249249u) & ~((codes_24b & 0x492492u) >> 1u) & + ~((codes_24b & 0x924924u) >> 2u); // Change 000 to 001 so subtracting 1 will result in 0 (and there will never // be overflow), subtract 1, and if the code was originally 001 (the second // endpoint only), make it 101. @@ -206,4 +203,41 @@ uint4 XeDXT5High6StepAlphaWeights(uint4 codes_24b) { return (codes_24b & ~is_constant) | constant_values; } +uint4 XeDXT5Four8StepBlocksRowToA8(uint4 end_low, uint4 end_high, + uint4 weights_8step, uint4 weights_6step) { + // Choose the mode. + bool4 is_6step = end_low <= end_high; + uint4 weights_high = is_6step ? weights_6step : weights_8step; + uint4 weight_max = is_6step ? (5u.xxxx) : (7u.xxxx); + // In the 6-step mode, make a mask for whether the weights are constants. + uint4 is_constant = is_6step ? (weights_6step & 0x492u & + ((weights_6step & 0x924u) >> 1u)) + : (0u).xxxx; + is_constant |= (is_constant << 1u) | (is_constant >> 1u); + // Get the weights for the first endpoint and remove constant from the + // interpolation (set weights of the endpoints to 0 for them). First need to + // zero the weights of the second endpoint so 6 or 7 won't be subtracted from + // 5 while getting the weights of the first endpoint. + weights_high &= ~is_constant; + uint4 weights_low = ((weight_max * 0x249u) - weights_high) & ~is_constant; + // Interpolate. + uint4 row = + ((end_low * (weights_low & 7u) + + end_high * (weights_high & 7u)) / weight_max) | + (((end_low * ((weights_low >> 3u) & 7u) + + end_high * ((weights_high >> 3u) & 7u)) / weight_max) << 8u) | + (((end_low * ((weights_low >> 6u) & 7u) + + end_high * ((weights_high >> 6u) & 7u)) / weight_max) << 16u) | + (((end_low * ((weights_low >> 9u) & 7u) + + end_high * ((weights_high >> 9u) & 7u)) / weight_max) << 24u); + // Get the constant values as 1 bit per pixel separated by 7 bits. + uint4 constant_values = weights_6step & is_constant; + constant_values = (constant_values & 1u) | + ((constant_values & (1u << 3u)) << (8u - 3u)) | + ((constant_values & (1u << 6u)) << (16u - 6u)) | + ((constant_values & (1u << 9u)) << (24u - 9u)); + // Add constant 1 where needed. + return row + constant_values * 0xFFu; +} + #endif // XENIA_GPU_D3D12_SHADERS_PIXEL_FORMATS_HLSLI_ diff --git a/src/xenia/gpu/d3d12/shaders/texture_load_ctx1.cs.hlsl b/src/xenia/gpu/d3d12/shaders/texture_load_ctx1.cs.hlsl index 5273d8d75..e796464b2 100644 --- a/src/xenia/gpu/d3d12/shaders/texture_load_ctx1.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/texture_load_ctx1.cs.hlsl @@ -15,22 +15,17 @@ // MM NN OO PP void XeCTX1FourBlocksRowToR8G8(uint4 end_low_rr00gg00, uint4 end_high_rr00gg00, - uint4 weights_high, uint weights_shift, - out uint4 row_01, out uint4 row_23) { + uint4 weights_high, out uint4 row_01, + out uint4 row_23) { uint4 weights_low = ~weights_high; - uint4 weights_shifts = weights_shift + uint4(0u, 2u, 4u, 6u); - uint4 row_3aaaa = - ((weights_low >> weights_shifts.x) & 3u) * end_low_rr00gg00 + - ((weights_high >> weights_shifts.x) & 3u) * end_high_rr00gg00; - uint4 row_3bbbb = - ((weights_low >> weights_shifts.y) & 3u) * end_low_rr00gg00 + - ((weights_high >> weights_shifts.y) & 3u) * end_high_rr00gg00; - uint4 row_3cccc = - ((weights_low >> weights_shifts.z) & 3u) * end_low_rr00gg00 + - ((weights_high >> weights_shifts.z) & 3u) * end_high_rr00gg00; - uint4 row_3dddd = - ((weights_low >> weights_shifts.w) & 3u) * end_low_rr00gg00 + - ((weights_high >> weights_shifts.w) & 3u) * end_high_rr00gg00; + uint4 row_3aaaa = (weights_low & 3u) * end_low_rr00gg00 + + (weights_high & 3u) * end_high_rr00gg00; + uint4 row_3bbbb = ((weights_low >> 2u) & 3u) * end_low_rr00gg00 + + ((weights_high >> 2u) & 3u) * end_high_rr00gg00; + uint4 row_3cccc = ((weights_low >> 4u) & 3u) * end_low_rr00gg00 + + ((weights_high >> 4u) & 3u) * end_high_rr00gg00; + uint4 row_3dddd = ((weights_low >> 6u) & 3u) * end_low_rr00gg00 + + ((weights_high >> 6u) & 3u) * end_high_rr00gg00; uint4 row_half_3acac = uint4(row_3aaaa.xy, row_3cccc.xy).xzyw; uint4 row_half_3bdbd = uint4(row_3bbbb.xy, row_3dddd.xy).xzyw; // R0A G0A R0B G0B | R0C G0C R0D G0D | R1A G1A R1B G1B | R1C G1C R1D G1D @@ -85,8 +80,8 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { xe_texture_copy_host_pitch, 2u) + xe_texture_copy_host_base; for (uint i = 0u; i < 4u; ++i) { uint4 row_01, row_23; - XeCTX1FourBlocksRowToR8G8(end_low_rr00gg00, end_high_rr00gg00, weights_high, - i * 8u, row_01, row_23); + XeCTX1FourBlocksRowToR8G8(end_low_rr00gg00, end_high_rr00gg00, + weights_high >> (i * 8u), row_01, row_23); xe_texture_copy_dest.Store4(texel_offset_host, row_01); xe_texture_copy_dest.Store4(texel_offset_host + 16u, row_23); if (++texel_index_host.y >= xe_texture_copy_size_texels.y) { diff --git a/src/xenia/gpu/d3d12/shaders/texture_load_dxt1_rgba8.cs.hlsl b/src/xenia/gpu/d3d12/shaders/texture_load_dxt1_rgba8.cs.hlsl index cdc044493..e32079313 100644 --- a/src/xenia/gpu/d3d12/shaders/texture_load_dxt1_rgba8.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/texture_load_dxt1_rgba8.cs.hlsl @@ -1,12 +1,11 @@ #include "pixel_formats.hlsli" #include "texture_copy.hlsli" -void XeDXT1FourTransBlocksRowToRGBA8(uint4 rgb_10b_low, uint4 rgb_10b_high, - uint4 weights, uint4 weights_shift, - out uint4 row_0, out uint4 row_1, - out uint4 row_2, out uint4 row_3) { - uint4 weights_shifts_low = weights_shift + uint4(0u, 2u, 4u, 6u); - uint4 weights_shifts_high = weights_shifts_low + 1u; +void XeDXT1FourTransBlocksRowToRGBA8( + uint4 rgb_10b_low, uint4 rgb_10b_high, uint4 weights, out uint4 row_0, + out uint4 row_1, out uint4 row_2, out uint4 row_3) { + const uint4 weights_shifts_low = uint4(0u, 2u, 4u, 6u); + const uint4 weights_shifts_high = uint4(1u, 3u, 5u, 7u); // Whether the texel is (RGB0+RGB1)/2 - divide the weighted sum by 2 (shift // right by 1) if it is. uint4 weights_sums_log2 = weights & ((weights & 0xAAAAAAAAu) >> 1u); @@ -92,17 +91,17 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { xe_texture_copy_host_pitch, 4u) + xe_texture_copy_host_base; for (uint i = 0u; i < 4u; ++i) { uint4 row_opaque_0, row_opaque_1, row_opaque_2, row_opaque_3; - XeDXTFourBlocksRowToRGB8(rgb_10b_low, rgb_10b_high, weights_opaque_high, - i * 8u, row_opaque_0, row_opaque_1, row_opaque_2, - row_opaque_3); + XeDXTFourBlocksRowToRGB8(rgb_10b_low, rgb_10b_high, + weights_opaque_high >> (i * 8u), row_opaque_0, + row_opaque_1, row_opaque_2, row_opaque_3); row_opaque_0 |= 0xFF000000u; row_opaque_1 |= 0xFF000000u; row_opaque_2 |= 0xFF000000u; row_opaque_3 |= 0xFF000000u; uint4 row_trans_0, row_trans_1, row_trans_2, row_trans_3; - XeDXT1FourTransBlocksRowToRGBA8(rgb_10b_low, rgb_10b_high, weights_trans, - i * 8u, row_trans_0, row_trans_1, - row_trans_2, row_trans_3); + XeDXT1FourTransBlocksRowToRGBA8(rgb_10b_low, rgb_10b_high, + weights_trans >> (i * 8u), row_trans_0, + row_trans_1, row_trans_2, row_trans_3); xe_texture_copy_dest.Store4(texel_offset_host, is_trans.x ? row_trans_0 : row_opaque_0); xe_texture_copy_dest.Store4(texel_offset_host + 16u, diff --git a/src/xenia/gpu/d3d12/shaders/texture_load_dxt5_rgba8.cs.hlsl b/src/xenia/gpu/d3d12/shaders/texture_load_dxt5_rgba8.cs.hlsl new file mode 100644 index 000000000..d9dc74106 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/texture_load_dxt5_rgba8.cs.hlsl @@ -0,0 +1,79 @@ +#include "pixel_formats.hlsli" +#include "texture_copy.hlsli" + +[numthreads(8, 32, 1)] +void main(uint3 xe_thread_id : SV_DispatchThreadID) { + // 1 thread = 4 DXT5 (16bpb) blocks to 16x4 R8G8B8A8 texels. + uint3 block_index = xe_thread_id; + block_index.x <<= 2u; + [branch] if (any(block_index >= xe_texture_copy_size_blocks)) { + return; + } + uint4 block_offsets_guest = + XeTextureCopyGuestBlockOffsets(block_index, 16u, 4u); + uint4 block_0 = xe_texture_copy_source.Load4(block_offsets_guest.x); + uint4 block_1 = xe_texture_copy_source.Load4(block_offsets_guest.y); + uint4 block_2 = xe_texture_copy_source.Load4(block_offsets_guest.z); + uint4 block_3 = xe_texture_copy_source.Load4(block_offsets_guest.w); + block_0 = XeByteSwap(block_0, xe_texture_copy_endianness); + block_1 = XeByteSwap(block_1, xe_texture_copy_endianness); + block_2 = XeByteSwap(block_2, xe_texture_copy_endianness); + block_3 = XeByteSwap(block_3, xe_texture_copy_endianness); + uint4 alpha_blocks_0 = uint4(block_0.x, block_1.x, block_2.x, block_3.x); + uint4 alpha_blocks_1 = uint4(block_0.y, block_1.y, block_2.y, block_3.y); + + uint4 rgb_codes = uint4(block_0.w, block_1.w, block_2.w, block_3.w); + // Sort the color indices so they can be used as weights for the second + // endpoint. + uint4 rgb_weights_high = XeDXTHighColorWeights(rgb_codes); + // Sort the alpha indices. + uint4 alpha_codes_r01 = + (alpha_blocks_0 >> 16u) | ((alpha_blocks_1 & 0xFFu) << 16u); + uint4 alpha_codes_r23 = alpha_blocks_1 >> 8u; + uint4 alpha_weights_8step_r01 = XeDXT5High8StepAlphaWeights(alpha_codes_r01); + uint4 alpha_weights_8step_r23 = XeDXT5High8StepAlphaWeights(alpha_codes_r23); + uint4 alpha_weights_6step_r01 = XeDXT5High6StepAlphaWeights(alpha_codes_r01); + uint4 alpha_weights_6step_r23 = XeDXT5High6StepAlphaWeights(alpha_codes_r23); + + // Get the endpoints for mixing, as 8-bit components in 10-bit sequences. + uint4 rgb_565 = uint4(block_0.z, block_1.z, block_2.z, block_3.z); + uint4 rgb_10b_low, rgb_10b_high; + XeDXTColorEndpointsTo8In10(rgb_565, rgb_10b_low, rgb_10b_high); + // Get the alpha endpoints. + uint4 alpha_end_low = alpha_blocks_0 & 0xFFu; + uint4 alpha_end_high = (alpha_blocks_0 >> 8u) & 0xFFu; + + // Uncompress and write the rows. + uint3 texel_index_host = block_index << uint3(2u, 2u, 0u); + uint texel_offset_host = XeTextureHostLinearOffset( + texel_index_host, xe_texture_copy_size_texels.y, + xe_texture_copy_host_pitch, 4u) + xe_texture_copy_host_base; + for (uint i = 0u; i < 4u; ++i) { + uint4 row_0, row_1, row_2, row_3; + XeDXTFourBlocksRowToRGB8(rgb_10b_low, rgb_10b_high, + rgb_weights_high >> (i * 8u), + row_0, row_1, row_2, row_3); + uint4 alpha_row = XeDXT5Four8StepBlocksRowToA8( + alpha_end_low, alpha_end_high, + (i < 2u ? alpha_weights_8step_r01 : alpha_weights_8step_r23) >> + ((i & 1u) * 12u), + (i < 2u ? alpha_weights_6step_r01 : alpha_weights_6step_r23) >> + ((i & 1u) * 12u)); + xe_texture_copy_dest.Store4( + texel_offset_host, + row_0 | ((alpha_row.xxxx << uint4(24u, 16u, 8u, 0u)) & 0xFF000000u)); + xe_texture_copy_dest.Store4( + texel_offset_host + 16u, + row_1 | ((alpha_row.yyyy << uint4(24u, 16u, 8u, 0u)) & 0xFF000000u)); + xe_texture_copy_dest.Store4( + texel_offset_host + 32u, + row_2 | ((alpha_row.zzzz << uint4(24u, 16u, 8u, 0u)) & 0xFF000000u)); + xe_texture_copy_dest.Store4( + texel_offset_host + 48u, + row_3 | ((alpha_row.wwww << uint4(24u, 16u, 8u, 0u)) & 0xFF000000u)); + if (++texel_index_host.y >= xe_texture_copy_size_texels.y) { + return; + } + texel_offset_host += xe_texture_copy_host_pitch; + } +} diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index 246a3dab3..a8b9af66b 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -36,6 +36,7 @@ namespace d3d12 { #include "xenia/gpu/d3d12/shaders/dxbc/texture_load_depth_unorm_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/texture_load_dxt1_rgba8_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/texture_load_dxt3a_cs.h" +#include "xenia/gpu/d3d12/shaders/dxbc/texture_load_dxt5_rgba8_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/texture_tile_32bpp_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/texture_tile_64bpp_cs.h" @@ -182,6 +183,7 @@ const TextureCache::LoadModeInfo TextureCache::load_mode_info_[] = { {texture_load_64bpb_cs, sizeof(texture_load_64bpb_cs)}, {texture_load_128bpb_cs, sizeof(texture_load_128bpb_cs)}, {texture_load_dxt1_rgba8_cs, sizeof(texture_load_dxt1_rgba8_cs)}, + {texture_load_dxt5_rgba8_cs, sizeof(texture_load_dxt5_rgba8_cs)}, {texture_load_dxt3a_cs, sizeof(texture_load_dxt3a_cs)}, {texture_load_ctx1_cs, sizeof(texture_load_ctx1_cs)}, {texture_load_depth_unorm_cs, sizeof(texture_load_depth_unorm_cs)}, diff --git a/src/xenia/gpu/d3d12/texture_cache.h b/src/xenia/gpu/d3d12/texture_cache.h index 8a1babd64..1bc260fce 100644 --- a/src/xenia/gpu/d3d12/texture_cache.h +++ b/src/xenia/gpu/d3d12/texture_cache.h @@ -100,6 +100,7 @@ class TextureCache { k64bpb, k128bpb, kDXT1AsRGBA8, + kDXT5AsRGBA8, kDXT3A, kCTX1, kDepthUnorm,