From 17e3f09c1eceebc4bd4c34d07720bba89de0aee5 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Fri, 21 Sep 2018 08:38:22 +0300 Subject: [PATCH] [D3D12] DXT1 decompression shader --- .../shaders/dxbc/texture_load_ctx1_cs.cso | Bin 9760 -> 9760 bytes .../d3d12/shaders/dxbc/texture_load_ctx1_cs.h | 100 +- .../shaders/dxbc/texture_load_ctx1_cs.txt | 16 +- .../dxbc/texture_load_dxt1_rgba8_cs.cso | Bin 0 -> 15080 bytes .../shaders/dxbc/texture_load_dxt1_rgba8_cs.h | 1261 +++++++++++++++++ .../dxbc/texture_load_dxt1_rgba8_cs.txt | 412 ++++++ .../gpu/d3d12/shaders/pixel_formats.hlsli | 70 + .../d3d12/shaders/texture_load_ctx1.cs.hlsl | 24 +- .../shaders/texture_load_dxt1_rgba8.cs.hlsl | 119 ++ src/xenia/gpu/d3d12/texture_cache.cc | 2 + src/xenia/gpu/d3d12/texture_cache.h | 1 + 11 files changed, 1933 insertions(+), 72 deletions(-) create mode 100644 src/xenia/gpu/d3d12/shaders/dxbc/texture_load_dxt1_rgba8_cs.cso create mode 100644 src/xenia/gpu/d3d12/shaders/dxbc/texture_load_dxt1_rgba8_cs.h create mode 100644 src/xenia/gpu/d3d12/shaders/dxbc/texture_load_dxt1_rgba8_cs.txt create mode 100644 src/xenia/gpu/d3d12/shaders/texture_load_dxt1_rgba8.cs.hlsl diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/texture_load_ctx1_cs.cso b/src/xenia/gpu/d3d12/shaders/dxbc/texture_load_ctx1_cs.cso index be56628bc92f3383678cfbcbcb0318fd925fd2fb..fde1d8b74e4279585b8b4ca1fd4feacaa67fe558 100644 GIT binary patch delta 77 zcmZ4Bv%p8hCBn(M?vUoKAF6^u5BFR-{!?(9-A0iW5}Th$8ZfecVh~_pnOrETKG{cH gVDcM@3PzU6itN&p8zd#bA}b^{7+E&+N(*xX0Hx>}Z~y=R delta 86 zcmZ4Bv%p8hCBn(sOuDxru8FPx(37`ckF?wW+$gd_V)743g~@j$1SV@pRq!%1FffEN mFmQch5MW@Q{83UJEK?yRF!_s=1Pe%Z^BM^U#?2zqJlp__bR3QV diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/texture_load_ctx1_cs.h b/src/xenia/gpu/d3d12/shaders/dxbc/texture_load_ctx1_cs.h index 3a79d1abe..a9fda2d3b 100644 --- a/src/xenia/gpu/d3d12/shaders/dxbc/texture_load_ctx1_cs.h +++ b/src/xenia/gpu/d3d12/shaders/dxbc/texture_load_ctx1_cs.h @@ -1,8 +1,8 @@ // generated from `xb buildhlsl` // source: texture_load_ctx1.cs.hlsl const uint8_t texture_load_ctx1_cs[] = { - 0x44, 0x58, 0x42, 0x43, 0x36, 0x1B, 0x8D, 0x80, 0x5E, 0x82, 0x06, 0x8F, - 0xC2, 0xE4, 0xED, 0xF5, 0xC4, 0x87, 0x3F, 0xF9, 0x01, 0x00, 0x00, 0x00, + 0x44, 0x58, 0x42, 0x43, 0x7E, 0xC2, 0x29, 0xDA, 0xF8, 0x25, 0x11, 0x52, + 0xE1, 0xBC, 0xD4, 0xC7, 0xF9, 0x11, 0xB6, 0x3E, 0x01, 0x00, 0x00, 0x00, 0x20, 0x26, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x54, 0x04, 0x00, 0x00, 0x64, 0x04, 0x00, 0x00, 0x74, 0x04, 0x00, 0x00, 0x84, 0x25, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0x18, 0x04, 0x00, 0x00, @@ -528,57 +528,57 @@ const uint8_t texture_load_ctx1_cs[] = { 0x03, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x05, 0xC2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x56, 0x0D, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x01, 0x29, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, + 0x8C, 0x00, 0x00, 0x11, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0F, + 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x8C, 0x00, 0x00, 0x11, 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, + 0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, - 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0xAA, 0xAA, 0xAA, 0xAA, - 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, - 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x02, 0x40, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, - 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x1E, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x02, 0x40, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, - 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x57, 0x00, 0x00, 0x07, - 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x29, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, - 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x8C, 0x00, 0x00, 0x11, - 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, - 0x02, 0x40, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x8C, 0x00, 0x00, 0x11, - 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x0A, 0x32, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, + 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x1E, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, + 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x57, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x0A, 0x32, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/texture_load_ctx1_cs.txt b/src/xenia/gpu/d3d12/shaders/dxbc/texture_load_ctx1_cs.txt index 73a966a96..207e16d6e 100644 --- a/src/xenia/gpu/d3d12/shaders/dxbc/texture_load_ctx1_cs.txt +++ b/src/xenia/gpu/d3d12/shaders/dxbc/texture_load_ctx1_cs.txt @@ -178,20 +178,20 @@ else mov r1.zw, r3.xxxz mov r2.zw, r3.yyyw endif -ishl r3.xyzw, r2.xyzw, l(1, 1, 1, 1) -and r3.xyzw, r3.xyzw, l(0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa) -ushr r2.xyzw, r2.xyzw, l(1, 1, 1, 1) -and r2.xyzw, r2.xyzw, l(0x55555555, 0x55555555, 0x55555555, 0x55555555) -iadd r2.xyzw, r2.xyzw, r3.xyzw -ushr r3.xyzw, r2.xyzw, l(1, 1, 1, 1) -and r3.xyzw, r3.xyzw, l(0x55555555, 0x55555555, 0x55555555, 0x55555555) -xor r2.xyzw, r2.xyzw, r3.xyzw ishl r3.xyzw, r1.xyzw, l(8, 8, 8, 8) and r3.xyzw, r3.xyzw, l(0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000) bfi r3.xyzw, l(8, 8, 8, 8), l(0, 0, 0, 0), r1.xyzw, r3.xyzw ubfe r4.xyzw, l(8, 8, 8, 8), l(16, 16, 16, 16), r1.xyzw ushr r1.xyzw, r1.xyzw, l(8, 8, 8, 8) bfi r1.xyzw, l(16, 16, 16, 16), l(0, 0, 0, 0), r4.xyzw, r1.xyzw +ishl r4.xyzw, r2.xyzw, l(1, 1, 1, 1) +and r4.xyzw, r4.xyzw, l(0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa) +ushr r2.xyzw, r2.xyzw, l(1, 1, 1, 1) +and r2.xyzw, r2.xyzw, l(0x55555555, 0x55555555, 0x55555555, 0x55555555) +iadd r2.xyzw, r2.xyzw, r4.xyzw +ushr r4.xyzw, r2.xyzw, l(1, 1, 1, 1) +and r4.xyzw, r4.xyzw, l(0x55555555, 0x55555555, 0x55555555, 0x55555555) +xor r2.xyzw, r2.xyzw, r4.xyzw ishl r0.xy, r0.xyxx, l(2, 2, 0, 0) ishl r0.x, r0.x, l(1) imad r0.z, vThreadID.z, CB0[0][1].y, r0.y diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/texture_load_dxt1_rgba8_cs.cso b/src/xenia/gpu/d3d12/shaders/dxbc/texture_load_dxt1_rgba8_cs.cso new file mode 100644 index 0000000000000000000000000000000000000000..afa163a0f568efe52473cf1bbf002a858564aba5 GIT binary patch literal 15080 zcmd6u&u<<_8OPVt{Bq(pX-YyB0W}z^CIXbSrIkXdhNd+|Dj_s!2q!Cv9b!oB$bKUN zLa5|eRa`&<2{?d656!iIKY?I9_t(5cAjUx z&+pyYebeVJJa=lxk-t7R{m;|)Pyg}huRQbP<#(p4s`|&ts@kdXN%eC(s%lYvr2dlD zUw;0D`G+-!@89b8sXwDWwe##LHodQfhaP(VbF4kA{#esLt8qr1_O|+N^`}uBn**rn$bW2F2q$T8C#VzQ2hN+qdHTP<;4x zE4~L6mjmL17uB95aB9}e;{~mMLhHsG4e$Bq93h!<4eLmm~`Q^{zJH8FPGFace zp=d2!y12enE&gD2ZLzw2`?fMragUJ~aA-f51Nw2UFOM!??a;2ST12_4ru1)GU;a^@ zZcEqJs~gKJqbI7{Q?*`=LxVn^enF&C!t(mU6N}ZQ)zxd2?82&HyHmBT+mBriX)H(d=U{ADw;irL!;1p1pc; zacOP#)at9(m#=AZ;`kR&R9`>)wbSG+jq#hs_1PC+xbWVWs_I+1S&Z(ls_*ToroN-w z!TisN{zfDJXunhW&;O)NWBs?R|LX+PR$EYduPMW?YCJ5;ua(N^F{_T}j%we*x)Q2t zulT0&xE`Jo-=${#{H|1;QJJslnmzOBfc|Q{wV?+!q(`$fIe#^JL(|7RlNBf#~SJOCxJ3 z>yKJJ*N80f&Ai*-od4;DEcT_WKTUf?BmEWO;xG1Incl#KpIEX^gU`5cJ}}lZWhmnY zFV7z7vfH-Wo~c|_|G9g2|6K3`{s!AT#DQz+U&jWUps{A&H>zsi!DekLkL~tIT-d`k z4Sn$!dd*8;+Yess$b*kH{K^PF`y}sUrlma2w;D$@Vj` zU>n$1V$@h#pNzto(RJ)Gzxps7FONwWrb8DYxwf*ReDPamHq9$GWkv^q*zH2R+)$oHZTo zTOB-PGP1A4=(D+$%U;{uhY{^%o^t#Cfv=;@5rh0mxqZ)?Y1TBf8W`}~mZJl?LJh`l zn;p|WzacaB=(m_F#_01~l*v5$4gOM(KEL7T*rVUT(4N-0*!t87jiOEWxfnWn*l%3> zUPzpwF{a$jH_4jRI_FW^l=!gC`0RD<8cTMz&IYAlk}#r;&MoU9$X1K8kYStPw@+&s zxr>{$RyHuGUF&;DX7MahK27Z9-d4vakNx8NJy`J_5V`n-cQO3MGl3TUbw}@qbItPt z8t+A5o99S#EuMXT_nOecv&DNErB9s)$hZGf)`c3jp#|Rou?^k1zaSHv*|V#_{e!l> zEFNgK{qXf!U5y*`-#2vYJ73Tr7oC{czLNfKL&r9B*)^ISgX^zya`KKH-<|DW_`e-^ zkU{J4h8UWM_^>~|qx;^$`lGuy&UyGQcF}7;!-ww}Q`Vom?f<0zq%OCE=Nx8CxdS(; zFYN!zp0Qj;_=&0XsLvm|?D`;&YcziEuzu(t^}~y?m-Q5B%u{ajv*w(#o=@pmMJ^!g z?|Q$%2I{MC9b5Y1V*ij&DYs)ne1tvmH}dVrc2ugnw%hl=t?N4bW~r~|&GzVP|3CQrPE|Ign)_L1WddBG`pS;HRJiDiM6a_#?G z7V?^2$HRP#DOc57MJs)>6`p-(-@H<^^4Vbd@RzczQ_#7k`&P1a|CxMiD*gXu`Ja0% z_CRB_?~Hq|wid2?%krZPWP$HzF7uST?=`G_ylQv$;)|5^r)bvQ^^Lp)Aakloe!8xzH)g*_x$6@0p&xZP$ z!+pYhjO>%nb#0o)n6j{o#&2RD$hIh}#4wGq4@2jJ$-7bMH_i^X8sFK6tl<~x(SBj= zGtC+?p&geQXBupH+0O`_XT*KY?`o|18OJxBl-qeq&a#GYW9R*uy(#OD8h!@1xsTL- z;Wq`oZ#h2T+qcm06=TY6Y~(1nYYfU94A$U}7|>W_Ou38kD979Ps)`sH$M*-W=agg6 zqAtktGZY@{%RK6PR|>^`TDfgG$lOdXKT~*aq)Z&!y>HF3Er-39V;p=G`_U$kp|jt7 zjDekC;J-4*phe%6K8`tN9OoF=%bW} za|}M~WnT^HRbnD9cHhS$pY;>1KF`MIXvzZMGhFEAi}%8m#SwUzn=j^b%AySLMV(TH ztu0q){aQc!lK7@v=MyCSWP=A6cv6C2@o?m>(3i~E@y<3Y_M2C|I-4GhMVH{=^D z){1dlD_px)qAu}V2~Vj9c~;uO_n^)6cwJvjICH|6vI~2ZnL7?K zR_3?9k(P41?x0!U$Oi^`8{c{2_d)!(-F#C%)p0hyZ5KqVe(mQdle1=Doq%CGQ|{)1 zF|Z+G#5!?Y7*px669Nz^5tjo~Tv=&KXw(~utblBek693wwc?%Ks#*&Oeb z+s~(1C&ufm6KIZy^UiDDH&iFK-y{6C!9JPPhiLU&dX_EP07mSNLyVRAt#72IJX|M{ z4-E7+KKhaaeRTrOGE?rx#dTtQ+pcY$I)Nw3DSSOAPW6E8ft(p5Bg?#iZ0HvLz<`D#bd2(Q9iX1yX70NPd+r~ zfb-64-ZxYyw%;TCw!yL4Zk?DPtk_>^zFWmu*=Fk-X(GeRZ;xOk+?N5hK?QM_zlA}_S5|DTkz4dVpm?*|0e68=&aed z> 1u); + // Swap 10 and 11. 00 = 3:0, 01 = 2:1, 10 = 1:2, 11 = 0:3. + return codes ^ ((codes & 0xAAAAAAAAu) >> 1u); +} + +// Converts endpoint RGB (first in the low 16 bits, second in the high) of four +// DXT blocks to 8-bit, with 2 unused bits between each component to allow for +// overflow when multiplying by values up to 3 (so multiplication can be done +// for all components at once). +void XeDXTColorEndpointsTo8In10(uint4 rgb_565, out uint4 rgb_10b_low, + out uint4 rgb_10b_high) { + // Converting 5:6:5 to 8:8:8 similar to how Compressonator does that. + // https://github.com/GPUOpen-Tools/Compressonator/blob/master/Compressonator/Source/Codec/DXTC/Codec_DXTC_RGBA.cpp#L429 + rgb_10b_low = ((rgb_565 & 31u) << 23u) | + ((rgb_565 & (7u << 2u)) << (20u - 2u)) | + ((rgb_565 & (63u << 5u)) << (12u - 5u)) | + ((rgb_565 & (3u << 9u)) << (10u - 9u)) | + ((rgb_565 & (31u << 11u)) >> (11u - 3u)) | + ((rgb_565 & (7u << 13u)) >> 13u); + rgb_10b_high = ((rgb_565 & (31u << 16u)) << (23u - 16u)) | + ((rgb_565 & (7u << 18u)) << (20u - 18u)) | + ((rgb_565 & (63u << 21u)) >> (21u - 12u)) | + ((rgb_565 & (3u << 25u)) >> (25u - 10u)) | + ((rgb_565 & (31u << 27u)) >> (27u - 3u)) | + ((rgb_565 & (7u << 29u)) >> 29u); +} + +// Gets the colors of one row of four DXT opaque blocks. Endpoint colors can be +// obtained using XeDXTColorEndpointsTo8In10 (8 bits with 2 bits of free space +// between each), weights can be obtained using XeDXTHighColorWeights. Alpha is +// set to 0 in the result. weights_shift is 0 for the first row, 8 for the +// second, 16 for the third, and 24 for the fourth. +void XeDXTFourBlocksRowToRGB8(uint4 rgb_10b_low, uint4 rgb_10b_high, + uint4 weights_high, uint weights_shift, + out uint4 row_0, out uint4 row_1, + out uint4 row_2, out uint4 row_3) { + uint4 weights_low = ~weights_high; + uint4 weights_shifts = weights_shift + uint4(0u, 2u, 4u, 6u); + uint4 block_row_10b_3x = + ((weights_low.xxxx >> weights_shifts) & 3u) * rgb_10b_low.x + + ((weights_high.xxxx >> weights_shifts) & 3u) * rgb_10b_high.x; + row_0 = ((block_row_10b_3x & 1023u) / 3u) | + ((((block_row_10b_3x >> 10u) & 1023u) / 3u) << 8u) | + (((block_row_10b_3x >> 20u) / 3u) << 16u); + block_row_10b_3x = + ((weights_low.yyyy >> weights_shifts) & 3u) * rgb_10b_low.y + + ((weights_high.yyyy >> weights_shifts) & 3u) * rgb_10b_high.y; + row_1 = ((block_row_10b_3x & 1023u) / 3u) | + ((((block_row_10b_3x >> 10u) & 1023u) / 3u) << 8u) | + (((block_row_10b_3x >> 20u) / 3u) << 16u); + block_row_10b_3x = + ((weights_low.zzzz >> weights_shifts) & 3u) * rgb_10b_low.z + + ((weights_high.zzzz >> weights_shifts) & 3u) * rgb_10b_high.z; + row_2 = ((block_row_10b_3x & 1023u) / 3u) | + ((((block_row_10b_3x >> 10u) & 1023u) / 3u) << 8u) | + (((block_row_10b_3x >> 20u) / 3u) << 16u); + block_row_10b_3x = + ((weights_low.wwww >> weights_shifts) & 3u) * rgb_10b_low.w + + ((weights_high.wwww >> weights_shifts) & 3u) * rgb_10b_high.w; + row_3 = ((block_row_10b_3x & 1023u) / 3u) | + ((((block_row_10b_3x >> 10u) & 1023u) / 3u) << 8u) | + (((block_row_10b_3x >> 20u) / 3u) << 16u); +} + #endif // XENIA_GPU_D3D12_SHADERS_PIXEL_FORMATS_HLSLI_ diff --git a/src/xenia/gpu/d3d12/shaders/texture_load_ctx1.cs.hlsl b/src/xenia/gpu/d3d12/shaders/texture_load_ctx1.cs.hlsl index 12101ad83..5273d8d75 100644 --- a/src/xenia/gpu/d3d12/shaders/texture_load_ctx1.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/texture_load_ctx1.cs.hlsl @@ -1,3 +1,4 @@ +#include "pixel_formats.hlsli" #include "texture_copy.hlsli" // http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf @@ -13,11 +14,11 @@ // II JJ KK LL // MM NN OO PP -void XeCTX1FourBlocksRowToR8G8(uint4 weights_high, uint weights_shift, - uint4 end_low_rr00gg00, uint4 end_high_rr00gg00, +void XeCTX1FourBlocksRowToR8G8(uint4 end_low_rr00gg00, uint4 end_high_rr00gg00, + uint4 weights_high, uint weights_shift, out uint4 row_01, out uint4 row_23) { uint4 weights_low = ~weights_high; - uint4 weights_shifts = uint4(0u, 2u, 4u, 6u) + weights_shift; + uint4 weights_shifts = weights_shift + uint4(0u, 2u, 4u, 6u); uint4 row_3aaaa = ((weights_low >> weights_shifts.x) & 3u) * end_low_rr00gg00 + ((weights_high >> weights_shifts.x) & 3u) * end_high_rr00gg00; @@ -63,15 +64,6 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { blocks_01 = XeByteSwap(blocks_01, xe_texture_copy_endianness); blocks_23 = XeByteSwap(blocks_23, xe_texture_copy_endianness); - // Sort the color indices so they can be used as weights for the second - // endpoint. Initially 00 = 3:0, 01 = 0:3, 10 = 2:1, 11 = 1:2. - uint4 weights_high = uint4(blocks_01.yw, blocks_23.yw); - // Swap bits. 00 = 3:0, 01 = 2:1, 10 = 0:3, 11 = 1:2. - weights_high = ((weights_high & 0x55555555u) << 1u) | - ((weights_high & 0xAAAAAAAAu) >> 1u); - // Swap 10 and 11. 00 = 3:0, 01 = 2:1, 10 = 1:2, 11 = 0:3. - weights_high ^= ((weights_high & 0xAAAAAAAAu) >> 1u); - // Unpack the endpoints as: // 0x00g000r0 0x00g100r1 0x00g200r2 0x00g300r3 // 0x00G000R0 0x00G100R1 0x00G200R2 0x00G300R3 @@ -82,6 +74,10 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { uint4 end_high_rr00gg00 = ((end_packed & 0xFF0000u) >> 16u) | ((end_packed & 0xFF000000u) >> 8u); + // Sort the color indices so they can be used as weights for the second + // endpoint. + uint4 weights_high = XeDXTHighColorWeights(uint4(blocks_01.yw, blocks_23.yw)); + // Uncompress and write the rows. uint3 texel_index_host = block_index << uint3(2u, 2u, 0u); uint texel_offset_host = XeTextureHostLinearOffset( @@ -89,8 +85,8 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { xe_texture_copy_host_pitch, 2u) + xe_texture_copy_host_base; for (uint i = 0u; i < 4u; ++i) { uint4 row_01, row_23; - XeCTX1FourBlocksRowToR8G8(weights_high, i * 8u, end_low_rr00gg00, - end_high_rr00gg00, row_01, row_23); + XeCTX1FourBlocksRowToR8G8(end_low_rr00gg00, end_high_rr00gg00, weights_high, + i * 8u, row_01, row_23); xe_texture_copy_dest.Store4(texel_offset_host, row_01); xe_texture_copy_dest.Store4(texel_offset_host + 16u, row_23); if (++texel_index_host.y >= xe_texture_copy_size_texels.y) { diff --git a/src/xenia/gpu/d3d12/shaders/texture_load_dxt1_rgba8.cs.hlsl b/src/xenia/gpu/d3d12/shaders/texture_load_dxt1_rgba8.cs.hlsl new file mode 100644 index 000000000..cdc044493 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/texture_load_dxt1_rgba8.cs.hlsl @@ -0,0 +1,119 @@ +#include "pixel_formats.hlsli" +#include "texture_copy.hlsli" + +void XeDXT1FourTransBlocksRowToRGBA8(uint4 rgb_10b_low, uint4 rgb_10b_high, + uint4 weights, uint4 weights_shift, + out uint4 row_0, out uint4 row_1, + out uint4 row_2, out uint4 row_3) { + uint4 weights_shifts_low = weights_shift + uint4(0u, 2u, 4u, 6u); + uint4 weights_shifts_high = weights_shifts_low + 1u; + // Whether the texel is (RGB0+RGB1)/2 - divide the weighted sum by 2 (shift + // right by 1) if it is. + uint4 weights_sums_log2 = weights & ((weights & 0xAAAAAAAAu) >> 1u); + // Whether the texel is opaque. + uint4 weights_alpha = + (weights & 0x55555555u) | ((weights & 0xAAAAAAAAu) >> 1u); + uint4 block_rgb_10b = + ((weights.xxxx >> weights_shifts_low) & 1u) * rgb_10b_low.x + + ((weights.xxxx >> weights_shifts_high) & 1u) * rgb_10b_high.x; + uint4 block_rgb_shift = (weights_sums_log2.xxxx >> weights_shifts_low) & 1u; + row_0 = ((block_rgb_10b & 1023u) >> block_rgb_shift) + + ((((block_rgb_10b >> 10u) & 1023u) >> block_rgb_shift) << 8u) + + (((block_rgb_10b >> 20u) >> block_rgb_shift) << 16u) + + (((weights_alpha.xxxx >> weights_shifts_low) & 1u) * 0xFF000000u); + block_rgb_10b = + ((weights.yyyy >> weights_shifts_low) & 1u) * rgb_10b_low.y + + ((weights.yyyy >> weights_shifts_high) & 1u) * rgb_10b_high.y; + block_rgb_shift = (weights_sums_log2.yyyy >> weights_shifts_low) & 1u; + row_1 = ((block_rgb_10b & 1023u) >> block_rgb_shift) + + ((((block_rgb_10b >> 10u) & 1023u) >> block_rgb_shift) << 8u) + + (((block_rgb_10b >> 20u) >> block_rgb_shift) << 16u) + + (((weights_alpha.yyyy >> weights_shifts_low) & 1u) * 0xFF000000u); + block_rgb_10b = + ((weights.zzzz >> weights_shifts_low) & 1u) * rgb_10b_low.z + + ((weights.zzzz >> weights_shifts_high) & 1u) * rgb_10b_high.z; + block_rgb_shift = (weights_sums_log2.zzzz >> weights_shifts_low) & 1u; + row_2 = ((block_rgb_10b & 1023u) >> block_rgb_shift) + + ((((block_rgb_10b >> 10u) & 1023u) >> block_rgb_shift) << 8u) + + (((block_rgb_10b >> 20u) >> block_rgb_shift) << 16u) + + (((weights_alpha.zzzz >> weights_shifts_low) & 1u) * 0xFF000000u); + block_rgb_10b = + ((weights.wwww >> weights_shifts_low) & 1u) * rgb_10b_low.w + + ((weights.wwww >> weights_shifts_high) & 1u) * rgb_10b_high.w; + block_rgb_shift = (weights_sums_log2.wwww >> weights_shifts_low) & 1u; + row_3 = ((block_rgb_10b & 1023u) >> block_rgb_shift) + + ((((block_rgb_10b >> 10u) & 1023u) >> block_rgb_shift) << 8u) + + (((block_rgb_10b >> 20u) >> block_rgb_shift) << 16u) + + (((weights_alpha.wwww >> weights_shifts_low) & 1u) * 0xFF000000u); +} + +[numthreads(8, 32, 1)] +void main(uint3 xe_thread_id : SV_DispatchThreadID) { + // 1 thread = 4 DXT1 (8bpb) blocks to 16x4 R8G8B8A8 texels. + uint3 block_index = xe_thread_id; + block_index.x <<= 2u; + [branch] if (any(block_index >= xe_texture_copy_size_blocks)) { + return; + } + uint4 block_offsets_guest = + XeTextureCopyGuestBlockOffsets(block_index, 8u, 3u); + uint4 blocks_01 = uint4(xe_texture_copy_source.Load2(block_offsets_guest.x), + xe_texture_copy_source.Load2(block_offsets_guest.y)); + uint4 blocks_23 = uint4(xe_texture_copy_source.Load2(block_offsets_guest.z), + xe_texture_copy_source.Load2(block_offsets_guest.w)); + blocks_01 = XeByteSwap(blocks_01, xe_texture_copy_endianness); + blocks_23 = XeByteSwap(blocks_23, xe_texture_copy_endianness); + + uint4 codes = uint4(blocks_01.yw, blocks_23.yw); + // Sort the color indices so they can be used as weights for the second + // endpoint in the opaque mode. + uint4 weights_opaque_high = XeDXTHighColorWeights(codes); + // Sort the color indices so bits of them can be used as endpoint weights, and + // AND of those bits can be used as the right shift amount for mixing the two + // colors in the punchthrough mode. + // Initially 00 = 1:0, 01 = 0:1, 10 = 1:1, 11 = 0:0. + // 00 = 0:0, 01 = 1:1, 10 = 0:1, 11 = 1:0. + uint4 weights_trans = ~codes; + // 00 = 0:0, 01 = 1:0, 10 = 0:1, 11 = 1:1. + weights_trans ^= (weights_trans & 0x55555555u) << 1u; + + // Get endpoint RGB for mixing, as 8-bit components in 10-bit sequences. + uint4 rgb_565 = uint4(blocks_01.xz, blocks_23.xz); + uint4 rgb_10b_low, rgb_10b_high; + XeDXTColorEndpointsTo8In10(rgb_565, rgb_10b_low, rgb_10b_high); + + // Get modes for each block. + bool4 is_trans = (rgb_565 & 0xFFFFu) <= (rgb_565 >> 16u); + + // Uncompress and write the rows. + uint3 texel_index_host = block_index << uint3(2u, 2u, 0u); + uint texel_offset_host = XeTextureHostLinearOffset( + texel_index_host, xe_texture_copy_size_texels.y, + xe_texture_copy_host_pitch, 4u) + xe_texture_copy_host_base; + for (uint i = 0u; i < 4u; ++i) { + uint4 row_opaque_0, row_opaque_1, row_opaque_2, row_opaque_3; + XeDXTFourBlocksRowToRGB8(rgb_10b_low, rgb_10b_high, weights_opaque_high, + i * 8u, row_opaque_0, row_opaque_1, row_opaque_2, + row_opaque_3); + row_opaque_0 |= 0xFF000000u; + row_opaque_1 |= 0xFF000000u; + row_opaque_2 |= 0xFF000000u; + row_opaque_3 |= 0xFF000000u; + uint4 row_trans_0, row_trans_1, row_trans_2, row_trans_3; + XeDXT1FourTransBlocksRowToRGBA8(rgb_10b_low, rgb_10b_high, weights_trans, + i * 8u, row_trans_0, row_trans_1, + row_trans_2, row_trans_3); + xe_texture_copy_dest.Store4(texel_offset_host, + is_trans.x ? row_trans_0 : row_opaque_0); + xe_texture_copy_dest.Store4(texel_offset_host + 16u, + is_trans.y ? row_trans_1 : row_opaque_1); + xe_texture_copy_dest.Store4(texel_offset_host + 32u, + is_trans.z ? row_trans_2 : row_opaque_2); + xe_texture_copy_dest.Store4(texel_offset_host + 48u, + is_trans.w ? row_trans_3 : row_opaque_3); + if (++texel_index_host.y >= xe_texture_copy_size_texels.y) { + return; + } + texel_offset_host += xe_texture_copy_host_pitch; + } +} diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index 1edf02b10..246a3dab3 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -34,6 +34,7 @@ namespace d3d12 { #include "xenia/gpu/d3d12/shaders/dxbc/texture_load_ctx1_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/texture_load_depth_float_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/texture_load_depth_unorm_cs.h" +#include "xenia/gpu/d3d12/shaders/dxbc/texture_load_dxt1_rgba8_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/texture_load_dxt3a_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/texture_tile_32bpp_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/texture_tile_64bpp_cs.h" @@ -180,6 +181,7 @@ const TextureCache::LoadModeInfo TextureCache::load_mode_info_[] = { {texture_load_32bpb_cs, sizeof(texture_load_32bpb_cs)}, {texture_load_64bpb_cs, sizeof(texture_load_64bpb_cs)}, {texture_load_128bpb_cs, sizeof(texture_load_128bpb_cs)}, + {texture_load_dxt1_rgba8_cs, sizeof(texture_load_dxt1_rgba8_cs)}, {texture_load_dxt3a_cs, sizeof(texture_load_dxt3a_cs)}, {texture_load_ctx1_cs, sizeof(texture_load_ctx1_cs)}, {texture_load_depth_unorm_cs, sizeof(texture_load_depth_unorm_cs)}, diff --git a/src/xenia/gpu/d3d12/texture_cache.h b/src/xenia/gpu/d3d12/texture_cache.h index 9f7eaa16d..8a1babd64 100644 --- a/src/xenia/gpu/d3d12/texture_cache.h +++ b/src/xenia/gpu/d3d12/texture_cache.h @@ -99,6 +99,7 @@ class TextureCache { k32bpb, k64bpb, k128bpb, + kDXT1AsRGBA8, kDXT3A, kCTX1, kDepthUnorm,