mirror of https://github.com/PCSX2/pcsx2.git
TextureDecompress: Re-add non-SSE code paths
This commit is contained in:
parent
04d7d1a1db
commit
122f1ec767
|
@ -469,12 +469,21 @@ void DecompressBlockBC5 (uint32_t x, uint32_t y, uint32_t stride, enum BC5Mode m
|
|||
|
||||
// File: bc7decomp.c - Richard Geldreich, Jr. 3/31/2020 - MIT license or public domain (see end of file)
|
||||
#include <string.h>
|
||||
|
||||
#if (defined(_M_AMD64) || defined(__x86_64__) || defined(__SSE2__))
|
||||
# define BC7DECOMP_USE_SSE2
|
||||
#endif
|
||||
|
||||
#ifdef BC7DECOMP_USE_SSE2
|
||||
#include <immintrin.h>
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
namespace bc7decomp
|
||||
{
|
||||
static const __m128i g_bc7_weights4_sse2[8] =
|
||||
|
||||
#ifdef BC7DECOMP_USE_SSE2
|
||||
const __m128i g_bc7_weights4_sse2[8] =
|
||||
{
|
||||
_mm_set_epi16(4, 4, 4, 4, 0, 0, 0, 0),
|
||||
_mm_set_epi16(13, 13, 13, 13, 9, 9, 9, 9),
|
||||
|
@ -485,12 +494,13 @@ namespace bc7decomp
|
|||
_mm_set_epi16(55, 55, 55, 55, 51, 51, 51, 51),
|
||||
_mm_set_epi16(64, 64, 64, 64, 60, 60, 60, 60),
|
||||
};
|
||||
#endif
|
||||
|
||||
static const uint32_t g_bc7_weights2[4] = { 0, 21, 43, 64 };
|
||||
static const uint32_t g_bc7_weights3[8] = { 0, 9, 18, 27, 37, 46, 55, 64 };
|
||||
static const uint32_t g_bc7_weights4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
|
||||
const uint32_t g_bc7_weights2[4] = { 0, 21, 43, 64 };
|
||||
const uint32_t g_bc7_weights3[8] = { 0, 9, 18, 27, 37, 46, 55, 64 };
|
||||
const uint32_t g_bc7_weights4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
|
||||
|
||||
static const uint8_t g_bc7_partition2[64 * 16] =
|
||||
const uint8_t g_bc7_partition2[64 * 16] =
|
||||
{
|
||||
0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1, 0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1, 0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1, 0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1, 0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1, 0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1,
|
||||
0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1, 0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,
|
||||
|
@ -502,7 +512,7 @@ static const uint8_t g_bc7_partition2[64 * 16] =
|
|||
0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1, 0,1,1,0,0,0,1,1,0,0,1,1,1,0,0,1, 0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,1, 0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1, 0,0,0,0,1,1,1,1,0,0,1,1,0,0,1,1, 0,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0, 0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0, 0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,1
|
||||
};
|
||||
|
||||
static const uint8_t g_bc7_partition3[64 * 16] =
|
||||
const uint8_t g_bc7_partition3[64 * 16] =
|
||||
{
|
||||
0,0,1,1,0,0,1,1,0,2,2,1,2,2,2,2, 0,0,0,1,0,0,1,1,2,2,1,1,2,2,2,1, 0,0,0,0,2,0,0,1,2,2,1,1,2,2,1,1, 0,2,2,2,0,0,2,2,0,0,1,1,0,1,1,1, 0,0,0,0,0,0,0,0,1,1,2,2,1,1,2,2, 0,0,1,1,0,0,1,1,0,0,2,2,0,0,2,2, 0,0,2,2,0,0,2,2,1,1,1,1,1,1,1,1, 0,0,1,1,0,0,1,1,2,2,1,1,2,2,1,1,
|
||||
0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2, 0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2, 0,0,0,0,1,1,1,1,2,2,2,2,2,2,2,2, 0,0,1,2,0,0,1,2,0,0,1,2,0,0,1,2, 0,1,1,2,0,1,1,2,0,1,1,2,0,1,1,2, 0,1,2,2,0,1,2,2,0,1,2,2,0,1,2,2, 0,0,1,1,0,1,1,2,1,1,2,2,1,2,2,2, 0,0,1,1,2,0,0,1,2,2,0,0,2,2,2,0,
|
||||
|
@ -514,19 +524,19 @@ static const uint8_t g_bc7_partition3[64 * 16] =
|
|||
0,1,1,0,0,1,1,0,2,2,2,2,2,2,2,2, 0,0,2,2,0,0,1,1,0,0,1,1,0,0,2,2, 0,0,2,2,1,1,2,2,1,1,2,2,0,0,2,2, 0,0,0,0,0,0,0,0,0,0,0,0,2,1,1,2, 0,0,0,2,0,0,0,1,0,0,0,2,0,0,0,1, 0,2,2,2,1,2,2,2,0,2,2,2,1,2,2,2, 0,1,0,1,2,2,2,2,2,2,2,2,2,2,2,2, 0,1,1,1,2,0,1,1,2,2,0,1,2,2,2,0,
|
||||
};
|
||||
|
||||
static const uint8_t g_bc7_table_anchor_index_second_subset[64] = { 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, 15, 2, 8, 2, 2, 8, 8,15, 2, 8, 2, 2, 8, 8, 2, 2, 15,15, 6, 8, 2, 8,15,15, 2, 8, 2, 2, 2,15,15, 6, 6, 2, 6, 8,15,15, 2, 2, 15,15,15,15,15, 2, 2,15 };
|
||||
const uint8_t g_bc7_table_anchor_index_second_subset[64] = { 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, 15, 2, 8, 2, 2, 8, 8,15, 2, 8, 2, 2, 8, 8, 2, 2, 15,15, 6, 8, 2, 8,15,15, 2, 8, 2, 2, 2,15,15, 6, 6, 2, 6, 8,15,15, 2, 2, 15,15,15,15,15, 2, 2,15 };
|
||||
|
||||
static const uint8_t g_bc7_table_anchor_index_third_subset_1[64] =
|
||||
const uint8_t g_bc7_table_anchor_index_third_subset_1[64] =
|
||||
{
|
||||
3, 3,15,15, 8, 3,15,15, 8, 8, 6, 6, 6, 5, 3, 3, 3, 3, 8,15, 3, 3, 6,10, 5, 8, 8, 6, 8, 5,15,15, 8,15, 3, 5, 6,10, 8,15, 15, 3,15, 5,15,15,15,15, 3,15, 5, 5, 5, 8, 5,10, 5,10, 8,13,15,12, 3, 3
|
||||
};
|
||||
|
||||
static const uint8_t g_bc7_table_anchor_index_third_subset_2[64] =
|
||||
const uint8_t g_bc7_table_anchor_index_third_subset_2[64] =
|
||||
{
|
||||
15, 8, 8, 3,15,15, 3, 8, 15,15,15,15,15,15,15, 8, 15, 8,15, 3,15, 8,15, 8, 3,15, 6,10,15,15,10, 8, 15, 3,15,10,10, 8, 9,10, 6,15, 8,15, 3, 6, 6, 8, 15, 3,15,15,15,15,15,15, 15,15,15,15, 3,15,15, 8
|
||||
};
|
||||
|
||||
static const uint8_t g_bc7_first_byte_to_mode[256] =
|
||||
const uint8_t g_bc7_first_byte_to_mode[256] =
|
||||
{
|
||||
8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||
|
@ -546,7 +556,7 @@ static const uint8_t g_bc7_first_byte_to_mode[256] =
|
|||
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||
};
|
||||
|
||||
static inline void insert_weight_zero(uint64_t& index_bits, uint32_t bits_per_index, uint32_t offset)
|
||||
inline void insert_weight_zero(uint64_t& index_bits, uint32_t bits_per_index, uint32_t offset)
|
||||
{
|
||||
uint64_t LOW_BIT_MASK = (static_cast<uint64_t>(1) << ((bits_per_index * (offset + 1)) - 1)) - 1;
|
||||
uint64_t HIGH_BIT_MASK = ~LOW_BIT_MASK;
|
||||
|
@ -577,6 +587,8 @@ static inline uint32_t bc7_interp(uint32_t l, uint32_t h, uint32_t w, uint32_t b
|
|||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#ifdef BC7DECOMP_USE_SSE2
|
||||
static inline __m128i bc7_interp_sse2(__m128i l, __m128i h, __m128i w, __m128i iw)
|
||||
{
|
||||
return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(l, iw), _mm_mullo_epi16(h, w)), _mm_set1_epi16(32)), 6);
|
||||
|
@ -619,8 +631,9 @@ static inline void bc7_interp3_sse2(const color_rgba* endpoint_pair, color_rgba*
|
|||
_mm_storeu_si128(reinterpret_cast<__m128i*>(out_colors), all_colors_0);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(out_colors + 4), all_colors_1);
|
||||
}
|
||||
#endif
|
||||
|
||||
static bool unpack_bc7_mode0_2(uint32_t mode, const uint64_t* data_chunks, color_rgba* pPixels)
|
||||
bool unpack_bc7_mode0_2(uint32_t mode, const uint64_t* data_chunks, color_rgba* pPixels)
|
||||
{
|
||||
//const uint32_t SUBSETS = 3;
|
||||
const uint32_t ENDPOINTS = 6;
|
||||
|
@ -630,6 +643,9 @@ static bool unpack_bc7_mode0_2(uint32_t mode, const uint64_t* data_chunks, color
|
|||
const uint32_t ENDPOINT_BITS = (mode == 0) ? 4 : 5;
|
||||
const uint32_t ENDPOINT_MASK = (1 << ENDPOINT_BITS) - 1;
|
||||
const uint32_t PBITS = (mode == 0) ? 6 : 0;
|
||||
#ifndef BC7DECOMP_USE_SSE2
|
||||
const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS;
|
||||
#endif
|
||||
const uint32_t PART_BITS = (mode == 0) ? 4 : 6;
|
||||
const uint32_t PART_MASK = (1 << PART_BITS) - 1;
|
||||
|
||||
|
@ -691,6 +707,7 @@ static bool unpack_bc7_mode0_2(uint32_t mode, const uint64_t* data_chunks, color
|
|||
|
||||
color_rgba block_colors[3][8];
|
||||
|
||||
#ifdef BC7DECOMP_USE_SSE2
|
||||
for (uint32_t s = 0; s < 3; s++)
|
||||
{
|
||||
if (WEIGHT_BITS == 2)
|
||||
|
@ -698,6 +715,15 @@ static bool unpack_bc7_mode0_2(uint32_t mode, const uint64_t* data_chunks, color
|
|||
else
|
||||
bc7_interp3_sse2(endpoints + s * 2, block_colors[s]);
|
||||
}
|
||||
#else
|
||||
for (uint32_t s = 0; s < 3; s++)
|
||||
for (uint32_t i = 0; i < WEIGHT_VALS; i++)
|
||||
{
|
||||
for (uint32_t c = 0; c < 3; c++)
|
||||
block_colors[s][i][c] = static_cast<uint8_t>(bc7_interp(endpoints[s * 2 + 0][c], endpoints[s * 2 + 1][c], i, WEIGHT_BITS));
|
||||
block_colors[s][i][3] = 255;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (uint32_t i = 0; i < 16; i++)
|
||||
pPixels[i] = block_colors[g_bc7_partition3[part * 16 + i]][weights[i]];
|
||||
|
@ -705,7 +731,7 @@ static bool unpack_bc7_mode0_2(uint32_t mode, const uint64_t* data_chunks, color
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool unpack_bc7_mode1_3_7(uint32_t mode, const uint64_t* data_chunks, color_rgba* pPixels)
|
||||
bool unpack_bc7_mode1_3_7(uint32_t mode, const uint64_t* data_chunks, color_rgba* pPixels)
|
||||
{
|
||||
//const uint32_t SUBSETS = 2;
|
||||
const uint32_t ENDPOINTS = 4;
|
||||
|
@ -716,6 +742,9 @@ static bool unpack_bc7_mode1_3_7(uint32_t mode, const uint64_t* data_chunks, col
|
|||
const uint32_t ENDPOINT_MASK = (1 << ENDPOINT_BITS) - 1;
|
||||
const uint32_t PBITS = (mode == 1) ? 2 : 4;
|
||||
const uint32_t SHARED_PBITS = (mode == 1) ? true : false;
|
||||
#ifndef BC7DECOMP_USE_SSE2
|
||||
const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS;
|
||||
#endif
|
||||
|
||||
const uint64_t low_chunk = data_chunks[0];
|
||||
const uint64_t high_chunk = data_chunks[1];
|
||||
|
@ -783,6 +812,7 @@ static bool unpack_bc7_mode1_3_7(uint32_t mode, const uint64_t* data_chunks, col
|
|||
endpoints[e][c] = static_cast<uint8_t>((mode != 7U && c == 3U) ? 255 : bc7_dequant(endpoints[e][c], pbits[SHARED_PBITS ? (e >> 1) : e], ENDPOINT_BITS));
|
||||
|
||||
color_rgba block_colors[2][8];
|
||||
#ifdef BC7DECOMP_USE_SSE2
|
||||
for (uint32_t s = 0; s < 2; s++)
|
||||
{
|
||||
if (WEIGHT_BITS == 2)
|
||||
|
@ -790,6 +820,15 @@ static bool unpack_bc7_mode1_3_7(uint32_t mode, const uint64_t* data_chunks, col
|
|||
else
|
||||
bc7_interp3_sse2(endpoints + s * 2, block_colors[s]);
|
||||
}
|
||||
#else
|
||||
for (uint32_t s = 0; s < 2; s++)
|
||||
for (uint32_t i = 0; i < WEIGHT_VALS; i++)
|
||||
{
|
||||
for (uint32_t c = 0; c < COMPS; c++)
|
||||
block_colors[s][i][c] = static_cast<uint8_t>(bc7_interp(endpoints[s * 2 + 0][c], endpoints[s * 2 + 1][c], i, WEIGHT_BITS));
|
||||
block_colors[s][i][3] = (COMPS == 3) ? 255 : block_colors[s][i][3];
|
||||
}
|
||||
#endif
|
||||
|
||||
for (uint32_t i = 0; i < 16; i++)
|
||||
pPixels[i] = block_colors[g_bc7_partition2[part * 16 + i]][weights[i]];
|
||||
|
@ -797,7 +836,7 @@ static bool unpack_bc7_mode1_3_7(uint32_t mode, const uint64_t* data_chunks, col
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool unpack_bc7_mode4_5(uint32_t mode, const uint64_t* data_chunks, color_rgba* pPixels)
|
||||
bool unpack_bc7_mode4_5(uint32_t mode, const uint64_t* data_chunks, color_rgba* pPixels)
|
||||
{
|
||||
const uint32_t ENDPOINTS = 2;
|
||||
//const uint32_t COMPS = 4;
|
||||
|
@ -879,10 +918,16 @@ static bool unpack_bc7_mode4_5(uint32_t mode, const uint64_t* data_chunks, color
|
|||
endpoints[e][c] = static_cast<uint8_t>(bc7_dequant(endpoints[e][c], (c == 3) ? A_ENDPOINT_BITS : ENDPOINT_BITS));
|
||||
|
||||
color_rgba block_colors[8];
|
||||
#ifdef BC7DECOMP_USE_SSE2
|
||||
if (weight_bits[0] == 3)
|
||||
bc7_interp3_sse2(endpoints, block_colors);
|
||||
else
|
||||
bc7_interp2_sse2(endpoints, block_colors);
|
||||
#else
|
||||
for (uint32_t i = 0; i < (1U << weight_bits[0]); i++)
|
||||
for (uint32_t c = 0; c < 3; c++)
|
||||
block_colors[i][c] = static_cast<uint8_t>(bc7_interp(endpoints[0][c], endpoints[1][c], i, weight_bits[0]));
|
||||
#endif
|
||||
|
||||
for (uint32_t i = 0; i < (1U << weight_bits[1]); i++)
|
||||
block_colors[i][3] = static_cast<uint8_t>(bc7_interp(endpoints[0][3], endpoints[1][3], i, weight_bits[1]));
|
||||
|
@ -898,7 +943,6 @@ static bool unpack_bc7_mode4_5(uint32_t mode, const uint64_t* data_chunks, color
|
|||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
struct bc7_mode_6
|
||||
{
|
||||
struct
|
||||
|
@ -945,9 +989,8 @@ struct bc7_mode_6
|
|||
uint64_t m_hi_bits;
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
static bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels)
|
||||
bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels)
|
||||
{
|
||||
static_assert(sizeof(bc7_mode_6) == 16, "sizeof(bc7_mode_6) == 16");
|
||||
|
||||
|
@ -966,6 +1009,7 @@ static bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels)
|
|||
const uint32_t a1 = static_cast<uint32_t>((block.m_lo.m_a1 << 1) | block.m_hi.m_p1);
|
||||
|
||||
color_rgba vals[16];
|
||||
#ifdef BC7DECOMP_USE_SSE2
|
||||
__m128i vep0 = _mm_set_epi16((short)a0, (short)b0, (short)g0, (short)r0, (short)a0, (short)b0, (short)g0, (short)r0);
|
||||
__m128i vep1 = _mm_set_epi16((short)a1, (short)b1, (short)g1, (short)r1, (short)a1, (short)b1, (short)g1, (short)r1);
|
||||
|
||||
|
@ -983,6 +1027,18 @@ static bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels)
|
|||
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(vals + i), combined);
|
||||
}
|
||||
#else
|
||||
for (uint32_t i = 0; i < 16; i++)
|
||||
{
|
||||
const uint32_t w = g_bc7_weights4[i];
|
||||
const uint32_t iw = 64 - w;
|
||||
vals[i].set_noclamp_rgba(
|
||||
(r0 * iw + r1 * w + 32) >> 6,
|
||||
(g0 * iw + g1 * w + 32) >> 6,
|
||||
(b0 * iw + b1 * w + 32) >> 6,
|
||||
(a0 * iw + a1 * w + 32) >> 6);
|
||||
}
|
||||
#endif
|
||||
|
||||
pPixels[0] = vals[block.m_hi.m_s00];
|
||||
pPixels[1] = vals[block.m_hi.m_s10];
|
||||
|
|
Loading…
Reference in New Issue