Image: Vectorize RGBA8 conversion and add A1BGR5
This commit is contained in:
parent
9b2cf0abbd
commit
b68d7c8adb
|
@ -44,6 +44,17 @@
|
|||
#include <malloc.h> // alloca
|
||||
#endif
|
||||
|
||||
/// Helper to disable loop vectorization.
|
||||
#if defined(__clang__)
|
||||
#define DONT_VECTORIZE_THIS_LOOP _Pragma("clang loop vectorize(disable)")
|
||||
#elif defined(_MSC_VER)
|
||||
#define DONT_VECTORIZE_THIS_LOOP __pragma(loop(no_vector))
|
||||
#elif defined(__GNUC__)
|
||||
#define DONT_VECTORIZE_THIS_LOOP _Pragma("GCC novector")
|
||||
#else
|
||||
#define DONT_VECTORIZE_THIS_LOOP
|
||||
#endif
|
||||
|
||||
/// Only currently using 128-bit vectors at max.
|
||||
static constexpr u32 VECTOR_ALIGNMENT = 16;
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
|
||||
// SPDX-FileCopyrightText: 2019-2025 Connor McLaughlin <stenzek@gmail.com>
|
||||
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
|
||||
|
||||
#include "image.h"
|
||||
|
@ -583,12 +583,46 @@ std::optional<Image> Image::ConvertToRGBA8(Error* error) const
|
|||
case ImageFormat::RGB565:
|
||||
{
|
||||
ret = Image(m_width, m_height, ImageFormat::RGBA8);
|
||||
|
||||
constexpr u32 pixels_per_vec = 8;
|
||||
[[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(m_width, pixels_per_vec);
|
||||
|
||||
for (u32 y = 0; y < m_height; y++)
|
||||
{
|
||||
const u8* pixels_in = GetRowPixels(y);
|
||||
u8* pixels_out = ret->GetRowPixels(y);
|
||||
u32 x = 0;
|
||||
|
||||
for (u32 x = 0; x < m_width; x++)
|
||||
#ifdef CPU_ARCH_SIMD
|
||||
for (; x < aligned_width; x += pixels_per_vec)
|
||||
{
|
||||
GSVector4i rgb565 = GSVector4i::load<false>(pixels_in);
|
||||
pixels_in += sizeof(u16) * pixels_per_vec;
|
||||
|
||||
GSVector4i r = rgb565.srl16<11>();
|
||||
r = r.sll16<3>() | r.sll16<13>().srl16<13>();
|
||||
|
||||
GSVector4i g = rgb565.sll16<5>().srl16<10>();
|
||||
g = g.sll16<2>() | g.sll16<14>().srl16<14>();
|
||||
|
||||
GSVector4i b = rgb565.sll16<11>().srl16<11>();
|
||||
b = b.sll16<3>() | b.sll16<13>().srl16<13>();
|
||||
|
||||
const GSVector4i low =
|
||||
r.u16to32() | g.u16to32().sll32<8>() | b.u16to32().sll32<16>() | GSVector4i::cxpr(0xFF000000);
|
||||
const GSVector4i high = r.uph64().u16to32() | g.uph64().u16to32().sll32<8>() |
|
||||
b.uph64().u16to32().sll32<16>() | GSVector4i::cxpr(0xFF000000);
|
||||
|
||||
GSVector4i::store<false>(pixels_out, low);
|
||||
pixels_out += sizeof(GSVector4i);
|
||||
|
||||
GSVector4i::store<false>(pixels_out, high);
|
||||
pixels_out += sizeof(GSVector4i);
|
||||
}
|
||||
#endif
|
||||
|
||||
DONT_VECTORIZE_THIS_LOOP
|
||||
for (; x < m_width; x++)
|
||||
{
|
||||
// RGB565 -> RGBA8
|
||||
u16 pixel_in;
|
||||
|
@ -609,12 +643,48 @@ std::optional<Image> Image::ConvertToRGBA8(Error* error) const
|
|||
case ImageFormat::RGB5A1:
|
||||
{
|
||||
ret = Image(m_width, m_height, ImageFormat::RGBA8);
|
||||
|
||||
constexpr u32 pixels_per_vec = 8;
|
||||
[[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(m_width, pixels_per_vec);
|
||||
|
||||
for (u32 y = 0; y < m_height; y++)
|
||||
{
|
||||
const u8* pixels_in = GetRowPixels(y);
|
||||
u8* pixels_out = ret->GetRowPixels(y);
|
||||
u32 x = 0;
|
||||
|
||||
for (u32 x = 0; x < m_width; x++)
|
||||
#ifdef CPU_ARCH_SIMD
|
||||
for (; x < aligned_width; x += pixels_per_vec)
|
||||
{
|
||||
GSVector4i rgb5a1 = GSVector4i::load<false>(pixels_in);
|
||||
pixels_in += sizeof(u16) * pixels_per_vec;
|
||||
|
||||
GSVector4i r = rgb5a1.sll16<1>().srl16<11>();
|
||||
r = r.sll16<3>() | r.sll16<13>().srl16<13>();
|
||||
|
||||
GSVector4i g = rgb5a1.sll16<6>().srl16<11>();
|
||||
g = g.sll16<3>() | g.sll16<13>().srl16<13>();
|
||||
|
||||
GSVector4i b = rgb5a1.sll16<11>().srl16<11>();
|
||||
b = b.sll16<3>() | b.sll16<13>().srl16<13>();
|
||||
|
||||
GSVector4i a = rgb5a1.sra16<7>().srl16<8>();
|
||||
|
||||
const GSVector4i low =
|
||||
r.u16to32() | g.u16to32().sll32<8>() | b.u16to32().sll32<16>() | a.u16to32().sll32<24>();
|
||||
const GSVector4i high = r.uph64().u16to32() | g.uph64().u16to32().sll32<8>() |
|
||||
b.uph64().u16to32().sll32<16>() | a.uph64().u16to32().sll32<24>();
|
||||
|
||||
GSVector4i::store<false>(pixels_out, low);
|
||||
pixels_out += sizeof(GSVector4i);
|
||||
|
||||
GSVector4i::store<false>(pixels_out, high);
|
||||
pixels_out += sizeof(GSVector4i);
|
||||
}
|
||||
#endif
|
||||
|
||||
DONT_VECTORIZE_THIS_LOOP
|
||||
for (; x < m_width; x++)
|
||||
{
|
||||
// RGB5A1 -> RGBA8
|
||||
u16 pixel_in;
|
||||
|
@ -633,6 +703,69 @@ std::optional<Image> Image::ConvertToRGBA8(Error* error) const
|
|||
}
|
||||
break;
|
||||
|
||||
case ImageFormat::A1BGR5:
|
||||
{
|
||||
ret = Image(m_width, m_height, ImageFormat::RGBA8);
|
||||
|
||||
constexpr u32 pixels_per_vec = 8;
|
||||
[[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(m_width, pixels_per_vec);
|
||||
|
||||
for (u32 y = 0; y < m_height; y++)
|
||||
{
|
||||
const u8* pixels_in = GetRowPixels(y);
|
||||
u8* pixels_out = ret->GetRowPixels(y);
|
||||
u32 x = 0;
|
||||
|
||||
#ifdef CPU_ARCH_SIMD
|
||||
for (; x < aligned_width; x += pixels_per_vec)
|
||||
{
|
||||
GSVector4i a1bgr5 = GSVector4i::load<false>(pixels_in);
|
||||
pixels_in += sizeof(u16) * pixels_per_vec;
|
||||
|
||||
GSVector4i r = a1bgr5.srl16<11>();
|
||||
r = r.sll16<3>() | r.sll16<13>().srl16<13>();
|
||||
|
||||
GSVector4i g = a1bgr5.sll16<5>().srl16<11>();
|
||||
g = g.sll16<3>() | g.sll16<13>().srl16<13>();
|
||||
|
||||
GSVector4i b = a1bgr5.sll16<10>().srl16<11>();
|
||||
b = b.sll16<3>() | b.sll16<13>().srl16<13>();
|
||||
|
||||
GSVector4i a = a1bgr5.sll16<15>().sra16<7>().srl16<8>();
|
||||
|
||||
const GSVector4i low =
|
||||
r.u16to32() | g.u16to32().sll32<8>() | b.u16to32().sll32<16>() | a.u16to32().sll32<24>();
|
||||
const GSVector4i high = r.uph64().u16to32() | g.uph64().u16to32().sll32<8>() |
|
||||
b.uph64().u16to32().sll32<16>() | a.uph64().u16to32().sll32<24>();
|
||||
|
||||
GSVector4i::store<false>(pixels_out, low);
|
||||
pixels_out += sizeof(GSVector4i);
|
||||
|
||||
GSVector4i::store<false>(pixels_out, high);
|
||||
pixels_out += sizeof(GSVector4i);
|
||||
}
|
||||
#endif
|
||||
|
||||
DONT_VECTORIZE_THIS_LOOP
|
||||
for (; x < m_width; x++)
|
||||
{
|
||||
// RGB5A1 -> RGBA8
|
||||
u16 pixel_in;
|
||||
std::memcpy(&pixel_in, pixels_in, sizeof(u16));
|
||||
pixels_in += sizeof(u16);
|
||||
const u8 a1 = Truncate8(pixel_in & 0x01);
|
||||
const u8 r5 = Truncate8((pixel_in >> 11) & 0x1F);
|
||||
const u8 g6 = Truncate8((pixel_in >> 6) & 0x1F);
|
||||
const u8 b5 = Truncate8((pixel_in >> 1) & 0x1F);
|
||||
const u32 rgba8 = ZeroExtend32((r5 << 3) | (r5 & 7)) | (ZeroExtend32((g6 << 3) | (g6 & 7)) << 8) |
|
||||
(ZeroExtend32((b5 << 3) | (b5 & 7)) << 16) | (a1 ? 0xFF000000u : 0u);
|
||||
std::memcpy(pixels_out, &rgba8, sizeof(u32));
|
||||
pixels_out += sizeof(u32);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case ImageFormat::BGR8:
|
||||
{
|
||||
ret = Image(m_width, m_height, ImageFormat::RGBA8);
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
|
||||
// SPDX-FileCopyrightText: 2019-2025 Connor McLaughlin <stenzek@gmail.com>
|
||||
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common/align.h"
|
||||
#include "common/heap_array.h"
|
||||
#include "common/intrin.h"
|
||||
#include "common/types.h"
|
||||
|
||||
#include <cstdio>
|
||||
|
@ -60,12 +61,13 @@ public:
|
|||
ALWAYS_INLINE u32 GetHeight() const { return m_height; }
|
||||
ALWAYS_INLINE u32 GetPitch() const { return m_pitch; }
|
||||
ALWAYS_INLINE ImageFormat GetFormat() const { return m_format; }
|
||||
ALWAYS_INLINE const u8* GetPixels() const { return m_pixels.get(); }
|
||||
ALWAYS_INLINE u8* GetPixels() { return m_pixels.get(); }
|
||||
ALWAYS_INLINE const u8* GetRowPixels(u32 y) const { return &m_pixels[y * m_pitch]; }
|
||||
ALWAYS_INLINE u8* GetRowPixels(u32 y) { return &m_pixels[y * m_pitch]; }
|
||||
// ALWAYS_INLINE void SetPixel(u32 x, u32 y, PixelType pixel) { m_pixels[y * m_width + x] = pixel; }
|
||||
// ALWAYS_INLINE PixelType GetPixel(u32 x, u32 y) const { return m_pixels[y * m_width + x]; }
|
||||
ALWAYS_INLINE const u8* GetPixels() const { return std::assume_aligned<VECTOR_ALIGNMENT>(m_pixels.get()); }
|
||||
ALWAYS_INLINE u8* GetPixels() { return std::assume_aligned<VECTOR_ALIGNMENT>(m_pixels.get()); }
|
||||
ALWAYS_INLINE const u8* GetRowPixels(u32 y) const
|
||||
{
|
||||
return std::assume_aligned<VECTOR_ALIGNMENT>(&m_pixels[y * m_pitch]);
|
||||
}
|
||||
ALWAYS_INLINE u8* GetRowPixels(u32 y) { return std::assume_aligned<VECTOR_ALIGNMENT>(&m_pixels[y * m_pitch]); }
|
||||
|
||||
u32 GetBlocksWide() const;
|
||||
u32 GetBlocksHigh() const;
|
||||
|
|
Loading…
Reference in New Issue