GPU: Do a massive refactor of the manually vectorized code and add full support for AVX2.

- Most notably, each version of the manually vectorized code now resides in their own files.
- Depending on the rendering situation, the new AVX2 code may increase rendering performance by 5% to up to 50%.
- Certain functions automatically gain manual vectorization support since the new GPU code makes use of the new general-purpose copy functions that were added in commit e991b16. In other words, AVX-512 and AltiVec builds also benefit from this.
This commit is contained in:
rogerman 2021-09-06 18:00:38 -07:00
parent 4226fa7ab2
commit 0db98725dc
9 changed files with 8003 additions and 4096 deletions

4209
desmume/src/GPU.cpp Normal file → Executable file

File diff suppressed because it is too large Load Diff

View File

@ -2,7 +2,7 @@
Copyright (C) 2006 yopyop Copyright (C) 2006 yopyop
Copyright (C) 2006-2007 Theo Berkau Copyright (C) 2006-2007 Theo Berkau
Copyright (C) 2007 shash Copyright (C) 2007 shash
Copyright (C) 2009-2019 DeSmuME team Copyright (C) 2009-2021 DeSmuME team
This file is free software: you can redistribute it and/or modify This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -28,7 +28,6 @@
#include "./utils/colorspacehandler/colorspacehandler.h" #include "./utils/colorspacehandler/colorspacehandler.h"
#ifdef ENABLE_SSE2 #ifdef ENABLE_SSE2
#include <emmintrin.h>
#include "./utils/colorspacehandler/colorspacehandler_SSE2.h" #include "./utils/colorspacehandler/colorspacehandler_SSE2.h"
#endif #endif
@ -40,6 +39,14 @@
#include <smmintrin.h> #include <smmintrin.h>
#endif #endif
#ifdef ENABLE_AVX2
#include "./utils/colorspacehandler/colorspacehandler_AVX2.h"
#endif
#ifdef ENABLE_AVX512_1
#include "./utils/colorspacehandler/colorspacehandler_AVX512.h"
#endif
// Note: Technically, the shift count of palignr can be any value of [0-255]. But practically speaking, the // Note: Technically, the shift count of palignr can be any value of [0-255]. But practically speaking, the
// shift count should be a value of [0-15]. If we assume that the value range will always be [0-15], we can // shift count should be a value of [0-15]. If we assume that the value range will always be [0-15], we can
// then substitute the palignr instruction with an SSE2 equivalent. // then substitute the palignr instruction with an SSE2 equivalent.
@ -1201,8 +1208,9 @@ typedef struct
typedef struct typedef struct
{ {
u8 begin; u8 begin[GPU_FRAMEBUFFER_NATIVE_WIDTH];
u8 trunc; u8 trunc[GPU_FRAMEBUFFER_NATIVE_WIDTH];
u32 trunc32[GPU_FRAMEBUFFER_NATIVE_WIDTH];
} MosaicTableEntry; } MosaicTableEntry;
typedef struct typedef struct
@ -1272,34 +1280,21 @@ typedef struct
FragmentColor *brightnessDownTable666; FragmentColor *brightnessDownTable666;
FragmentColor *brightnessDownTable888; FragmentColor *brightnessDownTable888;
bool srcEffectEnable[6];
bool dstBlendEnable[6];
#ifdef ENABLE_SSE2
__m128i srcEffectEnable_SSE2[6];
#ifdef ENABLE_SSSE3
__m128i dstBlendEnable_SSSE3;
#else
__m128i dstBlendEnable_SSE2[6];
#endif
#endif // ENABLE_SSE2
bool dstAnyBlendEnable;
u8 WIN0_enable[6]; u8 WIN0_enable[6];
u8 WIN1_enable[6]; u8 WIN1_enable[6];
u8 WINOUT_enable[6]; u8 WINOUT_enable[6];
u8 WINOBJ_enable[6]; u8 WINOBJ_enable[6];
#if defined(ENABLE_SSE2)
__m128i WIN0_enable_SSE2[6];
__m128i WIN1_enable_SSE2[6];
__m128i WINOUT_enable_SSE2[6];
__m128i WINOBJ_enable_SSE2[6];
#endif
bool WIN0_ENABLED; bool WIN0_ENABLED;
bool WIN1_ENABLED; bool WIN1_ENABLED;
bool WINOBJ_ENABLED; bool WINOBJ_ENABLED;
bool isAnyWindowEnabled; bool isAnyWindowEnabled;
u8 srcEffectEnable[6];
u8 dstBlendEnable[6];
bool dstAnyBlendEnable;
CACHE_ALIGN u8 dstBlendEnableVecLookup[128]; // Supports up to 1024-bit vectors
MosaicTableEntry *mosaicWidthBG; MosaicTableEntry *mosaicWidthBG;
MosaicTableEntry *mosaicHeightBG; MosaicTableEntry *mosaicHeightBG;
MosaicTableEntry *mosaicWidthOBJ; MosaicTableEntry *mosaicWidthOBJ;
@ -1340,32 +1335,26 @@ typedef struct
class GPUEngineBase class GPUEngineBase
{ {
protected: protected:
static CACHE_ALIGN u16 _brightnessUpTable555[17][0x8000];
static CACHE_ALIGN FragmentColor _brightnessUpTable666[17][0x8000];
static CACHE_ALIGN FragmentColor _brightnessUpTable888[17][0x8000];
static CACHE_ALIGN u16 _brightnessDownTable555[17][0x8000];
static CACHE_ALIGN FragmentColor _brightnessDownTable666[17][0x8000];
static CACHE_ALIGN FragmentColor _brightnessDownTable888[17][0x8000];
static CACHE_ALIGN u8 _blendTable555[17][17][32][32];
static const CACHE_ALIGN SpriteSize _sprSizeTab[4][4]; static const CACHE_ALIGN SpriteSize _sprSizeTab[4][4];
static const CACHE_ALIGN BGLayerSize _BGLayerSizeLUT[8][4]; static const CACHE_ALIGN BGLayerSize _BGLayerSizeLUT[8][4];
static const CACHE_ALIGN BGType _mode2type[8][4]; static const CACHE_ALIGN BGType _mode2type[8][4];
static struct MosaicLookup static struct MosaicLookup
{ {
CACHE_ALIGN MosaicTableEntry table[16][256]; CACHE_ALIGN MosaicTableEntry table[16];
MosaicLookup() MosaicLookup()
{ {
for (size_t m = 0; m < 16; m++) for (size_t m = 0; m < 16; m++)
{ {
for (size_t i = 0; i < 256; i++) for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++)
{ {
size_t mosaic = m+1; size_t mosaic = m+1;
MosaicTableEntry &te = table[m][i]; MosaicTableEntry &te = table[m];
te.begin = ((i % mosaic) == 0);
te.trunc = (i / mosaic) * mosaic; te.begin[i] = ((i % mosaic) == 0);
te.trunc[i] = (i / mosaic) * mosaic;
te.trunc32[i] = te.trunc[i];
} }
} }
} }
@ -1413,12 +1402,12 @@ protected:
itemsForPriority_t _itemsForPriority[NB_PRIORITIES]; itemsForPriority_t _itemsForPriority[NB_PRIORITIES];
struct MosaicColor { struct MosaicColor {
u16 bg[4][256]; CACHE_ALIGN u16 bg[4][GPU_FRAMEBUFFER_NATIVE_WIDTH + sizeof(u32)]; // Pad this buffersa little bit to avoid buffer overruns with vectorized gather instructions.
struct Obj { struct Obj {
u16 color; u16 color;
u8 alpha; u8 alpha;
u8 opaque; u8 opaque;
} obj[256]; } obj[GPU_FRAMEBUFFER_NATIVE_WIDTH];
} _mosaicColors; } _mosaicColors;
GPUEngineID _engineID; GPUEngineID _engineID;
@ -1452,7 +1441,6 @@ protected:
FragmentColor _asyncClearBackdropColor32; // Do not modify this variable directly. FragmentColor _asyncClearBackdropColor32; // Do not modify this variable directly.
bool _asyncClearUseInternalCustomBuffer; // Do not modify this variable directly. bool _asyncClearUseInternalCustomBuffer; // Do not modify this variable directly.
void _InitLUTs();
void _Reset_Base(); void _Reset_Base();
void _ResortBGLayers(); void _ResortBGLayers();
@ -1466,12 +1454,19 @@ protected:
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING, PixelLookupFunc GetPixelFunc> void _RenderPixelIterate(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter &param, const u32 map, const u32 tile, const u16 *__restrict pal); template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING, PixelLookupFunc GetPixelFunc> void _RenderPixelIterate(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter &param, const u32 map, const u32 tile, const u16 *__restrict pal);
TILEENTRY _GetTileEntry(const u32 tileMapAddress, const u16 xOffset, const u16 layerWidthMask); TILEENTRY _GetTileEntry(const u32 tileMapAddress, const u16 xOffset, const u16 layerWidthMask);
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST> FORCEINLINE void _CompositePixelImmediate(GPUEngineCompositorInfo &compInfo, const size_t srcX, u16 srcColor16, bool opaque); template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST> FORCEINLINE void _CompositePixelImmediate(GPUEngineCompositorInfo &compInfo, const size_t srcX, u16 srcColor16, bool isOpaque);
template<bool ISFIRSTLINE> void _MosaicLine(GPUEngineCompositorInfo &compInfo);
template<bool MOSAIC> void _PrecompositeNativeToCustomLineBG(GPUEngineCompositorInfo &compInfo); template<bool MOSAIC> void _PrecompositeNativeToCustomLineBG(GPUEngineCompositorInfo &compInfo);
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST> void _CompositeNativeLineOBJ(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorNative16, const FragmentColor *__restrict srcColorNative32); template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST> void _CompositeNativeLineOBJ(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorNative16, const FragmentColor *__restrict srcColorNative32);
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST> void _CompositeLineDeferred(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom); template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST> void _CompositeLineDeferred(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom);
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST> void _CompositeVRAMLineDeferred(GPUEngineCompositorInfo &compInfo, const void *__restrict vramColorPtr); template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST> void _CompositeVRAMLineDeferred(GPUEngineCompositorInfo &compInfo, const void *__restrict vramColorPtr);
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST> void _CompositeNativeLineOBJ_LoopOp(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorNative16, const FragmentColor *__restrict srcColorNative32);
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST> size_t _CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom);
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST> size_t _CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const void *__restrict vramColorPtr);
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING> void _RenderLine_BGText(GPUEngineCompositorInfo &compInfo, const u16 XBG, const u16 YBG); template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING> void _RenderLine_BGText(GPUEngineCompositorInfo &compInfo, const u16 XBG, const u16 YBG);
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING> void _RenderLine_BGAffine(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter &param); template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING> void _RenderLine_BGAffine(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter &param);
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING> void _RenderLine_BGExtended(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter &param, bool &outUseCustomVRAM); template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING> void _RenderLine_BGExtended(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter &param, bool &outUseCustomVRAM);
@ -1487,79 +1482,22 @@ protected:
template<NDSColorFormat OUTPUTFORMAT> void _HandleDisplayModeOff(const size_t l); template<NDSColorFormat OUTPUTFORMAT> void _HandleDisplayModeOff(const size_t l);
template<NDSColorFormat OUTPUTFORMAT> void _HandleDisplayModeNormal(const size_t l); template<NDSColorFormat OUTPUTFORMAT> void _HandleDisplayModeNormal(const size_t l);
template<NDSColorFormat OUTPUTFORMAT> size_t _ApplyMasterBrightnessUp_LoopOp(void *__restrict dst, const size_t pixCount, const u8 intensityClamped);
template<NDSColorFormat OUTPUTFORMAT> size_t _ApplyMasterBrightnessDown_LoopOp(void *__restrict dst, const size_t pixCount, const u8 intensityClamped);
template<size_t WIN_NUM> void _UpdateWINH(GPUEngineCompositorInfo &compInfo); template<size_t WIN_NUM> void _UpdateWINH(GPUEngineCompositorInfo &compInfo);
template<size_t WIN_NUM> bool _IsWindowInsideVerticalRange(GPUEngineCompositorInfo &compInfo); template<size_t WIN_NUM> bool _IsWindowInsideVerticalRange(GPUEngineCompositorInfo &compInfo);
void _PerformWindowTesting(GPUEngineCompositorInfo &compInfo); void _PerformWindowTesting(GPUEngineCompositorInfo &compInfo);
void _PerformWindowTestingNative(GPUEngineCompositorInfo &compInfo, const size_t layerID, const u8 *__restrict win0, const u8 *__restrict win1, const u8 *__restrict winObj, u8 *__restrict didPassWindowTestNative, u8 *__restrict enableColorEffectNative);
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING> FORCEINLINE void _RenderLine_LayerBG_Final(GPUEngineCompositorInfo &compInfo); template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING> FORCEINLINE void _RenderLine_LayerBG_Final(GPUEngineCompositorInfo &compInfo);
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST> FORCEINLINE void _RenderLine_LayerBG_ApplyMosaic(GPUEngineCompositorInfo &compInfo); template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST> FORCEINLINE void _RenderLine_LayerBG_ApplyMosaic(GPUEngineCompositorInfo &compInfo);
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST> void _RenderLine_LayerBG(GPUEngineCompositorInfo &compInfo); template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST> void _RenderLine_LayerBG(GPUEngineCompositorInfo &compInfo);
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST> void _RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, itemsForPriority_t *__restrict item); template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST> void _RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, itemsForPriority_t *__restrict item);
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _PixelCopy(GPUEngineCompositorInfo &compInfo, const u16 srcColor16);
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _PixelCopy(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32);
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _PixelBrightnessUp(GPUEngineCompositorInfo &compInfo, const u16 srcColor16);
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _PixelBrightnessUp(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32);
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _PixelBrightnessDown(GPUEngineCompositorInfo &compInfo, const u16 srcColor16);
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _PixelBrightnessDown(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32);
template<NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE> FORCEINLINE void _PixelUnknownEffect(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const bool enableColorEffect, const u8 spriteAlpha, const OBJMode spriteMode);
template<NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE> FORCEINLINE void _PixelUnknownEffect(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32, const bool enableColorEffect, const u8 spriteAlpha, const OBJMode spriteMode);
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE> FORCEINLINE void _PixelComposite(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const bool enableColorEffect, const u8 spriteAlpha, const u8 spriteMode);
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE> FORCEINLINE void _PixelComposite(GPUEngineCompositorInfo &compInfo, FragmentColor srcColor32, const bool enableColorEffect, const u8 spriteAlpha, const u8 spriteMode);
FORCEINLINE u16 _ColorEffectBlend(const u16 colA, const u16 colB, const u16 blendEVA, const u16 blendEVB);
FORCEINLINE u16 _ColorEffectBlend(const u16 colA, const u16 colB, const TBlendTable *blendTable);
template<NDSColorFormat COLORFORMAT> FORCEINLINE FragmentColor _ColorEffectBlend(const FragmentColor colA, const FragmentColor colB, const u16 blendEVA, const u16 blendEVB);
FORCEINLINE u16 _ColorEffectBlend3D(const FragmentColor colA, const u16 colB);
template<NDSColorFormat COLORFORMATB> FORCEINLINE FragmentColor _ColorEffectBlend3D(const FragmentColor colA, const FragmentColor colB);
FORCEINLINE u16 _ColorEffectIncreaseBrightness(const u16 col, const u16 blendEVY);
template<NDSColorFormat COLORFORMAT> FORCEINLINE FragmentColor _ColorEffectIncreaseBrightness(const FragmentColor col, const u16 blendEVY);
FORCEINLINE u16 _ColorEffectDecreaseBrightness(const u16 col, const u16 blendEVY);
FORCEINLINE FragmentColor _ColorEffectDecreaseBrightness(const FragmentColor col, const u16 blendEVY);
#ifdef ENABLE_SSE2
template<NDSColorFormat COLORFORMAT, bool USECONSTANTBLENDVALUESHINT> FORCEINLINE __m128i _ColorEffectBlend(const __m128i &colA, const __m128i &colB, const __m128i &blendEVA, const __m128i &blendEVB);
template<NDSColorFormat COLORFORMATB> FORCEINLINE __m128i _ColorEffectBlend3D(const __m128i &colA_Lo, const __m128i &colA_Hi, const __m128i &colB);
template<NDSColorFormat COLORFORMAT> FORCEINLINE __m128i _ColorEffectIncreaseBrightness(const __m128i &col, const __m128i &blendEVY);
template<NDSColorFormat COLORFORMAT> FORCEINLINE __m128i _ColorEffectDecreaseBrightness(const __m128i &col, const __m128i &blendEVY);
template<bool WILLDEFERCOMPOSITING> FORCEINLINE void _RenderPixel_CheckWindows16_SSE2(GPUEngineCompositorInfo &compInfo, const size_t dstX, __m128i &didPassWindowTest, __m128i &enableColorEffect) const;
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _PixelCopy16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID);
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _PixelCopyWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID);
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _PixelBrightnessUp16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID);
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _PixelBrightnessUpWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID);
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _PixelBrightnessDown16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID);
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _PixelBrightnessDownWithMask16_SSE2(GPUEngineCompositorInfo &compInfo, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0, __m128i &dstLayerID);
template<NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
FORCEINLINE void _PixelUnknownEffectWithMask16_SSE2(GPUEngineCompositorInfo &compInfo,
const __m128i &passMask8,
const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
const __m128i &srcEffectEnableMask,
const __m128i &enableColorEffectMask,
const __m128i &spriteAlpha,
const __m128i &spriteMode,
__m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0,
__m128i &dstLayerID);
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST>
FORCEINLINE void _PixelComposite16_SSE2(GPUEngineCompositorInfo &compInfo,
const bool didAllPixelsPass,
const __m128i &passMask8,
const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
const __m128i &srcEffectEnableMask,
const u8 *__restrict enableColorEffectPtr,
const u8 *__restrict sprAlphaPtr,
const u8 *__restrict sprModePtr);
#endif
template<bool ISDEBUGRENDER, bool ISOBJMODEBITMAP> FORCEINLINE void _RenderSpriteUpdatePixel(GPUEngineCompositorInfo &compInfo, size_t frameX, const u16 *__restrict srcPalette, const u8 palIndex, const OBJMode objMode, const u8 prio, const u8 spriteNum, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab); template<bool ISDEBUGRENDER, bool ISOBJMODEBITMAP> FORCEINLINE void _RenderSpriteUpdatePixel(GPUEngineCompositorInfo &compInfo, size_t frameX, const u16 *__restrict srcPalette, const u8 palIndex, const OBJMode objMode, const u8 prio, const u8 spriteNum, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab);
template<bool ISDEBUGRENDER> void _RenderSpriteBMP(GPUEngineCompositorInfo &compInfo, const u32 objAddress, const size_t length, size_t frameX, size_t spriteX, const s32 readXStep, const u8 spriteAlpha, const OBJMode objMode, const u8 prio, const u8 spriteNum, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab); template<bool ISDEBUGRENDER> void _RenderSpriteBMP(GPUEngineCompositorInfo &compInfo, const u32 objAddress, const size_t length, size_t frameX, size_t spriteX, const s32 readXStep, const u8 spriteAlpha, const OBJMode objMode, const u8 prio, const u8 spriteNum, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab);
template<bool ISDEBUGRENDER> size_t _RenderSpriteBMP_LoopOp(const size_t length, const u8 spriteAlpha, const u8 prio, const u8 spriteNum, const u16 *__restrict vramBuffer, size_t &frameX, size_t &spriteX, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab);
template<bool ISDEBUGRENDER> void _RenderSprite256(GPUEngineCompositorInfo &compInfo, const u32 objAddress, const size_t length, size_t frameX, size_t spriteX, const s32 readXStep, const u16 *__restrict palColorBuffer, const OBJMode objMode, const u8 prio, const u8 spriteNum, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab); template<bool ISDEBUGRENDER> void _RenderSprite256(GPUEngineCompositorInfo &compInfo, const u32 objAddress, const size_t length, size_t frameX, size_t spriteX, const s32 readXStep, const u16 *__restrict palColorBuffer, const OBJMode objMode, const u8 prio, const u8 spriteNum, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab);
template<bool ISDEBUGRENDER> void _RenderSprite16(GPUEngineCompositorInfo &compInfo, const u32 objAddress, const size_t length, size_t frameX, size_t spriteX, const s32 readXStep, const u16 *__restrict palColorBuffer, const OBJMode objMode, const u8 prio, const u8 spriteNum, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab); template<bool ISDEBUGRENDER> void _RenderSprite16(GPUEngineCompositorInfo &compInfo, const u32 objAddress, const size_t length, size_t frameX, size_t spriteX, const s32 readXStep, const u16 *__restrict palColorBuffer, const OBJMode objMode, const u8 prio, const u8 spriteNum, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab);
void _RenderSpriteWin(const u8 *src, const bool col256, const size_t lg, size_t sprX, size_t x, const s32 xdir); void _RenderSpriteWin(const u8 *src, const bool col256, const size_t lg, size_t sprX, size_t x, const s32 xdir);
@ -1703,14 +1641,16 @@ protected:
u16 _RenderLine_DispCapture_BlendFunc(const u16 srcA, const u16 srcB, const u8 blendEVA, const u8 blendEVB); u16 _RenderLine_DispCapture_BlendFunc(const u16 srcA, const u16 srcB, const u8 blendEVA, const u8 blendEVB);
template<NDSColorFormat COLORFORMAT> FragmentColor _RenderLine_DispCapture_BlendFunc(const FragmentColor srcA, const FragmentColor srcB, const u8 blendEVA, const u8 blendEVB); template<NDSColorFormat COLORFORMAT> FragmentColor _RenderLine_DispCapture_BlendFunc(const FragmentColor srcA, const FragmentColor srcB, const u8 blendEVA, const u8 blendEVB);
#ifdef ENABLE_SSE2 template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST>
template<NDSColorFormat COLORFORMAT> __m128i _RenderLine_DispCapture_BlendFunc_SSE2(const __m128i &srcA, const __m128i &srcB, const __m128i &blendEVA, const __m128i &blendEVB); size_t _RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, const FragmentColor *__restrict srcLinePtr);
#endif
template<NDSColorFormat OUTPUTFORMAT> template<NDSColorFormat OUTPUTFORMAT>
void _RenderLine_DispCapture_BlendToCustomDstBuffer(const void *srcA, const void *srcB, void *dst, const u8 blendEVA, const u8 blendEVB, const size_t length); // Do not use restrict pointers, since srcB and dst can be the same void _RenderLine_DispCapture_Blend_Buffer(const void *srcA, const void *srcB, void *dst, const u8 blendEVA, const u8 blendEVB, const size_t pixCount); // Do not use restrict pointers, since srcB and dst can be the same
template<NDSColorFormat OUTPUTFORMAT, size_t CAPTURELENGTH, bool CAPTUREFROMNATIVESRCA, bool CAPTUREFROMNATIVESRCB, bool CAPTURETONATIVEDST> template<NDSColorFormat OUTPUTFORMAT>
size_t _RenderLine_DispCapture_Blend_VecLoop(const void *srcA, const void *srcB, void *dst, const u8 blendEVA, const u8 blendEVB, const size_t length);
template<NDSColorFormat OUTPUTFORMAT, size_t CAPTURELENGTH, bool ISCAPTURENATIVE>
void _RenderLine_DispCapture_Blend(const GPUEngineLineInfo &lineInfo, const void *srcA, const void *srcB, void *dst, const size_t captureLengthExt); // Do not use restrict pointers, since srcB and dst can be the same void _RenderLine_DispCapture_Blend(const GPUEngineLineInfo &lineInfo, const void *srcA, const void *srcB, void *dst, const size_t captureLengthExt); // Do not use restrict pointers, since srcB and dst can be the same
template<NDSColorFormat OUTPUTFORMAT> void _HandleDisplayModeVRAM(const GPUEngineLineInfo &lineInfo); template<NDSColorFormat OUTPUTFORMAT> void _HandleDisplayModeVRAM(const GPUEngineLineInfo &lineInfo);
@ -1986,20 +1926,6 @@ public:
void SetClientData(void *clientData); void SetClientData(void *clientData);
}; };
template <s32 INTEGERSCALEHINT, bool SCALEVERTICAL, bool USELINEINDEX, bool NEEDENDIANSWAP, size_t ELEMENTSIZE>
void CopyLineExpandHinted(const void *__restrict srcBuffer, const size_t srcLineIndex,
void *__restrict dstBuffer, const size_t dstLineIndex, const size_t dstLineWidth, const size_t dstLineCount);
template <s32 INTEGERSCALEHINT, bool SCALEVERTICAL, bool USELINEINDEX, bool NEEDENDIANSWAP, size_t ELEMENTSIZE>
void CopyLineExpandHinted(const GPUEngineLineInfo &lineInfo, const void *__restrict srcBuffer, void *__restrict dstBuffer);
template <s32 INTEGERSCALEHINT, bool USELINEINDEX, bool NEEDENDIANSWAP, size_t ELEMENTSIZE>
void CopyLineReduceHinted(const void *__restrict srcBuffer, const size_t srcLineIndex, const size_t srcLineWidth,
void *__restrict dstBuffer, const size_t dstLineIndex);
template <s32 INTEGERSCALEHINT, bool USELINEINDEX, bool NEEDENDIANSWAP, size_t ELEMENTSIZE>
void CopyLineReduceHinted(const GPUEngineLineInfo &lineInfo, const void *__restrict srcBuffer, void *__restrict dstBuffer);
extern GPUSubsystem *GPU; extern GPUSubsystem *GPU;
extern MMU_struct MMU; extern MMU_struct MMU;

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,96 @@
/*
Copyright (C) 2021 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with the this software. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef GPU_OPERATIONS_H
#define GPU_OPERATIONS_H
#include <stdio.h>
#include "types.h"
#include "./utils/colorspacehandler/colorspacehandler.h"
#include "GPU.h"
template <s32 INTEGERSCALEHINT, bool SCALEVERTICAL, bool USELINEINDEX, bool NEEDENDIANSWAP, size_t ELEMENTSIZE>
void CopyLineExpandHinted(const void *__restrict srcBuffer, const size_t srcLineIndex,
void *__restrict dstBuffer, const size_t dstLineIndex, const size_t dstLineWidth, const size_t dstLineCount);
template <s32 INTEGERSCALEHINT, bool SCALEVERTICAL, bool USELINEINDEX, bool NEEDENDIANSWAP, size_t ELEMENTSIZE>
void CopyLineExpandHinted(const GPUEngineLineInfo &lineInfo, const void *__restrict srcBuffer, void *__restrict dstBuffer);
template <s32 INTEGERSCALEHINT, bool USELINEINDEX, bool NEEDENDIANSWAP, size_t ELEMENTSIZE>
void CopyLineReduceHinted(const void *__restrict srcBuffer, const size_t srcLineIndex, const size_t srcLineWidth,
void *__restrict dstBuffer, const size_t dstLineIndex);
template <s32 INTEGERSCALEHINT, bool USELINEINDEX, bool NEEDENDIANSWAP, size_t ELEMENTSIZE>
void CopyLineReduceHinted(const GPUEngineLineInfo &lineInfo, const void *__restrict srcBuffer, void *__restrict dstBuffer);
class ColorOperation
{
public:
ColorOperation() {};
FORCEINLINE u16 blend(const u16 colA, const u16 colB, const u16 blendEVA, const u16 blendEVB) const;
FORCEINLINE u16 blend(const u16 colA, const u16 colB, const TBlendTable *blendTable) const;
template<NDSColorFormat COLORFORMAT> FORCEINLINE FragmentColor blend(const FragmentColor colA, const FragmentColor colB, const u16 blendEVA, const u16 blendEVB) const;
FORCEINLINE u16 blend3D(const FragmentColor colA, const u16 colB) const;
template<NDSColorFormat COLORFORMAT> FORCEINLINE FragmentColor blend3D(const FragmentColor colA, const FragmentColor colB) const;
FORCEINLINE u16 increase(const u16 col, const u16 blendEVY) const;
template<NDSColorFormat COLORFORMAT> FORCEINLINE FragmentColor increase(const FragmentColor col, const u16 blendEVY) const;
FORCEINLINE u16 decrease(const u16 col, const u16 blendEVY) const;
template<NDSColorFormat COLORFORMAT> FORCEINLINE FragmentColor decrease(const FragmentColor col, const u16 blendEVY) const;
};
class PixelOperation
{
private:
template<GPULayerType LAYERTYPE> FORCEINLINE void __selectedEffect(const GPUEngineCompositorInfo &compInfo, const u8 &dstLayerID, const bool enableColorEffect, const u8 spriteAlpha, const OBJMode spriteMode, ColorEffect &selectedEffect, TBlendTable **selectedBlendTable, u8 &blendEVA, u8 &blendEVB) const;
protected:
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy16(GPUEngineCompositorInfo &compInfo, const u16 srcColor16) const;
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy32(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp16(GPUEngineCompositorInfo &compInfo, const u16 srcColor16) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp32(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown16(GPUEngineCompositorInfo &compInfo, const u16 srcColor16) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown32(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32) const;
template<NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE> FORCEINLINE void _unknownEffect16(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const bool enableColorEffect, const u8 spriteAlpha, const OBJMode spriteMode) const;
template<NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE> FORCEINLINE void _unknownEffect32(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32, const bool enableColorEffect, const u8 spriteAlpha, const OBJMode spriteMode) const;
public:
static CACHE_ALIGN u8 BlendTable555[17][17][32][32];
static CACHE_ALIGN u16 BrightnessUpTable555[17][0x8000];
static CACHE_ALIGN FragmentColor BrightnessUpTable666[17][0x8000];
static CACHE_ALIGN FragmentColor BrightnessUpTable888[17][0x8000];
static CACHE_ALIGN u16 BrightnessDownTable555[17][0x8000];
static CACHE_ALIGN FragmentColor BrightnessDownTable666[17][0x8000];
static CACHE_ALIGN FragmentColor BrightnessDownTable888[17][0x8000];
static void InitLUTs();
PixelOperation() {};
template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE> FORCEINLINE void Composite16(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const bool enableColorEffect, const u8 spriteAlpha, const u8 spriteMode) const;
template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE> FORCEINLINE void Composite32(GPUEngineCompositorInfo &compInfo, FragmentColor srcColor32, const bool enableColorEffect, const u8 spriteAlpha, const u8 spriteMode) const;
};
#endif // GPU_OPERATIONS_H

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,122 @@
/*
Copyright (C) 2021 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with the this software. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef GPU_OPERATIONS_AVX2_H
#define GPU_OPERATIONS_AVX2_H
#include "GPU_Operations.h"
#ifndef ENABLE_AVX2
#warning This header requires AVX2 support.
#else
class ColorOperation_AVX2
{
public:
ColorOperation_AVX2() {};
FORCEINLINE v256u16 blend(const v256u16 &colA, const v256u16 &colB, const v256u16 &blendEVA, const v256u16 &blendEVB) const;
template<NDSColorFormat COLORFORMAT, bool USECONSTANTBLENDVALUESHINT> FORCEINLINE v256u32 blend(const v256u32 &colA, const v256u32 &colB, const v256u16 &blendEVA, const v256u16 &blendEVB) const;
FORCEINLINE v256u16 blend3D(const v256u32 &colA_Lo, const v256u32 &colA_Hi, const v256u16 &colB) const;
template<NDSColorFormat COLORFORMAT> FORCEINLINE v256u32 blend3D(const v256u32 &colA, const v256u32 &colB) const;
FORCEINLINE v256u16 increase(const v256u16 &col, const v256u16 &blendEVY) const;
template<NDSColorFormat COLORFORMAT> FORCEINLINE v256u32 increase(const v256u32 &col, const v256u16 &blendEVY) const;
FORCEINLINE v256u16 decrease(const v256u16 &col, const v256u16 &blendEVY) const;
template<NDSColorFormat COLORFORMAT> FORCEINLINE v256u32 decrease(const v256u32 &col, const v256u16 &blendEVY) const;
};
class PixelOperation_AVX2
{
protected:
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy16(GPUEngineCompositorInfo &compInfo, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy32(GPUEngineCompositorInfo &compInfo, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copyMask16(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copyMask32(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp16(GPUEngineCompositorInfo &compInfo, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp32(GPUEngineCompositorInfo &compInfo, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUpMask16(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUpMask32(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown16(GPUEngineCompositorInfo &compInfo, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown32(GPUEngineCompositorInfo &compInfo, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDownMask16(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u16 &src1, const v256u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDownMask32(GPUEngineCompositorInfo &compInfo, const v256u8 &passMask8, const v256u16 &evy16, const v256u8 &srcLayerID, const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
FORCEINLINE void _unknownEffectMask16(GPUEngineCompositorInfo &compInfo,
const v256u8 &passMask8,
const v256u16 &evy16,
const v256u8 &srcLayerID,
const v256u16 &src1, const v256u16 &src0,
const v256u8 &srcEffectEnableMask,
const v256u8 &dstBlendEnableMaskLUT,
const v256u8 &enableColorEffectMask,
const v256u8 &spriteAlpha,
const v256u8 &spriteMode) const;
template<NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
FORCEINLINE void _unknownEffectMask32(GPUEngineCompositorInfo &compInfo,
const v256u8 &passMask8,
const v256u16 &evy16,
const v256u8 &srcLayerID,
const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0,
const v256u8 &srcEffectEnableMask,
const v256u8 &dstBlendEnableMaskLUT,
const v256u8 &enableColorEffectMask,
const v256u8 &spriteAlpha,
const v256u8 &spriteMode) const;
public:
PixelOperation_AVX2() {};
template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST>
FORCEINLINE void Composite16(GPUEngineCompositorInfo &compInfo,
const bool didAllPixelsPass,
const v256u8 &passMask8,
const v256u16 &evy16,
const v256u8 &srcLayerID,
const v256u16 &src1, const v256u16 &src0,
const v256u8 &srcEffectEnableMask,
const v256u8 &dstBlendEnableMaskLUT,
const u8 *__restrict enableColorEffectPtr,
const u8 *__restrict sprAlphaPtr,
const u8 *__restrict sprModePtr) const;
template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST>
FORCEINLINE void Composite32(GPUEngineCompositorInfo &compInfo,
const bool didAllPixelsPass,
const v256u8 &passMask8,
const v256u16 &evy16,
const v256u8 &srcLayerID,
const v256u32 &src3, const v256u32 &src2, const v256u32 &src1, const v256u32 &src0,
const v256u8 &srcEffectEnableMask,
const v256u8 &dstBlendEnableMaskLUT,
const u8 *__restrict enableColorEffectPtr,
const u8 *__restrict sprAlphaPtr,
const u8 *__restrict sprModePtr) const;
};
#endif // ENABLE_AVX2
#endif // GPU_OPERATIONS_AVX2_H

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,122 @@
/*
Copyright (C) 2021 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with the this software. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef GPU_OPERATIONS_SSE2_H
#define GPU_OPERATIONS_SSE2_H
#include "GPU_Operations.h"
#ifndef ENABLE_SSE2
#warning This header requires SSE2 support.
#else
class ColorOperation_SSE2
{
public:
ColorOperation_SSE2() {};
FORCEINLINE v128u16 blend(const v128u16 &colA, const v128u16 &colB, const v128u16 &blendEVA, const v128u16 &blendEVB) const;
template<NDSColorFormat COLORFORMAT, bool USECONSTANTBLENDVALUESHINT> FORCEINLINE v128u32 blend(const v128u32 &colA, const v128u32 &colB, const v128u16 &blendEVA, const v128u16 &blendEVB) const;
FORCEINLINE v128u16 blend3D(const v128u32 &colA_Lo, const v128u32 &colA_Hi, const v128u16 &colB) const;
template<NDSColorFormat COLORFORMAT> FORCEINLINE v128u32 blend3D(const v128u32 &colA, const v128u32 &colB) const;
FORCEINLINE v128u16 increase(const v128u16 &col, const v128u16 &blendEVY) const;
template<NDSColorFormat COLORFORMAT> FORCEINLINE v128u32 increase(const v128u32 &col, const v128u16 &blendEVY) const;
FORCEINLINE v128u16 decrease(const v128u16 &col, const v128u16 &blendEVY) const;
template<NDSColorFormat COLORFORMAT> FORCEINLINE v128u32 decrease(const v128u32 &col, const v128u16 &blendEVY) const;
};
class PixelOperation_SSE2
{
protected:
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy16(GPUEngineCompositorInfo &compInfo, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy32(GPUEngineCompositorInfo &compInfo, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copyMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copyMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp16(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp32(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUpMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUpMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown16(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown32(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDownMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDownMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
FORCEINLINE void _unknownEffectMask16(GPUEngineCompositorInfo &compInfo,
const v128u8 &passMask8,
const v128u16 &evy16,
const v128u8 &srcLayerID,
const v128u16 &src1, const v128u16 &src0,
const v128u8 &srcEffectEnableMask,
const v128u8 &dstBlendEnableMaskLUT,
const v128u8 &enableColorEffectMask,
const v128u8 &spriteAlpha,
const v128u8 &spriteMode) const;
template<NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
FORCEINLINE void _unknownEffectMask32(GPUEngineCompositorInfo &compInfo,
const v128u8 &passMask8,
const v128u16 &evy16,
const v128u8 &srcLayerID,
const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0,
const v128u8 &srcEffectEnableMask,
const v128u8 &dstBlendEnableMaskLUT,
const v128u8 &enableColorEffectMask,
const v128u8 &spriteAlpha,
const v128u8 &spriteMode) const;
public:
PixelOperation_SSE2() {};
template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST>
FORCEINLINE void Composite16(GPUEngineCompositorInfo &compInfo,
const bool didAllPixelsPass,
const v128u8 &passMask8,
const v128u16 &evy16,
const v128u8 &srcLayerID,
const v128u16 &src1, const v128u16 &src0,
const v128u8 &srcEffectEnableMask,
const v128u8 &dstBlendEnableMaskLUT,
const u8 *__restrict enableColorEffectPtr,
const u8 *__restrict sprAlphaPtr,
const u8 *__restrict sprModePtr) const;
template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST>
FORCEINLINE void Composite32(GPUEngineCompositorInfo &compInfo,
const bool didAllPixelsPass,
const v128u8 &passMask8,
const v128u16 &evy16,
const v128u8 &srcLayerID,
const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0,
const v128u8 &srcEffectEnableMask,
const v128u8 &dstBlendEnableMaskLUT,
const u8 *__restrict enableColorEffectPtr,
const u8 *__restrict sprAlphaPtr,
const u8 *__restrict sprModePtr) const;
};
#endif // ENABLE_SSE2
#endif // GPU_OPERATIONS_SSE2_H

View File

@ -46,7 +46,7 @@
#include "driver.h" #include "driver.h"
#include "emufile.h" #include "emufile.h"
#include "matrix.h" #include "matrix.h"
#include "GPU.h" #include "GPU_Operations.h"
#include "MMU.h" #include "MMU.h"
#include "render3D.h" #include "render3D.h"
#include "mem.h" #include "mem.h"