- Unify all colorspace conversion code.
- Fix bug with VRAM-to-VRAM capture.

OpenGL Renderer:
- Try and fix a possible bug with applying fog to transparent fragments.
This commit is contained in:
rogerman 2016-06-17 04:22:51 +00:00
parent b543e309c5
commit f8e0585d26
13 changed files with 718 additions and 832 deletions

View File

@ -49,6 +49,64 @@
u32 Render3DFramesPerSecond; u32 Render3DFramesPerSecond;
CACHE_ALIGN u32 color_555_to_6665_opaque[32768];
CACHE_ALIGN u32 color_555_to_666[32768];
CACHE_ALIGN u32 color_555_to_8888_opaque[32768];
CACHE_ALIGN u32 color_555_to_888[32768];
//is this a crazy idea? this table spreads 5 bits evenly over 31 from exactly 0 to INT_MAX
// Lookup table indexed by a 5-bit value (0..31) yielding a 31-bit value.
// The entries correspond to the 5-bit pattern repeated until 31 bits are filled
// (e.g. index 0x01 -> 0x04210842, index 0x10 -> 0x42108421, index 0x1F -> 0x7FFFFFFF).
CACHE_ALIGN const u32 material_5bit_to_31bit[] = {
0x00000000, 0x04210842, 0x08421084, 0x0C6318C6,
0x10842108, 0x14A5294A, 0x18C6318C, 0x1CE739CE,
0x21084210, 0x25294A52, 0x294A5294, 0x2D6B5AD6,
0x318C6318, 0x35AD6B5A, 0x39CE739C, 0x3DEF7BDE,
0x42108421, 0x46318C63, 0x4A5294A5, 0x4E739CE7,
0x5294A529, 0x56B5AD6B, 0x5AD6B5AD, 0x5EF7BDEF,
0x6318C631, 0x6739CE73, 0x6B5AD6B5, 0x6F7BDEF7,
0x739CE739, 0x77BDEF7B, 0x7BDEF7BD, 0x7FFFFFFF
};
// 5-bit to 6-bit conversions use this formula -- dst = (src == 0) ? 0 : (2*src) + 1
// Reference GBATEK: http://problemkaputt.de/gbatek.htm#ds3dtextureblending
// Lookup table indexed by a 5-bit value (0..31). Zero stays zero; every other
// input maps to an odd 6-bit value, with the maximum input 0x1F mapping to 0x3F.
CACHE_ALIGN const u8 material_5bit_to_6bit[] = {
0x00, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F,
0x21, 0x23, 0x25, 0x27, 0x29, 0x2B, 0x2D, 0x2F,
0x31, 0x33, 0x35, 0x37, 0x39, 0x3B, 0x3D, 0x3F
};
// Lookup table indexed by a 5-bit value (0..31) yielding an 8-bit value.
// Each entry equals (src << 3) | (src >> 2), i.e. the top three bits of the
// input are replicated into the low bits so that 0x00 -> 0x00 and 0x1F -> 0xFF.
CACHE_ALIGN const u8 material_5bit_to_8bit[] = {
0x00, 0x08, 0x10, 0x18, 0x21, 0x29, 0x31, 0x39,
0x42, 0x4A, 0x52, 0x5A, 0x63, 0x6B, 0x73, 0x7B,
0x84, 0x8C, 0x94, 0x9C, 0xA5, 0xAD, 0xB5, 0xBD,
0xC6, 0xCE, 0xD6, 0xDE, 0xE7, 0xEF, 0xF7, 0xFF
};
// Lookup table indexed by a 6-bit value (0..63) yielding an 8-bit value.
// Each entry equals (src << 2) | (src >> 4), i.e. the top two bits of the
// input are replicated into the low bits so that 0x00 -> 0x00 and 0x3F -> 0xFF.
CACHE_ALIGN const u8 material_6bit_to_8bit[] = {
0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C,
0x20, 0x24, 0x28, 0x2C, 0x30, 0x34, 0x38, 0x3C,
0x41, 0x45, 0x49, 0x4D, 0x51, 0x55, 0x59, 0x5D,
0x61, 0x65, 0x69, 0x6D, 0x71, 0x75, 0x79, 0x7D,
0x82, 0x86, 0x8A, 0x8E, 0x92, 0x96, 0x9A, 0x9E,
0xA2, 0xA6, 0xAA, 0xAE, 0xB2, 0xB6, 0xBA, 0xBE,
0xC3, 0xC7, 0xCB, 0xCF, 0xD3, 0xD7, 0xDB, 0xDF,
0xE3, 0xE7, 0xEB, 0xEF, 0xF3, 0xF7, 0xFB, 0xFF
};
// Lookup table indexed by a 3-bit value (0..7) yielding an 8-bit value.
// Each entry equals (src << 5) | (src << 2) | (src >> 1), i.e. the 3-bit
// pattern repeated across the byte so that 0 -> 0x00 and 7 -> 0xFF.
CACHE_ALIGN const u8 material_3bit_to_8bit[] = {
0x00, 0x24, 0x49, 0x6D, 0x92, 0xB6, 0xDB, 0xFF
};
//maybe not very precise
// Lookup table indexed by a 3-bit value (0..7) yielding a 5-bit value.
// The entries equal (src * 31) / 7 with the fraction truncated, so indices
// 2, 4 and 6 (8, 17, 26) sit one below the nearest-rounded values (9, 18, 27).
CACHE_ALIGN const u8 material_3bit_to_5bit[] = {
0, 4, 8, 13, 17, 22, 26, 31
};
//TODO - generate this in the static init method more accurately
// Lookup table indexed by a 3-bit value (0..7) yielding a 6-bit value.
// Entries 0..6 are exactly twice the matching material_3bit_to_5bit entries and
// the last entry is pinned to the 6-bit maximum (63). An exact linear scale
// would be src * 9 (0, 9, 18, 27, 36, 45, 54, 63), hence the TODO above.
CACHE_ALIGN const u8 material_3bit_to_6bit[] = {
0, 8, 16, 26, 34, 44, 52, 63
};
//instantiate static instance //instantiate static instance
u16 GPUEngineBase::_fadeInColors[17][0x8000]; u16 GPUEngineBase::_fadeInColors[17][0x8000];
u16 GPUEngineBase::_fadeOutColors[17][0x8000]; u16 GPUEngineBase::_fadeOutColors[17][0x8000];
@ -869,9 +927,12 @@ FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectDecreaseBrightness(const Fr
b = col.b; b = col.b;
} }
if ( (INPUTFORMAT != NDSColorFormat_BGR555_Rev) && (OUTPUTFORMAT != NDSColorFormat_BGR555_Rev) )
{
newColor.r = (r - (r * blendEVY / 16)); newColor.r = (r - (r * blendEVY / 16));
newColor.g = (g - (g * blendEVY / 16)); newColor.g = (g - (g * blendEVY / 16));
newColor.b = (b - (b * blendEVY / 16)); newColor.b = (b - (b * blendEVY / 16));
}
return newColor; return newColor;
} }
@ -1166,29 +1227,13 @@ void GPUEngineBase::_RenderLine_Clear(const u16 clearColor, const u16 l, void *r
break; break;
case NDSColorFormat_BGR666_Rev: case NDSColorFormat_BGR666_Rev:
{ memset_u32_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>(renderLineTarget, COLOR555TO666(dstClearColor16));
FragmentColor dstClearColor32;
dstClearColor32.r = material_5bit_to_6bit[(dstClearColor16 >> 0) & 0x001F];
dstClearColor32.g = material_5bit_to_6bit[(dstClearColor16 >> 5) & 0x001F];
dstClearColor32.b = material_5bit_to_6bit[(dstClearColor16 >> 10) & 0x001F];
dstClearColor32.a = 0;
memset_u32_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>(renderLineTarget, dstClearColor32.color);
break; break;
}
case NDSColorFormat_BGR888_Rev: case NDSColorFormat_BGR888_Rev:
{ memset_u32_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>(renderLineTarget, COLOR555TO888(dstClearColor16));
FragmentColor dstClearColor32;
dstClearColor32.r = material_5bit_to_8bit[(dstClearColor16 >> 0) & 0x001F];
dstClearColor32.g = material_5bit_to_8bit[(dstClearColor16 >> 5) & 0x001F];
dstClearColor32.b = material_5bit_to_8bit[(dstClearColor16 >> 10) & 0x001F];
dstClearColor32.a = 0;
memset_u32_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>(renderLineTarget, dstClearColor32.color);
break; break;
} }
}
memset(this->_renderLineLayerIDNative, GPULayerID_Backdrop, GPU_FRAMEBUFFER_NATIVE_WIDTH); memset(this->_renderLineLayerIDNative, GPULayerID_Backdrop, GPU_FRAMEBUFFER_NATIVE_WIDTH);
@ -1915,6 +1960,8 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(const size_t srcX, const u16 src, c
ColorEffect selectedEffect = ColorEffect_Disable; ColorEffect selectedEffect = ColorEffect_Disable;
TBlendTable *selectedBlendTable = this->_blendTable; TBlendTable *selectedBlendTable = this->_blendTable;
u8 blendEVA = this->_BLDALPHA_EVA;
u8 blendEVB = this->_BLDALPHA_EVB;
if (enableColorEffect) if (enableColorEffect)
{ {
@ -1963,9 +2010,9 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(const size_t srcX, const u16 src, c
//it's tested by the spriteblend demo and the glory of heracles title screen //it's tested by the spriteblend demo and the glory of heracles title screen
if (srcAlpha != 0xFF) if (srcAlpha != 0xFF)
{ {
const u8 BLDALPHA_EVA = srcAlpha; blendEVA = srcAlpha;
const u8 BLDALPHA_EVB = 16 - srcAlpha; blendEVB = 16 - srcAlpha;
selectedBlendTable = &GPUEngineBase::_blendTable555[BLDALPHA_EVA][BLDALPHA_EVB]; selectedBlendTable = &GPUEngineBase::_blendTable555[blendEVA][blendEVB];
} }
forceBlendEffect = true; forceBlendEffect = true;
@ -2001,28 +2048,28 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(const size_t srcX, const u16 src, c
} }
// Render the pixel using the selected color effect. // Render the pixel using the selected color effect.
u16 finalDstColor; u16 finalDstColor16;
switch (selectedEffect) switch (selectedEffect)
{ {
case ColorEffect_Disable: case ColorEffect_Disable:
finalDstColor = src; finalDstColor16 = src;
break; break;
case ColorEffect_IncreaseBrightness: case ColorEffect_IncreaseBrightness:
finalDstColor = this->_ColorEffectIncreaseBrightness(src & 0x7FFF); finalDstColor16 = this->_ColorEffectIncreaseBrightness(src & 0x7FFF);
break; break;
case ColorEffect_DecreaseBrightness: case ColorEffect_DecreaseBrightness:
finalDstColor = this->_ColorEffectDecreaseBrightness(src & 0x7FFF); finalDstColor16 = this->_ColorEffectDecreaseBrightness(src & 0x7FFF);
break; break;
case ColorEffect_Blend: case ColorEffect_Blend:
finalDstColor = this->_ColorEffectBlend(src, *(u16 *)dstColorLine, selectedBlendTable); finalDstColor16 = this->_ColorEffectBlend(src, *(u16 *)dstColorLine, selectedBlendTable);
break; break;
} }
*(u16 *)dstColorLine = finalDstColor | 0x8000; *(u16 *)dstColorLine = finalDstColor16 | 0x8000;
*dstLayerIDLine = LAYERID; *dstLayerIDLine = LAYERID;
} }
@ -2428,28 +2475,28 @@ FORCEINLINE void GPUEngineBase::_RenderPixel3D(const size_t srcX, const Fragment
// Render the pixel using the selected color effect. // Render the pixel using the selected color effect.
const u16 srcRGB555 = R6G6B6TORGB15(src.r, src.g, src.b); const u16 srcRGB555 = R6G6B6TORGB15(src.r, src.g, src.b);
u16 finalDstColor; u16 finalDstColor16;
switch (selectedEffect) switch (selectedEffect)
{ {
case ColorEffect_Disable: case ColorEffect_Disable:
finalDstColor = srcRGB555; finalDstColor16 = srcRGB555;
break; break;
case ColorEffect_IncreaseBrightness: case ColorEffect_IncreaseBrightness:
finalDstColor = this->_ColorEffectIncreaseBrightness(srcRGB555); finalDstColor16 = this->_ColorEffectIncreaseBrightness(srcRGB555);
break; break;
case ColorEffect_DecreaseBrightness: case ColorEffect_DecreaseBrightness:
finalDstColor = this->_ColorEffectDecreaseBrightness(srcRGB555); finalDstColor16 = this->_ColorEffectDecreaseBrightness(srcRGB555);
break; break;
case ColorEffect_Blend: case ColorEffect_Blend:
finalDstColor = this->_ColorEffectBlend3D(src, *(u16 *)dstColorLine); finalDstColor16 = this->_ColorEffectBlend3D(src, *(u16 *)dstColorLine);
break; break;
} }
*(u16 *)dstColorLine = finalDstColor | 0x8000; *(u16 *)dstColorLine = finalDstColor16 | 0x8000;
*dstLayerIDLine = GPULayerID_BG0; *dstLayerIDLine = GPULayerID_BG0;
} }
@ -3923,8 +3970,6 @@ template <bool ISFULLINTENSITYHINT>
void GPUEngineBase::ApplyMasterBrightness() void GPUEngineBase::ApplyMasterBrightness()
{ {
const NDSColorFormat outputFormat = GPU->GetDisplayInfo().colorFormat; const NDSColorFormat outputFormat = GPU->GetDisplayInfo().colorFormat;
const size_t pixBytes = GPU->GetDisplayInfo().pixelBytes;
const IOREG_MASTER_BRIGHT &MASTER_BRIGHT = this->_IORegisterMap->MASTER_BRIGHT; const IOREG_MASTER_BRIGHT &MASTER_BRIGHT = this->_IORegisterMap->MASTER_BRIGHT;
const u32 intensity = MASTER_BRIGHT.Intensity; const u32 intensity = MASTER_BRIGHT.Intensity;
@ -3993,15 +4038,15 @@ void GPUEngineBase::ApplyMasterBrightness()
switch (outputFormat) switch (outputFormat)
{ {
case NDSColorFormat_BGR555_Rev: case NDSColorFormat_BGR555_Rev:
memset_u16(dst, 0x7FFF, pixCount); memset_u16(dst, 0xFFFF, pixCount);
break; break;
case NDSColorFormat_BGR666_Rev: case NDSColorFormat_BGR666_Rev:
memset_u32(dst, 0x003F3F3F, pixCount); memset_u32(dst, 0x1F3F3F3F, pixCount);
break; break;
case NDSColorFormat_BGR888_Rev: case NDSColorFormat_BGR888_Rev:
memset_u32(dst, 0x00FFFFFF, pixCount); memset_u32(dst, 0xFFFFFFFF, pixCount);
break; break;
default: default:
@ -4063,7 +4108,23 @@ void GPUEngineBase::ApplyMasterBrightness()
else else
{ {
// all black (optimization) // all black (optimization)
memset(dst, 0, pixCount * pixBytes); switch (outputFormat)
{
case NDSColorFormat_BGR555_Rev:
memset_u16(dst, 0x8000, pixCount);
break;
case NDSColorFormat_BGR666_Rev:
memset_u32(dst, 0x1F000000, pixCount);
break;
case NDSColorFormat_BGR888_Rev:
memset_u32(dst, 0xFF000000, pixCount);
break;
default:
break;
}
} }
break; break;
} }
@ -4500,33 +4561,7 @@ void GPUEngineBase::ResolveCustomRendering()
void GPUEngineBase::ResolveRGB666ToRGB888() void GPUEngineBase::ResolveRGB666ToRGB888()
{ {
size_t i = 0; ConvertColorBuffers6665To8888<false>((FragmentColor *)this->renderedBuffer, (FragmentColor *)this->renderedBuffer, this->renderedWidth * this->renderedHeight);
const size_t pixCount = this->renderedWidth * this->renderedHeight;
FragmentColor *buffer = (FragmentColor *)this->renderedBuffer;
#ifdef ENABLE_SSE2
const size_t ssePixCount = pixCount - (pixCount % 4);
for (; i < ssePixCount; i += 4)
{
// Convert to RGBA8888:
// RGB 6-bit to 8-bit formula: dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03)
// Alpha 5-bit to 8-bit formula: dstA8 = (srcA5 << 3) | ((srcA5 >> 2) & 0x07)
__m128i color8888 = _mm_load_si128((__m128i *)(buffer + i));
__m128i a = _mm_or_si128( _mm_and_si128(_mm_slli_epi32(color8888, 3), _mm_set1_epi8(0xF8)), _mm_and_si128(_mm_srli_epi32(color8888, 2), _mm_set1_epi8(0x07)) );
color8888 = _mm_or_si128( _mm_and_si128(_mm_slli_epi32(color8888, 2), _mm_set1_epi8(0xFC)), _mm_and_si128(_mm_srli_epi32(color8888, 4), _mm_set1_epi8(0x03)) );
color8888 = _mm_or_si128(_mm_and_si128(color8888, _mm_set1_epi32(0x00FFFFFF)), _mm_and_si128(a, _mm_set1_epi32(0xFF000000)));
_mm_store_si128((__m128i *)(buffer + i), color8888);
}
#endif
for (; i < pixCount; i++)
{
buffer[i].r = material_6bit_to_8bit[buffer[i].r];
buffer[i].g = material_6bit_to_8bit[buffer[i].g];
buffer[i].b = material_6bit_to_8bit[buffer[i].b];
buffer[i].a = material_5bit_to_8bit[buffer[i].a];
}
} }
void GPUEngineBase::ResolveToCustomFramebuffer() void GPUEngineBase::ResolveToCustomFramebuffer()
@ -5263,7 +5298,22 @@ void GPUEngineA::_RenderLine_DisplayCapture(const void *renderedLineSrcA, const
{ {
case 0: // Capture VRAM case 0: // Capture VRAM
{ {
this->VerifyVRAMLineDidChange(vramReadBlock, readLineIndexWithOffset); const bool didVRAMLineChange = this->VerifyVRAMLineDidChange(vramReadBlock, readLineIndexWithOffset);
if (didVRAMLineChange)
{
if (vramConfiguration.banks[vramReadBlock].purpose == VramConfiguration::LCDC)
{
u32 cap_src_adr = readLineIndexWithOffset * GPU_FRAMEBUFFER_NATIVE_WIDTH;
cap_src_adr &= 0x0000FFFF;
cap_src = this->_VRAMNativeBlockPtr[vramReadBlock] + cap_src_adr;
}
else
{
cap_src = (u16 *)MMU.blank_memory;
}
srcB = cap_src;
}
if (this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset]) if (this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset])
{ {
@ -5292,7 +5342,25 @@ void GPUEngineA::_RenderLine_DisplayCapture(const void *renderedLineSrcA, const
default: // Capture source is SourceA+B blended default: // Capture source is SourceA+B blended
{ {
//INFO("Capture source is SourceA+B blended\n"); //INFO("Capture source is SourceA+B blended\n");
this->VerifyVRAMLineDidChange(vramReadBlock, readLineIndexWithOffset); if (DISPCAPCNT.SrcB == 0)
{
const bool didVRAMLineChange = this->VerifyVRAMLineDidChange(vramReadBlock, readLineIndexWithOffset);
if (didVRAMLineChange)
{
if (vramConfiguration.banks[vramReadBlock].purpose == VramConfiguration::LCDC)
{
u32 cap_src_adr = readLineIndexWithOffset * GPU_FRAMEBUFFER_NATIVE_WIDTH;
cap_src_adr &= 0x0000FFFF;
cap_src = this->_VRAMNativeBlockPtr[vramReadBlock] + cap_src_adr;
}
else
{
cap_src = (u16 *)MMU.blank_memory;
}
srcB = cap_src;
}
}
if (DISPCAPCNT.SrcA == 0) if (DISPCAPCNT.SrcA == 0)
{ {
@ -5642,17 +5710,17 @@ u16 GPUEngineA::_RenderLine_DispCapture_BlendFunc(const u16 srcA, const u16 srcB
if (a_alpha) if (a_alpha)
{ {
a = 0x8000; a = 0x8000;
r = ((srcA & 0x1F) * blendEVA); r = ((srcA & 0x001F) * blendEVA);
g = (((srcA >> 5) & 0x1F) * blendEVA); g = (((srcA >> 5) & 0x001F) * blendEVA);
b = (((srcA >> 10) & 0x1F) * blendEVA); b = (((srcA >> 10) & 0x001F) * blendEVA);
} }
if (b_alpha) if (b_alpha)
{ {
a = 0x8000; a = 0x8000;
r += ((srcB & 0x1F) * blendEVB); r += ((srcB & 0x001F) * blendEVB);
g += (((srcB >> 5) & 0x1F) * blendEVB); g += (((srcB >> 5) & 0x001F) * blendEVB);
b += (((srcB >> 10) & 0x1F) * blendEVB); b += (((srcB >> 10) & 0x001F) * blendEVB);
} }
r >>= 4; r >>= 4;
@ -5660,9 +5728,9 @@ u16 GPUEngineA::_RenderLine_DispCapture_BlendFunc(const u16 srcA, const u16 srcB
b >>= 4; b >>= 4;
//freedom wings sky will overflow while doing some fsaa/motionblur effect without this //freedom wings sky will overflow while doing some fsaa/motionblur effect without this
r = std::min((u16)31,r); r = std::min<u16>(0x001F, r);
g = std::min((u16)31,g); g = std::min<u16>(0x001F, g);
b = std::min((u16)31,b); b = std::min<u16>(0x001F, b);
return LOCAL_TO_LE_16(a | (b << 10) | (g << 5) | r); return LOCAL_TO_LE_16(a | (b << 10) | (g << 5) | r);
} }
@ -5729,7 +5797,6 @@ void GPUEngineA::_RenderLine_DispCapture_BlendToCustomDstBuffer(const u16 *srcA,
size_t i = 0; size_t i = 0;
#ifdef ENABLE_SSE2 #ifdef ENABLE_SSE2
const size_t ssePixCount = length - (length % 8); const size_t ssePixCount = length - (length % 8);
for (; i < ssePixCount; i += 8) for (; i < ssePixCount; i += 8)
{ {
@ -5754,6 +5821,7 @@ void GPUEngineA::_RenderLine_DispCapture_BlendToCustomDstBuffer(const u16 *srcA,
_mm_store_si128( (__m128i *)(dst + i), this->_RenderLine_DispCapture_BlendFunc_SSE2(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) ); _mm_store_si128( (__m128i *)(dst + i), this->_RenderLine_DispCapture_BlendFunc_SSE2(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) );
} }
#endif #endif
for (; i < length; i++) for (; i < length; i++)
{ {
const u16 colorA = (!CAPTUREFROMNATIVESRCA) ? srcA[i] : srcA[offset + i]; const u16 colorA = (!CAPTUREFROMNATIVESRCA) ? srcA[i] : srcA[offset + i];
@ -5849,10 +5917,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l)
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++) for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++)
{ {
dst[i].r = material_5bit_to_6bit[(src[i] >> 0) & 0x001F]; dst[i].color = COLOR555TO6665_OPAQUE(src[i] & 0x7FFF);
dst[i].g = material_5bit_to_6bit[(src[i] >> 5) & 0x001F];
dst[i].b = material_5bit_to_6bit[(src[i] >> 10) & 0x001F];
dst[i].a = 0;
} }
break; break;
} }
@ -5864,10 +5929,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l)
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++) for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++)
{ {
dst[i].r = material_5bit_to_8bit[(src[i] >> 0) & 0x001F]; dst[i].color = COLOR555TO8888_OPAQUE(src[i] & 0x7FFF);
dst[i].g = material_5bit_to_8bit[(src[i] >> 5) & 0x001F];
dst[i].b = material_5bit_to_8bit[(src[i] >> 10) & 0x001F];
dst[i].a = 0;
} }
break; break;
} }
@ -5891,10 +5953,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l)
for (size_t i = 0; i < customPixCount; i++) for (size_t i = 0; i < customPixCount; i++)
{ {
dst[i].r = material_5bit_to_6bit[(src[i] >> 0) & 0x001F]; dst[i].color = COLOR555TO6665_OPAQUE(src[i] & 0x7FFF);
dst[i].g = material_5bit_to_6bit[(src[i] >> 5) & 0x001F];
dst[i].b = material_5bit_to_6bit[(src[i] >> 10) & 0x001F];
dst[i].a = 0;
} }
break; break;
} }
@ -5906,10 +5965,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l)
for (size_t i = 0; i < customPixCount; i++) for (size_t i = 0; i < customPixCount; i++)
{ {
dst[i].r = material_5bit_to_8bit[(src[i] >> 0) & 0x001F]; dst[i].color = COLOR555TO8888_OPAQUE(src[i] & 0x7FFF);
dst[i].g = material_5bit_to_8bit[(src[i] >> 5) & 0x001F];
dst[i].b = material_5bit_to_8bit[(src[i] >> 10) & 0x001F];
dst[i].a = 0;
} }
break; break;
} }
@ -5959,16 +6015,8 @@ void GPUEngineA::_HandleDisplayModeMainMemory(const size_t l)
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=2) for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=2)
{ {
u32 src = DISP_FIFOrecv(); u32 src = DISP_FIFOrecv();
dst[i+0].color = COLOR555TO6665_OPAQUE((src >> 0) & 0x7FFF);
dst[i+0].r = material_5bit_to_6bit[(src >> 0) & 0x0000001F]; dst[i+1].color = COLOR555TO6665_OPAQUE((src >> 16) & 0x7FFF);
dst[i+0].g = material_5bit_to_6bit[(src >> 5) & 0x0000001F];
dst[i+0].b = material_5bit_to_6bit[(src >> 10) & 0x0000001F];
dst[i+0].a = 0;
dst[i+1].r = material_5bit_to_6bit[(src >> 16) & 0x0000001F];
dst[i+1].g = material_5bit_to_6bit[(src >> 21) & 0x0000001F];
dst[i+1].b = material_5bit_to_6bit[(src >> 26) & 0x0000001F];
dst[i+1].a = 0;
} }
break; break;
} }
@ -5980,16 +6028,8 @@ void GPUEngineA::_HandleDisplayModeMainMemory(const size_t l)
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=2) for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=2)
{ {
u32 src = DISP_FIFOrecv(); u32 src = DISP_FIFOrecv();
dst[i+0].color = COLOR555TO8888_OPAQUE((src >> 0) & 0x7FFF);
dst[i+0].r = material_5bit_to_8bit[(src >> 0) & 0x0000001F]; dst[i+1].color = COLOR555TO8888_OPAQUE((src >> 16) & 0x7FFF);
dst[i+0].g = material_5bit_to_8bit[(src >> 5) & 0x0000001F];
dst[i+0].b = material_5bit_to_8bit[(src >> 10) & 0x0000001F];
dst[i+0].a = 0;
dst[i+1].r = material_5bit_to_8bit[(src >> 16) & 0x0000001F];
dst[i+1].g = material_5bit_to_8bit[(src >> 21) & 0x0000001F];
dst[i+1].b = material_5bit_to_8bit[(src >> 26) & 0x0000001F];
dst[i+1].a = 0;
} }
break; break;
} }
@ -6278,6 +6318,24 @@ void* GPUEngineB::_RenderLine_Layers(const u16 l)
GPUSubsystem::GPUSubsystem() GPUSubsystem::GPUSubsystem()
{ {
static bool needInitTables = true;
if (needInitTables)
{
#define RGB15TO18_BITLOGIC(col) ( (material_5bit_to_6bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | material_5bit_to_6bit[(col)&0x1F] )
#define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] )
for (size_t i = 0; i < 32768; i++)
{
color_555_to_666[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) );
color_555_to_6665_opaque[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) | 0x1F000000 );
color_555_to_888[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) );
color_555_to_8888_opaque[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) | 0xFF000000 );
}
needInitTables = false;
}
_defaultEventHandler = new GPUEventHandlerDefault; _defaultEventHandler = new GPUEventHandlerDefault;
_event = _defaultEventHandler; _event = _defaultEventHandler;
@ -6953,17 +7011,11 @@ void GPUSubsystem::ClearWithColor(const u16 colorBGRA5551)
break; break;
case NDSColorFormat_BGR666_Rev: case NDSColorFormat_BGR666_Rev:
color32.r = material_5bit_to_6bit[(colorBGRA5551 & 0x001F)]; color32.color = COLOR555TO6665_OPAQUE(colorBGRA5551 & 0x7FFF);
color32.g = material_5bit_to_6bit[(colorBGRA5551 & 0x03E0) >> 5];
color32.b = material_5bit_to_6bit[(colorBGRA5551 & 0x7C00) >> 10];
color32.a = 0xFF;
break; break;
case NDSColorFormat_BGR888_Rev: case NDSColorFormat_BGR888_Rev:
color32.r = material_5bit_to_8bit[(colorBGRA5551 & 0x001F)]; color32.color = COLOR555TO8888_OPAQUE(colorBGRA5551 & 0x7FFF);
color32.g = material_5bit_to_8bit[(colorBGRA5551 & 0x03E0) >> 5];
color32.b = material_5bit_to_8bit[(colorBGRA5551 & 0x7C00) >> 10];
color32.a = 0xFF;
break; break;
default: default:
@ -7026,6 +7078,82 @@ void NDSDisplay::SetEngineByID(const GPUEngineID theID)
this->_gpu->SetDisplayByID(this->_ID); this->_gpu->SetDisplayByID(this->_ID);
} }
template <bool SWAP_RB>
void ConvertColorBuffers8888To6665(const FragmentColor *src, FragmentColor *dst, size_t pixCount)
{
size_t i = 0;
#ifdef ENABLE_SSE2
const size_t ssePixCount = pixCount - (pixCount % 4);
for (; i < ssePixCount; i += 4)
{
_mm_store_si128( (__m128i *)(dst + i), ConvertColor8888To6665<SWAP_RB>(_mm_load_si128((__m128i *)(src + i))) );
}
#endif
for (; i < pixCount; i++)
{
dst[i] = ConvertColor8888To6665<SWAP_RB>(src[i]);
}
}
template <bool SWAP_RB>
void ConvertColorBuffers6665To8888(const FragmentColor *src, FragmentColor *dst, size_t pixCount)
{
size_t i = 0;
#ifdef ENABLE_SSE2
const size_t ssePixCount = pixCount - (pixCount % 4);
for (; i < ssePixCount; i += 4)
{
_mm_store_si128( (__m128i *)(dst + i), ConvertColor6665To8888<SWAP_RB>(_mm_load_si128((__m128i *)(src + i))) );
}
#endif
for (; i < pixCount; i++)
{
dst[i] = ConvertColor6665To8888<SWAP_RB>(src[i]);
}
}
template <bool SWAP_RB>
void ConvertColorBuffers8888To5551(const FragmentColor *__restrict src, u16 *__restrict dst, size_t pixCount)
{
size_t i = 0;
#ifdef ENABLE_SSE2
const size_t ssePixCount = pixCount - (pixCount % 8);
for (; i < ssePixCount; i += 8)
{
_mm_store_si128( (__m128i *)(dst + i), ConvertColor8888To5551<SWAP_RB>(_mm_load_si128((__m128i *)(src + i)), _mm_load_si128((__m128i *)(src + i + 4))) );
}
#endif
for (; i < pixCount; i++)
{
dst[i] = ConvertColor8888To5551<SWAP_RB>(src[i]);
}
}
template <bool SWAP_RB>
void ConvertColorBuffers6665To5551(const FragmentColor *__restrict src, u16 *__restrict dst, size_t pixCount)
{
size_t i = 0;
#ifdef ENABLE_SSE2
const size_t ssePixCount = pixCount - (pixCount % 8);
for (; i < ssePixCount; i += 8)
{
_mm_store_si128( (__m128i *)(dst + i), ConvertColor6665To5551<SWAP_RB>(_mm_load_si128((__m128i *)(src + i)), _mm_load_si128((__m128i *)(src + i + 4))) );
}
#endif
for (; i < pixCount; i++)
{
dst[i] = ConvertColor6665To5551<SWAP_RB>(src[i]);
}
}
template void GPUEngineBase::ParseReg_DISPCNT<GPUEngineID_Main>(); template void GPUEngineBase::ParseReg_DISPCNT<GPUEngineID_Main>();
template void GPUEngineBase::ParseReg_DISPCNT<GPUEngineID_Sub>(); template void GPUEngineBase::ParseReg_DISPCNT<GPUEngineID_Sub>();
@ -7061,3 +7189,15 @@ template void GPUEngineBase::RenderLayerBG<GPULayerID_BG0>(u16 *dstColorBuffer);
template void GPUEngineBase::RenderLayerBG<GPULayerID_BG1>(u16 *dstColorBuffer); template void GPUEngineBase::RenderLayerBG<GPULayerID_BG1>(u16 *dstColorBuffer);
template void GPUEngineBase::RenderLayerBG<GPULayerID_BG2>(u16 *dstColorBuffer); template void GPUEngineBase::RenderLayerBG<GPULayerID_BG2>(u16 *dstColorBuffer);
template void GPUEngineBase::RenderLayerBG<GPULayerID_BG3>(u16 *dstColorBuffer); template void GPUEngineBase::RenderLayerBG<GPULayerID_BG3>(u16 *dstColorBuffer);
template void ConvertColorBuffers8888To6665<true>(const FragmentColor *src, FragmentColor *dst, size_t pixCount);
template void ConvertColorBuffers8888To6665<false>(const FragmentColor *src, FragmentColor *dst, size_t pixCount);
template void ConvertColorBuffers6665To8888<true>(const FragmentColor *src, FragmentColor *dst, size_t pixCount);
template void ConvertColorBuffers6665To8888<false>(const FragmentColor *src, FragmentColor *dst, size_t pixCount);
template void ConvertColorBuffers8888To5551<true>(const FragmentColor *__restrict src, u16 *__restrict dst, size_t pixCount);
template void ConvertColorBuffers8888To5551<false>(const FragmentColor *__restrict src, u16 *__restrict dst, size_t pixCount);
template void ConvertColorBuffers6665To5551<true>(const FragmentColor *__restrict src, u16 *__restrict dst, size_t pixCount);
template void ConvertColorBuffers6665To5551<false>(const FragmentColor *__restrict src, u16 *__restrict dst, size_t pixCount);

View File

@ -1629,6 +1629,43 @@ public:
extern GPUSubsystem *GPU; extern GPUSubsystem *GPU;
extern MMU_struct MMU; extern MMU_struct MMU;
extern CACHE_ALIGN const u32 material_5bit_to_31bit[32];
extern CACHE_ALIGN const u8 material_5bit_to_6bit[32];
extern CACHE_ALIGN const u8 material_5bit_to_8bit[32];
extern CACHE_ALIGN const u8 material_6bit_to_8bit[64];
extern CACHE_ALIGN const u8 material_3bit_to_5bit[8];
extern CACHE_ALIGN const u8 material_3bit_to_6bit[8];
extern CACHE_ALIGN const u8 material_3bit_to_8bit[8];
extern CACHE_ALIGN u32 color_555_to_6665_opaque[32768];
extern CACHE_ALIGN u32 color_555_to_666[32768];
extern CACHE_ALIGN u32 color_555_to_8888_opaque[32768];
extern CACHE_ALIGN u32 color_555_to_888[32768];
#define COLOR555TO6665_OPAQUE(col) (color_555_to_6665_opaque[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color
#define COLOR555TO666(col) (color_555_to_666[(col)]) // Convert a 15-bit color to a fully transparent sparsely packed 32-bit color containing an RGBA6665 color
#ifdef LOCAL_LE
#define COLOR555TO6665(col,alpha5) (((alpha5)<<24) | color_555_to_666[(col)]) // Convert a 15-bit color to a sparsely packed 32-bit color containing an RGBA6665 color with user-defined alpha, little-endian
#else
#define COLOR555TO6665(col,alpha5) ((alpha5) | color_555_to_666[(col)]) // Convert a 15-bit color to a sparsely packed 32-bit color containing an RGBA6665 color with user-defined alpha, big-endian
#endif
#define COLOR555TO8888_OPAQUE(col) (color_555_to_8888_opaque[(col)]) // Convert a 15-bit color to an opaque 32-bit color
#define COLOR555TO888(col) (color_555_to_888[(col)]) // Convert a 15-bit color to an opaque 24-bit color or a fully transparent 32-bit color
#ifdef LOCAL_LE
#define COLOR555TO8888(col,alpha8) (((alpha8)<<24) | color_555_to_888[(col)]) // Convert a 15-bit color to a 32-bit color with user-defined alpha, little-endian
#else
#define COLOR555TO8888(col,alpha8) ((alpha8) | color_555_to_888[(col)]) // Convert a 15-bit color to a 32-bit color with user-defined alpha, big-endian
#endif
//produce a 15bpp color from individual 5bit components
//(r lands in bits 0-4, g in bits 5-9, b in bits 10-14; arguments are not masked)
#define R5G5B5TORGB15(r,g,b) ( (r) | ((g)<<5) | ((b)<<10) )
//produce a 15bpp color from individual 6bit components by dropping the lowest bit of each
//(g and b are masked with 0x3E before shifting; r is only shifted, so it must already fit in 6 bits)
#define R6G6B6TORGB15(r,g,b) ( ((r)>>1) | (((g)&0x3E)<<4) | (((b)&0x3E)<<9) )
inline FragmentColor MakeFragmentColor(const u8 r, const u8 g, const u8 b, const u8 a) inline FragmentColor MakeFragmentColor(const u8 r, const u8 g, const u8 b, const u8 a)
{ {
FragmentColor ret; FragmentColor ret;
@ -1636,4 +1673,214 @@ inline FragmentColor MakeFragmentColor(const u8 r, const u8 g, const u8 b, const
return ret; return ret;
} }
// Scalar conversion of one pixel: r, g and b are reduced from 8 to 6 bits
// (>> 2) and a from 8 to 5 bits (>> 3). When SWAP_RB is true the r and b
// channels of the source are exchanged in the result.
template <bool SWAP_RB>
FORCEINLINE FragmentColor ConvertColor8888To6665(FragmentColor srcColor)
{
	FragmentColor converted;
	
	if (SWAP_RB)
	{
		converted.r = srcColor.b >> 2;
		converted.b = srcColor.r >> 2;
	}
	else
	{
		converted.r = srcColor.r >> 2;
		converted.b = srcColor.b >> 2;
	}
	
	converted.g = srcColor.g >> 2;
	converted.a = srcColor.a >> 3;
	
	return converted;
}
// Scalar conversion of one pixel: r, g and b are expanded through
// material_6bit_to_8bit and a through material_5bit_to_8bit. The channel values
// are used directly as table indices, so r/g/b must be below 64 and a below 32.
// When SWAP_RB is true the r and b channels of the source are exchanged.
template <bool SWAP_RB>
FORCEINLINE FragmentColor ConvertColor6665To8888(FragmentColor srcColor)
{
	FragmentColor converted;
	
	if (SWAP_RB)
	{
		converted.r = material_6bit_to_8bit[srcColor.b];
		converted.b = material_6bit_to_8bit[srcColor.r];
	}
	else
	{
		converted.r = material_6bit_to_8bit[srcColor.r];
		converted.b = material_6bit_to_8bit[srcColor.b];
	}
	
	converted.g = material_6bit_to_8bit[srcColor.g];
	converted.a = material_5bit_to_8bit[srcColor.a];
	
	return converted;
}
// Scalar conversion of one pixel to a 16-bit color: each of r, g, b is reduced
// to 5 bits (>> 3) and packed with R5G5B5TORGB15; bit 15 is set whenever the
// source alpha is non-zero. When SWAP_RB is true the r and b channels of the
// source are exchanged before packing.
template <bool SWAP_RB>
FORCEINLINE u16 ConvertColor8888To5551(FragmentColor srcColor)
{
	const u16 alphaBit = (srcColor.a == 0) ? 0x0000 : 0x8000;
	
	if (SWAP_RB)
	{
		return R5G5B5TORGB15(srcColor.b >> 3, srcColor.g >> 3, srcColor.r >> 3) | alphaBit;
	}
	
	return R5G5B5TORGB15(srcColor.r >> 3, srcColor.g >> 3, srcColor.b >> 3) | alphaBit;
}
// Scalar conversion of one pixel to a 16-bit color: r, g, b are packed with
// R6G6B6TORGB15 (which drops the lowest bit of each 6-bit channel); bit 15 is
// set whenever the source alpha is non-zero. When SWAP_RB is true the r and b
// channels of the source are exchanged before packing.
template <bool SWAP_RB>
FORCEINLINE u16 ConvertColor6665To5551(FragmentColor srcColor)
{
	const u16 alphaBit = (srcColor.a == 0) ? 0x0000 : 0x8000;
	
	if (SWAP_RB)
	{
		return R6G6B6TORGB15(srcColor.b, srcColor.g, srcColor.r) | alphaBit;
	}
	
	return R6G6B6TORGB15(srcColor.r, srcColor.g, srcColor.b) | alphaBit;
}
#ifdef ENABLE_SSE2
// SSE2 conversion of four packed 32-bit pixels at once.
// Per 32-bit lane: bytes 0..2 (the color channels) are reduced from 8 to 6 bits
// (>> 2) and byte 3 (alpha) from 8 to 5 bits (>> 3), matching the scalar
// ConvertColor8888To6665(). When SWAP_RB is true, bytes 0 and 2 of every lane
// are exchanged in the result.
template <bool SWAP_RB>
FORCEINLINE __m128i ConvertColor8888To6665(const __m128i src)
{
	__m128i rgb;
	
	// Alpha: shift the whole lane right by 3 and keep only the 5 bits that
	// originated from the top of the alpha byte.
	const __m128i a = _mm_and_si128( _mm_srli_epi32(src, 3), _mm_set1_epi32(0x1F000000) );
	
	if (SWAP_RB)
	{
#ifdef ENABLE_SSSE3
		// Reduce first, then exchange bytes 0 and 2 of each lane with a byte shuffle.
		rgb = _mm_and_si128( _mm_srli_epi32(src, 2), _mm_set1_epi32(0x003F3F3F) );
		rgb = _mm_shuffle_epi8( rgb, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2) );
#else
		// Plain SSE2 fallback: isolate the TOP six bits of each 8-bit channel
		// (mask 0xFC per byte) and move them straight to their swapped position.
		//   byte 2 -> byte 0 : (>> 16 for the swap) + (>> 2 for the reduction) = >> 18
		//   byte 1 -> byte 1 :                        (>> 2 for the reduction) = >> 2
		//   byte 0 -> byte 2 : (<< 16 for the swap) - (>> 2 for the reduction) = << 14
		// Masking with 0x3F per byte here would select the low six bits instead and
		// lose the two most significant bits of every channel.
		rgb = _mm_or_si128( _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00FC0000)), 18), _mm_or_si128(_mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x0000FC00)), 2), _mm_slli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x000000FC)), 14)) );
#endif
	}
	else
	{
		rgb = _mm_and_si128( _mm_srli_epi32(src, 2), _mm_set1_epi32(0x003F3F3F) );
	}
	
	return _mm_or_si128(rgb, a);
}
// SSE2 conversion of four packed 32-bit pixels at once.
// Per 32-bit lane: bytes 0..2 (the color channels) are expanded from 6 to 8 bits
// and byte 3 (alpha) from 5 to 8 bits. When SWAP_RB is true, bytes 0 and 2 of
// every lane are exchanged in the result.
template <bool SWAP_RB>
FORCEINLINE __m128i ConvertColor6665To8888(const __m128i src)
{
	// Conversion algorithm:
	// RGB 6-bit to 8-bit formula: dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03)
	// Alpha 5-bit to 8-bit formula: dstA8 = (srcA5 << 3) | ((srcA5 >> 2) & 0x07)
	__m128i rgb = _mm_or_si128( _mm_and_si128(_mm_slli_epi32(src, 2), _mm_set1_epi32(0x00FCFCFC)), _mm_and_si128(_mm_srli_epi32(src, 4), _mm_set1_epi32(0x00030303)) );
	const __m128i a = _mm_or_si128( _mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0xF8000000)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x07000000)) );
	
	if (SWAP_RB)
	{
#ifdef ENABLE_SSSE3
		// Exchange bytes 0 and 2 of each lane with a byte shuffle.
		rgb = _mm_shuffle_epi8( rgb, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2) );
#else
		// Plain SSE2 fallback: exchange bytes 0 and 2 of the already-expanded
		// 8-bit channels. The swap must read from rgb, not src; reading src
		// would throw away the 6-to-8-bit expansion computed above.
		rgb = _mm_or_si128( _mm_srli_epi32(_mm_and_si128(rgb, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(rgb, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(rgb, _mm_set1_epi32(0x000000FF)), 16)) );
#endif
	}
	
	return _mm_or_si128(rgb, a);
}
// Converts eight 32-bit pixels to eight 16-bit 5551 pixels.
//
// srcLo supplies the first four pixels and srcHi the next four; the returned
// vector holds them in that same order as eight consecutive 16-bit values.
//
// COLORFORMAT selects the source layout:
//   - NDSColorFormat_BGR555_Rev: srcLo is returned unchanged.
//   - NDSColorFormat_BGR666_Rev: the upper 5 bits of each 6-bit channel are kept.
//   - NDSColorFormat_BGR888_Rev: the upper 5 bits of each 8-bit channel are kept.
// SWAP_RB exchanges the channels taken from the lowest and third source bytes.
//
// The output alpha bit (0x8000) is set whenever the source alpha byte is non-zero.
template <NDSColorFormat COLORFORMAT, bool SWAP_RB>
FORCEINLINE __m128i _ConvertColorBaseTo5551(const __m128i srcLo, const __m128i srcHi)
{
	if (COLORFORMAT == NDSColorFormat_BGR555_Rev)
	{
		return srcLo;
	}
	
	__m128i rgbLo;
	__m128i rgbHi;
	__m128i aLo;
	__m128i aHi;
	
	if (COLORFORMAT == NDSColorFormat_BGR666_Rev)
	{
		if (SWAP_RB)
		{
			// Convert color from low bits
			rgbLo =                     _mm_and_si128(_mm_srli_epi32(srcLo, 17), _mm_set1_epi32(0x0000001F));
			rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo,  4), _mm_set1_epi32(0x000003E0)) );
			rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_slli_epi32(srcLo,  9), _mm_set1_epi32(0x00007C00)) );
			
			// Convert color from high bits
			rgbHi =                     _mm_and_si128(_mm_srli_epi32(srcHi, 17), _mm_set1_epi32(0x0000001F));
			rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi,  4), _mm_set1_epi32(0x000003E0)) );
			rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_slli_epi32(srcHi,  9), _mm_set1_epi32(0x00007C00)) );
		}
		else
		{
			// Convert color from low bits
			rgbLo =                     _mm_and_si128(_mm_srli_epi32(srcLo,  1), _mm_set1_epi32(0x0000001F));
			rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo,  4), _mm_set1_epi32(0x000003E0)) );
			rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo,  7), _mm_set1_epi32(0x00007C00)) );
			
			// Convert color from high bits
			rgbHi =                     _mm_and_si128(_mm_srli_epi32(srcHi,  1), _mm_set1_epi32(0x0000001F));
			rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi,  4), _mm_set1_epi32(0x000003E0)) );
			rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi,  7), _mm_set1_epi32(0x00007C00)) );
		}
	}
	else if (COLORFORMAT == NDSColorFormat_BGR888_Rev)
	{
		if (SWAP_RB)
		{
			// Convert color from low bits
			rgbLo =                     _mm_and_si128(_mm_srli_epi32(srcLo, 19), _mm_set1_epi32(0x0000001F));
			rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo,  6), _mm_set1_epi32(0x000003E0)) );
			rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_slli_epi32(srcLo,  7), _mm_set1_epi32(0x00007C00)) );
			
			// Convert color from high bits
			rgbHi =                     _mm_and_si128(_mm_srli_epi32(srcHi, 19), _mm_set1_epi32(0x0000001F));
			rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi,  6), _mm_set1_epi32(0x000003E0)) );
			rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_slli_epi32(srcHi,  7), _mm_set1_epi32(0x00007C00)) );
		}
		else
		{
			// Convert color from low bits
			rgbLo =                     _mm_and_si128(_mm_srli_epi32(srcLo,  3), _mm_set1_epi32(0x0000001F));
			rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo,  6), _mm_set1_epi32(0x000003E0)) );
			rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo,  9), _mm_set1_epi32(0x00007C00)) );
			
			// Convert color from high bits
			rgbHi =                     _mm_and_si128(_mm_srli_epi32(srcHi,  3), _mm_set1_epi32(0x0000001F));
			rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi,  6), _mm_set1_epi32(0x000003E0)) );
			rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi,  9), _mm_set1_epi32(0x00007C00)) );
		}
	}
	
	// Build a per-pixel mask that is all ones where the source alpha byte is zero.
	aLo = _mm_and_si128(srcLo, _mm_set1_epi32(0xFF000000));
	aLo = _mm_cmpeq_epi32(aLo, _mm_setzero_si128());
	
	aHi = _mm_and_si128(srcHi, _mm_set1_epi32(0xFF000000));
	aHi = _mm_cmpeq_epi32(aHi, _mm_setzero_si128());
	
#ifdef ENABLE_SSSE3
	// Each pixel still occupies a 32-bit lane. Put the Hi pixels in the upper
	// 16 bits of each lane, then shuffle so that the four Lo pixels land in the
	// lower 64 bits and the four Hi pixels land in the upper 64 bits.
	aLo = _mm_andnot_si128(aLo, _mm_set1_epi32(0x00008000));
	aHi = _mm_andnot_si128(aHi, _mm_set1_epi32(0x00008000));
	
	return _mm_shuffle_epi8( _mm_or_si128(_mm_or_si128(rgbLo, aLo), _mm_slli_epi32(_mm_or_si128(rgbHi, aHi), 16)), _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0) );
#else
	// SSE2 only has packssdw (signed saturated 32-bit to 16-bit pack). The color
	// bits never exceed 0x7FFF, so they can be packed directly. Packing Lo and Hi
	// together places the four Lo pixels in the lower 64 bits and the four Hi
	// pixels in the upper 64 bits, which matches the SSSE3 path's output order.
	const __m128i rgb = _mm_packs_epi32(rgbLo, rgbHi);
	
	// Packing the alpha bit itself (0x8000) would saturate to 0x7FFF, which is
	// incorrect. Instead, take a bit one position lower (0x4000), pack it, and
	// then shift it back up to where the alpha bit needs to be.
	aLo = _mm_andnot_si128(aLo, _mm_set1_epi32(0x00004000));
	aHi = _mm_andnot_si128(aHi, _mm_set1_epi32(0x00004000));
	const __m128i a = _mm_slli_epi16(_mm_packs_epi32(aLo, aHi), 1);
	
	return _mm_or_si128(rgb, a);
#endif
}
// Vector conversion of eight pixels (srcLo followed by srcHi) from the
// NDSColorFormat_BGR888_Rev layout to 5551, forwarding SWAP_RB to
// _ConvertColorBaseTo5551.
template <bool SWAP_RB>
FORCEINLINE __m128i ConvertColor8888To5551(const __m128i srcLo, const __m128i srcHi)
{
	const __m128i packed5551 = _ConvertColorBaseTo5551<NDSColorFormat_BGR888_Rev, SWAP_RB>(srcLo, srcHi);
	return packed5551;
}
// Vector conversion of eight pixels (srcLo followed by srcHi) from the
// NDSColorFormat_BGR666_Rev layout to 5551, forwarding SWAP_RB to
// _ConvertColorBaseTo5551.
template <bool SWAP_RB>
FORCEINLINE __m128i ConvertColor6665To5551(const __m128i srcLo, const __m128i srcHi)
{
	const __m128i packed5551 = _ConvertColorBaseTo5551<NDSColorFormat_BGR666_Rev, SWAP_RB>(srcLo, srcHi);
	return packed5551;
}
#endif
// Whole-buffer color conversion routines (declarations only; the definitions
// are not in this part of the file). Each takes a source buffer, a destination
// buffer and a pixel count, and is templated on SWAP_RB.
// NOTE(review): presumably SWAP_RB has the same meaning as in the per-pixel
// ConvertColor* helpers, and pixCount is the number of pixels to convert --
// confirm against the definitions.

// 32-bit to 32-bit conversions. src and dst are not marked __restrict.
template<bool SWAP_RB> void ConvertColorBuffers8888To6665(const FragmentColor *src, FragmentColor *dst, size_t pixCount);
template<bool SWAP_RB> void ConvertColorBuffers6665To8888(const FragmentColor *src, FragmentColor *dst, size_t pixCount);
// 32-bit to 16-bit conversions. src and dst are __restrict, so they must not alias.
template<bool SWAP_RB> void ConvertColorBuffers8888To5551(const FragmentColor *__restrict src, u16 *__restrict dst, size_t pixCount);
template<bool SWAP_RB> void ConvertColorBuffers6665To5551(const FragmentColor *__restrict src, u16 *__restrict dst, size_t pixCount);
#endif #endif

View File

@ -34,10 +34,6 @@
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
#ifdef ENABLE_SSSE3
#include <tmmintrin.h>
#endif
typedef struct typedef struct
{ {
unsigned int major; unsigned int major;
@ -49,7 +45,7 @@ static OGLVersion _OGLDriverVersion = {0, 0, 0};
// Lookup Tables // Lookup Tables
static CACHE_ALIGN GLfloat material_8bit_to_float[256] = {0}; static CACHE_ALIGN GLfloat material_8bit_to_float[256] = {0};
CACHE_ALIGN const GLfloat divide5bitBy31_LUT[32] = {0.0, 0.03225806451613, 0.06451612903226, 0.09677419354839, CACHE_ALIGN const GLfloat divide5bitBy31_LUT[32] = {0.0, 0.0322580645161, 0.0645161290323, 0.0967741935484,
0.1290322580645, 0.1612903225806, 0.1935483870968, 0.2258064516129, 0.1290322580645, 0.1612903225806, 0.1935483870968, 0.2258064516129,
0.2580645161290, 0.2903225806452, 0.3225806451613, 0.3548387096774, 0.2580645161290, 0.2903225806452, 0.3225806451613, 0.3548387096774,
0.3870967741935, 0.4193548387097, 0.4516129032258, 0.4838709677419, 0.3870967741935, 0.4193548387097, 0.4516129032258, 0.4838709677419,
@ -58,6 +54,24 @@ CACHE_ALIGN const GLfloat divide5bitBy31_LUT[32] = {0.0, 0.03225806451613, 0.064
0.7741935483871, 0.8064516129032, 0.8387096774194, 0.8709677419355, 0.7741935483871, 0.8064516129032, 0.8387096774194, 0.8709677419355,
0.9032258064516, 0.9354838709677, 0.9677419354839, 1.0}; 0.9032258064516, 0.9354838709677, 0.9677419354839, 1.0};
// Lookup table mapping a 6-bit value (0..63) to a float in [0.0, 1.0].
// Entry i holds i/63 rounded to 13 decimal places, so index 0 is 0.0 and
// index 63 is exactly 1.0.
CACHE_ALIGN const GLfloat divide6bitBy63_LUT[64] = {0.0, 0.0158730158730, 0.0317460317460, 0.0476190476191,
0.0634920634921, 0.0793650793651, 0.0952380952381, 0.1111111111111,
0.1269841269841, 0.1428571428571, 0.1587301587302, 0.1746031746032,
0.1904761904762, 0.2063492063492, 0.2222222222222, 0.2380952380952,
0.2539682539683, 0.2698412698413, 0.2857142857143, 0.3015873015873,
0.3174603174603, 0.3333333333333, 0.3492063492064, 0.3650793650794,
0.3809523809524, 0.3968253968254, 0.4126984126984, 0.4285714285714,
0.4444444444444, 0.4603174603175, 0.4761904761905, 0.4920634920635,
0.5079365079365, 0.5238095238095, 0.5396825396825, 0.5555555555556,
0.5714285714286, 0.5873015873016, 0.6031746031746, 0.6190476190476,
0.6349206349206, 0.6507936507937, 0.6666666666667, 0.6825396825397,
0.6984126984127, 0.7142857142857, 0.7301587301587, 0.7460317460318,
0.7619047619048, 0.7777777777778, 0.7936507936508, 0.8095238095238,
0.8253968253968, 0.8412698412698, 0.8571428571429, 0.8730158730159,
0.8888888888889, 0.9047619047619, 0.9206349206349, 0.9365079365079,
0.9523809523810, 0.9682539682540, 0.9841269841270, 1.0};
const GLfloat PostprocessVtxBuffer[16] = {-1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, const GLfloat PostprocessVtxBuffer[16] = {-1.0f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f,
0.0f, 0.0f, 1.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f}; 0.0f, 0.0f, 1.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f};
const GLubyte PostprocessElementBuffer[6] = {0, 1, 2, 2, 3, 0}; const GLubyte PostprocessElementBuffer[6] = {0, 1, 2, 2, 3, 0};
@ -355,7 +369,7 @@ static const char *fragmentShader_100 = {"\
gl_FragData[0] = newFragColor;\n\ gl_FragData[0] = newFragColor;\n\
gl_FragData[1] = vec4( packVec3FromFloat(newFragDepth), float(polyEnableDepthWrite && (newFragColor.a > 0.999 || polySetNewDepthForTranslucent)));\n\ gl_FragData[1] = vec4( packVec3FromFloat(newFragDepth), float(polyEnableDepthWrite && (newFragColor.a > 0.999 || polySetNewDepthForTranslucent)));\n\
gl_FragData[2] = vec4(float(polyID)/63.0, 0.0, 0.0, float(newFragColor.a > 0.999));\n\ gl_FragData[2] = vec4(float(polyID)/63.0, 0.0, 0.0, float(newFragColor.a > 0.999));\n\
gl_FragData[3] = vec4( float(polyEnableFog), 0.0, 0.0, float(newFragColor.a > 0.999 || !polyEnableFog));\n\ gl_FragData[3] = vec4(float(polyEnableFog), 0.0, 0.0, float((newFragColor.a > 0.999) ? 1.0 : 0.5));\n\
gl_FragDepth = newFragDepth;\n\ gl_FragDepth = newFragDepth;\n\
} \n\ } \n\
"}; "};
@ -462,7 +476,7 @@ static const char *FogFragShader_100 = {"\
{\n\ {\n\
vec4 inFragColor = texture2D(texInFragColor, texCoord);\n\ vec4 inFragColor = texture2D(texInFragColor, texCoord);\n\
vec4 inFogAttributes = texture2D(texInFogAttributes, texCoord);\n\ vec4 inFogAttributes = texture2D(texInFogAttributes, texCoord);\n\
bool polyEnableFog = bool(inFogAttributes.r);\n\ bool polyEnableFog = (inFogAttributes.r > 0.999);\n\
vec4 newFoggedColor = inFragColor;\n\ vec4 newFoggedColor = inFragColor;\n\
\n\ \n\
if (polyEnableFog)\n\ if (polyEnableFog)\n\
@ -543,98 +557,6 @@ static const char *FramebufferOutputRGBA8888FragShader_100 = {"\
}\n\ }\n\
"}; "};
// Converts one packed 32-bit 8888 pixel to 6665 while exchanging the channels
// held in the second-lowest and highest bytes. Each color channel keeps its
// upper 6 bits; the alpha channel (lowest byte) keeps its upper 5 bits.
FORCEINLINE u32 BGRA8888_32_To_RGBA6665_32(const u32 srcPix)
{
	// Dropping two bits up front lets every color channel be isolated with a 0x3F mask.
	const u32 shifted = (srcPix >> 2);
	
	const u32 r = (shifted & 0x00003F00) << 16;
	const u32 g = (shifted & 0x003F0000);
	const u32 b = (shifted & 0x3F000000) >> 16;
	const u32 a = (shifted >> 1) & 0x0000001F; // One extra shift: alpha is 5 bits wide
	
	return r | g | b | a;
}
// Converts one packed 32-bit 8888 pixel to 6665 while exchanging the channels
// held in the lowest and third bytes. Each color channel keeps its upper 6
// bits; the alpha channel (highest byte) keeps its upper 5 bits.
FORCEINLINE u32 BGRA8888_32Rev_To_RGBA6665_32Rev(const u32 srcPix)
{
	// Dropping two bits up front lets every color channel be isolated with a 0x3F mask.
	const u32 shifted = (srcPix >> 2);
	
	const u32 r = (shifted & 0x003F0000) >> 16;
	const u32 g = (shifted & 0x00003F00);
	const u32 b = (shifted & 0x0000003F) << 16;
	const u32 a = (shifted >> 1) & 0x1F000000; // One extra shift: alpha is 5 bits wide
	
	return r | g | b | a;
}
// FragmentColor overload: narrows r/g/b to 6 bits and a to 5 bits, exchanging
// the r and b members in the process.
FORCEINLINE FragmentColor BGRA8888_32_To_RGBA6665_32(const FragmentColor src)
{
	// Start from a copy of the source; every channel member is then overwritten below.
	FragmentColor converted = src;
	
	converted.a = src.a >> 3;
	converted.g = src.g >> 2;
	converted.b = src.r >> 2; // b receives the source's r
	converted.r = src.b >> 2; // r receives the source's b
	
	return converted;
}
// FragmentColor overload: narrows r/g/b to 6 bits and a to 5 bits, exchanging
// the r and b members in the process.
FORCEINLINE FragmentColor BGRA8888_32Rev_To_RGBA6665_32Rev(const FragmentColor src)
{
	// Start from a copy of the source; every channel member is then overwritten below.
	FragmentColor converted = src;
	
	converted.a = src.a >> 3;
	converted.g = src.g >> 2;
	converted.b = src.r >> 2; // b receives the source's r
	converted.r = src.b >> 2; // r receives the source's b
	
	return converted;
}
// Reduces a FragmentColor to a 16-bit 5551 value. The upper 5 bits of b, g
// and r are handed to R5G5B5TORGB15 in that order, and bit 15 is set whenever
// the alpha member is non-zero.
FORCEINLINE u16 BGRA8888_32_To_RGBA5551_16(const FragmentColor src)
{
	const int alphaBit = (src.a == 0) ? 0x0000 : 0x8000;
	
	return R5G5B5TORGB15( (src.b >> 3), (src.g >> 3), (src.r >> 3) ) | alphaBit;
}
// Reduces a FragmentColor to a 16-bit 5551 value. The upper 5 bits of b, g
// and r are handed to R5G5B5TORGB15 in that order, and bit 15 is set whenever
// the alpha member is non-zero.
FORCEINLINE u16 BGRA8888_32Rev_To_RGBA5551_16Rev(const FragmentColor src)
{
	const int alphaBit = (src.a == 0) ? 0x0000 : 0x8000;
	
	return R5G5B5TORGB15( (src.b >> 3), (src.g >> 3), (src.r >> 3) ) | alphaBit;
}
#ifdef ENABLE_SSSE3
// SSSE3 version operating on four packed 32-bit pixels at once: narrows the
// three low bytes of each pixel to 6 bits and the high (alpha) byte to 5 bits,
// then exchanges the lowest and third bytes of every pixel.
FORCEINLINE __m128i BGRA8888_32Rev_To_RGBA6665_32Rev(const __m128i src)
{
	// Byte order per pixel after the shuffle: (2, 1, 0, 3) -- alpha stays in place.
	const __m128i swapRBMask = _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2);
	
	const __m128i color6 = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00FCFCFC)), 2);
	const __m128i alpha5 = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0xF8000000)), 3);
	const __m128i merged = _mm_or_si128(color6, alpha5);
	
	return _mm_shuffle_epi8(merged, swapRBMask);
}
// SSSE3 version operating on four packed 32-bit pixels at once: produces four
// 16-bit 5551 pixels in the lower 64 bits of the result. The lowest and third
// source bytes are exchanged, and bit 15 of each output pixel is set whenever
// the source alpha byte is non-zero.
FORCEINLINE __m128i BGRA8888_32Rev_To_RGBA5551_16Rev(const __m128i src)
{
	// Upper 5 bits of the lowest byte move up to bits 10-14.
	const __m128i fromByte0 = _mm_slli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x000000F8)), 7);
	// Upper 5 bits of the second byte move down to bits 5-9.
	const __m128i fromByte1 = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x0000F800)), 6);
	// Upper 5 bits of the third byte move down to bits 0-4.
	const __m128i fromByte2 = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00F80000)), 19);
	
	// All-ones in every lane whose alpha byte is zero; inverted when applying the 0x8000 bit.
	const __m128i alphaIsZero = _mm_cmpeq_epi32(_mm_and_si128(src, _mm_set1_epi32(0xFF000000)), _mm_setzero_si128());
	const __m128i alphaBit = _mm_andnot_si128(alphaIsZero, _mm_set1_epi32(0x00008000));
	
	const __m128i color5551 = _mm_or_si128(_mm_or_si128(_mm_or_si128(fromByte0, fromByte1), fromByte2), alphaBit);
	
	// Each result still sits in its own 32-bit lane, so gather the low 16 bits
	// of every lane into the lower 64 bits of the vector before returning.
	// packssdw is deliberately not used: packing a value with the 0x8000 bit
	// set would saturate it to 0x7FFF, which is incorrect here.
	return _mm_shuffle_epi8(color5551, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
}
#endif
bool IsVersionSupported(unsigned int checkVersionMajor, unsigned int checkVersionMinor, unsigned int checkVersionRevision) bool IsVersionSupported(unsigned int checkVersionMajor, unsigned int checkVersionMinor, unsigned int checkVersionRevision)
{ {
bool result = false; bool result = false;
@ -1052,99 +974,62 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
// to the DS Y-coordinate. // to the DS Y-coordinate.
size_t i = 0; size_t i = 0;
const size_t pixCount = this->_framebufferWidth;
#ifdef ENABLE_SSSE3
const size_t ssePixCount = pixCount - (pixCount % 4);
#endif
if (this->willFlipFramebufferOnGPU) if (this->willFlipFramebufferOnGPU)
{ {
const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
if (this->_outputFormat == NDSColorFormat_BGR666_Rev) if (this->_outputFormat == NDSColorFormat_BGR666_Rev)
{ {
if ( (dstFramebuffer != NULL) && (dstRGBA5551 != NULL) ) if ( (dstFramebuffer != NULL) && (dstRGBA5551 != NULL) )
{ {
#ifdef ENABLE_SSSE3 #ifdef ENABLE_SSE2
for (; i < ssePixCount; i += 4) const size_t ssePixCount = pixCount - (pixCount % 8);
for (; i < ssePixCount; i += 8)
{ {
const __m128i srcColor = _mm_load_si128((__m128i *)(srcFramebuffer + i)); const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + i + 0));
const __m128i color6665 = BGRA8888_32Rev_To_RGBA6665_32Rev(srcColor); const __m128i srcColorHi = _mm_load_si128((__m128i *)(srcFramebuffer + i + 4));
const __m128i color5551 = BGRA8888_32Rev_To_RGBA5551_16Rev(srcColor);
_mm_store_si128((__m128i *)(dstFramebuffer + i), color6665); _mm_store_si128( (__m128i *)(dstFramebuffer + i + 0), ConvertColor8888To6665<true>(srcColorLo) );
_mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color5551); _mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), ConvertColor8888To6665<true>(srcColorHi) );
_mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
} }
#endif #endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
#ifdef LOCAL_BE dstFramebuffer[i] = ConvertColor8888To6665<true>(srcFramebuffer[i]);
dstFramebuffer[i] = BGRA8888_32_To_RGBA6665_32(srcFramebuffer[i]); dstRGBA5551[i] = ConvertColor8888To5551<true>(srcFramebuffer[i]);
dstRGBA5551[i] = BGRA8888_32_To_RGBA5551_16(srcFramebuffer[i]);
#else
dstFramebuffer[i] = BGRA8888_32Rev_To_RGBA6665_32Rev(srcFramebuffer[i]);
dstRGBA5551[i] = BGRA8888_32Rev_To_RGBA5551_16Rev(srcFramebuffer[i]);
#endif
} }
} }
else if (dstFramebuffer != NULL) else if (dstFramebuffer != NULL)
{ {
#ifdef ENABLE_SSSE3 ConvertColorBuffers8888To6665<true>(srcFramebuffer, dstFramebuffer, pixCount);
for (; i < ssePixCount; i += 4)
{
const __m128i srcColor = _mm_load_si128((__m128i *)(srcFramebuffer + i));
const __m128i color6665 = BGRA8888_32Rev_To_RGBA6665_32Rev(srcColor);
_mm_store_si128((__m128i *)(dstFramebuffer + i), color6665);
}
#endif
for (; i < pixCount; i++)
{
#ifdef LOCAL_BE
dstFramebuffer[i] = BGRA8888_32_To_RGBA6665_32(srcFramebuffer[i]);
#else
dstFramebuffer[i] = BGRA8888_32Rev_To_RGBA6665_32Rev(srcFramebuffer[i]);
#endif
}
} }
else else
{ {
#ifdef ENABLE_SSSE3 ConvertColorBuffers8888To5551<true>(srcFramebuffer, dstRGBA5551, pixCount);
for (; i < ssePixCount; i += 4)
{
const __m128i srcColor = _mm_load_si128((__m128i *)(srcFramebuffer + i));
const __m128i color5551 = BGRA8888_32Rev_To_RGBA5551_16Rev(srcColor);
_mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color5551);
}
#endif
for (; i < pixCount; i++)
{
#ifdef LOCAL_BE
dstRGBA5551[i] = BGRA8888_32_To_RGBA5551_16(srcFramebuffer[i]);
#else
dstRGBA5551[i] = BGRA8888_32Rev_To_RGBA5551_16Rev(srcFramebuffer[i]);
#endif
}
} }
} }
else if (this->_outputFormat == NDSColorFormat_BGR888_Rev) else if (this->_outputFormat == NDSColorFormat_BGR888_Rev)
{ {
if ( (dstFramebuffer != NULL) && (dstRGBA5551 != NULL) ) if ( (dstFramebuffer != NULL) && (dstRGBA5551 != NULL) )
{ {
#ifdef ENABLE_SSSE3 #ifdef ENABLE_SSE2
for (; i < ssePixCount; i += 4) const size_t ssePixCount = pixCount - (pixCount % 8);
for (; i < ssePixCount; i += 8)
{ {
const __m128i srcColor = _mm_load_si128((__m128i *)(srcFramebuffer + i)); const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + i + 0));
const __m128i color5551 = BGRA8888_32Rev_To_RGBA5551_16Rev(srcColor); const __m128i srcColorHi = _mm_load_si128((__m128i *)(srcFramebuffer + i + 4));
_mm_store_si128((__m128i *)(dstFramebuffer + i), srcColor);
_mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color5551); _mm_store_si128( (__m128i *)(dstFramebuffer + i + 0), srcColorLo );
_mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), srcColorHi );
_mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
} }
#endif #endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
dstFramebuffer[i] = srcFramebuffer[i]; dstFramebuffer[i] = ConvertColor8888To6665<true>(srcFramebuffer[i]);
#ifdef LOCAL_BE dstRGBA5551[i] = ConvertColor8888To5551<true>(srcFramebuffer[i]);
dstRGBA5551[i] = BGRA8888_32_To_RGBA5551_16(srcFramebuffer[i]);
#else
dstRGBA5551[i] = BGRA8888_32Rev_To_RGBA5551_16Rev(srcFramebuffer[i]);
#endif
} }
} }
else if (dstFramebuffer != NULL) else if (dstFramebuffer != NULL)
@ -1153,27 +1038,14 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
} }
else else
{ {
#ifdef ENABLE_SSSE3 ConvertColorBuffers8888To5551<true>(srcFramebuffer, dstRGBA5551, pixCount);
for (; i < ssePixCount; i += 4)
{
const __m128i srcColor = _mm_load_si128((__m128i *)(srcFramebuffer + i));
const __m128i color5551 = BGRA8888_32Rev_To_RGBA5551_16Rev(srcColor);
_mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color5551);
}
#endif
for (; i < pixCount; i++)
{
#ifdef LOCAL_BE
dstRGBA5551[i] = BGRA8888_32_To_RGBA5551_16(srcFramebuffer[i]);
#else
dstRGBA5551[i] = BGRA8888_32Rev_To_RGBA5551_16Rev(srcFramebuffer[i]);
#endif
}
} }
} }
} }
else // In the case where OpenGL couldn't flip the framebuffer on the GPU, we'll instead need to flip the framebuffer during conversion. else // In the case where OpenGL couldn't flip the framebuffer on the GPU, we'll instead need to flip the framebuffer during conversion.
{ {
const size_t pixCount = this->_framebufferWidth;
if (this->_outputFormat == NDSColorFormat_BGR666_Rev) if (this->_outputFormat == NDSColorFormat_BGR666_Rev)
{ {
if ( (dstFramebuffer != NULL) && (dstRGBA5551 != NULL) ) if ( (dstFramebuffer != NULL) && (dstRGBA5551 != NULL) )
@ -1181,25 +1053,22 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
{ {
size_t x = 0; size_t x = 0;
#ifdef ENABLE_SSSE3 #ifdef ENABLE_SSE2
for (; x < ssePixCount; x += 4, ir += 4, iw += 4) const size_t ssePixCount = pixCount - (pixCount % 8);
for (; x < ssePixCount; x += 8, ir += 8, iw += 8)
{ {
const __m128i srcColor = _mm_load_si128((__m128i *)(srcFramebuffer + ir)); const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + ir + 0));
const __m128i color6665 = BGRA8888_32Rev_To_RGBA6665_32Rev(srcColor); const __m128i srcColorHi = _mm_load_si128((__m128i *)(srcFramebuffer + ir + 4));
const __m128i color5551 = BGRA8888_32Rev_To_RGBA5551_16Rev(srcColor);
_mm_store_si128((__m128i *)(dstFramebuffer + iw), color6665); _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 0), ConvertColor8888To6665<true>(srcColorLo) );
_mm_storel_epi64((__m128i *)(dstFramebuffer + iw), color5551); _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), ConvertColor8888To6665<true>(srcColorHi) );
_mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
} }
#endif #endif
for (; x < pixCount; x++, ir++, iw++) for (; x < pixCount; x++, ir++, iw++)
{ {
#ifdef LOCAL_BE dstFramebuffer[iw] = ConvertColor8888To6665<true>(srcFramebuffer[ir]);
dstFramebuffer[iw] = BGRA8888_32_To_RGBA6665_32(srcFramebuffer[ir]); dstRGBA5551[iw] = ConvertColor8888To5551<true>(srcFramebuffer[ir]);
dstRGBA5551[iw] = BGRA8888_32_To_RGBA5551_16(srcFramebuffer[ir]);
#else
dstFramebuffer[iw] = BGRA8888_32Rev_To_RGBA6665_32Rev(srcFramebuffer[ir]);
dstRGBA5551[iw] = BGRA8888_32Rev_To_RGBA5551_16Rev(srcFramebuffer[ir]);
#endif
} }
} }
} }
@ -1207,46 +1076,14 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
{ {
for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
{ {
size_t x = 0; ConvertColorBuffers8888To6665<true>(srcFramebuffer + ir, dstFramebuffer + iw, pixCount);
#ifdef ENABLE_SSSE3
for (; x < ssePixCount; x += 4, ir += 4, iw += 4)
{
const __m128i srcColor = _mm_load_si128((__m128i *)(srcFramebuffer + ir));
const __m128i color6665 = BGRA8888_32Rev_To_RGBA6665_32Rev(srcColor);
_mm_store_si128((__m128i *)(dstFramebuffer + iw), color6665);
}
#endif
for (; x < pixCount; x++, ir++, iw++)
{
#ifdef LOCAL_BE
dstFramebuffer[iw] = BGRA8888_32_To_RGBA6665_32(srcFramebuffer[ir]);
#else
dstFramebuffer[iw] = BGRA8888_32Rev_To_RGBA6665_32Rev(srcFramebuffer[ir]);
#endif
}
} }
} }
else else
{ {
for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
{ {
size_t x = 0; ConvertColorBuffers8888To5551<true>(srcFramebuffer + ir, dstRGBA5551 + iw, pixCount);
#ifdef ENABLE_SSSE3
for (; x < ssePixCount; x += 4, ir += 4, iw += 4)
{
const __m128i srcColor = _mm_load_si128((__m128i *)(srcFramebuffer + ir));
const __m128i color5551 = BGRA8888_32Rev_To_RGBA5551_16Rev(srcColor);
_mm_storel_epi64((__m128i *)(dstFramebuffer + iw), color5551);
}
#endif
for (; x < pixCount; x++, ir++, iw++)
{
#ifdef LOCAL_BE
dstRGBA5551[iw] = BGRA8888_32_To_RGBA5551_16(srcFramebuffer[ir]);
#else
dstRGBA5551[iw] = BGRA8888_32Rev_To_RGBA5551_16Rev(srcFramebuffer[ir]);
#endif
}
} }
} }
} }
@ -1257,23 +1094,22 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
{ {
size_t x = 0; size_t x = 0;
#ifdef ENABLE_SSSE3 #ifdef ENABLE_SSE2
for (; x < ssePixCount; x += 4, ir += 4, iw += 4) const size_t ssePixCount = pixCount - (pixCount % 8);
for (; x < ssePixCount; x += 8, ir += 8, iw += 8)
{ {
const __m128i srcColor = _mm_load_si128((__m128i *)(srcFramebuffer + ir)); const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + ir + 0));
const __m128i color5551 = BGRA8888_32Rev_To_RGBA5551_16Rev(srcColor); const __m128i srcColorHi = _mm_load_si128((__m128i *)(srcFramebuffer + ir + 4));
_mm_store_si128((__m128i *)(dstFramebuffer + iw), srcColor);
_mm_storel_epi64((__m128i *)(dstFramebuffer + iw), color5551); _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 0), srcColorLo );
_mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), srcColorHi );
_mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
} }
#endif #endif
for (; x < pixCount; x++, ir++, iw++) for (; x < pixCount; x++, ir++, iw++)
{ {
dstFramebuffer[iw] = srcFramebuffer[ir]; dstFramebuffer[iw] = srcFramebuffer[ir];
#ifdef LOCAL_BE dstRGBA5551[iw] = ConvertColor8888To5551<true>(srcFramebuffer[ir]);
dstRGBA5551[iw] = BGRA8888_32_To_RGBA5551_16(srcFramebuffer[ir]);
#else
dstRGBA5551[iw] = BGRA8888_32Rev_To_RGBA5551_16Rev(srcFramebuffer[ir]);
#endif
} }
} }
} }
@ -1294,23 +1130,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
{ {
for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
{ {
size_t x = 0; ConvertColorBuffers8888To5551<true>(srcFramebuffer + ir, dstRGBA5551 + iw, pixCount);
#ifdef ENABLE_SSSE3
for (; x < ssePixCount; x += 4, ir += 4, iw += 4)
{
const __m128i srcColor = _mm_load_si128((__m128i *)(srcFramebuffer + ir));
const __m128i color5551 = BGRA8888_32Rev_To_RGBA5551_16Rev(srcColor);
_mm_storel_epi64((__m128i *)(dstFramebuffer + iw), color5551);
}
#endif
for (; x < pixCount; x++, ir++, iw++)
{
#ifdef LOCAL_BE
dstRGBA5551[iw] = BGRA8888_32_To_RGBA5551_16(srcFramebuffer[ir]);
#else
dstRGBA5551[iw] = BGRA8888_32Rev_To_RGBA5551_16Rev(srcFramebuffer[ir]);
#endif
}
} }
} }
} }
@ -1323,11 +1143,7 @@ Render3DError OpenGLRenderer::FlushFramebuffer(const FragmentColor *__restrict s
{ {
if (this->willConvertFramebufferOnGPU) if (this->willConvertFramebufferOnGPU)
{ {
#ifdef ENABLE_SSE2
return Render3D_SSE2::FlushFramebuffer(srcFramebuffer, NULL, dstRGBA5551);
#else
return Render3D::FlushFramebuffer(srcFramebuffer, NULL, dstRGBA5551); return Render3D::FlushFramebuffer(srcFramebuffer, NULL, dstRGBA5551);
#endif
} }
else else
{ {
@ -2963,7 +2779,7 @@ Render3DError OpenGLRenderer_1_2::ClearUsingImage(const u16 *__restrict colorBuf
return OGLERROR_NOERR; return OGLERROR_NOERR;
} }
Render3DError OpenGLRenderer_1_2::ClearUsingValues(const FragmentColor &clearColor, const FragmentAttributes &clearAttributes) const Render3DError OpenGLRenderer_1_2::ClearUsingValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) const
{ {
OGLRenderRef &OGLRef = *this->ref; OGLRenderRef &OGLRef = *this->ref;
@ -2978,7 +2794,7 @@ Render3DError OpenGLRenderer_1_2::ClearUsingValues(const FragmentColor &clearCol
if (this->isShaderSupported && this->isFBOSupported) if (this->isShaderSupported && this->isFBOSupported)
{ {
glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT); // texGColorID glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT); // texGColorID
glClearColor(divide5bitBy31_LUT[clearColor.r], divide5bitBy31_LUT[clearColor.g], divide5bitBy31_LUT[clearColor.b], divide5bitBy31_LUT[clearColor.a]); glClearColor(divide6bitBy63_LUT[clearColor6665.r], divide6bitBy63_LUT[clearColor6665.g], divide6bitBy63_LUT[clearColor6665.b], divide5bitBy31_LUT[clearColor6665.a]);
glClearDepth((GLclampd)clearAttributes.depth / (GLclampd)0x00FFFFFF); glClearDepth((GLclampd)clearAttributes.depth / (GLclampd)0x00FFFFFF);
glClearStencil(0xFF); glClearStencil(0xFF);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT);
@ -2999,7 +2815,7 @@ Render3DError OpenGLRenderer_1_2::ClearUsingValues(const FragmentColor &clearCol
} }
else else
{ {
glClearColor(divide5bitBy31_LUT[clearColor.r], divide5bitBy31_LUT[clearColor.g], divide5bitBy31_LUT[clearColor.b], divide5bitBy31_LUT[clearColor.a]); glClearColor(divide6bitBy63_LUT[clearColor6665.r], divide6bitBy63_LUT[clearColor6665.g], divide6bitBy63_LUT[clearColor6665.b], divide5bitBy31_LUT[clearColor6665.a]);
glClearDepth((GLclampd)clearAttributes.depth / (GLclampd)0x00FFFFFF); glClearDepth((GLclampd)clearAttributes.depth / (GLclampd)0x00FFFFFF);
glClearStencil(clearAttributes.opaquePolyID); glClearStencil(clearAttributes.opaquePolyID);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT);

View File

@ -522,6 +522,7 @@ extern GPU3DInterface gpu3Dgl_3_2;
extern const GLenum RenderDrawList[4]; extern const GLenum RenderDrawList[4];
extern CACHE_ALIGN const GLfloat divide5bitBy31_LUT[32]; extern CACHE_ALIGN const GLfloat divide5bitBy31_LUT[32];
extern CACHE_ALIGN const GLfloat divide6bitBy63_LUT[64];
extern const GLfloat PostprocessVtxBuffer[16]; extern const GLfloat PostprocessVtxBuffer[16];
extern const GLubyte PostprocessElementBuffer[6]; extern const GLubyte PostprocessElementBuffer[6];
@ -560,9 +561,7 @@ FORCEINLINE u32 BGRA8888_32_To_RGBA6665_32(const u32 srcPix);
FORCEINLINE u32 BGRA8888_32Rev_To_RGBA6665_32Rev(const u32 srcPix); FORCEINLINE u32 BGRA8888_32Rev_To_RGBA6665_32Rev(const u32 srcPix);
bool IsVersionSupported(unsigned int checkVersionMajor, unsigned int checkVersionMinor, unsigned int checkVersionRevision); bool IsVersionSupported(unsigned int checkVersionMajor, unsigned int checkVersionMinor, unsigned int checkVersionRevision);
#if defined(ENABLE_SSSE3) #if defined(ENABLE_SSE2)
class OpenGLRenderer : public Render3D_SSSE3
#elif defined(ENABLE_SSE2)
class OpenGLRenderer : public Render3D_SSE2 class OpenGLRenderer : public Render3D_SSE2
#else #else
class OpenGLRenderer : public Render3D class OpenGLRenderer : public Render3D
@ -719,7 +718,7 @@ protected:
virtual Render3DError EndRender(const u64 frameCount); virtual Render3DError EndRender(const u64 frameCount);
virtual Render3DError ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer); virtual Render3DError ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer);
virtual Render3DError ClearUsingValues(const FragmentColor &clearColor, const FragmentAttributes &clearAttributes) const; virtual Render3DError ClearUsingValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) const;
virtual void SetPolygonIndex(const size_t index); virtual void SetPolygonIndex(const size_t index);
virtual Render3DError SetupPolygon(const POLY &thePoly); virtual Render3DError SetupPolygon(const POLY &thePoly);

View File

@ -251,7 +251,7 @@ static const char *GeometryFragShader_150 = {"\
outFragColor = newFragColor;\n\ outFragColor = newFragColor;\n\
outFragDepth = vec4( packVec3FromFloat(newFragDepth), float(bool(polyEnableDepthWrite) && (newFragColor.a > 0.999 || bool(polySetNewDepthForTranslucent))));\n\ outFragDepth = vec4( packVec3FromFloat(newFragDepth), float(bool(polyEnableDepthWrite) && (newFragColor.a > 0.999 || bool(polySetNewDepthForTranslucent))));\n\
outPolyID = vec4(float(polyID)/63.0, 0.0, 0.0, float(newFragColor.a > 0.999));\n\ outPolyID = vec4(float(polyID)/63.0, 0.0, 0.0, float(newFragColor.a > 0.999));\n\
outFogAttributes = vec4( float(polyEnableFog), 0.0, 0.0, float(newFragColor.a > 0.999 || !bool(polyEnableFog)));\n\ outFogAttributes = vec4(float(polyEnableFog), 0.0, 0.0, float((newFragColor.a > 0.999) ? 1.0 : 0.5));\n\
gl_FragDepth = newFragDepth;\n\ gl_FragDepth = newFragDepth;\n\
} \n\ } \n\
"}; "};
@ -420,7 +420,7 @@ static const char *FogFragShader_150 = {"\
{\n\ {\n\
vec4 inFragColor = texture(texInFragColor, texCoord);\n\ vec4 inFragColor = texture(texInFragColor, texCoord);\n\
vec4 inFogAttributes = texture(texInFogAttributes, texCoord);\n\ vec4 inFogAttributes = texture(texInFogAttributes, texCoord);\n\
bool polyEnableFog = bool(inFogAttributes.r);\n\ bool polyEnableFog = (inFogAttributes.r > 0.999);\n\
vec4 newFoggedColor = inFragColor;\n\ vec4 newFoggedColor = inFragColor;\n\
\n\ \n\
if (polyEnableFog)\n\ if (polyEnableFog)\n\
@ -1585,7 +1585,7 @@ Render3DError OpenGLRenderer_3_2::ClearUsingImage(const u16 *__restrict colorBuf
return OGLERROR_NOERR; return OGLERROR_NOERR;
} }
Render3DError OpenGLRenderer_3_2::ClearUsingValues(const FragmentColor &clearColor, const FragmentAttributes &clearAttributes) const Render3DError OpenGLRenderer_3_2::ClearUsingValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) const
{ {
OGLRenderRef &OGLRef = *this->ref; OGLRenderRef &OGLRef = *this->ref;
OGLRef.selectedRenderingFBO = (CommonSettings.GFX3D_Renderer_Multisample) ? OGLRef.fboMSIntermediateRenderID : OGLRef.fboRenderID; OGLRef.selectedRenderingFBO = (CommonSettings.GFX3D_Renderer_Multisample) ? OGLRef.fboMSIntermediateRenderID : OGLRef.fboRenderID;
@ -1593,10 +1593,10 @@ Render3DError OpenGLRenderer_3_2::ClearUsingValues(const FragmentColor &clearCol
glDrawBuffers(4, RenderDrawList); glDrawBuffers(4, RenderDrawList);
glDepthMask(GL_TRUE); glDepthMask(GL_TRUE);
const GLfloat oglColor[4] = {divide5bitBy31_LUT[clearColor.r], divide5bitBy31_LUT[clearColor.g], divide5bitBy31_LUT[clearColor.b], divide5bitBy31_LUT[clearColor.a]}; const GLfloat oglColor[4] = {divide6bitBy63_LUT[clearColor6665.r], divide6bitBy63_LUT[clearColor6665.g], divide6bitBy63_LUT[clearColor6665.b], divide5bitBy31_LUT[clearColor6665.a]};
const GLfloat oglDepth[4] = {(GLfloat)(clearAttributes.depth & 0x000000FF)/255.0f, (GLfloat)((clearAttributes.depth >> 8) & 0x000000FF)/255.0f, (GLfloat)((clearAttributes.depth >> 16) & 0x000000FF)/255.0f, 1.0}; const GLfloat oglDepth[4] = {(GLfloat)(clearAttributes.depth & 0x000000FF)/255.0f, (GLfloat)((clearAttributes.depth >> 8) & 0x000000FF)/255.0f, (GLfloat)((clearAttributes.depth >> 16) & 0x000000FF)/255.0f, 1.0};
const GLfloat oglPolyID[4] = {(GLfloat)clearAttributes.opaquePolyID/63.0f, 0.0, 0.0, 1.0}; const GLfloat oglPolyID[4] = {(GLfloat)clearAttributes.opaquePolyID/63.0f, 0.0f, 0.0f, 1.0f};
const GLfloat oglFogAttr[4] = {(GLfloat)clearAttributes.isFogged, 0.0, 0.0, 1.0}; const GLfloat oglFogAttr[4] = {(GLfloat)clearAttributes.isFogged, 0.0f, 0.0f, 1.0f};
glClearBufferfi(GL_DEPTH_STENCIL, 0, (GLfloat)clearAttributes.depth / (GLfloat)0x00FFFFFF, 0xFF); glClearBufferfi(GL_DEPTH_STENCIL, 0, (GLfloat)clearAttributes.depth / (GLfloat)0x00FFFFFF, 0xFF);
glClearBufferfv(GL_COLOR, 0, oglColor); // texGColorID glClearBufferfv(GL_COLOR, 0, oglColor); // texGColorID

View File

@ -94,7 +94,7 @@ protected:
virtual Render3DError DestroyToonTable(); virtual Render3DError DestroyToonTable();
virtual Render3DError UpdateToonTable(const u16 *toonTableBuffer); virtual Render3DError UpdateToonTable(const u16 *toonTableBuffer);
virtual Render3DError ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer); virtual Render3DError ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer);
virtual Render3DError ClearUsingValues(const FragmentColor &clearColor, const FragmentAttributes &clearAttributes) const; virtual Render3DError ClearUsingValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) const;
virtual void SetPolygonIndex(const size_t index); virtual void SetPolygonIndex(const size_t index);
virtual Render3DError SetupPolygon(const POLY &thePoly); virtual Render3DError SetupPolygon(const POLY &thePoly);

View File

@ -266,63 +266,9 @@ Viewer3d_State* viewer3d_state = NULL;
static GFX3D_Clipper boxtestClipper; static GFX3D_Clipper boxtestClipper;
//tables that are provided to anyone //tables that are provided to anyone
CACHE_ALIGN u32 color_15bit_to_24bit_reverse[32768];
CACHE_ALIGN u32 color_15bit_to_24bit[32768];
CACHE_ALIGN u16 color_15bit_to_16bit_reverse[32768];
CACHE_ALIGN u8 mixTable555[32][32][32]; CACHE_ALIGN u8 mixTable555[32][32][32];
CACHE_ALIGN u32 dsDepthExtend_15bit_to_24bit[32768]; CACHE_ALIGN u32 dsDepthExtend_15bit_to_24bit[32768];
//is this a crazy idea? this table spreads 5 bits evenly over 31 from exactly 0 to INT_MAX
CACHE_ALIGN const u32 material_5bit_to_31bit[] = {
0x00000000, 0x04210842, 0x08421084, 0x0C6318C6,
0x10842108, 0x14A5294A, 0x18C6318C, 0x1CE739CE,
0x21084210, 0x25294A52, 0x294A5294, 0x2D6B5AD6,
0x318C6318, 0x35AD6B5A, 0x39CE739C, 0x3DEF7BDE,
0x42108421, 0x46318C63, 0x4A5294A5, 0x4E739CE7,
0x5294A529, 0x56B5AD6B, 0x5AD6B5AD, 0x5EF7BDEF,
0x6318C631, 0x6739CE73, 0x6B5AD6B5, 0x6F7BDEF7,
0x739CE739, 0x77BDEF7B, 0x7BDEF7BD, 0x7FFFFFFF
};
CACHE_ALIGN const u8 material_5bit_to_6bit[] = {
0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
0x10, 0x12, 0x14, 0x16, 0x19, 0x1A, 0x1C, 0x1E,
0x21, 0x23, 0x25, 0x27, 0x29, 0x2B, 0x2D, 0x2F,
0x31, 0x33, 0x35, 0x37, 0x39, 0x3B, 0x3D, 0x3F
};
CACHE_ALIGN const u8 material_5bit_to_8bit[] = {
0x00, 0x08, 0x10, 0x18, 0x21, 0x29, 0x31, 0x39,
0x42, 0x4A, 0x52, 0x5A, 0x63, 0x6B, 0x73, 0x7B,
0x84, 0x8C, 0x94, 0x9C, 0xA5, 0xAD, 0xB5, 0xBD,
0xC6, 0xCE, 0xD6, 0xDE, 0xE7, 0xEF, 0xF7, 0xFF
};
CACHE_ALIGN const u8 material_6bit_to_8bit[] = {
0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C,
0x20, 0x24, 0x28, 0x2C, 0x30, 0x34, 0x38, 0x3C,
0x41, 0x45, 0x49, 0x4D, 0x51, 0x55, 0x59, 0x5D,
0x61, 0x65, 0x69, 0x6D, 0x71, 0x75, 0x79, 0x7D,
0x82, 0x86, 0x8A, 0x8E, 0x92, 0x96, 0x9A, 0x9E,
0xA2, 0xA6, 0xAA, 0xAE, 0xB2, 0xB6, 0xBA, 0xBE,
0xC3, 0xC7, 0xCB, 0xCF, 0xD3, 0xD7, 0xDB, 0xDF,
0xE3, 0xE7, 0xEB, 0xEF, 0xF3, 0xF7, 0xFB, 0xFF
};
CACHE_ALIGN const u8 material_3bit_to_8bit[] = {
0x00, 0x24, 0x49, 0x6D, 0x92, 0xB6, 0xDB, 0xFF
};
//maybe not very precise
CACHE_ALIGN const u8 material_3bit_to_5bit[] = {
0, 4, 8, 13, 17, 22, 26, 31
};
//TODO - generate this in the static init method more accurately
CACHE_ALIGN const u8 material_3bit_to_6bit[] = {
0, 8, 16, 26, 34, 44, 52, 63
};
//private acceleration tables //private acceleration tables
static float float16table[65536]; static float float16table[65536];
static float float10Table[1024]; static float float10Table[1024];
@ -451,21 +397,11 @@ static BOOL flushPending = FALSE;
static BOOL drawPending = FALSE; static BOOL drawPending = FALSE;
//------------------------------------------------------------ //------------------------------------------------------------
static void makeTables() { static void makeTables()
{
//produce the color bits of a 24bpp color from a DS RGB15 using bit logic (internal use only)
#define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] )
//produce the color bits of a 24bpp color from a DS RGB15 using bit logic (internal use only). RGB are reverse of usual
#define RGB15TO24_BITLOGIC_REVERSE(col) ( (material_5bit_to_8bit[(col)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[((col)>>10)&0x1F] )
for (size_t i = 0; i < 32768; i++) for (size_t i = 0; i < 32768; i++)
{ {
color_15bit_to_24bit[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) ); // 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane
color_15bit_to_24bit_reverse[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC_REVERSE(i) );
color_15bit_to_16bit_reverse[i] = (((i & 0x001F) << 11) | (material_5bit_to_6bit[(i & 0x03E0) >> 5] << 5) | ((i & 0x7C00) >> 10));
// 15-bit to 24-bit depth formula from http://nocash.emubase.de/gbatek.htm#ds3drearplane
dsDepthExtend_15bit_to_24bit[i] = LE_TO_LOCAL_32( (i*0x200)+((i+1)>>15)*0x01FF ); dsDepthExtend_15bit_to_24bit[i] = LE_TO_LOCAL_32( (i*0x200)+((i+1)>>15)*0x01FF );
} }
@ -771,9 +707,9 @@ static void SetVertex()
vert.coord[1] = coordTransformed[1]/4096.0f; vert.coord[1] = coordTransformed[1]/4096.0f;
vert.coord[2] = coordTransformed[2]/4096.0f; vert.coord[2] = coordTransformed[2]/4096.0f;
vert.coord[3] = coordTransformed[3]/4096.0f; vert.coord[3] = coordTransformed[3]/4096.0f;
vert.color[0] = GFX3D_5TO6(colorRGB[0]); vert.color[0] = GFX3D_5TO6_LOOKUP(colorRGB[0]);
vert.color[1] = GFX3D_5TO6(colorRGB[1]); vert.color[1] = GFX3D_5TO6_LOOKUP(colorRGB[1]);
vert.color[2] = GFX3D_5TO6(colorRGB[2]); vert.color[2] = GFX3D_5TO6_LOOKUP(colorRGB[2]);
vert.color_to_float(); vert.color_to_float();
tempVertInfo.map[tempVertInfo.count] = vertlist->count + tempVertInfo.count - continuation; tempVertInfo.map[tempVertInfo.count] = vertlist->count + tempVertInfo.count - continuation;
tempVertInfo.count++; tempVertInfo.count++;

View File

@ -69,52 +69,8 @@ class EMUFILE;
#define GFX3D_VEC_TEST 0x72 #define GFX3D_VEC_TEST 0x72
#define GFX3D_NOP_NOARG_HACK 0xDD #define GFX3D_NOP_NOARG_HACK 0xDD
//produce a 32bpp color from a ds RGB15, using a table
#define RGB15TO32_NOALPHA(col) ( color_15bit_to_24bit[col&0x7FFF] )
//produce a 32bpp color from a ds RGB15 plus an 8bit alpha, using a table
#ifdef WORDS_BIGENDIAN
#define RGB15TO32(col,alpha8) ( (alpha8) | color_15bit_to_24bit[(col)&0x7FFF] )
#else
#define RGB15TO32(col,alpha8) ( ((alpha8)<<24) | color_15bit_to_24bit[(col)&0x7FFF] )
#endif
//produce a 5555 32bit color from a ds RGB15 plus an 5bit alpha
#ifdef WORDS_BIGENDIAN
#define RGB15TO5555(col,alpha5) ( (alpha5) | ((((col) & 0x7C00)>>10)<<8) | ((((col) & 0x03E0)>>5)<<16) | (((col) & 0x001F)<<24) )
#else
#define RGB15TO5555(col,alpha5) ( ((alpha5)<<24) | ((((col) & 0x7C00)>>10)<<16) | ((((col) & 0x03E0)>>5)<<8) | ((col) & 0x001F) )
#endif
//produce a 6665 32bit color from a ds RGB15 plus an 5bit alpha
inline u32 RGB15TO6665(u16 col, u8 alpha5)
{
const u16 r = (col&0x001F)>>0;
const u16 g = (col&0x03E0)>>5;
const u16 b = (col&0x7C00)>>10;
#ifdef WORDS_BIGENDIAN
const u32 ret = alpha5 | (((b<<1)+1)<<8) | (((g<<1)+1)<<16) | (((r<<1)+1)<<24);
#else
const u32 ret = (alpha5<<24) | (((b<<1)+1)<<16) | (((g<<1)+1)<<8) | ((r<<1)+1);
#endif
return ret;
}
//produce a 24bpp color from a ds RGB15, using a table
#define RGB15TO24_REVERSE(col) ( color_15bit_to_24bit_reverse[(col)&0x7FFF] )
//produce a 16bpp color from a ds RGB15, using a table
#define RGB15TO16_REVERSE(col) ( color_15bit_to_16bit_reverse[(col)&0x7FFF] )
//produce a 15bpp color from individual 5bit components
#define R5G5B5TORGB15(r,g,b) ( (r) | ((g)<<5) | ((b)<<10) )
//produce a 16bpp color from individual 5bit components
#define R6G6B6TORGB15(r,g,b) ( ((r)>>1) | (((g)&0x3E)<<4) | (((b)&0x3E)<<9) )
#define GFX3D_5TO6(x) ((x)?(((x)<<1)+1):0) #define GFX3D_5TO6(x) ((x)?(((x)<<1)+1):0)
#define GFX3D_5TO6_LOOKUP(x) (material_5bit_to_6bit[(x)])
// 15-bit to 24-bit depth formula from http://nocash.emubase.de/gbatek.htm#ds3drearplane // 15-bit to 24-bit depth formula from http://nocash.emubase.de/gbatek.htm#ds3drearplane
#define DS_DEPTH15TO24(depth) ( dsDepthExtend_15bit_to_24bit[(depth) & 0x7FFF] ) #define DS_DEPTH15TO24(depth) ( dsDepthExtend_15bit_to_24bit[(depth) & 0x7FFF] )
@ -733,18 +689,8 @@ extern u32 Render3DFramesPerSecond; // save the current 3D rendering frame count
//--------------------- //---------------------
extern CACHE_ALIGN u32 color_15bit_to_24bit[32768];
extern CACHE_ALIGN u32 color_15bit_to_24bit_reverse[32768];
extern CACHE_ALIGN u16 color_15bit_to_16bit_reverse[32768];
extern CACHE_ALIGN u32 dsDepthExtend_15bit_to_24bit[32768]; extern CACHE_ALIGN u32 dsDepthExtend_15bit_to_24bit[32768];
extern CACHE_ALIGN u8 mixTable555[32][32][32]; extern CACHE_ALIGN u8 mixTable555[32][32][32];
extern CACHE_ALIGN const u32 material_5bit_to_31bit[32];
extern CACHE_ALIGN const u8 material_5bit_to_6bit[32];
extern CACHE_ALIGN const u8 material_5bit_to_8bit[32];
extern CACHE_ALIGN const u8 material_6bit_to_8bit[64];
extern CACHE_ALIGN const u8 material_3bit_to_5bit[8];
extern CACHE_ALIGN const u8 material_3bit_to_6bit[8];
extern CACHE_ALIGN const u8 material_3bit_to_8bit[8];
extern BOOL isSwapBuffers; extern BOOL isSwapBuffers;

View File

@ -49,10 +49,6 @@
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
#ifdef ENABLE_SSSE3
#include <tmmintrin.h>
#endif
#include "bits.h" #include "bits.h"
#include "common.h" #include "common.h"
#include "matrix.h" #include "matrix.h"
@ -495,7 +491,7 @@ public:
dst.r = modulate_table[mainTexColor.r][src.r]; dst.r = modulate_table[mainTexColor.r][src.r];
dst.g = modulate_table[mainTexColor.g][src.g]; dst.g = modulate_table[mainTexColor.g][src.g];
dst.b = modulate_table[mainTexColor.b][src.b]; dst.b = modulate_table[mainTexColor.b][src.b];
dst.a = modulate_table[GFX3D_5TO6(mainTexColor.a)][GFX3D_5TO6(src.a)]>>1; dst.a = modulate_table[GFX3D_5TO6_LOOKUP(mainTexColor.a)][GFX3D_5TO6_LOOKUP(src.a)]>>1;
//dst.a = 28; //dst.a = 28;
//#ifdef _MSC_VER //#ifdef _MSC_VER
//if(GetAsyncKeyState(VK_SHIFT)) { //if(GetAsyncKeyState(VK_SHIFT)) {
@ -538,7 +534,7 @@ public:
dst.r = modulate_table[mainTexColor.r][src.r]; dst.r = modulate_table[mainTexColor.r][src.r];
dst.g = modulate_table[mainTexColor.g][src.r]; dst.g = modulate_table[mainTexColor.g][src.r];
dst.b = modulate_table[mainTexColor.b][src.r]; dst.b = modulate_table[mainTexColor.b][src.r];
dst.a = modulate_table[GFX3D_5TO6(mainTexColor.a)][GFX3D_5TO6(src.a)] >> 1; dst.a = modulate_table[GFX3D_5TO6_LOOKUP(mainTexColor.a)][GFX3D_5TO6_LOOKUP(src.a)] >> 1;
dst.r = min<u8>(0x3F, (dst.r + toonColor.r)); dst.r = min<u8>(0x3F, (dst.r + toonColor.r));
dst.g = min<u8>(0x3F, (dst.g + toonColor.g)); dst.g = min<u8>(0x3F, (dst.g + toonColor.g));
@ -549,7 +545,7 @@ public:
dst.r = modulate_table[mainTexColor.r][toonColor.r]; dst.r = modulate_table[mainTexColor.r][toonColor.r];
dst.g = modulate_table[mainTexColor.g][toonColor.g]; dst.g = modulate_table[mainTexColor.g][toonColor.g];
dst.b = modulate_table[mainTexColor.b][toonColor.b]; dst.b = modulate_table[mainTexColor.b][toonColor.b];
dst.a = modulate_table[GFX3D_5TO6(mainTexColor.a)][GFX3D_5TO6(src.a)] >> 1; dst.a = modulate_table[GFX3D_5TO6_LOOKUP(mainTexColor.a)][GFX3D_5TO6_LOOKUP(src.a)] >> 1;
} }
} }
break; break;
@ -1132,9 +1128,7 @@ void _HACK_Viewer_ExecUnit()
static Render3D* SoftRasterizerRendererCreate() static Render3D* SoftRasterizerRendererCreate()
{ {
#if defined(ENABLE_SSSE3) #if defined(ENABLE_SSE2)
return new SoftRasterizerRenderer_SSSE3;
#elif defined(ENABLE_SSE2)
return new SoftRasterizerRenderer_SSE2; return new SoftRasterizerRenderer_SSE2;
#else #else
return new SoftRasterizerRenderer; return new SoftRasterizerRenderer;
@ -1145,9 +1139,7 @@ static void SoftRasterizerRendererDestroy()
{ {
if (CurrentRenderer != BaseRenderer) if (CurrentRenderer != BaseRenderer)
{ {
#if defined(ENABLE_SSSE3) #if defined(ENABLE_SSE2)
SoftRasterizerRenderer_SSSE3 *oldRenderer = (SoftRasterizerRenderer_SSSE3 *)CurrentRenderer;
#elif defined(ENABLE_SSE2)
SoftRasterizerRenderer_SSE2 *oldRenderer = (SoftRasterizerRenderer_SSE2 *)CurrentRenderer; SoftRasterizerRenderer_SSE2 *oldRenderer = (SoftRasterizerRenderer_SSE2 *)CurrentRenderer;
#else #else
SoftRasterizerRenderer *oldRenderer = (SoftRasterizerRenderer *)CurrentRenderer; SoftRasterizerRenderer *oldRenderer = (SoftRasterizerRenderer *)CurrentRenderer;
@ -1647,11 +1639,7 @@ Render3DError SoftRasterizerRenderer::UpdateEdgeMarkColorTable(const u16 *edgeMa
//we can do this by rendering a 3d frame and then freezing the system, but only changing the edge mark colors //we can do this by rendering a 3d frame and then freezing the system, but only changing the edge mark colors
for (size_t i = 0; i < 8; i++) for (size_t i = 0; i < 8; i++)
{ {
const u16 col = edgeMarkColorTable[i]; this->edgeMarkTable[i].color = COLOR555TO6665(edgeMarkColorTable[i] & 0x7FFF, (this->currentRenderState->enableAntialiasing) ? 0x10 : 0x1F);
this->edgeMarkTable[i].color = RGB15TO5555(col, (this->currentRenderState->enableAntialiasing) ? 0x10 : 0x1F);
this->edgeMarkTable[i].r = GFX3D_5TO6(this->edgeMarkTable[i].r);
this->edgeMarkTable[i].g = GFX3D_5TO6(this->edgeMarkTable[i].g);
this->edgeMarkTable[i].b = GFX3D_5TO6(this->edgeMarkTable[i].b);
//zero 20-jun-2013 - this doesnt make any sense. at least, it should be related to the 0x8000 bit. if this is undocumented behaviour, lets write about which scenario proves it here, or which scenario is requiring this code. //zero 20-jun-2013 - this doesnt make any sense. at least, it should be related to the 0x8000 bit. if this is undocumented behaviour, lets write about which scenario proves it here, or which scenario is requiring this code.
//// this seems to be the only thing that selectively disables edge marking //// this seems to be the only thing that selectively disables edge marking
@ -1735,10 +1723,9 @@ Render3DError SoftRasterizerRenderer::UpdateFogTable(const u8 *fogDensityTable)
// new multithreaded method. // new multithreaded method.
Render3DError SoftRasterizerRenderer::RenderFog(const u8 *densityTable, const u32 color, const u32 offset, const u8 shift, const bool alphaOnly) Render3DError SoftRasterizerRenderer::RenderFog(const u8 *densityTable, const u32 color, const u32 offset, const u8 shift, const bool alphaOnly)
{ {
u32 r = GFX3D_5TO6((color)&0x1F); FragmentColor fogColor;
u32 g = GFX3D_5TO6((color>>5)&0x1F); fogColor.color = COLOR555TO6665( color & 0x7FFF, (color>>16) & 0x1F );
u32 b = GFX3D_5TO6((color>>10)&0x1F);
u32 a = (color>>16)&0x1F;
const size_t framebufferFragmentCount = this->_framebufferWidth * this->_framebufferHeight; const size_t framebufferFragmentCount = this->_framebufferWidth * this->_framebufferHeight;
if (!alphaOnly) if (!alphaOnly)
@ -1750,10 +1737,10 @@ Render3DError SoftRasterizerRenderer::RenderFog(const u8 *densityTable, const u3
const u8 fog = (this->_framebufferAttributes->isFogged[i] != 0) ? this->fogTable[fogIndex] : 0; const u8 fog = (this->_framebufferAttributes->isFogged[i] != 0) ? this->fogTable[fogIndex] : 0;
FragmentColor &destFragmentColor = this->_framebufferColor[i]; FragmentColor &destFragmentColor = this->_framebufferColor[i];
destFragmentColor.r = ((128-fog)*destFragmentColor.r + r*fog)>>7; destFragmentColor.r = ((128-fog)*destFragmentColor.r + fogColor.r*fog)>>7;
destFragmentColor.g = ((128-fog)*destFragmentColor.g + g*fog)>>7; destFragmentColor.g = ((128-fog)*destFragmentColor.g + fogColor.g*fog)>>7;
destFragmentColor.b = ((128-fog)*destFragmentColor.b + b*fog)>>7; destFragmentColor.b = ((128-fog)*destFragmentColor.b + fogColor.b*fog)>>7;
destFragmentColor.a = ((128-fog)*destFragmentColor.a + a*fog)>>7; destFragmentColor.a = ((128-fog)*destFragmentColor.a + fogColor.a*fog)>>7;
} }
} }
else else
@ -1765,7 +1752,7 @@ Render3DError SoftRasterizerRenderer::RenderFog(const u8 *densityTable, const u3
const u8 fog = (this->_framebufferAttributes->isFogged[i] != 0) ? this->fogTable[fogIndex] : 0; const u8 fog = (this->_framebufferAttributes->isFogged[i] != 0) ? this->fogTable[fogIndex] : 0;
FragmentColor &destFragmentColor = this->_framebufferColor[i]; FragmentColor &destFragmentColor = this->_framebufferColor[i];
destFragmentColor.a = ((128-fog)*destFragmentColor.a + a*fog)>>7; destFragmentColor.a = ((128-fog)*destFragmentColor.a + fogColor.a*fog)>>7;
} }
} }
@ -1825,10 +1812,8 @@ END_EDGE_MARK: ;
if (param.enableFog) if (param.enableFog)
{ {
const u32 r = GFX3D_5TO6( (param.fogColor ) & 0x1F ); FragmentColor fogColor;
const u32 g = GFX3D_5TO6( (param.fogColor >> 5) & 0x1F ); fogColor.color = COLOR555TO6665( param.fogColor & 0x7FFF, (param.fogColor>>16) & 0x1F );
const u32 b = GFX3D_5TO6( (param.fogColor >> 10) & 0x1F );
const u32 a = (param.fogColor >> 16) & 0x1F;
const size_t fogIndex = depth >> 9; const size_t fogIndex = depth >> 9;
assert(fogIndex < 32768); assert(fogIndex < 32768);
@ -1836,12 +1821,12 @@ END_EDGE_MARK: ;
if (!param.fogAlphaOnly) if (!param.fogAlphaOnly)
{ {
dstColor.r = ( (128-fog)*dstColor.r + r*fog ) >> 7; dstColor.r = ( (128-fog)*dstColor.r + fogColor.r*fog ) >> 7;
dstColor.g = ( (128-fog)*dstColor.g + g*fog ) >> 7; dstColor.g = ( (128-fog)*dstColor.g + fogColor.g*fog ) >> 7;
dstColor.b = ( (128-fog)*dstColor.b + b*fog ) >> 7; dstColor.b = ( (128-fog)*dstColor.b + fogColor.b*fog ) >> 7;
} }
dstColor.a = ( (128-fog)*dstColor.a + a*fog ) >> 7; dstColor.a = ( (128-fog)*dstColor.a + fogColor.a*fog ) >> 7;
} }
} }
} }
@ -1854,7 +1839,7 @@ Render3DError SoftRasterizerRenderer::UpdateToonTable(const u16 *toonTableBuffer
//convert the toon colors //convert the toon colors
for (size_t i = 0; i < 32; i++) for (size_t i = 0; i < 32; i++)
{ {
this->toonColor32LUT[i].color = (RGB15TO32_NOALPHA(toonTableBuffer[i])>>2)&0x3F3F3F3F; this->toonColor32LUT[i].color = ( COLOR555TO888(toonTableBuffer[i] & 0x7FFF) >> 2 ) & 0x003F3F3F;
//printf("%d %d %d %d\n", this->toonColor32LUT[i].r, this->toonColor32LUT[i].g, this->toonColor32LUT[i].b, this->toonColor32LUT[i].a); //printf("%d %d %d %d\n", this->toonColor32LUT[i].r, this->toonColor32LUT[i].g, this->toonColor32LUT[i].b, this->toonColor32LUT[i].a);
} }
@ -1874,7 +1859,7 @@ Render3DError SoftRasterizerRenderer::ClearUsingImage(const u16 *__restrict colo
{ {
const size_t ir = readLine + ((x * xRatio) >> 16); const size_t ir = readLine + ((x * xRatio) >> 16);
this->_framebufferColor[iw].color = RGB15TO6665(colorBuffer[ir] & 0x7FFF, (colorBuffer[ir] >> 15) * 0x1F); this->_framebufferColor[iw].color = COLOR555TO6665(colorBuffer[ir] & 0x7FFF, (colorBuffer[ir] >> 15) * 0x1F);
this->_framebufferAttributes->depth[iw] = depthBuffer[ir]; this->_framebufferAttributes->depth[iw] = depthBuffer[ir];
this->_framebufferAttributes->isFogged[iw] = fogBuffer[ir]; this->_framebufferAttributes->isFogged[iw] = fogBuffer[ir];
this->_framebufferAttributes->opaquePolyID[iw] = polyIDBuffer[ir]; this->_framebufferAttributes->opaquePolyID[iw] = polyIDBuffer[ir];
@ -1887,17 +1872,12 @@ Render3DError SoftRasterizerRenderer::ClearUsingImage(const u16 *__restrict colo
return RENDER3DERROR_NOERR; return RENDER3DERROR_NOERR;
} }
Render3DError SoftRasterizerRenderer::ClearUsingValues(const FragmentColor &clearColor, const FragmentAttributes &clearAttributes) const Render3DError SoftRasterizerRenderer::ClearUsingValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) const
{ {
FragmentColor convertedClearColor = clearColor;
convertedClearColor.r = GFX3D_5TO6(clearColor.r);
convertedClearColor.g = GFX3D_5TO6(clearColor.g);
convertedClearColor.b = GFX3D_5TO6(clearColor.b);
for (size_t i = 0; i < (this->_framebufferWidth * this->_framebufferHeight); i++) for (size_t i = 0; i < (this->_framebufferWidth * this->_framebufferHeight); i++)
{ {
this->_framebufferAttributes->SetAtIndex(i, clearAttributes); this->_framebufferAttributes->SetAtIndex(i, clearAttributes);
this->_framebufferColor[i] = convertedClearColor; this->_framebufferColor[i] = clearColor6665;
} }
return RENDER3DERROR_NOERR; return RENDER3DERROR_NOERR;
@ -2046,14 +2026,9 @@ Render3DError SoftRasterizerRenderer::SetFramebufferSize(size_t w, size_t h)
#ifdef ENABLE_SSE2 #ifdef ENABLE_SSE2
Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor &clearColor, const FragmentAttributes &clearAttributes) const Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) const
{ {
FragmentColor convertedClearColor = clearColor; const __m128i color_vec128 = _mm_set1_epi32(clearColor6665.color);
convertedClearColor.r = GFX3D_5TO6(clearColor.r);
convertedClearColor.g = GFX3D_5TO6(clearColor.g);
convertedClearColor.b = GFX3D_5TO6(clearColor.b);
const __m128i color_vec128 = _mm_set1_epi32(convertedClearColor.color);
const __m128i attrDepth_vec128 = _mm_set1_epi32(clearAttributes.depth); const __m128i attrDepth_vec128 = _mm_set1_epi32(clearAttributes.depth);
const __m128i attrOpaquePolyID_vec128 = _mm_set1_epi8(clearAttributes.opaquePolyID); const __m128i attrOpaquePolyID_vec128 = _mm_set1_epi8(clearAttributes.opaquePolyID);
const __m128i attrTranslucentPolyID_vec128 = _mm_set1_epi8(clearAttributes.translucentPolyID); const __m128i attrTranslucentPolyID_vec128 = _mm_set1_epi8(clearAttributes.translucentPolyID);
@ -2086,7 +2061,7 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
this->_framebufferColor[i] = convertedClearColor; this->_framebufferColor[i] = clearColor6665;
this->_framebufferAttributes->SetAtIndex(i, clearAttributes); this->_framebufferAttributes->SetAtIndex(i, clearAttributes);
} }

View File

@ -39,9 +39,7 @@ struct SoftRasterizerPostProcessParams
bool fogAlphaOnly; bool fogAlphaOnly;
}; };
#if defined(ENABLE_SSSE3) #if defined(ENABLE_SSE2)
class SoftRasterizerRenderer : public Render3D_SSSE3
#elif defined(ENABLE_SSE2)
class SoftRasterizerRenderer : public Render3D_SSE2 class SoftRasterizerRenderer : public Render3D_SSE2
#else #else
class SoftRasterizerRenderer : public Render3D class SoftRasterizerRenderer : public Render3D
@ -69,7 +67,7 @@ protected:
virtual Render3DError EndRender(const u64 frameCount); virtual Render3DError EndRender(const u64 frameCount);
virtual Render3DError ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer); virtual Render3DError ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer);
virtual Render3DError ClearUsingValues(const FragmentColor &clearColor, const FragmentAttributes &clearAttributes) const; virtual Render3DError ClearUsingValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) const;
public: public:
int _debug_drawClippedUserPoly; int _debug_drawClippedUserPoly;
@ -106,16 +104,7 @@ public:
class SoftRasterizerRenderer_SSE2 : public SoftRasterizerRenderer class SoftRasterizerRenderer_SSE2 : public SoftRasterizerRenderer
{ {
virtual Render3DError ClearUsingValues(const FragmentColor &clearColor, const FragmentAttributes &clearAttributes) const; virtual Render3DError ClearUsingValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) const;
};
#endif
#ifdef ENABLE_SSSE3
class SoftRasterizerRenderer_SSSE3 : public SoftRasterizerRenderer_SSE2
{
}; };
#endif #endif

View File

@ -24,10 +24,6 @@
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
#ifdef ENABLE_SSSE3
#include <tmmintrin.h>
#endif
#include "bits.h" #include "bits.h"
#include "common.h" #include "common.h"
#include "gfx3d.h" #include "gfx3d.h"
@ -612,23 +608,11 @@ Render3DError Render3D::FlushFramebuffer(const FragmentColor *__restrict srcFram
{ {
if ( (this->_internalRenderingFormat == NDSColorFormat_BGR888_Rev) && (this->_outputFormat == NDSColorFormat_BGR666_Rev) ) if ( (this->_internalRenderingFormat == NDSColorFormat_BGR888_Rev) && (this->_outputFormat == NDSColorFormat_BGR666_Rev) )
{ {
for (size_t i = 0; i < pixCount; i++) ConvertColorBuffers8888To6665<false>(srcFramebuffer, dstFramebuffer, pixCount);
{
dstFramebuffer[i].r = srcFramebuffer[i].r >> 2;
dstFramebuffer[i].g = srcFramebuffer[i].g >> 2;
dstFramebuffer[i].b = srcFramebuffer[i].b >> 2;
dstFramebuffer[i].a = srcFramebuffer[i].a >> 3;
}
} }
else if ( (this->_internalRenderingFormat == NDSColorFormat_BGR666_Rev) && (this->_outputFormat == NDSColorFormat_BGR888_Rev) ) else if ( (this->_internalRenderingFormat == NDSColorFormat_BGR666_Rev) && (this->_outputFormat == NDSColorFormat_BGR888_Rev) )
{ {
for (size_t i = 0; i < pixCount; i++) ConvertColorBuffers6665To8888<false>(srcFramebuffer, dstFramebuffer, pixCount);
{
dstFramebuffer[i].r = material_6bit_to_8bit[srcFramebuffer[i].r];
dstFramebuffer[i].g = material_6bit_to_8bit[srcFramebuffer[i].g];
dstFramebuffer[i].b = material_6bit_to_8bit[srcFramebuffer[i].b];
dstFramebuffer[i].a = material_5bit_to_8bit[srcFramebuffer[i].a];
}
} }
else if ( ((this->_internalRenderingFormat == NDSColorFormat_BGR666_Rev) && (this->_outputFormat == NDSColorFormat_BGR666_Rev)) || else if ( ((this->_internalRenderingFormat == NDSColorFormat_BGR666_Rev) && (this->_outputFormat == NDSColorFormat_BGR666_Rev)) ||
((this->_internalRenderingFormat == NDSColorFormat_BGR888_Rev) && (this->_outputFormat == NDSColorFormat_BGR888_Rev)) ) ((this->_internalRenderingFormat == NDSColorFormat_BGR888_Rev) && (this->_outputFormat == NDSColorFormat_BGR888_Rev)) )
@ -639,9 +623,13 @@ Render3DError Render3D::FlushFramebuffer(const FragmentColor *__restrict srcFram
if (dstRGBA5551 != NULL) if (dstRGBA5551 != NULL)
{ {
for (size_t i = 0; i < pixCount; i++) if (this->_outputFormat == NDSColorFormat_BGR666_Rev)
{ {
dstRGBA5551[i] = R6G6B6TORGB15(srcFramebuffer[i].r, srcFramebuffer[i].g, srcFramebuffer[i].b) | ((srcFramebuffer[i].a == 0) ? 0x0000 : 0x8000); ConvertColorBuffers6665To5551<false>(srcFramebuffer, dstRGBA5551, pixCount);
}
else if (this ->_outputFormat == NDSColorFormat_BGR888_Rev)
{
ConvertColorBuffers8888To5551<false>(srcFramebuffer, dstRGBA5551, pixCount);
} }
} }
@ -657,20 +645,8 @@ Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState)
{ {
Render3DError error = RENDER3DERROR_NOERR; Render3DError error = RENDER3DERROR_NOERR;
FragmentColor clearColor; FragmentColor clearColor6665;
clearColor6665.color = COLOR555TO6665(renderState.clearColor & 0x7FFF, (renderState.clearColor >> 16) & 0x1F);
#ifdef LOCAL_LE
clearColor.r = renderState.clearColor & 0x1F;
clearColor.g = (renderState.clearColor >> 5) & 0x1F;
clearColor.b = (renderState.clearColor >> 10) & 0x1F;
clearColor.a = (renderState.clearColor >> 16) & 0x1F;
#else
const u32 clearColorSwapped = LE_TO_LOCAL_32(renderState.clearColor);
clearColor.r = clearColorSwapped & 0x1F;
clearColor.g = (clearColorSwapped >> 5) & 0x1F;
clearColor.b = (clearColorSwapped >> 10) & 0x1F;
clearColor.a = (clearColorSwapped >> 16) & 0x1F;
#endif
FragmentAttributes clearFragment; FragmentAttributes clearFragment;
clearFragment.opaquePolyID = (renderState.clearColor >> 24) & 0x3F; clearFragment.opaquePolyID = (renderState.clearColor >> 24) & 0x3F;
@ -732,12 +708,12 @@ Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState)
error = this->ClearUsingImage(this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer); error = this->ClearUsingImage(this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer);
if (error != RENDER3DERROR_NOERR) if (error != RENDER3DERROR_NOERR)
{ {
error = this->ClearUsingValues(clearColor, clearFragment); error = this->ClearUsingValues(clearColor6665, clearFragment);
} }
} }
else else
{ {
error = this->ClearUsingValues(clearColor, clearFragment); error = this->ClearUsingValues(clearColor6665, clearFragment);
} }
return error; return error;
@ -748,7 +724,7 @@ Render3DError Render3D::ClearUsingImage(const u16 *__restrict colorBuffer, const
return RENDER3DERROR_NOERR; return RENDER3DERROR_NOERR;
} }
Render3DError Render3D::ClearUsingValues(const FragmentColor &clearColor, const FragmentAttributes &clearAttributes) const Render3DError Render3D::ClearUsingValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) const
{ {
return RENDER3DERROR_NOERR; return RENDER3DERROR_NOERR;
} }
@ -831,130 +807,12 @@ Render3DError Render3D::VramReconfigureSignal()
#ifdef ENABLE_SSE2 #ifdef ENABLE_SSE2
// Copies the rendered 3D framebuffer into up to two caller-supplied outputs,
// converting the pixel format along the way.
//
//   srcFramebuffer - source pixels, _framebufferWidth * _framebufferHeight entries.
//   dstFramebuffer - optional (may be NULL). Receives the pixels converted from
//                    _internalRenderingFormat to _outputFormat (or a straight copy
//                    when both formats match).
//   dstRGBA5551    - optional (may be NULL). Receives a 16-bit 5-5-5 color with
//                    bit 15 set whenever the source alpha is non-zero.
//
// Always returns RENDER3DERROR_NOERR.
//
// Each conversion pass uses its own pixel index. Previously a single index was
// shared by every loop in this function, so once the dstFramebuffer conversion
// had walked the whole buffer, the dstRGBA5551 pass started at pixCount and
// wrote nothing when both destinations were requested.
//
// NOTE(review): the aligned SSE2 loads/stores presumably rely on the buffers
// being 16-byte aligned -- confirm against the allocation sites.
Render3DError Render3D_SSE2::FlushFramebuffer(const FragmentColor *__restrict srcFramebuffer, FragmentColor *__restrict dstFramebuffer, u16 *__restrict dstRGBA5551)
{
	if ( (dstFramebuffer == NULL) && (dstRGBA5551 == NULL) )
	{
		return RENDER3DERROR_NOERR;
	}
	
	const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
	const size_t ssePixCount = pixCount - (pixCount % 4); // Largest multiple of 4 pixels (one 128-bit vector)
	
	if (dstFramebuffer != NULL)
	{
		if ( (this->_internalRenderingFormat == NDSColorFormat_BGR888_Rev) && (this->_outputFormat == NDSColorFormat_BGR666_Rev) )
		{
			size_t i = 0;
			
			for (; i < ssePixCount; i += 4)
			{
				// Convert to RGBA6665
				__m128i color6665 = _mm_load_si128((__m128i *)(srcFramebuffer + i));
				__m128i a = _mm_srli_epi32(_mm_and_si128(color6665, _mm_set1_epi32(0xF8000000)), 3);
				color6665 = _mm_srli_epi32(_mm_and_si128(color6665, _mm_set1_epi32(0x00FCFCFC)), 2);
				
				color6665 = _mm_or_si128(color6665, a);
				_mm_store_si128((__m128i *)(dstFramebuffer + i), color6665);
			}
			
			// Scalar tail for the pixels that do not fill a whole vector.
			for (; i < pixCount; i++)
			{
				dstFramebuffer[i].r = srcFramebuffer[i].r >> 2;
				dstFramebuffer[i].g = srcFramebuffer[i].g >> 2;
				dstFramebuffer[i].b = srcFramebuffer[i].b >> 2;
				dstFramebuffer[i].a = srcFramebuffer[i].a >> 3;
			}
		}
		else if ( (this->_internalRenderingFormat == NDSColorFormat_BGR666_Rev) && (this->_outputFormat == NDSColorFormat_BGR888_Rev) )
		{
			size_t i = 0;
			
			for (; i < ssePixCount; i += 4)
			{
				// Convert to RGBA8888:
				//    RGB   6-bit to 8-bit formula: dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03)
				//    Alpha 5-bit to 8-bit formula: dstA8   = (srcA5   << 3) | ((srcA5   >> 2) & 0x07)
				__m128i color8888 = _mm_load_si128((__m128i *)(srcFramebuffer + i));
				__m128i a = _mm_or_si128( _mm_and_si128(_mm_slli_epi32(color8888, 3), _mm_set1_epi8(0xF8)), _mm_and_si128(_mm_srli_epi32(color8888, 2), _mm_set1_epi8(0x07)) );
				color8888 = _mm_or_si128( _mm_and_si128(_mm_slli_epi32(color8888, 2), _mm_set1_epi8(0xFC)), _mm_and_si128(_mm_srli_epi32(color8888, 4), _mm_set1_epi8(0x03)) );
				
				// Keep the expanded RGB bytes from color8888 and the expanded alpha byte from a.
				color8888 = _mm_or_si128(_mm_and_si128(color8888, _mm_set1_epi32(0x00FFFFFF)), _mm_and_si128(a, _mm_set1_epi32(0xFF000000)));
				_mm_store_si128((__m128i *)(dstFramebuffer + i), color8888);
			}
			
			// Scalar tail for the pixels that do not fill a whole vector.
			for (; i < pixCount; i++)
			{
				dstFramebuffer[i].r = material_6bit_to_8bit[srcFramebuffer[i].r];
				dstFramebuffer[i].g = material_6bit_to_8bit[srcFramebuffer[i].g];
				dstFramebuffer[i].b = material_6bit_to_8bit[srcFramebuffer[i].b];
				dstFramebuffer[i].a = material_5bit_to_8bit[srcFramebuffer[i].a];
			}
		}
		else if ( ((this->_internalRenderingFormat == NDSColorFormat_BGR666_Rev) && (this->_outputFormat == NDSColorFormat_BGR666_Rev)) ||
		          ((this->_internalRenderingFormat == NDSColorFormat_BGR888_Rev) && (this->_outputFormat == NDSColorFormat_BGR888_Rev)) )
		{
			// Same format on both sides: no conversion needed.
			memcpy(dstFramebuffer, srcFramebuffer, pixCount * sizeof(FragmentColor));
		}
	}
	
	if (dstRGBA5551 != NULL)
	{
		// NOTE(review): the masks below keep the top 5 bits of 6-bit channels, so this
		// pass looks like it expects a 6665 source -- confirm the behavior wanted when
		// _internalRenderingFormat is NDSColorFormat_BGR888_Rev.
		
		// Fresh index: this pass must cover the whole buffer regardless of what the
		// dstFramebuffer pass above did.
		size_t i = 0;
		
		for (; i < ssePixCount; i += 4)
		{
			// Convert to RGBA5551
			__m128i color5551 = _mm_load_si128((__m128i *)(srcFramebuffer + i));
			
			__m128i r = _mm_and_si128(color5551, _mm_set1_epi32(0x0000003E));	// Read from R
			r = _mm_srli_epi32(r, 1);											// Shift to R
			
			__m128i g = _mm_and_si128(color5551, _mm_set1_epi32(0x00003E00));	// Read from G
			g = _mm_srli_epi32(g, 4);											// Shift in G
			
			__m128i b = _mm_and_si128(color5551, _mm_set1_epi32(0x003E0000));	// Read from B
			b = _mm_srli_epi32(b, 7);											// Shift to B
			
			__m128i a = _mm_and_si128(color5551, _mm_set1_epi32(0xFF000000));	// Read from A
			a = _mm_cmpeq_epi32(a, _mm_setzero_si128());						// Determine A
			
			// From here on, we're going to do an SSE2 trick to pack 32-bit down to unsigned
			// 16-bit. Since SSE2 only has packssdw (signed saturated 16-bit pack), using
			// packssdw on the alpha bit (0x8000) will result in a value of 0x7FFF, which is
			// incorrect. Now if we were to use SSE4.1's packusdw (unsigned saturated 16-bit
			// pack), we wouldn't have to go through this hassle. But not everyone has an
			// SSE4.1-capable CPU, so doing this the SSE2 way is more guaranteed to work for
			// everyone's CPU.
			//
			// To use packssdw, we take a bit one position lower for the alpha bit, run
			// packssdw, then shift the bit back to its original position. Then we por the
			// alpha vector with the post-packed color vector to get the final color.
			
			a = _mm_andnot_si128(a, _mm_set1_epi32(0x00004000));				// Mask out the bit before A
			a = _mm_packs_epi32(a, _mm_setzero_si128());						// Pack 32-bit down to 16-bit
			a = _mm_slli_epi16(a, 1);											// Shift the A bit back to where it needs to be
			
			// Assemble the RGB colors, pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in.
			color5551 = _mm_or_si128(_mm_or_si128(r, g), b);
			color5551 = _mm_packs_epi32(color5551, _mm_setzero_si128());
			color5551 = _mm_or_si128(color5551, a);
			
			// Only the low 64 bits (four 16-bit pixels) hold results.
			_mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color5551);
		}
		
		// Scalar tail for the pixels that do not fill a whole vector.
		for (; i < pixCount; i++)
		{
			dstRGBA5551[i] = R6G6B6TORGB15(srcFramebuffer[i].r, srcFramebuffer[i].g, srcFramebuffer[i].b) | ((srcFramebuffer[i].a == 0) ? 0x0000 : 0x8000);
		}
	}
	
	return RENDER3DERROR_NOERR;
}
Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState) Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
{ {
Render3DError error = RENDER3DERROR_NOERR; Render3DError error = RENDER3DERROR_NOERR;
FragmentColor clearColor; FragmentColor clearColor6665;
clearColor.r = renderState.clearColor & 0x1F; clearColor6665.color = COLOR555TO6665(renderState.clearColor & 0x7FFF, (renderState.clearColor >> 16) & 0x1F);
clearColor.g = (renderState.clearColor >> 5) & 0x1F;
clearColor.b = (renderState.clearColor >> 10) & 0x1F;
clearColor.a = (renderState.clearColor >> 16) & 0x1F;
FragmentAttributes clearFragment; FragmentAttributes clearFragment;
clearFragment.opaquePolyID = (renderState.clearColor >> 24) & 0x3F; clearFragment.opaquePolyID = (renderState.clearColor >> 24) & 0x3F;
@ -1080,12 +938,12 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
error = this->ClearUsingImage(this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer); error = this->ClearUsingImage(this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer);
if (error != RENDER3DERROR_NOERR) if (error != RENDER3DERROR_NOERR)
{ {
error = this->ClearUsingValues(clearColor, clearFragment); error = this->ClearUsingValues(clearColor6665, clearFragment);
} }
} }
else else
{ {
error = this->ClearUsingValues(clearColor, clearFragment); error = this->ClearUsingValues(clearColor6665, clearFragment);
} }
return error; return error;

View File

@ -149,7 +149,7 @@ protected:
virtual Render3DError FlushFramebuffer(const FragmentColor *__restrict srcFramebuffer, FragmentColor *__restrict dstFramebuffer, u16 *__restrict dstRGBA5551); virtual Render3DError FlushFramebuffer(const FragmentColor *__restrict srcFramebuffer, FragmentColor *__restrict dstFramebuffer, u16 *__restrict dstRGBA5551);
virtual Render3DError ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer); virtual Render3DError ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer);
virtual Render3DError ClearUsingValues(const FragmentColor &clearColor, const FragmentAttributes &clearAttributes) const; virtual Render3DError ClearUsingValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) const;
virtual Render3DError SetupPolygon(const POLY &thePoly); virtual Render3DError SetupPolygon(const POLY &thePoly);
virtual Render3DError SetupTexture(const POLY &thePoly, bool enableTexturing); virtual Render3DError SetupTexture(const POLY &thePoly, bool enableTexturing);
@ -201,22 +201,10 @@ public:
// Render3D_SSE2 derives from Render3D. In the left-hand (old) column it declares
// a protected FlushFramebuffer override plus a public ClearFramebuffer override;
// in the right-hand (new) column only the ClearFramebuffer override remains.
class Render3D_SSE2 : public Render3D class Render3D_SSE2 : public Render3D
{ {
protected:
virtual Render3DError FlushFramebuffer(const FragmentColor *__restrict srcFramebuffer, FragmentColor *__restrict dstFramebuffer, u16 *__restrict dstRGBA5551);
public: public:
virtual Render3DError ClearFramebuffer(const GFX3D_State &renderState); virtual Render3DError ClearFramebuffer(const GFX3D_State &renderState);
}; };
#endif #endif
#ifdef ENABLE_SSSE3
// Empty subclass of Render3D_SSE2: it declares no members of its own, so it
// behaves exactly like its base class.
class Render3D_SSSE3 : public Render3D_SSE2
{
};
#endif
#endif // RENDER3D_H #endif // RENDER3D_H

View File

@ -37,7 +37,7 @@ using std::max;
//only dump this from ogl renderer. for now, softrasterizer creates things in an incompatible pixel format //only dump this from ogl renderer. for now, softrasterizer creates things in an incompatible pixel format
//#define DEBUG_DUMP_TEXTURE //#define DEBUG_DUMP_TEXTURE
#define CONVERT(color,alpha) ((TEXFORMAT == TexFormat_32bpp)?(RGB15TO32(color,alpha)):RGB15TO6665(color,alpha)) #define CONVERT(color) ((TEXFORMAT == TexFormat_32bpp)?(COLOR555TO8888_OPAQUE(color)):COLOR555TO6665_OPAQUE(color))
//This class represents a number of regions of memory which should be viewed as contiguous //This class represents a number of regions of memory which should be viewed as contiguous
class MemSpan class MemSpan
@ -403,8 +403,7 @@ public:
// format that is not A3I5 or A5I3), set all transparent pixels to 0 so that 3D // format that is not A3I5 or A5I3), set all transparent pixels to 0 so that 3D
// renderers can assume that the transparent color is 0 during texture sampling. // renderers can assume that the transparent color is 0 during texture sampling.
const u8 opaqueColor = (TEXFORMAT == TexFormat_32bpp) ? 0xFF : 0x1F; const bool isPalZeroTransparent = ( ((format >> 29) & 1) != 0 );
const u8 palZeroTransparent = ( 1 - ((format>>29) & 1) ) * opaqueColor;
switch (newitem->mode) switch (newitem->mode)
{ {
@ -415,12 +414,12 @@ public:
adr = ms.items[j].ptr; adr = ms.items[j].ptr;
for(u32 x = 0; x < ms.items[j].len; x++) for(u32 x = 0; x < ms.items[j].len; x++)
{ {
u16 c = pal[*adr&31]; u16 c = pal[*adr&31] & 0x7FFF;
u8 alpha = *adr>>5; u8 alpha = *adr>>5;
if(TEXFORMAT == TexFormat_15bpp) if(TEXFORMAT == TexFormat_15bpp)
*dwdst++ = RGB15TO6665(c,material_3bit_to_5bit[alpha]); *dwdst++ = COLOR555TO6665(c,material_3bit_to_5bit[alpha]);
else else
*dwdst++ = RGB15TO32(c,material_3bit_to_8bit[alpha]); *dwdst++ = COLOR555TO8888(c,material_3bit_to_8bit[alpha]);
adr++; adr++;
} }
} }
@ -429,7 +428,7 @@ public:
case TEXMODE_I2: case TEXMODE_I2:
{ {
if (palZeroTransparent == 0) if (isPalZeroTransparent)
{ {
for(int j=0;j<ms.numItems;j++) for(int j=0;j<ms.numItems;j++)
{ {
@ -437,23 +436,18 @@ public:
for(u32 x = 0; x < ms.items[j].len; x++) for(u32 x = 0; x < ms.items[j].len; x++)
{ {
u8 bits; u8 bits;
u16 c;
bits = (*adr)&0x3; bits = (*adr)&0x3;
c = pal[bits]; *dwdst++ = (bits == 0) ? 0 : CONVERT(pal[bits] & 0x7FFF);
*dwdst++ = (bits == 0) ? 0 : CONVERT(c,opaqueColor);
bits = ((*adr)>>2)&0x3; bits = ((*adr)>>2)&0x3;
c = pal[bits]; *dwdst++ = (bits == 0) ? 0 : CONVERT(pal[bits] & 0x7FFF);
*dwdst++ = (bits == 0) ? 0 : CONVERT(c,opaqueColor);
bits = ((*adr)>>4)&0x3; bits = ((*adr)>>4)&0x3;
c = pal[bits]; *dwdst++ = (bits == 0) ? 0 : CONVERT(pal[bits] & 0x7FFF);
*dwdst++ = (bits == 0) ? 0 : CONVERT(c,opaqueColor);
bits = ((*adr)>>6)&0x3; bits = ((*adr)>>6)&0x3;
c = pal[bits]; *dwdst++ = (bits == 0) ? 0 : CONVERT(pal[bits] & 0x7FFF);
*dwdst++ = (bits == 0) ? 0 : CONVERT(c,opaqueColor);
adr++; adr++;
} }
@ -470,20 +464,20 @@ public:
u16 c; u16 c;
bits = (*adr)&0x3; bits = (*adr)&0x3;
c = pal[bits]; c = pal[bits] & 0x7FFF;
*dwdst++ = CONVERT(c,opaqueColor); *dwdst++ = CONVERT(c);
bits = ((*adr)>>2)&0x3; bits = ((*adr)>>2)&0x3;
c = pal[bits]; c = pal[bits] & 0x7FFF;
*dwdst++ = CONVERT(c,opaqueColor); *dwdst++ = CONVERT(c);
bits = ((*adr)>>4)&0x3; bits = ((*adr)>>4)&0x3;
c = pal[bits]; c = pal[bits] & 0x7FFF;
*dwdst++ = CONVERT(c,opaqueColor); *dwdst++ = CONVERT(c);
bits = ((*adr)>>6)&0x3; bits = ((*adr)>>6)&0x3;
c = pal[bits]; c = pal[bits] & 0x7FFF;
*dwdst++ = CONVERT(c,opaqueColor); *dwdst++ = CONVERT(c);
adr++; adr++;
} }
@ -494,7 +488,7 @@ public:
case TEXMODE_I4: case TEXMODE_I4:
{ {
if (palZeroTransparent == 0) if (isPalZeroTransparent)
{ {
for(int j=0;j<ms.numItems;j++) for(int j=0;j<ms.numItems;j++)
{ {
@ -502,15 +496,12 @@ public:
for(u32 x = 0; x < ms.items[j].len; x++) for(u32 x = 0; x < ms.items[j].len; x++)
{ {
u8 bits; u8 bits;
u16 c;
bits = (*adr)&0xF; bits = (*adr)&0xF;
c = pal[bits]; *dwdst++ = (bits == 0) ? 0 : CONVERT(pal[bits] & 0x7FFF);
*dwdst++ = (bits == 0) ? 0 : CONVERT(c,opaqueColor);
bits = ((*adr)>>4); bits = ((*adr)>>4);
c = pal[bits]; *dwdst++ = (bits == 0) ? 0 : CONVERT(pal[bits] & 0x7FFF);
*dwdst++ = (bits == 0) ? 0 : CONVERT(c,opaqueColor);
adr++; adr++;
} }
} }
@ -526,12 +517,12 @@ public:
u16 c; u16 c;
bits = (*adr)&0xF; bits = (*adr)&0xF;
c = pal[bits]; c = pal[bits] & 0x7FFF;
*dwdst++ = CONVERT(c,opaqueColor); *dwdst++ = CONVERT(c);
bits = ((*adr)>>4); bits = ((*adr)>>4);
c = pal[bits]; c = pal[bits] & 0x7FFF;
*dwdst++ = CONVERT(c,opaqueColor); *dwdst++ = CONVERT(c);
adr++; adr++;
} }
} }
@ -541,15 +532,14 @@ public:
case TEXMODE_I8: case TEXMODE_I8:
{ {
if (palZeroTransparent == 0) if (isPalZeroTransparent)
{ {
for(int j=0;j<ms.numItems;j++) for(int j=0;j<ms.numItems;j++)
{ {
adr = ms.items[j].ptr; adr = ms.items[j].ptr;
for(u32 x = 0; x < ms.items[j].len; ++x) for(u32 x = 0; x < ms.items[j].len; ++x)
{ {
u16 c = pal[*adr]; *dwdst++ = (*adr == 0) ? 0 : CONVERT(pal[*adr] & 0x7FFF);
*dwdst++ = (*adr == 0) ? 0 : CONVERT(c,opaqueColor);
adr++; adr++;
} }
} }
@ -561,8 +551,8 @@ public:
adr = ms.items[j].ptr; adr = ms.items[j].ptr;
for(u32 x = 0; x < ms.items[j].len; ++x) for(u32 x = 0; x < ms.items[j].len; ++x)
{ {
u16 c = pal[*adr]; const u16 c = pal[*adr] & 0x7FFF;
*dwdst++ = CONVERT(c,opaqueColor); *dwdst++ = CONVERT(c);
adr++; adr++;
} }
} }
@ -572,13 +562,14 @@ public:
case TEXMODE_4X4: case TEXMODE_4X4:
{ {
if(ms.numItems != 1) { if (ms.numItems != 1)
{
PROGINFO("Your 4x4 texture has overrun its texture slot.\n"); PROGINFO("Your 4x4 texture has overrun its texture slot.\n");
} }
//this check isnt necessary since the addressing is tied to the texture data which will also run out: //this check isnt necessary since the addressing is tied to the texture data which will also run out:
//if(msIndex.numItems != 1) PROGINFO("Your 4x4 texture index has overrun its slot.\n"); //if(msIndex.numItems != 1) PROGINFO("Your 4x4 texture index has overrun its slot.\n");
#define PAL4X4(offset) LE_TO_LOCAL_16( *(u16*)( MMU.texInfo.texPalSlot[((paletteAddress + (offset)*2)>>14)&0x7] + ((paletteAddress + (offset)*2)&0x3FFF) ) ) #define PAL4X4(offset) ( LE_TO_LOCAL_16( *(u16*)( MMU.texInfo.texPalSlot[((paletteAddress + (offset)*2)>>14)&0x7] + ((paletteAddress + (offset)*2)&0x3FFF) ) ) & 0x7FFF )
u16* slot1; u16* slot1;
u32* map = (u32*)ms.items[0].ptr; u32* map = (u32*)ms.items[0].ptr;
@ -603,31 +594,32 @@ public:
((y<<2)+2)*sizeX,((y<<2)+3)*sizeX}; ((y<<2)+2)*sizeX,((y<<2)+3)*sizeX};
for (int x = 0; x < xTmpSize; x ++, d++) for (int x = 0; x < xTmpSize; x ++, d++)
{ {
if(d >= limit) if (d >= limit)
dead = true; dead = true;
if(dead) { if (dead)
{
for (int sy = 0; sy < 4; sy++) for (int sy = 0; sy < 4; sy++)
{ {
u32 currentPos = (x<<2) + tmpPos[sy]; const u32 currentPos = (x<<2) + tmpPos[sy];
dwdst[currentPos] = dwdst[currentPos+1] = dwdst[currentPos+2] = dwdst[currentPos+3] = 0; dwdst[currentPos] = dwdst[currentPos+1] = dwdst[currentPos+2] = dwdst[currentPos+3] = 0;
} }
continue; continue;
} }
u32 currBlock = LE_TO_LOCAL_32(map[d]); const u32 currBlock = LE_TO_LOCAL_32(map[d]);
u16 pal1 = LE_TO_LOCAL_16(slot1[d]); const u16 pal1 = LE_TO_LOCAL_16(slot1[d]);
u16 pal1offset = (pal1 & 0x3FFF)<<1; const u16 pal1offset = (pal1 & 0x3FFF)<<1;
u8 mode = pal1>>14; const u8 mode = pal1>>14;
u32 tmp_col[4]; u32 tmp_col[4];
tmp_col[0] = RGB15TO32( PAL4X4(pal1offset), 0xFF ); tmp_col[0] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset) );
tmp_col[1] = RGB15TO32( PAL4X4(pal1offset+1), 0xFF ); tmp_col[1] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+1) );
switch (mode) switch (mode)
{ {
case 0: case 0:
tmp_col[2] = RGB15TO32( PAL4X4(pal1offset+2), 0xFF ); tmp_col[2] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+2) );
tmp_col[3] = 0x00000000; tmp_col[3] = 0x00000000;
break; break;
@ -647,8 +639,8 @@ public:
break; break;
case 2: case 2:
tmp_col[2] = RGB15TO32( PAL4X4(pal1offset+2), 0xFF ); tmp_col[2] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+2) );
tmp_col[3] = RGB15TO32( PAL4X4(pal1offset+3), 0xFF ); tmp_col[3] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+3) );
break; break;
case 3: case 3:
@ -676,13 +668,13 @@ public:
( ((g0*3 + g1*5)>>6) << 5 ) | ( ((g0*3 + g1*5)>>6) << 5 ) |
( ((b0*3 + b1*5)>>6) << 10 ); ( ((b0*3 + b1*5)>>6) << 10 );
tmp_col[2] = RGB15TO32(tmp1, 0xFF); tmp_col[2] = COLOR555TO8888_OPAQUE(tmp1);
tmp_col[3] = RGB15TO32(tmp2, 0xFF); tmp_col[3] = COLOR555TO8888_OPAQUE(tmp2);
break; break;
} }
} }
if(TEXFORMAT==TexFormat_15bpp) if (TEXFORMAT==TexFormat_15bpp)
{ {
for (size_t i = 0; i < 4; i++) for (size_t i = 0; i < 4; i++)
{ {
@ -706,8 +698,8 @@ public:
for (size_t sy = 0; sy < 4; sy++) for (size_t sy = 0; sy < 4; sy++)
{ {
// Texture offset // Texture offset
u32 currentPos = (x<<2) + tmpPos[sy]; const u32 currentPos = (x<<2) + tmpPos[sy];
u8 currRow = (u8)((currBlock>>(sy<<3))&0xFF); const u8 currRow = (u8)((currBlock>>(sy<<3))&0xFF);
dwdst[currentPos ] = tmp_col[ currRow &3]; dwdst[currentPos ] = tmp_col[ currRow &3];
dwdst[currentPos+1] = tmp_col[(currRow>>2)&3]; dwdst[currentPos+1] = tmp_col[(currRow>>2)&3];
@ -721,17 +713,17 @@ public:
case TEXMODE_A5I3: case TEXMODE_A5I3:
{ {
for(int j=0;j<ms.numItems;j++) for (int j = 0; j < ms.numItems; j++)
{ {
adr = ms.items[j].ptr; adr = ms.items[j].ptr;
for(u32 x = 0; x < ms.items[j].len; ++x) for (u32 x = 0; x < ms.items[j].len; ++x)
{ {
u16 c = pal[*adr&0x07]; const u16 c = pal[*adr&0x07] & 0x7FFF;
u8 alpha = (*adr>>3); const u8 alpha = (*adr>>3);
if(TEXFORMAT == TexFormat_15bpp) if (TEXFORMAT == TexFormat_15bpp)
*dwdst++ = RGB15TO6665(c,alpha); *dwdst++ = COLOR555TO6665(c,alpha);
else else
*dwdst++ = RGB15TO32(c,material_5bit_to_8bit[alpha]); *dwdst++ = COLOR555TO8888(c,material_5bit_to_8bit[alpha]);
adr++; adr++;
} }
} }
@ -740,15 +732,15 @@ public:
case TEXMODE_16BPP: case TEXMODE_16BPP:
{ {
for(int j=0;j<ms.numItems;j++) for (int j = 0; j < ms.numItems; j++)
{ {
u16* map = (u16*)ms.items[j].ptr; const u16 *map = (u16*)ms.items[j].ptr;
int len = ms.items[j].len>>1; const int len = ms.items[j].len>>1;
for(int x = 0; x < len; ++x) for (int x = 0; x < len; ++x)
{ {
u16 c = map[x]; const u16 c = map[x];
*dwdst++ = (c & 0x8000) ? CONVERT(c&0x7FFF,opaqueColor) : 0; *dwdst++ = (c & 0x8000) ? CONVERT(c&0x7FFF) : 0;
} }
} }
break; break;