SoftRasterizer: Do some multithreading improvements, and also clean up and refactor RasterizerUnit.

- Completely encapsulate all stray global variables into the SoftRasterizer class where they belong.
- Framebuffer clears are now fully multithreaded, significantly improving clearing performance.
- Doing multithreaded texture loads and vertex calculations now requires a minimum of 2 threads, down from 4 threads.
- The maximum number of SoftRasterizer threads has been increased from 16 to 32.
This commit is contained in:
rogerman 2018-02-12 11:35:21 -08:00
parent 9e3b694ace
commit 7509d469b9
6 changed files with 1204 additions and 968 deletions

View File

@ -1,7 +1,7 @@
/*
Copyright (C) 2006 yopyop
Copyright (C) 2006-2007 shash
Copyright (C) 2008-2017 DeSmuME team
Copyright (C) 2008-2018 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -1171,14 +1171,12 @@ Render3DError OpenGLRenderer::_FlushFramebufferFlipAndConvertOnCPU(const Fragmen
if (!doFramebufferFlip)
{
const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
if (!doFramebufferConvert)
{
if ( (dstFramebufferMain != NULL) && (dstFramebuffer16 != NULL) )
{
#ifdef ENABLE_SSE2
const size_t ssePixCount = pixCount - (pixCount % 8);
const size_t ssePixCount = this->_framebufferPixCount - (this->_framebufferPixCount % 8);
for (; i < ssePixCount; i += 8)
{
const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + i + 0));
@ -1191,7 +1189,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferFlipAndConvertOnCPU(const Fragmen
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++)
for (; i < this->_framebufferPixCount; i++)
{
dstFramebufferMain[i].color = ColorspaceCopy32<false>(srcFramebuffer[i]);
dstFramebuffer16[i] = ColorspaceConvert8888To5551<false>(srcFramebuffer[i]);
@ -1202,12 +1200,12 @@ Render3DError OpenGLRenderer::_FlushFramebufferFlipAndConvertOnCPU(const Fragmen
}
else if (dstFramebufferMain != NULL)
{
ColorspaceCopyBuffer32<false, false>((u32 *)srcFramebuffer, (u32 *)dstFramebufferMain, pixCount);
ColorspaceCopyBuffer32<false, false>((u32 *)srcFramebuffer, (u32 *)dstFramebufferMain, this->_framebufferPixCount);
this->_renderNeedsFlushMain = false;
}
else
{
ColorspaceConvertBuffer8888To5551<false, false>((u32 *)srcFramebuffer, dstFramebuffer16, pixCount);
ColorspaceConvertBuffer8888To5551<false, false>((u32 *)srcFramebuffer, dstFramebuffer16, this->_framebufferPixCount);
this->_renderNeedsFlush16 = false;
}
}
@ -1218,7 +1216,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferFlipAndConvertOnCPU(const Fragmen
if ( (dstFramebufferMain != NULL) && (dstFramebuffer16 != NULL) )
{
#ifdef ENABLE_SSE2
const size_t ssePixCount = pixCount - (pixCount % 8);
const size_t ssePixCount = this->_framebufferPixCount - (this->_framebufferPixCount % 8);
for (; i < ssePixCount; i += 8)
{
const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + i + 0));
@ -1231,7 +1229,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferFlipAndConvertOnCPU(const Fragmen
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++)
for (; i < this->_framebufferPixCount; i++)
{
dstFramebufferMain[i].color = ColorspaceConvert8888To6665<true>(srcFramebuffer[i]);
dstFramebuffer16[i] = ColorspaceConvert8888To5551<true>(srcFramebuffer[i]);
@ -1242,12 +1240,12 @@ Render3DError OpenGLRenderer::_FlushFramebufferFlipAndConvertOnCPU(const Fragmen
}
else if (dstFramebufferMain != NULL)
{
ColorspaceConvertBuffer8888To6665<true, false>((u32 *)srcFramebuffer, (u32 *)dstFramebufferMain, pixCount);
ColorspaceConvertBuffer8888To6665<true, false>((u32 *)srcFramebuffer, (u32 *)dstFramebufferMain, this->_framebufferPixCount);
this->_renderNeedsFlushMain = false;
}
else
{
ColorspaceConvertBuffer8888To5551<true, false>((u32 *)srcFramebuffer, dstFramebuffer16, pixCount);
ColorspaceConvertBuffer8888To5551<true, false>((u32 *)srcFramebuffer, dstFramebuffer16, this->_framebufferPixCount);
this->_renderNeedsFlush16 = false;
}
}
@ -1256,7 +1254,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferFlipAndConvertOnCPU(const Fragmen
if ( (dstFramebufferMain != NULL) && (dstFramebuffer16 != NULL) )
{
#ifdef ENABLE_SSE2
const size_t ssePixCount = pixCount - (pixCount % 8);
const size_t ssePixCount = this->_framebufferPixCount - (this->_framebufferPixCount % 8);
for (; i < ssePixCount; i += 8)
{
const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + i + 0));
@ -1269,7 +1267,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferFlipAndConvertOnCPU(const Fragmen
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++)
for (; i < this->_framebufferPixCount; i++)
{
dstFramebufferMain[i].color = ColorspaceCopy32<true>(srcFramebuffer[i]);
dstFramebuffer16[i] = ColorspaceConvert8888To5551<true>(srcFramebuffer[i]);
@ -1280,12 +1278,12 @@ Render3DError OpenGLRenderer::_FlushFramebufferFlipAndConvertOnCPU(const Fragmen
}
else if (dstFramebufferMain != NULL)
{
ColorspaceCopyBuffer32<true, false>((u32 *)srcFramebuffer, (u32 *)dstFramebufferMain, pixCount);
ColorspaceCopyBuffer32<true, false>((u32 *)srcFramebuffer, (u32 *)dstFramebufferMain, this->_framebufferPixCount);
this->_renderNeedsFlushMain = false;
}
else
{
ColorspaceConvertBuffer8888To5551<true, false>((u32 *)srcFramebuffer, dstFramebuffer16, pixCount);
ColorspaceConvertBuffer8888To5551<true, false>((u32 *)srcFramebuffer, dstFramebuffer16, this->_framebufferPixCount);
this->_renderNeedsFlush16 = false;
}
}
@ -4891,6 +4889,7 @@ Render3DError OpenGLRenderer_1_2::SetFramebufferSize(size_t w, size_t h)
this->_framebufferWidth = w;
this->_framebufferHeight = h;
this->_framebufferPixCount = w * h;
this->_framebufferColorSizeBytes = newFramebufferColorSizeBytes;
if (this->isPBOSupported)

View File

@ -1,7 +1,7 @@
/*
Copyright (C) 2006 yopyop
Copyright (C) 2006-2007 shash
Copyright (C) 2008-2017 DeSmuME team
Copyright (C) 2008-2018 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -2064,6 +2064,7 @@ Render3DError OpenGLRenderer_3_2::SetFramebufferSize(size_t w, size_t h)
this->_framebufferWidth = w;
this->_framebufferHeight = h;
this->_framebufferPixCount = w * h;
this->_framebufferColorSizeBytes = newFramebufferColorSizeBytes;
this->_framebufferColor = NULL; // Don't need to make a client-side buffer since we will be reading directly from the PBO.

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*
Copyright (C) 2009-2017 DeSmuME team
Copyright (C) 2009-2018 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -20,11 +20,21 @@
#include "render3D.h"
#include "gfx3d.h"
#define SOFTRASTERIZER_MAX_THREADS 32
extern GPU3DInterface gpu3DRasterize;
class SoftRasterizerRenderer;
class Task;
class SoftRasterizerRenderer;
struct edge_fx_fl;
// Describes one pixel range of a clear operation. SoftRasterizerRenderer holds an array of
// these, one slot per possible thread (_threadClearParam[SOFTRASTERIZER_MAX_THREADS]).
// NOTE(review): presumably each slot is handed to a worker task that clears its own range
// of the framebuffer -- confirm against the task callback in the implementation file.
struct SoftRasterizerClearParam
{
SoftRasterizerRenderer *renderer; // Renderer the range belongs to; presumably the object whose ClearUsingValuesLoop() is invoked -- TODO confirm.
size_t startPixel; // Start of the pixel range; mirrors the startPixel argument of ClearUsingValuesLoop().
size_t endPixel; // End of the pixel range; NOTE(review): inclusive vs. exclusive is not visible here -- confirm.
};
struct SoftRasterizerPostProcessParams
{
@ -81,6 +91,39 @@ public:
void SetUseDeposterize(bool willDeposterize);
void SetScalingFactor(size_t scalingFactor);
};
// A single rasterization worker used by SoftRasterizerRenderer.
//
// SoftRasterizerRenderer owns SOFTRASTERIZER_MAX_THREADS instances of RasterizerUnit<true>
// (_rasterizerUnit) plus one RasterizerUnit<false> (_HACK_viewer_rasterizerUnit).
// NOTE(review): the exact effect of the RENDERER flag is decided in the implementation file,
// which is not visible here -- presumably <false> is a non-rendering/debug-viewer variant.
template <bool RENDERER>
class RasterizerUnit
{
protected:
// Per-unit state. NOTE(review): the three fields below are presumably assigned by SetSLI()
// (value, mask, debug) -- confirm in the implementation.
bool _debug_thisPoly;
u32 _SLI_Mask;
u32 _SLI_Value;
// Owning renderer; presumably assigned by SetRenderer() -- TODO confirm.
SoftRasterizerRenderer *_softRender;
// Texture selected for the polygon being drawn; presumably set by _SetupTexture() -- TODO confirm.
SoftRasterizerTexture *_currentTexture;
// Vertex pointers for the polygon being processed (at most MAX_CLIPPED_VERTS entries).
VERT *_verts[MAX_CLIPPED_VERTS];
size_t _polynum;
u8 _textureWrapMode;
// Internal helpers. Their definitions live outside this header.
Render3DError _SetupTexture(const POLY &thePoly, size_t polyRenderIndex);
FORCEINLINE FragmentColor _sample(const float u, const float v);
FORCEINLINE float _round_s(double val);
template<bool ISSHADOWPOLYGON> FORCEINLINE void _shade(const PolygonMode polygonMode, const FragmentColor src, FragmentColor &dst, const float texCoordU, const float texCoordV);
template<bool ISSHADOWPOLYGON> FORCEINLINE void _pixel(const POLYGON_ATTR polyAttr, const bool isTranslucent, const size_t fragmentIndex, FragmentColor &dstColor, float r, float g, float b, float invu, float invv, float w, float z);
template<bool ISSHADOWPOLYGON, bool USELINEHACK> FORCEINLINE void _drawscanline(const POLYGON_ATTR polyAttr, const bool isTranslucent, FragmentColor *dstColor, const size_t framebufferWidth, const size_t framebufferHeight, edge_fx_fl *pLeft, edge_fx_fl *pRight);
template<bool SLI, bool ISSHADOWPOLYGON, bool USELINEHACK, bool ISHORIZONTAL> void _runscanlines(const POLYGON_ATTR polyAttr, const bool isTranslucent, FragmentColor *dstColor, const size_t framebufferWidth, const size_t framebufferHeight, edge_fx_fl *left, edge_fx_fl *right);
template<int TYPE> FORCEINLINE void _rot_verts();
template<bool ISBACKWARDS, int TYPE> void _sort_verts();
template<bool SLI, bool ISBACKWARDS, bool ISSHADOWPOLYGON, bool USELINEHACK> void _shape_engine(const POLYGON_ATTR polyAttr, const bool isTranslucent, FragmentColor *dstColor, const size_t framebufferWidth, const size_t framebufferHeight, int type);
public:
// Configures the SLI value/mask pair and the debug flag for this unit.
// NOTE(review): presumably the mask/value select which scanlines this unit draws -- confirm.
void SetSLI(u32 value, u32 mask, bool debug);
// Associates this unit with the renderer it draws for.
void SetRenderer(SoftRasterizerRenderer *theRenderer);
// Entry point for this unit's rendering work; SLI and USELINEHACK are compile-time switches.
template<bool SLI, bool USELINEHACK> FORCEINLINE void Render();
};
#if defined(ENABLE_SSE2)
class SoftRasterizerRenderer : public Render3D_SSE2
@ -88,13 +131,28 @@ class SoftRasterizerRenderer : public Render3D_SSE2
class SoftRasterizerRenderer : public Render3D
#endif
{
protected:
protected:
Task *_task;
SoftRasterizerClearParam _threadClearParam[SOFTRASTERIZER_MAX_THREADS];
SoftRasterizerPostProcessParams _threadPostprocessParam[SOFTRASTERIZER_MAX_THREADS];
RasterizerUnit<true> _rasterizerUnit[SOFTRASTERIZER_MAX_THREADS];
RasterizerUnit<false> _HACK_viewer_rasterizerUnit;
size_t _threadCount;
size_t _nativeLinesPerThread;
size_t _nativePixelsPerThread;
size_t _customLinesPerThread;
size_t _customPixelsPerThread;
FragmentColor _clearColor6665;
FragmentAttributes _clearAttributes;
GFX3D_Clipper clipper;
u8 fogTable[32768];
FragmentColor edgeMarkTable[8];
bool edgeMarkDisabled[8];
bool _stateSetupNeedsFinish;
bool _renderGeometryNeedsFinish;
bool _enableHighPrecisionColorInterpolation;
@ -124,7 +182,6 @@ public:
bool polyVisible[POLYLIST_SIZE];
bool polyBackfacing[POLYLIST_SIZE];
GFX3D_State *currentRenderState;
SoftRasterizerPostProcessParams *postprocessParam;
bool _enableFragmentSamplingHack;
@ -147,15 +204,31 @@ public:
virtual Render3DError ApplyRenderingSettings(const GFX3D_State &renderState);
virtual Render3DError Render(const GFX3D &engine);
virtual Render3DError RenderFinish();
virtual Render3DError RenderFlush(bool willFlushBuffer32, bool willFlushBuffer16);
virtual Render3DError RenderFlush(bool willFlushBuffer32, bool willFlushBuffer16);
virtual void ClearUsingValuesLoop(const size_t startPixel, const size_t endPixel);
virtual Render3DError SetFramebufferSize(size_t w, size_t h);
};
#ifdef ENABLE_SSE2
#ifdef ENABLE_SSE2
class SoftRasterizerRenderer_SSE2 : public SoftRasterizerRenderer
{
{
protected:
v128u32 _clearColor_v128u32;
v128u32 _clearDepth_v128u32;
v128u8 _clearAttrOpaquePolyID_v128u8;
v128u8 _clearAttrTranslucentPolyID_v128u8;
v128u8 _clearAttrStencil_v128u8;
v128u8 _clearAttrIsFogged_v128u8;
v128u8 _clearAttrIsTranslucentPoly_v128u8;
virtual Render3DError ClearUsingValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes);
public:
SoftRasterizerRenderer_SSE2();
virtual void ClearUsingValuesLoop(const size_t startPixel, const size_t endPixel);
virtual Render3DError SetFramebufferSize(size_t w, size_t h);
};
#endif

View File

@ -1,6 +1,6 @@
/*
Copyright (C) 2006-2007 shash
Copyright (C) 2008-2017 DeSmuME team
Copyright (C) 2008-2018 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -244,6 +244,8 @@ Render3D::Render3D()
_framebufferWidth = GPU_FRAMEBUFFER_NATIVE_WIDTH;
_framebufferHeight = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
_framebufferPixCount = _framebufferWidth * _framebufferHeight;
_framebufferSIMDPixCount = 0;
_framebufferColorSizeBytes = 0;
_framebufferColor = NULL;
@ -332,6 +334,7 @@ Render3DError Render3D::SetFramebufferSize(size_t w, size_t h)
this->_framebufferWidth = w;
this->_framebufferHeight = h;
this->_framebufferPixCount = w * h;
this->_framebufferColorSizeBytes = w * h * sizeof(FragmentColor);
this->_framebufferColor = GPU->GetEngineMain()->Get3DFramebufferMain(); // Just use the buffer that is already present on the main GPU engine
@ -478,22 +481,20 @@ Render3DError Render3D::FlushFramebuffer(const FragmentColor *__restrict srcFram
return RENDER3DERROR_NOERR;
}
const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
if (dstFramebufferMain != NULL)
{
if ( (this->_internalRenderingFormat == NDSColorFormat_BGR888_Rev) && (this->_outputFormat == NDSColorFormat_BGR666_Rev) )
{
ColorspaceConvertBuffer8888To6665<false, false>((u32 *)srcFramebuffer, (u32 *)dstFramebufferMain, pixCount);
ColorspaceConvertBuffer8888To6665<false, false>((u32 *)srcFramebuffer, (u32 *)dstFramebufferMain, this->_framebufferPixCount);
}
else if ( (this->_internalRenderingFormat == NDSColorFormat_BGR666_Rev) && (this->_outputFormat == NDSColorFormat_BGR888_Rev) )
{
ColorspaceConvertBuffer6665To8888<false, false>((u32 *)srcFramebuffer, (u32 *)dstFramebufferMain, pixCount);
ColorspaceConvertBuffer6665To8888<false, false>((u32 *)srcFramebuffer, (u32 *)dstFramebufferMain, this->_framebufferPixCount);
}
else if ( ((this->_internalRenderingFormat == NDSColorFormat_BGR666_Rev) && (this->_outputFormat == NDSColorFormat_BGR666_Rev)) ||
((this->_internalRenderingFormat == NDSColorFormat_BGR888_Rev) && (this->_outputFormat == NDSColorFormat_BGR888_Rev)) )
{
memcpy(dstFramebufferMain, srcFramebuffer, pixCount * sizeof(FragmentColor));
memcpy(dstFramebufferMain, srcFramebuffer, this->_framebufferPixCount * sizeof(FragmentColor));
}
this->_renderNeedsFlushMain = false;
@ -503,11 +504,11 @@ Render3DError Render3D::FlushFramebuffer(const FragmentColor *__restrict srcFram
{
if (this->_outputFormat == NDSColorFormat_BGR666_Rev)
{
ColorspaceConvertBuffer6665To5551<false, false>((u32 *)srcFramebuffer, dstFramebuffer16, pixCount);
ColorspaceConvertBuffer6665To5551<false, false>((u32 *)srcFramebuffer, dstFramebuffer16, this->_framebufferPixCount);
}
else if (this ->_outputFormat == NDSColorFormat_BGR888_Rev)
{
ColorspaceConvertBuffer8888To5551<false, false>((u32 *)srcFramebuffer, dstFramebuffer16, pixCount);
ColorspaceConvertBuffer8888To5551<false, false>((u32 *)srcFramebuffer, dstFramebuffer16, this->_framebufferPixCount);
}
this->_renderNeedsFlush16 = false;
@ -687,6 +688,42 @@ Render3DError Render3D::VramReconfigureSignal()
return RENDER3DERROR_NOERR;
}
// Initializes the SIMD pixel count to the largest multiple of 16 that does not
// exceed the framebuffer pixel count set up by the base class.
Render3D_SIMD128::Render3D_SIMD128()
{
	// Integer division truncates, so (n / 16) * 16 == n - (n % 16).
	_framebufferSIMDPixCount = (_framebufferPixCount / 16) * 16;
}
// Resizes the framebuffer through Render3D::SetFramebufferSize() and, on success,
// recomputes _framebufferSIMDPixCount as the pixel count rounded down to a multiple of 16.
//
// Parameters:
//   w, h - requested framebuffer dimensions, forwarded unchanged to the base class.
//
// Returns:
//   The result of the base class call. A failure is passed through to the caller
//   rather than being reported as RENDER3DERROR_NOERR, and the SIMD pixel count is
//   left untouched in that case.
Render3DError Render3D_SIMD128::SetFramebufferSize(size_t w, size_t h)
{
	const Render3DError error = this->Render3D::SetFramebufferSize(w, h);
	if (error != RENDER3DERROR_NOERR)
	{
		// Propagate the base class failure so the caller can react to it.
		return error;
	}
	
	this->_framebufferSIMDPixCount = this->_framebufferPixCount - (this->_framebufferPixCount % 16);
	
	return error;
}
// Initializes the SIMD pixel count to the largest multiple of 32 that does not
// exceed the framebuffer pixel count set up by the base class.
Render3D_SIMD256::Render3D_SIMD256()
{
	// Integer division truncates, so (n / 32) * 32 == n - (n % 32).
	_framebufferSIMDPixCount = (_framebufferPixCount / 32) * 32;
}
// Resizes the framebuffer through Render3D::SetFramebufferSize() and, on success,
// recomputes _framebufferSIMDPixCount as the pixel count rounded down to a multiple of 32.
//
// Parameters:
//   w, h - requested framebuffer dimensions, forwarded unchanged to the base class.
//
// Returns:
//   The result of the base class call. A failure is passed through to the caller
//   rather than being reported as RENDER3DERROR_NOERR, and the SIMD pixel count is
//   left untouched in that case.
Render3DError Render3D_SIMD256::SetFramebufferSize(size_t w, size_t h)
{
	const Render3DError error = this->Render3D::SetFramebufferSize(w, h);
	if (error != RENDER3DERROR_NOERR)
	{
		// Propagate the base class failure so the caller can react to it.
		return error;
	}
	
	this->_framebufferSIMDPixCount = this->_framebufferPixCount - (this->_framebufferPixCount % 32);
	
	return error;
}
#ifdef ENABLE_SSE2
Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)

View File

@ -1,6 +1,6 @@
/*
Copyright (C) 2006-2007 shash
Copyright (C) 2007-2017 DeSmuME team
Copyright (C) 2007-2018 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -136,14 +136,16 @@ public:
size_t GetScalingFactor() const;
void SetScalingFactor(size_t scalingFactor);
};
class Render3D
{
protected:
Render3DDeviceInfo _deviceInfo;
size_t _framebufferWidth;
size_t _framebufferHeight;
size_t _framebufferHeight;
size_t _framebufferPixCount;
size_t _framebufferSIMDPixCount;
size_t _framebufferColorSizeBytes;
FragmentColor *_framebufferColor;
@ -244,11 +246,27 @@ public:
void SetTextureProcessingProperties();
Render3DTexture* GetTextureByPolygonRenderIndex(size_t polyRenderIndex) const;
};
// Render3D variant that maintains _framebufferSIMDPixCount as the framebuffer pixel count
// rounded down to a multiple of 16.
// NOTE(review): presumably this bounds loops that process 16 pixels per iteration with
// 128-bit vectors, leaving the remainder to scalar code -- confirm against the users.
class Render3D_SIMD128 : public Render3D
{
public:
// Derives the initial SIMD pixel count from the pixel count set by the Render3D constructor.
Render3D_SIMD128();
// Calls Render3D::SetFramebufferSize() and, when that succeeds, recomputes the SIMD pixel count.
virtual Render3DError SetFramebufferSize(size_t w, size_t h);
};
// Render3D variant that maintains _framebufferSIMDPixCount as the framebuffer pixel count
// rounded down to a multiple of 32.
// NOTE(review): presumably this bounds loops that process 32 pixels per iteration with
// 256-bit vectors, leaving the remainder to scalar code -- confirm against the users.
class Render3D_SIMD256 : public Render3D
{
public:
// Derives the initial SIMD pixel count from the pixel count set by the Render3D constructor.
Render3D_SIMD256();
// Calls Render3D::SetFramebufferSize() and, when that succeeds, recomputes the SIMD pixel count.
virtual Render3DError SetFramebufferSize(size_t w, size_t h);
};
#ifdef ENABLE_SSE2
#ifdef ENABLE_SSE2
class Render3D_SSE2 : public Render3D
class Render3D_SSE2 : public Render3D_SIMD128
{
public:
virtual Render3DError ClearFramebuffer(const GFX3D_State &renderState);