Destroy OpenMP

This commit is contained in:
Jasper St. Pierre 2014-11-05 12:26:08 -08:00
parent a1b44a9027
commit 44b879dac2
5 changed files with 0 additions and 68 deletions

View File

@ -12,7 +12,6 @@ option(ENABLE_PCH "Use PCH to speed up compilation" ON)
option(ENABLE_LTO "Enables Link Time Optimization" OFF) option(ENABLE_LTO "Enables Link Time Optimization" OFF)
option(ENABLE_GENERIC "Enables generic build that should run on any little-endian host" OFF) option(ENABLE_GENERIC "Enables generic build that should run on any little-endian host" OFF)
option(OPENMP "Enable OpenMP parallelization" ON)
option(ENCODE_FRAMEDUMPS "Encode framedumps in AVI format" ON) option(ENCODE_FRAMEDUMPS "Encode framedumps in AVI format" ON)
option(FASTLOG "Enable all logs" OFF) option(FASTLOG "Enable all logs" OFF)
@ -345,19 +344,6 @@ add_definitions(-D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE)
include(CheckLib) include(CheckLib)
include(CheckCXXSourceRuns) include(CheckCXXSourceRuns)
if(OPENMP)
include(FindOpenMP OPTIONAL)
if(OPENMP_FOUND)
message("OpenMP parallelization enabled")
add_definitions("${OpenMP_CXX_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
endif()
endif()
if(NOT OPENMP_FOUND)
add_definitions(-Wno-unknown-pragmas)
message("OpenMP parallelization disabled")
endif()
if(NOT ANDROID) if(NOT ANDROID)
include(FindOpenGL) include(FindOpenGL)

View File

@ -146,7 +146,6 @@ static wxString use_ffv1_desc = wxTRANSLATE("Encode frame dumps using the FFV1 c
#endif #endif
static wxString free_look_desc = wxTRANSLATE("This feature allows you to change the game's camera.\nMove the mouse while holding the right mouse button to pan and while holding the middle button to move.\nHold SHIFT and press one of the WASD keys to move the camera by a certain step distance (SHIFT+0 to move faster and SHIFT+9 to move slower). Press SHIFT+R to reset the camera.\n\nIf unsure, leave this unchecked."); static wxString free_look_desc = wxTRANSLATE("This feature allows you to change the game's camera.\nMove the mouse while holding the right mouse button to pan and while holding the middle button to move.\nHold SHIFT and press one of the WASD keys to move the camera by a certain step distance (SHIFT+0 to move faster and SHIFT+9 to move slower). Press SHIFT+R to reset the camera.\n\nIf unsure, leave this unchecked.");
static wxString crop_desc = wxTRANSLATE("Crop the picture from 4:3 to 5:4 or from 16:9 to 16:10.\n\nIf unsure, leave this unchecked."); static wxString crop_desc = wxTRANSLATE("Crop the picture from 4:3 to 5:4 or from 16:9 to 16:10.\n\nIf unsure, leave this unchecked.");
static wxString omp_desc = wxTRANSLATE("Use multiple threads to decode textures.\nMight result in a speedup (especially on CPUs with more than two cores).\n\nIf unsure, leave this unchecked.");
static wxString ppshader_desc = wxTRANSLATE("Apply a post-processing effect after finishing a frame.\n\nIf unsure, select (off)."); static wxString ppshader_desc = wxTRANSLATE("Apply a post-processing effect after finishing a frame.\n\nIf unsure, select (off).");
static wxString cache_efb_copies_desc = wxTRANSLATE("Slightly speeds up EFB to RAM copies by sacrificing emulation accuracy.\nSometimes also increases visual quality.\nIf you're experiencing any issues, try raising texture cache accuracy or disable this option.\n\nIf unsure, leave this unchecked."); static wxString cache_efb_copies_desc = wxTRANSLATE("Slightly speeds up EFB to RAM copies by sacrificing emulation accuracy.\nSometimes also increases visual quality.\nIf you're experiencing any issues, try raising texture cache accuracy or disable this option.\n\nIf unsure, leave this unchecked.");
static wxString shader_errors_desc = wxTRANSLATE("Usually if shader compilation fails, an error message is displayed.\nHowever, one may skip the popups to allow interruption free gameplay by checking this option.\n\nIf unsure, leave this unchecked."); static wxString shader_errors_desc = wxTRANSLATE("Usually if shader compilation fails, an error message is displayed.\nHowever, one may skip the popups to allow interruption free gameplay by checking this option.\n\nIf unsure, leave this unchecked.");
@ -515,7 +514,6 @@ VideoConfigDiag::VideoConfigDiag(wxWindow* parent, const std::string &title, con
{ {
wxGridSizer* const szr_other = new wxGridSizer(2, 5, 5); wxGridSizer* const szr_other = new wxGridSizer(2, 5, 5);
szr_other->Add(CreateCheckBox(page_hacks, _("Disable Destination Alpha"), wxGetTranslation(disable_dstalpha_desc), vconfig.bDstAlphaPass)); szr_other->Add(CreateCheckBox(page_hacks, _("Disable Destination Alpha"), wxGetTranslation(disable_dstalpha_desc), vconfig.bDstAlphaPass));
szr_other->Add(CreateCheckBox(page_hacks, _("OpenMP Texture Decoder"), wxGetTranslation(omp_desc), vconfig.bOMPDecoder));
szr_other->Add(CreateCheckBox(page_hacks, _("Fast Depth Calculation"), wxGetTranslation(fast_depth_calc_desc), vconfig.bFastDepthCalc)); szr_other->Add(CreateCheckBox(page_hacks, _("Fast Depth Calculation"), wxGetTranslation(fast_depth_calc_desc), vconfig.bFastDepthCalc));
wxStaticBoxSizer* const group_other = new wxStaticBoxSizer(wxVERTICAL, page_hacks, _("Other")); wxStaticBoxSizer* const group_other = new wxStaticBoxSizer(wxVERTICAL, page_hacks, _("Other"));

View File

@ -13,12 +13,6 @@
#include "VideoCommon/TextureDecoder.h" #include "VideoCommon/TextureDecoder.h"
#include "VideoCommon/VideoConfig.h" #include "VideoCommon/VideoConfig.h"
#ifdef _OPENMP
#include <omp.h>
#elif defined __GNUC__
#pragma GCC diagnostic ignored "-Wunknown-pragmas"
#endif
#if _M_SSE >= 0x401 #if _M_SSE >= 0x401
#include <smmintrin.h> #include <smmintrin.h>
#include <emmintrin.h> #include <emmintrin.h>
@ -234,22 +228,6 @@ static void DecodeDXTBlock(u32 *dst, const DXTBlock *src, int pitch)
} }
#endif #endif
static inline void SetOpenMPThreadCount(int width, int height)
{
#ifdef _OPENMP
// Don't use multithreading in small Textures
if (g_ActiveConfig.bOMPDecoder && width > 127 && height > 127)
{
// don't span to many threads they will kill the rest of the emu :)
omp_set_num_threads((omp_get_num_procs() + 2) / 3);
}
else
{
omp_set_num_threads(1);
}
#endif
}
// JSD 01/06/11: // JSD 01/06/11:
// TODO: we really should ensure BOTH the source and destination addresses are aligned to 16-byte boundaries to // TODO: we really should ensure BOTH the source and destination addresses are aligned to 16-byte boundaries to
// squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 is slower than _mm_load_si128/_mm_store_si128 // squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 is slower than _mm_load_si128/_mm_store_si128
@ -260,8 +238,6 @@ static inline void SetOpenMPThreadCount(int width, int height)
PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int height, int texformat, const u8* tlut, TlutFormat tlutfmt) PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int height, int texformat, const u8* tlut, TlutFormat tlutfmt)
{ {
SetOpenMPThreadCount(width, height);
const int Wsteps4 = (width + 3) / 4; const int Wsteps4 = (width + 3) / 4;
const int Wsteps8 = (width + 7) / 8; const int Wsteps8 = (width + 7) / 8;
@ -270,7 +246,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
case GX_TF_C4: case GX_TF_C4:
if (tlutfmt == GX_TL_RGB5A3) if (tlutfmt == GX_TL_RGB5A3)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 8) for (int y = 0; y < height; y += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++) for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++) for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++)
@ -278,7 +253,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
} }
else if (tlutfmt == GX_TL_IA8) else if (tlutfmt == GX_TL_IA8)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 8) for (int y = 0; y < height; y += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++) for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++) for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++)
@ -287,7 +261,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
} }
else if (tlutfmt == GX_TL_RGB565) else if (tlutfmt == GX_TL_RGB565)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 8) for (int y = 0; y < height; y += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++) for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++) for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++)
@ -307,7 +280,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
const __m128i maskB3A2 = _mm_set_epi8(11,11,11,11,3,3,3,3,10,10,10,10,2,2,2,2); const __m128i maskB3A2 = _mm_set_epi8(11,11,11,11,3,3,3,3,10,10,10,10,2,2,2,2);
const __m128i maskD5C4 = _mm_set_epi8(13,13,13,13,5,5,5,5,12,12,12,12,4,4,4,4); const __m128i maskD5C4 = _mm_set_epi8(13,13,13,13,5,5,5,5,12,12,12,12,4,4,4,4);
const __m128i maskF7E6 = _mm_set_epi8(15,15,15,15,7,7,7,7,14,14,14,14,6,6,6,6); const __m128i maskF7E6 = _mm_set_epi8(15,15,15,15,7,7,7,7,14,14,14,14,6,6,6,6);
#pragma omp parallel for
for (int y = 0; y < height; y += 8) for (int y = 0; y < height; y += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++) for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2,xStep++) for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2,xStep++)
@ -343,7 +315,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// JSD optimized with SSE2 intrinsics. // JSD optimized with SSE2 intrinsics.
// Produces a ~76% speed improvement over reference C implementation. // Produces a ~76% speed improvement over reference C implementation.
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 8) for (int y = 0; y < height; y += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8 ; x < width; x += 8, yStep++) for (int x = 0, yStep = (y / 8) * Wsteps8 ; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2, xStep++) for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2, xStep++)
@ -415,7 +386,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// Produces a ~10% speed improvement over SSE2 implementation // Produces a ~10% speed improvement over SSE2 implementation
if (cpu_info.bSSSE3) if (cpu_info.bSSSE3)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8,yStep++) for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; ++iy, xStep++) for (int iy = 0, xStep = 4 * yStep; iy < 4; ++iy, xStep++)
@ -441,7 +411,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// JSD optimized with SSE2 intrinsics. // JSD optimized with SSE2 intrinsics.
// Produces an ~86% speed improvement over reference C implementation. // Produces an ~86% speed improvement over reference C implementation.
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8,yStep++) for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8,yStep++)
{ {
@ -527,7 +496,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
case GX_TF_C8: case GX_TF_C8:
if (tlutfmt == GX_TL_RGB5A3) if (tlutfmt == GX_TL_RGB5A3)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@ -535,7 +503,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
} }
else if (tlutfmt == GX_TL_IA8) else if (tlutfmt == GX_TL_IA8)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@ -544,7 +511,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
} }
else if (tlutfmt == GX_TL_RGB565) else if (tlutfmt == GX_TL_RGB565)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@ -554,7 +520,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
break; break;
case GX_TF_IA4: case GX_TF_IA4:
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@ -568,7 +533,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// Produces an ~50% speed improvement over SSE2 implementation. // Produces an ~50% speed improvement over SSE2 implementation.
if (cpu_info.bSSSE3) if (cpu_info.bSSSE3)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@ -590,7 +554,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
const __m128i kMask_x0f = _mm_set_epi32(0x00000000L, 0x00000000L, 0x00ff00ffL, 0x00ff00ffL); const __m128i kMask_x0f = _mm_set_epi32(0x00000000L, 0x00000000L, 0x00ff00ffL, 0x00ff00ffL);
const __m128i kMask_xf000 = _mm_set_epi32(0xff000000L, 0xff000000L, 0xff000000L, 0xff000000L); const __m128i kMask_xf000 = _mm_set_epi32(0xff000000L, 0xff000000L, 0xff000000L, 0xff000000L);
const __m128i kMask_x0fff = _mm_set_epi32(0x00ffffffL, 0x00ffffffL, 0x00ffffffL, 0x00ffffffL); const __m128i kMask_x0fff = _mm_set_epi32(0x00ffffffL, 0x00ffffffL, 0x00ffffffL, 0x00ffffffL);
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@ -639,7 +602,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
case GX_TF_C14X2: case GX_TF_C14X2:
if (tlutfmt == GX_TL_RGB5A3) if (tlutfmt == GX_TL_RGB5A3)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@ -647,7 +609,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
} }
else if (tlutfmt == GX_TL_IA8) else if (tlutfmt == GX_TL_IA8)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@ -655,7 +616,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
} }
else if (tlutfmt == GX_TL_RGB565) else if (tlutfmt == GX_TL_RGB565)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@ -671,7 +631,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
const __m128i kMaskG1 = _mm_set1_epi32(0x00000300); const __m128i kMaskG1 = _mm_set1_epi32(0x00000300);
const __m128i kMaskB0 = _mm_set1_epi32(0x00F80000); const __m128i kMaskB0 = _mm_set1_epi32(0x00F80000);
const __m128i kAlpha = _mm_set1_epi32(0xFF000000); const __m128i kAlpha = _mm_set1_epi32(0xFF000000);
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@ -748,7 +707,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// Produces a ~10% speed improvement over SSE2 implementation // Produces a ~10% speed improvement over SSE2 implementation
if (cpu_info.bSSSE3) if (cpu_info.bSSSE3)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@ -841,7 +799,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// JSD optimized with SSE2 intrinsics (2 in 4 cases) // JSD optimized with SSE2 intrinsics (2 in 4 cases)
// Produces a ~25% speed improvement over reference C implementation. // Produces a ~25% speed improvement over reference C implementation.
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@ -955,7 +912,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// Produces a ~30% speed improvement over SSE2 implementation // Produces a ~30% speed improvement over SSE2 implementation
if (cpu_info.bSSSE3) if (cpu_info.bSSSE3)
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
{ {
@ -987,7 +943,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// JSD optimized with SSE2 intrinsics // JSD optimized with SSE2 intrinsics
// Produces a ~68% speed improvement over reference C implementation. // Produces a ~68% speed improvement over reference C implementation.
{ {
#pragma omp parallel for
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
{ {
@ -1091,7 +1046,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// Produces a ~50% improvement for x86 and a ~40% improvement for x64 in speed over reference C implementation. // Produces a ~50% improvement for x86 and a ~40% improvement for x64 in speed over reference C implementation.
// The x64 compiled reference C code is faster than the x86 compiled reference C code, but the SSE2 is // The x64 compiled reference C code is faster than the x86 compiled reference C code, but the SSE2 is
// faster than both. // faster than both.
#pragma omp parallel for
for (int y = 0; y < height; y += 8) for (int y = 0; y < height; y += 8)
{ {
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++) for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)

View File

@ -78,7 +78,6 @@ void VideoConfig::Load(const std::string& ini_file)
settings->Get("TexFmtOverlayCenter", &bTexFmtOverlayCenter, 0); settings->Get("TexFmtOverlayCenter", &bTexFmtOverlayCenter, 0);
settings->Get("WireFrame", &bWireFrame, 0); settings->Get("WireFrame", &bWireFrame, 0);
settings->Get("DisableFog", &bDisableFog, 0); settings->Get("DisableFog", &bDisableFog, 0);
settings->Get("OMPDecoder", &bOMPDecoder, false);
settings->Get("EnableShaderDebugging", &bEnableShaderDebugging, false); settings->Get("EnableShaderDebugging", &bEnableShaderDebugging, false);
settings->Get("BorderlessFullscreen", &bBorderlessFullscreen, false); settings->Get("BorderlessFullscreen", &bBorderlessFullscreen, false);
@ -176,7 +175,6 @@ void VideoConfig::GameIniLoad()
CHECK_SETTING("Video_Settings", "DstAlphaPass", bDstAlphaPass); CHECK_SETTING("Video_Settings", "DstAlphaPass", bDstAlphaPass);
CHECK_SETTING("Video_Settings", "DisableFog", bDisableFog); CHECK_SETTING("Video_Settings", "DisableFog", bDisableFog);
CHECK_SETTING("Video_Settings", "OMPDecoder", bOMPDecoder);
CHECK_SETTING("Video_Enhancements", "ForceFiltering", bForceFiltering); CHECK_SETTING("Video_Enhancements", "ForceFiltering", bForceFiltering);
CHECK_SETTING("Video_Enhancements", "MaxAnisotropy", iMaxAnisotropy); // NOTE - this is x in (1 << x) CHECK_SETTING("Video_Enhancements", "MaxAnisotropy", iMaxAnisotropy); // NOTE - this is x in (1 << x)
@ -245,7 +243,6 @@ void VideoConfig::Save(const std::string& ini_file)
settings->Set("Wireframe", bWireFrame); settings->Set("Wireframe", bWireFrame);
settings->Set("DstAlphaPass", bDstAlphaPass); settings->Set("DstAlphaPass", bDstAlphaPass);
settings->Set("DisableFog", bDisableFog); settings->Set("DisableFog", bDisableFog);
settings->Set("OMPDecoder", bOMPDecoder);
settings->Set("EnableShaderDebugging", bEnableShaderDebugging); settings->Set("EnableShaderDebugging", bEnableShaderDebugging);
settings->Set("BorderlessFullscreen", bBorderlessFullscreen); settings->Set("BorderlessFullscreen", bBorderlessFullscreen);

View File

@ -65,9 +65,6 @@ struct VideoConfig final
bool bUseXFB; bool bUseXFB;
bool bUseRealXFB; bool bUseRealXFB;
// OpenMP
bool bOMPDecoder;
// Enhancements // Enhancements
int iMultisampleMode; int iMultisampleMode;
int iEFBScale; int iEFBScale;