Merge pull request #1510 from magcius/destroy-openmp

Destroy OpenMP
2014-11-06 22:14:34 -05:00 · 2014-11-06 22:14:34 -05:00 · fabfe1422e
parent a1b44a9027 44b879dac2
commit fabfe1422e
5 changed files with 0 additions and 68 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,7 +12,6 @@ option(ENABLE_PCH "Use PCH to speed up compilation" ON)
 option(ENABLE_LTO "Enables Link Time Optimization" OFF)
 option(ENABLE_GENERIC "Enables generic build that should run on any little-endian host" OFF)

-option(OPENMP "Enable OpenMP parallelization" ON)
 option(ENCODE_FRAMEDUMPS "Encode framedumps in AVI format" ON)

 option(FASTLOG "Enable all logs" OFF)
@@ -345,19 +344,6 @@ add_definitions(-D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE)
 include(CheckLib)
 include(CheckCXXSourceRuns)

-if(OPENMP)
-	include(FindOpenMP OPTIONAL)
-	if(OPENMP_FOUND)
-		message("OpenMP parallelization enabled")
-		add_definitions("${OpenMP_CXX_FLAGS}")
-		set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
-	endif()
-endif()
-if(NOT OPENMP_FOUND)
-	add_definitions(-Wno-unknown-pragmas)
-	message("OpenMP parallelization disabled")
-endif()
-
 if(NOT ANDROID)

 	include(FindOpenGL)
--- a/Source/Core/DolphinWX/VideoConfigDiag.cpp
+++ b/Source/Core/DolphinWX/VideoConfigDiag.cpp
@@ -146,7 +146,6 @@ static wxString use_ffv1_desc = wxTRANSLATE("Encode frame dumps using the FFV1 c
 #endif
 static wxString free_look_desc = wxTRANSLATE("This feature allows you to change the game's camera.\nMove the mouse while holding the right mouse button to pan and while holding the middle button to move.\nHold SHIFT and press one of the WASD keys to move the camera by a certain step distance (SHIFT+0 to move faster and SHIFT+9 to move slower). Press SHIFT+R to reset the camera.\n\nIf unsure, leave this unchecked.");
 static wxString crop_desc = wxTRANSLATE("Crop the picture from 4:3 to 5:4 or from 16:9 to 16:10.\n\nIf unsure, leave this unchecked.");
-static wxString omp_desc = wxTRANSLATE("Use multiple threads to decode textures.\nMight result in a speedup (especially on CPUs with more than two cores).\n\nIf unsure, leave this unchecked.");
 static wxString ppshader_desc = wxTRANSLATE("Apply a post-processing effect after finishing a frame.\n\nIf unsure, select (off).");
 static wxString cache_efb_copies_desc = wxTRANSLATE("Slightly speeds up EFB to RAM copies by sacrificing emulation accuracy.\nSometimes also increases visual quality.\nIf you're experiencing any issues, try raising texture cache accuracy or disable this option.\n\nIf unsure, leave this unchecked.");
 static wxString shader_errors_desc = wxTRANSLATE("Usually if shader compilation fails, an error message is displayed.\nHowever, one may skip the popups to allow interruption free gameplay by checking this option.\n\nIf unsure, leave this unchecked.");
@@ -515,7 +514,6 @@ VideoConfigDiag::VideoConfigDiag(wxWindow* parent, const std::string &title, con
 	{
 	wxGridSizer* const szr_other = new wxGridSizer(2, 5, 5);
 	szr_other->Add(CreateCheckBox(page_hacks, _("Disable Destination Alpha"), wxGetTranslation(disable_dstalpha_desc), vconfig.bDstAlphaPass));
-	szr_other->Add(CreateCheckBox(page_hacks, _("OpenMP Texture Decoder"), wxGetTranslation(omp_desc), vconfig.bOMPDecoder));
 	szr_other->Add(CreateCheckBox(page_hacks, _("Fast Depth Calculation"), wxGetTranslation(fast_depth_calc_desc), vconfig.bFastDepthCalc));

 	wxStaticBoxSizer* const group_other = new wxStaticBoxSizer(wxVERTICAL, page_hacks, _("Other"));
--- a/Source/Core/VideoCommon/TextureDecoder_x64.cpp
+++ b/Source/Core/VideoCommon/TextureDecoder_x64.cpp
@@ -13,12 +13,6 @@
 #include "VideoCommon/TextureDecoder.h"
 #include "VideoCommon/VideoConfig.h"

-#ifdef _OPENMP
-#include <omp.h>
-#elif defined __GNUC__
-#pragma GCC diagnostic ignored "-Wunknown-pragmas"
-#endif
-
 #if _M_SSE >= 0x401
 #include <smmintrin.h>
 #include <emmintrin.h>
@@ -234,22 +228,6 @@ static void DecodeDXTBlock(u32 *dst, const DXTBlock *src, int pitch)
 }
 #endif

-static inline void SetOpenMPThreadCount(int width, int height)
-{
-#ifdef _OPENMP
-	// Don't use multithreading in small Textures
-	if (g_ActiveConfig.bOMPDecoder && width > 127 && height > 127)
-	{
-		// don't span to many threads they will kill the rest of the emu :)
-		omp_set_num_threads((omp_get_num_procs() + 2) / 3);
-	}
-	else
-	{
-		omp_set_num_threads(1);
-	}
-#endif
-}
-
 // JSD 01/06/11:
 // TODO: we really should ensure BOTH the source and destination addresses are aligned to 16-byte boundaries to
 // squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 is slower than _mm_load_si128/_mm_store_si128
@@ -260,8 +238,6 @@ static inline void SetOpenMPThreadCount(int width, int height)

 PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int height, int texformat, const u8* tlut, TlutFormat tlutfmt)
 {
-	SetOpenMPThreadCount(width, height);
-
 	const int Wsteps4 = (width + 3) / 4;
 	const int Wsteps8 = (width + 7) / 8;

@@ -270,7 +246,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 	case GX_TF_C4:
 		if (tlutfmt == GX_TL_RGB5A3)
 		{
-			#pragma omp parallel for
 			for (int y = 0; y < height; y += 8)
 				for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
 					for (int iy = 0, xStep =  8 * yStep; iy < 8; iy++,xStep++)
@@ -278,7 +253,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 		}
 		else if (tlutfmt == GX_TL_IA8)
 		{
-			#pragma omp parallel for
 			for (int y = 0; y < height; y += 8)
 				for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
 					for (int iy = 0, xStep =  8 * yStep; iy < 8; iy++,xStep++)
@@ -287,7 +261,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 		}
 		else if (tlutfmt == GX_TL_RGB565)
 		{
-			#pragma omp parallel for
 			for (int y = 0; y < height; y += 8)
 				for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
 					for (int iy = 0, xStep =  8 * yStep; iy < 8; iy++,xStep++)
@@ -307,7 +280,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 				const __m128i maskB3A2 = _mm_set_epi8(11,11,11,11,3,3,3,3,10,10,10,10,2,2,2,2);
 				const __m128i maskD5C4 = _mm_set_epi8(13,13,13,13,5,5,5,5,12,12,12,12,4,4,4,4);
 				const __m128i maskF7E6 = _mm_set_epi8(15,15,15,15,7,7,7,7,14,14,14,14,6,6,6,6);
-				#pragma omp parallel for
 				for (int y = 0; y < height; y += 8)
 					for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
 						for (int iy = 0, xStep =  4 * yStep; iy < 8; iy += 2,xStep++)
@@ -343,7 +315,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 			// JSD optimized with SSE2 intrinsics.
 			// Produces a ~76% speed improvement over reference C implementation.
 			{
-				#pragma omp parallel for
 				for (int y = 0; y < height; y += 8)
 					for (int x = 0, yStep = (y / 8) * Wsteps8 ; x < width; x += 8, yStep++)
 						for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2, xStep++)
@@ -415,7 +386,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 			// Produces a ~10% speed improvement over SSE2 implementation
 			if (cpu_info.bSSSE3)
 			{
-				#pragma omp parallel for
 				for (int y = 0; y < height; y += 4)
 					for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8,yStep++)
 						for (int iy = 0, xStep = 4 * yStep; iy < 4; ++iy, xStep++)
@@ -441,7 +411,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 			// JSD optimized with SSE2 intrinsics.
 			// Produces an ~86% speed improvement over reference C implementation.
 			{
-				#pragma omp parallel for
 				for (int y = 0; y < height; y += 4)
 					for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8,yStep++)
 					{
@@ -527,7 +496,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 	case GX_TF_C8:
 		if (tlutfmt == GX_TL_RGB5A3)
 		{
-			#pragma omp parallel for
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@@ -535,7 +503,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 		}
 		else if (tlutfmt == GX_TL_IA8)
 		{
-			#pragma omp parallel for
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@@ -544,7 +511,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 		}
 		else if (tlutfmt == GX_TL_RGB565)
 		{
-			#pragma omp parallel for
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@@ -554,7 +520,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 		break;
 	case GX_TF_IA4:
 		{
-			#pragma omp parallel for
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@@ -568,7 +533,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 			// Produces an ~50% speed improvement over SSE2 implementation.
 			if (cpu_info.bSSSE3)
 			{
-				#pragma omp parallel for
 				for (int y = 0; y < height; y += 4)
 					for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 						for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@@ -590,7 +554,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 				const __m128i kMask_x0f = _mm_set_epi32(0x00000000L, 0x00000000L, 0x00ff00ffL, 0x00ff00ffL);
 				const __m128i kMask_xf000 = _mm_set_epi32(0xff000000L, 0xff000000L, 0xff000000L, 0xff000000L);
 				const __m128i kMask_x0fff = _mm_set_epi32(0x00ffffffL, 0x00ffffffL, 0x00ffffffL, 0x00ffffffL);
-				#pragma omp parallel for
 				for (int y = 0; y < height; y += 4)
 					for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 						for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@@ -639,7 +602,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 	case GX_TF_C14X2:
 		if (tlutfmt == GX_TL_RGB5A3)
 		{
-			#pragma omp parallel for
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@@ -647,7 +609,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 		}
 		else if (tlutfmt == GX_TL_IA8)
 		{
-			#pragma omp parallel for
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@@ -655,7 +616,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 		}
 		else if (tlutfmt == GX_TL_RGB565)
 		{
-			#pragma omp parallel for
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@@ -671,7 +631,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 			const __m128i kMaskG1 = _mm_set1_epi32(0x00000300);
 			const __m128i kMaskB0 = _mm_set1_epi32(0x00F80000);
 			const __m128i kAlpha  = _mm_set1_epi32(0xFF000000);
-			#pragma omp parallel for
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@@ -748,7 +707,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 			// Produces a ~10% speed improvement over SSE2 implementation
 			if (cpu_info.bSSSE3)
 			{
-				#pragma omp parallel for
 				for (int y = 0; y < height; y += 4)
 					for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 						for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@@ -841,7 +799,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 			// JSD optimized with SSE2 intrinsics (2 in 4 cases)
 			// Produces a ~25% speed improvement over reference C implementation.
 			{
-				#pragma omp parallel for
 				for (int y = 0; y < height; y += 4)
 					for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 						for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
@@ -955,7 +912,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 			// Produces a ~30% speed improvement over SSE2 implementation
 			if (cpu_info.bSSSE3)
 			{
-				#pragma omp parallel for
 				for (int y = 0; y < height; y += 4)
 					for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 					{
@@ -987,7 +943,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 			// JSD optimized with SSE2 intrinsics
 			// Produces a ~68% speed improvement over reference C implementation.
 			{
-				#pragma omp parallel for
 				for (int y = 0; y < height; y += 4)
 					for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 					{
@@ -1091,7 +1046,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
 			// Produces a ~50% improvement for x86 and a ~40% improvement for x64 in speed over reference C implementation.
 			// The x64 compiled reference C code is faster than the x86 compiled reference C code, but the SSE2 is
 			// faster than both.
-			#pragma omp parallel for
 			for (int y = 0; y < height; y += 8)
 			{
 				for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
--- a/Source/Core/VideoCommon/VideoConfig.cpp
+++ b/Source/Core/VideoCommon/VideoConfig.cpp
@@ -78,7 +78,6 @@ void VideoConfig::Load(const std::string& ini_file)
 	settings->Get("TexFmtOverlayCenter", &bTexFmtOverlayCenter, 0);
 	settings->Get("WireFrame", &bWireFrame, 0);
 	settings->Get("DisableFog", &bDisableFog, 0);
-	settings->Get("OMPDecoder", &bOMPDecoder, false);
 	settings->Get("EnableShaderDebugging", &bEnableShaderDebugging, false);
 	settings->Get("BorderlessFullscreen", &bBorderlessFullscreen, false);

@@ -176,7 +175,6 @@ void VideoConfig::GameIniLoad()

 	CHECK_SETTING("Video_Settings", "DstAlphaPass", bDstAlphaPass);
 	CHECK_SETTING("Video_Settings", "DisableFog", bDisableFog);
-	CHECK_SETTING("Video_Settings", "OMPDecoder", bOMPDecoder);

 	CHECK_SETTING("Video_Enhancements", "ForceFiltering", bForceFiltering);
 	CHECK_SETTING("Video_Enhancements", "MaxAnisotropy", iMaxAnisotropy);  // NOTE - this is x in (1 << x)
@@ -245,7 +243,6 @@ void VideoConfig::Save(const std::string& ini_file)
 	settings->Set("Wireframe", bWireFrame);
 	settings->Set("DstAlphaPass", bDstAlphaPass);
 	settings->Set("DisableFog", bDisableFog);
-	settings->Set("OMPDecoder", bOMPDecoder);
 	settings->Set("EnableShaderDebugging", bEnableShaderDebugging);
 	settings->Set("BorderlessFullscreen", bBorderlessFullscreen);

--- a/Source/Core/VideoCommon/VideoConfig.h
+++ b/Source/Core/VideoCommon/VideoConfig.h
@@ -65,9 +65,6 @@ struct VideoConfig final
 	bool bUseXFB;
 	bool bUseRealXFB;

-	// OpenMP
-	bool bOMPDecoder;
-
 	// Enhancements
 	int iMultisampleMode;
 	int iEFBScale;