diff --git a/pcsx2_suite_2012.sln b/pcsx2_suite_2012.sln index a5f519a7ff..9ceff4fc4d 100644 --- a/pcsx2_suite_2012.sln +++ b/pcsx2_suite_2012.sln @@ -120,6 +120,8 @@ Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug AVX|Win32 = Debug AVX|Win32 Debug AVX|x64 = Debug AVX|x64 + Debug AVX2|Win32 = Debug AVX2|Win32 + Debug AVX2|x64 = Debug AVX2|x64 Debug SSE2|Win32 = Debug SSE2|Win32 Debug SSE2|x64 = Debug SSE2|x64 Debug SSE4|Win32 = Debug SSE4|Win32 @@ -132,6 +134,8 @@ Global Devel|x64 = Devel|x64 Release AVX|Win32 = Release AVX|Win32 Release AVX|x64 = Release AVX|x64 + Release AVX2|Win32 = Release AVX2|Win32 + Release AVX2|x64 = Release AVX2|x64 Release SSE2|Win32 = Release SSE2|Win32 Release SSE2|x64 = Release SSE2|x64 Release SSE4|Win32 = Release SSE4|Win32 @@ -145,6 +149,9 @@ Global {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Debug AVX|Win32.Build.0 = Debug|Win32 {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Debug SSE2|Win32.Build.0 = Debug|Win32 {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -163,6 +170,9 @@ Global {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Release AVX|Win32.ActiveCfg = Release|Win32 {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Release AVX|Win32.Build.0 = Release|Win32 {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Release AVX|x64.ActiveCfg = Release|Win32 + {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Release AVX2|Win32.Build.0 = Release|Win32 + {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Release AVX2|x64.ActiveCfg = Release|Win32 {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Release SSE2|Win32.ActiveCfg = Release|Win32 {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Release SSE2|Win32.Build.0 = Release|Win32 {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -177,6 +187,8 @@ Global {1CEFD830-2B76-4596-A4EE-BCD7280A60BD}.Release|x64.ActiveCfg = Release|Win32 {5C6B7D28-E73D-4F71-8FC0-17ADA640EBD8}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {5C6B7D28-E73D-4F71-8FC0-17ADA640EBD8}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {5C6B7D28-E73D-4F71-8FC0-17ADA640EBD8}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {5C6B7D28-E73D-4F71-8FC0-17ADA640EBD8}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {5C6B7D28-E73D-4F71-8FC0-17ADA640EBD8}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {5C6B7D28-E73D-4F71-8FC0-17ADA640EBD8}.Debug SSE2|x64.ActiveCfg = Debug|Win32 {5C6B7D28-E73D-4F71-8FC0-17ADA640EBD8}.Debug SSE4|Win32.ActiveCfg = Debug|Win32 @@ -192,6 +204,9 @@ Global {5C6B7D28-E73D-4F71-8FC0-17ADA640EBD8}.Release AVX|Win32.ActiveCfg = Release|Win32 {5C6B7D28-E73D-4F71-8FC0-17ADA640EBD8}.Release AVX|Win32.Build.0 = Release|Win32 {5C6B7D28-E73D-4F71-8FC0-17ADA640EBD8}.Release AVX|x64.ActiveCfg = Release|Win32 + {5C6B7D28-E73D-4F71-8FC0-17ADA640EBD8}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {5C6B7D28-E73D-4F71-8FC0-17ADA640EBD8}.Release AVX2|Win32.Build.0 = Release|Win32 + {5C6B7D28-E73D-4F71-8FC0-17ADA640EBD8}.Release AVX2|x64.ActiveCfg = Release|Win32 {5C6B7D28-E73D-4F71-8FC0-17ADA640EBD8}.Release 
SSE2|Win32.ActiveCfg = Release|Win32 {5C6B7D28-E73D-4F71-8FC0-17ADA640EBD8}.Release SSE2|Win32.Build.0 = Release|Win32 {5C6B7D28-E73D-4F71-8FC0-17ADA640EBD8}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -206,6 +221,9 @@ Global {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Debug AVX|Win32.Build.0 = Debug|Win32 {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Debug SSE2|Win32.Build.0 = Debug|Win32 {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -224,6 +242,9 @@ Global {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Release AVX|Win32.ActiveCfg = Release|Win32 {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Release AVX|Win32.Build.0 = Release|Win32 {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Release AVX|x64.ActiveCfg = Release|Win32 + {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Release AVX2|Win32.Build.0 = Release|Win32 + {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Release AVX2|x64.ActiveCfg = Release|Win32 {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Release SSE2|Win32.ActiveCfg = Release|Win32 {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Release SSE2|Win32.Build.0 = Release|Win32 {5F78E90B-BD22-47B1-9CA5-7A80F4DF5EF3}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -239,6 +260,9 @@ Global {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Debug AVX|Win32.Build.0 = Debug|Win32 {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Debug SSE2|Win32.Build.0 = Debug|Win32 {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -257,6 +281,9 @@ Global {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Release AVX|Win32.ActiveCfg = Release|Win32 {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Release AVX|Win32.Build.0 = Release|Win32 {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Release AVX|x64.ActiveCfg = Release|Win32 + {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Release AVX2|Win32.Build.0 = Release|Win32 + {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Release AVX2|x64.ActiveCfg = Release|Win32 {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Release SSE2|Win32.ActiveCfg = Release|Win32 {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Release SSE2|Win32.Build.0 = Release|Win32 {5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -273,6 +300,10 @@ Global {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Debug AVX|Win32.Build.0 = Debug AVX|Win32 {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Debug AVX|x64.ActiveCfg = Debug AVX|x64 {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Debug AVX|x64.Build.0 = Debug AVX|x64 + {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Debug 
AVX2|Win32.ActiveCfg = Debug AVX2|Win32 + {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Debug AVX2|Win32.Build.0 = Debug AVX2|Win32 + {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Debug AVX2|x64.ActiveCfg = Debug AVX|x64 + {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Debug AVX2|x64.Build.0 = Debug AVX|x64 {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Debug SSE2|Win32.ActiveCfg = Debug SSE2|Win32 {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Debug SSE2|Win32.Build.0 = Debug SSE2|Win32 {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Debug SSE2|x64.ActiveCfg = Debug SSE2|x64 @@ -297,6 +328,10 @@ Global {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Release AVX|Win32.Build.0 = Release AVX|Win32 {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Release AVX|x64.ActiveCfg = Release AVX|x64 {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Release AVX|x64.Build.0 = Release AVX|x64 + {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Release AVX2|Win32.ActiveCfg = Release AVX2|Win32 + {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Release AVX2|Win32.Build.0 = Release AVX2|Win32 + {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Release AVX2|x64.ActiveCfg = Release AVX|x64 + {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Release AVX2|x64.Build.0 = Release AVX|x64 {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Release SSE2|Win32.ActiveCfg = Release SSE2|Win32 {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Release SSE2|Win32.Build.0 = Release SSE2|Win32 {18E42F6F-3A62-41EE-B42F-79366C4F1E95}.Release SSE2|x64.ActiveCfg = Release SSE2|x64 @@ -316,6 +351,9 @@ Global {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Debug AVX|Win32.Build.0 = Debug|Win32 {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Debug SSE2|Win32.Build.0 = Debug|Win32 {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -334,6 +372,9 @@ Global {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Release AVX|Win32.ActiveCfg = Release|Win32 {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Release AVX|Win32.Build.0 = Release|Win32 {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Release AVX|x64.ActiveCfg = Release|Win32 + {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Release AVX2|Win32.Build.0 = Release|Win32 + {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Release AVX2|x64.ActiveCfg = Release|Win32 {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Release SSE2|Win32.ActiveCfg = Release|Win32 {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Release SSE2|Win32.Build.0 = Release|Win32 {E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -349,6 +390,9 @@ Global {2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Debug AVX|Win32.Build.0 = Debug|Win32 {2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 
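
Note on the mappings above: the new "Debug AVX2" and "Release AVX2" solution configurations are aliased by most projects to their plain Debug/Release project configurations via ActiveCfg; only the {18E42F6F-3A62-41EE-B42F-79366C4F1E95} project receives dedicated "Debug AVX2|Win32"/"Release AVX2|Win32" project configurations, and its x64 AVX2 entries fall back to the existing AVX configurations. (ActiveCfg picks the project configuration a solution configuration maps to; a Build.0 entry additionally marks the project to build under it.) The difference between those project configurations is expected to be the _M_SSE value the project defines (the .vcxproj changes are not part of this excerpt), which the GSBlock hunks further down test with "#if _M_SSE >= 0x501". A standalone mock of that gate, for illustration only:

#include <cstdio>

#ifndef _M_SSE
#define _M_SSE 0x501 // pretend we are building the new "Release AVX2" configuration
#endif

int main()
{
#if _M_SSE >= 0x501
	std::printf("AVX2 build: 256-bit GSVector8i code paths are compiled in\n");
#elif _M_SSE >= 0x301
	std::printf("SSSE3-or-better build: 128-bit paths with pshufb\n");
#else
	std::printf("baseline SSE2 build: 128-bit paths without pshufb\n");
#endif
	return 0;
}
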
{2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Debug SSE2|Win32.Build.0 = Debug|Win32 {2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -367,6 +411,9 @@ Global {2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Release AVX|Win32.ActiveCfg = Release|Win32 {2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Release AVX|Win32.Build.0 = Release|Win32 {2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Release AVX|x64.ActiveCfg = Release|Win32 + {2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Release AVX2|Win32.Build.0 = Release|Win32 + {2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Release AVX2|x64.ActiveCfg = Release|Win32 {2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Release SSE2|Win32.ActiveCfg = Release|Win32 {2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Release SSE2|Win32.Build.0 = Release|Win32 {2F6C0388-20CB-4242-9F6C-A6EBB6A83F47}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -382,6 +429,9 @@ Global {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Debug AVX|Win32.Build.0 = Debug|Win32 {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Debug SSE2|Win32.Build.0 = Debug|Win32 {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -400,6 +450,9 @@ Global {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Release AVX|Win32.ActiveCfg = Release|Win32 {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Release AVX|Win32.Build.0 = Release|Win32 {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Release AVX|x64.ActiveCfg = Release|Win32 + {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Release AVX2|Win32.Build.0 = Release|Win32 + {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Release AVX2|x64.ActiveCfg = Release|Win32 {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Release SSE2|Win32.ActiveCfg = Release|Win32 {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Release SSE2|Win32.Build.0 = Release|Win32 {F4EB4AB2-C595-4B05-8BC0-059024BC796C}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -415,6 +468,9 @@ Global {E4081455-398C-4610-A87C-90A8A7D72DC3}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {E4081455-398C-4610-A87C-90A8A7D72DC3}.Debug AVX|x64.ActiveCfg = Debug|x64 {E4081455-398C-4610-A87C-90A8A7D72DC3}.Debug AVX|x64.Build.0 = Debug|x64 + {E4081455-398C-4610-A87C-90A8A7D72DC3}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {E4081455-398C-4610-A87C-90A8A7D72DC3}.Debug AVX2|x64.ActiveCfg = Debug|x64 + {E4081455-398C-4610-A87C-90A8A7D72DC3}.Debug AVX2|x64.Build.0 = Debug|x64 {E4081455-398C-4610-A87C-90A8A7D72DC3}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {E4081455-398C-4610-A87C-90A8A7D72DC3}.Debug SSE2|x64.ActiveCfg = Debug|x64 {E4081455-398C-4610-A87C-90A8A7D72DC3}.Debug SSE2|x64.Build.0 = Debug|x64 @@ -433,6 +489,9 @@ Global {E4081455-398C-4610-A87C-90A8A7D72DC3}.Release AVX|Win32.ActiveCfg = Release|Win32 {E4081455-398C-4610-A87C-90A8A7D72DC3}.Release AVX|Win32.Build.0 = Release|Win32 {E4081455-398C-4610-A87C-90A8A7D72DC3}.Release AVX|x64.ActiveCfg = Release|Win32 + {E4081455-398C-4610-A87C-90A8A7D72DC3}.Release AVX2|Win32.ActiveCfg = Release|Win32 + 
{E4081455-398C-4610-A87C-90A8A7D72DC3}.Release AVX2|Win32.Build.0 = Release|Win32 + {E4081455-398C-4610-A87C-90A8A7D72DC3}.Release AVX2|x64.ActiveCfg = Release|Win32 {E4081455-398C-4610-A87C-90A8A7D72DC3}.Release SSE2|Win32.ActiveCfg = Release|Win32 {E4081455-398C-4610-A87C-90A8A7D72DC3}.Release SSE2|Win32.Build.0 = Release|Win32 {E4081455-398C-4610-A87C-90A8A7D72DC3}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -449,6 +508,9 @@ Global {26511268-2902-4997-8421-ECD7055F9E28}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {26511268-2902-4997-8421-ECD7055F9E28}.Debug AVX|Win32.Build.0 = Debug|Win32 {26511268-2902-4997-8421-ECD7055F9E28}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {26511268-2902-4997-8421-ECD7055F9E28}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {26511268-2902-4997-8421-ECD7055F9E28}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {26511268-2902-4997-8421-ECD7055F9E28}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {26511268-2902-4997-8421-ECD7055F9E28}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {26511268-2902-4997-8421-ECD7055F9E28}.Debug SSE2|Win32.Build.0 = Debug|Win32 {26511268-2902-4997-8421-ECD7055F9E28}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -467,6 +529,9 @@ Global {26511268-2902-4997-8421-ECD7055F9E28}.Release AVX|Win32.ActiveCfg = Release|Win32 {26511268-2902-4997-8421-ECD7055F9E28}.Release AVX|Win32.Build.0 = Release|Win32 {26511268-2902-4997-8421-ECD7055F9E28}.Release AVX|x64.ActiveCfg = Release|Win32 + {26511268-2902-4997-8421-ECD7055F9E28}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {26511268-2902-4997-8421-ECD7055F9E28}.Release AVX2|Win32.Build.0 = Release|Win32 + {26511268-2902-4997-8421-ECD7055F9E28}.Release AVX2|x64.ActiveCfg = Release|Win32 {26511268-2902-4997-8421-ECD7055F9E28}.Release SSE2|Win32.ActiveCfg = Release|Win32 {26511268-2902-4997-8421-ECD7055F9E28}.Release SSE2|Win32.Build.0 = Release|Win32 {26511268-2902-4997-8421-ECD7055F9E28}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -481,6 +546,8 @@ Global {26511268-2902-4997-8421-ECD7055F9E28}.Release|x64.ActiveCfg = Release|Win32 {7F059854-568D-4E08-9D00-1E78E203E4DC}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {7F059854-568D-4E08-9D00-1E78E203E4DC}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {7F059854-568D-4E08-9D00-1E78E203E4DC}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {7F059854-568D-4E08-9D00-1E78E203E4DC}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {7F059854-568D-4E08-9D00-1E78E203E4DC}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {7F059854-568D-4E08-9D00-1E78E203E4DC}.Debug SSE2|x64.ActiveCfg = Debug|Win32 {7F059854-568D-4E08-9D00-1E78E203E4DC}.Debug SSE4|Win32.ActiveCfg = Debug|Win32 @@ -496,6 +563,9 @@ Global {7F059854-568D-4E08-9D00-1E78E203E4DC}.Release AVX|Win32.ActiveCfg = Release|Win32 {7F059854-568D-4E08-9D00-1E78E203E4DC}.Release AVX|Win32.Build.0 = Release|Win32 {7F059854-568D-4E08-9D00-1E78E203E4DC}.Release AVX|x64.ActiveCfg = Release|Win32 + {7F059854-568D-4E08-9D00-1E78E203E4DC}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {7F059854-568D-4E08-9D00-1E78E203E4DC}.Release AVX2|Win32.Build.0 = Release|Win32 + {7F059854-568D-4E08-9D00-1E78E203E4DC}.Release AVX2|x64.ActiveCfg = Release|Win32 {7F059854-568D-4E08-9D00-1E78E203E4DC}.Release SSE2|Win32.ActiveCfg = Release|Win32 {7F059854-568D-4E08-9D00-1E78E203E4DC}.Release SSE2|Win32.Build.0 = Release|Win32 {7F059854-568D-4E08-9D00-1E78E203E4DC}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -509,6 +579,8 @@ Global {7F059854-568D-4E08-9D00-1E78E203E4DC}.Release|x64.ActiveCfg = Release|Win32 {6F3C4136-5801-4EBC-AC6E-37DF6FAB150A}.Debug AVX|Win32.ActiveCfg = Debug|Win32 
{6F3C4136-5801-4EBC-AC6E-37DF6FAB150A}.Debug AVX|x64.ActiveCfg = Debug|x64 + {6F3C4136-5801-4EBC-AC6E-37DF6FAB150A}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {6F3C4136-5801-4EBC-AC6E-37DF6FAB150A}.Debug AVX2|x64.ActiveCfg = Debug|x64 {6F3C4136-5801-4EBC-AC6E-37DF6FAB150A}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {6F3C4136-5801-4EBC-AC6E-37DF6FAB150A}.Debug SSE2|x64.ActiveCfg = Debug|x64 {6F3C4136-5801-4EBC-AC6E-37DF6FAB150A}.Debug SSE4|Win32.ActiveCfg = Debug|Win32 @@ -524,6 +596,9 @@ Global {6F3C4136-5801-4EBC-AC6E-37DF6FAB150A}.Release AVX|Win32.ActiveCfg = Release|Win32 {6F3C4136-5801-4EBC-AC6E-37DF6FAB150A}.Release AVX|Win32.Build.0 = Release|Win32 {6F3C4136-5801-4EBC-AC6E-37DF6FAB150A}.Release AVX|x64.ActiveCfg = Release|x64 + {6F3C4136-5801-4EBC-AC6E-37DF6FAB150A}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {6F3C4136-5801-4EBC-AC6E-37DF6FAB150A}.Release AVX2|Win32.Build.0 = Release|Win32 + {6F3C4136-5801-4EBC-AC6E-37DF6FAB150A}.Release AVX2|x64.ActiveCfg = Release|x64 {6F3C4136-5801-4EBC-AC6E-37DF6FAB150A}.Release SSE2|Win32.ActiveCfg = Release|Win32 {6F3C4136-5801-4EBC-AC6E-37DF6FAB150A}.Release SSE2|Win32.Build.0 = Release|Win32 {6F3C4136-5801-4EBC-AC6E-37DF6FAB150A}.Release SSE2|x64.ActiveCfg = Release|x64 @@ -537,6 +612,8 @@ Global {6F3C4136-5801-4EBC-AC6E-37DF6FAB150A}.Release|x64.ActiveCfg = Release|x64 {FCDF5AE2-EA47-4CC6-9F20-23A0517FEBCB}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {FCDF5AE2-EA47-4CC6-9F20-23A0517FEBCB}.Debug AVX|x64.ActiveCfg = Debug|x64 + {FCDF5AE2-EA47-4CC6-9F20-23A0517FEBCB}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {FCDF5AE2-EA47-4CC6-9F20-23A0517FEBCB}.Debug AVX2|x64.ActiveCfg = Debug|x64 {FCDF5AE2-EA47-4CC6-9F20-23A0517FEBCB}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {FCDF5AE2-EA47-4CC6-9F20-23A0517FEBCB}.Debug SSE2|x64.ActiveCfg = Debug|x64 {FCDF5AE2-EA47-4CC6-9F20-23A0517FEBCB}.Debug SSE4|Win32.ActiveCfg = Debug|Win32 @@ -552,6 +629,9 @@ Global {FCDF5AE2-EA47-4CC6-9F20-23A0517FEBCB}.Release AVX|Win32.ActiveCfg = Release|Win32 {FCDF5AE2-EA47-4CC6-9F20-23A0517FEBCB}.Release AVX|Win32.Build.0 = Release|Win32 {FCDF5AE2-EA47-4CC6-9F20-23A0517FEBCB}.Release AVX|x64.ActiveCfg = Release|x64 + {FCDF5AE2-EA47-4CC6-9F20-23A0517FEBCB}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {FCDF5AE2-EA47-4CC6-9F20-23A0517FEBCB}.Release AVX2|Win32.Build.0 = Release|Win32 + {FCDF5AE2-EA47-4CC6-9F20-23A0517FEBCB}.Release AVX2|x64.ActiveCfg = Release|x64 {FCDF5AE2-EA47-4CC6-9F20-23A0517FEBCB}.Release SSE2|Win32.ActiveCfg = Release|Win32 {FCDF5AE2-EA47-4CC6-9F20-23A0517FEBCB}.Release SSE2|Win32.Build.0 = Release|Win32 {FCDF5AE2-EA47-4CC6-9F20-23A0517FEBCB}.Release SSE2|x64.ActiveCfg = Release|x64 @@ -565,6 +645,8 @@ Global {FCDF5AE2-EA47-4CC6-9F20-23A0517FEBCB}.Release|x64.ActiveCfg = Release|x64 {F38D9DF0-F68D-49D9-B3A0-932E74FB74A0}.Debug AVX|Win32.ActiveCfg = Release|Win32 {F38D9DF0-F68D-49D9-B3A0-932E74FB74A0}.Debug AVX|x64.ActiveCfg = Release|Win32 + {F38D9DF0-F68D-49D9-B3A0-932E74FB74A0}.Debug AVX2|Win32.ActiveCfg = Release|Win32 + {F38D9DF0-F68D-49D9-B3A0-932E74FB74A0}.Debug AVX2|x64.ActiveCfg = Release|Win32 {F38D9DF0-F68D-49D9-B3A0-932E74FB74A0}.Debug SSE2|Win32.ActiveCfg = Release|Win32 {F38D9DF0-F68D-49D9-B3A0-932E74FB74A0}.Debug SSE2|x64.ActiveCfg = Release|Win32 {F38D9DF0-F68D-49D9-B3A0-932E74FB74A0}.Debug SSE4|Win32.ActiveCfg = Release|Win32 @@ -580,6 +662,9 @@ Global {F38D9DF0-F68D-49D9-B3A0-932E74FB74A0}.Release AVX|Win32.ActiveCfg = Release|Win32 {F38D9DF0-F68D-49D9-B3A0-932E74FB74A0}.Release AVX|Win32.Build.0 = Release|Win32 
{F38D9DF0-F68D-49D9-B3A0-932E74FB74A0}.Release AVX|x64.ActiveCfg = Release|Win32 + {F38D9DF0-F68D-49D9-B3A0-932E74FB74A0}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {F38D9DF0-F68D-49D9-B3A0-932E74FB74A0}.Release AVX2|Win32.Build.0 = Release|Win32 + {F38D9DF0-F68D-49D9-B3A0-932E74FB74A0}.Release AVX2|x64.ActiveCfg = Release|Win32 {F38D9DF0-F68D-49D9-B3A0-932E74FB74A0}.Release SSE2|Win32.ActiveCfg = Release|Win32 {F38D9DF0-F68D-49D9-B3A0-932E74FB74A0}.Release SSE2|Win32.Build.0 = Release|Win32 {F38D9DF0-F68D-49D9-B3A0-932E74FB74A0}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -593,6 +678,8 @@ Global {F38D9DF0-F68D-49D9-B3A0-932E74FB74A0}.Release|x64.ActiveCfg = Release|Win32 {BF7B81A5-E348-4F7C-A69F-F74C8EEEAD70}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {BF7B81A5-E348-4F7C-A69F-F74C8EEEAD70}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {BF7B81A5-E348-4F7C-A69F-F74C8EEEAD70}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {BF7B81A5-E348-4F7C-A69F-F74C8EEEAD70}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {BF7B81A5-E348-4F7C-A69F-F74C8EEEAD70}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {BF7B81A5-E348-4F7C-A69F-F74C8EEEAD70}.Debug SSE2|x64.ActiveCfg = Debug|Win32 {BF7B81A5-E348-4F7C-A69F-F74C8EEEAD70}.Debug SSE4|Win32.ActiveCfg = Debug|Win32 @@ -608,6 +695,9 @@ Global {BF7B81A5-E348-4F7C-A69F-F74C8EEEAD70}.Release AVX|Win32.ActiveCfg = Release|Win32 {BF7B81A5-E348-4F7C-A69F-F74C8EEEAD70}.Release AVX|Win32.Build.0 = Release|Win32 {BF7B81A5-E348-4F7C-A69F-F74C8EEEAD70}.Release AVX|x64.ActiveCfg = Release|Win32 + {BF7B81A5-E348-4F7C-A69F-F74C8EEEAD70}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {BF7B81A5-E348-4F7C-A69F-F74C8EEEAD70}.Release AVX2|Win32.Build.0 = Release|Win32 + {BF7B81A5-E348-4F7C-A69F-F74C8EEEAD70}.Release AVX2|x64.ActiveCfg = Release|Win32 {BF7B81A5-E348-4F7C-A69F-F74C8EEEAD70}.Release SSE2|Win32.ActiveCfg = Release|Win32 {BF7B81A5-E348-4F7C-A69F-F74C8EEEAD70}.Release SSE2|Win32.Build.0 = Release|Win32 {BF7B81A5-E348-4F7C-A69F-F74C8EEEAD70}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -621,6 +711,8 @@ Global {BF7B81A5-E348-4F7C-A69F-F74C8EEEAD70}.Release|x64.ActiveCfg = Release|Win32 {3D0EB14D-32F3-4D82-9C6D-B806ADBB859C}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {3D0EB14D-32F3-4D82-9C6D-B806ADBB859C}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {3D0EB14D-32F3-4D82-9C6D-B806ADBB859C}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {3D0EB14D-32F3-4D82-9C6D-B806ADBB859C}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {3D0EB14D-32F3-4D82-9C6D-B806ADBB859C}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {3D0EB14D-32F3-4D82-9C6D-B806ADBB859C}.Debug SSE2|x64.ActiveCfg = Debug|Win32 {3D0EB14D-32F3-4D82-9C6D-B806ADBB859C}.Debug SSE4|Win32.ActiveCfg = Debug|Win32 @@ -636,6 +728,9 @@ Global {3D0EB14D-32F3-4D82-9C6D-B806ADBB859C}.Release AVX|Win32.ActiveCfg = Release|Win32 {3D0EB14D-32F3-4D82-9C6D-B806ADBB859C}.Release AVX|Win32.Build.0 = Release|Win32 {3D0EB14D-32F3-4D82-9C6D-B806ADBB859C}.Release AVX|x64.ActiveCfg = Release|Win32 + {3D0EB14D-32F3-4D82-9C6D-B806ADBB859C}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {3D0EB14D-32F3-4D82-9C6D-B806ADBB859C}.Release AVX2|Win32.Build.0 = Release|Win32 + {3D0EB14D-32F3-4D82-9C6D-B806ADBB859C}.Release AVX2|x64.ActiveCfg = Release|Win32 {3D0EB14D-32F3-4D82-9C6D-B806ADBB859C}.Release SSE2|Win32.ActiveCfg = Release|Win32 {3D0EB14D-32F3-4D82-9C6D-B806ADBB859C}.Release SSE2|Win32.Build.0 = Release|Win32 {3D0EB14D-32F3-4D82-9C6D-B806ADBB859C}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -649,6 +744,8 @@ Global {3D0EB14D-32F3-4D82-9C6D-B806ADBB859C}.Release|x64.ActiveCfg = Release|Win32 
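
These per-project mappings exist because the suite compiles a separate GSdx binary for each instruction-set level rather than dispatching at run time, so a binary produced by the new AVX2 configurations can only run on an AVX2-capable CPU whose OS has enabled the YMM state. A minimal standalone host check (illustrative only, not code from this patch) could look like:

#include <intrin.h>
#include <immintrin.h>

static bool CanRunAvx2Build()
{
	int r[4];

	__cpuid(r, 0);
	if(r[0] < 7) return false; // CPUID leaf 7 not available

	__cpuid(r, 1);
	if((r[2] & (1 << 27)) == 0) return false; // OSXSAVE: OS uses XSAVE/XRSTOR
	if((r[2] & (1 << 28)) == 0) return false; // AVX supported by the CPU

	if((_xgetbv(0) & 0x6) != 0x6) return false; // XMM and YMM state enabled by the OS

	__cpuidex(r, 7, 0);
	return (r[1] & (1 << 5)) != 0; // EBX bit 5 = AVX2
}
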
{04439C5F-05FB-4A9C-AAD1-5388C25377DB}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {04439C5F-05FB-4A9C-AAD1-5388C25377DB}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {04439C5F-05FB-4A9C-AAD1-5388C25377DB}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {04439C5F-05FB-4A9C-AAD1-5388C25377DB}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {04439C5F-05FB-4A9C-AAD1-5388C25377DB}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {04439C5F-05FB-4A9C-AAD1-5388C25377DB}.Debug SSE2|x64.ActiveCfg = Debug|Win32 {04439C5F-05FB-4A9C-AAD1-5388C25377DB}.Debug SSE4|Win32.ActiveCfg = Debug|Win32 @@ -664,6 +761,9 @@ Global {04439C5F-05FB-4A9C-AAD1-5388C25377DB}.Release AVX|Win32.ActiveCfg = Release|Win32 {04439C5F-05FB-4A9C-AAD1-5388C25377DB}.Release AVX|Win32.Build.0 = Release|Win32 {04439C5F-05FB-4A9C-AAD1-5388C25377DB}.Release AVX|x64.ActiveCfg = Release|Win32 + {04439C5F-05FB-4A9C-AAD1-5388C25377DB}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {04439C5F-05FB-4A9C-AAD1-5388C25377DB}.Release AVX2|Win32.Build.0 = Release|Win32 + {04439C5F-05FB-4A9C-AAD1-5388C25377DB}.Release AVX2|x64.ActiveCfg = Release|Win32 {04439C5F-05FB-4A9C-AAD1-5388C25377DB}.Release SSE2|Win32.ActiveCfg = Release|Win32 {04439C5F-05FB-4A9C-AAD1-5388C25377DB}.Release SSE2|Win32.Build.0 = Release|Win32 {04439C5F-05FB-4A9C-AAD1-5388C25377DB}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -678,6 +778,9 @@ Global {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Debug AVX|Win32.Build.0 = Debug|Win32 {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Debug SSE2|Win32.Build.0 = Debug|Win32 {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -696,6 +799,9 @@ Global {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Release AVX|Win32.ActiveCfg = Release|Win32 {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Release AVX|Win32.Build.0 = Release|Win32 {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Release AVX|x64.ActiveCfg = Release|Win32 + {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Release AVX2|Win32.Build.0 = Release|Win32 + {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Release AVX2|x64.ActiveCfg = Release|Win32 {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Release SSE2|Win32.ActiveCfg = Release|Win32 {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Release SSE2|Win32.Build.0 = Release|Win32 {48AD7E0A-25B1-4974-A1E3-03F8C438D34F}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -711,6 +817,9 @@ Global {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Debug AVX|Win32.Build.0 = Debug|Win32 {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Debug SSE2|Win32.Build.0 = Debug|Win32 {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Debug 
SSE2|x64.ActiveCfg = Debug|Win32 @@ -729,6 +838,9 @@ Global {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Release AVX|Win32.ActiveCfg = Release|Win32 {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Release AVX|Win32.Build.0 = Release|Win32 {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Release AVX|x64.ActiveCfg = Release|Win32 + {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Release AVX2|Win32.Build.0 = Release|Win32 + {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Release AVX2|x64.ActiveCfg = Release|Win32 {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Release SSE2|Win32.ActiveCfg = Release|Win32 {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Release SSE2|Win32.Build.0 = Release|Win32 {0318BA30-EF48-441A-9E10-DC85EFAE39F0}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -744,6 +856,9 @@ Global {C34487AF-228A-4D11-8E50-27803DF76873}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {C34487AF-228A-4D11-8E50-27803DF76873}.Debug AVX|Win32.Build.0 = Debug|Win32 {C34487AF-228A-4D11-8E50-27803DF76873}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {C34487AF-228A-4D11-8E50-27803DF76873}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {C34487AF-228A-4D11-8E50-27803DF76873}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {C34487AF-228A-4D11-8E50-27803DF76873}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {C34487AF-228A-4D11-8E50-27803DF76873}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {C34487AF-228A-4D11-8E50-27803DF76873}.Debug SSE2|Win32.Build.0 = Debug|Win32 {C34487AF-228A-4D11-8E50-27803DF76873}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -762,6 +877,9 @@ Global {C34487AF-228A-4D11-8E50-27803DF76873}.Release AVX|Win32.ActiveCfg = Release|Win32 {C34487AF-228A-4D11-8E50-27803DF76873}.Release AVX|Win32.Build.0 = Release|Win32 {C34487AF-228A-4D11-8E50-27803DF76873}.Release AVX|x64.ActiveCfg = Release|Win32 + {C34487AF-228A-4D11-8E50-27803DF76873}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {C34487AF-228A-4D11-8E50-27803DF76873}.Release AVX2|Win32.Build.0 = Release|Win32 + {C34487AF-228A-4D11-8E50-27803DF76873}.Release AVX2|x64.ActiveCfg = Release|Win32 {C34487AF-228A-4D11-8E50-27803DF76873}.Release SSE2|Win32.ActiveCfg = Release|Win32 {C34487AF-228A-4D11-8E50-27803DF76873}.Release SSE2|Win32.Build.0 = Release|Win32 {C34487AF-228A-4D11-8E50-27803DF76873}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -777,6 +895,9 @@ Global {A51123F5-9505-4EAE-85E7-D320290A272C}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {A51123F5-9505-4EAE-85E7-D320290A272C}.Debug AVX|Win32.Build.0 = Debug|Win32 {A51123F5-9505-4EAE-85E7-D320290A272C}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {A51123F5-9505-4EAE-85E7-D320290A272C}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {A51123F5-9505-4EAE-85E7-D320290A272C}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {A51123F5-9505-4EAE-85E7-D320290A272C}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {A51123F5-9505-4EAE-85E7-D320290A272C}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {A51123F5-9505-4EAE-85E7-D320290A272C}.Debug SSE2|Win32.Build.0 = Debug|Win32 {A51123F5-9505-4EAE-85E7-D320290A272C}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -795,6 +916,9 @@ Global {A51123F5-9505-4EAE-85E7-D320290A272C}.Release AVX|Win32.ActiveCfg = Release|Win32 {A51123F5-9505-4EAE-85E7-D320290A272C}.Release AVX|Win32.Build.0 = Release|Win32 {A51123F5-9505-4EAE-85E7-D320290A272C}.Release AVX|x64.ActiveCfg = Release|Win32 + {A51123F5-9505-4EAE-85E7-D320290A272C}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {A51123F5-9505-4EAE-85E7-D320290A272C}.Release AVX2|Win32.Build.0 = Release|Win32 + {A51123F5-9505-4EAE-85E7-D320290A272C}.Release 
AVX2|x64.ActiveCfg = Release|Win32 {A51123F5-9505-4EAE-85E7-D320290A272C}.Release SSE2|Win32.ActiveCfg = Release|Win32 {A51123F5-9505-4EAE-85E7-D320290A272C}.Release SSE2|Win32.Build.0 = Release|Win32 {A51123F5-9505-4EAE-85E7-D320290A272C}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -810,6 +934,9 @@ Global {4639972E-424E-4E13-8B07-CA403C481346}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {4639972E-424E-4E13-8B07-CA403C481346}.Debug AVX|Win32.Build.0 = Debug|Win32 {4639972E-424E-4E13-8B07-CA403C481346}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {4639972E-424E-4E13-8B07-CA403C481346}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {4639972E-424E-4E13-8B07-CA403C481346}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {4639972E-424E-4E13-8B07-CA403C481346}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {4639972E-424E-4E13-8B07-CA403C481346}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {4639972E-424E-4E13-8B07-CA403C481346}.Debug SSE2|Win32.Build.0 = Debug|Win32 {4639972E-424E-4E13-8B07-CA403C481346}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -828,6 +955,9 @@ Global {4639972E-424E-4E13-8B07-CA403C481346}.Release AVX|Win32.ActiveCfg = Release|Win32 {4639972E-424E-4E13-8B07-CA403C481346}.Release AVX|Win32.Build.0 = Release|Win32 {4639972E-424E-4E13-8B07-CA403C481346}.Release AVX|x64.ActiveCfg = Release|Win32 + {4639972E-424E-4E13-8B07-CA403C481346}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {4639972E-424E-4E13-8B07-CA403C481346}.Release AVX2|Win32.Build.0 = Release|Win32 + {4639972E-424E-4E13-8B07-CA403C481346}.Release AVX2|x64.ActiveCfg = Release|Win32 {4639972E-424E-4E13-8B07-CA403C481346}.Release SSE2|Win32.ActiveCfg = Release|Win32 {4639972E-424E-4E13-8B07-CA403C481346}.Release SSE2|Win32.Build.0 = Release|Win32 {4639972E-424E-4E13-8B07-CA403C481346}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -843,6 +973,9 @@ Global {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Debug AVX|Win32.Build.0 = Debug|Win32 {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Debug SSE2|Win32.Build.0 = Debug|Win32 {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -861,6 +994,9 @@ Global {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Release AVX|Win32.ActiveCfg = Release|Win32 {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Release AVX|Win32.Build.0 = Release|Win32 {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Release AVX|x64.ActiveCfg = Release|Win32 + {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Release AVX2|Win32.Build.0 = Release|Win32 + {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Release AVX2|x64.ActiveCfg = Release|Win32 {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Release SSE2|Win32.ActiveCfg = Release|Win32 {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Release SSE2|Win32.Build.0 = Release|Win32 {0E231FB1-F3C9-4724-ACCB-DE8BCB3C089E}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -876,6 +1012,9 @@ Global {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Debug AVX|Win32.Build.0 = Debug|Win32 {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Debug AVX|x64.ActiveCfg 
= Debug|Win32 + {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Debug SSE2|Win32.Build.0 = Debug|Win32 {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -894,6 +1033,9 @@ Global {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Release AVX|Win32.ActiveCfg = Release|Win32 {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Release AVX|Win32.Build.0 = Release|Win32 {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Release AVX|x64.ActiveCfg = Release|Win32 + {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Release AVX2|Win32.Build.0 = Release|Win32 + {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Release AVX2|x64.ActiveCfg = Release|Win32 {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Release SSE2|Win32.ActiveCfg = Release|Win32 {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Release SSE2|Win32.Build.0 = Release|Win32 {677B7D11-D5E1-40B3-88B1-9A4DF83D2213}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -909,6 +1051,9 @@ Global {BC236261-77E8-4567-8D09-45CD02965EB6}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {BC236261-77E8-4567-8D09-45CD02965EB6}.Debug AVX|Win32.Build.0 = Debug|Win32 {BC236261-77E8-4567-8D09-45CD02965EB6}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {BC236261-77E8-4567-8D09-45CD02965EB6}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {BC236261-77E8-4567-8D09-45CD02965EB6}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {BC236261-77E8-4567-8D09-45CD02965EB6}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {BC236261-77E8-4567-8D09-45CD02965EB6}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {BC236261-77E8-4567-8D09-45CD02965EB6}.Debug SSE2|Win32.Build.0 = Debug|Win32 {BC236261-77E8-4567-8D09-45CD02965EB6}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -927,6 +1072,9 @@ Global {BC236261-77E8-4567-8D09-45CD02965EB6}.Release AVX|Win32.ActiveCfg = Release|Win32 {BC236261-77E8-4567-8D09-45CD02965EB6}.Release AVX|Win32.Build.0 = Release|Win32 {BC236261-77E8-4567-8D09-45CD02965EB6}.Release AVX|x64.ActiveCfg = Release|Win32 + {BC236261-77E8-4567-8D09-45CD02965EB6}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {BC236261-77E8-4567-8D09-45CD02965EB6}.Release AVX2|Win32.Build.0 = Release|Win32 + {BC236261-77E8-4567-8D09-45CD02965EB6}.Release AVX2|x64.ActiveCfg = Release|Win32 {BC236261-77E8-4567-8D09-45CD02965EB6}.Release SSE2|Win32.ActiveCfg = Release|Win32 {BC236261-77E8-4567-8D09-45CD02965EB6}.Release SSE2|Win32.Build.0 = Release|Win32 {BC236261-77E8-4567-8D09-45CD02965EB6}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -942,6 +1090,9 @@ Global {7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Debug AVX|Win32.Build.0 = Debug|Win32 {7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Debug SSE2|Win32.Build.0 = Debug|Win32 {7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -960,6 +1111,9 @@ Global 
{7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Release AVX|Win32.ActiveCfg = Release|Win32 {7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Release AVX|Win32.Build.0 = Release|Win32 {7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Release AVX|x64.ActiveCfg = Release|Win32 + {7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Release AVX2|Win32.Build.0 = Release|Win32 + {7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Release AVX2|x64.ActiveCfg = Release|Win32 {7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Release SSE2|Win32.ActiveCfg = Release|Win32 {7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Release SSE2|Win32.Build.0 = Release|Win32 {7E9B2BE7-CEC3-4F14-847B-0AB8D562FB86}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -975,6 +1129,9 @@ Global {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Debug AVX|Win32.Build.0 = Debug|Win32 {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Debug SSE2|Win32.Build.0 = Debug|Win32 {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -993,6 +1150,9 @@ Global {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Release AVX|Win32.ActiveCfg = Release|Win32 {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Release AVX|Win32.Build.0 = Release|Win32 {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Release AVX|x64.ActiveCfg = Release|Win32 + {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Release AVX2|Win32.Build.0 = Release|Win32 + {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Release AVX2|x64.ActiveCfg = Release|Win32 {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Release SSE2|Win32.ActiveCfg = Release|Win32 {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Release SSE2|Win32.Build.0 = Release|Win32 {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -1007,6 +1167,8 @@ Global {5CF88D5F-64DD-4EDC-9F1A-436BD502940A}.Release|x64.ActiveCfg = Release|Win32 {5FCBD521-5A0B-4D97-A823-A97E6BAB9101}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {5FCBD521-5A0B-4D97-A823-A97E6BAB9101}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {5FCBD521-5A0B-4D97-A823-A97E6BAB9101}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {5FCBD521-5A0B-4D97-A823-A97E6BAB9101}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {5FCBD521-5A0B-4D97-A823-A97E6BAB9101}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {5FCBD521-5A0B-4D97-A823-A97E6BAB9101}.Debug SSE2|x64.ActiveCfg = Debug|Win32 {5FCBD521-5A0B-4D97-A823-A97E6BAB9101}.Debug SSE4|Win32.ActiveCfg = Debug|Win32 @@ -1022,6 +1184,9 @@ Global {5FCBD521-5A0B-4D97-A823-A97E6BAB9101}.Release AVX|Win32.ActiveCfg = Release|Win32 {5FCBD521-5A0B-4D97-A823-A97E6BAB9101}.Release AVX|Win32.Build.0 = Release|Win32 {5FCBD521-5A0B-4D97-A823-A97E6BAB9101}.Release AVX|x64.ActiveCfg = Release|Win32 + {5FCBD521-5A0B-4D97-A823-A97E6BAB9101}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {5FCBD521-5A0B-4D97-A823-A97E6BAB9101}.Release AVX2|Win32.Build.0 = Release|Win32 + {5FCBD521-5A0B-4D97-A823-A97E6BAB9101}.Release AVX2|x64.ActiveCfg = Release|Win32 {5FCBD521-5A0B-4D97-A823-A97E6BAB9101}.Release SSE2|Win32.ActiveCfg = Release|Win32 
{5FCBD521-5A0B-4D97-A823-A97E6BAB9101}.Release SSE2|Win32.Build.0 = Release|Win32 {5FCBD521-5A0B-4D97-A823-A97E6BAB9101}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -1035,6 +1200,8 @@ Global {5FCBD521-5A0B-4D97-A823-A97E6BAB9101}.Release|x64.ActiveCfg = Release|Win32 {6C8D28E4-447E-4856-BD9E-6B8F5E7C58C9}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {6C8D28E4-447E-4856-BD9E-6B8F5E7C58C9}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {6C8D28E4-447E-4856-BD9E-6B8F5E7C58C9}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {6C8D28E4-447E-4856-BD9E-6B8F5E7C58C9}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {6C8D28E4-447E-4856-BD9E-6B8F5E7C58C9}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {6C8D28E4-447E-4856-BD9E-6B8F5E7C58C9}.Debug SSE2|x64.ActiveCfg = Debug|Win32 {6C8D28E4-447E-4856-BD9E-6B8F5E7C58C9}.Debug SSE4|Win32.ActiveCfg = Debug|Win32 @@ -1050,6 +1217,9 @@ Global {6C8D28E4-447E-4856-BD9E-6B8F5E7C58C9}.Release AVX|Win32.ActiveCfg = Release|Win32 {6C8D28E4-447E-4856-BD9E-6B8F5E7C58C9}.Release AVX|Win32.Build.0 = Release|Win32 {6C8D28E4-447E-4856-BD9E-6B8F5E7C58C9}.Release AVX|x64.ActiveCfg = Release|Win32 + {6C8D28E4-447E-4856-BD9E-6B8F5E7C58C9}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {6C8D28E4-447E-4856-BD9E-6B8F5E7C58C9}.Release AVX2|Win32.Build.0 = Release|Win32 + {6C8D28E4-447E-4856-BD9E-6B8F5E7C58C9}.Release AVX2|x64.ActiveCfg = Release|Win32 {6C8D28E4-447E-4856-BD9E-6B8F5E7C58C9}.Release SSE2|Win32.ActiveCfg = Release|Win32 {6C8D28E4-447E-4856-BD9E-6B8F5E7C58C9}.Release SSE2|Win32.Build.0 = Release|Win32 {6C8D28E4-447E-4856-BD9E-6B8F5E7C58C9}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -1063,6 +1233,8 @@ Global {6C8D28E4-447E-4856-BD9E-6B8F5E7C58C9}.Release|x64.ActiveCfg = Release|Win32 {6BC4D85D-A399-407E-96A9-CD5416A54269}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {6BC4D85D-A399-407E-96A9-CD5416A54269}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {6BC4D85D-A399-407E-96A9-CD5416A54269}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {6BC4D85D-A399-407E-96A9-CD5416A54269}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {6BC4D85D-A399-407E-96A9-CD5416A54269}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {6BC4D85D-A399-407E-96A9-CD5416A54269}.Debug SSE2|x64.ActiveCfg = Debug|Win32 {6BC4D85D-A399-407E-96A9-CD5416A54269}.Debug SSE4|Win32.ActiveCfg = Debug|Win32 @@ -1078,6 +1250,9 @@ Global {6BC4D85D-A399-407E-96A9-CD5416A54269}.Release AVX|Win32.ActiveCfg = Release|Win32 {6BC4D85D-A399-407E-96A9-CD5416A54269}.Release AVX|Win32.Build.0 = Release|Win32 {6BC4D85D-A399-407E-96A9-CD5416A54269}.Release AVX|x64.ActiveCfg = Release|Win32 + {6BC4D85D-A399-407E-96A9-CD5416A54269}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {6BC4D85D-A399-407E-96A9-CD5416A54269}.Release AVX2|Win32.Build.0 = Release|Win32 + {6BC4D85D-A399-407E-96A9-CD5416A54269}.Release AVX2|x64.ActiveCfg = Release|Win32 {6BC4D85D-A399-407E-96A9-CD5416A54269}.Release SSE2|Win32.ActiveCfg = Release|Win32 {6BC4D85D-A399-407E-96A9-CD5416A54269}.Release SSE2|Win32.Build.0 = Release|Win32 {6BC4D85D-A399-407E-96A9-CD5416A54269}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -1092,6 +1267,9 @@ Global {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Debug AVX|Win32.ActiveCfg = Debug (NO ASIO)|Win32 {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Debug AVX|x64.ActiveCfg = Debug (NO ASIO)|x64 {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Debug AVX|x64.Build.0 = Debug (NO ASIO)|x64 + {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Debug AVX2|Win32.ActiveCfg = Debug (NO ASIO)|Win32 + {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Debug AVX2|x64.ActiveCfg = Debug (NO ASIO)|x64 + {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Debug 
AVX2|x64.Build.0 = Debug (NO ASIO)|x64 {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Debug SSE2|Win32.ActiveCfg = Debug (NO ASIO)|Win32 {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Debug SSE2|x64.ActiveCfg = Debug (NO ASIO)|x64 {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Debug SSE2|x64.Build.0 = Debug (NO ASIO)|x64 @@ -1113,6 +1291,10 @@ Global {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Release AVX|Win32.Build.0 = Release (NO ASIO)|Win32 {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Release AVX|x64.ActiveCfg = Release (NO ASIO)|x64 {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Release AVX|x64.Build.0 = Release (NO ASIO)|x64 + {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Release AVX2|Win32.ActiveCfg = Release (NO ASIO)|Win32 + {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Release AVX2|Win32.Build.0 = Release (NO ASIO)|Win32 + {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Release AVX2|x64.ActiveCfg = Release (NO ASIO)|x64 + {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Release AVX2|x64.Build.0 = Release (NO ASIO)|x64 {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Release SSE2|Win32.ActiveCfg = Release (NO ASIO)|Win32 {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Release SSE2|Win32.Build.0 = Release (NO ASIO)|Win32 {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Release SSE2|x64.ActiveCfg = Release (NO ASIO)|x64 @@ -1131,6 +1313,8 @@ Global {0A18A071-125E-442F-AFF7-A3F68ABECF99}.Release|x64.Build.0 = Release|x64 {2D4E85B2-F47F-4D65-B091-701E5C031DAC}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {2D4E85B2-F47F-4D65-B091-701E5C031DAC}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {2D4E85B2-F47F-4D65-B091-701E5C031DAC}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {2D4E85B2-F47F-4D65-B091-701E5C031DAC}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {2D4E85B2-F47F-4D65-B091-701E5C031DAC}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {2D4E85B2-F47F-4D65-B091-701E5C031DAC}.Debug SSE2|x64.ActiveCfg = Debug|Win32 {2D4E85B2-F47F-4D65-B091-701E5C031DAC}.Debug SSE4|Win32.ActiveCfg = Debug|Win32 @@ -1146,6 +1330,9 @@ Global {2D4E85B2-F47F-4D65-B091-701E5C031DAC}.Release AVX|Win32.ActiveCfg = Release|Win32 {2D4E85B2-F47F-4D65-B091-701E5C031DAC}.Release AVX|Win32.Build.0 = Release|Win32 {2D4E85B2-F47F-4D65-B091-701E5C031DAC}.Release AVX|x64.ActiveCfg = Release|Win32 + {2D4E85B2-F47F-4D65-B091-701E5C031DAC}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {2D4E85B2-F47F-4D65-B091-701E5C031DAC}.Release AVX2|Win32.Build.0 = Release|Win32 + {2D4E85B2-F47F-4D65-B091-701E5C031DAC}.Release AVX2|x64.ActiveCfg = Release|Win32 {2D4E85B2-F47F-4D65-B091-701E5C031DAC}.Release SSE2|Win32.ActiveCfg = Release|Win32 {2D4E85B2-F47F-4D65-B091-701E5C031DAC}.Release SSE2|Win32.Build.0 = Release|Win32 {2D4E85B2-F47F-4D65-B091-701E5C031DAC}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -1160,6 +1347,9 @@ Global {E613DA9F-41B4-4613-9911-E418EF5533BC}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {E613DA9F-41B4-4613-9911-E418EF5533BC}.Debug AVX|Win32.Build.0 = Debug|Win32 {E613DA9F-41B4-4613-9911-E418EF5533BC}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {E613DA9F-41B4-4613-9911-E418EF5533BC}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {E613DA9F-41B4-4613-9911-E418EF5533BC}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {E613DA9F-41B4-4613-9911-E418EF5533BC}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {E613DA9F-41B4-4613-9911-E418EF5533BC}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {E613DA9F-41B4-4613-9911-E418EF5533BC}.Debug SSE2|Win32.Build.0 = Debug|Win32 {E613DA9F-41B4-4613-9911-E418EF5533BC}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -1178,6 +1368,9 @@ Global {E613DA9F-41B4-4613-9911-E418EF5533BC}.Release AVX|Win32.ActiveCfg = Release|Win32 
{E613DA9F-41B4-4613-9911-E418EF5533BC}.Release AVX|Win32.Build.0 = Release|Win32 {E613DA9F-41B4-4613-9911-E418EF5533BC}.Release AVX|x64.ActiveCfg = Release|Win32 + {E613DA9F-41B4-4613-9911-E418EF5533BC}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {E613DA9F-41B4-4613-9911-E418EF5533BC}.Release AVX2|Win32.Build.0 = Release|Win32 + {E613DA9F-41B4-4613-9911-E418EF5533BC}.Release AVX2|x64.ActiveCfg = Release|Win32 {E613DA9F-41B4-4613-9911-E418EF5533BC}.Release SSE2|Win32.ActiveCfg = Release|Win32 {E613DA9F-41B4-4613-9911-E418EF5533BC}.Release SSE2|Win32.Build.0 = Release|Win32 {E613DA9F-41B4-4613-9911-E418EF5533BC}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -1193,6 +1386,9 @@ Global {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Debug AVX|Win32.Build.0 = Debug|Win32 {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Debug SSE2|Win32.Build.0 = Debug|Win32 {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -1211,6 +1407,9 @@ Global {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Release AVX|Win32.ActiveCfg = Release|Win32 {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Release AVX|Win32.Build.0 = Release|Win32 {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Release AVX|x64.ActiveCfg = Release|Win32 + {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Release AVX2|Win32.Build.0 = Release|Win32 + {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Release AVX2|x64.ActiveCfg = Release|Win32 {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Release SSE2|Win32.ActiveCfg = Release|Win32 {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Release SSE2|Win32.Build.0 = Release|Win32 {019773FA-2DAA-4C12-9511-BD2D4EB2A718}.Release SSE2|x64.ActiveCfg = Release|Win32 @@ -1226,6 +1425,9 @@ Global {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Debug AVX|Win32.ActiveCfg = Debug|Win32 {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Debug AVX|Win32.Build.0 = Debug|Win32 {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Debug AVX|x64.ActiveCfg = Debug|Win32 + {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Debug AVX2|x64.ActiveCfg = Debug|Win32 {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Debug SSE2|Win32.Build.0 = Debug|Win32 {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Debug SSE2|x64.ActiveCfg = Debug|Win32 @@ -1244,6 +1446,9 @@ Global {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Release AVX|Win32.ActiveCfg = Release|Win32 {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Release AVX|Win32.Build.0 = Release|Win32 {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Release AVX|x64.ActiveCfg = Release|Win32 + {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Release AVX2|Win32.Build.0 = Release|Win32 + {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Release AVX2|x64.ActiveCfg = Release|Win32 {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Release SSE2|Win32.ActiveCfg = Release|Win32 
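
After the last few configuration mappings the patch moves into the GSdx sources: GS.cpp's benchmark log path changes to c:\temp1\log.txt, GSBlock.cpp/GSBlock.h add _M_SSE >= 0x501 (AVX2) variants of the swizzle masks as 256-bit GSVector8i constants, and the ReadColumn*/ReadBlock* helpers drop their aligned template parameter while gaining AVX2 paths that replace four 128-bit GSVector4i loads per column with two 256-bit GSVector8i loads followed by sw128/sw64 lane swaps. In raw intrinsics, the shape of the new ReadColumn32 path is roughly the sketch below (function and variable names are illustrative, and the exact permute immediates of GSdx's sw128/sw64 helpers are an assumption; src and dst are taken to be 32-byte aligned, matching the aligned loads the patch uses):

#include <immintrin.h>

static void ReadColumn32_avx2_sketch(const unsigned char* src, unsigned char* dst, int dstpitch, int i)
{
	const __m256i* s = (const __m256i*)src;

	__m256i v0 = _mm256_load_si256(&s[i * 2 + 0]); // 8 pixels
	__m256i v1 = _mm256_load_si256(&s[i * 2 + 1]); // 8 pixels

	// sw128-style step: regroup the low 128-bit lanes into one register
	// and the high 128-bit lanes into the other
	__m256i t0 = _mm256_permute2x128_si256(v0, v1, 0x20);
	__m256i t1 = _mm256_permute2x128_si256(v0, v1, 0x31);

	// sw64-style step: interleave 64-bit elements so each register ends up
	// holding one destination row of the column
	v0 = _mm256_unpacklo_epi64(t0, t1);
	v1 = _mm256_unpackhi_epi64(t0, t1);

	_mm256_store_si256((__m256i*)&dst[dstpitch * 0], v0);
	_mm256_store_si256((__m256i*)&dst[dstpitch * 1], v1);
}
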
{BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Release SSE2|Win32.Build.0 = Release|Win32 {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Release SSE2|x64.ActiveCfg = Release|Win32 diff --git a/plugins/GSdx/GS.cpp b/plugins/GSdx/GS.cpp index 8cd32a1fb3..b17a4b3f95 100644 --- a/plugins/GSdx/GS.cpp +++ b/plugins/GSdx/GS.cpp @@ -1112,7 +1112,7 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow { ::SetPriorityClass(::GetCurrentProcess(), HIGH_PRIORITY_CLASS); - FILE* file = fopen("c:\\log.txt", "a"); + FILE* file = fopen("c:\\temp1\\log.txt", "a"); fprintf(file, "-------------------------\n\n"); diff --git a/plugins/GSdx/GSBlock.cpp b/plugins/GSdx/GSBlock.cpp index 8a7a217c44..8693d9c38d 100644 --- a/plugins/GSdx/GSBlock.cpp +++ b/plugins/GSdx/GSBlock.cpp @@ -22,14 +22,27 @@ #include "stdafx.h" #include "GSBlock.h" +#if _M_SSE >= 0x501 +const GSVector8i GSBlock::m_r16mask( + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15); +#else const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15); +#endif const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15); const GSVector4i GSBlock::m_r4mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); +#if _M_SSE >= 0x501 +const GSVector8i GSBlock::m_xxxa(0x00008000); +const GSVector8i GSBlock::m_xxbx(0x00007c00); +const GSVector8i GSBlock::m_xgxx(0x000003e0); +const GSVector8i GSBlock::m_rxxx(0x0000001f); +#else const GSVector4i GSBlock::m_xxxa(0x00008000); const GSVector4i GSBlock::m_xxbx(0x00007c00); const GSVector4i GSBlock::m_xgxx(0x000003e0); const GSVector4i GSBlock::m_rxxx(0x0000001f); +#endif const GSVector4i GSBlock::m_uw8hmask0 = GSVector4i(0, 0, 0, 0, 1, 1, 1, 1, 8, 8, 8, 8, 9, 9, 9, 9); const GSVector4i GSBlock::m_uw8hmask1 = GSVector4i(2, 2, 2, 2, 3, 3, 3, 3, 10, 10, 10, 10, 11, 11, 11, 11); diff --git a/plugins/GSdx/GSBlock.h b/plugins/GSdx/GSBlock.h index 4890584c4c..fc4b85cbc4 100644 --- a/plugins/GSdx/GSBlock.h +++ b/plugins/GSdx/GSBlock.h @@ -27,14 +27,25 @@ class GSBlock { + #if _M_SSE >= 0x501 + static const GSVector8i m_r16mask; + #else static const GSVector4i m_r16mask; + #endif static const GSVector4i m_r8mask; static const GSVector4i m_r4mask; + #if _M_SSE >= 0x501 + static const GSVector8i m_xxxa; + static const GSVector8i m_xxbx; + static const GSVector8i m_xgxx; + static const GSVector8i m_rxxx; + #else static const GSVector4i m_xxxa; static const GSVector4i m_xxbx; static const GSVector4i m_xgxx; static const GSVector4i m_rxxx; + #endif static const GSVector4i m_uw8hmask0; static const GSVector4i m_uw8hmask1; @@ -277,41 +288,62 @@ public: WriteColumn4<3, aligned>(dst, src, srcpitch); } - template __forceinline static void ReadColumn32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) + template __forceinline static void ReadColumn32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { - GSVector4i v0, v1, v2, v3; + #if _M_SSE >= 0x501 - if(aligned) - { - const GSVector4i* s = (const GSVector4i*)src; + const GSVector8i* s = (const GSVector8i*)src; - v0 = s[i * 4 + 0]; - v1 = s[i * 4 + 1]; - v2 = s[i * 4 + 2]; - v3 = s[i * 4 + 3]; + GSVector8i v0 = s[i * 2 + 0]; + GSVector8i v1 = s[i * 2 + 1]; - GSVector4i::sw64(v0, v1, v2, v3); - } - else - { - v0 = GSVector4i::load(&src[i * 64 + 0], &src[i * 64 + 16]); - v1 = GSVector4i::load(&src[i * 64 + 32], &src[i * 64 + 48]); - v2 = GSVector4i::load(&src[i * 64 + 8], &src[i * 64 + 24]); - v3 = GSVector4i::load(&src[i * 64 + 
40], &src[i * 64 + 56]); - } + GSVector8i::sw128(v0, v1); + GSVector8i::sw64(v0, v1); + + GSVector8i::store(&dst[dstpitch * 0], v0); + GSVector8i::store(&dst[dstpitch * 1], v1); + + #else + + const GSVector4i* s = (const GSVector4i*)src; + + GSVector4i v0 = s[i * 4 + 0]; + GSVector4i v1 = s[i * 4 + 1]; + GSVector4i v2 = s[i * 4 + 2]; + GSVector4i v3 = s[i * 4 + 3]; + + GSVector4i::sw64(v0, v1, v2, v3); GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0]; GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1]; - GSVector4i::store(&d0[0], v0); - GSVector4i::store(&d0[1], v1); - GSVector4i::store(&d1[0], v2); - GSVector4i::store(&d1[1], v3); + GSVector4i::store(&d0[0], v0); + GSVector4i::store(&d0[1], v1); + GSVector4i::store(&d1[0], v2); + GSVector4i::store(&d1[1], v3); + + #endif } - template __forceinline static void ReadColumn16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) + template __forceinline static void ReadColumn16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { - #if _M_SSE >= 0x301 + #if _M_SSE >= 0x501 + + const GSVector8i* s = (const GSVector8i*)src; + + GSVector8i v0 = s[i * 2 + 0].shuffle8(m_r16mask); + GSVector8i v1 = s[i * 2 + 1].shuffle8(m_r16mask); + + GSVector8i::sw128(v0, v1); + GSVector8i::sw32(v0, v1); + + v0 = v0.acbd(); + v1 = v1.acbd(); + + GSVector8i::store(&dst[dstpitch * 0], v0); + GSVector8i::store(&dst[dstpitch * 1], v1); + + #elif _M_SSE >= 0x301 const GSVector4i* s = (const GSVector4i*)src; @@ -326,10 +358,10 @@ public: GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0]; GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1]; - GSVector4i::store(&d0[0], v0); - GSVector4i::store(&d0[1], v2); - GSVector4i::store(&d1[0], v1); - GSVector4i::store(&d1[1], v3); + GSVector4i::store(&d0[0], v0); + GSVector4i::store(&d0[1], v2); + GSVector4i::store(&d1[0], v1); + GSVector4i::store(&d1[1], v3); #else @@ -340,6 +372,8 @@ public: GSVector4i v2 = s[i * 4 + 2]; GSVector4i v3 = s[i * 4 + 3]; + //for(int16 i = 0; i < 8; i++) {v0.i16[i] = i; v1.i16[i] = i + 8; v2.i16[i] = i + 16; v3.i16[i] = i + 24;} + GSVector4i::sw16(v0, v1, v2, v3); GSVector4i::sw32(v0, v1, v2, v3); GSVector4i::sw16(v0, v2, v1, v3); @@ -347,15 +381,15 @@ public: GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0]; GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1]; - GSVector4i::store(&d0[0], v0); - GSVector4i::store(&d0[1], v1); - GSVector4i::store(&d1[0], v2); - GSVector4i::store(&d1[1], v3); + GSVector4i::store(&d0[0], v0); + GSVector4i::store(&d0[1], v1); + GSVector4i::store(&d1[0], v2); + GSVector4i::store(&d1[1], v3); #endif } - template __forceinline static void ReadColumn8(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) + template __forceinline static void ReadColumn8(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { #if _M_SSE >= 0x301 @@ -386,10 +420,10 @@ public: GSVector4i::sw16(v0, v1, v2, v3); GSVector4i::sw32(v0, v1, v3, v2); - GSVector4i::store(&dst[dstpitch * 0], v0); - GSVector4i::store(&dst[dstpitch * 1], v3); - GSVector4i::store(&dst[dstpitch * 2], v1); - GSVector4i::store(&dst[dstpitch * 3], v2); + GSVector4i::store(&dst[dstpitch * 0], v0); + GSVector4i::store(&dst[dstpitch * 1], v3); + GSVector4i::store(&dst[dstpitch * 2], v1); + GSVector4i::store(&dst[dstpitch * 3], v2); #else @@ -416,15 +450,15 @@ public: v1 = v1.yxwz(); } - GSVector4i::store(&dst[dstpitch * 0], v0); - GSVector4i::store(&dst[dstpitch * 1], v1); - GSVector4i::store(&dst[dstpitch * 2], v2); - GSVector4i::store(&dst[dstpitch * 3], v3); + GSVector4i::store(&dst[dstpitch * 0], v0); 
+ GSVector4i::store(&dst[dstpitch * 1], v1); + GSVector4i::store(&dst[dstpitch * 2], v2); + GSVector4i::store(&dst[dstpitch * 3], v3); #endif } - template __forceinline static void ReadColumn4(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) + template __forceinline static void ReadColumn4(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { #if _M_SSE >= 0x301 @@ -453,10 +487,10 @@ public: GSVector4i::sw16rl(v0, v1, v2, v3); } - GSVector4i::store(&dst[dstpitch * 0], v0); - GSVector4i::store(&dst[dstpitch * 1], v1); - GSVector4i::store(&dst[dstpitch * 2], v2); - GSVector4i::store(&dst[dstpitch * 3], v3); + GSVector4i::store(&dst[dstpitch * 0], v0); + GSVector4i::store(&dst[dstpitch * 1], v1); + GSVector4i::store(&dst[dstpitch * 2], v2); + GSVector4i::store(&dst[dstpitch * 3], v3); #else @@ -491,104 +525,104 @@ public: v1 = v1.yxwzlh(); } - GSVector4i::store(&dst[dstpitch * 0], v0); - GSVector4i::store(&dst[dstpitch * 1], v1); - GSVector4i::store(&dst[dstpitch * 2], v2); - GSVector4i::store(&dst[dstpitch * 3], v3); + GSVector4i::store(&dst[dstpitch * 0], v0); + GSVector4i::store(&dst[dstpitch * 1], v1); + GSVector4i::store(&dst[dstpitch * 2], v2); + GSVector4i::store(&dst[dstpitch * 3], v3); #endif } - template static void ReadColumn32(int y, const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) + static void ReadColumn32(int y, const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { switch((y >> 1) & 3) { - case 0: ReadColumn32<0, aligned>(src, dst, dstpitch); break; - case 1: ReadColumn32<1, aligned>(src, dst, dstpitch); break; - case 2: ReadColumn32<2, aligned>(src, dst, dstpitch); break; - case 3: ReadColumn32<3, aligned>(src, dst, dstpitch); break; + case 0: ReadColumn32<0>(src, dst, dstpitch); break; + case 1: ReadColumn32<1>(src, dst, dstpitch); break; + case 2: ReadColumn32<2>(src, dst, dstpitch); break; + case 3: ReadColumn32<3>(src, dst, dstpitch); break; default: __assume(0); } } - template static void ReadColumn16(int y, const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) + static void ReadColumn16(int y, const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { switch((y >> 1) & 3) { - case 0: ReadColumn16<0, aligned>(src, dst, dstpitch); break; - case 1: ReadColumn16<1, aligned>(src, dst, dstpitch); break; - case 2: ReadColumn16<2, aligned>(src, dst, dstpitch); break; - case 3: ReadColumn16<3, aligned>(src, dst, dstpitch); break; + case 0: ReadColumn16<0>(src, dst, dstpitch); break; + case 1: ReadColumn16<1>(src, dst, dstpitch); break; + case 2: ReadColumn16<2>(src, dst, dstpitch); break; + case 3: ReadColumn16<3>(src, dst, dstpitch); break; default: __assume(0); } } - template static void ReadColumn8(int y, const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) + static void ReadColumn8(int y, const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { switch((y >> 2) & 3) { - case 0: ReadColumn8<0, aligned>(src, dst, dstpitch); break; - case 1: ReadColumn8<1, aligned>(src, dst, dstpitch); break; - case 2: ReadColumn8<2, aligned>(src, dst, dstpitch); break; - case 3: ReadColumn8<3, aligned>(src, dst, dstpitch); break; + case 0: ReadColumn8<0>(src, dst, dstpitch); break; + case 1: ReadColumn8<1>(src, dst, dstpitch); break; + case 2: ReadColumn8<2>(src, dst, dstpitch); break; + case 3: ReadColumn8<3>(src, dst, dstpitch); break; default: __assume(0); } } - template static void ReadColumn4(int y, const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) + static void ReadColumn4(int y, const uint8* RESTRICT 
src, uint8* RESTRICT dst, int dstpitch) { switch((y >> 2) & 3) { - case 0: ReadColumn4<0, aligned>(src, dst, dstpitch); break; - case 1: ReadColumn4<1, aligned>(src, dst, dstpitch); break; - case 2: ReadColumn4<2, aligned>(src, dst, dstpitch); break; - case 3: ReadColumn4<3, aligned>(src, dst, dstpitch); break; + case 0: ReadColumn4<0>(src, dst, dstpitch); break; + case 1: ReadColumn4<1>(src, dst, dstpitch); break; + case 2: ReadColumn4<2>(src, dst, dstpitch); break; + case 3: ReadColumn4<3>(src, dst, dstpitch); break; default: __assume(0); } } - template static void ReadBlock32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) + static void ReadBlock32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { - ReadColumn32<0, aligned>(src, dst, dstpitch); + ReadColumn32<0>(src, dst, dstpitch); dst += dstpitch * 2; - ReadColumn32<1, aligned>(src, dst, dstpitch); + ReadColumn32<1>(src, dst, dstpitch); dst += dstpitch * 2; - ReadColumn32<2, aligned>(src, dst, dstpitch); + ReadColumn32<2>(src, dst, dstpitch); dst += dstpitch * 2; - ReadColumn32<3, aligned>(src, dst, dstpitch); + ReadColumn32<3>(src, dst, dstpitch); } - template static void ReadBlock16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) + static void ReadBlock16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { - ReadColumn16<0, aligned>(src, dst, dstpitch); + ReadColumn16<0>(src, dst, dstpitch); dst += dstpitch * 2; - ReadColumn16<1, aligned>(src, dst, dstpitch); + ReadColumn16<1>(src, dst, dstpitch); dst += dstpitch * 2; - ReadColumn16<2, aligned>(src, dst, dstpitch); + ReadColumn16<2>(src, dst, dstpitch); dst += dstpitch * 2; - ReadColumn16<3, aligned>(src, dst, dstpitch); + ReadColumn16<3>(src, dst, dstpitch); } - template static void ReadBlock8(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) + static void ReadBlock8(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { - ReadColumn8<0, aligned>(src, dst, dstpitch); + ReadColumn8<0>(src, dst, dstpitch); dst += dstpitch * 4; - ReadColumn8<1, aligned>(src, dst, dstpitch); + ReadColumn8<1>(src, dst, dstpitch); dst += dstpitch * 4; - ReadColumn8<2, aligned>(src, dst, dstpitch); + ReadColumn8<2>(src, dst, dstpitch); dst += dstpitch * 4; - ReadColumn8<3, aligned>(src, dst, dstpitch); + ReadColumn8<3>(src, dst, dstpitch); } - template static void ReadBlock4(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) + static void ReadBlock4(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { - ReadColumn4<0, aligned>(src, dst, dstpitch); + ReadColumn4<0>(src, dst, dstpitch); dst += dstpitch * 4; - ReadColumn4<1, aligned>(src, dst, dstpitch); + ReadColumn4<1>(src, dst, dstpitch); dst += dstpitch * 4; - ReadColumn4<2, aligned>(src, dst, dstpitch); + ReadColumn4<2>(src, dst, dstpitch); dst += dstpitch * 4; - ReadColumn4<3, aligned>(src, dst, dstpitch); + ReadColumn4<3>(src, dst, dstpitch); } __forceinline static void ReadBlock4P(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) @@ -850,8 +884,39 @@ public: } } + template __forceinline static V Expand24to32(const V& c, const V& TA0) + { + return c | (AEM ? TA0.andnot(c == V::zero()) : TA0); // TA0 & (c != GSVector4i::zero()) + } + + template __forceinline static V Expand16to32(const V& c, const V& TA0, const V& TA1) + { + return ((c & m_rxxx) << 3) | ((c & m_xgxx) << 6) | ((c & m_xxbx) << 9) | (AEM ? 
TA0.blend8(TA1, c.sra16(15)).andnot(c == V::zero()) : TA0.blend(TA1, c.sra16(15))); + } + template static void ExpandBlock24(const uint32* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) { + #if _M_SSE >= 0x501 + + const GSVector8i* s = (const GSVector8i*)src; + + GSVector8i TA0(TEXA.TA0 << 24); + GSVector8i mask = GSVector8i::x00ffffff(); + + for(int i = 0; i < 4; i++, dst += dstpitch * 2) + { + GSVector8i v0 = s[i * 2 + 0] & mask; + GSVector8i v1 = s[i * 2 + 1] & mask; + + GSVector8i* d0 = (GSVector8i*)&dst[dstpitch * 0]; + GSVector8i* d1 = (GSVector8i*)&dst[dstpitch * 1]; + + d0[0] = Expand24to32(v0, TA0); + d1[0] = Expand24to32(v1, TA0); + } + + #else + const GSVector4i* s = (const GSVector4i*)src; GSVector4i TA0(TEXA.TA0 << 24); @@ -867,68 +932,53 @@ public: GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0]; GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1]; - if(AEM) - { - d0[0] = v0 | TA0.andnot(v0 == GSVector4i::zero()); // TA0 & (v0 != GSVector4i::zero()) - d0[1] = v1 | TA0.andnot(v1 == GSVector4i::zero()); // TA0 & (v1 != GSVector4i::zero()) - d1[0] = v2 | TA0.andnot(v2 == GSVector4i::zero()); // TA0 & (v2 != GSVector4i::zero()) - d1[1] = v3 | TA0.andnot(v3 == GSVector4i::zero()); // TA0 & (v3 != GSVector4i::zero()) - } - else - { - d0[0] = v0 | TA0; - d0[1] = v1 | TA0; - d1[0] = v2 | TA0; - d1[1] = v3 | TA0; - } + d0[0] = Expand24to32(v0, TA0); + d0[1] = Expand24to32(v1, TA0); + d1[0] = Expand24to32(v2, TA0); + d1[1] = Expand24to32(v3, TA0); } + + #endif } template static void ExpandBlock16(const uint16* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) // do not inline, uses too many xmm regs { + #if _M_SSE >= 0x501 + + const GSVector8i* s = (const GSVector8i*)src; + + GSVector8i TA0(TEXA.TA0 << 24); + GSVector8i TA1(TEXA.TA1 << 24); + + for(int i = 0; i < 8; i++, dst += dstpitch) + { + GSVector8i v = s[i].acbd(); + + ((GSVector8i*)dst)[0] = Expand16to32(v.upl16(v), TA0, TA1); + ((GSVector8i*)dst)[1] = Expand16to32(v.uph16(v), TA0, TA1); + } + + #else + const GSVector4i* s = (const GSVector4i*)src; GSVector4i TA0(TEXA.TA0 << 24); GSVector4i TA1(TEXA.TA1 << 24); - GSVector4i rm = m_rxxx; - GSVector4i gm = m_xgxx; - GSVector4i bm = m_xxbx; - GSVector4i l, h; for(int i = 0; i < 8; i++, dst += dstpitch) { GSVector4i v0 = s[i * 2 + 0]; - l = v0.upl16(v0); - h = v0.uph16(v0); - - if(AEM) - { - ((GSVector4i*)dst)[0] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend8(TA1, l.sra16(15)).andnot(l == GSVector4i::zero()); - ((GSVector4i*)dst)[1] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend8(TA1, h.sra16(15)).andnot(h == GSVector4i::zero()); - } - else - { - ((GSVector4i*)dst)[0] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend(TA1, l.sra16(15)); - ((GSVector4i*)dst)[1] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend(TA1, h.sra16(15)); - } + ((GSVector4i*)dst)[0] = Expand16to32(v0.upl16(v0), TA0, TA1); + ((GSVector4i*)dst)[1] = Expand16to32(v0.uph16(v0), TA0, TA1); GSVector4i v1 = s[i * 2 + 1]; - l = v1.upl16(v1); - h = v1.uph16(v1); - - if(AEM) - { - ((GSVector4i*)dst)[2] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend8(TA1, l.sra16(15)).andnot(l == GSVector4i::zero()); - ((GSVector4i*)dst)[3] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend8(TA1, h.sra16(15)).andnot(h == GSVector4i::zero()); - } - else - { - ((GSVector4i*)dst)[2] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend(TA1, l.sra16(15)); - ((GSVector4i*)dst)[3] = 
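// Scalar rendition of the rule the new Expand24to32/Expand16to32 templates
// above implement, assuming standard GIFRegTEXA semantics (TA0/TA1 are the
// replacement alpha bytes, AEM zeroes the alpha of all-zero source pixels);
// sketch only, the shipped code stays vectorized:

static inline uint32 Expand24to32_scalar(uint32 c24, uint8 ta0, bool aem)
{
    uint32 rgb = c24 & 0x00ffffff;
    uint32 a = (aem && rgb == 0) ? 0 : ta0;
    return rgb | (a << 24);
}

static inline uint32 Expand16to32_scalar(uint16 c, uint8 ta0, uint8 ta1, bool aem)
{
    uint32 r = (uint32)(c & 0x001f) << 3; // 5-bit red   -> bits  3..7
    uint32 g = (uint32)(c & 0x03e0) << 6; // 5-bit green -> bits 11..15
    uint32 b = (uint32)(c & 0x7c00) << 9; // 5-bit blue  -> bits 19..23
    uint32 a = (aem && c == 0) ? 0 : ((c & 0x8000) ? ta1 : ta0);
    return r | g | b | (a << 24);
}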
((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend(TA1, h.sra16(15)); - } + ((GSVector4i*)dst)[2] = Expand16to32(v1.upl16(v1), TA0, TA1); + ((GSVector4i*)dst)[3] = Expand16to32(v1.uph16(v1), TA0, TA1); } + + #endif } __forceinline static void ExpandBlock8_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal) @@ -1386,6 +1436,33 @@ public: template __forceinline static void ReadAndExpandBlock24(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) { + #if _M_SSE >= 0x501 + + const GSVector8i* s = (const GSVector8i*)src; + + GSVector8i TA0(TEXA.TA0 << 24); + GSVector8i mask = GSVector8i::x00ffffff(); + + for(int i = 0; i < 4; i++, dst += dstpitch * 2) + { + GSVector8i v0 = s[i * 2 + 0]; + GSVector8i v1 = s[i * 2 + 1]; + + GSVector8i::sw128(v0, v1); + GSVector8i::sw64(v0, v1); + + v0 &= mask; + v1 &= mask; + + GSVector8i* d0 = (GSVector8i*)&dst[dstpitch * 0]; + GSVector8i* d1 = (GSVector8i*)&dst[dstpitch * 1]; + + d0[0] = Expand24to32(v0, TA0); + d1[0] = Expand24to32(v1, TA0); + } + + #else + const GSVector4i* s = (const GSVector4i*)src; GSVector4i TA0(TEXA.TA0 << 24); @@ -1408,30 +1485,42 @@ public: GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0]; GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1]; - if(AEM) - { - d0[0] = v0 | TA0.andnot(v0 == GSVector4i::zero()); // TA0 & (v0 != GSVector4i::zero()) - d0[1] = v1 | TA0.andnot(v1 == GSVector4i::zero()); // TA0 & (v1 != GSVector4i::zero()) - d1[0] = v2 | TA0.andnot(v2 == GSVector4i::zero()); // TA0 & (v2 != GSVector4i::zero()) - d1[1] = v3 | TA0.andnot(v3 == GSVector4i::zero()); // TA0 & (v3 != GSVector4i::zero()) - } - else - { - d0[0] = v0 | TA0; - d0[1] = v1 | TA0; - d1[0] = v2 | TA0; - d1[1] = v3 | TA0; - } + d0[0] = Expand24to32(v0, TA0); + d0[1] = Expand24to32(v1, TA0); + d1[0] = Expand24to32(v2, TA0); + d1[1] = Expand24to32(v3, TA0); } - } - template __forceinline static GSVector4i Expand16to32(const GSVector4i& c, const GSVector4i& TA0, const GSVector4i& TA1) - { - return ((c & m_rxxx) << 3) | ((c & m_xgxx) << 6) | ((c & m_xxbx) << 9) | (AEM ? 
TA0.blend8(TA1, c.sra16(15)).andnot(c == GSVector4i::zero()) : TA0.blend(TA1, c.sra16(15))); + + #endif } template __forceinline static void ReadAndExpandBlock16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) { - #if 0 // not faster + #if _M_SSE >= 0x501 + + const GSVector8i* s = (const GSVector8i*)src; + + GSVector8i TA0(TEXA.TA0 << 24); + GSVector8i TA1(TEXA.TA1 << 24); + + for(int i = 0; i < 4; i++, dst += dstpitch * 2) + { + GSVector8i v0 = s[i * 2 + 0].shuffle8(m_r16mask); + GSVector8i v1 = s[i * 2 + 1].shuffle8(m_r16mask); + + GSVector8i::sw128(v0, v1); + GSVector8i::sw32(v0, v1); + + GSVector8i* d0 = (GSVector8i*)&dst[dstpitch * 0]; + GSVector8i* d1 = (GSVector8i*)&dst[dstpitch * 1]; + + d0[0] = Expand16to32(v0.upl16(v0), TA0, TA1); + d0[1] = Expand16to32(v0.uph16(v0), TA0, TA1); + d1[0] = Expand16to32(v1.upl16(v1), TA0, TA1); + d1[1] = Expand16to32(v1.uph16(v1), TA0, TA1); + } + + #elif 0 // not faster const GSVector4i* s = (const GSVector4i*)src; @@ -1468,7 +1557,7 @@ public: __aligned(uint16, 32) block[16 * 8]; - ReadBlock16(src, (uint8*)block, sizeof(block) / 8); + ReadBlock16(src, (uint8*)block, sizeof(block) / 8); ExpandBlock16(block, dst, dstpitch, TEXA); @@ -1525,7 +1614,7 @@ public: __aligned(uint8, 32) block[16 * 16]; - ReadBlock8(src, (uint8*)block, sizeof(block) / 16); + ReadBlock8(src, (uint8*)block, sizeof(block) / 16); ExpandBlock8_32(block, dst, dstpitch, pal); @@ -1600,7 +1689,7 @@ public: __aligned(uint8, 32) block[(32 / 2) * 16]; - ReadBlock4(src, (uint8*)block, sizeof(block) / 16); + ReadBlock4(src, (uint8*)block, sizeof(block) / 16); ExpandBlock4_32(block, dst, dstpitch, pal); @@ -1641,7 +1730,7 @@ public: __aligned(uint32, 32) block[8 * 8]; - ReadBlock32(src, (uint8*)block, sizeof(block) / 8); + ReadBlock32(src, (uint8*)block, sizeof(block) / 8); ExpandBlock8H_32(block, dst, dstpitch, pal); @@ -1682,7 +1771,7 @@ public: __aligned(uint32, 32) block[8 * 8]; - ReadBlock32(src, (uint8*)block, sizeof(block) / 8); + ReadBlock32(src, (uint8*)block, sizeof(block) / 8); ExpandBlock4HL_32(block, dst, dstpitch, pal); @@ -1723,7 +1812,7 @@ public: __aligned(uint32, 32) block[8 * 8]; - ReadBlock32(src, (uint8*)block, sizeof(block) / 8); + ReadBlock32(src, (uint8*)block, sizeof(block) / 8); ExpandBlock4HH_32(block, dst, dstpitch, pal); diff --git a/plugins/GSdx/GSDrawScanline.cpp b/plugins/GSdx/GSDrawScanline.cpp index 952c02ada2..01ff9d3dfe 100644 --- a/plugins/GSdx/GSDrawScanline.cpp +++ b/plugins/GSdx/GSDrawScanline.cpp @@ -1553,19 +1553,30 @@ void GSDrawScanline::DrawRectT(const int* RESTRICT row, const int* RESTRICT col, { if(m == 0xffffffff) return; + #if _M_SSE >= 0x501 + + GSVector8i color((int)c); + GSVector8i mask((int)m); + + #else + GSVector4i color((int)c); GSVector4i mask((int)m); + #endif + if(sizeof(T) == sizeof(uint16)) { color = color.xxzzlh(); mask = mask.xxzzlh(); + c = (c & 0xffff) | (c << 16); + m = (m & 0xffff) | (m << 16); } - if(masked) ASSERT(mask.u32[0] != 0); - color = color.andnot(mask); - c = color.extract32<0>(); + c = c & (~m); + + if(masked) ASSERT(mask.u32[0] != 0); GSVector4i br = r.ralign(GSVector2i(8 * 4 / sizeof(T), 8)); @@ -1606,6 +1617,37 @@ void GSDrawScanline::FillRect(const int* RESTRICT row, const int* RESTRICT col, } } +#if _M_SSE >= 0x501 + +template +void GSDrawScanline::FillBlock(const int* RESTRICT row, const int* RESTRICT col, const GSVector4i& r, const GSVector8i& c, const GSVector8i& m) +{ + if(r.x >= r.z) return; + + T* vm = (T*)m_global.vm; + + for(int y = r.y; y < r.w; y += 8) + { + 
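// DrawRectT above already clears the masked bits of c (c = c & ~m), so each
// masked store below reduces to one AND and one OR: keep the protected
// destination bits, merge in the new ones. Scalar sketch of the rule, where m
// has 1-bits wherever the destination must be preserved:

static inline uint32 masked_store_sketch(uint32 dst, uint32 c, uint32 m)
{
    return c | (dst & m); // c is assumed to be (color & ~m) already
}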
T* RESTRICT d = &vm[row[y]]; + + for(int x = r.x; x < r.z; x += 8 * 4 / sizeof(T)) + { + GSVector8i* RESTRICT p = (GSVector8i*)&d[col[x]]; + + p[0] = !masked ? c : (c | (p[0] & m)); + p[1] = !masked ? c : (c | (p[1] & m)); + p[2] = !masked ? c : (c | (p[2] & m)); + p[3] = !masked ? c : (c | (p[3] & m)); + p[4] = !masked ? c : (c | (p[4] & m)); + p[5] = !masked ? c : (c | (p[5] & m)); + p[6] = !masked ? c : (c | (p[6] & m)); + p[7] = !masked ? c : (c | (p[7] & m)); + } + } +} + +#else + template void GSDrawScanline::FillBlock(const int* RESTRICT row, const int* RESTRICT col, const GSVector4i& r, const GSVector4i& c, const GSVector4i& m) { @@ -1631,3 +1673,5 @@ void GSDrawScanline::FillBlock(const int* RESTRICT row, const int* RESTRICT col, } } } + +#endif diff --git a/plugins/GSdx/GSDrawScanline.h b/plugins/GSdx/GSDrawScanline.h index 3239f3a680..75a29c28aa 100644 --- a/plugins/GSdx/GSDrawScanline.h +++ b/plugins/GSdx/GSDrawScanline.h @@ -49,9 +49,18 @@ protected: template __forceinline void FillRect(const int* RESTRICT row, const int* RESTRICT col, const GSVector4i& r, uint32 c, uint32 m); + #if _M_SSE >= 0x501 + + template + __forceinline void FillBlock(const int* RESTRICT row, const int* RESTRICT col, const GSVector4i& r, const GSVector8i& c, const GSVector8i& m); + + #else + template __forceinline void FillBlock(const int* RESTRICT row, const int* RESTRICT col, const GSVector4i& r, const GSVector4i& c, const GSVector4i& m); + #endif + public: GSDrawScanline(); virtual ~GSDrawScanline(); diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp index 8fa7ec3afe..9db0ea64d0 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -31,6 +31,7 @@ static const int _v = _args + 8; void GSDrawScanlineCodeGenerator::Generate() { +//ret(8); push(ebx); push(esi); push(edi); @@ -1143,13 +1144,6 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() if(!m_sel.lcm) { - // store u/v - - vpunpckldq(xmm0, xmm2, xmm3); - vmovdqa(ptr[&m_local.temp.uv[0]], xmm0); - vpunpckhdq(xmm0, xmm2, xmm3); - vmovdqa(ptr[&m_local.temp.uv[1]], xmm0); - // lod = -log2(Q) * (1 << L) + K vpcmpeqd(xmm1, xmm1); @@ -1167,18 +1161,37 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // xmm4 = mant(q) | 1.0f - vmulps(xmm5, xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]); - vaddps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]); - vmulps(xmm5, xmm4); - vsubps(xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]); - vaddps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]); - vmulps(xmm4, xmm5); - vaddps(xmm4, xmm0); + if(m_cpu.has(util::Cpu::tFMA)) + { + vmovaps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]); // c0 + vfmadd213ps(xmm5, xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]); // c0 * xmm4 + c1 + vfmadd213ps(xmm5, xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]); // (c0 * xmm4 + c1) * xmm4 + c2 + vsubps(xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]); // xmm4 - 1.0f + vfmadd213ps(xmm4, xmm5, xmm0); // ((c0 * xmm4 + c1) * xmm4 + c2) * (xmm4 - 1.0f) + xmm0 + } + else + { + vmulps(xmm5, xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]); + vaddps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]); + vmulps(xmm5, xmm4); + vsubps(xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]); + vaddps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]); + vmulps(xmm4, xmm5); + vaddps(xmm4, xmm0); + } // xmm4 = 
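// The vfmadd213ps sequence above folds the mul/add chain of the log2 polynomial
// into fused multiply-adds when the CPU reports FMA. Intrinsic sketch of the
// same evaluation, assuming FMA3 and taking c0..c2 as the m_log2_coef constants
// (needs <immintrin.h>):

static inline __m128 log2_poly_fma_sketch(__m128 mant, __m128 e, __m128 c0, __m128 c1, __m128 c2)
{
    __m128 p = _mm_fmadd_ps(c0, mant, c1);          // c0 * m + c1
    p = _mm_fmadd_ps(p, mant, c2);                  // (c0 * m + c1) * m + c2
    __m128 t = _mm_sub_ps(mant, _mm_set1_ps(1.0f)); // m - 1.0f
    return _mm_fmadd_ps(p, t, e);                   // p * (m - 1.0f) + exponent
}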
log2(Q) = ((((c0 * xmm4) + c1) * xmm4) + c2) * (xmm4 - 1.0f) + xmm0 - vmulps(xmm4, ptr[&m_local.gd->l]); - vaddps(xmm4, ptr[&m_local.gd->k]); + if(m_cpu.has(util::Cpu::tFMA)) + { + vmovaps(xmm5, ptr[&m_local.gd->l]); + vfmadd213ps(xmm4, xmm5, ptr[&m_local.gd->k]); + } + else + { + vmulps(xmm4, ptr[&m_local.gd->l]); + vaddps(xmm4, ptr[&m_local.gd->k]); + } // xmm4 = (-log2(Q) * (1 << L) + K) * 0x10000 @@ -1196,6 +1209,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() } vpsrld(xmm0, xmm4, 16); + vmovdqa(ptr[&m_local.temp.lod.i], xmm0); /* vpslld(xmm5, xmm0, 6); @@ -1205,58 +1219,93 @@ return; */ if(m_sel.mmin == 2) // trilinear mode { - vpshuflw(xmm0, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vmovdqa(ptr[&m_local.temp.lod.f], xmm0); + vpshuflw(xmm1, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0)); + vmovdqa(ptr[&m_local.temp.lod.f], xmm1); } - // shift u/v by (int)lod + // shift u/v/minmax by (int)lod - vmovq(xmm4, ptr[&m_local.gd->t.minmax]); + if(m_cpu.has(util::Cpu::tAVX2)) + { + vpsravd(xmm2, xmm2, xmm0); + vpsravd(xmm3, xmm3, xmm0); - vmovdqa(xmm2, ptr[&m_local.temp.uv[0]]); - vmovdqa(xmm5, xmm2); - vmovdqa(xmm3, ptr[&m_local.temp.uv[1]]); - vmovdqa(xmm6, xmm3); + vmovdqa(ptr[&m_local.temp.uv[0]], xmm2); + vmovdqa(ptr[&m_local.temp.uv[1]], xmm3); - vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[0]]); - vpsrad(xmm2, xmm0); - vpsrlw(xmm1, xmm4, xmm0); - vmovq(ptr[&m_local.temp.uv_minmax[0].u32[0]], xmm1); + // m_local.gd->t.minmax => m_local.temp.uv_minmax[0/1] - vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[1]]); - vpsrad(xmm5, xmm0); - vpsrlw(xmm1, xmm4, xmm0); - vmovq(ptr[&m_local.temp.uv_minmax[1].u32[0]], xmm1); + vmovq(xmm4, ptr[&m_local.gd->t.minmax]); // x x x x maxv maxu minv minu + vpunpcklwd(xmm4, xmm4); // maxv maxv maxu maxu minv minv minu minu - vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[2]]); - vpsrad(xmm3, xmm0); - vpsrlw(xmm1, xmm4, xmm0); - vmovq(ptr[&m_local.temp.uv_minmax[0].u32[2]], xmm1); + vpxor(xmm1, xmm1); + + vpunpckldq(xmm6, xmm4, xmm4); // minv minv minv minv minu minu minu minu + vpunpcklwd(xmm5, xmm6, xmm1); // 0 minu 0 minu 0 minu 0 minu + vpsrlvd(xmm5, xmm5, xmm0); + vpunpckhwd(xmm6, xmm6, xmm1); // 0 minv 0 minv 0 minv 0 minv + vpsrlvd(xmm6, xmm6, xmm0); + vpackusdw(xmm5, xmm6); // xmm5 = minv minv minv minv minu minu minu minu + + vpunpckhdq(xmm4, xmm4); // maxv maxv maxv maxv maxu maxu maxu maxu + vpunpcklwd(xmm6, xmm4, xmm1); // 0 maxu 0 maxu 0 maxu 0 maxu + vpsrlvd(xmm6, xmm6, xmm0); + vpunpckhwd(xmm4, xmm1); // 0 maxv 0 maxv 0 maxv 0 maxv + vpsrlvd(xmm4, xmm4, xmm0); + vpackusdw(xmm6, xmm4); // xmm6 = maxv maxv maxv maxv maxu maxu maxu maxu - vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[3]]); - vpsrad(xmm6, xmm0); - vpsrlw(xmm1, xmm4, xmm0); - vmovq(ptr[&m_local.temp.uv_minmax[1].u32[2]], xmm1); + vmovdqa(ptr[&m_local.temp.uv_minmax[0]], xmm5); + vmovdqa(ptr[&m_local.temp.uv_minmax[1]], xmm6); + } + else + { + vmovq(xmm4, ptr[&m_local.gd->t.minmax]); - vpunpckldq(xmm2, xmm3); - vpunpckhdq(xmm5, xmm6); - vpunpckhdq(xmm3, xmm2, xmm5); - vpunpckldq(xmm2, xmm5); + vpunpckldq(xmm5, xmm2, xmm3); + vpunpckhdq(xmm6, xmm2, xmm3); + vmovdqa(xmm2, xmm5); + vmovdqa(xmm3, xmm6); - vmovdqa(ptr[&m_local.temp.uv[0]], xmm2); - vmovdqa(ptr[&m_local.temp.uv[1]], xmm3); + vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[0]]); + vpsrad(xmm2, xmm0); + vpsrlw(xmm1, xmm4, xmm0); + vmovq(ptr[&m_local.temp.uv_minmax[0].u32[0]], xmm1); - vmovdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]); - vmovdqa(xmm6, 
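// The tAVX2 branch above swaps the four broadcast-and-shift sequences for
// variable-count shifts: vpsravd/vpsrlvd shift every 32-bit lane by its own
// count, so u/v and the widened min/max clamps can be scaled by each pixel's
// LOD at once. Intrinsic sketch (AVX2, needs <immintrin.h>; the min/max path
// would use the logical _mm_srlv_epi32 counterpart):

static inline __m128i shift_uv_by_lod_sketch(__m128i uv, __m128i lod)
{
    return _mm_srav_epi32(uv, lod); // per-lane arithmetic shift right, like vpsravd
}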
ptr[&m_local.temp.uv_minmax[1]]); + vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[1]]); + vpsrad(xmm5, xmm0); + vpsrlw(xmm1, xmm4, xmm0); + vmovq(ptr[&m_local.temp.uv_minmax[1].u32[0]], xmm1); - vpunpcklwd(xmm0, xmm5, xmm6); - vpunpckhwd(xmm1, xmm5, xmm6); - vpunpckldq(xmm5, xmm0, xmm1); - vpunpckhdq(xmm6, xmm0, xmm1); + vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[2]]); + vpsrad(xmm3, xmm0); + vpsrlw(xmm1, xmm4, xmm0); + vmovq(ptr[&m_local.temp.uv_minmax[0].u32[2]], xmm1); - vmovdqa(ptr[&m_local.temp.uv_minmax[0]], xmm5); - vmovdqa(ptr[&m_local.temp.uv_minmax[1]], xmm6); + vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[3]]); + vpsrad(xmm6, xmm0); + vpsrlw(xmm1, xmm4, xmm0); + vmovq(ptr[&m_local.temp.uv_minmax[1].u32[2]], xmm1); + + vpunpckldq(xmm2, xmm3); + vpunpckhdq(xmm5, xmm6); + vpunpckhdq(xmm3, xmm2, xmm5); + vpunpckldq(xmm2, xmm5); + + vmovdqa(ptr[&m_local.temp.uv[0]], xmm2); + vmovdqa(ptr[&m_local.temp.uv[1]], xmm3); + + vmovdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]); + vmovdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]); + + vpunpcklwd(xmm0, xmm5, xmm6); + vpunpckhwd(xmm1, xmm5, xmm6); + vpunpckldq(xmm5, xmm0, xmm1); + vpunpckhdq(xmm6, xmm0, xmm1); + + vmovdqa(ptr[&m_local.temp.uv_minmax[0]], xmm5); + vmovdqa(ptr[&m_local.temp.uv_minmax[1]], xmm6); + } } else { @@ -2842,12 +2891,22 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) } const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; + const int t[] = {4, 1, 5, 2}; for(int i = 0; i < pixels; i++) { - for(int j = 0; j < 4; j++) + if(m_cpu.has(util::Cpu::tAVX2) && !m_sel.tlu) // vpgatherdd seems to be dead slow for byte aligned offsets, not using it for palette lookups { - ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + Xmm mask = Xmm(t[i]); + vpcmpeqd(mask, mask); + vpgatherdd(Xmm(r[i * 2 + 1]), ptr[ebx + Xmm(r[i * 2 + 0]) * 4], mask); + } + else + { + for(int j = 0; j < 4; j++) + { + ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + } } } } diff --git a/plugins/GSdx/GSLocalMemory.cpp b/plugins/GSdx/GSLocalMemory.cpp index b9b9a3c457..0cd0279318 100644 --- a/plugins/GSdx/GSLocalMemory.cpp +++ b/plugins/GSdx/GSLocalMemory.cpp @@ -801,7 +801,7 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8* { case PSM_PSMCT32: case PSM_PSMZ32: - ReadColumn32(y, dst, buff, 32); + ReadColumn32(y, dst, buff, 32); memcpy(&buff[32], &src[x * 4], 32); WriteColumn32(y, dst, buff, 32); break; @@ -809,17 +809,17 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8* case PSM_PSMCT16S: case PSM_PSMZ16: case PSM_PSMZ16S: - ReadColumn16(y, dst, buff, 32); + ReadColumn16(y, dst, buff, 32); memcpy(&buff[32], &src[x * 2], 32); WriteColumn16(y, dst, buff, 32); break; case PSM_PSMT8: - ReadColumn8(y, dst, buff, 16); + ReadColumn8(y, dst, buff, 16); for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + x], 16); WriteColumn8(y, dst, buff, 16); break; case PSM_PSMT4: - ReadColumn4(y, dst, buff, 16); + ReadColumn4(y, dst, buff, 16); for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + (x >> 1)], 16); WriteColumn4(y, dst, buff, 16); break; @@ -882,7 +882,7 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8* { case PSM_PSMCT32: case PSM_PSMZ32: - ReadColumn32(y, dst, buff, 32); + ReadColumn32(y, dst, buff, 32); memcpy(&buff[0], &src[x * 4], 32); WriteColumn32(y, dst, buff, 32); break; @@ -890,17 +890,17 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8* case PSM_PSMCT16S: case 
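// The vpgatherdd path above fetches four texels per register with one gather
// (mask of all ones, as set by vpcmpeqd); the per-element loop is kept for
// palette lookups since, as the comment notes, gathers turned out very slow
// there. Intrinsic sketch of the non-palette case (AVX2, needs <immintrin.h>):

static inline __m128i gather_texels_sketch(const int* tex, __m128i addr)
{
    return _mm_i32gather_epi32(tex, addr, 4); // loads tex[addr[i]] into each lane
}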
PSM_PSMZ16: case PSM_PSMZ16S: - ReadColumn16(y, dst, buff, 32); + ReadColumn16(y, dst, buff, 32); memcpy(&buff[0], &src[x * 2], 32); WriteColumn16(y, dst, buff, 32); break; case PSM_PSMT8: - ReadColumn8(y, dst, buff, 16); + ReadColumn8(y, dst, buff, 16); for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + x], 16); WriteColumn8(y, dst, buff, 16); break; case PSM_PSMT4: - ReadColumn4(y, dst, buff, 16); + ReadColumn4(y, dst, buff, 16); for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + (x >> 1)], 16); WriteColumn4(y, dst, buff, 16); break; @@ -1157,6 +1157,7 @@ void GSLocalMemory::WriteImage4HH(int& tx, int& ty, const uint8* src, int len, G ty = th; } } + void GSLocalMemory::WriteImage24Z(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG) { if(TRXREG.RRW == 0) return; @@ -1190,6 +1191,7 @@ void GSLocalMemory::WriteImage24Z(int& tx, int& ty, const uint8* src, int len, G ty = th; } } + void GSLocalMemory::WriteImageX(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG) { if(len <= 0) return; @@ -1587,7 +1589,7 @@ void GSLocalMemory::ReadTexture32(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 8, 8, 32) { - ReadBlock32(src, dst, dstpitch); + ReadBlock32(src, dst, dstpitch); } FOREACH_BLOCK_END } @@ -1693,7 +1695,7 @@ void GSLocalMemory::ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, cons { ALIGN_STACK(32); - ReadBlock32(BlockPtr(bp), dst, dstpitch); + ReadBlock32(BlockPtr(bp), dst, dstpitch); } void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const @@ -1845,7 +1847,7 @@ void GSLocalMemory::ReadTexture8P(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 16, 16, 8) { - ReadBlock8(src, dst, dstpitch); + ReadBlock8(src, dst, dstpitch); } FOREACH_BLOCK_END } @@ -1890,7 +1892,7 @@ void GSLocalMemory::ReadTexture4HHP(const GSOffset* RESTRICT o, const GSVector4i void GSLocalMemory::ReadTextureBlock8P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { - ReadBlock8(BlockPtr(bp), dst, dstpitch); + ReadBlock8(BlockPtr(bp), dst, dstpitch); } void GSLocalMemory::ReadTextureBlock4P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp index 6f4ed66929..d11c31717c 100644 --- a/plugins/GSdx/GSRasterizer.cpp +++ b/plugins/GSdx/GSRasterizer.cpp @@ -489,7 +489,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const uint32* index) { edge = v1; - edge.p = (v0.p.xxxx() + ddx[m2] * dv[0].p.yyyy()).xyzw(edge.p); + edge.p = v0.p.xxxx().addm(ddx[m2], dv[0].p.yyyy()).xyzw(edge.p); dedge.p = ddx[2 - (m2 << 1)].yzzw(dedge.p); DrawTriangleSection(tb.y, tb.w, edge, dedge, dscan, v1.p); @@ -532,7 +532,7 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co GSVertexSW scan; - scan.p = edge.p + dedge.p * dy; + scan.p = edge.p.addm(dedge.p, dy); GSVector4 lrf = scan.p.ceil(); GSVector4 l = lrf.max(scissor); @@ -546,14 +546,14 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co if(pixels > 0) { - scan.t = edge.t + dedge.t * dy; - scan.c = edge.c + dedge.c * dy; + scan.t = edge.t.addm(dedge.t, dy); + scan.c = edge.c.addm(dedge.c, dy); GSVector4 prestep = (l - p0).xxxx(); - scan.p += dscan.p * prestep; - scan.t += dscan.t * prestep; - scan.c += dscan.c * prestep; + scan.p = 
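// addm(a, b) used below computes "*this + a * b" (see the new GSVector4::addm
// later in this patch); phrasing the prestep math that way keeps it in a shape
// a fused multiply-add can cover directly. Intrinsic sketch, treating FMA3 as
// optional (needs <immintrin.h>):

static inline __m128 addm_sketch(__m128 acc, __m128 a, __m128 b)
{
#ifdef __FMA__
    return _mm_fmadd_ps(a, b, acc);           // fused a * b + acc
#else
    return _mm_add_ps(acc, _mm_mul_ps(a, b)); // plain mul + add fallback
#endif
}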
scan.p.addm(dscan.p, prestep); + scan.t = scan.t.addm(dscan.t, prestep); + scan.c = scan.c.addm(dscan.c, prestep); AddScanline(e++, pixels, left, top, scan); } diff --git a/plugins/GSdx/GSVector.cpp b/plugins/GSdx/GSVector.cpp index 020c903374..501908093c 100644 --- a/plugins/GSdx/GSVector.cpp +++ b/plugins/GSdx/GSVector.cpp @@ -73,6 +73,92 @@ const GSVector4 GSVector4::m_four(4.0f); const GSVector4 GSVector4::m_x4b000000(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000))); const GSVector4 GSVector4::m_x4f800000(_mm_castsi128_ps(_mm_set1_epi32(0x4f800000))); +#if _M_SSE >= 0x500 + +const GSVector8 GSVector8::m_one(1.0f); +const GSVector8 GSVector8::m_x7fffffff(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); +const GSVector8 GSVector8::m_x80000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); + +#endif + +#if _M_SSE >= 0x501 + +const GSVector8i GSVector8i::m_xff[33] = +{ + GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x00ffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0x00ffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), +}; + +const GSVector8i GSVector8i::m_x0f[33] = +{ + GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x00000f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x000f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000), + 
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f), + GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f), +}; + +#endif + GSVector4i GSVector4i::fit(int arx, int ary) const { GSVector4i r = *this; diff --git a/plugins/GSdx/GSVector.h b/plugins/GSdx/GSVector.h index 2733106fc7..d77a22ac79 100644 --- a/plugins/GSdx/GSVector.h +++ b/plugins/GSdx/GSVector.h @@ -76,6 +76,19 @@ typedef GSVector2T GSVector2; typedef GSVector2T GSVector2i; class GSVector4; +class GSVector4i; + +#if _M_SSE >= 0x500 + +class GSVector8; + +#endif + +#if _M_SSE >= 0x501 + +class GSVector8i; + +#endif __aligned(class, 16) GSVector4i { @@ -156,6 +169,20 @@ public: __forceinline explicit GSVector4i(const GSVector4& v, bool truncate = true); + __forceinline static GSVector4i cast(const GSVector4& v); + + #if _M_SSE >= 0x500 + + __forceinline static GSVector4i cast(const GSVector8& v); + + #endif + + #if _M_SSE >= 0x501 + + __forceinline static GSVector4i cast(const GSVector8i& v); + + #endif + __forceinline void operator = (const GSVector4i& v) { m = v.m; @@ -283,8 +310,6 @@ public: return (uint32)store(v); } - static GSVector4i cast(const GSVector4& v); - #if _M_SSE >= 0x401 __forceinline GSVector4i sat_i8(const GSVector4i& a, const GSVector4i& b) const @@ -2119,11 +2144,6 @@ public: return (v1 < v2) | (v1 == v2); } - template __forceinline GSVector4i shuffle() const - { - return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(i, i, i, i))); - } - #define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ __forceinline GSVector4i xs##ys##zs##ws() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ __forceinline GSVector4i xs##ys##zs##ws##l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ @@ -2143,9 +2163,6 @@ public: VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ #define VECTOR4i_SHUFFLE_1(xs, xn) \ - __forceinline GSVector4i xs##4() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(xn, xn, xn, xn)));} \ - __forceinline GSVector4i xs##4l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(xn, xn, xn, xn)));} \ - __forceinline GSVector4i xs##4h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(xn, xn, xn, xn)));} \ VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \ VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \ VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \ @@ -2455,6 +2472,20 @@ public: __forceinline explicit GSVector4(const GSVector4i& v); + __forceinline static GSVector4 cast(const GSVector4i& v); + + #if _M_SSE >= 0x500 + + __forceinline static GSVector4 cast(const GSVector8& v); + + #endif + + #if 
_M_SSE >= 0x501 + + __forceinline static GSVector4 cast(const GSVector8i& v); + + #endif + __forceinline void operator = (const GSVector4& v) { m = v.m; @@ -2490,8 +2521,6 @@ public: return GSVector4(GSVector4i::load((int)rgba).u8to32() << shift); } - __forceinline static GSVector4 cast(const GSVector4i& v); - __forceinline GSVector4 abs() const { return *this & cast(GSVector4i::x7fffffff()); @@ -2558,11 +2587,11 @@ public: // http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html #define LOG_POLY0(x, c0) GSVector4(c0) - #define LOG_POLY1(x, c0, c1) ((LOG_POLY0(x, c1) * x) + GSVector4(c0)) - #define LOG_POLY2(x, c0, c1, c2) ((LOG_POLY1(x, c1, c2) * x) + GSVector4(c0)) - #define LOG_POLY3(x, c0, c1, c2, c3) ((LOG_POLY2(x, c1, c2, c3) * x) + GSVector4(c0)) - #define LOG_POLY4(x, c0, c1, c2, c3, c4) ((LOG_POLY3(x, c1, c2, c3, c4) * x) + GSVector4(c0)) - #define LOG_POLY5(x, c0, c1, c2, c3, c4, c5) ((LOG_POLY4(x, c1, c2, c3, c4, c5) * x) + GSVector4(c0)) + #define LOG_POLY1(x, c0, c1) (LOG_POLY0(x, c1).madd(x, GSVector4(c0))) + #define LOG_POLY2(x, c0, c1, c2) (LOG_POLY1(x, c1, c2).madd(x, GSVector4(c0))) + #define LOG_POLY3(x, c0, c1, c2, c3) (LOG_POLY2(x, c1, c2, c3).madd(x, GSVector4(c0))) + #define LOG_POLY4(x, c0, c1, c2, c3, c4) (LOG_POLY3(x, c1, c2, c3, c4).madd(x, GSVector4(c0))) + #define LOG_POLY5(x, c0, c1, c2, c3, c4, c5) (LOG_POLY4(x, c1, c2, c3, c4, c5).madd(x, GSVector4(c0))) __forceinline GSVector4 log2(int precision = 5) const { @@ -2571,7 +2600,7 @@ public: // The idea behind this algorithm is to split the float into two parts, log2(m * 2^e) => log2(m) + log2(2^e) => log2(m) + e, // and then approximate the logarithm of the mantissa (it's 1.x when normalized, a nice short range). - GSVector4 one(1.0f); + GSVector4 one = m_one; GSVector4i i = GSVector4i::cast(*this); @@ -2606,44 +2635,66 @@ public: return p + e; } - __forceinline GSVector4 mod2x(const GSVector4& f, const int scale = 256) const - { - return *this * (f * (2.0f / scale)); - } - - __forceinline GSVector4 mod2x(float f, const int scale = 256) const - { - return mod2x(GSVector4(f), scale); - } - __forceinline GSVector4 madd(const GSVector4& a, const GSVector4& b) const { - return *this * a + b; // TODO: _mm_fmadd_ps + #if 0//_M_SSE >= 0x501 + + return GSVector4(_mm_fmadd_ps(m, a, b)); + + #else + + return *this * a + b; + + #endif } __forceinline GSVector4 msub(const GSVector4& a, const GSVector4& b) const { - return *this * a + b; // TODO: _mm_fmsub_ps + #if 0//_M_SSE >= 0x501 + + return GSVector4(_mm_fmsub_ps(m, a, b)); + + #else + + return *this * a - b; + + #endif } __forceinline GSVector4 nmadd(const GSVector4& a, const GSVector4& b) const { - return b - *this * a; // TODO: _mm_fnmadd_ps + #if 0//_M_SSE >= 0x501 + + return GSVector4(_mm_fnmadd_ps(m, a, b)); + + #else + + return b - *this * a; + + #endif } __forceinline GSVector4 nmsub(const GSVector4& a, const GSVector4& b) const { - return -b - *this * a; // TODO: _mm_fmnsub_ps + #if 0//_M_SSE >= 0x501 + + return GSVector4(_mm_fnmsub_ps(m, a, b)); + + #else + + return -b - *this * a; + + #endif } - __forceinline GSVector4 lerp(const GSVector4& v, const GSVector4& f) const + __forceinline GSVector4 addm(const GSVector4& a, const GSVector4& b) const { - return *this + (v - *this) * f; + return a.madd(b, *this); // *this + a * b } - __forceinline GSVector4 lerp(const GSVector4& v, float f) const + __forceinline GSVector4 subm(const GSVector4& a, const GSVector4& b) const { - return lerp(v, GSVector4(f)); + return a.nmadd(b, *this); // 
*this - a * b } __forceinline GSVector4 hadd() const @@ -3134,11 +3185,6 @@ public: return GSVector4(_mm_cmple_ps(v1, v2)); } - template __forceinline GSVector4 shuffle() const - { - return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(i, i, i, i))); - } - #define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ __forceinline GSVector4 xs##ys##zs##ws() const {return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ __forceinline GSVector4 xs##ys##zs##ws(const GSVector4& v) const {return GSVector4(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ @@ -3167,30 +3213,13 @@ public: VECTOR4_SHUFFLE_1(w, 3) }; -__forceinline GSVector4i::GSVector4i(const GSVector4& v, bool truncate) -{ - m = truncate ? _mm_cvttps_epi32(v) : _mm_cvtps_epi32(v); -} - -__forceinline GSVector4::GSVector4(const GSVector4i& v) -{ - m = _mm_cvtepi32_ps(v); -} - -__forceinline GSVector4i GSVector4i::cast(const GSVector4& v) -{ - return GSVector4i(_mm_castps_si128(v.m)); -} - -__forceinline GSVector4 GSVector4::cast(const GSVector4i& v) -{ - return GSVector4(_mm_castsi128_ps(v.m)); -} - -class GSVector8; +#if _M_SSE >= 0x501 __aligned(class, 32) GSVector8i { + static const GSVector8i m_xff[33]; + static const GSVector8i m_x0f[33]; + public: union { @@ -3206,27 +3235,42 @@ public: uint16 u16[16]; uint32 u32[8]; uint64 u64[4]; - #if _M_SSE >= 0x500 __m256i m; __m128i m0, m1; - #else - __m128i m[2]; - #endif }; __forceinline GSVector8i() {} __forceinline explicit GSVector8i(const GSVector8& v, bool truncate = true); - static GSVector8i cast(const GSVector8& v); - - #if _M_SSE >= 0x500 + __forceinline static GSVector8i cast(const GSVector8& v); + __forceinline static GSVector8i cast(const GSVector4& v); + __forceinline static GSVector8i cast(const GSVector4i& v); __forceinline GSVector8i(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1) { m = _mm256_set_epi32(w1, z1, y1, x1, w0, z0, y0, x0); } + __forceinline GSVector8i( + short s0, short s1, short s2, short s3, short s4, short s5, short s6, short s7, + short s8, short s9, short s10, short s11, short s12, short s13, short s14, short s15) + { + m = _mm256_set_epi16(s15, s14, s13, s12, s11, s10, s9, s8, s7, s6, s5, s4, s3, s2, s1, s0); + } + + __forceinline GSVector8i( + char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, + char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15, + char b16, char b17, char b18, char b19, char b20, char b21, char b22, char b23, + char b24, char b25, char b26, char b27, char b28, char b29, char b30, char b31 + ) + { + m = _mm256_set_epi8( + b31, b30, b29, b28, b27, b26, b25, b24, b23, b22, b21, b20, b19, b18, b17, b16, + b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0); + } + __forceinline GSVector8i(__m128i m0, __m128i m1) { this->m = zero().insert<0>(m0).insert<1>(m1); @@ -3239,12 +3283,20 @@ public: __forceinline explicit GSVector8i(int i) { + #if _M_SSE >= 0x501 + m = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(i)); + #else m = _mm256_set1_epi32(i); + #endif } __forceinline explicit GSVector8i(__m128i m) { + #if _M_SSE >= 0x501 + this->m = _mm256_broadcastsi128_si256(m); + #else this->m = zero().insert<0>(m).insert<1>(m); + #endif } __forceinline explicit GSVector8i(__m256i m) @@ -3259,12 +3311,20 @@ public: __forceinline void operator = (int i) { + #if _M_SSE >= 0x501 + m = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(i)); + #else m = _mm256_set1_epi32(i); + #endif } __forceinline void operator = (__m128i m) { + #if _M_SSE >= 0x501 + this->m = 
_mm256_broadcastsi128_si256(m); + #else this->m = zero().insert<0>(m).insert<1>(m); + #endif } __forceinline void operator = (__m256i m) @@ -3277,7 +3337,731 @@ public: return m; } - // TODO + // + + __forceinline GSVector8i sat_i8(const GSVector8i& a, const GSVector8i& b) const + { + return max_i8(a).min_i8(b); + } + + __forceinline GSVector8i sat_i8(const GSVector8i& a) const + { + return max_i8(a.xyxy()).min_i8(a.zwzw()); + } + + __forceinline GSVector8i sat_i16(const GSVector8i& a, const GSVector8i& b) const + { + return max_i16(a).min_i16(b); + } + + __forceinline GSVector8i sat_i16(const GSVector8i& a) const + { + return max_i16(a.xyxy()).min_i16(a.zwzw()); + } + + __forceinline GSVector8i sat_i32(const GSVector8i& a, const GSVector8i& b) const + { + return max_i32(a).min_i32(b); + } + + __forceinline GSVector8i sat_i32(const GSVector8i& a) const + { + return max_i32(a.xyxy()).min_i32(a.zwzw()); + } + + __forceinline GSVector8i sat_u8(const GSVector8i& a, const GSVector8i& b) const + { + return max_u8(a).min_u8(b); + } + + __forceinline GSVector8i sat_u8(const GSVector8i& a) const + { + return max_u8(a.xyxy()).min_u8(a.zwzw()); + } + + __forceinline GSVector8i sat_u16(const GSVector8i& a, const GSVector8i& b) const + { + return max_u16(a).min_u16(b); + } + + __forceinline GSVector8i sat_u16(const GSVector8i& a) const + { + return max_u16(a.xyxy()).min_u16(a.zwzw()); + } + + __forceinline GSVector8i sat_u32(const GSVector8i& a, const GSVector8i& b) const + { + return max_u32(a).min_u32(b); + } + + __forceinline GSVector8i sat_u32(const GSVector8i& a) const + { + return max_u32(a.xyxy()).min_u32(a.zwzw()); + } + + __forceinline GSVector8i min_i8(const GSVector8i& a) const + { + return GSVector8i(_mm256_min_epi8(m, a)); + } + + __forceinline GSVector8i max_i8(const GSVector8i& a) const + { + return GSVector8i(_mm256_max_epi8(m, a)); + } + + __forceinline GSVector8i min_i16(const GSVector8i& a) const + { + return GSVector8i(_mm256_min_epi16(m, a)); + } + + __forceinline GSVector8i max_i16(const GSVector8i& a) const + { + return GSVector8i(_mm256_max_epi16(m, a)); + } + + __forceinline GSVector8i min_i32(const GSVector8i& a) const + { + return GSVector8i(_mm256_min_epi32(m, a)); + } + + __forceinline GSVector8i max_i32(const GSVector8i& a) const + { + return GSVector8i(_mm256_max_epi32(m, a)); + } + + __forceinline GSVector8i min_u8(const GSVector8i& a) const + { + return GSVector8i(_mm256_min_epu8(m, a)); + } + + __forceinline GSVector8i max_u8(const GSVector8i& a) const + { + return GSVector8i(_mm256_max_epu8(m, a)); + } + + __forceinline GSVector8i min_u16(const GSVector8i& a) const + { + return GSVector8i(_mm256_min_epu16(m, a)); + } + + __forceinline GSVector8i max_u16(const GSVector8i& a) const + { + return GSVector8i(_mm256_max_epu16(m, a)); + } + + __forceinline GSVector8i min_u32(const GSVector8i& a) const + { + return GSVector8i(_mm256_min_epu32(m, a)); + } + + __forceinline GSVector8i max_u32(const GSVector8i& a) const + { + return GSVector8i(_mm256_max_epu32(m, a)); + } + + __forceinline GSVector8i clamp8() const + { + return pu16().upl8(); + } + + __forceinline GSVector8i blend8(const GSVector8i& a, const GSVector8i& mask) const + { + return GSVector8i(_mm256_blendv_epi8(m, a, mask)); + } + + template __forceinline GSVector8i blend16(const GSVector8i& a) const + { + return GSVector8i(_mm256_blend_epi16(m, a, mask)); + } + + __forceinline GSVector8i blend(const GSVector8i& a, const GSVector8i& mask) const + { + return GSVector8i(_mm256_or_si256(_mm256_andnot_si256(mask, m), 
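// blend8() above maps to vpblendvb, which takes each result byte from 'a' when
// the top bit of the matching mask byte is set and from *this otherwise; the
// generic blend() is the and/andnot/or fallback for arbitrary bit masks.
// Per-byte scalar sketch of the blend8 selection rule:

static inline uint8 blendv_byte_sketch(uint8 dst, uint8 src, uint8 mask)
{
    return (mask & 0x80) ? src : dst; // keyed off the mask byte's sign bit
}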
_mm256_and_si256(mask, a))); + } + + __forceinline GSVector8i mix16(const GSVector8i& a) const + { + return blend16<0xaa>(a); + } + + __forceinline GSVector8i shuffle8(const GSVector8i& mask) const + { + return GSVector8i(_mm256_shuffle_epi8(m, mask)); + } + + __forceinline GSVector8i ps16(const GSVector8i& a) const + { + return GSVector8i(_mm256_packs_epi16(m, a)); + } + + __forceinline GSVector8i ps16() const + { + return GSVector8i(_mm256_packs_epi16(m, m)); + } + + __forceinline GSVector8i pu16(const GSVector8i& a) const + { + return GSVector8i(_mm256_packus_epi16(m, a)); + } + + __forceinline GSVector8i pu16() const + { + return GSVector8i(_mm256_packus_epi16(m, m)); + } + + __forceinline GSVector8i ps32(const GSVector8i& a) const + { + return GSVector8i(_mm256_packs_epi32(m, a)); + } + + __forceinline GSVector8i ps32() const + { + return GSVector8i(_mm256_packs_epi32(m, m)); + } + + __forceinline GSVector8i pu32(const GSVector8i& a) const + { + return GSVector8i(_mm256_packus_epi32(m, a)); + } + + __forceinline GSVector8i pu32() const + { + return GSVector8i(_mm256_packus_epi32(m, m)); + } + + __forceinline GSVector8i upl8(const GSVector8i& a) const + { + return GSVector8i(_mm256_unpacklo_epi8(m, a)); + } + + __forceinline GSVector8i uph8(const GSVector8i& a) const + { + return GSVector8i(_mm256_unpackhi_epi8(m, a)); + } + + __forceinline GSVector8i upl16(const GSVector8i& a) const + { + return GSVector8i(_mm256_unpacklo_epi16(m, a)); + } + + __forceinline GSVector8i uph16(const GSVector8i& a) const + { + return GSVector8i(_mm256_unpackhi_epi16(m, a)); + } + + __forceinline GSVector8i upl32(const GSVector8i& a) const + { + return GSVector8i(_mm256_unpacklo_epi32(m, a)); + } + + __forceinline GSVector8i uph32(const GSVector8i& a) const + { + return GSVector8i(_mm256_unpackhi_epi32(m, a)); + } + + __forceinline GSVector8i upl64(const GSVector8i& a) const + { + return GSVector8i(_mm256_unpacklo_epi64(m, a)); + } + + __forceinline GSVector8i uph64(const GSVector8i& a) const + { + return GSVector8i(_mm256_unpackhi_epi64(m, a)); + } + + __forceinline GSVector8i upl8() const + { + return GSVector8i(_mm256_unpacklo_epi8(m, _mm256_setzero_si256())); + } + + __forceinline GSVector8i uph8() const + { + return GSVector8i(_mm256_unpackhi_epi8(m, _mm256_setzero_si256())); + } + + __forceinline GSVector8i upl16() const + { + return GSVector8i(_mm256_unpacklo_epi16(m, _mm256_setzero_si256())); + } + + __forceinline GSVector8i uph16() const + { + return GSVector8i(_mm256_unpackhi_epi16(m, _mm256_setzero_si256())); + } + + __forceinline GSVector8i upl32() const + { + return GSVector8i(_mm256_unpacklo_epi32(m, _mm256_setzero_si256())); + } + + __forceinline GSVector8i uph32() const + { + return GSVector8i(_mm256_unpackhi_epi32(m, _mm256_setzero_si256())); + } + + __forceinline GSVector8i upl64() const + { + return GSVector8i(_mm256_unpacklo_epi64(m, _mm256_setzero_si256())); + } + + __forceinline GSVector8i uph64() const + { + return GSVector8i(_mm256_unpackhi_epi64(m, _mm256_setzero_si256())); + } + + __forceinline GSVector8i i8to16() const + { + return GSVector8i(_mm256_cvtepi8_epi16(_mm256_castsi256_si128(m))); + } + + __forceinline GSVector8i u8to16() const + { + return GSVector8i(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(m))); + } + + __forceinline GSVector8i i8to32() const + { + return GSVector8i(_mm256_cvtepi8_epi32(_mm256_castsi256_si128(m))); + } + + __forceinline GSVector8i u8to32() const + { + return GSVector8i(_mm256_cvtepu8_epi32(_mm256_castsi256_si128(m))); + } + + __forceinline 
GSVector8i i8to64() const + { + return GSVector8i(_mm256_cvtepi8_epi64(_mm256_castsi256_si128(m))); + } + + __forceinline GSVector8i u8to64() const + { + return GSVector8i(_mm256_cvtepu16_epi64(_mm256_castsi256_si128(m))); + } + + __forceinline GSVector8i i16to32() const + { + return GSVector8i(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(m))); + } + + __forceinline GSVector8i u16to32() const + { + return GSVector8i(_mm256_cvtepu16_epi32(_mm256_castsi256_si128(m))); + } + + __forceinline GSVector8i i16to64() const + { + return GSVector8i(_mm256_cvtepi16_epi64(_mm256_castsi256_si128(m))); + } + + __forceinline GSVector8i u16to64() const + { + return GSVector8i(_mm256_cvtepu16_epi64(_mm256_castsi256_si128(m))); + } + + __forceinline GSVector8i i32to64() const + { + return GSVector8i(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(m))); + } + + __forceinline GSVector8i u32to64() const + { + return GSVector8i(_mm256_cvtepu32_epi64(_mm256_castsi256_si128(m))); + } + + template __forceinline GSVector8i srl() const + { + return GSVector8i(_mm256_srli_si156(m, i)); + } + + template __forceinline GSVector8i srl(const GSVector8i& v) + { + return GSVector4i(_mm256_alignr_epi8(v.m, m, i)); + } + + template __forceinline GSVector8i sll() const + { + return GSVector8i(_mm256_slli_si128(m, i)); + } + + __forceinline GSVector8i sra16(int i) const + { + return GSVector8i(_mm256_srai_epi16(m, i)); + } + + __forceinline GSVector8i sra16(__m128i i) const + { + return GSVector8i(_mm256_sra_epi16(m, i)); + } + + __forceinline GSVector8i sra16(__m256i i) const + { + return GSVector8i(_mm256_sra_epi16(m, _mm256_castsi256_si128(i))); + } + + __forceinline GSVector8i sra32(int i) const + { + return GSVector8i(_mm256_srai_epi32(m, i)); + } + + __forceinline GSVector8i sra32(__m128i i) const + { + return GSVector8i(_mm256_sra_epi32(m, i)); + } + + __forceinline GSVector8i sra32(__m256i i) const + { + return GSVector8i(_mm256_sra_epi32(m, _mm256_castsi256_si128(i))); + } + + __forceinline GSVector8i srav32(__m256i i) const + { + return GSVector8i(_mm256_srav_epi32(m, i)); + } + + __forceinline GSVector8i sll16(int i) const + { + return GSVector8i(_mm256_slli_epi16(m, i)); + } + + __forceinline GSVector8i sll16(__m128i i) const + { + return GSVector8i(_mm256_sll_epi16(m, i)); + } + + __forceinline GSVector8i sll16(__m256i i) const + { + return GSVector8i(_mm256_sll_epi16(m, _mm256_castsi256_si128(i))); + } + + __forceinline GSVector8i sll32(int i) const + { + return GSVector8i(_mm256_slli_epi32(m, i)); + } + + __forceinline GSVector8i sll32(__m128i i) const + { + return GSVector8i(_mm256_sll_epi32(m, i)); + } + + __forceinline GSVector8i sll32(__m256i i) const + { + return GSVector8i(_mm256_sll_epi32(m, _mm256_castsi256_si128(i))); + } + + __forceinline GSVector8i sllv32(__m256i i) const + { + return GSVector8i(_mm256_sllv_epi32(m, i)); + } + + __forceinline GSVector8i sll64(int i) const + { + return GSVector8i(_mm256_slli_epi64(m, i)); + } + + __forceinline GSVector8i sll64(__m128i i) const + { + return GSVector8i(_mm256_sll_epi64(m, i)); + } + + __forceinline GSVector8i sll64(__m256i i) const + { + return GSVector8i(_mm256_sll_epi64(m, _mm256_castsi256_si128(i))); + } + + __forceinline GSVector8i sllv64(__m256i i) const + { + return GSVector8i(_mm256_sllv_epi64(m, i)); + } + + __forceinline GSVector8i srl16(int i) const + { + return GSVector8i(_mm256_srli_epi16(m, i)); + } + + __forceinline GSVector8i srl16(__m128i i) const + { + return GSVector8i(_mm256_srl_epi16(m, i)); + } + + __forceinline GSVector8i srl16(__m256i 
i) const + { + return GSVector8i(_mm256_srl_epi16(m, _mm256_castsi256_si128(i))); + } + + __forceinline GSVector8i srl32(int i) const + { + return GSVector8i(_mm256_srli_epi32(m, i)); + } + + __forceinline GSVector8i srl32(__m128i i) const + { + return GSVector8i(_mm256_srl_epi32(m, i)); + } + + __forceinline GSVector8i srl32(__m256i i) const + { + return GSVector8i(_mm256_srl_epi32(m, _mm256_castsi256_si128(i))); + } + + __forceinline GSVector8i srlv32(__m256i i) const + { + return GSVector8i(_mm256_srlv_epi32(m, i)); + } + + __forceinline GSVector8i srl64(int i) const + { + return GSVector8i(_mm256_srli_epi64(m, i)); + } + + __forceinline GSVector8i srl64(__m128i i) const + { + return GSVector8i(_mm256_srl_epi64(m, i)); + } + + __forceinline GSVector8i srl64(__m256i i) const + { + return GSVector8i(_mm256_srl_epi64(m, _mm256_castsi256_si128(i))); + } + + __forceinline GSVector8i srlv64(__m256i i) const + { + return GSVector8i(_mm256_srlv_epi64(m, i)); + } + + __forceinline GSVector8i add8(const GSVector8i& v) const + { + return GSVector8i(_mm256_add_epi8(m, v.m)); + } + + __forceinline GSVector8i add16(const GSVector8i& v) const + { + return GSVector8i(_mm256_add_epi16(m, v.m)); + } + + __forceinline GSVector8i add32(const GSVector8i& v) const + { + return GSVector8i(_mm256_add_epi32(m, v.m)); + } + + __forceinline GSVector8i adds8(const GSVector8i& v) const + { + return GSVector8i(_mm256_adds_epi8(m, v.m)); + } + + __forceinline GSVector8i adds16(const GSVector8i& v) const + { + return GSVector8i(_mm256_adds_epi16(m, v.m)); + } + + __forceinline GSVector8i addus8(const GSVector8i& v) const + { + return GSVector8i(_mm256_adds_epu8(m, v.m)); + } + + __forceinline GSVector8i addus16(const GSVector8i& v) const + { + return GSVector8i(_mm256_adds_epu16(m, v.m)); + } + + __forceinline GSVector8i sub8(const GSVector8i& v) const + { + return GSVector8i(_mm256_sub_epi8(m, v.m)); + } + + __forceinline GSVector8i sub16(const GSVector8i& v) const + { + return GSVector8i(_mm256_sub_epi16(m, v.m)); + } + + __forceinline GSVector8i sub32(const GSVector8i& v) const + { + return GSVector8i(_mm256_sub_epi32(m, v.m)); + } + + __forceinline GSVector8i subs8(const GSVector8i& v) const + { + return GSVector8i(_mm256_subs_epi8(m, v.m)); + } + + __forceinline GSVector8i subs16(const GSVector8i& v) const + { + return GSVector8i(_mm256_subs_epi16(m, v.m)); + } + + __forceinline GSVector8i subus8(const GSVector8i& v) const + { + return GSVector8i(_mm256_subs_epu8(m, v.m)); + } + + __forceinline GSVector8i subus16(const GSVector8i& v) const + { + return GSVector8i(_mm256_subs_epu16(m, v.m)); + } + + __forceinline GSVector8i avg8(const GSVector8i& v) const + { + return GSVector8i(_mm256_avg_epu8(m, v.m)); + } + + __forceinline GSVector8i avg16(const GSVector8i& v) const + { + return GSVector8i(_mm256_avg_epu16(m, v.m)); + } + + __forceinline GSVector8i mul16hs(const GSVector8i& v) const + { + return GSVector8i(_mm256_mulhi_epi16(m, v.m)); + } + + __forceinline GSVector8i mul16hu(const GSVector8i& v) const + { + return GSVector8i(_mm256_mulhi_epu16(m, v.m)); + } + + __forceinline GSVector8i mul16l(const GSVector8i& v) const + { + return GSVector8i(_mm256_mullo_epi16(m, v.m)); + } + + __forceinline GSVector8i mul16hrs(const GSVector8i& v) const + { + return GSVector8i(_mm256_mulhrs_epi16(m, v.m)); + } + + GSVector8i madd(const GSVector8i& v) const + { + return GSVector8i(_mm256_madd_epi16(m, v.m)); + } + + template __forceinline GSVector8i lerp16(const GSVector8i& a, const GSVector8i& f) const + { + // (a - this) * f 
<< shift + this + + return add16(a.sub16(*this).modulate16(f)); + } + + template __forceinline static GSVector8i lerp16(const GSVector8i& a, const GSVector8i& b, const GSVector8i& c) + { + // (a - b) * c << shift + + return a.sub16(b).modulate16(c); + } + + template __forceinline static GSVector8i lerp16(const GSVector8i& a, const GSVector8i& b, const GSVector8i& c, const GSVector8i& d) + { + // (a - b) * c << shift + d + + return d.add16(a.sub16(b).modulate16(c)); + } + + __forceinline GSVector8i lerp16_4(const GSVector8i& a, const GSVector8i& f) const + { + // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) + + return add16(a.sub16(*this).mul16l(f).sra16(4)); + } + + template __forceinline GSVector8i modulate16(const GSVector8i& f) const + { + // a * f << shift + + if(shift == 0) + { + return mul16hrs(f); + } + + return sll16(shift + 1).mul16hs(f); + } + + __forceinline bool eq(const GSVector8i& v) const + { + GSVector8i t = *this ^ v; + + return _mm256_testz_si256(t, t) != 0; + } + + __forceinline GSVector8i eq8(const GSVector8i& v) const + { + return GSVector8i(_mm256_cmpeq_epi8(m, v.m)); + } + + __forceinline GSVector8i eq16(const GSVector8i& v) const + { + return GSVector8i(_mm256_cmpeq_epi16(m, v.m)); + } + + __forceinline GSVector8i eq32(const GSVector8i& v) const + { + return GSVector8i(_mm256_cmpeq_epi32(m, v.m)); + } + + __forceinline GSVector8i neq8(const GSVector8i& v) const + { + return ~eq8(v); + } + + __forceinline GSVector8i neq16(const GSVector8i& v) const + { + return ~eq16(v); + } + + __forceinline GSVector8i neq32(const GSVector8i& v) const + { + return ~eq32(v); + } + + __forceinline GSVector8i gt8(const GSVector8i& v) const + { + return GSVector8i(_mm256_cmpgt_epi8(m, v.m)); + } + + __forceinline GSVector8i gt16(const GSVector8i& v) const + { + return GSVector8i(_mm256_cmpgt_epi16(m, v.m)); + } + + __forceinline GSVector8i gt32(const GSVector8i& v) const + { + return GSVector8i(_mm256_cmpgt_epi32(m, v.m)); + } + + __forceinline GSVector8i lt8(const GSVector8i& v) const + { + return GSVector8i(_mm256_cmpgt_epi8(v.m, m)); + } + + __forceinline GSVector8i lt16(const GSVector8i& v) const + { + return GSVector8i(_mm256_cmpgt_epi16(v.m, m)); + } + + __forceinline GSVector8i lt32(const GSVector8i& v) const + { + return GSVector8i(_mm256_cmpgt_epi32(v.m, m)); + } + + __forceinline GSVector8i andnot(const GSVector8i& v) const + { + return GSVector8i(_mm256_andnot_si256(v.m, m)); + } + + __forceinline int mask() const + { + return _mm256_movemask_epi8(m); + } + + __forceinline bool alltrue() const + { + return mask() == (int)0xffffffff; + } + + __forceinline bool allfalse() const + { + return _mm256_testz_si256(m, m) != 0; + } + + // TODO: extract/insert template __forceinline GSVector4i extract() const { @@ -3289,122 +4073,610 @@ public: return GSVector8i(_mm256_insertf128_si256(this->m, m, i)); } - __forceinline static GSVector8i zero() + // TODO: gather + + __forceinline static GSVector8i loadnt(const void* p) { - return GSVector8i(_mm256_setzero_si256()); + return GSVector8i(_mm256_stream_load_si256((__m256i*)p)); } - // TODO + __forceinline static GSVector8i loadl(const void* p) + { + return GSVector8i(_mm256_castsi128_si256(_mm_load_si128((__m128i*)p))); + } + + __forceinline static GSVector8i loadh(const void* p) + { + return GSVector8i(_mm256_inserti128_si256(_mm256_setzero_si256(), _mm_load_si128((__m128i*)p), 1)); + + /* TODO: this may be faster + __m256i m = _mm256_castsi128_si256(_mm_load_si128((__m128i*)p)); + return GSVector8i(_mm256_permute2x128_si256(m, 
m, 0x08)); + */ + } + + __forceinline static GSVector8i loadh(const void* p, const GSVector8i& v) + { + return GSVector8i(_mm256_inserti128_si256(v, _mm_load_si128((__m128i*)p), 1)); + } + + __forceinline static GSVector8i load(const void* pl, const void* ph) + { + return loadh(ph, loadl(pl)); + + /* TODO: this may be faster + __m256 m0 = _mm256_castsi128_si256(_mm_load_si128((__m128*)pl)); + __m256 m1 = _mm256_castsi128_si256(_mm_load_si128((__m128*)ph)); + return GSVector8i(_mm256_permute2x128_si256(m0, m1, 0x20)); + */ + } template __forceinline static GSVector8i load(const void* p) { - return GSVector8i(aligned ? _mm_load256i_si256((__m256i*)p) : _mm256i_loadu_si256((__m256i*)p)); + return GSVector8i(aligned ? _mm256_load_si256((__m256i*)p) : _mm256_loadu_si256((__m256i*)p)); } - template __forceinline static void store(void* p, const GSVector4i& v) + __forceinline static GSVector8i load(int i) { - if(aligned) _mm256i_store_si256((__m256i*)p, v.m); - else _mm256i_storeu_si256((__m256i*)p, v.m); + return cast(GSVector4i::load(i)); } - #else + #ifdef _M_AMD64 - __forceinline GSVector8i(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1) + __forceinline static GSVector8i loadq(int64 i) { - m[0] = _mm_set_epi32(w0, z0, y0, x0); - m[1] = _mm_set_epi32(w1, z1, y1, x1); + return cast(GSVector4i::loadq(i)); } - __forceinline GSVector8i(__m128i m0, __m128i m1) + #endif + + __forceinline static void storent(void* p, const GSVector8i& v) { - m[0] = m0; - m[1] = m1; + _mm256_stream_si256((__m256i*)p, v.m); } - __forceinline GSVector8i(const GSVector8i& v) + __forceinline static void storel(void* p, const GSVector8i& v) { - m[0] = v.m[0]; - m[1] = v.m[1]; + _mm_store_si128((__m128i*)p, _mm256_extracti128_si256(v.m, 0)); } - __forceinline explicit GSVector8i(int i) + __forceinline static void storeh(void* p, const GSVector8i& v) { - m[0] = m[1] = _mm_set1_epi32(i); + _mm_store_si128((__m128i*)p, _mm256_extracti128_si256(v.m, 1)); } - __forceinline explicit GSVector8i(__m128i m) + __forceinline static void store(void* pl, void* ph, const GSVector8i& v) { - this->m[0] = this->m[1] = m; - } - - __forceinline void operator = (const GSVector8i& v) - { - m[0] = v.m[0]; - m[1] = v.m[1]; - } - - __forceinline void operator = (int i) - { - m[0] = m[1] = _mm_set1_epi32(i); - } - - __forceinline void operator = (__m128i m) - { - this->m[0] = this->m[1] = m; - } - - // TODO - - template __forceinline GSVector4i extract() const - { - return GSVector4i(m[i]); - } - - template __forceinline GSVector8i insert(__m128i m) const - { - GSVector8i v = *this; - - v.m[i] = m; - - return v; - } - - __forceinline static GSVector8i zero() - { - GSVector8i v; - - v.m[0] = v.m[1] = _mm_setzero_si128(); - - return v; - } - - // TODO - - template __forceinline static GSVector8i load(const void* p) - { - return GSVector8i( - aligned ? _mm_load_si128((__m128i*)p + 0) : _mm_loadu_si128((__m128i*)p + 0), - aligned ? 
_mm_load_si128((__m128i*)p + 1) : _mm_loadu_si128((__m128i*)p + 1) - ); + GSVector8i::storel(pl, v); + GSVector8i::storeh(ph, v); } template __forceinline static void store(void* p, const GSVector8i& v) { - if(aligned) - { - _mm_store_si128((__m128i*)p + 0, v.m[0]); - _mm_store_si128((__m128i*)p + 1, v.m[1]); - } - else - { - _mm_storeu_si128((__m128i*)p + 0, v.m[0]); - _mm_storeu_si128((__m128i*)p + 1, v.m[1]); - } + if(aligned) _mm256_store_si256((__m256i*)p, v.m); + else _mm256_storeu_si256((__m256i*)p, v.m); + } + + __forceinline static int store(const GSVector8i& v) + { + return GSVector4i::store(GSVector4i::cast(v)); + } + + #ifdef _M_AMD64 + + __forceinline static int64 storeq(const GSVector8i& v) + { + return GSVector4i::storeq(GSVector4i::cast(v)); } #endif + + __forceinline static void storent(void* RESTRICT dst, const void* RESTRICT src, size_t size) + { + const GSVector8i* s = (const GSVector8i*)src; + GSVector8i* d = (GSVector8i*)dst; + + if(size == 0) return; + + size_t i = 0; + size_t j = size >> 7; + + for(; i < j; i++, s += 4, d += 4) + { + storent(&d[0], s[0]); + storent(&d[1], s[1]); + storent(&d[2], s[2]); + storent(&d[3], s[3]); + } + + size &= 127; + + if(size == 0) return; + + memcpy(d, s, size); + } + + // TODO: swizzling + + __forceinline static void sw8(GSVector8i& a, GSVector8i& b) + { + GSVector8i c = a; + GSVector8i d = b; + + a = c.upl8(d); + b = c.uph8(d); + } + + __forceinline static void sw16(GSVector8i& a, GSVector8i& b) + { + GSVector8i c = a; + GSVector8i d = b; + + a = c.upl16(d); + b = c.uph16(d); + } + + __forceinline static void sw32(GSVector8i& a, GSVector8i& b) + { + GSVector8i c = a; + GSVector8i d = b; + + a = c.upl32(d); + b = c.uph32(d); + } + + __forceinline static void sw64(GSVector8i& a, GSVector8i& b) + { + GSVector8i c = a; + GSVector8i d = b; + + a = c.upl64(d); + b = c.uph64(d); + } + + __forceinline static void sw128(GSVector8i& a, GSVector8i& b) + { + GSVector8i c = a; + GSVector8i d = b; + + a = c.ac(d); + b = c.bd(d); + } + + __forceinline void operator += (const GSVector8i& v) + { + m = _mm256_add_epi32(m, v); + } + + __forceinline void operator -= (const GSVector8i& v) + { + m = _mm256_sub_epi32(m, v); + } + + __forceinline void operator += (int i) + { + *this += GSVector8i(i); + } + + __forceinline void operator -= (int i) + { + *this -= GSVector8i(i); + } + + __forceinline void operator <<= (const int i) + { + m = _mm256_slli_epi32(m, i); + } + + __forceinline void operator >>= (const int i) + { + m = _mm256_srli_epi32(m, i); + } + + __forceinline void operator &= (const GSVector8i& v) + { + m = _mm256_and_si256(m, v); + } + + __forceinline void operator |= (const GSVector8i& v) + { + m = _mm256_or_si256(m, v); + } + + __forceinline void operator ^= (const GSVector8i& v) + { + m = _mm256_xor_si256(m, v); + } + + __forceinline friend GSVector8i operator + (const GSVector8i& v1, const GSVector8i& v2) + { + return GSVector8i(_mm256_add_epi32(v1, v2)); + } + + __forceinline friend GSVector8i operator - (const GSVector8i& v1, const GSVector8i& v2) + { + return GSVector8i(_mm256_sub_epi32(v1, v2)); + } + + __forceinline friend GSVector8i operator + (const GSVector8i& v, int i) + { + return v + GSVector8i(i); + } + + __forceinline friend GSVector8i operator - (const GSVector8i& v, int i) + { + return v - GSVector8i(i); + } + + __forceinline friend GSVector8i operator << (const GSVector8i& v, const int i) + { + return GSVector8i(_mm256_slli_epi32(v, i)); + } + + __forceinline friend GSVector8i operator >> (const GSVector8i& v, const int 
i) + { + return GSVector8i(_mm256_srli_epi32(v, i)); + } + + __forceinline friend GSVector8i operator & (const GSVector8i& v1, const GSVector8i& v2) + { + return GSVector8i(_mm256_and_si256(v1, v2)); + } + + __forceinline friend GSVector8i operator | (const GSVector8i& v1, const GSVector8i& v2) + { + return GSVector8i(_mm256_or_si256(v1, v2)); + } + + __forceinline friend GSVector8i operator ^ (const GSVector8i& v1, const GSVector8i& v2) + { + return GSVector8i(_mm256_xor_si256(v1, v2)); + } + + __forceinline friend GSVector8i operator & (const GSVector8i& v, int i) + { + return v & GSVector8i(i); + } + + __forceinline friend GSVector8i operator | (const GSVector8i& v, int i) + { + return v | GSVector8i(i); + } + + __forceinline friend GSVector8i operator ^ (const GSVector8i& v, int i) + { + return v ^ GSVector8i(i); + } + + __forceinline friend GSVector8i operator ~ (const GSVector8i& v) + { + return v ^ (v == v); + } + + __forceinline friend GSVector8i operator == (const GSVector8i& v1, const GSVector8i& v2) + { + return GSVector8i(_mm256_cmpeq_epi32(v1, v2)); + } + + __forceinline friend GSVector8i operator != (const GSVector8i& v1, const GSVector8i& v2) + { + return ~(v1 == v2); + } + + __forceinline friend GSVector8i operator > (const GSVector8i& v1, const GSVector8i& v2) + { + return GSVector8i(_mm256_cmpgt_epi32(v1, v2)); + } + + __forceinline friend GSVector8i operator < (const GSVector8i& v1, const GSVector8i& v2) + { + return GSVector8i(_mm256_cmpgt_epi32(v2, v1)); + } + + __forceinline friend GSVector8i operator >= (const GSVector8i& v1, const GSVector8i& v2) + { + return (v1 > v2) | (v1 == v2); + } + + __forceinline friend GSVector8i operator <= (const GSVector8i& v1, const GSVector8i& v2) + { + return (v1 < v2) | (v1 == v2); + } + + // x = v[31:0] / v[159:128] + // y = v[63:32] / v[191:160] + // z = v[95:64] / v[223:192] + // w = v[127:96] / v[255:224] + + #define VECTOR8i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + __forceinline GSVector8i xs##ys##zs##ws() const {return GSVector8i(_mm256_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ + __forceinline GSVector8i xs##ys##zs##ws##l() const {return GSVector8i(_mm256_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ + __forceinline GSVector8i xs##ys##zs##ws##h() const {return GSVector8i(_mm256_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ + __forceinline GSVector8i xs##ys##zs##ws##lh() const {return GSVector8i(_mm256_shufflehi_epi16(_mm256_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn)));} \ + + #define VECTOR8i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR8i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ + VECTOR8i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ + VECTOR8i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ + VECTOR8i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ + + #define VECTOR8i_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR8i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ + VECTOR8i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ + VECTOR8i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ + VECTOR8i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + + #define VECTOR8i_SHUFFLE_1(xs, xn) \ + VECTOR8i_SHUFFLE_2(xs, xn, x, 0) \ + VECTOR8i_SHUFFLE_2(xs, xn, y, 1) \ + VECTOR8i_SHUFFLE_2(xs, xn, z, 2) \ + VECTOR8i_SHUFFLE_2(xs, xn, w, 3) \ + + VECTOR8i_SHUFFLE_1(x, 0) + VECTOR8i_SHUFFLE_1(y, 1) + VECTOR8i_SHUFFLE_1(z, 2) + VECTOR8i_SHUFFLE_1(w, 3) + + // a = v0[127:0] + // b = v0[255:128] + // c = v1[127:0] + // d = v1[255:128] + // _ = 0 + + #define VECTOR8i_PERMUTE128_2(as, an, bs, bn) \ + __forceinline GSVector8i as##bs() const {return 
GSVector8i(_mm256_permute2x128_si256(m, m, an | (bn << 4)));} \ + __forceinline GSVector8i as##bs(const GSVector8i& v) const {return GSVector8i(_mm256_permute2x128_si256(m, v.m, an | (bn << 4)));} \ + + #define VECTOR8i_PERMUTE128_1(as, an) \ + VECTOR8i_PERMUTE128_2(as, an, a, 0) \ + VECTOR8i_PERMUTE128_2(as, an, b, 1) \ + VECTOR8i_PERMUTE128_2(as, an, c, 2) \ + VECTOR8i_PERMUTE128_2(as, an, d, 3) \ + VECTOR8i_PERMUTE128_2(as, an, _, 8) \ + + VECTOR8i_PERMUTE128_1(a, 0) + VECTOR8i_PERMUTE128_1(b, 1) + VECTOR8i_PERMUTE128_1(c, 2) + VECTOR8i_PERMUTE128_1(d, 3) + VECTOR8i_PERMUTE128_1(_, 8) + + // a = v[63:0] + // b = v[127:64] + // c = v[191:128] + // d = v[255:192] + + #define VECTOR8i_PERMUTE64_4(as, an, bs, bn, cs, cn, ds, dn) \ + __forceinline GSVector8i as##bs##cs##ds() const {return GSVector8i(_mm256_permute4x64_epi64(m, _MM_SHUFFLE(dn, cn, bn, an)));} \ + + #define VECTOR8i_PERMUTE64_3(as, an, bs, bn, cs, cn) \ + VECTOR8i_PERMUTE64_4(as, an, bs, bn, cs, cn, a, 0) \ + VECTOR8i_PERMUTE64_4(as, an, bs, bn, cs, cn, b, 1) \ + VECTOR8i_PERMUTE64_4(as, an, bs, bn, cs, cn, c, 2) \ + VECTOR8i_PERMUTE64_4(as, an, bs, bn, cs, cn, d, 3) \ + + #define VECTOR8i_PERMUTE64_2(as, an, bs, bn) \ + VECTOR8i_PERMUTE64_3(as, an, bs, bn, a, 0) \ + VECTOR8i_PERMUTE64_3(as, an, bs, bn, b, 1) \ + VECTOR8i_PERMUTE64_3(as, an, bs, bn, c, 2) \ + VECTOR8i_PERMUTE64_3(as, an, bs, bn, d, 3) \ + + #define VECTOR8i_PERMUTE64_1(as, an) \ + VECTOR8i_PERMUTE64_2(as, an, a, 0) \ + VECTOR8i_PERMUTE64_2(as, an, b, 1) \ + VECTOR8i_PERMUTE64_2(as, an, c, 2) \ + VECTOR8i_PERMUTE64_2(as, an, d, 3) \ + + VECTOR8i_PERMUTE64_1(a, 0) + VECTOR8i_PERMUTE64_1(b, 1) + VECTOR8i_PERMUTE64_1(c, 2) + VECTOR8i_PERMUTE64_1(d, 3) + + __forceinline static GSVector8i zero() {return GSVector8i(_mm256_setzero_si256());} + + __forceinline static GSVector8i xffffffff() {return zero() == zero();} + + __forceinline static GSVector8i x00000001() {return xffffffff().srl32(31);} + __forceinline static GSVector8i x00000003() {return xffffffff().srl32(30);} + __forceinline static GSVector8i x00000007() {return xffffffff().srl32(29);} + __forceinline static GSVector8i x0000000f() {return xffffffff().srl32(28);} + __forceinline static GSVector8i x0000001f() {return xffffffff().srl32(27);} + __forceinline static GSVector8i x0000003f() {return xffffffff().srl32(26);} + __forceinline static GSVector8i x0000007f() {return xffffffff().srl32(25);} + __forceinline static GSVector8i x000000ff() {return xffffffff().srl32(24);} + __forceinline static GSVector8i x000001ff() {return xffffffff().srl32(23);} + __forceinline static GSVector8i x000003ff() {return xffffffff().srl32(22);} + __forceinline static GSVector8i x000007ff() {return xffffffff().srl32(21);} + __forceinline static GSVector8i x00000fff() {return xffffffff().srl32(20);} + __forceinline static GSVector8i x00001fff() {return xffffffff().srl32(19);} + __forceinline static GSVector8i x00003fff() {return xffffffff().srl32(18);} + __forceinline static GSVector8i x00007fff() {return xffffffff().srl32(17);} + __forceinline static GSVector8i x0000ffff() {return xffffffff().srl32(16);} + __forceinline static GSVector8i x0001ffff() {return xffffffff().srl32(15);} + __forceinline static GSVector8i x0003ffff() {return xffffffff().srl32(14);} + __forceinline static GSVector8i x0007ffff() {return xffffffff().srl32(13);} + __forceinline static GSVector8i x000fffff() {return xffffffff().srl32(12);} + __forceinline static GSVector8i x001fffff() {return xffffffff().srl32(11);} + __forceinline static GSVector8i x003fffff() 
{return xffffffff().srl32(10);} + __forceinline static GSVector8i x007fffff() {return xffffffff().srl32( 9);} + __forceinline static GSVector8i x00ffffff() {return xffffffff().srl32( 8);} + __forceinline static GSVector8i x01ffffff() {return xffffffff().srl32( 7);} + __forceinline static GSVector8i x03ffffff() {return xffffffff().srl32( 6);} + __forceinline static GSVector8i x07ffffff() {return xffffffff().srl32( 5);} + __forceinline static GSVector8i x0fffffff() {return xffffffff().srl32( 4);} + __forceinline static GSVector8i x1fffffff() {return xffffffff().srl32( 3);} + __forceinline static GSVector8i x3fffffff() {return xffffffff().srl32( 2);} + __forceinline static GSVector8i x7fffffff() {return xffffffff().srl32( 1);} + + __forceinline static GSVector8i x80000000() {return xffffffff().sll32(31);} + __forceinline static GSVector8i xc0000000() {return xffffffff().sll32(30);} + __forceinline static GSVector8i xe0000000() {return xffffffff().sll32(29);} + __forceinline static GSVector8i xf0000000() {return xffffffff().sll32(28);} + __forceinline static GSVector8i xf8000000() {return xffffffff().sll32(27);} + __forceinline static GSVector8i xfc000000() {return xffffffff().sll32(26);} + __forceinline static GSVector8i xfe000000() {return xffffffff().sll32(25);} + __forceinline static GSVector8i xff000000() {return xffffffff().sll32(24);} + __forceinline static GSVector8i xff800000() {return xffffffff().sll32(23);} + __forceinline static GSVector8i xffc00000() {return xffffffff().sll32(22);} + __forceinline static GSVector8i xffe00000() {return xffffffff().sll32(21);} + __forceinline static GSVector8i xfff00000() {return xffffffff().sll32(20);} + __forceinline static GSVector8i xfff80000() {return xffffffff().sll32(19);} + __forceinline static GSVector8i xfffc0000() {return xffffffff().sll32(18);} + __forceinline static GSVector8i xfffe0000() {return xffffffff().sll32(17);} + __forceinline static GSVector8i xffff0000() {return xffffffff().sll32(16);} + __forceinline static GSVector8i xffff8000() {return xffffffff().sll32(15);} + __forceinline static GSVector8i xffffc000() {return xffffffff().sll32(14);} + __forceinline static GSVector8i xffffe000() {return xffffffff().sll32(13);} + __forceinline static GSVector8i xfffff000() {return xffffffff().sll32(12);} + __forceinline static GSVector8i xfffff800() {return xffffffff().sll32(11);} + __forceinline static GSVector8i xfffffc00() {return xffffffff().sll32(10);} + __forceinline static GSVector8i xfffffe00() {return xffffffff().sll32( 9);} + __forceinline static GSVector8i xffffff00() {return xffffffff().sll32( 8);} + __forceinline static GSVector8i xffffff80() {return xffffffff().sll32( 7);} + __forceinline static GSVector8i xffffffc0() {return xffffffff().sll32( 6);} + __forceinline static GSVector8i xffffffe0() {return xffffffff().sll32( 5);} + __forceinline static GSVector8i xfffffff0() {return xffffffff().sll32( 4);} + __forceinline static GSVector8i xfffffff8() {return xffffffff().sll32( 3);} + __forceinline static GSVector8i xfffffffc() {return xffffffff().sll32( 2);} + __forceinline static GSVector8i xfffffffe() {return xffffffff().sll32( 1);} + + __forceinline static GSVector8i x0001() {return xffffffff().srl16(15);} + __forceinline static GSVector8i x0003() {return xffffffff().srl16(14);} + __forceinline static GSVector8i x0007() {return xffffffff().srl16(13);} + __forceinline static GSVector8i x000f() {return xffffffff().srl16(12);} + __forceinline static GSVector8i x001f() {return xffffffff().srl16(11);} + __forceinline static 
GSVector8i x003f() {return xffffffff().srl16(10);} + __forceinline static GSVector8i x007f() {return xffffffff().srl16( 9);} + __forceinline static GSVector8i x00ff() {return xffffffff().srl16( 8);} + __forceinline static GSVector8i x01ff() {return xffffffff().srl16( 7);} + __forceinline static GSVector8i x03ff() {return xffffffff().srl16( 6);} + __forceinline static GSVector8i x07ff() {return xffffffff().srl16( 5);} + __forceinline static GSVector8i x0fff() {return xffffffff().srl16( 4);} + __forceinline static GSVector8i x1fff() {return xffffffff().srl16( 3);} + __forceinline static GSVector8i x3fff() {return xffffffff().srl16( 2);} + __forceinline static GSVector8i x7fff() {return xffffffff().srl16( 1);} + + __forceinline static GSVector8i x8000() {return xffffffff().sll16(15);} + __forceinline static GSVector8i xc000() {return xffffffff().sll16(14);} + __forceinline static GSVector8i xe000() {return xffffffff().sll16(13);} + __forceinline static GSVector8i xf000() {return xffffffff().sll16(12);} + __forceinline static GSVector8i xf800() {return xffffffff().sll16(11);} + __forceinline static GSVector8i xfc00() {return xffffffff().sll16(10);} + __forceinline static GSVector8i xfe00() {return xffffffff().sll16( 9);} + __forceinline static GSVector8i xff00() {return xffffffff().sll16( 8);} + __forceinline static GSVector8i xff80() {return xffffffff().sll16( 7);} + __forceinline static GSVector8i xffc0() {return xffffffff().sll16( 6);} + __forceinline static GSVector8i xffe0() {return xffffffff().sll16( 5);} + __forceinline static GSVector8i xfff0() {return xffffffff().sll16( 4);} + __forceinline static GSVector8i xfff8() {return xffffffff().sll16( 3);} + __forceinline static GSVector8i xfffc() {return xffffffff().sll16( 2);} + __forceinline static GSVector8i xfffe() {return xffffffff().sll16( 1);} + + __forceinline static GSVector8i xffffffff(const GSVector8i& v) {return v == v;} + + __forceinline static GSVector8i x00000001(const GSVector8i& v) {return xffffffff(v).srl32(31);} + __forceinline static GSVector8i x00000003(const GSVector8i& v) {return xffffffff(v).srl32(30);} + __forceinline static GSVector8i x00000007(const GSVector8i& v) {return xffffffff(v).srl32(29);} + __forceinline static GSVector8i x0000000f(const GSVector8i& v) {return xffffffff(v).srl32(28);} + __forceinline static GSVector8i x0000001f(const GSVector8i& v) {return xffffffff(v).srl32(27);} + __forceinline static GSVector8i x0000003f(const GSVector8i& v) {return xffffffff(v).srl32(26);} + __forceinline static GSVector8i x0000007f(const GSVector8i& v) {return xffffffff(v).srl32(25);} + __forceinline static GSVector8i x000000ff(const GSVector8i& v) {return xffffffff(v).srl32(24);} + __forceinline static GSVector8i x000001ff(const GSVector8i& v) {return xffffffff(v).srl32(23);} + __forceinline static GSVector8i x000003ff(const GSVector8i& v) {return xffffffff(v).srl32(22);} + __forceinline static GSVector8i x000007ff(const GSVector8i& v) {return xffffffff(v).srl32(21);} + __forceinline static GSVector8i x00000fff(const GSVector8i& v) {return xffffffff(v).srl32(20);} + __forceinline static GSVector8i x00001fff(const GSVector8i& v) {return xffffffff(v).srl32(19);} + __forceinline static GSVector8i x00003fff(const GSVector8i& v) {return xffffffff(v).srl32(18);} + __forceinline static GSVector8i x00007fff(const GSVector8i& v) {return xffffffff(v).srl32(17);} + __forceinline static GSVector8i x0000ffff(const GSVector8i& v) {return xffffffff(v).srl32(16);} + __forceinline static GSVector8i x0001ffff(const GSVector8i& v) 
{return xffffffff(v).srl32(15);} + __forceinline static GSVector8i x0003ffff(const GSVector8i& v) {return xffffffff(v).srl32(14);} + __forceinline static GSVector8i x0007ffff(const GSVector8i& v) {return xffffffff(v).srl32(13);} + __forceinline static GSVector8i x000fffff(const GSVector8i& v) {return xffffffff(v).srl32(12);} + __forceinline static GSVector8i x001fffff(const GSVector8i& v) {return xffffffff(v).srl32(11);} + __forceinline static GSVector8i x003fffff(const GSVector8i& v) {return xffffffff(v).srl32(10);} + __forceinline static GSVector8i x007fffff(const GSVector8i& v) {return xffffffff(v).srl32( 9);} + __forceinline static GSVector8i x00ffffff(const GSVector8i& v) {return xffffffff(v).srl32( 8);} + __forceinline static GSVector8i x01ffffff(const GSVector8i& v) {return xffffffff(v).srl32( 7);} + __forceinline static GSVector8i x03ffffff(const GSVector8i& v) {return xffffffff(v).srl32( 6);} + __forceinline static GSVector8i x07ffffff(const GSVector8i& v) {return xffffffff(v).srl32( 5);} + __forceinline static GSVector8i x0fffffff(const GSVector8i& v) {return xffffffff(v).srl32( 4);} + __forceinline static GSVector8i x1fffffff(const GSVector8i& v) {return xffffffff(v).srl32( 3);} + __forceinline static GSVector8i x3fffffff(const GSVector8i& v) {return xffffffff(v).srl32( 2);} + __forceinline static GSVector8i x7fffffff(const GSVector8i& v) {return xffffffff(v).srl32( 1);} + + __forceinline static GSVector8i x80000000(const GSVector8i& v) {return xffffffff(v).sll32(31);} + __forceinline static GSVector8i xc0000000(const GSVector8i& v) {return xffffffff(v).sll32(30);} + __forceinline static GSVector8i xe0000000(const GSVector8i& v) {return xffffffff(v).sll32(29);} + __forceinline static GSVector8i xf0000000(const GSVector8i& v) {return xffffffff(v).sll32(28);} + __forceinline static GSVector8i xf8000000(const GSVector8i& v) {return xffffffff(v).sll32(27);} + __forceinline static GSVector8i xfc000000(const GSVector8i& v) {return xffffffff(v).sll32(26);} + __forceinline static GSVector8i xfe000000(const GSVector8i& v) {return xffffffff(v).sll32(25);} + __forceinline static GSVector8i xff000000(const GSVector8i& v) {return xffffffff(v).sll32(24);} + __forceinline static GSVector8i xff800000(const GSVector8i& v) {return xffffffff(v).sll32(23);} + __forceinline static GSVector8i xffc00000(const GSVector8i& v) {return xffffffff(v).sll32(22);} + __forceinline static GSVector8i xffe00000(const GSVector8i& v) {return xffffffff(v).sll32(21);} + __forceinline static GSVector8i xfff00000(const GSVector8i& v) {return xffffffff(v).sll32(20);} + __forceinline static GSVector8i xfff80000(const GSVector8i& v) {return xffffffff(v).sll32(19);} + __forceinline static GSVector8i xfffc0000(const GSVector8i& v) {return xffffffff(v).sll32(18);} + __forceinline static GSVector8i xfffe0000(const GSVector8i& v) {return xffffffff(v).sll32(17);} + __forceinline static GSVector8i xffff0000(const GSVector8i& v) {return xffffffff(v).sll32(16);} + __forceinline static GSVector8i xffff8000(const GSVector8i& v) {return xffffffff(v).sll32(15);} + __forceinline static GSVector8i xffffc000(const GSVector8i& v) {return xffffffff(v).sll32(14);} + __forceinline static GSVector8i xffffe000(const GSVector8i& v) {return xffffffff(v).sll32(13);} + __forceinline static GSVector8i xfffff000(const GSVector8i& v) {return xffffffff(v).sll32(12);} + __forceinline static GSVector8i xfffff800(const GSVector8i& v) {return xffffffff(v).sll32(11);} + __forceinline static GSVector8i xfffffc00(const GSVector8i& v) {return 
xffffffff(v).sll32(10);} + __forceinline static GSVector8i xfffffe00(const GSVector8i& v) {return xffffffff(v).sll32( 9);} + __forceinline static GSVector8i xffffff00(const GSVector8i& v) {return xffffffff(v).sll32( 8);} + __forceinline static GSVector8i xffffff80(const GSVector8i& v) {return xffffffff(v).sll32( 7);} + __forceinline static GSVector8i xffffffc0(const GSVector8i& v) {return xffffffff(v).sll32( 6);} + __forceinline static GSVector8i xffffffe0(const GSVector8i& v) {return xffffffff(v).sll32( 5);} + __forceinline static GSVector8i xfffffff0(const GSVector8i& v) {return xffffffff(v).sll32( 4);} + __forceinline static GSVector8i xfffffff8(const GSVector8i& v) {return xffffffff(v).sll32( 3);} + __forceinline static GSVector8i xfffffffc(const GSVector8i& v) {return xffffffff(v).sll32( 2);} + __forceinline static GSVector8i xfffffffe(const GSVector8i& v) {return xffffffff(v).sll32( 1);} + + __forceinline static GSVector8i x0001(const GSVector8i& v) {return xffffffff(v).srl16(15);} + __forceinline static GSVector8i x0003(const GSVector8i& v) {return xffffffff(v).srl16(14);} + __forceinline static GSVector8i x0007(const GSVector8i& v) {return xffffffff(v).srl16(13);} + __forceinline static GSVector8i x000f(const GSVector8i& v) {return xffffffff(v).srl16(12);} + __forceinline static GSVector8i x001f(const GSVector8i& v) {return xffffffff(v).srl16(11);} + __forceinline static GSVector8i x003f(const GSVector8i& v) {return xffffffff(v).srl16(10);} + __forceinline static GSVector8i x007f(const GSVector8i& v) {return xffffffff(v).srl16( 9);} + __forceinline static GSVector8i x00ff(const GSVector8i& v) {return xffffffff(v).srl16( 8);} + __forceinline static GSVector8i x01ff(const GSVector8i& v) {return xffffffff(v).srl16( 7);} + __forceinline static GSVector8i x03ff(const GSVector8i& v) {return xffffffff(v).srl16( 6);} + __forceinline static GSVector8i x07ff(const GSVector8i& v) {return xffffffff(v).srl16( 5);} + __forceinline static GSVector8i x0fff(const GSVector8i& v) {return xffffffff(v).srl16( 4);} + __forceinline static GSVector8i x1fff(const GSVector8i& v) {return xffffffff(v).srl16( 3);} + __forceinline static GSVector8i x3fff(const GSVector8i& v) {return xffffffff(v).srl16( 2);} + __forceinline static GSVector8i x7fff(const GSVector8i& v) {return xffffffff(v).srl16( 1);} + + __forceinline static GSVector8i x8000(const GSVector8i& v) {return xffffffff(v).sll16(15);} + __forceinline static GSVector8i xc000(const GSVector8i& v) {return xffffffff(v).sll16(14);} + __forceinline static GSVector8i xe000(const GSVector8i& v) {return xffffffff(v).sll16(13);} + __forceinline static GSVector8i xf000(const GSVector8i& v) {return xffffffff(v).sll16(12);} + __forceinline static GSVector8i xf800(const GSVector8i& v) {return xffffffff(v).sll16(11);} + __forceinline static GSVector8i xfc00(const GSVector8i& v) {return xffffffff(v).sll16(10);} + __forceinline static GSVector8i xfe00(const GSVector8i& v) {return xffffffff(v).sll16( 9);} + __forceinline static GSVector8i xff00(const GSVector8i& v) {return xffffffff(v).sll16( 8);} + __forceinline static GSVector8i xff80(const GSVector8i& v) {return xffffffff(v).sll16( 7);} + __forceinline static GSVector8i xffc0(const GSVector8i& v) {return xffffffff(v).sll16( 6);} + __forceinline static GSVector8i xffe0(const GSVector8i& v) {return xffffffff(v).sll16( 5);} + __forceinline static GSVector8i xfff0(const GSVector8i& v) {return xffffffff(v).sll16( 4);} + __forceinline static GSVector8i xfff8(const GSVector8i& v) {return xffffffff(v).sll16( 3);} + 
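// The x0001()..xfffe() and x00000001()..xfffffffe() helpers above never touch memory:
// they materialize an all-ones register by comparing a value with itself and then shift
// it into the wanted mask; the overloads taking a GSVector8i& apparently exist so the
// compiler can reuse a register that is already live instead of zeroing a fresh one.
// A minimal standalone sketch of the same idiom with raw AVX2 intrinsics (the helper
// names below are illustrative, not part of GSdx):

#include <immintrin.h>

static inline __m256i all_ones()
{
    __m256i z = _mm256_setzero_si256();
    return _mm256_cmpeq_epi32(z, z);          // 0xffffffff in every 32-bit lane
}

static inline __m256i mask_x000000ff()
{
    return _mm256_srli_epi32(all_ones(), 24); // logical right shift: 0x000000ff per lane
}

static inline __m256i mask_xffff0000()
{
    return _mm256_slli_epi32(all_ones(), 16); // left shift: 0xffff0000 per lane
}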
__forceinline static GSVector8i xfffc(const GSVector8i& v) {return xffffffff(v).sll16( 2);} + __forceinline static GSVector8i xfffe(const GSVector8i& v) {return xffffffff(v).sll16( 1);} + + __forceinline static GSVector8i xff(int n) {return m_xff[n];} + __forceinline static GSVector8i x0f(int n) {return m_x0f[n];} }; +#endif + +#if _M_SSE >= 0x500 + __aligned(class, 32) GSVector8 { public: @@ -3422,89 +4694,18 @@ public: uint16 u16[16]; uint32 u32[8]; uint64 u64[4]; - #if _M_SSE >= 0x500 __m256 m; __m128 m0, m1; - #else - __m128 m[2]; - #endif }; - __forceinline GSVector8() {} + static const GSVector8 m_one; + static const GSVector8 m_x7fffffff; + static const GSVector8 m_x80000000; - __forceinline explicit GSVector8(const GSVector8i& v); - - __forceinline static GSVector8 cast(const GSVector8i& v); - - __forceinline GSVector8 rcpnr() const + __forceinline GSVector8() { - GSVector8 v = rcp(); - - return (v + v) - (v * v) * *this; } - __forceinline GSVector8 floor() const - { - return round(); - } - - __forceinline GSVector8 ceil() const - { - return round(); - } - - __forceinline GSVector8 operator - () const - { - return neg(); - } - - __forceinline void operator += (float f) - { - *this += GSVector8(f); - } - - __forceinline void operator -= (float f) - { - *this -= GSVector8(f); - } - - __forceinline void operator *= (float f) - { - *this *= GSVector8(f); - } - - __forceinline void operator /= (float f) - { - *this /= GSVector8(f); - } - - __forceinline friend GSVector8 operator + (const GSVector8& v, float f) - { - return v + GSVector8(f); - } - - __forceinline friend GSVector8 operator - (const GSVector8& v, float f) - { - return v - GSVector8(f); - } - - __forceinline friend GSVector8 operator * (const GSVector8& v, float f) - { - return v * GSVector8(f); - } - - __forceinline friend GSVector8 operator / (const GSVector8& v, float f) - { - return v / GSVector8(f); - } - - __forceinline static GSVector8 xffffffff() - { - return zero() == zero(); - } - - #if _M_SSE >= 0x500 - __forceinline GSVector8(float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1) { m = _mm256_set_ps(w1, z1, y1, x1, w0, z0, y0, x0); @@ -3517,7 +4718,7 @@ public: __forceinline GSVector8(__m128 m0, __m128 m1) { - // FIXME: MSVC bug, _mm256_castps128_ps256 may directy reload spilled regs from unaligned memory with vmovaps + // FIXME: MSVC bug, _mm256_castps128_ps256 may directy reload spilled regs from unaligned memory with vmovaps (in vs2012 they simply changed it to vmovups, still can't keep the second xmm in a register) // m = _mm256_permute2f128_ps(_mm256_castps128_ps256(m0), _mm256_castps128_ps256(m1), 0x20); @@ -3536,12 +4737,18 @@ public: __forceinline explicit GSVector8(__m128 m) { + #if _MSC_VER >= 1700 + // FIXME: MSVC bug, _mm256_castps128_ps256 may directy reload spilled regs from unaligned memory with vmovaps - // this->m = _mm256_castps128_ps256(m); - // this->m = _mm256_permute2f128_ps(this->m, this->m, 0); + this->m = _mm256_castps128_ps256(m); + this->m = _mm256_permute2f128_ps(this->m, this->m, 0); + + #else this->m = zero().insert<0>(m).xx(); + + #endif } __forceinline explicit GSVector8(__m256 m) @@ -3549,6 +4756,17 @@ public: this->m = m; } + #if _M_SSE >= 0x501 + + __forceinline explicit GSVector8(const GSVector8i& v); + + __forceinline static GSVector8 cast(const GSVector8i& v); + + #endif + + __forceinline static GSVector8 cast(const GSVector4& v); + __forceinline static GSVector8 cast(const GSVector4i& v); + __forceinline void operator = (const GSVector8& v) { m = 
v.m; @@ -3561,8 +4779,18 @@ public: __forceinline void operator = (__m128 m) { + #if _MSC_VER >= 1700 + + // FIXME: MSVC bug, _mm256_castps128_ps256 may directy reload spilled regs from unaligned memory with vmovaps + this->m = _mm256_castps128_ps256(m); this->m = _mm256_permute2f128_ps(this->m, this->m, 0); + + #else + + this->m = zero().insert<0>(m).xx(); + + #endif } __forceinline void operator = (__m256 m) @@ -3577,12 +4805,28 @@ public: __forceinline GSVector8 abs() const { - return *this & cast(GSVector8i(GSVector4i::x7fffffff())); // TODO: add GSVector8 consts + #if _M_SSE >= 0x501 + + return *this & cast(GSVector8i::x7fffffff()); + + #else + + return *this & m_x7fffffff; + + #endif } __forceinline GSVector8 neg() const { - return *this ^ cast(GSVector8i(GSVector4i::x80000000())); + #if _M_SSE >= 0x501 + + return *this ^ cast(GSVector8i::x80000000()); + + #else + + return *this ^ m_x80000000; + + #endif } __forceinline GSVector8 rcp() const @@ -3590,12 +4834,182 @@ public: return GSVector8(_mm256_rcp_ps(m)); } + __forceinline GSVector8 rcpnr() const + { + GSVector8 v = rcp(); + + return (v + v) - (v * v) * *this; + } + template __forceinline GSVector8 round() const { return GSVector8(_mm256_round_ps(m, mode)); } - // TODO + __forceinline GSVector8 floor() const + { + return round(); + } + + __forceinline GSVector8 ceil() const + { + return round(); + } + + #if _M_SSE >= 0x501 + + #define LOG8_POLY0(x, c0) GSVector8(c0) + #define LOG8_POLY1(x, c0, c1) (LOG8_POLY0(x, c1).madd(x, GSVector8(c0))) + #define LOG8_POLY2(x, c0, c1, c2) (LOG8_POLY1(x, c1, c2).madd(x, GSVector8(c0))) + #define LOG8_POLY3(x, c0, c1, c2, c3) (LOG8_POLY2(x, c1, c2, c3).madd(x, GSVector8(c0))) + #define LOG8_POLY4(x, c0, c1, c2, c3, c4) (LOG8_POLY3(x, c1, c2, c3, c4).madd(x, GSVector8(c0))) + #define LOG8_POLY5(x, c0, c1, c2, c3, c4, c5) (LOG8_POLY4(x, c1, c2, c3, c4, c5).madd(x, GSVector8(c0))) + + __forceinline GSVector8 log2(int precision = 5) const + { + // NOTE: see GSVector4::log2 + + GSVector8 one = m_one; + + GSVector8i i = GSVector8i::cast(*this); + + GSVector8 e = GSVector8(((i << 1) >> 24) - GSVector8i::x0000007f()); + GSVector8 m = GSVector8::cast((i << 9) >> 9) | one; + + GSVector8 p; + + switch(precision) + { + case 3: + p = LOG8_POLY2(m, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + break; + case 4: + p = LOG8_POLY3(m, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + break; + default: + case 5: + p = LOG8_POLY4(m, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + break; + case 6: + p = LOG8_POLY5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + break; + } + + // This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 + + p = p * (m - one); + + return p + e; + } + + #endif + + __forceinline GSVector8 madd(const GSVector8& a, const GSVector8& b) const + { + #if 0//_M_SSE >= 0x501 + + return GSVector8(_mm256_fmadd_ps(m, a, b)); + + #else + + return *this * a + b; + + #endif + } + + __forceinline GSVector8 msub(const GSVector8& a, const GSVector8& b) const + { + #if 0//_M_SSE >= 0x501 + + return GSVector8(_mm256_fmsub_ps(m, a, b)); + + #else + + return *this * a - b; + + #endif + } + + __forceinline GSVector8 nmadd(const GSVector8& a, const GSVector8& b) const + { + #if 0//_M_SSE >= 0x501 + + return GSVector8(_mm256_fnmadd_ps(m, a, b)); + + #else + + return b - 
*this * a; + + #endif + } + + __forceinline GSVector8 nmsub(const GSVector8& a, const GSVector8& b) const + { + #if 0//_M_SSE >= 0x501 + + return GSVector8(_mm256_fnmsub_ps(m, a, b)); + + #else + + return -b - *this * a; + + #endif + } + + __forceinline GSVector8 addm(const GSVector8& a, const GSVector8& b) const + { + return a.madd(b, *this); // *this + a * b + } + + __forceinline GSVector8 subm(const GSVector8& a, const GSVector8& b) const + { + return a.nmadd(b, *this); // *this - a * b + } + + __forceinline GSVector8 hadd() const + { + return GSVector8(_mm256_hadd_ps(m, m)); + } + + __forceinline GSVector8 hadd(const GSVector8& v) const + { + return GSVector8(_mm256_hadd_ps(m, v.m)); + } + + __forceinline GSVector8 hsub() const + { + return GSVector8(_mm256_hsub_ps(m, m)); + } + + __forceinline GSVector8 hsub(const GSVector8& v) const + { + return GSVector8(_mm256_hsub_ps(m, v.m)); + } + + template __forceinline GSVector8 dp(const GSVector8& v) const + { + return GSVector8(_mm256_dp_ps(m, v.m, i)); + } + + __forceinline GSVector8 sat(const GSVector8& a, const GSVector8& b) const + { + return GSVector8(_mm256_min_ps(_mm256_max_ps(m, a), b)); + } + + __forceinline GSVector8 sat(const GSVector8& a) const + { + return GSVector8(_mm256_min_ps(_mm256_max_ps(m, a.xyxy()), a.zwzw())); + } + + __forceinline GSVector8 sat(const float scale = 255) const + { + return sat(zero(), GSVector8(scale)); + } + + __forceinline GSVector8 clamp(const float scale = 255) const + { + return min(GSVector8(scale)); + } __forceinline GSVector8 min(const GSVector8& a) const { @@ -3637,16 +5051,14 @@ public: return GSVector8(_mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(m), _mm256_castps_pd(a)))); } - // TODO - __forceinline GSVector8 l2h() const { - return GSVector8(_mm256_shuffle_ps(m, m, 0x88)); + return xyxy(); } __forceinline GSVector8 h2l() const { - return GSVector8(_mm256_shuffle_ps(m, m, 0x22)); + return zwzw(); } __forceinline GSVector8 andnot(const GSVector8& v) const @@ -3668,30 +5080,27 @@ public: { return _mm256_testz_ps(m, m) != 0; } - - template __forceinline GSVector4 extract() const - { - return GSVector4(_mm256_extractf128_ps(m, i)); - } + + // TODO: 32-bit insert/extract template __forceinline GSVector8 insert(__m128 m) const { return GSVector8(_mm256_insertf128_ps(this->m, m, i)); } + template __forceinline GSVector4 extract() const + { + return GSVector4(_mm256_extractf128_ps(m, i)); + } + __forceinline static GSVector8 zero() { return GSVector8(_mm256_setzero_ps()); } - __forceinline static void zeroupper() + __forceinline static GSVector8 xffffffff() { - _mm256_zeroupper(); - } - - __forceinline static void zeroall() - { - _mm256_zeroall(); + return zero() == zero(); } // TODO: load low, ss @@ -3709,7 +5118,24 @@ public: else _mm256_storeu_ps((float*)p, v.m); } - // TODO + // + + __forceinline static void zeroupper() + { + _mm256_zeroupper(); + } + + __forceinline static void zeroall() + { + _mm256_zeroall(); + } + + // + + __forceinline GSVector8 operator - () const + { + return neg(); + } __forceinline void operator += (const GSVector8& v) { @@ -3731,6 +5157,26 @@ public: m = _mm256_div_ps(m, v); } + __forceinline void operator += (float f) + { + *this += GSVector8(f); + } + + __forceinline void operator -= (float f) + { + *this -= GSVector8(f); + } + + __forceinline void operator *= (float f) + { + *this *= GSVector8(f); + } + + __forceinline void operator /= (float f) + { + *this /= GSVector8(f); + } + __forceinline void operator &= (const GSVector8& v) { m = _mm256_and_ps(m, v); 
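// madd()/msub()/nmadd()/nmsub() above are currently emulated with a separate multiply
// and add; the "#if 0//_M_SSE >= 0x501" branches show the fused FMA3 calls they are
// meant to become once that path is switched on. A minimal sketch of the difference,
// assuming a compiler and CPU that expose FMA3 (which this patch does not yet enable):

#include <immintrin.h>

static inline __m256 madd_emulated(__m256 a, __m256 b, __m256 c)
{
    return _mm256_add_ps(_mm256_mul_ps(a, b), c); // two instructions, two roundings
}

static inline __m256 madd_fused(__m256 a, __m256 b, __m256 c)
{
    return _mm256_fmadd_ps(a, b, c);              // one instruction, a single rounding
}

// The fused form rounds only once, so results can differ from the emulated form in the
// last bit, which is presumably why the switch stays behind a compile-time guard.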
@@ -3766,6 +5212,26 @@ public: return GSVector8(_mm256_div_ps(v1, v2)); } + __forceinline friend GSVector8 operator + (const GSVector8& v, float f) + { + return v + GSVector8(f); + } + + __forceinline friend GSVector8 operator - (const GSVector8& v, float f) + { + return v - GSVector8(f); + } + + __forceinline friend GSVector8 operator * (const GSVector8& v, float f) + { + return v * GSVector8(f); + } + + __forceinline friend GSVector8 operator / (const GSVector8& v, float f) + { + return v / GSVector8(f); + } + __forceinline friend GSVector8 operator & (const GSVector8& v1, const GSVector8& v2) { return GSVector8(_mm256_and_ps(v1, v2)); @@ -3811,385 +5277,15 @@ public: return GSVector8(_mm256_cmp_ps(v1, v2, _CMP_LE_OQ)); } - #define VECTOR8_PERMUTE_2(xs, xn, ys, yn) \ - __forceinline GSVector8 xs##ys() const {return GSVector8(_mm256_permute2f128_ps(m, m, xn | (yn << 4)));} \ - __forceinline GSVector8 xs##ys(const GSVector8& v) const {return GSVector8(_mm256_permute2f128_ps(m, v.m, xn | (yn << 4)));} \ + // x = v[31:0] / v[159:128] + // y = v[63:32] / v[191:160] + // z = v[95:64] / v[223:192] + // w = v[127:96] / v[255:224] #define VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ __forceinline GSVector8 xs##ys##zs##ws() const {return GSVector8(_mm256_permute_ps(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ __forceinline GSVector8 xs##ys##zs##ws(const GSVector8& v) const {return GSVector8(_mm256_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ - #else - - __forceinline GSVector8(float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1) - { - m[0] = _mm_set_ps(w0, z0, y0, x0); - m[1] = _mm_set_ps(w1, z1, y1, x1); - } - - __forceinline GSVector8(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1) - { - m[0] = _mm_cvtepi32_ps(_mm_set_epi32(w0, z0, y0, x0)); - m[1] = _mm_cvtepi32_ps(_mm_set_epi32(w1, z1, y1, x1)); - } - - __forceinline GSVector8(__m128 m0, __m128 m1) - { - m[0] = m0; - m[1] = m1; - } - - __forceinline GSVector8(const GSVector8& v) - { - m[0] = v.m[0]; - m[1] = v.m[1]; - } - - __forceinline explicit GSVector8(float f) - { - m[0] = m[1] = _mm_set1_ps(f); - } - - __forceinline explicit GSVector8(__m128 m) - { - this->m[0] = this->m[1] = m; - } - - __forceinline void operator = (const GSVector8& v) - { - m[0] = v.m[0]; - m[1] = v.m[1]; - } - - __forceinline void operator = (float f) - { - m[0] = m[1] = _mm_set1_ps(f); - } - - __forceinline void operator = (__m128 m) - { - this->m[0] = this->m[1] = m; - } - - __forceinline GSVector8 abs() const - { - GSVector4 mask = GSVector4::cast(GSVector4i::x7fffffff()); - - return GSVector8(_mm_and_ps(m[0], mask), _mm_and_ps(m[1], mask)); - } - - __forceinline GSVector8 neg() const - { - GSVector4 mask = GSVector4::cast(GSVector4i::x80000000()); - - return GSVector8(_mm_xor_ps(m[0], mask), _mm_xor_ps(m[1], mask)); - } - - __forceinline GSVector8 rcp() const - { - return GSVector8(_mm_rcp_ps(m[0]), _mm_rcp_ps(m[1])); - } - - template __forceinline GSVector8 round() const - { - return GSVector8(GSVector4(m[0]).round(), GSVector4(m[1]).round()); - } - - // TODO - - __forceinline GSVector8 min(const GSVector8& a) const - { - return GSVector8(_mm_min_ps(m[0], a.m[0]), _mm_min_ps(m[1], a.m[1])); - } - - __forceinline GSVector8 max(const GSVector8& a) const - { - return GSVector8(_mm_max_ps(m[0], a.m[0]), _mm_max_ps(m[1], a.m[1])); - } - - #if _M_SSE >= 0x401 - - template __forceinline GSVector8 blend32(const GSVector8& a) const - { - return GSVector8(_mm_blend_ps(m[0], a.m[0], mask & 0x0f), _mm_blend_ps(m[1], a.m[1], 
(mask >> 4) & 0x0f)); - } - - __forceinline GSVector8 blend32(const GSVector8& a, const GSVector8& mask) const - { - return GSVector8(_mm_blendv_ps(m[0], a.m[0], mask.m[0]), _mm_blendv_ps(m[1], a.m[1], mask.m[1])); - } - - #endif - - __forceinline GSVector8 upl(const GSVector8& a) const - { - return GSVector8(_mm_unpacklo_ps(m[0], a.m[0]), _mm_unpacklo_ps(m[1], a.m[1])); - } - - __forceinline GSVector8 uph(const GSVector8& a) const - { - return GSVector8(_mm_unpackhi_ps(m[0], a.m[0]), _mm_unpackhi_ps(m[1], a.m[1])); - } - - __forceinline GSVector8 upl64(const GSVector8& a) const - { - return GSVector8( - _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(m[0]), _mm_castps_pd(a.m[0]))), - _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(m[1]), _mm_castps_pd(a.m[1]))) - ); - } - - __forceinline GSVector8 uph64(const GSVector8& a) const - { - return GSVector8( - _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(m[0]), _mm_castps_pd(a.m[0]))), - _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(m[1]), _mm_castps_pd(a.m[1]))) - ); - } - - // TODO - - __forceinline GSVector8 l2h() const - { - return GSVector8(_mm_movelh_ps(m[0], m[0]), _mm_movelh_ps(m[1], m[1])); - } - - __forceinline GSVector8 h2l() const - { - return GSVector8(_mm_movehl_ps(m[0], m[0]), _mm_movehl_ps(m[1], m[1])); - } - - __forceinline GSVector8 andnot(const GSVector8& v) const - { - return GSVector8(_mm_andnot_ps(v.m[0], m[0]), _mm_andnot_ps(v.m[1], m[1])); - } - - __forceinline int mask() const - { - return _mm_movemask_ps(m[0]) | (_mm_movemask_ps(m[1]) << 4); - } - - __forceinline bool alltrue() const - { - return mask() == 0xff; - } - - __forceinline bool allfalse() const - { - #if _M_SSE >= 0x500 - - return (_mm_testz_ps(m[0], m[0]) & _mm_testz_ps(m[1], m[1])) != 0; - - #elif _M_SSE >= 0x401 - - __m128i a = _mm_castps_si128(m[0]); - __m128i b = _mm_castps_si128(m[1]); - - return (_mm_testz_si128(a, a) & _mm_testz_si128(b, b)) != 0; - - #else - - return mask() == 0; - - #endif - } - - template __forceinline GSVector4 extract() const - { - return GSVector4(m[i]); - } - - template __forceinline GSVector8 insert(__m128 m) const - { - return GSVector8(i == 0 ? m : this->m[0], i == 1 ? m : this->m[1]); - } - - __forceinline static GSVector8 zero() - { - return GSVector8(_mm_setzero_ps(), _mm_setzero_ps()); - } - - __forceinline static void zeroupper() - { - // N/A - } - - __forceinline static void zeroall() - { - // N/A - } - - // TODO: load low, ss - - template __forceinline static GSVector8 load(const void* p) - { - return GSVector8( - aligned ? _mm_load_ps((const float*)p + 0) : _mm_loadu_ps((const float*)p + 0), - aligned ? 
_mm_load_ps((const float*)p + 4) : _mm_loadu_ps((const float*)p + 4) - ); - } - - // TODO: store low, ss - - template __forceinline static void store(void* p, const GSVector8& v) - { - if(aligned) - { - _mm_store_ps((float*)p + 0, v.m[0]); - _mm_store_ps((float*)p + 4, v.m[1]); - } - else - { - _mm_storeu_ps((float*)p + 0, v.m[0]); - _mm_storeu_ps((float*)p + 4, v.m[1]); - } - } - - // TODO - - __forceinline void operator += (const GSVector8& v) - { - m[0] = _mm_add_ps(m[0], v.m[0]); - m[1] = _mm_add_ps(m[1], v.m[1]); - } - - __forceinline void operator -= (const GSVector8& v) - { - m[0] = _mm_sub_ps(m[0], v.m[0]); - m[1] = _mm_sub_ps(m[1], v.m[1]); - } - - __forceinline void operator *= (const GSVector8& v) - { - m[0] = _mm_mul_ps(m[0], v.m[0]); - m[1] = _mm_mul_ps(m[1], v.m[1]); - } - - __forceinline void operator /= (const GSVector8& v) - { - m[0] = _mm_div_ps(m[0], v.m[0]); - m[1] = _mm_div_ps(m[1], v.m[1]); - } - - __forceinline void operator &= (const GSVector8& v) - { - m[0] = _mm_and_ps(m[0], v.m[0]); - m[1] = _mm_and_ps(m[1], v.m[1]); - } - - __forceinline void operator |= (const GSVector8& v) - { - m[0] = _mm_or_ps(m[0], v.m[0]); - m[1] = _mm_or_ps(m[1], v.m[1]); - } - - __forceinline void operator ^= (const GSVector8& v) - { - m[0] = _mm_xor_ps(m[0], v.m[0]); - m[1] = _mm_xor_ps(m[1], v.m[1]); - } - - __forceinline friend GSVector8 operator + (const GSVector8& v1, const GSVector8& v2) - { - return GSVector8(_mm_add_ps(v1.m[0], v2.m[0]), _mm_add_ps(v1.m[1], v2.m[1])); - } - - __forceinline friend GSVector8 operator - (const GSVector8& v1, const GSVector8& v2) - { - return GSVector8(_mm_sub_ps(v1.m[0], v2.m[0]), _mm_sub_ps(v1.m[1], v2.m[1])); - } - - __forceinline friend GSVector8 operator * (const GSVector8& v1, const GSVector8& v2) - { - return GSVector8(_mm_mul_ps(v1.m[0], v2.m[0]), _mm_mul_ps(v1.m[1], v2.m[1])); - } - - __forceinline friend GSVector8 operator / (const GSVector8& v1, const GSVector8& v2) - { - return GSVector8(_mm_div_ps(v1.m[0], v2.m[0]), _mm_div_ps(v1.m[1], v2.m[1])); - } - - __forceinline friend GSVector8 operator & (const GSVector8& v1, const GSVector8& v2) - { - return GSVector8(_mm_and_ps(v1.m[0], v2.m[0]), _mm_and_ps(v1.m[1], v2.m[1])); - } - - __forceinline friend GSVector8 operator | (const GSVector8& v1, const GSVector8& v2) - { - return GSVector8(_mm_or_ps(v1.m[0], v2.m[0]), _mm_or_ps(v1.m[1], v2.m[1])); - } - - __forceinline friend GSVector8 operator ^ (const GSVector8& v1, const GSVector8& v2) - { - return GSVector8(_mm_xor_ps(v1.m[0], v2.m[0]), _mm_xor_ps(v1.m[1], v2.m[1])); - } - - __forceinline friend GSVector8 operator == (const GSVector8& v1, const GSVector8& v2) - { - return GSVector8(_mm_cmpeq_ps(v1.m[0], v2.m[0]), _mm_cmpeq_ps(v1.m[1], v2.m[1])); - } - - __forceinline friend GSVector8 operator != (const GSVector8& v1, const GSVector8& v2) - { - return GSVector8(_mm_cmpneq_ps(v1.m[0], v2.m[0]), _mm_cmpeq_ps(v1.m[1], v2.m[1])); - } - - __forceinline friend GSVector8 operator > (const GSVector8& v1, const GSVector8& v2) - { - return GSVector8(_mm_cmpgt_ps(v1.m[0], v2.m[0]), _mm_cmpgt_ps(v1.m[1], v2.m[1])); - } - - __forceinline friend GSVector8 operator < (const GSVector8& v1, const GSVector8& v2) - { - return GSVector8(_mm_cmplt_ps(v1.m[0], v2.m[0]), _mm_cmplt_ps(v1.m[1], v2.m[1])); - } - - __forceinline friend GSVector8 operator >= (const GSVector8& v1, const GSVector8& v2) - { - return GSVector8(_mm_cmpge_ps(v1.m[0], v2.m[0]), _mm_cmpge_ps(v1.m[1], v2.m[1])); - } - - __forceinline friend GSVector8 operator <= (const GSVector8& v1, const 
GSVector8& v2) - { - return GSVector8(_mm_cmple_ps(v1.m[0], v2.m[0]), _mm_cmple_ps(v1.m[1], v2.m[1])); - } - - __forceinline static __m128 VECTOR8_SELECT(const GSVector8& v1, const GSVector8& v2, int n) - { - switch(n) - { - case 0: return v1.m[0]; - case 1: return v1.m[1]; - case 2: return v2.m[0]; - case 3: return v2.m[1]; - } - - return _mm_setzero_ps(); - } - - #define VECTOR8_PERMUTE_2(xs, xn, ys, yn) \ - __forceinline GSVector8 xs##ys() const {return GSVector8(VECTOR8_SELECT(*this, *this, xn), VECTOR8_SELECT(*this, *this, yn));} \ - __forceinline GSVector8 xs##ys(const GSVector8& v) const {return GSVector8(VECTOR8_SELECT(*this, v, xn), VECTOR8_SELECT(*this, v, yn));} \ - - #define VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ - __forceinline GSVector8 xs##ys##zs##ws() const {return GSVector8(_mm_shuffle_ps(m[0], m[0], _MM_SHUFFLE(wn, zn, yn, xn)), _mm_shuffle_ps(m[1], m[1], _MM_SHUFFLE(wn, zn, yn, xn)));} \ - __forceinline GSVector8 xs##ys##zs##ws(const GSVector8& v) const {return GSVector8(_mm_shuffle_ps(m[0], v.m[0], _MM_SHUFFLE(wn, zn, yn, xn)), _mm_shuffle_ps(m[1], v.m[1], _MM_SHUFFLE(wn, zn, yn, xn)));} \ - - #endif - - #define VECTOR8_PERMUTE_1(xs, xn) \ - VECTOR8_PERMUTE_2(xs, xn, x, 0) \ - VECTOR8_PERMUTE_2(xs, xn, y, 1) \ - VECTOR8_PERMUTE_2(xs, xn, z, 2) \ - VECTOR8_PERMUTE_2(xs, xn, w, 3) \ - VECTOR8_PERMUTE_2(xs, xn, _, 8) \ - - VECTOR8_PERMUTE_1(x, 0) - VECTOR8_PERMUTE_1(y, 1) - VECTOR8_PERMUTE_1(z, 2) - VECTOR8_PERMUTE_1(w, 3) - VECTOR8_PERMUTE_1(_, 8) - #define VECTOR8_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ @@ -4212,9 +5308,78 @@ public: VECTOR8_SHUFFLE_1(y, 1) VECTOR8_SHUFFLE_1(z, 2) VECTOR8_SHUFFLE_1(w, 3) + + // a = v0[127:0] + // b = v0[255:128] + // c = v1[127:0] + // d = v1[255:128] + // _ = 0 + + #define VECTOR8_PERMUTE128_2(as, an, bs, bn) \ + __forceinline GSVector8 as##bs() const {return GSVector8(_mm256_permute2f128_ps(m, m, an | (bn << 4)));} \ + __forceinline GSVector8 as##bs(const GSVector8& v) const {return GSVector8(_mm256_permute2f128_ps(m, v.m, an | (bn << 4)));} \ + + #define VECTOR8_PERMUTE128_1(as, an) \ + VECTOR8_PERMUTE128_2(as, an, a, 0) \ + VECTOR8_PERMUTE128_2(as, an, b, 1) \ + VECTOR8_PERMUTE128_2(as, an, c, 2) \ + VECTOR8_PERMUTE128_2(as, an, d, 3) \ + VECTOR8_PERMUTE128_2(as, an, _, 8) \ + + VECTOR8_PERMUTE128_1(a, 0) + VECTOR8_PERMUTE128_1(b, 1) + VECTOR8_PERMUTE128_1(c, 2) + VECTOR8_PERMUTE128_1(d, 3) + VECTOR8_PERMUTE128_1(_, 8) + + // a = v[63:0] + // b = v[127:64] + // c = v[191:128] + // d = v[255:192] + + #define VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, ds, dn) \ + __forceinline GSVector8 as##bs##cs##ds() const {return GSVector8(_mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(m), _MM_SHUFFLE(dn, cn, bn, an))));} \ + __forceinline GSVector8 as##bs##cs##ds(const GSVector8& v) const {return GSVector8(_mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(m), _MM_SHUFFLE(dn, cn, bn, an))));} \ + + #define VECTOR8_PERMUTE64_3(as, an, bs, bn, cs, cn) \ + VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, a, 0) \ + VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, b, 1) \ + VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, c, 2) \ + VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, d, 3) \ + + #define VECTOR8_PERMUTE64_2(as, an, bs, bn) \ + VECTOR8_PERMUTE64_3(as, an, bs, bn, a, 0) \ + VECTOR8_PERMUTE64_3(as, an, bs, bn, b, 1) \ + VECTOR8_PERMUTE64_3(as, an, bs, bn, c, 2) \ + VECTOR8_PERMUTE64_3(as, an, bs, bn, d, 3) \ + + #define 
@@ -4212,9 +5308,78 @@ public:
     VECTOR8_SHUFFLE_1(y, 1)
     VECTOR8_SHUFFLE_1(z, 2)
     VECTOR8_SHUFFLE_1(w, 3)
+
+    // a = v0[127:0]
+    // b = v0[255:128]
+    // c = v1[127:0]
+    // d = v1[255:128]
+    // _ = 0
+
+    #define VECTOR8_PERMUTE128_2(as, an, bs, bn) \
+        __forceinline GSVector8 as##bs() const {return GSVector8(_mm256_permute2f128_ps(m, m, an | (bn << 4)));} \
+        __forceinline GSVector8 as##bs(const GSVector8& v) const {return GSVector8(_mm256_permute2f128_ps(m, v.m, an | (bn << 4)));} \
+
+    #define VECTOR8_PERMUTE128_1(as, an) \
+        VECTOR8_PERMUTE128_2(as, an, a, 0) \
+        VECTOR8_PERMUTE128_2(as, an, b, 1) \
+        VECTOR8_PERMUTE128_2(as, an, c, 2) \
+        VECTOR8_PERMUTE128_2(as, an, d, 3) \
+        VECTOR8_PERMUTE128_2(as, an, _, 8) \
+
+    VECTOR8_PERMUTE128_1(a, 0)
+    VECTOR8_PERMUTE128_1(b, 1)
+    VECTOR8_PERMUTE128_1(c, 2)
+    VECTOR8_PERMUTE128_1(d, 3)
+    VECTOR8_PERMUTE128_1(_, 8)
+
+    // a = v[63:0]
+    // b = v[127:64]
+    // c = v[191:128]
+    // d = v[255:192]
+
+    #define VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, ds, dn) \
+        __forceinline GSVector8 as##bs##cs##ds() const {return GSVector8(_mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(m), _MM_SHUFFLE(dn, cn, bn, an))));} \
+        __forceinline GSVector8 as##bs##cs##ds(const GSVector8& v) const {return GSVector8(_mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(m), _MM_SHUFFLE(dn, cn, bn, an))));} \
+
+    #define VECTOR8_PERMUTE64_3(as, an, bs, bn, cs, cn) \
+        VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, a, 0) \
+        VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, b, 1) \
+        VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, c, 2) \
+        VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, d, 3) \
+
+    #define VECTOR8_PERMUTE64_2(as, an, bs, bn) \
+        VECTOR8_PERMUTE64_3(as, an, bs, bn, a, 0) \
+        VECTOR8_PERMUTE64_3(as, an, bs, bn, b, 1) \
+        VECTOR8_PERMUTE64_3(as, an, bs, bn, c, 2) \
+        VECTOR8_PERMUTE64_3(as, an, bs, bn, d, 3) \
+
+    #define VECTOR8_PERMUTE64_1(as, an) \
+        VECTOR8_PERMUTE64_2(as, an, a, 0) \
+        VECTOR8_PERMUTE64_2(as, an, b, 1) \
+        VECTOR8_PERMUTE64_2(as, an, c, 2) \
+        VECTOR8_PERMUTE64_2(as, an, d, 3) \
+
+    VECTOR8_PERMUTE64_1(a, 0)
+    VECTOR8_PERMUTE64_1(b, 1)
+    VECTOR8_PERMUTE64_1(c, 2)
+    VECTOR8_PERMUTE64_1(d, 3)
 };
 
-#if _M_SSE >= 0x500
+#endif
+
+// conversion
+
+__forceinline GSVector4i::GSVector4i(const GSVector4& v, bool truncate)
+{
+    m = truncate ? _mm_cvttps_epi32(v) : _mm_cvtps_epi32(v);
+}
+
+__forceinline GSVector4::GSVector4(const GSVector4i& v)
+{
+    m = _mm_cvtepi32_ps(v);
+}
+
+#if _M_SSE >= 0x501
 
 __forceinline GSVector8i::GSVector8i(const GSVector8& v, bool truncate)
 {
@@ -4226,6 +5391,66 @@ __forceinline GSVector8::GSVector8(const GSVector8i& v)
     m = _mm256_cvtepi32_ps(v);
 }
 
+#endif
+
+// casting
+
+__forceinline GSVector4i GSVector4i::cast(const GSVector4& v)
+{
+    return GSVector4i(_mm_castps_si128(v.m));
+}
+
+__forceinline GSVector4 GSVector4::cast(const GSVector4i& v)
+{
+    return GSVector4(_mm_castsi128_ps(v.m));
+}
+
+#if _M_SSE >= 0x500
+
+__forceinline GSVector4i GSVector4i::cast(const GSVector8& v)
+{
+    return GSVector4i(_mm_castps_si128(_mm256_castps256_ps128(v)));
+}
+
+__forceinline GSVector4 GSVector4::cast(const GSVector8& v)
+{
+    return GSVector4(_mm256_castps256_ps128(v));
+}
+
+__forceinline GSVector8 GSVector8::cast(const GSVector4i& v)
+{
+    return GSVector8(_mm256_castps128_ps256(_mm_castsi128_ps(v.m)));
+}
+
+__forceinline GSVector8 GSVector8::cast(const GSVector4& v)
+{
+    return GSVector8(_mm256_castps128_ps256(v.m));
+}
+
+#endif
+
+#if _M_SSE >= 0x501
+
+__forceinline GSVector4i GSVector4i::cast(const GSVector8i& v)
+{
+    return GSVector4i(_mm256_castsi256_si128(v));
+}
+
+__forceinline GSVector4 GSVector4::cast(const GSVector8i& v)
+{
+    return GSVector4(_mm_castsi128_ps(_mm256_castsi256_si128(v)));
+}
+
+__forceinline GSVector8i GSVector8i::cast(const GSVector4i& v)
+{
+    return GSVector8i(_mm256_castsi128_si256(v.m));
+}
+
+__forceinline GSVector8i GSVector8i::cast(const GSVector4& v)
+{
+    return GSVector8i(_mm256_castsi128_si256(_mm_castps_si128(v.m)));
+}
+
 __forceinline GSVector8i GSVector8i::cast(const GSVector8& v)
 {
     return GSVector8i(_mm256_castps_si256(v.m));
@@ -4236,40 +5461,6 @@ __forceinline GSVector8 GSVector8::cast(const GSVector8i& v)
     return GSVector8(_mm256_castsi256_ps(v.m));
 }
 
-#else
-
-__forceinline GSVector8i::GSVector8i(const GSVector8& v, bool truncate)
-{
-    m[0] = truncate ? _mm_cvttps_epi32(v.m[0]) : _mm_cvtps_epi32(v.m[0]);
-    m[1] = truncate ? _mm_cvttps_epi32(v.m[1]) : _mm_cvtps_epi32(v.m[1]);
-}
-
-__forceinline GSVector8::GSVector8(const GSVector8i& v)
-{
-    m[0] = _mm_cvtepi32_ps(v.m[0]);
-    m[1] = _mm_cvtepi32_ps(v.m[1]);
-}
-
-__forceinline GSVector8i GSVector8i::cast(const GSVector8& v)
-{
-    GSVector8i v2;
-
-    v2.m[0] = _mm_castps_si128(v.m[0]);
-    v2.m[1] = _mm_castps_si128(v.m[1]);
-
-    return v2;
-}
-
-__forceinline GSVector8 GSVector8::cast(const GSVector8i& v)
-{
-    GSVector8 v2;
-
-    v2.m[0] = _mm_castsi128_ps(v.m[0]);
-    v2.m[1] = _mm_castsi128_ps(v.m[1]);
-
-    return v2;
-}
-
 #endif
 
 #pragma pack(pop)
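The VECTOR8_PERMUTE128_* accessors added above encode their lane selection directly into the control byte of _mm256_permute2f128_ps as an | (bn << 4), with 8 standing for "zero this lane". The following scalar reference model of that control byte is an illustration written for this note, not code taken from the patch:

// Reference model of the _mm256_permute2f128_ps control byte used by the macros.
struct lane128 { float f[4]; };  // one 128-bit lane, modelled as four floats

// n selects a source lane: 0 = v0 low, 1 = v0 high, 2 = v1 low, 3 = v1 high;
// if bit 3 is set (the '_' = 8 case in the macros) the lane is zeroed instead.
static lane128 select_lane(const lane128 src[4], int n)
{
    lane128 r = {};
    if((n & 8) == 0) r = src[n & 3];
    return r;
}

// permute2f128(v0, v1, ctrl): the low output lane is driven by ctrl[3:0],
// the high output lane by ctrl[7:4].
static void permute2f128_ref(const lane128 v0[2], const lane128 v1[2], int ctrl, lane128 out[2])
{
    const lane128 src[4] = {v0[0], v0[1], v1[0], v1[1]};

    out[0] = select_lane(src, ctrl & 0xf);
    out[1] = select_lane(src, (ctrl >> 4) & 0xf);
}

Under this model ba() builds ctrl = 1 | (0 << 4), i.e. the two 128-bit halves swapped, and a_() builds 0 | (8 << 4), i.e. the upper half cleared, which matches how the macro arguments are spelled. The VECTOR8_PERMUTE64_* accessors do the analogous job at 64-bit granularity via _mm256_permute4x64_pd, an AVX2-only instruction, which is why they sit behind the new _M_SSE level.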
diff --git a/plugins/GSdx/GSdx_vs11.vcxproj b/plugins/GSdx/GSdx_vs11.vcxproj
index c88ce7e607..dbf610f2c0 100644
--- a/plugins/GSdx/GSdx_vs11.vcxproj
+++ b/plugins/GSdx/GSdx_vs11.vcxproj
[The XML element markup of these hunks was stripped when the patch was flattened; only attribute-free text values and hunk markers survive. What the hunks add is mechanical: "Debug AVX2" and "Release AVX2" project configurations for Win32 and x64, property groups mirroring the existing AVX entries (DynamicLibrary, MultiByte, v110 toolset), the corresponding compiler/linker blocks (Use for precompiled headers, .\GSdx.def, MachineX86 and X64 targets, AssemblyAndSourceCode output), per-file ExcludedFromBuild/Create flags for the new configurations, and property-sheet imports for them, evidently referencing the avx2.props sheet introduced below.]
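The project-file changes above only register the new configurations; what actually selects a code path is the _M_SSE value each property sheet injects (0x500 for AVX, 0x501 for AVX2, as used by the GSVector.h guards and set in the .props hunks below). A minimal illustration of that gating, not taken from the source and using a hypothetical add8 helper, is:

#include <immintrin.h>

// _M_SSE is normally supplied by the build (avx.props / avx2.props); it is
// defaulted here only so the sketch is self-contained.
#ifndef _M_SSE
#define _M_SSE 0x500
#endif

#if _M_SSE >= 0x501
// AVX2 configuration: 256-bit integer operations (the GSVector8i paths) are available.
static __m256i add8(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }
#elif _M_SSE >= 0x500
// AVX configuration: 256-bit float only; integer SIMD stays 128 bits wide.
static __m128i add8(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
#endif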
diff --git a/plugins/GSdx/vsprops/avx.props b/plugins/GSdx/vsprops/avx.props
index b8e7d56274..a5c03035f9 100644
--- a/plugins/GSdx/vsprops/avx.props
+++ b/plugins/GSdx/vsprops/avx.props
@@ -10,6 +10,7 @@
[Markup stripped; the hunk keeps the existing _M_SSE=0x500;%(PreprocessorDefinitions) define and the /arch:AVX %(AdditionalOptions) switch and adds an AdvancedVectorExtensions enhanced-instruction-set setting.]
diff --git a/plugins/GSdx/vsprops/avx2.props b/plugins/GSdx/vsprops/avx2.props
new file mode 100644
index 0000000000..342494e98f
--- /dev/null
+++ b/plugins/GSdx/vsprops/avx2.props
@@ -0,0 +1,21 @@
[Markup stripped; the new 21-line sheet evidently defines a user macro with the value AVX2 (matching the $(SSEtype) build macro it exposes at the end), sets <_ProjectFileVersion>10.0.30128.1, and configures the compiler with _M_SSE=0x501;%(PreprocessorDefinitions), /arch:AVX %(AdditionalOptions) and AdvancedVectorExtensions.]
\ No newline at end of file
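Note that avx2.props still passes /arch:AVX while defining _M_SSE=0x501; the v110 toolset targeted here appears to predate an /arch:AVX2 switch, so the only guard against building an AVX2 configuration without AVX code generation is the preprocessor. A defensive check along the following lines could be added as a suggestion; it is not part of the patch and assumes the compiler defines __AVX__ under /arch:AVX, as recent MSVC versions do:

// Build-time consistency check (illustrative only): fail early if an AVX/AVX2
// configuration (_M_SSE >= 0x500) is compiled without AVX code generation.
// Assumes __AVX__ is defined when /arch:AVX (or -mavx) is in effect.
#if _M_SSE >= 0x500 && !defined(__AVX__)
    #error "AVX/AVX2 configuration selected but AVX code generation is not enabled"
#endif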