From 8d7edf440d0b4443b87e325a5a78c0a649080963 Mon Sep 17 00:00:00 2001 From: "gregory.hainaut@gmail.com" Date: Tue, 28 Sep 2010 19:36:23 +0000 Subject: [PATCH] GregMiscellaneous: zzogl-pg: Nice comment to decipher asm code. git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3851 96395faa-99c1-11dd-bbfe-3dabce05a288 --- plugins/zzogl-pg/opengl/x86-32.S | 110 ++++++++++++++++++++++++++----- 1 file changed, 95 insertions(+), 15 deletions(-) diff --git a/plugins/zzogl-pg/opengl/x86-32.S b/plugins/zzogl-pg/opengl/x86-32.S index 0707761167..728d203788 100644 --- a/plugins/zzogl-pg/opengl/x86-32.S +++ b/plugins/zzogl-pg/opengl/x86-32.S @@ -4,15 +4,15 @@ # it under the terms of the GNU General Public License as published by # the Free Software Foundation either ve%rsion 2, or (at your option) # any later ve%rsion. -# +# # This Program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with GNU Make see the file COPYING. If not, write to -# the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. +# the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. # http://www.gnu.org/copyleft/gpl.html # # @@ -20,6 +20,11 @@ #ifdef ZEROGS_SSE2 // SSE2 extensions + +// Note: pshufd 0xe4 <=> movdqa !!! 
+// What the function does is +// Interleave s1 and sd0 -> d1 (high) & sd0 (low) +// Interleave s3 and sd2 -> d3 (high) & sd2 (low) #define punpck(op, sd0, sd2, s1, s3, d1, d3) \ movdqa %xmm##d1, %xmm##sd0; \ pshufd %xmm##d3, %xmm##sd2, 0xe4; \ @@ -28,7 +33,16 @@ punpckl##op %xmm##sd2, %xmm##s3; \ punpckh##op %xmm##d3, %xmm##s3; \ - + +// Input xmm7 == 0x0F0F0F0F 0x0F0F0F0F 0x0F0F0F0F 0x0F0F0F0F +// DATA xmm[0-3] +// This function does a 4-bits interleaving of 4 xmm registers +// +// ARG Can not put comment in the middle of the define... +// After the first por +// low 32bits (4bits packed) == 1.6 0.6 1.4 0.4 1.2 0.2 1.0 0.0 +// After the second one +// low 32bits (4bits packed) == 1.7 0.7 1.5 0.5 1.3 0.3 1.1 0.1 #define punpcknb \ movdqa %xmm4, %xmm0; \ pshufd %xmm5, %xmm1, 0xe4; \ @@ -48,6 +62,7 @@ \ movdqa %xmm1, %xmm4; \ \ + \ movdqa %xmm4, %xmm2; \ pshufd %xmm5, %xmm3, 0xe4; \ \ @@ -66,7 +81,13 @@ \ movdqa %xmm3, %xmm4; \ \ - punpck(bw, 0, 2, 1, 3, 4, 6); \ + punpck(bw, 0, 2, 1, 3, 4, 6);\ + +// output +// low 32 bits 0 (4 bits packed) == 1.3 0.3 1.2 0.2 1.1 0.1 1.0 0.0 +// low 32 bits 4 (4 bits packed) == 1.19 0.19 1.18 0.18 1.17 0.17 1.16 0.16 +// low 32 bits 2 (4 bits packed) == 3.3 2.3 3.2 2.2 3.1 2.1 3.0 2.0 +// low 32 bits 6 (4 bits packed) == 3.19 2.19 3.18 2.18 3.17 2.17 3.16 2.16 // @@ -84,11 +105,15 @@ SwizzleBlock32_sse2: push %esi push %edi + // save dst mov %edi, %ecx + // save src mov %esi, %edx + // get pitch mov %edx, [%esp+4+8] mov %ecx, 4 + // get WriteMask mov %eax, [%esp+8+8] cmp %eax, 0xffffffff jne SwizzleBlock32_sse2_2 @@ -100,6 +125,8 @@ SwizzleBlock32_sse2_1: movdqa %xmm1, [%esi+%edx] movdqa %xmm5, [%esi+%edx+16] + // 64bits interleave 1&0 -> 2&0 + // 64bits interleave 5&4 -> 6&4 punpck(qdq, 0, 4, 1, 5, 2, 6) movntps [%edi+16*0], %xmm0 @@ -107,6 +134,7 @@ SwizzleBlock32_sse2_1: movntps [%edi+16*2], %xmm4 movntps [%edi+16*3], %xmm6 + // update ptr lea %esi, [%esi+%edx*2] add %edi, 64 @@ -120,9 +148,10 @@ SwizzleBlock32_sse2_1: 
SwizzleBlock32_sse2_2: + // WriteMask: 32bits to 4*32bits movd %xmm7, %eax pshufd %xmm7, %xmm7, 0 - + .align 16 SwizzleBlock32_sse2_3: movdqa %xmm0, [%esi] @@ -130,13 +159,19 @@ SwizzleBlock32_sse2_3: movdqa %xmm1, [%esi+%edx] movdqa %xmm5, [%esi+%edx+16] + // 64bits interleave 1&0 -> 2&0 + // 64bits interleave 5&4 -> 6&4 punpck(qdq, 0, 4, 1, 5, 2, 6) + // save a mask copy movdqa %xmm3, %xmm7 pshufd %xmm5, %xmm7, 0xe4 + // *dst & ~WriteMask pandn %xmm3, [%edi+16*0] + // *src & WriteMask pand %xmm0, %xmm7 + // Final value to save por %xmm0, %xmm3 movntps [%edi+16*0], %xmm0 @@ -158,6 +193,7 @@ SwizzleBlock32_sse2_3: por %xmm6, %xmm5 movntps [%edi+16*3], %xmm6 + // update ptr lea %esi, [%esi+%edx*2] add %edi, 64 @@ -179,6 +215,7 @@ SwizzleBlock16_sse2: push %ebx + // srcpitch mov %ebx, [%esp+4+4] mov %eax, 4 @@ -189,7 +226,11 @@ SwizzleBlock16_sse2_1: movdqa %xmm2, [%edx+%ebx] movdqa %xmm3, [%edx+%ebx+16] + // 16bits interleave 1&0 -> 4&0 + // 16bits interleave 3&2 -> 6&2 punpck(wd, 0, 2, 1, 3, 4, 6) + // 64bits interleave 2&0 -> 1&0 + // 64bits interleave 6&4 -> 5&4 punpck(qdq, 0, 4, 2, 6, 1, 5) movntps [%ecx+16*0], %xmm0 @@ -197,6 +238,7 @@ SwizzleBlock16_sse2_1: movntps [%ecx+16*2], %xmm4 movntps [%ecx+16*3], %xmm5 + // update ptr lea %edx, [%edx+%ebx*2] add %ecx, 64 @@ -217,7 +259,9 @@ SwizzleBlock8_sse2: push %ebx + // load srcpitch mov %ebx, [%esp+4+4] + // basic counter mov %eax, 2 .align 16 @@ -226,14 +270,23 @@ SwizzleBlock8_sse2_1: movdqa %xmm0, [%edx] movdqa %xmm2, [%edx+%ebx] + // update src pointer lea %edx, [%edx+%ebx*2] + // 2 3 0 1 pshufd %xmm1, [%edx], 0xb1 pshufd %xmm3, [%edx+%ebx], 0xb1 + // update src pointer lea %edx, [%edx+%ebx*2] + // 8bits interleave 1&0 -> 4&0 + // 8bits interleave 3&2 -> 6&2 punpck(bw, 0, 2, 1, 3, 4, 6) + // 16bits interleave 4&0 -> 1&0 + // 16bits interleave 6&2 -> 3&2 punpck(wd, 0, 2, 4, 6, 1, 3) + // 64bits interleave 2&0 -> 4&0 + // 64bits interleave 3&1 -> 5&1 punpck(qdq, 0, 1, 2, 3, 4, 5) movntps [%ecx+16*0], %xmm0 @@ 
-241,18 +294,27 @@ SwizzleBlock8_sse2_1: movntps [%ecx+16*2], %xmm1 movntps [%ecx+16*3], %xmm5 - // col 1, 3 + // col 1, 3 (same as previous column) + // 2 3 0 1 pshufd %xmm0, [%edx], 0xb1 pshufd %xmm2, [%edx+%ebx], 0xb1 + // update src pointer lea %edx, [%edx+%ebx*2] movdqa %xmm1, [%edx] movdqa %xmm3, [%edx+%ebx] + // update src pointer lea %edx, [%edx+%ebx*2] + // 8bits interleave 1&0 -> 4&0 + // 8bits interleave 3&2 -> 6&2 punpck(bw, 0, 2, 1, 3, 4, 6) + // 16bits interleave 4&0 -> 1&0 + // 16bits interleave 6&2 -> 3&2 punpck(wd, 0, 2, 4, 6, 1, 3) + // 64bits interleave 2&0 -> 4&0 + // 64bits interleave 3&1 -> 5&1 punpck(qdq, 0, 1, 2, 3, 4, 5) movntps [%ecx+16*4], %xmm0 @@ -260,6 +322,7 @@ SwizzleBlock8_sse2_1: movntps [%ecx+16*6], %xmm1 movntps [%ecx+16*7], %xmm5 + // update dst pointer add %ecx, 128 dec %eax @@ -278,11 +341,13 @@ SwizzleBlock8_sse2_1: SwizzleBlock4_sse2: push %ebx - + + // load 4 0x0F0F0F0F mov %eax, 0xf0f0f0f - movd %xmm7, %eax + movd %xmm7, %eax pshufd %xmm7, %xmm7, 0 + // load srcpitch mov %ebx, [%esp+4+4] mov %eax, 2 @@ -292,20 +357,32 @@ SwizzleBlock4_sse2_1: movdqa %xmm0, [%edx] movdqa %xmm2, [%edx+%ebx] + //update src pointer lea %edx, [%edx+%ebx*2] movdqa %xmm1, [%edx] movdqa %xmm3, [%edx+%ebx] + // update src pointer lea %edx, [%edx+%ebx*2] + // - - - - 2 3 0 1 pshuflw %xmm1, %xmm1, 0xb1 pshuflw %xmm3, %xmm3, 0xb1 + // 6 7 4 5 - - - - pshufhw %xmm1, %xmm1, 0xb1 pshufhw %xmm3, %xmm3, 0xb1 + // 4bits interleave 1&0 -> 4&0 + // 4bits interleave 3&2 -> 6&2 punpcknb + // 8bits interleave 4&0 -> 1&0 + // 8bits interleave 6&2 -> 3&2 punpck(bw, 0, 2, 4, 6, 1, 3) + // 8bits interleave 1&0 -> 4&0 + // 8bits interleave 3&2 -> 6&2 punpck(bw, 0, 2, 1, 3, 4, 6) + // 64bits interleave 2&0 -> 1&0 + // 64bits interleave 6&4 -> 3&4 punpck(qdq, 0, 4, 2, 6, 1, 3) movntps [%ecx+16*0], %xmm0 @@ -313,7 +390,7 @@ SwizzleBlock4_sse2_1: movntps [%ecx+16*2], %xmm4 movntps [%ecx+16*3], %xmm3 - // col 1, 3 + // col 1, 3 (same as previous column) movdqa %xmm0, 
 [%edx] movdqa %xmm2, [%edx+%ebx] @@ -349,6 +426,9 @@ SwizzleBlock4_sse2_1: // // swizzling with unaligned reads +// Same functions as above with movdqu instead of movdqa for the reads +// Movdqu is as fast as movdqa with aligned address... So do not bother, directly +// use movdqu // // @@ -400,7 +480,7 @@ SwizzleBlock32u_sse2_2: movd %xmm7, %eax pshufd %xmm7, %xmm7, 0 - + .align 16 SwizzleBlock32u_sse2_3: movdqu %xmm0, [%esi] @@ -480,7 +560,7 @@ SwizzleBlock16u_sse2_1: dec %eax jnz SwizzleBlock16u_sse2_1 - + pop %ebx ret 4 @@ -560,9 +640,9 @@ SwizzleBlock8u_sse2_1: SwizzleBlock4u_sse2: push %ebx - + mov %eax, 0xf0f0f0f - movd %xmm7, %eax + movd %xmm7, %eax pshufd %xmm7, %xmm7, 0 mov %ebx, [%esp+4+4] @@ -628,7 +708,7 @@ SwizzleBlock4u_sse2_1: pop %ebx ret 4 - + #endif #if defined(__linux__) && defined(__ELF__)