From cc2c86cf1182bf8cb0c19074692d1297aa493af5 Mon Sep 17 00:00:00 2001
From: zeromus <zeromus@users.sf.net>
Date: Sat, 13 Aug 2016 23:48:51 +0000
Subject: [PATCH] fix #1555 (regression in Kingdom Hearts Re:coded caused by
 r5440) by changing how wacky nearly-out-of-limits geometry is handled to a
 possibly more plausible mechanism

---
 desmume/src/gfx3d.cpp  | 25 ++++++++++++++++++-------
 desmume/src/matrix.cpp |  5 -----
 desmume/src/matrix.h   | 16 ----------------
 desmume/src/types.h    | 14 ++------------
 4 files changed, 20 insertions(+), 40 deletions(-)

diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp
index 5a6e669bb..9ed7786e0 100644
--- a/desmume/src/gfx3d.cpp
+++ b/desmume/src/gfx3d.cpp
@@ -1,6 +1,6 @@
 /*	
 	Copyright (C) 2006 yopyop
-	Copyright (C) 2008-2015 DeSmuME team
+	Copyright (C) 2008-2016 DeSmuME team
 
 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@@ -659,12 +659,23 @@ static void SetVertex()
 			return;
 	if(polylist->count >= POLYLIST_SIZE) 
 			return;
-	
-	//TODO - think about keeping the clip matrix concatenated,
-	//so that we only have to multiply one matrix here
-	//(we could lazy cache the concatenated clip matrix and only generate it
-	//when we need to)
-	MatrixMultVec4x4_M2(mtxCurrent[0], coordTransformed);
+
+	//games will definitely count on overflowing the matrix math
+	//scenarios to balance here:
+	//+ spectrobes beyond the portals excavation blower and drill tools: sets very large overflowing +x,+y in the modelview matrix to push things offscreen
+	//moreover, in some conditions there will be vertical glitched lines sometimes when drilling at the top center of the screen.
+	//+ kingdom hearts re-coded: first conversation with cast characters will place them oddly with something overflowing to about 0xA???????
+	//+ SM64: skybox
+	//+ TBD other things, probably, dragon quest worldmaps?
+	//At first I tried saturating the math elsewhere, but that couldn't fix all cases
+	//So after some fooling around, I found this nicely aesthetic way of balancing all the cases. I don't doubt that it's still inaccurate, however
+	//Note, if <<3 seems weird, it's reasonable if you assume the goal is to end up with 16 integer bits and a sign bit.
+	MatrixMultVec4x4(mtxCurrent[1],coordTransformed); //modelview
+	for(int i=0;i<4;i++) coordTransformed[i] = (((s32)coordTransformed[i])<<3>>3); //balances everything ok
+	//for(int i=0;i<4;i++) coordTransformed[i] = (((s32)coordTransformed[i])<<4>>4); //breaks SM64 skyboxes
+	//for(int i=0;i<4;i++) coordTransformed[i] = (((u32)coordTransformed[i])<<4>>4)|(((s32)(coordTransformed[i]&0x80000000))>>3); //another way generally to drop precision (but breaks spectrobes which does seem to need some kind of buggy wrap-around behaviour)
+	MatrixMultVec4x4(mtxCurrent[0],coordTransformed); //projection
+	for(int i=0;i<4;i++) coordTransformed[i] = (((s32)coordTransformed[i])<<3>>3); //no proof this is needed, but suspected to be similar based on above
 
 	//printf("%f %f %f\n",s16coord[0]/4096.0f,s16coord[1]/4096.0f,s16coord[2]/4096.0f);
 	//printf("x %f %f %f %f\n",mtxCurrent[0][0]/4096.0f,mtxCurrent[0][1]/4096.0f,mtxCurrent[0][2]/4096.0f,mtxCurrent[0][3]/4096.0f);
diff --git a/desmume/src/matrix.cpp b/desmume/src/matrix.cpp
index 666029bbf..35ecf271a 100644
--- a/desmume/src/matrix.cpp
+++ b/desmume/src/matrix.cpp
@@ -427,8 +427,3 @@ void MatrixTranslate(s32 *matrix, const s32 *ptr)
 	});
 }
 
-void MatrixMultVec4x4_M2(const s32 *matrix, s32 *vecPtr)
-{
-	MatrixMultVec4x4(matrix+16,vecPtr);
-	MatrixMultVec4x4(matrix,vecPtr);
-}
diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h
index 8aa87c2fe..d060a4d38 100644
--- a/desmume/src/matrix.h
+++ b/desmume/src/matrix.h
@@ -276,13 +276,6 @@ FORCEINLINE void MatrixMultVec4x4(const float *matrix, float *vecPtr)
 	_mm_store_ps(vecPtr,_util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(vecPtr)));
 }
 
-FORCEINLINE void MatrixMultVec4x4_M2(const float *matrix, float *vecPtr)
-{
-	//there are hardly any gains from merging these manually
-	MatrixMultVec4x4(matrix+16,vecPtr);
-	MatrixMultVec4x4(matrix,vecPtr);
-}
-
 FORCEINLINE void MatrixMultVec3x3(const float * matrix, float * vecPtr)
 {
 	const __m128 vec = _mm_load_ps(vecPtr);
@@ -355,13 +348,6 @@ void MatrixMultiply(float * matrix, const float * rightMatrix);
 void MatrixTranslate(float *matrix, const float *ptr);
 void MatrixScale(float * matrix, const float * ptr);
 
-FORCEINLINE void MatrixMultVec4x4_M2(const float *matrix, float *vecPtr)
-{
-	//there are hardly any gains from merging these manually
-	MatrixMultVec4x4(matrix+16,vecPtr);
-	MatrixMultVec4x4(matrix,vecPtr);
-}
-
 template<int NUM_ROWS>
 FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
 {
@@ -373,8 +359,6 @@ FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
 
 void MatrixMultVec4x4 (const s32 *matrix, s32 *vecPtr);
 
-void MatrixMultVec4x4_M2(const s32 *matrix, s32 *vecPtr);
-
 void MatrixMultiply(s32* matrix, const s32* rightMatrix);
 void MatrixScale(s32 *matrix, const s32 *ptr);
 void MatrixTranslate(s32 *matrix, const s32 *ptr);
diff --git a/desmume/src/types.h b/desmume/src/types.h
index ce794d5a7..56b225c3b 100644
--- a/desmume/src/types.h
+++ b/desmume/src/types.h
@@ -504,21 +504,11 @@ FORCEINLINE s64 fx32_mul(const s32 a, const s32 b)
 
 FORCEINLINE s32 fx32_shiftdown(const s64 a)
 {
-	s64 shifted;
 #ifdef _MSC_VER
-	shifted = __ll_rshift(a,12);
+	return (s32)__ll_rshift(a,12);
 #else
-	shifted = (a>>12);
+	return (s32)(a>>12);
 #endif
-	//either matrix math is happening at higher precision (an extra bit would suffice, I think), or the sums sent to this are saturated.
-	//tested by: spectrobes beyond the portals excavation blower
-	//(it sets very large +x,+y in the modelview matrix to push things offscreen, but the +y will overflow and become negative if we're not careful)
-	//I didnt think very hard about what would be fastest here on 32bit systems
-	//NOTE: this was intended for use in MatrixMultVec4x4_M2; it may not be appropriate for other uses of fx32_shiftdown.
-	//if this causes problems we should refactor the math routines a bit to take care of saturating in another function
-	if(shifted>(s32)0x7FFFFFFF) return 0x7FFFFFFF;
-	else if(shifted<=(s32)0x80000000) return 0x80000000;
-	else return shifted;
 }
 
 FORCEINLINE s64 fx32_shiftup(const s32 a)