diff --git a/ARM.cpp b/ARM.cpp index 69aa2eec..f582deb2 100644 --- a/ARM.cpp +++ b/ARM.cpp @@ -376,6 +376,8 @@ s32 ARM::Execute() //if (R[15]==0x037F9364) printf("R8=%08X R9=%08X\n", R[8], R[9]); // gross hack + // TODO, though: move timer code here too? + // quick testing shows that moving this to the NDS loop doesn't really slow things down if (Num==0) { s32 diff = Cycles - lastcycles; diff --git a/DMA.cpp b/DMA.cpp index 12845658..836a5805 100644 --- a/DMA.cpp +++ b/DMA.cpp @@ -20,6 +20,7 @@ #include "NDS.h" #include "DMA.h" #include "NDSCart.h" +#include "GPU3D.h" // NOTES ON DMA SHIT @@ -87,9 +88,11 @@ void DMA::WriteCnt(u32 val) if ((StartMode & 0x7) == 0) Start(); + else if (StartMode == 0x07) + GPU3D::CheckFIFODMA(); //else // printf("SPECIAL ARM%d DMA%d START MODE %02X\n", CPU?7:9, Num, StartMode); - if ((StartMode&7)!=0x00 && (StartMode&7)!=0x1 && StartMode!=2 && StartMode!=0x05 && StartMode!=0x12) + if ((StartMode&7)!=0x00 && (StartMode&7)!=0x1 && StartMode!=2 && StartMode!=0x05 && StartMode!=0x12 && StartMode!=0x07) printf("UNIMPLEMENTED ARM%d DMA%d START MODE %02X\n", CPU?7:9, Num, StartMode); //if (StartMode==2)printf("HBLANK DMA %08X -> %08X\n", SrcAddr, DstAddr); } @@ -123,6 +126,10 @@ void DMA::Start() NDS::TriggerIRQ(CPU, NDS::IRQ_DMA0 + Num); return; } +if (StartMode == 0x07)printf("GXFIFO DMA %08X %08X\n", Cnt, CurSrcAddr); + u32 num = RemCount; + if (StartMode == 0x07 && num > 112) + num = 112; // TODO: NOT MAKE THE DMA INSTANT!! if (!(Cnt & 0x04000000)) @@ -130,12 +137,13 @@ void DMA::Start() u16 (*readfn)(u32) = CPU ? NDS::ARM7Read16 : NDS::ARM9Read16; void (*writefn)(u32,u16) = CPU ? NDS::ARM7Write16 : NDS::ARM9Write16; - while (RemCount > 0) + while (num > 0) { writefn(CurDstAddr, readfn(CurSrcAddr)); CurSrcAddr += SrcAddrInc<<1; CurDstAddr += DstAddrInc<<1; + num--; RemCount--; } } @@ -144,16 +152,24 @@ void DMA::Start() u32 (*readfn)(u32) = CPU ? NDS::ARM7Read32 : NDS::ARM9Read32; void (*writefn)(u32,u32) = CPU ? NDS::ARM7Write32 : NDS::ARM9Write32; - while (RemCount > 0) + while (num > 0) { writefn(CurDstAddr, readfn(CurSrcAddr)); CurSrcAddr += SrcAddrInc<<2; CurDstAddr += DstAddrInc<<2; + num--; RemCount--; } } + if (RemCount) + { + Cnt &= ~countmask; + Cnt |= RemCount; + return; + } + if (!(Cnt & 0x02000000)) Cnt &= ~0x80000000; diff --git a/GPU3D.cpp b/GPU3D.cpp index 5e11f84a..6cf636a7 100644 --- a/GPU3D.cpp +++ b/GPU3D.cpp @@ -119,6 +119,142 @@ u32 ExecParamCount; s32 CycleCount; +u32 MatrixMode; + +s32 ProjMatrix[16]; +s32 PosMatrix[16]; +s32 VecMatrix[16]; +s32 TexMatrix[16]; + +s32 ProjMatrixStack[16]; +s32 PosMatrixStack[31][16]; +s32 ProjMatrixStackPointer; +s32 PosMatrixStackPointer; + + +void MatrixLoadIdentity(s32* m) +{ + m[0] = 0x1000; m[1] = 0; m[2] = 0; m[3] = 0; + m[4] = 0; m[5] = 0x1000; m[6] = 0; m[7] = 0; + m[8] = 0; m[9] = 0; m[10] = 0x1000; m[11] = 0; + m[12] = 0; m[13] = 0; m[14] = 0; m[15] = 0x1000; +} + +void MatrixLoad4x4(s32* m, s32* s) +{ + memcpy(m, s, 16*4); +} + +void MatrixLoad4x3(s32* m, s32* s) +{ + m[0] = s[0]; m[1] = s[1]; m[2] = s[2]; m[3] = 0; + m[4] = s[3]; m[5] = s[4]; m[6] = s[5]; m[7] = 0; + m[8] = s[6]; m[9] = s[7]; m[10] = s[8]; m[11] = 0; + m[12] = s[9]; m[13] = s[10]; m[14] = s[11]; m[15] = 0x1000; +} + +void MatrixMult4x4(s32* m, s32* s) +{ + s32 tmp[16]; + memcpy(tmp, m, 16*4); + + // m = s*m + m[0] = (s[0]*tmp[0] + s[1]*tmp[4] + s[2]*tmp[8] + s[3]*tmp[12]) >> 12; + m[1] = (s[0]*tmp[1] + s[1]*tmp[5] + s[2]*tmp[9] + s[3]*tmp[13]) >> 12; + m[2] = (s[0]*tmp[2] + s[1]*tmp[6] + s[2]*tmp[10] + s[3]*tmp[14]) >> 12; + m[3] = (s[0]*tmp[3] + s[1]*tmp[7] + s[2]*tmp[11] + s[3]*tmp[15]) >> 12; + + m[4] = (s[4]*tmp[0] + s[5]*tmp[4] + s[6]*tmp[8] + s[7]*tmp[12]) >> 12; + m[5] = (s[4]*tmp[1] + s[5]*tmp[5] + s[6]*tmp[9] + s[7]*tmp[13]) >> 12; + m[6] = (s[4]*tmp[2] + s[5]*tmp[6] + s[6]*tmp[10] + s[7]*tmp[14]) >> 12; + m[7] = (s[4]*tmp[3] + s[5]*tmp[7] + s[6]*tmp[11] + s[7]*tmp[15]) >> 12; + + m[8] = (s[8]*tmp[0] + s[9]*tmp[4] + s[10]*tmp[8] + s[11]*tmp[12]) >> 12; + m[9] = (s[8]*tmp[1] + s[9]*tmp[5] + s[10]*tmp[9] + s[11]*tmp[13]) >> 12; + m[10] = (s[8]*tmp[2] + s[9]*tmp[6] + s[10]*tmp[10] + s[11]*tmp[14]) >> 12; + m[11] = (s[8]*tmp[3] + s[9]*tmp[7] + s[10]*tmp[11] + s[11]*tmp[15]) >> 12; + + m[12] = (s[12]*tmp[0] + s[13]*tmp[4] + s[14]*tmp[8] + s[15]*tmp[12]) >> 12; + m[13] = (s[12]*tmp[1] + s[13]*tmp[5] + s[14]*tmp[9] + s[15]*tmp[13]) >> 12; + m[14] = (s[12]*tmp[2] + s[13]*tmp[6] + s[14]*tmp[10] + s[15]*tmp[14]) >> 12; + m[15] = (s[12]*tmp[3] + s[13]*tmp[7] + s[14]*tmp[11] + s[15]*tmp[15]) >> 12; +} + +void MatrixMult4x3(s32* m, s32* s) +{ + s32 tmp[16]; + memcpy(tmp, m, 16*4); + + // m = s*m + m[0] = (s[0]*tmp[0] + s[1]*tmp[4] + s[2]*tmp[8]) >> 12; + m[1] = (s[0]*tmp[1] + s[1]*tmp[5] + s[2]*tmp[9]) >> 12; + m[2] = (s[0]*tmp[2] + s[1]*tmp[6] + s[2]*tmp[10]) >> 12; + m[3] = (s[0]*tmp[3] + s[1]*tmp[7] + s[2]*tmp[11]) >> 12; + + m[4] = (s[3]*tmp[0] + s[4]*tmp[4] + s[5]*tmp[8]) >> 12; + m[5] = (s[3]*tmp[1] + s[4]*tmp[5] + s[5]*tmp[9]) >> 12; + m[6] = (s[3]*tmp[2] + s[4]*tmp[6] + s[5]*tmp[10]) >> 12; + m[7] = (s[3]*tmp[3] + s[4]*tmp[7] + s[5]*tmp[11]) >> 12; + + m[8] = (s[6]*tmp[0] + s[7]*tmp[4] + s[8]*tmp[8]) >> 12; + m[9] = (s[6]*tmp[1] + s[7]*tmp[5] + s[8]*tmp[9]) >> 12; + m[10] = (s[6]*tmp[2] + s[7]*tmp[6] + s[8]*tmp[10]) >> 12; + m[11] = (s[6]*tmp[3] + s[7]*tmp[7] + s[8]*tmp[11]) >> 12; + + m[12] = (s[9]*tmp[0] + s[10]*tmp[4] + s[11]*tmp[8] + 0x1000*tmp[12]) >> 12; + m[13] = (s[9]*tmp[1] + s[10]*tmp[5] + s[11]*tmp[9] + 0x1000*tmp[13]) >> 12; + m[14] = (s[9]*tmp[2] + s[10]*tmp[6] + s[11]*tmp[10] + 0x1000*tmp[14]) >> 12; + m[15] = (s[9]*tmp[3] + s[10]*tmp[7] + s[11]*tmp[11] + 0x1000*tmp[15]) >> 12; +} + +void MatrixMult3x3(s32* m, s32* s) +{ + s32 tmp[12]; + memcpy(tmp, m, 12*4); + + // m = s*m + m[0] = (s[0]*tmp[0] + s[1]*tmp[4] + s[2]*tmp[8]) >> 12; + m[1] = (s[0]*tmp[1] + s[1]*tmp[5] + s[2]*tmp[9]) >> 12; + m[2] = (s[0]*tmp[2] + s[1]*tmp[6] + s[2]*tmp[10]) >> 12; + m[3] = (s[0]*tmp[3] + s[1]*tmp[7] + s[2]*tmp[11]) >> 12; + + m[4] = (s[3]*tmp[0] + s[4]*tmp[4] + s[5]*tmp[8]) >> 12; + m[5] = (s[3]*tmp[1] + s[4]*tmp[5] + s[5]*tmp[9]) >> 12; + m[6] = (s[3]*tmp[2] + s[4]*tmp[6] + s[5]*tmp[10]) >> 12; + m[7] = (s[3]*tmp[3] + s[4]*tmp[7] + s[5]*tmp[11]) >> 12; + + m[8] = (s[6]*tmp[0] + s[7]*tmp[4] + s[8]*tmp[8]) >> 12; + m[9] = (s[6]*tmp[1] + s[7]*tmp[5] + s[8]*tmp[9]) >> 12; + m[10] = (s[6]*tmp[2] + s[7]*tmp[6] + s[8]*tmp[10]) >> 12; + m[11] = (s[6]*tmp[3] + s[7]*tmp[7] + s[8]*tmp[11]) >> 12; +} + +void MatrixScale(s32* m, s32* s) +{ + m[0] = (s[0]*m[0]) >> 12; + m[1] = (s[0]*m[1]) >> 12; + m[2] = (s[0]*m[2]) >> 12; + m[3] = (s[0]*m[3]) >> 12; + + m[4] = (s[1]*m[4]) >> 12; + m[5] = (s[1]*m[5]) >> 12; + m[6] = (s[1]*m[6]) >> 12; + m[7] = (s[1]*m[7]) >> 12; + + m[8] = (s[2]*m[8]) >> 12; + m[9] = (s[2]*m[9]) >> 12; + m[10] = (s[2]*m[10]) >> 12; + m[11] = (s[2]*m[11]) >> 12; +} + +void MatrixTranslate(s32* m, s32* s) +{ + m[12] += (s[0]*m[0] + s[1]*m[4] + s[2]*m[8]) >> 12; + m[13] += (s[0]*m[1] + s[1]*m[5] + s[2]*m[9]) >> 12; + m[14] += (s[0]*m[2] + s[1]*m[6] + s[2]*m[10]) >> 12; +} + + bool Init() { CmdFIFO = new FIFO(256); @@ -148,6 +284,19 @@ void Reset() memset(ExecParams, 0, 32*4); ExecParamCount = 0; CycleCount = 0; + + + MatrixMode = 0; + + MatrixLoadIdentity(ProjMatrix); + MatrixLoadIdentity(PosMatrix); + MatrixLoadIdentity(VecMatrix); + MatrixLoadIdentity(TexMatrix); + + memset(ProjMatrixStack, 0, 16*4); + memset(PosMatrixStack, 0, 31 * 16*4); + ProjMatrixStackPointer = 0; + PosMatrixStackPointer = 0; } @@ -179,9 +328,11 @@ CmdFIFOEntry CmdFIFORead() CmdPIPE->Write(CmdFIFO->Read()); if (!CmdFIFO->IsEmpty()) CmdPIPE->Write(CmdFIFO->Read()); + + CheckFIFODMA(); + CheckFIFOIRQ(); } - CheckFIFOIRQ(); return ret; } @@ -200,15 +351,242 @@ void ExecuteCommand() CycleCount += CmdNumCycles[entry.Command]; ExecParamCount = 0; - // TODO: actually execute the command, maybe + GXStat &= ~(1<<14); + //printf("3D CMD %02X\n", entry.Command); switch (entry.Command) { - case 0x18: - case 0x19: - case 0x1A: - // TODO: more cycles if MTX_MODE=2 + case 0x10: // matrix mode + MatrixMode = ExecParams[0] & 0x3; + break; + + case 0x11: // push matrix + if (MatrixMode == 0) + { + if (ProjMatrixStackPointer > 0) + { + printf("!! PROJ MATRIX STACK OVERFLOW\n"); + GXStat |= (1<<15); + break; + } + + memcpy(ProjMatrixStack, ProjMatrix, 16*4); + ProjMatrixStackPointer++; + GXStat |= (1<<14); + } + else if (MatrixMode == 3) + { + printf("!! CAN'T PUSH TEXTURE MATRIX\n"); + GXStat |= (1<<15); // CHECKME + } + else + { + if (PosMatrixStackPointer > 30) + { + printf("!! POS MATRIX STACK OVERFLOW\n"); + GXStat |= (1<<15); + break; + } + + memcpy(PosMatrixStack[PosMatrixStackPointer], PosMatrix, 16*4); + PosMatrixStackPointer++; + GXStat |= (1<<14); + } + break; + + case 0x12: // pop matrix + if (MatrixMode == 0) + { + if (ProjMatrixStackPointer <= 0) + { + printf("!! PROJ MATRIX STACK UNDERFLOW\n"); + GXStat |= (1<<15); + break; + } + + ProjMatrixStackPointer--; + memcpy(ProjMatrix, ProjMatrixStack, 16*4); + GXStat |= (1<<14); + } + else if (MatrixMode == 3) + { + printf("!! CAN'T POP TEXTURE MATRIX\n"); + GXStat |= (1<<15); // CHECKME + } + else + { + s32 offset = (s32)(ExecParams[0] << 26) >> 26; + PosMatrixStackPointer -= offset; + + if (PosMatrixStackPointer < 0 || PosMatrixStackPointer > 30) + { + printf("!! POS MATRIX STACK UNDER/OVERFLOW %d\n", PosMatrixStackPointer); + PosMatrixStackPointer += offset; + GXStat |= (1<<15); + break; + } + + memcpy(PosMatrix, PosMatrixStack[PosMatrixStackPointer], 16*4); + GXStat |= (1<<14); + } + break; + + case 0x13: // store matrix + if (MatrixMode == 0) + { + memcpy(ProjMatrixStack, ProjMatrix, 16*4); + } + else if (MatrixMode == 3) + { + printf("!! CAN'T STORE TEXTURE MATRIX\n"); + GXStat |= (1<<15); // CHECKME + } + else + { + u32 addr = ExecParams[0] & 0x1F; + if (addr > 30) + { + printf("!! POS MATRIX STORE ADDR 31\n"); + GXStat |= (1<<15); + break; + } + + memcpy(PosMatrixStack[addr], PosMatrix, 16*4); + } + break; + + case 0x14: // restore matrix + if (MatrixMode == 0) + { + memcpy(ProjMatrix, ProjMatrixStack, 16*4); + } + else if (MatrixMode == 3) + { + printf("!! CAN'T RESTORE TEXTURE MATRIX\n"); + GXStat |= (1<<15); // CHECKME + } + else + { + u32 addr = ExecParams[0] & 0x1F; + if (addr > 30) + { + printf("!! POS MATRIX STORE ADDR 31\n"); + GXStat |= (1<<15); + break; + } + + memcpy(PosMatrix, PosMatrixStack[addr], 16*4); + } + break; + + case 0x15: // identity + if (MatrixMode == 0) + MatrixLoadIdentity(ProjMatrix); + else if (MatrixMode == 3) + MatrixLoadIdentity(TexMatrix); + else + { + MatrixLoadIdentity(PosMatrix); + if (MatrixMode == 2) + MatrixLoadIdentity(VecMatrix); + } + break; + + case 0x16: // load 4x4 + if (MatrixMode == 0) + MatrixLoad4x4(ProjMatrix, (s32*)ExecParams); + else if (MatrixMode == 3) + MatrixLoad4x4(TexMatrix, (s32*)ExecParams); + else + { + MatrixLoad4x4(PosMatrix, (s32*)ExecParams); + if (MatrixMode == 2) + MatrixLoad4x4(VecMatrix, (s32*)ExecParams); + } + break; + + case 0x17: // load 4x3 + if (MatrixMode == 0) + MatrixLoad4x3(ProjMatrix, (s32*)ExecParams); + else if (MatrixMode == 3) + MatrixLoad4x3(TexMatrix, (s32*)ExecParams); + else + { + MatrixLoad4x3(PosMatrix, (s32*)ExecParams); + if (MatrixMode == 2) + MatrixLoad4x3(VecMatrix, (s32*)ExecParams); + } + break; + + case 0x18: // mult 4x4 + if (MatrixMode == 0) + MatrixMult4x4(ProjMatrix, (s32*)ExecParams); + else if (MatrixMode == 3) + MatrixMult4x4(TexMatrix, (s32*)ExecParams); + else + { + MatrixMult4x4(PosMatrix, (s32*)ExecParams); + if (MatrixMode == 2) + { + MatrixMult4x4(VecMatrix, (s32*)ExecParams); + CycleCount += 30; + } + } + break; + + case 0x19: // mult 4x3 + if (MatrixMode == 0) + MatrixMult4x3(ProjMatrix, (s32*)ExecParams); + else if (MatrixMode == 3) + MatrixMult4x3(TexMatrix, (s32*)ExecParams); + else + { + MatrixMult4x3(PosMatrix, (s32*)ExecParams); + if (MatrixMode == 2) + { + MatrixMult4x3(VecMatrix, (s32*)ExecParams); + CycleCount += 30; + } + } + break; + + case 0x1A: // mult 3x3 + if (MatrixMode == 0) + MatrixMult3x3(ProjMatrix, (s32*)ExecParams); + else if (MatrixMode == 3) + MatrixMult3x3(TexMatrix, (s32*)ExecParams); + else + { + MatrixMult3x3(PosMatrix, (s32*)ExecParams); + if (MatrixMode == 2) + { + MatrixMult3x3(VecMatrix, (s32*)ExecParams); + CycleCount += 30; + } + } + break; + + case 0x1B: // scale + if (MatrixMode == 0) + MatrixScale(ProjMatrix, (s32*)ExecParams); + else if (MatrixMode == 3) + MatrixScale(TexMatrix, (s32*)ExecParams); + else + MatrixScale(PosMatrix, (s32*)ExecParams); + break; + + case 0x1C: // translate + if (MatrixMode == 0) + MatrixTranslate(ProjMatrix, (s32*)ExecParams); + else if (MatrixMode == 3) + MatrixTranslate(TexMatrix, (s32*)ExecParams); + else + { + MatrixTranslate(PosMatrix, (s32*)ExecParams); + if (MatrixMode == 2) + MatrixTranslate(VecMatrix, (s32*)ExecParams); + } break; case 0x21: @@ -249,6 +627,12 @@ void CheckFIFOIRQ() if (irq) NDS::TriggerIRQ(0, NDS::IRQ_GXFIFO); } +void CheckFIFODMA() +{ + if (CmdFIFO->Level() < 128) + NDS::CheckDMAs(0, 0x07); +} + u8 Read8(u32 addr) { @@ -272,13 +656,26 @@ u32 Read32(u32 addr) u32 fifolevel = CmdFIFO->Level(); return GXStat | - // matrix stack levels, TODO + ((PosMatrixStackPointer & 0x1F) << 8) | + ((ProjMatrixStackPointer & 0x1) << 13) | (fifolevel << 16) | (fifolevel < 128 ? (1<<25) : 0) | (fifolevel == 0 ? (1<<26) : 0) | (CycleCount > 0 ? (1<<27) : 0); } } + + if (addr >= 0x04000640 && addr < 0x04000680) + { + printf("!! CLIPMTX READ\n"); + return 0; + } + if (addr >= 0x04000680 && addr < 0x040006A4) + { + printf("!! VECMTX READ\n"); + return 0; + } + return 0; } @@ -330,6 +727,8 @@ void Write32(u32 addr, u32 val) ParamCount = 0; TotalParams = CmdNumParams[CurCommand & 0xFF]; } + + return; } if (addr >= 0x04000440 && addr < 0x040005CC) diff --git a/GPU3D.h b/GPU3D.h index 322dd311..d606e25d 100644 --- a/GPU3D.h +++ b/GPU3D.h @@ -28,6 +28,7 @@ void Reset(); void Run(s32 cycles); void CheckFIFOIRQ(); +void CheckFIFODMA(); u8 Read8(u32 addr); u16 Read16(u32 addr); diff --git a/NDS.cpp b/NDS.cpp index 2ce57bdc..68ca25c2 100644 --- a/NDS.cpp +++ b/NDS.cpp @@ -307,7 +307,7 @@ void Reset() // test //LoadROM(); //LoadFirmware(); - if (NDSCart::LoadROM("rom/nsmb.nds")) + if (NDSCart::LoadROM("rom/Simple_Tri.nds")) Running = true; // hax } @@ -377,6 +377,7 @@ void RunFrame() ARM7Offset = ARM7->Cycles - ARM7->CyclesToRun; RunSystem(ndscyclestorun); + //GPU3D::Run(ndscyclestorun); /*while (ndscycles < ndscyclestorun) { diff --git a/melonDS.depend b/melonDS.depend index 8a27fb08..f49c92a2 100644 --- a/melonDS.depend +++ b/melonDS.depend @@ -10,7 +10,7 @@ 1481161027 c:\documents\sources\melonds\types.h -1486511394 source:c:\documents\sources\melonds\nds.cpp +1486515172 source:c:\documents\sources\melonds\nds.cpp "NDS.h" @@ -24,7 +24,7 @@ "RTC.h" "Wifi.h" -1486511108 source:c:\documents\sources\melonds\arm.cpp +1486512922 source:c:\documents\sources\melonds\arm.cpp "NDS.h" "ARM.h" @@ -109,11 +109,12 @@ 1486511075 c:\documents\sources\melonds\fifo.h "types.h" -1486309616 source:c:\documents\sources\melonds\dma.cpp +1486514961 source:c:\documents\sources\melonds\dma.cpp "NDS.h" "DMA.h" "NDSCart.h" + "GPU3D.h" 1484698068 c:\documents\sources\melonds\dma.h "types.h" @@ -145,9 +146,9 @@ "NDS.h" "NDSCart.h" -1486510737 c:\documents\sources\melonds\gpu3d.h +1486514429 c:\documents\sources\melonds\gpu3d.h -1486511257 source:c:\documents\sources\melonds\gpu3d.cpp +1486585700 source:c:\documents\sources\melonds\gpu3d.cpp "NDS.h"