From 3d2b116323eec683fec7b7644735a1ee065e6de6 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 9 Jun 2015 17:42:15 -0500 Subject: [PATCH 1/2] [AArch64] Implement a couple instructions in the emitter. Implements LD2R. Implements LD1R/LD2R with post-indexing support. Implements vector min/max instructions. --- Source/Core/Common/Arm64Emitter.cpp | 20 ++++++++++++++++++++ Source/Core/Common/Arm64Emitter.h | 5 +++++ 2 files changed, 25 insertions(+) diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index 0860d53cf0..3ba31539be 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -2673,6 +2673,18 @@ void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn) { EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn); } +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn) +{ + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn); +} +void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn, Rm); +} +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn, Rm); +} void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) { @@ -3026,10 +3038,18 @@ void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm); } +void ARM64FloatEmitter::FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0b11110, Rd, Rn, Rm); +} void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EmitThreeSame(0, size >> 6, 0x19, Rd, Rn, Rm); } +void ARM64FloatEmitter::FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | size >> 6, 0b11110, Rd, Rn, Rm); +} void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn) { Emit2RegMisc(false, 0, size >> 6, 0x17, Rd, Rn); 
diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index a5eb986e2f..5796b877f6 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -750,6 +750,9 @@ public: void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn); void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm); void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn); + void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn); + void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm); + void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm); void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn); void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm); @@ -799,8 +802,10 @@ public: void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); From 113c2dcd746d783c419b1767862376eb3d8e3ccf Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 9 Jun 2015 17:43:25 -0500 Subject: [PATCH 2/2] [AArch64] Clamp quantized store ranges. Fixes black dots in THP videos. Nintendo's THP video uses paired U8 stores to write their THP videos after decoding with floating point operations. Paired stores clamp the range to the minimum and maximum values (0 - 255 in this case). In some instances the resulting float will be larger than what a U8 can fit (Typically white) and results in black dots due to how AArch64 handles quantizing.
--- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 69 ++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 16534edf5f..2e39791e47 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -99,6 +99,14 @@ void JitArm64AsmRoutineManager::Generate() FlushIcache(); } +static float s_quantize_ranges[] = +{ + 0.0f, 255.0f, // U8 + -128.0f, 127.0f, // S8 + 0.0f, 65535.0f, // U16 + -32768.0f, 32767.0f, // S16 +}; + void JitArm64AsmRoutineManager::GenerateCommon() { // X0 is the scale @@ -291,6 +299,13 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1, 0); + + // Have to clamp the result + MOVI2R(X2, (u64)&s_quantize_ranges[0]); + float_emit.LD2R(32, D1, X2); + float_emit.FMIN(32, D0, D0, D2); + float_emit.FMAX(32, D0, D0, D1); + float_emit.FCVTZU(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.XTN(8, D0, D0); @@ -318,6 +333,13 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1, 0); + + // Have to clamp the result + MOVI2R(X2, (u64)&s_quantize_ranges[2]); + float_emit.LD2R(32, D1, X2); + float_emit.FMIN(32, D0, D0, D2); + float_emit.FMAX(32, D0, D0, D1); + float_emit.FCVTZS(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.XTN(8, D0, D0); @@ -346,6 +368,13 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1, 0); + + // Have to clamp the result + MOVI2R(X2, (u64)&s_quantize_ranges[4]); + float_emit.LD2R(32, D1, X2); + float_emit.FMIN(32, D0, D0, D2); + 
float_emit.FMAX(32, D0, D0, D1); + float_emit.FCVTZU(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.REV16(8, D0, D0); @@ -373,6 +402,14 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1, 0); + + // Have to clamp the result + MOVI2R(X2, (u64)&s_quantize_ranges[6]); + float_emit.LD2R(32, D1, X2); + float_emit.FMIN(32, D0, D0, D2); + float_emit.FMAX(32, D0, D0, D1); + + float_emit.FCVTZS(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.REV16(8, D0, D0); @@ -415,6 +452,14 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1); + + // Have to clamp the result + MOVI2R(X2, (u64)&s_quantize_ranges[0]); + float_emit.LDR(32, INDEX_UNSIGNED, S1, X2, 0); + float_emit.LDR(32, INDEX_UNSIGNED, S2, X2, 4); + float_emit.FMIN(S0, S0, S2); + float_emit.FMAX(S0, S0, S1); + float_emit.FCVTZU(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.XTN(8, D0, D0); @@ -441,6 +486,14 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1); + + // Have to clamp the result + MOVI2R(X2, (u64)&s_quantize_ranges[2]); + float_emit.LDR(32, INDEX_UNSIGNED, S1, X2, 0); + float_emit.LDR(32, INDEX_UNSIGNED, S2, X2, 4); + float_emit.FMIN(S0, S0, S2); + float_emit.FMAX(S0, S0, S1); + float_emit.FCVTZS(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.XTN(8, D0, D0); @@ -467,6 +520,14 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1); + + // Have to clamp the result + MOVI2R(X2, 
(u64)&s_quantize_ranges[4]); + float_emit.LDR(32, INDEX_UNSIGNED, S1, X2, 0); + float_emit.LDR(32, INDEX_UNSIGNED, S2, X2, 4); + float_emit.FMIN(S0, S0, S2); + float_emit.FMAX(S0, S0, S1); + float_emit.FCVTZU(32, D0, D0); float_emit.XTN(16, D0, D0); }; @@ -493,6 +554,14 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1); + + // Have to clamp the result + MOVI2R(X2, (u64)&s_quantize_ranges[6]); + float_emit.LDR(32, INDEX_UNSIGNED, S1, X2, 0); + float_emit.LDR(32, INDEX_UNSIGNED, S2, X2, 4); + float_emit.FMIN(S0, S0, S2); + float_emit.FMAX(S0, S0, S1); + float_emit.FCVTZS(32, D0, D0); float_emit.XTN(16, D0, D0); };