diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index 0860d53cf0..3ba31539be 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -2673,6 +2673,18 @@ void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn) { EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn); } +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn) +{ + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn); +} +void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn, Rm); +} +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn, Rm); +} void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) { @@ -3026,10 +3038,18 @@ void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm); } +void ARM64FloatEmitter::FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0b11110, Rd, Rn, Rm); +} void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EmitThreeSame(0, size >> 6, 0x19, Rd, Rn, Rm); } +void ARM64FloatEmitter::FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | size >> 6, 0b11110, Rd, Rn, Rm); +} void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn) { Emit2RegMisc(false, 0, size >> 6, 0x17, Rd, Rn); diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index a5eb986e2f..5796b877f6 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -750,6 +750,9 @@ public: void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn); void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm); void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn); + void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn); + void 
LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm); + void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm); void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn); void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm); @@ -799,8 +802,10 @@ public: void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 16534edf5f..2e39791e47 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -99,6 +99,14 @@ void JitArm64AsmRoutineManager::Generate() FlushIcache(); } +static float s_quantize_ranges[][2] = +{ + 0.0f, 255.0f, // U8 + -128.0f, 127.0f, // S8 + 0.0f, 65535.0f, // U16 + -32768.0f, 32767.0f, // S16 +}; + void JitArm64AsmRoutineManager::GenerateCommon() { // X0 is the scale @@ -291,6 +299,13 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1, 0); + + // Have to clamp the result + MOVI2R(X2, (u64)&s_quantize_ranges[0]); + float_emit.LD2R(32, D1, X2); + float_emit.FMIN(32, D0, D0, D2); + float_emit.FMAX(32, D0, D0, D1); + float_emit.FCVTZU(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.XTN(8, D0, D0); @@ -318,6 +333,13 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, 
INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1, 0); + + // Have to clamp the result + MOVI2R(X2, (u64)&s_quantize_ranges[1]); + float_emit.LD2R(32, D1, X2); + float_emit.FMIN(32, D0, D0, D2); + float_emit.FMAX(32, D0, D0, D1); + float_emit.FCVTZS(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.XTN(8, D0, D0); @@ -346,6 +368,13 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1, 0); + + // Have to clamp the result + MOVI2R(X2, (u64)&s_quantize_ranges[2]); + float_emit.LD2R(32, D1, X2); + float_emit.FMIN(32, D0, D0, D2); + float_emit.FMAX(32, D0, D0, D1); + float_emit.FCVTZU(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.REV16(8, D0, D0); @@ -373,6 +402,14 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1, 0); + + // Have to clamp the result + MOVI2R(X2, (u64)&s_quantize_ranges[3]); + float_emit.LD2R(32, D1, X2); + float_emit.FMIN(32, D0, D0, D2); + float_emit.FMAX(32, D0, D0, D1); + + float_emit.FCVTZS(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.REV16(8, D0, D0); @@ -415,6 +452,14 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1); + + // Have to clamp the result + MOVI2R(X2, (u64)&s_quantize_ranges[0]); + float_emit.LDR(32, INDEX_UNSIGNED, S1, X2, 0); + float_emit.LDR(32, INDEX_UNSIGNED, S2, X2, 4); + float_emit.FMIN(S0, S0, S2); + float_emit.FMAX(S0, S0, S1); + float_emit.FCVTZU(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.XTN(8, D0, D0); @@ -441,6 +486,14 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); 
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1); + + // Have to clamp the result + MOVI2R(X2, (u64)&s_quantize_ranges[1]); + float_emit.LDR(32, INDEX_UNSIGNED, S1, X2, 0); + float_emit.LDR(32, INDEX_UNSIGNED, S2, X2, 4); + float_emit.FMIN(S0, S0, S2); + float_emit.FMAX(S0, S0, S1); + float_emit.FCVTZS(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.XTN(8, D0, D0); @@ -467,6 +520,14 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1); + + // Have to clamp the result + MOVI2R(X2, (u64)&s_quantize_ranges[2]); + float_emit.LDR(32, INDEX_UNSIGNED, S1, X2, 0); + float_emit.LDR(32, INDEX_UNSIGNED, S2, X2, 4); + float_emit.FMIN(S0, S0, S2); + float_emit.FMAX(S0, S0, S1); + float_emit.FCVTZU(32, D0, D0); float_emit.XTN(16, D0, D0); }; @@ -493,6 +554,14 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1); + + // Have to clamp the result + MOVI2R(X2, (u64)&s_quantize_ranges[3]); + float_emit.LDR(32, INDEX_UNSIGNED, S1, X2, 0); + float_emit.LDR(32, INDEX_UNSIGNED, S2, X2, 4); + float_emit.FMIN(S0, S0, S2); + float_emit.FMAX(S0, S0, S1); + float_emit.FCVTZS(32, D0, D0); float_emit.XTN(16, D0, D0); };