unit emit_VOP2; {$mode objfpc}{$H+} interface uses sysutils, ps4_pssl, spirv, srType, srReg, srConst, emit_fetch; type TEmit_VOP2=class(TEmitFetch) procedure emit_VOP2; function get_legacy_cmp(src0,src1,zero:TsrRegNode):TsrRegNode; procedure emit_V_CNDMASK_B32; procedure emit_V_AND_B32; procedure emit_V_OR_B32; procedure emit_V_XOR_B32; procedure emit_V_SH(OpId:DWORD;rtype:TsrDataType;rev:Boolean); procedure emit_V_ADD_I32; procedure emit_V_SUB_I32(rev:Boolean); procedure emit_V_ADDC_U32; procedure emit_V_SUBB_U32(rev:Boolean); procedure emit_V2_F32(OpId:DWORD;rev:Boolean); procedure emit_V_MUL_LEGACY_F32; procedure emit_V_CVT_PKRTZ_F16_F32; procedure emit_V_MUL_I32_I24; procedure emit_V_MUL_U32_U24; procedure emit_V_MAC_F32; procedure emit_V_MAC_LEGACY_F32; procedure emit_V_MADAK_F32; procedure emit_V_MADMK_F32; procedure emit_V_BCNT_U32_B32; procedure emit_V_MMX(OpId:DWORD;rtype:TsrDataType); procedure emit_V_LDEXP_F32; procedure emit_V_MBCNT_LO_U32_B32; procedure emit_V_MBCNT_HI_U32_B32; procedure emit_V_WRITELANE_B32; procedure emit_V_READLANE_B32; end; implementation function TEmit_VOP2.get_legacy_cmp(src0,src1,zero:TsrRegNode):TsrRegNode; var eql:array[0..1] of TsrRegNode; begin if CompareReg(src0,src1) then begin Result:=NewReg(dtBool); _Op2(line,Op.OpFOrdEqual,Result,src0,zero); end else begin eql[0]:=NewReg(dtBool); eql[1]:=NewReg(dtBool); _Op2(line,Op.OpFOrdEqual,eql[0],src0,zero); _Op2(line,Op.OpFOrdEqual,eql[1],src1,zero); Result:=NewReg(dtBool); _Op2(line,Op.OpLogicalOr,Result,eql[0],eql[1]); end; end; procedure TEmit_VOP2.emit_V_CNDMASK_B32; //vdst = smask[thread_id:] ? vsrc1 : vsrc0 Var dst:PsrRegSlot; src:array[0..2] of TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); src[0]:=fetch_ssrc9 (FSPI.VOP2.SRC0 ,dtUnknow); src[1]:=fetch_vsrc8 (FSPI.VOP2.VSRC1,dtUnknow); src[2]:=GetThreadBit(get_vcc0,get_vcc1,dtBool); //dst,cond,src_true,src_false OpSelect(dst,src[2],src[1],src[0]); end; procedure TEmit_VOP2.emit_V_AND_B32; Var dst:PsrRegSlot; src:array[0..1] of TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtUnknow); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtUnknow); OpBitwiseAnd(dst,src[0],src[1]); end; procedure TEmit_VOP2.emit_V_OR_B32; Var dst:PsrRegSlot; src:array[0..1] of TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtUnknow); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtUnknow); OpBitwiseOr(dst,src[0],src[1]); end; procedure TEmit_VOP2.emit_V_XOR_B32; Var dst:PsrRegSlot; src:array[0..1] of TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtUInt32); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtUInt32); OpBitwiseXor(dst,src[0],src[1]); end; procedure TEmit_VOP2.emit_V_SH(OpId:DWORD;rtype:TsrDataType;rev:Boolean); Var dst:PsrRegSlot; src:array[0..1] of TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); case rev of False: begin src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,rtype); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtUInt32); end; True: begin src[1]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtUInt32); src[0]:=fetch_vsrc8(FSPI.VOP2.VSRC1,rtype); end; end; src[1]:=OpAndTo(src[1],31); src[1].PrepType(ord(dtUInt32)); Op2(OpId,rtype,dst,src[0],src[1]); end; procedure TEmit_VOP2.emit_V_ADD_I32; //vdst = vsrc0.s + vsrc1.s; sdst[thread_id:] = carry_out & EXEC Var dst,car:PsrRegSlot; src:array[0..1] of TsrRegNode; //exc:TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); car:=get_vcc0; src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtUInt32); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtUint32); OpIAddExt(dst,car,src[0],src[1],dtUint32); { TODO: if (EXEC[i]) { V_ADD_I32 VCC[i] = car; } else { VCC[i] = 0; } } //exc:=MakeRead(get_exec0,dtUnknow); //OpBitwiseAnd(car,car^.current,exc); //carry_out & EXEC end; procedure TEmit_VOP2.emit_V_SUB_I32(rev:Boolean); //vdst = vsrc0.u - vsub.u; sdst[thread_id:] = borrow_out & EXEC Var dst,bor:PsrRegSlot; src:array[0..1] of TsrRegNode; //exc:TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); bor:=get_vcc0; case rev of False: begin src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtUint32); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtUint32); end; True: begin src[1]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtUint32); src[0]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtUint32); end; end; OpISubExt(dst,bor,src[0],src[1],dtUint32); { TODO: if (EXEC[i]) { V_SUB_I32 VCC[i] = bor; } else { VCC[i] = 0; } } //exc:=MakeRead(get_exec0,dtUnknow); //OpBitwiseAnd(bor,bor^.current,exc); //borrow_out & EXEC end; procedure TEmit_VOP2.emit_V_ADDC_U32; Var dst,car:PsrRegSlot; src:array[0..2] of TsrRegNode; //exc:TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); car:=get_vcc0; src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtUint32); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtUint32); src[2]:=GetThreadBit(get_vcc0,get_vcc1,dtUInt32); src[2]:=OpAndTo(src[2],1); src[2].PrepType(ord(dtUInt32)); OpIAddExt(dst,car,src[0],src[1],dtUint32); //src0+src1 src[0]:=MakeRead(dst,dtUInt32); src[1]:=MakeRead(car,dtUInt32); //save car1 OpIAddExt(dst,car,src[0],src[2],dtUint32); //(src0+src1)+src2 src[0]:=MakeRead(car,dtUInt32); OpBitwiseOr(car,src[1],src[0]); //car1 or car2 src[0]:=MakeRead(car,dtUInt32); { TODO: if (EXEC[i]) { V_ADDC_U32 VCC[i] = car; } else { VCC[i] = 0; } } //exc:=MakeRead(get_exec0,dtUnknow); //OpBitwiseAnd(car,src[0],exc); //carry_out & EXEC end; //v_subbrev_u32 //vdst = vsrc1.u - vsub.u - sborrow[thread_id:]; sdst[thread_id:] = borrow_out & EXEC procedure TEmit_VOP2.emit_V_SUBB_U32(rev:Boolean); //vdst = vsrc0.u - vsub.u - sborrow[thread_id:]; sdst[thread_id:] = borrow_out & EXEC Var dst,bor:PsrRegSlot; src:array[0..2] of TsrRegNode; //exc:TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); bor:=get_vcc0; case rev of False: begin src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtUint32); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtUint32); end; True: begin src[1]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtUint32); src[0]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtUint32); end; end; src[2]:=GetThreadBit(get_vcc0,get_vcc1,dtUInt32); src[2]:=OpAndTo(src[2],1); src[2].PrepType(ord(dtUInt32)); OpISubExt(dst,bor,src[0],src[1],dtUInt32); //src0-src1 src[0]:=MakeRead(dst,dtUInt32); src[1]:=MakeRead(bor,dtUInt32); //save car1 OpISubExt(dst,bor,src[0],src[2],dtUInt32); //(src0-src1)-src2 src[0]:=MakeRead(bor,dtUInt32); //Or??? And??? OpBitwiseOr(bor,src[1],src[0]); //car1 or car2 { TODO: if (EXEC[i]) { V_SUBB_U32 SDST[i] = bor; } else { SDST[i] = 0; } } //src[0]:=MakeRead(bor,dtUInt32); //exc:=MakeRead(get_exec0,dtUnknow); //OpBitwiseAnd(bor,src[0],exc); //borrow_out & EXEC end; procedure TEmit_VOP2.emit_V2_F32(OpId:DWORD;rev:Boolean); Var dst:PsrRegSlot; src:array[0..1] of TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); case rev of False: begin src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtFloat32); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtFloat32); end; True: begin src[1]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtFloat32); src[0]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtFloat32); end; end; Op2(OpId,dtFloat32,dst,src[0],src[1]); end; procedure TEmit_VOP2.emit_V_MUL_LEGACY_F32; Var dst:PsrRegSlot; src:array[0..1] of TsrRegNode; zero:TsrRegNode; cmp:TsrRegNode; mul:TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtFloat32); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtFloat32); zero:=NewImm_s(dtFloat32,0); cmp:=get_legacy_cmp(src[0],src[1],zero); // mul:=NewReg(dtFloat32); _Op2(line,Op.OpFMul,mul,src[0],src[1]); //dst,cond,src_true,src_false OpSelect(dst,cmp,zero,mul); end; procedure TEmit_VOP2.emit_V_CVT_PKRTZ_F16_F32; Var dst:PsrRegSlot; src:array[0..1] of TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtFloat32); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtFloat32); OpConvFloatToHalf2(dst,src[0],src[1]); end; procedure TEmit_VOP2.emit_V_MUL_I32_I24; //vdst = (vsrc0[23:0].s * vsrc1[23:0].s) Var dst:PsrRegSlot; src:array[0..1] of TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtInt32); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtInt32); src[0]:=OpBFSETo(src[0],NewImm_i(dtInt32,0),NewImm_i(dtInt32,24)); src[1]:=OpBFSETo(src[1],NewImm_i(dtInt32,0),NewImm_i(dtInt32,24)); OpIMul(dst,src[0],src[1]); end; procedure TEmit_VOP2.emit_V_MUL_U32_U24; //vdst = (vsrc0[23:0].u * vsrc1[23:0].u) Var dst:PsrRegSlot; src:array[0..1] of TsrRegNode; bit24:TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtUInt32); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtUInt32); bit24:=NewImm_q(dtUInt32,$FFFFFF); src[0]:=OpAndTo(src[0],bit24); src[0].PrepType(ord(dtUInt32)); src[1]:=OpAndTo(src[1],bit24); src[1].PrepType(ord(dtUInt32)); OpIMul(dst,src[0],src[1]); end; procedure TEmit_VOP2.emit_V_MAC_F32; //vdst = vsrc0.f * vsrc1.f + vdst.f -> fma Var dst:PsrRegSlot; src:array[0..2] of TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtFloat32); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtFloat32); src[2]:=MakeRead(dst,dtFloat32); OpFmaF32(dst,src[0],src[1],src[2]); end; procedure TEmit_VOP2.emit_V_MAC_LEGACY_F32; Var dst:PsrRegSlot; src:array[0..2] of TsrRegNode; zero:TsrRegNode; cmp:TsrRegNode; mul:TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtFloat32); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtFloat32); src[2]:=MakeRead(dst,dtFloat32); zero:=NewImm_s(dtFloat32,0); cmp:=get_legacy_cmp(src[0],src[1],zero); // OpFmaF32(dst,src[0],src[1],src[2]); // mul:=MakeRead(dst,dtFloat32); //dst,cond,src_true,src_false OpSelect(dst,cmp,zero,mul); end; procedure TEmit_VOP2.emit_V_MADAK_F32; //vdst = vsrc0.f * vsrc1.f + kadd.f Var dst:PsrRegSlot; src:array[0..2] of TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtFloat32); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtFloat32); src[2]:=NewImm_q(dtFloat32,FSPI.INLINE32); OpFmaF32(dst,src[0],src[1],src[2]); end; procedure TEmit_VOP2.emit_V_MADMK_F32; //vdst = vsrc0.f * kmul.f + vadd.f Var dst:PsrRegSlot; src:array[0..2] of TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtFloat32); src[1]:=NewImm_q(dtFloat32,FSPI.INLINE32); src[2]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtFloat32); OpFmaF32(dst,src[0],src[1],src[2]); end; procedure TEmit_VOP2.emit_V_BCNT_U32_B32; //vdst = bit_count(vsrc0) + vsrc1.u Var dst:PsrRegSlot; src:array[0..1] of TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtUint32); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtUint32); src[0]:=OpBitCountTo(src[0]); Op2(Op.OpIAdd,dtUint32,dst,src[0],src[1]); end; procedure TEmit_VOP2.emit_V_MMX(OpId:DWORD;rtype:TsrDataType); Var dst:PsrRegSlot; src:array[0..1] of TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,rtype); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,rtype); OpGlsl2(OpId,rtype,dst,src[0],src[1]); end; procedure TEmit_VOP2.emit_V_LDEXP_F32; //vdst.f = vsrc0.f * pow(2.0, vsrc1.s) Var dst:PsrRegSlot; src:array[0..2] of TsrRegNode; two:TsrRegNode; begin dst:=get_vdst8(FSPI.VOP2.VDST); src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtFloat32); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtInt32); two:=NewImm_s(dtFloat32,2); src[1]:=OpSToF(src[1],dtFloat32); src[1]:=OpPowTo(two,src[1]); Op2(Op.OpFMul,dtFloat32,dst,src[0],src[1]); end; //V_MBCNT_LO_U32_B32 v1, -1, v1 procedure TEmit_VOP2.emit_V_MBCNT_LO_U32_B32; Var dst:PsrRegSlot; src:array[0..2] of TsrRegNode; begin //V_MBCNT_LO_U32_B32 vdst, vsrc, vaccum //mask_lo_threads_before= (thread_id>32) ? 0xffffffff : (1<32) ? (1<<(thread_id-32))-1 : 0 //vdst = vaccum.u + bit_count(vsrc & mask_hi_threads_before) dst:=get_vdst8(FSPI.VOP2.VDST); //src[0]:=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtUint32); src[1]:=fetch_vsrc8(FSPI.VOP2.VSRC1,dtUint32); //only lower thread_id mean MakeCopy(dst,src[1]); end; procedure TEmit_VOP2.emit_V_WRITELANE_B32; Var dst :PsrRegSlot; src :TsrRegNode; slane:TsrRegNode; slane_id:Byte; lane :PsrRegSlot; begin dst :=get_vdst8 (FSPI.VOP2.VDST); src :=fetch_ssrc9(FSPI.VOP2.SRC0 ,dtUnknow); slane:=fetch_ssrc8(FSPI.VOP2.VSRC1,dtUnknow); Assert(slane.is_const,'TODO: indexed V_WRITELANE_B32'); slane_id:=(slane.AsConst.AsUint8 and 63); if not dst^.ConvertToVectorArray then begin Assert(false,'ConvertToVectorArray'); end; lane:=dst^.Lanes(slane_id); MakeCopy(lane,src); end; procedure TEmit_VOP2.emit_V_READLANE_B32; Var dst :PsrRegSlot; src :PsrRegSlot; slane:TsrRegNode; slane_id:Byte; lane :PsrRegSlot; sdata:TsrRegNode; begin dst :=get_ssrc8 (FSPI.VOP2.VDST); src :=get_ssrc9 (FSPI.VOP2.SRC0); slane:=fetch_ssrc8(FSPI.VOP2.VSRC1,dtUnknow); Assert(src^.Category=cVectorArray,'TODO: subgroup V_READLANE_B32'); Assert(slane.is_const,'TODO: indexed V_READLANE_B32'); slane_id:=(slane.AsConst.AsUint8 and 63); lane:=src^.Lanes(slane_id); sdata:=MakeRead(lane,dtUnknow); MakeCopy(dst,sdata); end; procedure TEmit_VOP2.emit_VOP2; begin Case FSPI.VOP2.OP of V_CNDMASK_B32: emit_V_CNDMASK_B32; V_AND_B32 : emit_V_AND_B32; V_OR_B32 : emit_V_OR_B32; V_XOR_B32 : emit_V_XOR_B32; V_LSHL_B32 : emit_V_SH(Op.OpShiftLeftLogical ,dtUint32,False); V_LSHLREV_B32: emit_V_SH(Op.OpShiftLeftLogical ,dtUint32,True); V_LSHR_B32 : emit_V_SH(Op.OpShiftRightLogical ,dtUint32,False); V_LSHRREV_B32: emit_V_SH(Op.OpShiftRightLogical ,dtUint32,True); V_ASHR_I32 : emit_V_SH(Op.OpShiftRightArithmetic,dtInt32 ,False); V_ASHRREV_I32: emit_V_SH(Op.OpShiftRightArithmetic,dtInt32 ,True); V_ADD_I32 : emit_V_ADD_I32; V_SUB_I32 : emit_V_SUB_I32(False); V_SUBREV_I32 : emit_V_SUB_I32(True ); V_ADD_F32 : emit_V2_F32(Op.OpFAdd,False); V_SUB_F32 : emit_V2_F32(Op.OpFSub,False); V_SUBREV_F32 : emit_V2_F32(Op.OpFSub,True ); V_ADDC_U32: emit_V_ADDC_U32; V_SUBB_U32 :emit_V_SUBB_U32(False); V_SUBBREV_U32:emit_V_SUBB_U32(True); V_MUL_F32 : emit_V2_F32(Op.OpFMul,False); V_MUL_LEGACY_F32: emit_V_MUL_LEGACY_F32; V_CVT_PKRTZ_F16_F32: emit_V_CVT_PKRTZ_F16_F32; V_MUL_I32_I24: emit_V_MUL_I32_I24; V_MUL_U32_U24: emit_V_MUL_U32_U24; V_MAC_F32 : emit_V_MAC_F32; V_MAC_LEGACY_F32: emit_V_MAC_LEGACY_F32; V_MADAK_F32: emit_V_MADAK_F32; V_MADMK_F32: emit_V_MADMK_F32; V_BCNT_U32_B32: emit_V_BCNT_U32_B32; V_MIN_LEGACY_F32: emit_V_MMX(GlslOp.NMin,dtFloat32); V_MAX_LEGACY_F32: emit_V_MMX(GlslOp.NMax,dtFloat32); V_MIN_F32: emit_V_MMX(GlslOp.FMin,dtFloat32); V_MAX_F32: emit_V_MMX(GlslOp.FMax,dtFloat32); V_MIN_I32: emit_V_MMX(GlslOp.SMin,dtInt32); V_MAX_I32: emit_V_MMX(GlslOp.SMax,dtInt32); V_MIN_U32: emit_V_MMX(GlslOp.UMin,dtUint32); V_MAX_U32: emit_V_MMX(GlslOp.UMax,dtUint32); V_LDEXP_F32: emit_V_LDEXP_F32; V_MBCNT_LO_U32_B32: emit_V_MBCNT_LO_U32_B32; V_MBCNT_HI_U32_B32: emit_V_MBCNT_HI_U32_B32; V_WRITELANE_B32:emit_V_WRITELANE_B32; V_READLANE_B32 :emit_V_READLANE_B32; else Assert(false,'VOP2?'+IntToStr(FSPI.VOP2.OP)+' '+get_str_spi(FSPI)); end; end; end.