unit emit_DS;

{$mode objfpc}{$H+}

// Emitters for the DS (data share: LDS/GDS) instruction group.
// Every routine reads the currently decoded instruction from FSPI.DS.

interface

uses
  sysutils,
  spirv,
  ps4_pssl,
  bittype,
  srType,
  srConst,
  srInput,
  srReg,
  srLayout,
  emit_fetch;

type
 TEmit_DS=class(TEmitFetch)
  // Dispatches on FSPI.DS.OP to one of the emitters below.
  procedure emit_DS;
  procedure emit_DS_SWIZZLE_B32;
  // offset is a byte offset; it is DWORD because the *2ST64_B64 forms pass
  // values up to 255*8*64+4, which does not fit in 16 bits.
  function  fetch_ds_chain   (vbindex:TsrRegNode;rtype,atomic:TsrDataType;offset:DWORD):TsrChain;
  procedure emit_DS_WRITE    (rtype:TsrDataType);
  procedure emit_DS_WRITE2   (rtype:TsrDataType;extra_stride:Word);
  procedure emit_DS_READ     (rtype:TsrDataType);
  procedure emit_DS_READ2    (rtype:TsrDataType;extra_stride:Word);
  procedure emit_DS_ATOMIC_OP(rtype:TsrDataType;OpId:DWORD;rtn:Boolean);
 end;

implementation

const
 // Values of the "mode" bit (top bit) of the swizzle pattern.
 BitMode=0;
 QdMode =1;

type
 // Four 2-bit selectors, one per lane of a quad (8 bits total).
 tds_lanes=bitpacked record
  lane0:bit2;
  lane1:bit2;
  lane2:bit2;
  lane3:bit2;
 end;

 // 16-bit swizzle pattern overlaid on DS.OFFSET; both views share the
 // same "mode" bit in the top position.
 tds_pattern=packed record
  Case Byte of
   0:(qd:bitpacked record
       lanes:tds_lanes;
       align:bit7;
       mode :bit1;
      end);
   1:(bit:bitpacked record
       mask_and:bit5;
       mask_or :bit5;
       mask_xor:bit5;
       mode    :bit1;
      end);
 end;

// Emits DS_SWIZZLE_B32: reinterprets DS.OFFSET as a tds_pattern and moves the
// value of the ADDR register between subgroup lanes into VDST.
//  QdMode : OpGroupNonUniformQuadBroadcast with an index taken from the
//           per-lane 2-bit selectors.
//  BitMode: OpGroupNonUniformShuffle with index
//           ((lane_id and mask_and) or mask_or) xor mask_xor.
procedure TEmit_DS.emit_DS_SWIZZLE_B32;
Var
 dst        :PsrRegSlot;
 src        :TsrRegNode;
 lane_id    :TsrRegNode;
 id_in_group:TsrRegNode;
 lanes      :TsrRegNode;
 base       :TsrRegNode;
 index      :TsrRegNode;
 maskAnd    :TsrRegNode;
 maskOr     :TsrRegNode;
 maskXor    :TsrRegNode;
 valAnd     :TsrRegNode;
 valOr      :TsrRegNode;
 pat        :tds_pattern;
begin
 // The two OFFSET bytes are read back as one 16-bit pattern word.
 Word(pat):=PWORD(@FSPI.DS.OFFSET)^;

 dst:=get_vdst8  (FSPI.DS.VDST);
 src:=fetch_vsrc8(FSPI.DS.ADDR,dtUnknow);

 // Keep the source type when it is already a 32-bit scalar type,
 // otherwise fetch it again as dtUint32.
 case src.dtype of
  dtFloat32:; //allow
  dtInt32  :; //allow
  dtUint32 :; //allow
  else
   begin
    //retype
    src:=fetch_vsrc8(FSPI.DS.ADDR,dtUint32);
   end;
 end;

 Case pat.qd.mode of
  QdMode:
   begin
    if (pat.qd.lanes.lane0=pat.qd.lanes.lane1) and
       (pat.qd.lanes.lane0=pat.qd.lanes.lane2) and
       (pat.qd.lanes.lane0=pat.qd.lanes.lane3) then
    begin
     // All four selectors are equal: the index is a plain immediate.
     index:=NewImm_i(dtUint32,pat.qd.lanes.lane0);

     Op3(Op.OpGroupNonUniformQuadBroadcast,src.dtype,dst,NewImm_i(dtUint32,Scope.Subgroup),src,index);
    end else
    begin
     // Selectors differ: extract this lane's 2-bit selector at run time,
     // index = bitfield(lanes, (lane_id and 3)*2, 2).
     lanes:=NewImm_i(dtUint32,Byte(pat.qd.lanes));

     lane_id:=AddInput(@RegsStory.FUnattach,dtUint32,itSubgroupLocalInvocationId);

     id_in_group:=OpAndTo(lane_id,3);
     id_in_group.PrepType(ord(dtUint32));

     base :=OpShlTo (id_in_group,1);
     index:=OpBFUETo(lanes,base,NewImm_i(dtUint32,2));

     Op3(Op.OpGroupNonUniformQuadBroadcast,src.dtype,dst,NewImm_i(dtUint32,Scope.Subgroup),src,index);
    end;
   end;
  BitMode:
   begin
    lane_id:=AddInput(@RegsStory.FUnattach,dtUint32,itSubgroupLocalInvocationId);

    // Bit 5 is always set in the AND mask, so the AND never clears it.
    // A mask of 63 is skipped as a no-op
    // (presumably lane ids are < 64 - TODO confirm).
    if ((pat.bit.mask_and or $20)<>63) then
    begin
     maskAnd:=NewImm_i(dtUint32,pat.bit.mask_and or $20);
     valAnd :=NewReg(dtUint32);
     _Op2(line,Op.OpBitwiseAnd,valAnd,lane_id,maskAnd);
    end else
    begin
     valAnd:=lane_id;
    end;

    // OR with zero is skipped.
    if (pat.bit.mask_or<>0) then
    begin
     maskOr:=NewImm_i(dtUint32,pat.bit.mask_or);
     valOr :=NewReg(dtUint32);
     _Op2(line,Op.OpBitwiseOr,valOr,valAnd,maskOr);
    end else
    begin
     valOr:=valAnd;
    end;

    // The XOR is always emitted, even for a zero mask.
    begin
     maskXor:=NewImm_i(dtUint32,pat.bit.mask_xor);
     index  :=NewReg(dtUint32);
     _Op2(line,Op.OpBitwiseXor,index,valOr,maskXor);
    end;

    //this is needed?
    //vdst.lane[i] = EXEC[j] ? vsrc_tmp.lane[j] : 0

    Op3(Op.OpGroupNonUniformShuffle,src.dtype,dst,NewImm_i(dtUint32,Scope.Subgroup),src,index);
   end;
 end;

end;

// Builds the access chain for one DS element.
//  vbindex: address register node (byte address).
//  rtype  : element type; its byte size is used as element size and stride.
//  atomic : forwarded as cflags(atomic) to the layout fetch.
//  offset : byte offset added to vbindex.
// The layout is LDS when FSPI.DS.GDS=0 and GDS when FSPI.DS.GDS=1.
function TEmit_DS.fetch_ds_chain(vbindex:TsrRegNode;rtype,atomic:TsrDataType;offset:DWORD):TsrChain;
var
 pLayout:TsrDataLayout;
 lvl_0:TsrChainLvl_0;
 lvl_1:TsrChainLvl_1;
 stride:PtrUint;
begin

 case FSPI.DS.GDS of
  0:pLayout:=DataLayoutList.FetchLDS(); //base:LDS_BASE  size:min(M0[16:0], LDS_SIZE)
  1:pLayout:=DataLayoutList.FetchGDS(); //base:M0[31:16] size:M0[15:0]
 end;

 //region_addr0 = (OFFSET0 * OpDataSize + vbindex)
 //region_addr0 = (OFFSET0 * OpDataSize * 64 + vbindex)

 stride:=(rtype.BitSize div 8);

 lvl_0.size  :=stride;
 lvl_0.offset:=offset;

 if vbindex.is_const then
 begin
  //#static
  //i = #(OFFSET + vbindex) & alignment
  lvl_0.offset:=lvl_0.offset + vbindex.AsConst.GetData;
  // Round down to a multiple of stride (mask assumes a power of two).
  lvl_0.offset:=lvl_0.offset and (not (stride-1)); //4,8

  Result:=pLayout.Fetch(@lvl_0,nil,cflags(atomic));
 end else
 begin
  //#dynamic
  //i = (vbindex + OFFSET) / stride
  lvl_1.pIndex:=OpIAddTo(vbindex,lvl_0.offset);
  lvl_1.pIndex:=OpIDivTo(lvl_1.pIndex,stride);
  lvl_1.stride:=stride;

  // The whole address now lives in the dynamic index.
  lvl_0.offset:=0;

  Result:=pLayout.Fetch(@lvl_0,@lvl_1,cflags(atomic));
 end;

end;

//vbindex, vsrc[] [OFFSET:<0..65535>] [GDS:< 0|1>]
// Stores one element of rtype taken from DATA[0] at (ADDR + OFFSET).
procedure TEmit_DS.emit_DS_WRITE(rtype:TsrDataType);
var
 pChain:TsrChain;
 vbindex:TsrRegNode;
 vsrc:TsrRegNode;
begin
 vbindex:=fetch_vdst8(FSPI.DS.ADDR,dtUint32);

 // 64-bit data is fetched as a register pair, 32-bit data with its own
 // type, narrower data untyped and converted just below.
 if (rtype.BitSize=64) then
 begin
  vsrc:=fetch_vdst8_64(FSPI.DS.DATA[0],dtUint64);
 end else
 if (rtype.BitSize=32) then
 begin
  vsrc:=fetch_vdst8(FSPI.DS.DATA[0],rtype);
 end else
 begin
  vsrc:=fetch_vdst8(FSPI.DS.DATA[0],dtUnknow);
 end;

 case rtype of
  dtUint8 :vsrc:=OpUToU(vsrc,dtUint8);
  dtUint16:vsrc:=OpUToU(vsrc,dtUint16);
  else;
 end;

 // Both OFFSET bytes together form the 16-bit byte offset.
 pChain:=fetch_ds_chain(vbindex,rtype,dtUnknow,WORD(FSPI.DS.OFFSET));

 FetchStore(pChain,vsrc);
end;

//vbindex, vsrc0[], vsrc1[] [OFFSET0:<0..255>] [OFFSET1:<0..255>] [GDS:< 0|1>]
// Stores up to two elements: DATA[i] goes to
// ADDR + OFFSET[i]*sizeof(rtype)*extra_stride.
// 64-bit elements are written as two 32-bit stores at +0 and +4.
procedure TEmit_DS.emit_DS_WRITE2(rtype:TsrDataType;extra_stride:Word);
var
 pChain:array[0..3] of TsrChain;
 vbindex:TsrRegNode;
 vsrc:array[0..3] of TsrRegNode;
 i,hi:Byte;
begin
 vbindex:=fetch_vdst8(FSPI.DS.ADDR,dtUint32);

 // hi=0 when both offsets are equal, so only element 0 is processed.
 // NOTE(review): DATA[1] is then never stored - confirm this is intended.
 hi:=ord(FSPI.DS.OFFSET[0]<>FSPI.DS.OFFSET[1]);

 if (rtype.BitSize=64) then
 begin
  // Each 64-bit value is taken from two consecutive 32-bit registers.
  for i:=0 to hi do
  begin
   vsrc[i*2+0]:=fetch_vdst8(FSPI.DS.DATA[i]+0,dtUint32);
   vsrc[i*2+1]:=fetch_vdst8(FSPI.DS.DATA[i]+1,dtUint32);
  end;

  for i:=0 to hi do
  begin
   pChain[i*2+0]:=fetch_ds_chain(vbindex,dtUint32,dtUnknow,FSPI.DS.OFFSET[i]*(8)*extra_stride+0);
   pChain[i*2+1]:=fetch_ds_chain(vbindex,dtUint32,dtUnknow,FSPI.DS.OFFSET[i]*(8)*extra_stride+4);
  end;

  for i:=0 to hi do
  begin
   FetchStore(pChain[i*2+0],vsrc[i*2+0]);
   FetchStore(pChain[i*2+1],vsrc[i*2+1]);
  end;

  exit;
 end else
 begin
  for i:=0 to hi do
  begin
   vsrc[i]:=fetch_vdst8(FSPI.DS.DATA[i],rtype);
  end;
 end;

 for i:=0 to hi do
 begin
  pChain[i]:=fetch_ds_chain(vbindex,rtype,dtUnknow,FSPI.DS.OFFSET[i]*(rtype.BitSize div 8)*extra_stride);
 end;

 for i:=0 to hi do
 begin
  FetchStore(pChain[i],vsrc[i]);
 end;
end;

//vdst[], vbindex [OFFSET:<0..65535>] [GDS:< 0|1>]
// Loads one element of rtype from (ADDR + OFFSET) into VDST
// (VDST and VDST+1 for 64-bit). 8/16-bit values are widened to 32 bits
// with OpUToU (unsigned types) or OpSToS (signed types).
procedure TEmit_DS.emit_DS_READ(rtype:TsrDataType);
var
 pChain:TsrChain;
 vbindex:TsrRegNode;
 vdst:TsrRegNode;
 dst:array[0..1] of PsrRegSlot;
begin
 vbindex:=fetch_vdst8(FSPI.DS.ADDR,dtUint32);

 pChain:=fetch_ds_chain(vbindex,rtype,dtUnknow,WORD(FSPI.DS.OFFSET));

 vdst:=FetchLoad(pChain,rtype);

 case rtype of
  dtUint8 :vdst:=OpUToU(vdst,dtUint32);
  dtUint16:vdst:=OpUToU(vdst,dtUint32);
  //
  dtInt8  :vdst:=OpSToS(vdst,dtInt32);
  dtInt16 :vdst:=OpSToS(vdst,dtInt32);
  else;
 end;

 if (rtype.BitSize=64) then
 begin
  dst[0]:=get_vdst8(FSPI.DS.VDST+0);
  dst[1]:=get_vdst8(FSPI.DS.VDST+1);

  MakeCopy64(dst[0],dst[1],vdst);
 end else
 begin
  dst[0]:=get_vdst8(FSPI.DS.VDST);

  MakeCopy(dst[0],vdst);
 end;

end;

// Loads up to two elements: element i comes from
// ADDR + OFFSET[i]*sizeof(rtype)*extra_stride.
// 32-bit results go to VDST+i; 64-bit elements are read as two 32-bit
// loads at +0 and +4 into VDST+i*2 and VDST+i*2+1.
procedure TEmit_DS.emit_DS_READ2(rtype:TsrDataType;extra_stride:Word);
var
 pChain:array[0..3] of TsrChain;
 vbindex:TsrRegNode;
 vdst:array[0..3] of TsrRegNode;
 dst:array[0..3] of PsrRegSlot;
 i,hi:Byte;
begin
 vbindex:=fetch_vdst8(FSPI.DS.ADDR,dtUint32);

 // hi=0 when both offsets are equal, so only element 0 is processed.
 // NOTE(review): the second destination is then never written -
 // confirm this is intended.
 hi:=ord(FSPI.DS.OFFSET[0]<>FSPI.DS.OFFSET[1]);

 if (rtype.BitSize=64) then
 begin
  for i:=0 to hi do
  begin
   pChain[i*2+0]:=fetch_ds_chain(vbindex,dtUint32,dtUnknow,FSPI.DS.OFFSET[i]*(8)*extra_stride+0);
   pChain[i*2+1]:=fetch_ds_chain(vbindex,dtUint32,dtUnknow,FSPI.DS.OFFSET[i]*(8)*extra_stride+4);
  end;

  for i:=0 to hi do
  begin
   vdst[i*2+0]:=FetchLoad(pChain[i*2+0],dtUint32);
   vdst[i*2+1]:=FetchLoad(pChain[i*2+1],dtUint32);
  end;

  for i:=0 to hi do
  begin
   dst[i*2+0]:=get_vdst8(FSPI.DS.VDST+i*2+0);
   dst[i*2+1]:=get_vdst8(FSPI.DS.VDST+i*2+1);

   MakeCopy(dst[i*2+0],vdst[i*2+0]);
   MakeCopy(dst[i*2+1],vdst[i*2+1]);
  end;

  exit;
 end;

 // Only non-64-bit element types reach this point.
 for i:=0 to hi do
 begin
  pChain[i]:=fetch_ds_chain(vbindex,rtype,dtUnknow,FSPI.DS.OFFSET[i]*(rtype.BitSize div 8)*extra_stride);
 end;

 for i:=0 to hi do
 begin
  vdst[i]:=FetchLoad(pChain[i],rtype);
 end;

 for i:=0 to hi do
 begin
  dst[i]:=get_vdst8(FSPI.DS.VDST+i);

  MakeCopy(dst[i],vdst[i]);
 end;

end;

//vdst, vbindex, vsrc [OFFSET:<0..65535>] [GDS:< 0|1>]
// Emits the atomic operation OpId on the element at (ADDR + OFFSET) with
// DATA[0] as operand. When rtn is True the value returned by FetchAtomic
// is copied to VDST; otherwise it is only marked as read.
procedure TEmit_DS.emit_DS_ATOMIC_OP(rtype:TsrDataType;OpId:DWORD;rtn:Boolean);
var
 pChain:TsrChain;
 vbindex:TsrRegNode;
 vsrc:TsrRegNode;
 vdst:TsrRegNode;
 dst:array[0..1] of PsrRegSlot;
begin
 vbindex:=fetch_vdst8(FSPI.DS.ADDR,dtUint32);

 if (rtype.BitSize=64) then
 begin
  vsrc:=fetch_vdst8_64(FSPI.DS.DATA[0],dtUint64);
 end else
 if (rtype.BitSize=32) then
 begin
  vsrc:=fetch_vdst8(FSPI.DS.DATA[0],rtype);
 end else
 begin
  vsrc:=fetch_vdst8(FSPI.DS.DATA[0],dtUnknow);
 end;

 case rtype of
  dtUint8 :vsrc:=OpUToU(vsrc,dtUint8);
  dtUint16:vsrc:=OpUToU(vsrc,dtUint16);
  else
 end;

 // rtype is passed for both the element type and the atomic type.
 pChain:=fetch_ds_chain(vbindex,rtype,rtype,WORD(FSPI.DS.OFFSET));

 vdst:=FetchAtomic(pChain,OpId,rtype,vsrc);

 if rtn then
 begin
  //save result

  case rtype of
   dtUint8 :vdst:=OpUToU(vdst,dtUint32);
   dtUint16:vdst:=OpUToU(vdst,dtUint32);
   //
   dtInt8  :vdst:=OpSToS(vdst,dtInt32);
   dtInt16 :vdst:=OpSToS(vdst,dtInt32);
   else;
  end;

  if (rtype.BitSize=64) then
  begin
   dst[0]:=get_vdst8(FSPI.DS.VDST+0);
   dst[1]:=get_vdst8(FSPI.DS.VDST+1);

   MakeCopy64(dst[0],dst[1],vdst);
  end else
  begin
   dst[0]:=get_vdst8(FSPI.DS.VDST);

   MakeCopy(dst[0],vdst);
  end;

 end else
 begin
  //no result
  vdst.mark_read(nil); //self link
 end;

end;

// Entry point: selects the emitter for the current DS opcode.
// Opcodes that are not listed trigger an assertion naming the opcode.
procedure TEmit_DS.emit_DS;
begin

 Case FSPI.DS.OP of

  DS_NOP:;

  DS_WRITE_B8      :emit_DS_WRITE(dtUint8);
  DS_WRITE_B16     :emit_DS_WRITE(dtUint16);
  DS_WRITE_B32     :emit_DS_WRITE(dtUint32);
  DS_WRITE_B64     :emit_DS_WRITE(dtUint64);

  DS_WRITE2_B32    :emit_DS_WRITE2(dtUint32,1);
  DS_WRITE2_B64    :emit_DS_WRITE2(dtUint64,1);

  DS_WRITE2ST64_B32:emit_DS_WRITE2(dtUint32,64);
  DS_WRITE2ST64_B64:emit_DS_WRITE2(dtUint64,64);

  DS_READ_I8       :emit_DS_READ(dtInt8);
  DS_READ_U8       :emit_DS_READ(dtUint8);
  DS_READ_I16      :emit_DS_READ(dtInt16);
  DS_READ_U16      :emit_DS_READ(dtUint16);
  DS_READ_B32      :emit_DS_READ(dtUint32);
  DS_READ_B64      :emit_DS_READ(dtUint64);

  DS_READ2_B32     :emit_DS_READ2(dtUint32,1);
  DS_READ2_B64     :emit_DS_READ2(dtUint64,1);

  DS_READ2ST64_B32 :emit_DS_READ2(dtUint32,64);
  DS_READ2ST64_B64 :emit_DS_READ2(dtUint64,64);

  DS_ADD_U32       :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicIAdd,False);
  DS_SUB_U32       :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicISub,False);
  DS_INC_U32       :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicIIncrement,False);
  DS_DEC_U32       :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicIDecrement,False);

  DS_MIN_I32       :emit_DS_ATOMIC_OP(dtInt32 ,Op.OpAtomicSMin,False);
  DS_MAX_I32       :emit_DS_ATOMIC_OP(dtInt32 ,Op.OpAtomicSMax,False);

  DS_MIN_U32       :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicUMin,False);
  DS_MAX_U32       :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicUMax,False);

  DS_AND_B32       :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicAnd,False);
  DS_OR_B32        :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicOr ,False);
  DS_XOR_B32       :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicXor,False);

  DS_ADD_RTN_U32   :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicIAdd,True);
  DS_SUB_RTN_U32   :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicISub,True);
  DS_INC_RTN_U32   :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicIIncrement,True);
  DS_DEC_RTN_U32   :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicIDecrement,True);

  DS_MIN_RTN_I32   :emit_DS_ATOMIC_OP(dtInt32 ,Op.OpAtomicSMin,True);
  DS_MAX_RTN_I32   :emit_DS_ATOMIC_OP(dtInt32 ,Op.OpAtomicSMax,True);

  DS_MIN_RTN_U32   :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicUMin,True);
  DS_MAX_RTN_U32   :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicUMax,True);

  DS_AND_RTN_B32   :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicAnd,True);
  DS_OR_RTN_B32    :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicOr ,True);
  DS_XOR_RTN_B32   :emit_DS_ATOMIC_OP(dtUint32,Op.OpAtomicXor,True);

  DS_SWIZZLE_B32   :emit_DS_SWIZZLE_B32;

  else
   Assert(false,'DS?'+IntToStr(FSPI.DS.OP)+' '+get_str_spi(FSPI));
 end;

end;

{
OFFSET0:Byte;
OFFSET1:Byte;
GDS:bit1;
ADDR:Byte;  (vbindex)
DATA0:Byte; (vsrc0)
DATA1:Byte; (vsrc1)
VDST:Byte;
}

end.