diff --git a/chip/pm4_me.pas b/chip/pm4_me.pas
index 58f277a5..b2ea5914 100644
--- a/chip/pm4_me.pas
+++ b/chip/pm4_me.pas
@@ -3292,6 +3292,13 @@ begin
  end;
 end;
 
+procedure pm4_PfpSyncMe(var ctx:t_me_render_context;node:p_pm4_node_PfpSyncMe);
+begin
+ //Signal the event carried by the node only when ctx.WaitConfirmOrSwitch
+ //reports True; otherwise return without signalling it.
+ if ctx.WaitConfirmOrSwitch then RTLEventSetEvent(node^.event);
+end;
+
 //
 
 procedure pm4_me_thread(me:p_pm4_me); SysV_ABI_CDecl;
@@ -3402,6 +3409,8 @@ begin
       ntWaitOnCECounter    :pm4_WaitOnCECounter    (ctx,Pointer(ctx.node));
       ntWaitOnDECounterDiff:pm4_WaitOnDECounterDiff(ctx,Pointer(ctx.node));
 
+      ntPfpSyncMe          :pm4_PfpSyncMe          (ctx,Pointer(ctx.node));
+
       else
        begin
         Writeln(stderr,'me:+',ctx.node^.ntype);
diff --git a/chip/pm4_pfp.pas b/chip/pm4_pfp.pas
index 2c8971ea..4e006886 100644
--- a/chip/pm4_pfp.pas
+++ b/chip/pm4_pfp.pas
@@ -56,6 +56,7 @@ type
   curr_ibuf :p_pm4_ibuffer;
   //
   LastSetReg:Word;
+  event:PRTLEvent;
   //
   function  stream_type:t_pm4_stream_type;
   procedure init;
@@ -1045,6 +1046,28 @@ begin
 
 end;
 
+procedure FlushAndWaitMe(pctx:p_pfp_ctx);
+var
+ sync_event:PRTLEvent;
+begin
+ //Only the stGfxDcb stream is handled; for any other stream type do nothing
+ if (pctx^.stream_type<>stGfxDcb) then Exit;
+
+ //The event is created on first use and then kept in the context
+ sync_event:=pctx^.event;
+ if (sync_event=nil) then
+ begin
+  sync_event:=RTLEventCreate;
+  pctx^.event:=sync_event;
+ end;
+
+ //Queue a PfpSyncMe node carrying the event, then flush the stream
+ pctx^.stream[stGfxDcb].PfpSyncMe(sync_event);
+ pctx^.Flush_stream(stGfxDcb);
+ //Block the caller until the event is signalled
+ RTLEventWaitFor(sync_event);
+end;
+
 procedure onEventWrite(pctx:p_pfp_ctx;Body:PTPM4CMDEVENTWRITE);
 begin
  Assert(pctx^.stream_type=stGfxDcb);
@@ -1109,7 +1132,9 @@ begin
 
  pctx^.stream[stGfxDcb].EventWriteEop(Pointer(Body^.address),Body^.DATA,Body^.eventType,Body^.dataSel,Body^.intSel);
 
- pctx^.Flush_stream(stGfxDcb);
+ //pctx^.Flush_stream(stGfxDcb);
+
+ FlushAndWaitMe(pctx);
 end;
 
 procedure onEventWriteEos(pctx:p_pfp_ctx;Body:PPM4CMDEVENTWRITEEOS);
@@ -1143,6 +1168,8 @@ begin
  DWORD(pctx^.CX_REG.VGT_EVENT_INITIATOR):=Body^.eventType;
 
  pctx^.stream[stGfxDcb].EventWriteEos(Pointer(Body^.address),Body^.data,Body^.eventType,Body^.command);
+
+ FlushAndWaitMe(pctx);
 end;
 
 const
@@ -1196,6 +1223,8 @@ begin
    begin
     //Execute on the parser side
 
+    //FlushAndWaitMe(pctx);
+
     adrDst_dmem:=get_dmem_ptr(Pointer(adrDst));
 
     case (srcSel or (dstSel shl 4)) of
@@ -1283,6 +1312,8 @@ begin
   WRITE_DATA_ENGINE_PFP:
     begin
 
+     //FlushAndWaitMe(pctx);
+
      case dstSel of
       WRITE_DATA_DST_SEL_MEMORY_SYNC,  //writeDataInline
       WRITE_DATA_DST_SEL_TCL2,         //writeDataInlineThroughL2
@@ -1353,6 +1384,8 @@ begin
   WAIT_REG_MEM_ENGINE_PFP:
     begin
 
+     pctx^.Flush_stream(pctx^.stream_type);
+
      while not me_test_mem(Pointer(Body^.pollAddress),Body^.reference,Body^.mask,Body^.compareFunc) do
      begin
       msleep_td(hz div 10000);
@@ -1695,6 +1728,8 @@ begin
  pctx^.stream[stGfxDcb].DrawIndex2(pctx^.SG_REG,
                                    pctx^.CX_REG,
                                    pctx^.UC_REG);
+
+ //FlushAndWaitMe(pctx);
 end;
 
 procedure onDrawIndexOffset2(pctx:p_pfp_ctx;Body:PPM4CMDDRAWINDEXOFFSET2);
@@ -1716,6 +1751,8 @@ begin
                                          pctx^.CX_REG,
                                          pctx^.UC_REG,
                                          Body^.indexOffset);
+
+ //FlushAndWaitMe(pctx);
 end;
 
 procedure onDrawIndexAuto(pctx:p_pfp_ctx;Body:PPM4CMDDRAWINDEXAUTO);
@@ -1740,6 +1777,8 @@ begin
  pctx^.stream[stGfxDcb].DrawIndexAuto(pctx^.SG_REG,
                                       pctx^.CX_REG,
                                       pctx^.UC_REG);
+
+ FlushAndWaitMe(pctx);
 end;
 
 procedure onDrawIndexIndirectCountMulti(pctx:p_pfp_ctx;Body:PPM4CMDDRAWINDEXINDIRECTMULTI);
@@ -1781,7 +1820,7 @@ begin
  //stallCommandBufferParser
  //PFP waits until the ME completes all preceding commands before allowing the next batch to proceed.
 
- pctx^.Flush_stream(stGfxDcb);
+ FlushAndWaitMe(pctx);
 end;
 
 procedure onPushMarker(pctx:p_pfp_ctx;Body:PChar;size:Integer);
diff --git a/chip/pm4_stream.pas b/chip/pm4_stream.pas
index fb647f3b..f730b5ad 100644
--- a/chip/pm4_stream.pas
+++ b/chip/pm4_stream.pas
@@ -129,7 +129,8 @@ type
   ntDrawIndex2,
   ntDrawIndexOffset2,
   ntDrawIndexAuto,
-  ntDispatchDirect
+  ntDispatchDirect,
+  ntPfpSyncMe
  );
 
 const
@@ -375,6 +376,11 @@ type
 
  end;
 
+ p_pm4_node_PfpSyncMe=^t_pm4_node_PfpSyncMe; //pointer to the sync node below
+ t_pm4_node_PfpSyncMe=object(t_pm4_node) //stream node tagged ntPfpSyncMe
+  event:PRTLEvent; //event stored by t_pm4_stream.PfpSyncMe; signalled by the node's handler
+ end;
+
  p_pm4_stream=^t_pm4_stream;
  t_pm4_stream=object(t_pm4_resource_stream_scope)
   //
@@ -441,6 +447,7 @@ type
                           var UC_REG:TUSERCONFIG_REG_SHORT);
   procedure Build_cs_info (node:p_pm4_node_DispatchDirect;var GPU_REGS:TGPU_REGS);
   procedure DispatchDirect(var SC_REG:TSH_REG_COMPUTE_GROUP);
+  procedure PfpSyncMe(event:PRTLEvent);
  end;
 
 implementation
@@ -1811,6 +1818,20 @@ begin
  add_node(node);
 end;
 
+procedure t_pm4_stream.PfpSyncMe(event:PRTLEvent);
+var
+ sync_node:p_pm4_node_PfpSyncMe;
+begin
+ //Allocate room for one t_pm4_node_PfpSyncMe from the stream allocator
+ sync_node:=allocator.Alloc(SizeOf(t_pm4_node_PfpSyncMe));
+ //Attach the caller's event, then fill the node header:
+ //default resource scope and the ntPfpSyncMe type tag
+ sync_node^.event:=event;
+ sync_node^.scope:=Default(t_pm4_resource_curr_scope);
+ sync_node^.ntype:=ntPfpSyncMe;
+ add_node(sync_node); //append the node to this stream
+end;
+
 //
 
 procedure t_cache_block_allocator.init;