From fe88ee410240ef94292c4d5e282fd1bbdf58b06e Mon Sep 17 00:00:00 2001 From: gabest11 Date: Tue, 8 Mar 2011 01:48:15 +0000 Subject: [PATCH] GSdx: optimized the triangle setup of the rasterizer a bit, while it isn't the bottle-neck of drawing, it can still add a few percent to the fps. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4404 96395faa-99c1-11dd-bbfe-3dabce05a288 --- .../GSDrawScanlineCodeGenerator.x64.avx.cpp | 4 +- .../GSDrawScanlineCodeGenerator.x86.avx.cpp | 16 +- .../GSdx/GSDrawScanlineCodeGenerator.x86.cpp | 20 +- plugins/GSdx/GSFunctionMap.h | 18 + plugins/GSdx/GSRasterizer.cpp | 565 ++++++++---------- plugins/GSdx/GSRasterizer.h | 31 +- plugins/GSdx/GSRenderer.cpp | 2 - plugins/GSdx/GSRenderer.h | 2 +- plugins/GSdx/GSRendererHW.h | 7 +- plugins/GSdx/GSRendererSW.cpp | 4 +- plugins/GSdx/GSState.cpp | 14 +- plugins/GSdx/GSState.h | 2 - plugins/GSdx/GSTextureCache.h | 1 + plugins/GSdx/GSVertexSW.h | 6 +- plugins/GSdx/GSdx.vcxproj | 2 +- plugins/GSdx/vtune/iacaMarks.h | 75 +++ 16 files changed, 397 insertions(+), 372 deletions(-) create mode 100644 plugins/GSdx/vtune/iacaMarks.h diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp index a12271a3eb..bc90117f10 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp @@ -379,8 +379,8 @@ void GSDrawScanlineCodeGenerator::Init() } else { - vmovdqa(xmm13, ptr[&m_local.c.rb]); - vmovdqa(xmm14, ptr[&m_local.c.ga]); + vmovdqa(xmm13, ptr[r11 + offsetof(GSScanlineLocalData, c.rb)]); + vmovdqa(xmm14, ptr[r11 + offsetof(GSScanlineLocalData, c.ga)]); } } } diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp index 085e212901..8df198c6e7 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -273,12 +273,12 @@ void 
GSDrawScanlineCodeGenerator::Init() mov(esi, dword[esp + _top]); lea(esi, ptr[esi * 8]); - add(esi, dword[&m_local.gd->fzbr]); + add(esi, ptr[&m_local.gd->fzbr]); // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; lea(edi, ptr[ebx * 2]); - add(edi, dword[&m_local.gd->fzbc]); + add(edi, ptr[&m_local.gd->fzbc]); if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) { @@ -585,8 +585,8 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) // int za = fza_base.y + fza_offset->y; - mov(ebp, dword[esi + 4]); - add(ebp, dword[edi + 4]); + mov(ebp, ptr[esi + 4]); + add(ebp, ptr[edi + 4]); // GSVector4i zs = zi; @@ -682,7 +682,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() return; } - mov(ebx, dword[&m_local.gd->tex]); + mov(ebx, ptr[&m_local.gd->tex]); // ebx = tex @@ -1446,8 +1446,8 @@ void GSDrawScanlineCodeGenerator::ReadFrame() // int fa = fza_base.x + fza_offset->x; - mov(ebx, dword[esi]); - add(ebx, dword[edi]); + mov(ebx, ptr[esi]); + add(ebx, ptr[edi]); if(!m_sel.rfb) { @@ -1805,7 +1805,7 @@ void GSDrawScanlineCodeGenerator::WriteFrame() if(m_sel.fpsm == 2 && m_sel.dthe) { - mov(eax, dword[esp + _top]); + mov(eax, ptr[esp + _top]); and(eax, 3); shl(eax, 5); vpaddw(xmm5, ptr[eax + (size_t)&m_local.gd->dimx[0]]); diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp index 9315dfb314..47f1b96753 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp @@ -268,14 +268,14 @@ void GSDrawScanlineCodeGenerator::Init() // GSVector2i* fza_base = &m_local.gd->fzbr[top]; - mov(esi, dword[esp + _top]); + mov(esi, ptr[esp + _top]); lea(esi, ptr[esi * 8]); - add(esi, dword[&m_local.gd->fzbr]); + add(esi, ptr[&m_local.gd->fzbr]); // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; lea(edi, ptr[ebx * 2]); - add(edi, dword[&m_local.gd->fzbc]); + 
add(edi, ptr[&m_local.gd->fzbc]); if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) { @@ -286,7 +286,7 @@ void GSDrawScanlineCodeGenerator::Init() // ebx = &v - mov(ebx, dword[esp + _v]); + mov(ebx, ptr[esp + _v]); } if(!m_sel.sprite) @@ -587,8 +587,8 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) // int za = fza_base.y + fza_offset->y; - mov(ebp, dword[esi + 4]); - add(ebp, dword[edi + 4]); + mov(ebp, ptr[esi + 4]); + add(ebp, ptr[edi + 4]); // GSVector4i zs = zi; @@ -684,7 +684,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() return; } - mov(ebx, dword[&m_local.gd->tex]); + mov(ebx, ptr[&m_local.gd->tex]); // ebx = tex @@ -1495,8 +1495,8 @@ void GSDrawScanlineCodeGenerator::ReadFrame() // int fa = fza_base.x + fza_offset->x; - mov(ebx, dword[esi]); - add(ebx, dword[edi]); + mov(ebx, ptr[esi]); + add(ebx, ptr[edi]); if(!m_sel.rfb) { @@ -1875,7 +1875,7 @@ void GSDrawScanlineCodeGenerator::WriteFrame() if(m_sel.fpsm == 2 && m_sel.dthe) { - mov(eax, dword[esp + _top]); + mov(eax, ptr[esp + _top]); and(eax, 3); shl(eax, 5); paddw(xmm5, ptr[eax + (size_t)&m_local.gd->dimx[0]]); diff --git a/plugins/GSdx/GSFunctionMap.h b/plugins/GSdx/GSFunctionMap.h index da2ef22f86..d692c0a9c3 100644 --- a/plugins/GSdx/GSFunctionMap.h +++ b/plugins/GSdx/GSFunctionMap.h @@ -234,6 +234,24 @@ public: ml.method_size = (unsigned int)cg->getSize(); iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &ml); +/* + name = format("c:/temp/%s_%016llx.bin", m_name.c_str(), (uint64)key); + + if(FILE* fp = fopen(name.c_str(), "wb")) + { + fputc(0x0F, fp); fputc(0x0B, fp); + fputc(0xBB, fp); fputc(0x6F, fp); fputc(0x00, fp); fputc(0x00, fp); fputc(0x00, fp); + fputc(0x64, fp); fputc(0x67, fp); fputc(0x90, fp); + + fwrite(cg->getCode(), cg->getSize(), 1, fp); + + fputc(0xBB, fp); fputc(0xDE, fp); fputc(0x00, fp); fputc(0x00, fp); fputc(0x00, fp); + fputc(0x64, fp); fputc(0x67, fp); 
fputc(0x90, fp); + fputc(0x0F, fp); fputc(0x0B, fp); + + fclose(fp); + } +*/ } #endif diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp index 39ea4d44a2..fcbb6f6081 100644 --- a/plugins/GSdx/GSRasterizer.cpp +++ b/plugins/GSdx/GSRasterizer.cpp @@ -44,10 +44,14 @@ GSRasterizer::GSRasterizer(IDrawScanline* ds) , m_id(0) , m_threads(1) { + m_edge.buff = (GSScanline*)vmalloc(sizeof(GSScanline) * 2048, false); + m_edge.count = 0; } GSRasterizer::~GSRasterizer() { + if(m_edge.buff != NULL) vmfree(m_edge.buff, sizeof(GSScanline) * 2048); + delete m_ds; } @@ -68,10 +72,12 @@ void GSRasterizer::Draw(const GSRasterizerData* data) { m_ds->BeginDraw(data->param); - const GSVector4i scissor = data->scissor; const GSVertexSW* vertices = data->vertices; const int count = data->count; + m_scissor = data->scissor; + m_fscissor = GSVector4(data->scissor); + m_stats.Reset(); int64 start = __rdtsc(); @@ -80,22 +86,22 @@ void GSRasterizer::Draw(const GSRasterizerData* data) { case GS_POINT_CLASS: m_stats.prims = count; - for(int i = 0; i < count; i++) DrawPoint(&vertices[i], scissor); + for(int i = 0; i < count; i++) DrawPoint(&vertices[i]); break; case GS_LINE_CLASS: ASSERT(!(count & 1)); m_stats.prims = count / 2; - for(int i = 0; i < count; i += 2) DrawLine(&vertices[i], scissor); + for(int i = 0; i < count; i += 2) DrawLine(&vertices[i]); break; case GS_TRIANGLE_CLASS: ASSERT(!(count % 3)); m_stats.prims = count / 3; - for(int i = 0; i < count; i += 3) DrawTriangle(&vertices[i], scissor); + for(int i = 0; i < count; i += 3) DrawTriangle(&vertices[i]); break; case GS_SPRITE_CLASS: ASSERT(!(count & 1)); m_stats.prims = count / 2; - for(int i = 0; i < count; i += 2) DrawSprite(&vertices[i], scissor); + for(int i = 0; i < count; i += 2) DrawSprite(&vertices[i]); break; default: __assume(0); @@ -111,26 +117,26 @@ void GSRasterizer::GetStats(GSRasterizerStats& stats) stats = m_stats; } -void GSRasterizer::DrawPoint(const GSVertexSW* v, const GSVector4i& scissor) 
+void GSRasterizer::DrawPoint(const GSVertexSW* v) { // TODO: round to closest for point, prestep for line GSVector4i p(v->p); - if(scissor.left <= p.x && p.x < scissor.right && scissor.top <= p.y && p.y < scissor.bottom) + if(m_scissor.left <= p.x && p.x < m_scissor.right && m_scissor.top <= p.y && p.y < m_scissor.bottom) { if(IsOneOfMyScanlines(p.y)) { + m_stats.pixels++; + m_ds->SetupPrim(v, *v); m_ds->DrawScanline(p.x + 1, p.x, p.y, *v); - - m_stats.pixels++; } } } -void GSRasterizer::DrawLine(const GSVertexSW* v, const GSVector4i& scissor) +void GSRasterizer::DrawLine(const GSVertexSW* v) { GSVertexSW dv = v[1] - v[0]; @@ -148,8 +154,10 @@ void GSRasterizer::DrawLine(const GSVertexSW* v, const GSVector4i& scissor) m_ds->SetupPrim(v, dscan); - DrawEdge(v[0], v[1], dv, scissor, i, 0); - DrawEdge(v[0], v[1], dv, scissor, i, 1); + DrawEdge(v[0], v[1], dv, i, 0); + DrawEdge(v[0], v[1], dv, i, 1); + + FlushEdge(); return; } @@ -176,7 +184,7 @@ void GSRasterizer::DrawLine(const GSVertexSW* v, const GSVector4i& scissor) GSVector4i p(l.p); - if(scissor.top <= p.y && p.y < scissor.bottom) + if(m_scissor.top <= p.y && p.y < m_scissor.bottom) { GSVertexSW dscan = dv / dv.p.xxxx(); @@ -184,9 +192,9 @@ void GSRasterizer::DrawLine(const GSVertexSW* v, const GSVector4i& scissor) l.p = l.p.upl(r).xyzw(l.p); // r.x => l.y - GSVector4 fscissor(scissor); + DrawTriangleSection(p.y, p.y + 1, l, dl, dscan); - DrawTriangleSection(p.y, p.y + 1, l, dl, dscan, fscissor); + Flush(); } } @@ -199,12 +207,13 @@ void GSRasterizer::DrawLine(const GSVertexSW* v, const GSVector4i& scissor) GSVertexSW dedge = dv / dp.v[i]; // TODO: prestep + clip with the scissor + // TODO: inline drawpoint + Flush() int steps = dpi.v[i]; while(steps-- > 0) { - DrawPoint(&edge, scissor); + DrawPoint(&edge); edge += dedge; } @@ -222,301 +231,187 @@ static const int s_abc[8][4] = {2, 1, 0, 0}, // a > b > c }; -void GSRasterizer::DrawTriangle(const GSVertexSW* vertices, const GSVector4i& scissor) +void 
GSRasterizer::DrawTriangle(const GSVertexSW* vertices) { - GSVertexSW v[3]; + // edge buffer is used here to avoid xmm save-restores (except when we do aa1 in the middle) + + GSVertexSW v[4]; + GSVertexSW dv[3]; + GSVertexSW ddv[3]; + GSVertexSW longest; + GSVertexSW dscan; GSVector4 aabb = vertices[0].p.yyyy(vertices[1].p); GSVector4 bccb = vertices[1].p.yyyy(vertices[2].p).xzzx(); - int i = (aabb > bccb).mask() & 7; + int abc = (aabb > bccb).mask() & 7; - v[0] = vertices[s_abc[i][0]]; - v[1] = vertices[s_abc[i][1]]; - v[2] = vertices[s_abc[i][2]]; + v[0] = vertices[s_abc[abc][0]]; + v[1] = vertices[s_abc[abc][1]]; + v[2] = vertices[s_abc[abc][2]]; aabb = v[0].p.yyyy(v[1].p); bccb = v[1].p.yyyy(v[2].p).xzzx(); - i = (aabb == bccb).mask() & 7; + int i = (aabb == bccb).mask() & 7; - if(m_ds->IsEdge()) - { - DrawEdge(v, scissor); - } - - switch(i) - { - case 0: // a < b < c - DrawTriangleTopBottom(v, scissor); - break; - case 1: // a == b < c - DrawTriangleBottom(v, scissor); - break; - case 4: // a < b == c - DrawTriangleTop(v, scissor); - break; - case 7: // a == b == c - break; - default: - __assume(0); - } -} - -void GSRasterizer::DrawEdge(const GSVertexSW* v, const GSVector4i& scissor) -{ - GSVertexSW dv[3]; + GSVector4 tbf = aabb.xzxz(bccb).ceil(); + GSVector4 tbmax = tbf.max(m_fscissor.yyyy()); + GSVector4 tbmin = tbf.min(m_fscissor.wwww()); + GSVector4i tb = GSVector4i(tbmax.xzyw(tbmin)); dv[0] = v[1] - v[0]; dv[1] = v[2] - v[0]; dv[2] = v[2] - v[1]; - GSVector4 dx = dv[0].p.upl(dv[1].p).xyxy(dv[2].p); - GSVector4 dy = dv[0].p.upl(dv[1].p).zwyx(dv[2].p); - - GSVector4 a = dx.abs() < dy.abs(); // |x| <= |y| - GSVector4 b = dx < GSVector4::zero(); // x < 0 - GSVector4 c = dv[1].p * (dv[0].p / dv[1].p).yyyy() < dv[0].p; // longest.p.x < 0 - - int i = a.mask(); - int j = ((a | b) ^ c.xxxx()).mask() ^ 2; // evil - - GSVertexSW dscan; - - dscan.p = GSVector4::zero(); - dscan.t = GSVector4::zero(); - dscan.c = GSVector4::zero(); - - m_ds->SetupPrim(v, dscan); // 
TODO: don't call it twice (can't be sure about the second call if the triangle is too small) - - DrawEdge(v[0], v[1], dv[0], scissor, i & 1, j & 1); - DrawEdge(v[0], v[2], dv[1], scissor, i & 2, j & 2); - DrawEdge(v[1], v[2], dv[2], scissor, i & 4, j & 4); -} - -void GSRasterizer::DrawTriangleTop(GSVertexSW* v, const GSVector4i& scissor) -{ - GSVertexSW longest; - - longest.p = v[2].p - v[1].p; - - int i = longest.p.upl(longest.p == GSVector4::zero()).mask(); - - if(i & 2) return; - - i &= 1; - - GSVertexSW& l = v[0]; - GSVector4& r = v[0].p; - - GSVector4 fscissor(scissor); - - GSVector4 tb = l.p.upl(v[2].p).ceil(); - - GSVector4 tbmax = tb.max(fscissor.yyyy()); - GSVector4 tbmin = tb.min(fscissor.wwww()); - - GSVector4i tbi = GSVector4i(tbmax.zzww(tbmin)); - - int top = tbi.extract32<0>(); - int bottom = tbi.extract32<2>(); - - if(top >= bottom) return; - - longest.t = v[2].t - v[1].t; - longest.c = v[2].c - v[1].c; - - GSVertexSW dscan = longest * longest.p.xxxx().rcp(); - - GSVertexSW vl = v[1 + i] - l; - GSVector4 vr = v[2 - i].p - r; - - GSVertexSW dl = vl / vl.p.yyyy(); - GSVector4 dr = vr / vr.yyyy(); - - GSVector4 dy = tbmax.zzzz() - l.p.yyyy(); - - l.p = l.p.upl(r).xyzw(l.p); // r.x => l.y - dl.p = dl.p.upl(dr).xyzw(dl.p); // dr.x => dl.y - - l += dl * dy; - - m_ds->SetupPrim(v, dscan); - - DrawTriangleSection(top, bottom, l, dl, dscan, fscissor); -} - -void GSRasterizer::DrawTriangleBottom(GSVertexSW* v, const GSVector4i& scissor) -{ - GSVertexSW longest; - - longest.p = v[1].p - v[0].p; - - int i = longest.p.upl(longest.p == GSVector4::zero()).mask(); - - if(i & 2) return; - - i &= 1; - - GSVertexSW& l = v[i]; - GSVector4& r = v[1 - i].p; - - GSVector4 fscissor(scissor); - - GSVector4 tb = l.p.upl(v[2].p).ceil(); - - GSVector4 tbmax = tb.max(fscissor.yyyy()); - GSVector4 tbmin = tb.min(fscissor.wwww()); - - GSVector4i tbi = GSVector4i(tbmax.zzww(tbmin)); - - int top = tbi.extract32<0>(); - int bottom = tbi.extract32<2>(); - - if(top >= bottom) return; - 
- longest.t = v[1].t - v[0].t; - longest.c = v[1].c - v[0].c; - - GSVertexSW dscan = longest * longest.p.xxxx().rcp(); - - GSVertexSW vl = v[2] - l; - GSVector4 vr = v[2].p - r; - - GSVertexSW dl = vl / vl.p.yyyy(); - GSVector4 dr = vr / vr.yyyy(); - - GSVector4 dy = tbmax.zzzz() - l.p.yyyy(); - - l.p = l.p.upl(r).xyzw(l.p); // r.x => l.y - dl.p = dl.p.upl(dr).xyzw(dl.p); // dr.x => dl.y - - l += dl * dy; - - m_ds->SetupPrim(v, dscan); - - DrawTriangleSection(top, bottom, l, dl, dscan, fscissor); -} - -void GSRasterizer::DrawTriangleTopBottom(GSVertexSW* v, const GSVector4i& scissor) -{ - GSVertexSW dv[3]; - - dv[0] = v[1] - v[0]; - dv[1] = v[2] - v[0]; - - GSVertexSW longest = dv[1] * (dv[0].p / dv[1].p).yyyy() - dv[0]; - - int i = longest.p.upl(longest.p == GSVector4::zero()).mask(); - - if(i & 2) return; - - i &= 1; - - GSVertexSW dscan = longest * longest.p.xxxx().rcp(); - - m_ds->SetupPrim(v, dscan); - - GSVector4 fscissor(scissor); - - GSVector4 tb = v[0].p.upl(v[1].p).zwzw(v[1].p.upl(v[2].p)).ceil(); - - GSVector4 tbmax = tb.max(fscissor.yyyy()); - GSVector4 tbmin = tb.min(fscissor.wwww()); - - GSVector4i tbi = GSVector4i(tbmax.xzyw(tbmin)); - - int top = tbi.extract32<0>(); - int bottom = tbi.extract32<2>(); - - GSVertexSW& l = v[0]; - GSVector4 r = v[0].p; - - GSVertexSW dl = dv[i] / dv[i].p.yyyy(); - GSVector4 dr = dv[1 - i].p / dv[1 - i].p.yyyy(); - - GSVector4 dy = tbmax.xxxx() - l.p.yyyy(); - - l += dl * dy; - r += dr * dy; - - if(top < bottom) + switch(i) { - DrawTriangleSection(top, bottom, l, dl, r, dr, dscan, fscissor); + case 0: // a < b < c + ddv[0] = dv[0] / dv[0].p.yyyy(); + ddv[1] = dv[1] / dv[1].p.yyyy(); + ddv[2] = dv[2] / dv[2].p.yyyy(); + longest = ddv[1] * dv[0].p.yyyy() - dv[0]; + v[3] = v[1] + longest; // point between v[0] and v[2] where y == v[1].y + break; + case 1: // a == b < c + ddv[1] = dv[1] / dv[1].p.yyyy(); + ddv[2] = dv[2] / dv[2].p.yyyy(); + longest = dv[0]; + break; + case 4: // a < b == c + ddv[0] = dv[0] / dv[0].p.yyyy(); 
+ ddv[1] = dv[1] / dv[1].p.yyyy(); + longest = dv[2]; + break; + case 7: // a == b == c + return; + default: + __assume(0); } - top = tbi.y; - bottom = tbi.w; + int j = longest.p.upl(longest.p == GSVector4::zero()).mask(); - if(top < bottom) + if(j & 2) return; + + j &= 1; + + dscan = longest * longest.p.xxxx().rcp(); + + if(m_ds->IsEdge()) { - if(i == 0) - { - l = v[1]; - dv[2] = v[2] - v[1]; - dl = dv[2] / dv[2].p.yyyy(); - } - else - { - r = v[1].p; - dv[2].p = v[2].p - v[1].p; - dr = dv[2].p / dv[2].p.yyyy(); - } + GSVector4 dx = dv[0].p.upl(dv[1].p).xyxy(dv[2].p); + GSVector4 dy = dv[0].p.upl(dv[1].p).zwyx(dv[2].p); - l += dl * (tbmax.zzzz() - l.p.yyyy()); - r += dr * (tbmax.zzzz() - r.yyyy()); + GSVector4 a = dx.abs() < dy.abs(); // |dx| <= |dy| + GSVector4 b = dx < GSVector4::zero(); // dx < 0 + GSVector4 c = longest.p.xxxx() < GSVector4::zero(); // longest.p.x < 0 - l.p = l.p.upl(r).xyzw(l.p); // r.x => l.y - dl.p = dl.p.upl(dr).xyzw(dl.p); // dr.x => dl.y + int i = a.mask(); + int j = ((a | b) ^ c).mask() ^ 2; // evil - DrawTriangleSection(top, bottom, l, dl, dscan, fscissor); + DrawEdge(v[0], v[1], dv[0], i & 1, j & 1); + DrawEdge(v[0], v[2], dv[1], i & 2, j & 2); + DrawEdge(v[1], v[2], dv[2], i & 4, j & 4); + + GSVertexSW dscan; + + dscan.p = GSVector4::zero(); + dscan.t = GSVector4::zero(); + dscan.c = GSVector4::zero(); + + m_ds->SetupPrim(v, dscan); + + FlushEdge(); } + + switch(i) + { + case 0: // a < b < c + + if(tb.x < tb.z) + { + GSVertexSW l = v[0]; + GSVertexSW dl = ddv[j]; + + GSVector4 dy = tbmax.xxxx() - l.p.yyyy(); + + l.p = l.p.xxzw(); // r.x => l.y + dl.p = dl.p.upl(ddv[1 - j].p).xyzw(dl.p); // dr.x => dl.y + + l += dl * dy; + + DrawTriangleSection(tb.x, tb.z, l, dl, dscan); + } + + if(tb.y < tb.w) + { + GSVertexSW l = v[1 + (1 << j)]; + GSVertexSW dl = ddv[2 - j]; + + GSVector4 dy = tbmax.zzzz() - l.p.yyyy(); + + l.p = l.p.upl(v[3 - (1 << j)].p).xyzw(l.p); // r.x => l.y + dl.p = dl.p.upl(ddv[1 + j].p).xyzw(dl.p); // dr.x => dl.y + + l += 
dl * dy; + + DrawTriangleSection(tb.y, tb.w, l, dl, dscan); + } + + break; + + case 1: // a == b < c + + if(tb.x < tb.w) + { + GSVertexSW l = v[j]; + GSVertexSW dl = ddv[1 + j]; + + GSVector4 dy = tbmax.xxxx() - l.p.yyyy(); + + l.p = l.p.upl(v[1 - j].p).xyzw(l.p); // r.x => l.y + dl.p = dl.p.upl(ddv[2 - j].p).xyzw(dl.p); // dr.x => dl.y + + l += dl * dy; + + DrawTriangleSection(tb.x, tb.w, l, dl, dscan); + } + + break; + + case 4: // a < b == c + + if(tb.x < tb.w) + { + GSVertexSW l = v[0]; + GSVertexSW dl = ddv[j]; + + GSVector4 dy = tbmax.xxxx() - l.p.yyyy(); + + l.p = l.p.xxzw(); // r.x => l.y + dl.p = dl.p.upl(ddv[1 - j].p).xyzw(dl.p); // dr.x => dl.y + + l += dl * dy; + + DrawTriangleSection(tb.x, tb.w, l, dl, dscan); + } + + break; + + default: + __assume(0); + } + + m_ds->SetupPrim(v, dscan); + + Flush(); } -void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const GSVertexSW& dl, GSVector4& r, const GSVector4& dr, const GSVertexSW& dscan, const GSVector4& fscissor) +void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const GSVertexSW& dl, const GSVertexSW& dscan) { ASSERT(top < bottom); - while(1) - { - do - { - if(IsOneOfMyScanlines(top)) - { - GSVector4 lr = l.p.xyxy(r).ceil(); - - GSVector4 lrmax = lr.max(fscissor.xxxx()); - GSVector4 lrmin = lr.min(fscissor.zzzz()); - - GSVector4i lri = GSVector4i(lrmax.xxzz(lrmin)); - - int left = lri.extract32<0>(); - int right = lri.extract32<2>(); - - int pixels = right - left; - - if(pixels > 0) - { - m_stats.pixels += pixels; - - GSVertexSW scan = l + dscan * (lrmax - l.p).xxxx(); - - m_ds->DrawScanline(right, left, top, scan); - } - } - } - while(0); - - if(++top >= bottom) break; - - l += dl; - r += dr; - } -} - -void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const GSVertexSW& dl, const GSVertexSW& dscan, const GSVector4& fscissor) -{ - ASSERT(top < bottom); + GSScanline* RESTRICT e = &m_edge.buff[m_edge.count]; while(1) { @@ -526,8 +421,8 
@@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const { GSVector4 lr = l.p.ceil(); - GSVector4 lrmax = lr.max(fscissor.xxxx()); - GSVector4 lrmin = lr.min(fscissor.zzzz()); + GSVector4 lrmax = lr.max(m_fscissor.xxxx()); + GSVector4 lrmin = lr.min(m_fscissor.zzzz()); GSVector4i lri = GSVector4i(lrmax.xxyy(lrmin)); @@ -540,9 +435,13 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const { m_stats.pixels += pixels; - GSVertexSW scan = l + dscan * (lrmax - l.p).xxxx(); + e->scan = l + dscan * (lrmax - l.p).xxxx(); - m_ds->DrawScanline(right, left, top, scan); + e->p.left = left; + e->p.top = top; + e->p.right = right; + + e++; } } } @@ -552,9 +451,11 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const l += dl; } + + m_edge.count += e - &m_edge.buff[m_edge.count]; } -void GSRasterizer::DrawSprite(const GSVertexSW* vertices, const GSVector4i& scissor) +void GSRasterizer::DrawSprite(const GSVertexSW* vertices) { GSVertexSW v[2]; @@ -569,7 +470,7 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices, const GSVector4i& scis GSVector4i r(v[0].p.xyxy(v[1].p).ceil()); - r = r.rintersect(scissor); + r = r.rintersect(m_scissor); if(r.rempty()) return; @@ -611,14 +512,14 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices, const GSVector4i& scis { if(IsOneOfMyScanlines(r.top)) { - m_ds->DrawScanline(r.right, r.left, r.top, scan); - m_stats.pixels += r.width(); + + m_ds->DrawScanline(r.right, r.left, r.top, scan); } } } -void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GSVertexSW& dv, const GSVector4i& scissor, int orientation, int side) +void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GSVertexSW& dv, int orientation, int side) { // orientation: // - true: |dv.p.y| > |dv.p.x| @@ -630,14 +531,14 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS // TODO: bit slow and too much duplicated 
code // TODO: inner pre-step is still missing (hardly noticable) - GSVector4 fscissor(scissor); - + GSScanline* RESTRICT dst = &m_edge.buff[m_edge.count]; + GSVector4 lrtb = v0.p.upl(v1.p).ceil(); if(orientation) { - GSVector4 tbmax = lrtb.max(fscissor.yyyy()); - GSVector4 tbmin = lrtb.min(fscissor.wwww()); + GSVector4 tbmax = lrtb.max(m_fscissor.yyyy()); + GSVector4 tbmin = lrtb.min(m_fscissor.wwww()); GSVector4i tbi = GSVector4i(tbmax.zwzw(tbmin)); @@ -684,15 +585,18 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS int xi = x >> 16; int xf = x & 0xffff; - if(scissor.left <= xi && xi < scissor.right && IsOneOfMyScanlines(xi)) + if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(xi)) { m_stats.pixels++; - edge.t.u32[3] = (0x10000 - xf) & 0xffff; + dst->scan = edge; + dst->scan.t.u32[3] = (0x10000 - xf) & 0xffff; - m_ds->DrawEdge(xi + 1, xi, top, edge); + dst->p.left = xi; + dst->p.top = top; + dst->p.right = xi + 1; - edge.t.u32[3] = 0; + dst++; } } while(0); @@ -712,15 +616,18 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS int xi = (x >> 16) + 1; int xf = x & 0xffff; - if(scissor.left <= xi && xi < scissor.right && IsOneOfMyScanlines(xi)) + if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(xi)) { m_stats.pixels++; - edge.t.u32[3] = xf; + dst->scan = edge; + dst->scan.t.u32[3] = xf; - m_ds->DrawEdge(xi + 1, xi, top, edge); + dst->p.left = xi; + dst->p.top = top; + dst->p.right = xi + 1; - edge.t.u32[3] = 0; + dst++; } } while(0); @@ -734,8 +641,8 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS } else { - GSVector4 lrmax = lrtb.max(fscissor.xxxx()); - GSVector4 lrmin = lrtb.min(fscissor.zzzz()); + GSVector4 lrmax = lrtb.max(m_fscissor.xxxx()); + GSVector4 lrmin = lrtb.min(m_fscissor.zzzz()); GSVector4i lri = GSVector4i(lrmax.xyxy(lrmin)); @@ -782,15 +689,18 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const 
GSVertexSW& v1, const GS int yi = y >> 16; int yf = y & 0xffff; - if(scissor.top <= yi && yi < scissor.bottom && IsOneOfMyScanlines(yi)) + if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi)) { m_stats.pixels++; - edge.t.u32[3] = (0x10000 - yf) & 0xffff; + dst->scan = edge; + dst->scan.t.u32[3] = (0x10000 - yf) & 0xffff; - m_ds->DrawEdge(left + 1, left, yi, edge); + dst->p.left = left; + dst->p.top = yi; + dst->p.right = left + 1; - edge.t.u32[3] = 0; + dst++; } } while(0); @@ -810,15 +720,18 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS int yi = (y >> 16) + 1; int yf = y & 0xffff; - if(scissor.top <= yi && yi < scissor.bottom && IsOneOfMyScanlines(yi)) + if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi)) { m_stats.pixels++; - edge.t.u32[3] = yf; + dst->scan = edge; + dst->scan.t.u32[3] = yf; - m_ds->DrawEdge(left + 1, left, yi, edge); + dst->p.left = left; + dst->p.top = yi; + dst->p.right = left + 1; - edge.t.u32[3] = 0; + dst++; } } while(0); @@ -830,6 +743,34 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS } } } + + m_edge.count += dst - &m_edge.buff[m_edge.count]; +} + +void GSRasterizer::Flush() +{ + // TODO: on win64 this could be the place where xmm6-15 are preserved (not by each DrawScanline) + + const GSScanline* s = m_edge.buff; + + for(int count = m_edge.count; count > 0; count--, s++) + { + m_ds->DrawScanline(s->p.right, s->p.left, s->p.top, s->scan); + } + + m_edge.count = 0; +} + +void GSRasterizer::FlushEdge() +{ + const GSScanline* s = m_edge.buff; + + for(int count = m_edge.count; count > 0; count--, s++) + { + m_ds->DrawEdge(s->p.right, s->p.left, s->p.top, s->scan); + } + + m_edge.count = 0; } // diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h index 81b30ea4f9..9ca9c6f267 100644 --- a/plugins/GSdx/GSRasterizer.h +++ b/plugins/GSdx/GSRasterizer.h @@ -59,7 +59,7 @@ public: virtual void EndDraw(const 
GSRasterizerStats& stats, uint64 frame) = 0; virtual void PrintStats() = 0; - __forceinline void SetupPrim(const GSVertexSW* v, const GSVertexSW& dscan) {m_sp(v, dscan);} + __forceinline void SetupPrim(const GSVertexSW* vertices, const GSVertexSW& dscan) {m_sp(vertices, dscan);} __forceinline void DrawScanline(int right, int left, int top, const GSVertexSW& scan) {m_ds(right, left, top, scan);} __forceinline void DrawEdge(int right, int left, int top, const GSVertexSW& scan) {m_de(right, left, top, scan);} __forceinline void DrawRect(const GSVector4i& r, const GSVertexSW& v) {(this->*m_dr)(r, v);} @@ -79,30 +79,33 @@ public: virtual void SetThreadId(int id, int threads) = 0; }; -class GSRasterizer : public IRasterizer +__aligned(class, 32) GSRasterizer : public GSAlignedClass<32>, public IRasterizer { + struct GSScanline {GSVertexSW scan; GSVector4i p;}; + protected: IDrawScanline* m_ds; int m_id; int m_threads; GSRasterizerStats m_stats; + GSVector4i m_scissor; + GSVector4 m_fscissor; + struct {GSScanline* buff; int count;} m_edge; - void DrawPoint(const GSVertexSW* v, const GSVector4i& scissor); - void DrawLine(const GSVertexSW* v, const GSVector4i& scissor); - void DrawTriangle(const GSVertexSW* v, const GSVector4i& scissor); - void DrawEdge(const GSVertexSW* v, const GSVector4i& scissor); - void DrawSprite(const GSVertexSW* v, const GSVector4i& scissor); + void DrawPoint(const GSVertexSW* v); + void DrawLine(const GSVertexSW* v); + void DrawTriangle(const GSVertexSW* v); + void DrawSprite(const GSVertexSW* v); + void DrawEdge(const GSVertexSW* v); - void DrawTriangleTop(GSVertexSW* v, const GSVector4i& scissor); - void DrawTriangleBottom(GSVertexSW* v, const GSVector4i& scissor); - void DrawTriangleTopBottom(GSVertexSW* v, const GSVector4i& scissor); + __forceinline void DrawTriangleSection(int top, int bottom, GSVertexSW& l, const GSVertexSW& dl, const GSVertexSW& dscan); - __forceinline void DrawTriangleSection(int top, int bottom, GSVertexSW& l, const 
GSVertexSW& dl, GSVector4& r, const GSVector4& dr, const GSVertexSW& dscan, const GSVector4& scissor); - __forceinline void DrawTriangleSection(int top, int bottom, GSVertexSW& l, const GSVertexSW& dl, const GSVertexSW& dscan, const GSVector4& scissor); + void DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GSVertexSW& dv, int orientation, int side); - void DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GSVertexSW& dv, const GSVector4i& scissor, int orientation, int side); + __forceinline bool IsOneOfMyScanlines(int scanline) const; - inline bool IsOneOfMyScanlines(int scanline) const; + void Flush(); + void FlushEdge(); public: GSRasterizer(IDrawScanline* ds); diff --git a/plugins/GSdx/GSRenderer.cpp b/plugins/GSdx/GSRenderer.cpp index 8ec44e3157..2aad4d892e 100644 --- a/plugins/GSdx/GSRenderer.cpp +++ b/plugins/GSdx/GSRenderer.cpp @@ -76,8 +76,6 @@ bool GSRenderer::CreateDevice(GSDevice* dev) void GSRenderer::ResetDevice() { - InvalidateTextureCache(); - ResetPrim(); if(m_dev) m_dev->Reset(1, 1); diff --git a/plugins/GSdx/GSRenderer.h b/plugins/GSdx/GSRenderer.h index 3453962520..d7659928c9 100644 --- a/plugins/GSdx/GSRenderer.h +++ b/plugins/GSdx/GSRenderer.h @@ -139,7 +139,7 @@ protected: { if(m_vertices != NULL) _aligned_free(m_vertices); - m_maxcount = max(10000, m_maxcount * 3/2); + m_maxcount = std::max(10000, m_maxcount * 3 / 2); m_vertices = (Vertex*)_aligned_malloc(sizeof(Vertex) * m_maxcount, 32); m_maxcount -= 100; } diff --git a/plugins/GSdx/GSRendererHW.h b/plugins/GSdx/GSRendererHW.h index 55a44dccdf..561ac01734 100644 --- a/plugins/GSdx/GSRendererHW.h +++ b/plugins/GSdx/GSRendererHW.h @@ -484,13 +484,10 @@ protected: } } - void InvalidateTextureCache() - { - m_tc->RemoveAll(); - } - void ResetDevice() { + m_tc->RemoveAll(); + __super::ResetDevice(); } diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp index 5cde471657..9b1c39a6d8 100644 --- a/plugins/GSdx/GSRendererSW.cpp +++ 
b/plugins/GSdx/GSRendererSW.cpp @@ -372,12 +372,12 @@ void GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) if(gd.sel.ltf) { - GSVector4 half(0x8000, 0x8000); - if(gd.sel.fst) { // if q is constant we can do the half pel shift for bilinear sampling on the vertices + GSVector4 half(0x8000, 0x8000); + GSVertexSW* v = m_vertices; for(int i = 0, j = m_count; i < j; i++) diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index f4b5693d3a..ab8ac4d87e 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -196,8 +196,6 @@ void GSState::Reset() m_env.Reset(); m_context = &m_env.CTXT[0]; - - InvalidateTextureCache(); } void GSState::ResetHandlers() @@ -770,8 +768,6 @@ void GSState::GIFRegHandlerFOGCOL(const GIFReg* r) void GSState::GIFRegHandlerTEXFLUSH(const GIFReg* r) { // TRACE(_T("TEXFLUSH\n")); - - // InvalidateTextureCache(); } template void GSState::GIFRegHandlerSCISSOR(const GIFReg* r) @@ -903,6 +899,7 @@ template void GSState::GIFRegHandlerFRAME(const GIFReg* r) template void GSState::GIFRegHandlerZBUF(const GIFReg* r) { GIFRegZBUF ZBUF = r->ZBUF; + if(ZBUF.u32[0] == 0) { // during startup all regs are cleared to 0 (by the bios or something), so we mask z until this register becomes valid @@ -1396,9 +1393,6 @@ template void GSState::Transfer<3>(const uint8* mem, uint32 size); template void GSState::Transfer(const uint8* mem, uint32 size) { - // [TODO] make me into a template parameter... I think. 
--air - static const bool FrameSkipIt = false; - GSPerfMonAutoTimer pmat(m_perfmon); const uint8* start = mem; @@ -1420,7 +1414,7 @@ template void GSState::Transfer(const uint8* mem, uint32 size) // ASSERT(!(path.tag.PRE && path.tag.FLG == GIF_FLG_REGLIST)); // kingdom hearts - if(path.tag.PRE && (path.tag.FLG == GIF_FLG_PACKED) && !FrameSkipIt) + if(path.tag.PRE && path.tag.FLG == GIF_FLG_PACKED) { GIFRegPRIM r; r.u64 = path.tag.PRIM; @@ -1551,7 +1545,7 @@ template void GSState::Transfer(const uint8* mem, uint32 size) if(m_mt) { // Hackfix for BIOS, which sends an incomplete packet when it does an XGKICK without - // having an EOP specified anywhere in VU1 memory. Needed until PCSX2 is fixed t + // having an EOP specified anywhere in VU1 memory. Needed until PCSX2 is fixed to // handle it more properly (ie, without looping infinitely). path.nloop = 0; @@ -1802,7 +1796,7 @@ bool GSState::GSTransferBuffer::Update(int tw, int th, int bpp, int& len) if(total == 0) { start = end = 0; - total = min((tw * bpp >> 3) * th, 1024 * 1024 * 4); + total = std::min((tw * bpp >> 3) * th, 1024 * 1024 * 4); overflow = false; } diff --git a/plugins/GSdx/GSState.h b/plugins/GSdx/GSState.h index aa02c9660e..a7423d35f4 100644 --- a/plugins/GSdx/GSState.h +++ b/plugins/GSdx/GSState.h @@ -28,7 +28,6 @@ #include "GSVertex.h" #include "GSVertexList.h" #include "GSUtil.h" -#include "GSDirtyRect.h" #include "GSPerfMon.h" #include "GSVector.h" #include "GSDevice.h" @@ -208,7 +207,6 @@ public: virtual void ResetPrim() = 0; virtual void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r) {} virtual void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r) {} - virtual void InvalidateTextureCache() {} void Move(); void Write(const uint8* mem, int len); diff --git a/plugins/GSdx/GSTextureCache.h b/plugins/GSdx/GSTextureCache.h index 18052fddcf..6abb856e08 100644 --- a/plugins/GSdx/GSTextureCache.h +++ b/plugins/GSdx/GSTextureCache.h @@ -22,6 +22,7 @@ 
#pragma once #include "GSRenderer.h" +#include "GSDirtyRect.h" class GSTextureCache { diff --git a/plugins/GSdx/GSVertexSW.h b/plugins/GSdx/GSVertexSW.h index 8eae1ee8d0..a900b026e3 100644 --- a/plugins/GSdx/GSVertexSW.h +++ b/plugins/GSdx/GSVertexSW.h @@ -23,15 +23,15 @@ #include "GSVector.h" -__aligned(struct, 32) GSVertexSW +__aligned(struct, 16) GSVertexSW { GSVector4 c, p, t; GSVertexSW() {} GSVertexSW(const GSVertexSW& v) {*this = v;} - void operator = (const GSVertexSW& v) {c = v.c; p = v.p; t = v.t;} - void operator += (const GSVertexSW& v) {c += v.c; p += v.p; t += v.t;} + __forceinline void operator = (const GSVertexSW& v) {c = v.c; p = v.p; t = v.t;} + __forceinline void operator += (const GSVertexSW& v) {c += v.c; p += v.p; t += v.t;} friend GSVertexSW operator + (const GSVertexSW& v1, const GSVertexSW& v2); friend GSVertexSW operator - (const GSVertexSW& v1, const GSVertexSW& v2); diff --git a/plugins/GSdx/GSdx.vcxproj b/plugins/GSdx/GSdx.vcxproj index 5275ce04e5..645dca5b22 100644 --- a/plugins/GSdx/GSdx.vcxproj +++ b/plugins/GSdx/GSdx.vcxproj @@ -528,7 +528,7 @@ - All + AssemblyAndSourceCode AssemblyAndSourceCode diff --git a/plugins/GSdx/vtune/iacaMarks.h b/plugins/GSdx/vtune/iacaMarks.h new file mode 100644 index 0000000000..a42f3a6bae --- /dev/null +++ b/plugins/GSdx/vtune/iacaMarks.h @@ -0,0 +1,75 @@ +/* +* INTEL CONFIDENTIAL +* Copyright (2008-2009) Intel Corporation All Rights Reserved. +* The source code contained or described herein and all documents +* related to the source code ("Material") are owned by Intel Corporation +* or its suppliers or licensors. Title to the Material remains with +* Intel Corporation or its suppliers and licensors. The Material +* contains trade secrets and proprietary and confidential information +* of Intel or its suppliers and licensors. The Material is protected +* by worldwide copyright and trade secret laws and treaty provisions. 
+* No part of the Material may be used, copied, reproduced, modified, +* published, uploaded, posted, transmitted, distributed, or disclosed +* in any way without Intel's prior express written permission. +* +* No license under any patent, copyright, trade secret or other +* intellectual property right is granted to or conferred upon you by +* disclosure or delivery of the Materials, either expressly, by implication, +* inducement, estoppel or otherwise. Any license under such intellectual +* property rights must be express and approved by Intel in writing. +*/ + +/********************************************************/ +/* Binaries that contain IACA_MARKS will not run. */ +/* Define IACA_MARKS_OFF when you compile your sources, */ +/* to disable IACA_START, IACA_END, IACA_MSC64_START */ +/* and IACA_MSC64_END */ +/********************************************************/ +#ifdef IACA_MARKS_OFF + +#define IACA_START +#define IACA_END +#define IACA_MSC64_START +#define IACA_MSC64_END + +#else +#if defined (__GNUC__) +#define IACA_SSC_MARK( MARK_ID ) \ +__asm__ __volatile__ ( \ + "\n\t movl $"#MARK_ID", %%ebx" \ + "\n\t .byte 0x64, 0x67, 0x90" \ + : : : "memory" ); + +#define IACA_UD_BYTES __asm__ __volatile__ ("\n\t .byte 0x0F, 0x0B"); + +#else +#define IACA_UD_BYTES {__asm _emit 0x0F \ + __asm _emit 0x0B} + +#define IACA_SSC_MARK(x) {__asm mov ebx, x\ + __asm _emit 0x64 \ + __asm _emit 0x67 \ + __asm _emit 0x90 } + +#define IACA_VC64_START __writegsbyte(111, 111); +#define IACA_VC64_END __writegsbyte(222, 222); + +#endif + +#define IACA_START {IACA_UD_BYTES \ + IACA_SSC_MARK(111)} +#define IACA_END {IACA_SSC_MARK(222) \ + IACA_UD_BYTES} + +#endif + +/**************** asm ***************** +;START_MARKER +mov ebx, 111 +db 0x64, 0x67, 0x90 + +;END_MARKER +mov ebx, 222 +db 0x64, 0x67, 0x90 + +**************************************/