/* * Copyright (C) 2007-2009 Gabest * http://www.gabest.org * * This Program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * This Program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with GNU Make; see the file COPYING. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA USA. * http://www.gnu.org/copyleft/gpl.html * */ #pragma once #include "GSLocalMemory.h" #include "GSVector.h" union GSScanlineSelector { struct { uint32 fpsm:2; // 0 uint32 zpsm:2; // 2 uint32 ztst:2; // 4 (0: off, 1: write, 2: test (ge), 3: test (g)) uint32 atst:3; // 6 uint32 afail:2; // 9 uint32 iip:1; // 11 uint32 tfx:3; // 12 uint32 tcc:1; // 15 uint32 fst:1; // 16 uint32 ltf:1; // 17 uint32 tlu:1; // 18 uint32 fge:1; // 19 uint32 date:1; // 20 uint32 abe:1; // 21 uint32 aba:2; // 22 uint32 abb:2; // 24 uint32 abc:2; // 26 uint32 abd:2; // 28 uint32 pabe:1; // 30 uint32 aa1:1; // 31 uint32 fwrite:1; // 32 uint32 ftest:1; // 33 uint32 rfb:1; // 34 uint32 zwrite:1; // 35 uint32 ztest:1; // 36 uint32 zoverflow:1; // 37 (z max >= 0x80000000) uint32 wms:2; // 38 uint32 wmt:2; // 40 uint32 datm:1; // 42 uint32 colclamp:1; // 43 uint32 fba:1; // 44 uint32 dthe:1; // 45 uint32 prim:2; // 46 uint32 edge:1; // 48 uint32 tw:3; // 49 (encodes values between 3 -> 10, texture cache makes sure it is at least 3) uint32 lcm:1; // 52 uint32 mmin:2; // 53 uint32 notest:1; // 54 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels) // TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction uint32 breakpoint:1; // Insert a trap to stop the program, helpful to stop debugger on a program }; struct { uint32 _pad1:22; uint32 ababcd:8; uint32 _pad2:2; uint32 fb:2; uint32 _pad3:1; uint32 zb:2; }; struct { uint32 lo; uint32 hi; }; uint64 key; GSScanlineSelector() = default; GSScanlineSelector(uint64 k) : key(k) {} operator uint32() const {return lo;} operator uint64() const {return key;} bool IsSolidRect() const { return prim == GS_SPRITE_CLASS && iip == 0 && tfx == TFX_NONE && abe == 0 && ztst <= 1 && atst <= 1 && date == 0 && fge == 0; } void Print() const { fprintf(stderr, "fpsm:%d zpsm:%d ztst:%d ztest:%d atst:%d afail:%d iip:%d rfb:%d fb:%d zb:%d zw:%d " "tfx:%d tcc:%d fst:%d ltf:%d tlu:%d wms:%d wmt:%d mmin:%d lcm:%d tw:%d " "fba:%d cclamp:%d date:%d datm:%d " "prim:%d abe:%d %d%d%d%d fge:%d dthe:%d notest:%d\n", fpsm, zpsm, ztst, ztest, atst, afail, iip, rfb, fb, zb, zwrite, tfx, tcc, fst, ltf, tlu, wms, wmt, mmin, lcm, tw, fba, colclamp, date, datm, prim, abe, aba, abb, abc, abd , fge, dthe, notest); } }; struct alignas(32) GSScanlineGlobalData // per batch variables, this is like a pixel shader constant buffer { GSScanlineSelector sel; // - the data of vm, tex may change, multi-threaded drawing must be finished before that happens, clut and dimx are copies // - tex is a cached texture, it may be recycled to free up memory, its absolute address cannot be compiled into code // - row and column pointers are allocated once and never change or freed, thier address can be used directly void* vm; const void* tex[7]; uint32* clut; GSVector4i* dimx; const int* fbr; const int* zbr; const int* fbc; const int* zbc; const GSVector2i* fzbr; const GSVector2i* fzbc; GSVector4i aref; GSVector4i afix; struct {GSVector4i min, max, minmax, mask, invmask;} t; // [u] x 4 [v] x 4 #if _M_SSE >= 0x501 uint32 fm, zm; uint32 frb, fga; GSVector8 mxl; GSVector8 k; // TEX1.K * 0x10000 GSVector8 l; // TEX1.L * -0x10000 struct {GSVector8i i, f;} lod; // lcm == 1 #else GSVector4i fm, zm; GSVector4i frb, fga; GSVector4 mxl; GSVector4 k; // TEX1.K * 0x10000 GSVector4 l; // TEX1.L * -0x10000 struct {GSVector4i i, f;} lod; // lcm == 1 #endif }; struct alignas(32) GSScanlineLocalData // per prim variables, each thread has its own { #if _M_SSE >= 0x501 struct skip {GSVector8 z, s, t, q; GSVector8i rb, ga, f, _pad;} d[8]; struct step {GSVector4 stq; struct {uint32 rb, ga;} c; struct {uint32 z, f;} p;} d8; struct {GSVector8i rb, ga;} c; struct {uint32 z, f;} p; // these should be stored on stack as normal local variables (no free regs to use, esp cannot be saved to anywhere, and we need an aligned stack) struct { GSVector8 z, zo; GSVector8i f; GSVector8 s, t, q; GSVector8i rb, ga; GSVector8i zs, zd; GSVector8i uf, vf; GSVector8i cov; // mipmapping struct {GSVector8i i, f;} lod; GSVector8i uv[2]; GSVector8i uv_minmax[2]; GSVector8i trb, tga; GSVector8i test; } temp; #else struct skip {GSVector4 z, s, t, q; GSVector4i rb, ga, f, _pad;} d[4]; struct step {GSVector4 z, stq; GSVector4i c, f;} d4; struct {GSVector4i rb, ga;} c; struct {GSVector4i z, f;} p; // these should be stored on stack as normal local variables (no free regs to use, esp cannot be saved to anywhere, and we need an aligned stack) struct { GSVector4 z, zo; GSVector4i f; GSVector4 s, t, q; GSVector4i rb, ga; GSVector4i zs, zd; GSVector4i uf, vf; GSVector4i cov; // mipmapping struct {GSVector4i i, f;} lod; GSVector4i uv[2]; GSVector4i uv_minmax[2]; GSVector4i trb, tga; GSVector4i test; } temp; #endif // const GSScanlineGlobalData* gd; }; // Constant shared by all threads (to reduce cache miss) // // Note: Avoid GSVector* to support all ISA at once // // WARNING: Don't use static storage. Static variables are relocated to random // location (above 2GB). Small allocation on the heap could be below 2GB, this way we can use // absolute addressing. Otherwise we need to store a base address in a register. struct GSScanlineConstantData : public GSAlignedClass<32> { alignas(32) uint8 m_test_256b[16][8]; alignas(32) float m_shift_256b[9][8]; alignas(32) float m_log2_coef_256b[4][8]; alignas(16) uint32 m_test_128b[8][4]; alignas(16) float m_shift_128b[5][4]; alignas(16) float m_log2_coef_128b[4][4]; GSScanlineConstantData() {} // GCC will be clever enough to stick some AVX instruction here // So it must be defered to post global constructor void Init() { uint8 I_hate_vs2013_m_test_256b[16][8] = { {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, {0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, {0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, {0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00}, {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}, {0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00}, {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00}, {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00}, {0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, {0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff}, {0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff}, {0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff}, {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff}, {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff}, {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} }; uint32 I_hate_vs2013_m_test_128b[8][4] = { { 0x00000000, 0x00000000, 0x00000000, 0x00000000}, { 0xffffffff, 0x00000000, 0x00000000, 0x00000000}, { 0xffffffff, 0xffffffff, 0x00000000, 0x00000000}, { 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000}, { 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff}, { 0x00000000, 0x00000000, 0xffffffff, 0xffffffff}, { 0x00000000, 0x00000000, 0x00000000, 0xffffffff}, { 0x00000000, 0x00000000, 0x00000000, 0x00000000} }; float I_hate_vs2013_m_shift_256b[9][8] = { { 8.0f , 8.0f , 8.0f , 8.0f , 8.0f , 8.0f , 8.0f , 8.0f}, { 0.0f , 1.0f , 2.0f , 3.0f , 4.0f , 5.0f , 6.0f , 7.0f}, { -1.0f , 0.0f , 1.0f , 2.0f , 3.0f , 4.0f , 5.0f , 6.0f}, { -2.0f , -1.0f , 0.0f , 1.0f , 2.0f , 3.0f , 4.0f , 5.0f}, { -3.0f , -2.0f , -1.0f , 0.0f , 1.0f , 2.0f , 3.0f , 4.0f}, { -4.0f , -3.0f , -2.0f , -1.0f , 0.0f , 1.0f , 2.0f , 3.0f}, { -5.0f , -4.0f , -3.0f , -2.0f , -1.0f , 0.0f , 1.0f , 2.0f}, { -6.0f , -5.0f , -4.0f , -3.0f , -2.0f , -1.0f , 0.0f , 1.0f}, { -7.0f , -6.0f , -5.0f , -4.0f , -3.0f , -2.0f , -1.0f , 0.0f} }; float I_hate_vs2013_m_shift_128b[5][4] = { { 4.0f , 4.0f , 4.0f , 4.0f}, { 0.0f , 1.0f , 2.0f , 3.0f}, { -1.0f , 0.0f , 1.0f , 2.0f}, { -2.0f , -1.0f , 0.0f , 1.0f}, { -3.0f , -2.0f , -1.0f , 0.0f} }; memcpy(m_test_256b, I_hate_vs2013_m_test_256b, sizeof(I_hate_vs2013_m_test_256b)); memcpy(m_test_128b, I_hate_vs2013_m_test_128b, sizeof(I_hate_vs2013_m_test_128b)); memcpy(m_shift_256b, I_hate_vs2013_m_shift_256b, sizeof(I_hate_vs2013_m_shift_256b)); memcpy(m_shift_128b, I_hate_vs2013_m_shift_128b, sizeof(I_hate_vs2013_m_shift_128b)); float log2_coef[] = { 0.204446009836232697516f, -1.04913055217340124191f, 2.28330284476918490682f, 1.0f }; for (size_t n = 0; n < countof(log2_coef); ++n) { for (size_t i = 0; i < 4; ++i) { m_log2_coef_128b[n][i] = log2_coef[n]; m_log2_coef_256b[n][i] = log2_coef[n]; m_log2_coef_256b[n][i+4] = log2_coef[n]; } } } }; extern std::unique_ptr g_const;