flycast/core/rend/soft/softrend.cpp

1234 lines
41 KiB
C++

#include <omp.h>
#include "hw/pvr/Renderer_if.h"
#include "hw/pvr/pvr_mem.h"
#include "oslib/oslib.h"
/*
SSE/MMX based softrend
Initial code by skmp and gigaherz
This is a rather weird, very basic PVR software renderer.
It renders into a buffer laid out in 4x4-pixel blocks and handles depth,
color, texturing, alpha test and alpha blending; offset color is not
implemented yet. All of the pipeline is based on quads.
*/
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include <cmath>
#include "rend/gles/gles.h"
// Lookup tables that expand every possible 16-bit texel to a 32-bit color.
// Filled in softrend::Init(); index 0 = 565, 1 = 1555, 2 = 4444.
u32 decoded_colors[3][65536];
// Dimensions of the software framebuffer, in pixels.
#define MAX_RENDER_WIDTH 640
#define MAX_RENDER_HEIGHT 480
#define MAX_RENDER_PIXELS (MAX_RENDER_WIDTH * MAX_RENDER_HEIGHT)
// Pixels per framebuffer row; the rasterizer multiplies this by 4 to get a byte stride.
#define STRIDE_PIXEL_OFFSET MAX_RENDER_WIDTH
// Distance, in 32-bit entries, from a color entry to the matching depth entry
// inside render_buffer.
#define Z_BUFFER_PIXEL_OFFSET MAX_RENDER_PIXELS
// First MAX_RENDER_PIXELS entries hold color, the next MAX_RENDER_PIXELS hold depth.
DECL_ALIGN(32) u32 render_buffer[MAX_RENDER_PIXELS * 2]; //Color + depth
// NOTE(review): not referenced in the visible part of this file; presumably the
// linear image handed to the presentation code - confirm against the rest of the file.
DECL_ALIGN(32) u32 pixels[MAX_RENDER_PIXELS];
#if HOST_OS != OS_WINDOWS
// Stand-in for the Win32 RECT type on non-Windows hosts. In this file it carries
// the clip rectangle (left/top/right/bottom, in pixels) given to the rasterizer.
struct RECT {
int left, top, right, bottom;
};
#include <X11/Xlib.h>
#endif
// Lane-wise view of one 128-bit SSE integer register.
// The member names mirror the ones MSVC exposes on __m128i so the same
// accessors can be used on every compiler.
union m128i {
	__m128i mm;            // the register itself
	uint8_t m128i_u8[16];  // unsigned bytes (must be unsigned: used for 0..255 alpha compares)
	int8_t m128i_i8[16];   // signed bytes
	int16_t m128i_i16[8];  // signed 16-bit lanes
	int32_t m128i_i32[4];  // signed 32-bit lanes
	uint32_t m128i_u32[4]; // unsigned 32-bit lanes
};
// Builds the vector { v, v+s, v+2s, v+3s } (lane 0 first), accumulating the
// step one addition at a time.
static __m128 _mm_load_scaled_float(float v, float s)
{
	const float lane1 = v + s;
	const float lane2 = lane1 + s;
	const float lane3 = lane2 + s;
	// _mm_set_ps takes its arguments from the highest lane down to lane 0.
	return _mm_set_ps(lane3, lane2, lane1, v);
}
// Returns a vector with v replicated into all four float lanes.
static __m128 _mm_broadcast_float(float v)
{
	return _mm_set1_ps(v);
}
// Returns a vector with v replicated into all four 32-bit integer lanes.
static __m128i _mm_broadcast_int(int v)
{
	return _mm_set1_epi32(v);
}
// Packs four scalars into a vector: a goes to lane 0, d to lane 3.
// Built directly in a register, like the other helpers here, instead of
// bouncing through an over-aligned stack temporary.
static __m128 _mm_load_ps_r(float a, float b, float c, float d)
{
	return _mm_setr_ps(a, b, c, d);
}
// Converts x to int with the SSE truncating conversion (rounds toward zero,
// despite the name). Callers that want round-up behaviour add 0.5f themselves.
__forceinline int iround(float x)
{
	const __m128 scalar = _mm_load_ss(&x);
	return _mm_cvttss_si32(scalar);
}
// Lower bound of a bounding box: the smallest of a, b and c, but never less
// than d (d acts as a clamp floor, not as a fourth candidate).
float mmin(float a, float b, float c, float d)
{
	return max(d, min(c, min(a, b)));
}
// Upper bound of a bounding box: the largest of a, b and c, but never more
// than d (d acts as a clamp ceiling, not as a fourth candidate).
float mmax(float a, float b, float c, float d)
{
	return min(d, max(c, max(a, b)));
}
// Accumulates one edge's half-space test into the running `any` / `all` flags.
//   cp : edge function value
//   sv : threshold that must be exceeded for the "any" test
//   lv : threshold that must be exceeded for the "all" test
// Both flags are AND-ed, so a single failing edge clears them.
// Original author's note: this is believed to give false positives for "any".
__forceinline void EvalHalfSpace(bool& all, bool& any, float cp, float sv, float lv)
{
	const bool passes_any = (cp > sv);
	const bool passes_all = (cp > lv);
	any &= passes_any;
	all &= passes_all;
}
// Returns true when at least one of the three edge values is strictly positive.
__forceinline bool EvalHalfSpaceFAny(float cp12, float cp23, float cp31)
{
	return (cp12 > 0) || (cp23 > 0) || (cp31 > 0);
}
// Returns true when every edge value still stays strictly positive after its
// per-edge margin (lv12 / lv23 / lv31) has been subtracted.
__forceinline bool EvalHalfSpaceFAll(float cp12, float cp23, float cp31, float lv12, float lv23, float lv31)
{
	const bool in12 = (cp12 - lv12) > 0;
	const bool in23 = (cp23 - lv23) > 0;
	const bool in31 = (cp31 - lv31) > 0;
	return in12 && in23 && in31;
}
// Computes the smallest and largest offset an edge function takes over the
// four corners of a q x q block, relative to the block's first corner.
//   MIN, MAX : outputs
//   DX, DY   : edge deltas
//   q        : block size in pixels
__forceinline void PlaneMinMax(float& MIN, float& MAX, float DX, float DY, float q)
{
	const float extent = (q - 1);
	// Offsets at the four block corners.
	const float corner_a = 0;
	const float corner_b = extent*DY;
	const float corner_c = -extent*DX;
	const float corner_d = extent*(DY - DX);
	MIN = min(corner_a, min(corner_b, min(corner_c, corner_d)));
	MAX = max(corner_a, max(corner_b, max(corner_c, corner_d)));
}
// Linear screen-space interpolator for four attributes at once, one per SSE lane.
// For each attribute it stores the plane value = ddx * x + ddy * y + c.
struct PlaneStepper
{
	__m128 ddx, ddy; // per-attribute change for a one-pixel step in x / y
	__m128 c;        // per-attribute value at the origin (x = 0, y = 0)

	// Fits a plane through the three vertices for each of the attributes
	// a, b, c and d (given per vertex as vN_a .. vN_d).
	// minx, miny and q are accepted for signature compatibility but unused.
	// No guard against a zero-area triangle: C is used as a divisor as-is.
	void Setup(const Vertex &v1, const Vertex &v2, const Vertex &v3, int minx, int miny, int q
		, float v1_a, float v2_a, float v3_a
		, float v1_b, float v2_b, float v3_b
		, float v1_c, float v2_c, float v3_c
		, float v1_d, float v2_d, float v3_d)
	{
		const float at_v1[4] = { v1_a, v1_b, v1_c, v1_d };
		const float at_v2[4] = { v2_a, v2_b, v2_c, v2_d };
		const float at_v3[4] = { v3_a, v3_b, v3_c, v3_d };

		// Common denominator of all gradients.
		const float C = ((v2.x - v1.x) * (v3.y - v1.y) - (v3.x - v1.x) * (v2.y - v1.y));

		float grad_x[4], grad_y[4], origin[4];
		for (int lane = 0; lane < 4; lane++)
		{
			const float A = ((at_v3[lane] - at_v1[lane]) * (v2.y - v1.y) - (at_v2[lane] - at_v1[lane]) * (v3.y - v1.y));
			const float B = ((v3.x - v1.x) * (at_v2[lane] - at_v1[lane]) - (v2.x - v1.x) * (at_v3[lane] - at_v1[lane]));
			grad_x[lane] = -A / C;
			grad_y[lane] = -B / C;
			// value = v1 + ddx * (x - v1.x) + ddy * (y - v1.y), rearranged so that
			// only the constant part is stored.
			origin[lane] = (at_v1[lane] - grad_x[lane] * v1.x - grad_y[lane] * v1.y);
		}

		ddx = _mm_loadu_ps(grad_x);
		ddy = _mm_loadu_ps(grad_y);
		c = _mm_loadu_ps(origin);
	}

	// Evaluates all four attributes at (x, y).
	__forceinline __m128 Ip(__m128 x, __m128 y) const
	{
		return _mm_add_ps(_mm_add_ps(_mm_mul_ps(x, ddx), _mm_mul_ps(y, ddy)), c);
	}

	// Advances previously evaluated attributes by one pixel in x.
	__forceinline __m128 InStep(__m128 bas) const
	{
		return _mm_add_ps(bas, ddx);
	}
};
struct IPs
{
PlaneStepper ZUV;
PlaneStepper Col;
void Setup(PolyParam* pp, text_info* texture, const Vertex &v1, const Vertex &v2, const Vertex &v3, int minx, int miny, int q)
{
u32 w = 0, h = 0;
if (texture) {
w = texture->width;
h = texture->height;
}
ZUV.Setup(v1, v2, v3, minx, miny, q,
v1.z, v2.z, v3.z,
v1.u * w * v1.z, v2.u * w* v2.z, v3.u * w* v3.z,
v1.v * h* v1.z, v2.v * h* v2.z, v3.v * h* v3.z,
0, -1, 1);
Col.Setup(v1, v2, v3, minx, miny, q,
v1.col[2], v2.col[2], v3.col[2],
v1.col[1], v2.col[1], v3.col[1],
v1.col[0], v2.col[0], v3.col[0],
v1.col[3], v2.col[3], v3.col[3]
);
}
};
// Template plumbing shared by PixelFlush and Rendtriangle. The render state is
// passed as template arguments so that each combination gets its own
// specialised function:
//   useoldmsk     (pixel only) AND the depth mask with a caller-supplied coverage mask
//   alpha_mode    0, 1 or 2; RenderParamList is instantiated with 0 for the op list,
//                 1 for the pt list (alpha test) and 2 for the tr list (alpha blend)
//   pp_UseAlpha, pp_Texture, pp_IgnoreTexA, pp_ShadInstr, pp_Offset
//                 taken from the polygon's tsp / pcw fields
// TPL_DECL_* declare the parameter lists, TPL_PRMS_* forward them.
#define TPL_DECL_pixel template<bool useoldmsk, int alpha_mode, bool pp_UseAlpha, bool pp_Texture, bool pp_IgnoreTexA, int pp_ShadInstr, bool pp_Offset >
#define TPL_DECL_triangle template<int alpha_mode, bool pp_UseAlpha, bool pp_Texture, bool pp_IgnoreTexA, int pp_ShadInstr, bool pp_Offset >
#define TPL_PRMS_pixel(useoldmsk) <useoldmsk, alpha_mode, pp_UseAlpha, pp_Texture, pp_IgnoreTexA, pp_ShadInstr, pp_Offset >
#define TPL_PRMS_triangle <alpha_mode, pp_UseAlpha, pp_Texture, pp_IgnoreTexA, pp_ShadInstr, pp_Offset >
//<alpha_blend, pp_UseAlpha, pp_Texture, pp_IgnoreTexA, pp_ShadInstr, pp_Offset >
// Signature shared by every Rendtriangle specialisation.
typedef void(*RendtriangleFn)(PolyParam* pp, int vertex_offset, const Vertex &v1, const Vertex &v2, const Vertex &v3, u32* colorBuffer, RECT* area);
// Dispatch table indexed as
// [alpha_mode][UseAlpha][Texture][IgnoreTexA][ShadInstr][Offset]; filled in softrend::Init().
RendtriangleFn RendtriangleFns[3][2][2][2][4][2];
// 0xFF000000 in every 32-bit lane (set in softrend::Init()); OR-ing it forces alpha to 255.
__m128i const_setAlpha;
// pshufb control (set in softrend::Init()) that replicates each pixel's alpha
// across that pixel's four 16-bit channels.
__m128i shuffle_alpha;
// Shades one row of four horizontally adjacent pixels and merges the result
// into the color and depth planes.
//
//   pp       polygon parameters (currently unused here)
//   texture  texture to sample when pp_Texture is set
//   x        x coordinate of the first pixel; only lane 0 is read
//   y        y coordinate of the row, same value in every lane
//   cb       address of the four color entries; the matching depth entries
//            live Z_BUFFER_PIXEL_OFFSET 32-bit entries further on
//   oldmask  per-pixel coverage mask, only honoured when useoldmsk is true
//   ip       interpolators prepared by IPs::Setup
//
// A pixel is written when its interpolated depth is >= the stored depth (and,
// with useoldmsk, when it is covered by oldmask).
TPL_DECL_pixel
static void PixelFlush(PolyParam* pp, text_info* texture, __m128 x, __m128 y, u8* cb, __m128 oldmask, IPs& ip)
{
	x = _mm_shuffle_ps(x, x, 0);

	// Evaluate {z, u*z, v*z, filler} for four consecutive pixels, then
	// transpose so that each register holds one attribute for all four.
	__m128 invW = ip.ZUV.Ip(x, y);
	__m128 u = ip.ZUV.InStep(invW);
	__m128 v = ip.ZUV.InStep(u);
	__m128 ws = ip.ZUV.InStep(v);
	_MM_TRANSPOSE4_PS(invW, u, v, ws);

	// Undo the multiplication by z that was applied before interpolation.
	u = _mm_div_ps(u, invW);
	v = _mm_div_ps(v, invW);

	// Depth test.
	__m128* zb = (__m128*)&cb[Z_BUFFER_PIXEL_OFFSET * 4];
	__m128 ZMask = _mm_cmpge_ps(invW, *zb);
	if (useoldmsk)
		ZMask = _mm_and_ps(oldmask, ZMask);

	u32 msk = _mm_movemask_ps(ZMask);
	if (msk == 0)
		return; // nothing visible in this row

	__m128i rv;
	{
		// Interpolated vertex color, packed down to four 8-bit-per-channel pixels.
		__m128 a = ip.Col.Ip(x, y);
		__m128 b = ip.Col.InStep(a);
		__m128 c = ip.Col.InStep(b);
		__m128 d = ip.Col.InStep(c);

		__m128i ab = _mm_packs_epi32(_mm_cvttps_epi32(a), _mm_cvttps_epi32(b));
		__m128i cd = _mm_packs_epi32(_mm_cvttps_epi32(c), _mm_cvttps_epi32(d));
		rv = _mm_packus_epi16(ab, cd);

		if (!pp_UseAlpha) {
			rv = _mm_or_si128(rv, const_setAlpha);
		}

		if (pp_Texture) {
			// Integer texel coordinates and 8-bit fractional weights.
			__m128i ui = _mm_cvttps_epi32(u);
			__m128i vi = _mm_cvttps_epi32(v);
			__m128 uf = _mm_sub_ps(u, _mm_cvtepi32_ps(ui));
			__m128 vf = _mm_sub_ps(v, _mm_cvtepi32_ps(vi));
			__m128i ufi = _mm_cvttps_epi32(_mm_mul_ps(uf, _mm_set1_ps(256)));
			__m128i vfi = _mm_cvttps_epi32(_mm_mul_ps(vf, _mm_set1_ps(256)));

			// Pack (v << 16) + u so that each pixel's u and v can be read back
			// as two 16-bit lanes.
			m128i textadr;
			textadr.mm = _mm_add_epi32(_mm_slli_epi32(vi, 16), ui);

			m128i textel;
			for (int i = 0; i < 4; i++) {
				u32 tu = textadr.m128i_i16[i * 2 + 0];
				u32 tv = textadr.m128i_i16[i * 2 + 1];

				// Weights of the current pixel broadcast to all lanes; the
				// source vectors are then rotated so lane 0 holds the next pixel.
				__m128i mufi_ = _mm_shuffle_epi32(ufi, _MM_SHUFFLE(0, 0, 0, 0));
				__m128i mufi_n = _mm_sub_epi32(_mm_set1_epi32(255), mufi_);
				__m128i mvfi_ = _mm_shuffle_epi32(vfi, _MM_SHUFFLE(0, 0, 0, 0));
				__m128i mvfi_n = _mm_sub_epi32(_mm_set1_epi32(255), mvfi_);
				ufi = _mm_shuffle_epi32(ufi, _MM_SHUFFLE(0, 3, 2, 1));
				vfi = _mm_shuffle_epi32(vfi, _MM_SHUFFLE(0, 3, 2, 1));

				// Each texture entry is 128 bits wide and carries the four
				// samples needed for bilinear filtering of that texel.
				__m128i px = ((__m128i*)texture->pdata)[(tu % texture->width + tv % texture->height * texture->width)];
				__m128i tex_00 = _mm_cvtepu8_epi32(px);
				__m128i tex_01 = _mm_cvtepu8_epi32(_mm_shuffle_epi32(px, _MM_SHUFFLE(0, 0, 0, 1)));
				__m128i tex_10 = _mm_cvtepu8_epi32(_mm_shuffle_epi32(px, _MM_SHUFFLE(0, 0, 0, 2)));
				__m128i tex_11 = _mm_cvtepu8_epi32(_mm_shuffle_epi32(px, _MM_SHUFFLE(0, 0, 0, 3)));

				// Horizontal blend of each sample pair, then vertical blend.
				// The second pair must mix tex_10 with tex_11 (the previous
				// code blended tex_10 with itself, ignoring tex_11).
				tex_00 = _mm_add_epi32(_mm_mullo_epi32(tex_00, mufi_), _mm_mullo_epi32(tex_01, mufi_n));
				tex_10 = _mm_add_epi32(_mm_mullo_epi32(tex_10, mufi_), _mm_mullo_epi32(tex_11, mufi_n));
				tex_00 = _mm_add_epi32(_mm_mullo_epi32(tex_00, mvfi_), _mm_mullo_epi32(tex_10, mvfi_n));

				// Two 8-bit weights were applied, so drop 16 fractional bits
				// and pack back to one 32-bit pixel.
				tex_00 = _mm_srli_epi32(tex_00, 16);
				tex_00 = _mm_packus_epi32(tex_00, tex_00);
				tex_00 = _mm_packus_epi16(tex_00, tex_00);

				textel.m128i_i32[i] = _mm_cvtsi128_si32(tex_00);
			}

			if (pp_IgnoreTexA) {
				textel.mm = _mm_or_si128(textel.mm, const_setAlpha);
			}

			if (pp_ShadInstr == 0) {
				//color.rgb = texcol.rgb;
				//color.a = texcol.a;
				rv = textel.mm;
			}
			else if (pp_ShadInstr == 1) {
				//color.rgb *= texcol.rgb;
				//color.a = texcol.a;

				//color.a = 1
				rv = _mm_or_si128(rv, const_setAlpha);

				//color *= texcol
				__m128i lo_rv = _mm_cvtepu8_epi16(rv);
				__m128i hi_rv = _mm_cvtepu8_epi16(_mm_shuffle_epi32(rv, _MM_SHUFFLE(1, 0, 3, 2)));
				__m128i lo_fb = _mm_cvtepu8_epi16(textel.mm);
				__m128i hi_fb = _mm_cvtepu8_epi16(_mm_shuffle_epi32(textel.mm, _MM_SHUFFLE(1, 0, 3, 2)));
				lo_rv = _mm_mullo_epi16(lo_rv, lo_fb);
				hi_rv = _mm_mullo_epi16(hi_rv, hi_fb);
				rv = _mm_packus_epi16(_mm_srli_epi16(lo_rv, 8), _mm_srli_epi16(hi_rv, 8));
			}
			else if (pp_ShadInstr == 2) {
				//color.rgb=mix(color.rgb,texcol.rgb,texcol.a);
				// a bit wrong atm, as it also mixes alphas
				// NOTE(review): this computes color * texcol.a + texcol * (255 - texcol.a),
				// which looks like the reverse of the mix() described above - confirm
				// against the hardware documentation before changing.
				__m128i lo_rv = _mm_cvtepu8_epi16(rv);
				__m128i hi_rv = _mm_cvtepu8_epi16(_mm_shuffle_epi32(rv, _MM_SHUFFLE(1, 0, 3, 2)));
				__m128i lo_fb = _mm_cvtepu8_epi16(textel.mm);
				__m128i hi_fb = _mm_cvtepu8_epi16(_mm_shuffle_epi32(textel.mm, _MM_SHUFFLE(1, 0, 3, 2)));
				__m128i lo_rv_alpha = _mm_shuffle_epi8(lo_fb, shuffle_alpha);
				__m128i hi_rv_alpha = _mm_shuffle_epi8(hi_fb, shuffle_alpha);
				__m128i lo_fb_alpha = _mm_sub_epi16(_mm_set1_epi16(255), lo_rv_alpha);
				__m128i hi_fb_alpha = _mm_sub_epi16(_mm_set1_epi16(255), hi_rv_alpha);
				lo_rv = _mm_mullo_epi16(lo_rv, lo_rv_alpha);
				hi_rv = _mm_mullo_epi16(hi_rv, hi_rv_alpha);
				lo_fb = _mm_mullo_epi16(lo_fb, lo_fb_alpha);
				hi_fb = _mm_mullo_epi16(hi_fb, hi_fb_alpha);
				rv = _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(lo_rv, lo_fb), 8), _mm_srli_epi16(_mm_adds_epu16(hi_rv, hi_fb), 8));
			}
			else if (pp_ShadInstr == 3) {
				//color*=texcol
				__m128i lo_rv = _mm_cvtepu8_epi16(rv);
				__m128i hi_rv = _mm_cvtepu8_epi16(_mm_shuffle_epi32(rv, _MM_SHUFFLE(1, 0, 3, 2)));
				__m128i lo_fb = _mm_cvtepu8_epi16(textel.mm);
				__m128i hi_fb = _mm_cvtepu8_epi16(_mm_shuffle_epi32(textel.mm, _MM_SHUFFLE(1, 0, 3, 2)));
				lo_rv = _mm_mullo_epi16(lo_rv, lo_fb);
				hi_rv = _mm_mullo_epi16(hi_rv, hi_fb);
				rv = _mm_packus_epi16(_mm_srli_epi16(lo_rv, 8), _mm_srli_epi16(hi_rv, 8));
			}

			if (pp_Offset) {
				// TODO: offset color is not implemented
			}
		}
	}

	if (alpha_mode == 1) {
		// Alpha test: pixels whose alpha is below PT_ALPHA_REF keep the
		// color that is already in the framebuffer.
		m128i mm_rv, mm_fb;
		mm_rv.mm = rv;
		mm_fb.mm = *(__m128i*)cb;
		for (int i = 0; i < 4; i++) {
			// The cast keeps the compare unsigned regardless of the lane's
			// declared byte type.
			if ((u8)mm_rv.m128i_u8[i * 4 + 3] < PT_ALPHA_REF) {
				mm_rv.m128i_u32[i] = mm_fb.m128i_u32[i];
			}
		}
		rv = mm_rv.mm;
	}
	else if (alpha_mode == 2) {
		// Alpha blend: rv * rv.a + framebuffer * (255 - rv.a), per channel.
		__m128i fb = *(__m128i*)cb;
		__m128i lo_rv = _mm_cvtepu8_epi16(rv);
		__m128i hi_rv = _mm_cvtepu8_epi16(_mm_shuffle_epi32(rv, _MM_SHUFFLE(1, 0, 3, 2)));
		__m128i lo_fb = _mm_cvtepu8_epi16(fb);
		__m128i hi_fb = _mm_cvtepu8_epi16(_mm_shuffle_epi32(fb, _MM_SHUFFLE(1, 0, 3, 2)));
		__m128i lo_rv_alpha = _mm_shuffle_epi8(lo_rv, shuffle_alpha);
		__m128i hi_rv_alpha = _mm_shuffle_epi8(hi_rv, shuffle_alpha);
		__m128i lo_fb_alpha = _mm_sub_epi16(_mm_set1_epi16(255), lo_rv_alpha);
		__m128i hi_fb_alpha = _mm_sub_epi16(_mm_set1_epi16(255), hi_rv_alpha);
		lo_rv = _mm_mullo_epi16(lo_rv, lo_rv_alpha);
		hi_rv = _mm_mullo_epi16(hi_rv, hi_rv_alpha);
		lo_fb = _mm_mullo_epi16(lo_fb, lo_fb_alpha);
		hi_fb = _mm_mullo_epi16(hi_fb, hi_fb_alpha);
		rv = _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(lo_rv, lo_fb), 8), _mm_srli_epi16(_mm_adds_epu16(hi_rv, hi_fb), 8));
	}

	if (msk != 0xF)
	{
		// Some pixels failed the test: keep the old color and depth for them.
		const __m128i ZMaskI = _mm_castps_si128(ZMask);
		rv = _mm_and_si128(rv, ZMaskI);
		rv = _mm_or_si128(_mm_andnot_si128(ZMaskI, *(__m128i*)cb), rv);
		invW = _mm_and_ps(invW, ZMask);
		invW = _mm_or_ps(_mm_andnot_ps(ZMask, *zb), invW);
	}

	*zb = invW;
	*(__m128i*)cb = rv;
}
//u32 nok,fok;
// Rasterizes one triangle of a strip into colorBuffer using half-space
// (edge function) tests over 4x4-pixel blocks.
//
//   pp             polygon parameters (culling mode, texture words)
//   vertex_offset  index of this triangle inside its strip; its parity selects
//                  the winding that counts as front-facing
//   v1, v2, v3     triangle vertices in screen space
//   colorBuffer    start of the block-ordered color plane
//   area           clip rectangle; only blocks starting inside it are touched
//
// Blocks fully inside the triangle are shaded without a coverage mask, blocks
// that are partially covered get a per-row mask, and blocks outside are skipped.
TPL_DECL_triangle
static void Rendtriangle(PolyParam* pp, int vertex_offset, const Vertex &v1, const Vertex &v2, const Vertex &v3, u32* colorBuffer, RECT* area)
{
	text_info texture = { 0 };
	if (pp_Texture) {
		// The texture lookup is shared between the render threads.
#pragma omp critical (texture_lookup)
		{
			texture = raw_GetTexture(pp->tsp, pp->tcw);
		}
	}

	const int stride_bytes = STRIDE_PIXEL_OFFSET * 4;

	const float Y1 = v1.y;
	const float Y2 = v2.y;
	const float Y3 = v3.y;
	const float X1 = v1.x;
	const float X2 = v2.x;
	const float X3 = v3.x;

	// sgn normalises the winding so the edge functions are positive inside.
	int sgn = 1;
	{
		//area: (X1-X3)*(Y2-Y3)-(Y1-Y3)*(X2-X3)
		const float tri_area = ((X1 - X3)*(Y2 - Y3) - (Y1 - Y3)*(X2 - X3));
		if (tri_area > 0)
			sgn = -1;

		if (pp->isp.CullMode != 0) {
			// Drop (nearly) degenerate triangles.
			if (fabsf(tri_area) < FPU_CULL_VAL)
				return;

			if (pp->isp.CullMode >= 2) {
				// Winding alternates along a strip, so combine the strip index
				// with the cull mode first and only then take the parity.
				// (Without the parentheses `&` binds tighter than `^`, which
				// disabled culling for every triangle past the second.)
				const u32 mode = (vertex_offset ^ pp->isp.CullMode) & 1;
				if ((mode == 0 && tri_area < 0) ||
					(mode == 1 && tri_area > 0)) {
					return;
				}
			}
		}
	}

	// Edge deltas
	const float DX12 = sgn*(X1 - X2);
	const float DX23 = sgn*(X2 - X3);
	const float DX31 = sgn*(X3 - X1);
	const float DY12 = sgn*(Y1 - Y2);
	const float DY23 = sgn*(Y2 - Y3);
	const float DY31 = sgn*(Y3 - Y1);

	// Block size, standard 4x4 (must be power of two)
	const int q = 4;

	// Bounding rectangle, clipped to `area` and snapped to the block grid
	int minx = iround(mmin(X1, X2, X3, area->left));
	int miny = iround(mmin(Y1, Y2, Y3, area->top));
	minx &= ~(q - 1);
	miny &= ~(q - 1);
	int spanx = iround(mmax(X1 + 0.5f, X2 + 0.5f, X3 + 0.5f, area->right)) - minx;
	int spany = iround(mmax(Y1 + 0.5f, Y2 + 0.5f, Y3 + 0.5f, area->bottom)) - miny;

	//Inside scissor area?
	if (spanx < 0 || spany < 0)
		return;

	// Half-edge constants
	float C1 = DY12 * X1 - DX12 * Y1;
	float C2 = DY23 * X2 - DX23 * Y2;
	float C3 = DY31 * X3 - DX31 * Y3;

	// Correct for fill convention
	if (DY12 < 0 || (DY12 == 0 && DX12 > 0)) C1++;
	if (DY23 < 0 || (DY23 == 0 && DX23 > 0)) C2++;
	if (DY31 < 0 || (DY31 == 0 && DX31 > 0)) C3++;

	// Range of each edge function over one block
	float MAX_12, MAX_23, MAX_31, MIN_12, MIN_23, MIN_31;
	PlaneMinMax(MIN_12, MAX_12, DX12, DY12, q);
	PlaneMinMax(MIN_23, MAX_23, DX23, DY23, q);
	PlaneMinMax(MIN_31, MAX_31, DX31, DY31, q);

	// Edge function steps for moving one whole block in x / y
	const float qDX12 = DX12 * q;
	const float qDX23 = DX23 * q;
	const float qDX31 = DX31 * q;
	const float qDY12 = DY12 * q;
	const float qDY23 = DY23 * q;
	const float qDY31 = DY31 * q;

	// Edge functions at the first block, biased by -MIN so that "> 0" means
	// "some corner is inside". One x step is pre-added because the inner loop
	// subtracts it before testing.
	float hs12 = C1 + DX12 * (miny + 0.5f) - DY12 * (minx + 0.5f) + qDY12 - MIN_12;
	float hs23 = C2 + DX23 * (miny + 0.5f) - DY23 * (minx + 0.5f) + qDY23 - MIN_23;
	float hs31 = C3 + DX31 * (miny + 0.5f) - DY31 * (minx + 0.5f) + qDY31 - MIN_31;

	// With the -MIN bias applied, "all corners inside" needs value > MAX - MIN
	MAX_12 -= MIN_12;
	MAX_23 -= MIN_23;
	MAX_31 -= MIN_31;

	// Added back to recover the unbiased per-pixel edge values
	const float C1_pm = MIN_12;
	const float C2_pm = MIN_23;
	const float C3_pm = MIN_31;

	// Blocks are stored as q rows of q pixels back to back, so one pixel
	// column advances q * 4 bytes.
	u8* cb_y = (u8*)colorBuffer;
	cb_y += miny*stride_bytes + minx*(q * 4);

	DECL_ALIGN(64) IPs ip;
	ip.Setup(pp, &texture, v1, v2, v3, minx, miny, q);

	__m128 y_ps = _mm_broadcast_float(miny);
	// Starts one block to the left; the inner loop advances before use.
	const __m128 minx_ps = _mm_load_scaled_float(minx - q, 1);
	const __m128 one_step = _mm_set1_ps(1.0f);
	const __m128 q_step = _mm_set1_ps((float)q);

	// Loop through blocks
	for (int y = spany; y > 0; y -= q)
	{
		float Xhs12 = hs12;
		float Xhs23 = hs23;
		float Xhs31 = hs31;
		u8* cb_x = cb_y;
		__m128 x_ps = minx_ps;

		for (int x = spanx; x > 0; x -= q)
		{
			Xhs12 -= qDY12;
			Xhs23 -= qDY23;
			Xhs31 -= qDY31;
			x_ps = _mm_add_ps(x_ps, q_step);

			// Skip block when outside an edge
			if (!EvalHalfSpaceFAny(Xhs12, Xhs23, Xhs31))
			{
				cb_x += q*q * 4;
				continue;
			}

			if (EvalHalfSpaceFAll(Xhs12, Xhs23, Xhs31, MAX_12, MAX_23, MAX_31))
			{
				// Accept whole block when totally covered; the mask argument
				// is ignored because useoldmsk is false.
				__m128 yl_ps = y_ps;
				for (int iy = q; iy > 0; iy--)
				{
					PixelFlush TPL_PRMS_pixel(false) (pp, &texture, x_ps, yl_ps, cb_x, x_ps, ip);
					yl_ps = _mm_add_ps(yl_ps, one_step);
					cb_x += sizeof(__m128);
				}
			}
			else // Partially covered block
			{
				const float CY1 = C1_pm + Xhs12;
				const float CY2 = C2_pm + Xhs23;
				const float CY3 = C3_pm + Xhs31;

				const __m128 pfdx12 = _mm_broadcast_float(DX12);
				const __m128 pfdx23 = _mm_broadcast_float(DX23);
				const __m128 pfdx31 = _mm_broadcast_float(DX31);

				// Edge values for the four pixels of the current row
				__m128 pcy1 = _mm_load_scaled_float(CY1, -DY12);
				__m128 pcy2 = _mm_load_scaled_float(CY2, -DY23);
				__m128 pcy3 = _mm_load_scaled_float(CY3, -DY31);

				const __m128 pzero = _mm_setzero_ps();

				__m128 yl_ps = y_ps;
				for (int iy = q; iy > 0; iy--)
				{
					// A pixel is outside when any edge value is <= 0.
					const __m128 mask1 = _mm_cmple_ps(pcy1, pzero);
					const __m128 mask2 = _mm_cmple_ps(pcy2, pzero);
					const __m128 mask3 = _mm_cmple_ps(pcy3, pzero);
					const __m128 summary = _mm_or_ps(mask3, _mm_or_ps(mask2, mask1));

					// Invert: all-ones lanes mark covered pixels.
					const __m128i inside = _mm_cmpeq_epi32(_mm_castps_si128(summary), _mm_setzero_si128());
					const __m128 inside_ps = _mm_castsi128_ps(inside);
					const int msk = _mm_movemask_ps(inside_ps);

					if (msk != 0)
					{
						if (msk != 0xF)
							PixelFlush TPL_PRMS_pixel(true) (pp, &texture, x_ps, yl_ps, cb_x, inside_ps, ip);
						else
							PixelFlush TPL_PRMS_pixel(false) (pp, &texture, x_ps, yl_ps, cb_x, inside_ps, ip);
					}

					yl_ps = _mm_add_ps(yl_ps, one_step);
					cb_x += sizeof(__m128);

					pcy1 = _mm_add_ps(pcy1, pfdx12);
					pcy2 = _mm_add_ps(pcy2, pfdx23);
					pcy3 = _mm_add_ps(pcy3, pfdx31);
				}
			}
		}

		// Advance to the next row of blocks
		hs12 += qDX12;
		hs23 += qDX23;
		hs31 += qDX31;
		cb_y += stride_bytes*q;
		y_ps = _mm_add_ps(y_ps, q_step);
	}
}
#if HOST_OS == OS_WINDOWS
// Bitmap header for the Windows presentation path: 1 plane, 32 bits per pixel,
// uncompressed (BI_RGB). Width, height and image size start at 0 and are
// filled in by softrend::Init().
BITMAPINFOHEADER bi = { sizeof(BITMAPINFOHEADER), 0, 0, 1, 32, BI_RGB };
#endif
struct softrend : Renderer
{
// Prepares a TA context for rendering.
// Returns false for render-to-texture contexts (not supported, and the context
// is left unlocked) and when ta_parse_vdrc() fails; returns true otherwise.
// The context's rend_inuse lock is taken before parsing and is not released here.
virtual bool Process(TA_context* ctx) {
	//disable RTTs for now ..
	if (!ctx->rend.isRTT) {
		ctx->rend_inuse.Lock();
		return ta_parse_vdrc(ctx) ? true : false;
	}
	return false;
}
// Draws every polygon of param_list, clipped to `area`.
// Each polygon is a triangle strip of `count` indices starting at `first`;
// triangle t uses indices t, t + 1 and t + 2. The rasterizer specialisation is
// picked from RendtriangleFns using alpha_mode and the polygon's state bits.
template <int alpha_mode>
void RenderParamList(List<PolyParam>* param_list, RECT* area) {
	Vertex* const vertex_pool = pvrrc.verts.head();
	u16* const index_pool = pvrrc.idx.head();
	PolyParam* const polys = param_list->head();
	const int poly_total = param_list->used();

	for (int p = 0; p < poly_total; p++)
	{
		PolyParam& poly = polys[p];
		const int tri_total = poly.count - 2;
		u16* const strip = &index_pool[poly.first];

		for (int t = 0; t < tri_total; t++) {
			////<alpha_blend, pp_UseAlpha, pp_Texture, pp_IgnoreTexA, pp_ShadInstr, pp_Offset >
			RendtriangleFn draw = RendtriangleFns[alpha_mode][poly.tsp.UseAlpha][poly.pcw.Texture][poly.tsp.IgnoreTexA][poly.tsp.ShadInstr][poly.pcw.Offset];
			draw(&poly, t, vertex_pool[strip[t]], vertex_pool[strip[t + 1]], vertex_pool[strip[t + 2]], render_buffer, area);
		}
	}
}
virtual bool Render() {
bool is_rtt = pvrrc.isRTT;
memset(render_buffer, 0, sizeof(render_buffer));
if (pvrrc.verts.used()<3)
return false;
if (pvrrc.render_passes.head()[0].autosort)
SortPParams(0, pvrrc.global_param_tr.used());
int tcount = omp_get_num_procs() - 1;
if (tcount == 0) tcount = 1;
if (tcount > settings.pvr.MaxThreads) tcount = settings.pvr.MaxThreads;
#pragma omp parallel num_threads(tcount)
{
int thd = omp_get_thread_num();
int y_offs = 480 % omp_get_num_threads();
int y_thd = 480 / omp_get_num_threads();
int y_start = (!!thd) * y_offs + y_thd * thd;
int y_end = y_offs + y_thd * (thd + 1);
RECT area = { 0, y_start, 640, y_end };
RenderParamList<0>(&pvrrc.global_param_op, &area);
RenderParamList<1>(&pvrrc.global_param_pt, &area);
RenderParamList<2>(&pvrrc.global_param_tr, &area);
}
/*
for (int y = 0; y < 480; y++) {
for (int x = 0; x < 640; x++) {
color_buffer[x + y * 640] = rand();
}
} */
return !is_rtt;
}
#if HOST_OS == OS_WINDOWS
HWND hWnd;
HBITMAP hBMP = 0, holdBMP;
HDC hmem;
#endif
virtual bool Init() {
const_setAlpha = _mm_set1_epi32(0xFF000000);
u8 ushuffle[] = { 0x0E, 0x80, 0x0E, 0x80, 0x0E, 0x80, 0x0E, 0x80, 0x06, 0x80, 0x06, 0x80, 0x06, 0x80, 0x06, 0x80};
memcpy(&shuffle_alpha, ushuffle, sizeof(shuffle_alpha));
#if HOST_OS == OS_WINDOWS
hWnd = (HWND)libPvr_GetRenderTarget();
bi.biWidth = 640;
bi.biHeight = 480;
RECT rect;
GetClientRect(hWnd, &rect);
HDC hdc = GetDC(hWnd);
FillRect(hdc, &rect, (HBRUSH)(COLOR_BACKGROUND));
bi.biSizeImage = bi.biWidth * bi.biHeight * 4;
hBMP = CreateCompatibleBitmap(hdc, bi.biWidth, bi.biHeight);
hmem = CreateCompatibleDC(hdc);
holdBMP = (HBITMAP)SelectObject(hmem, hBMP);
ReleaseDC(hWnd, hdc);
#endif
#define REP_16(x) ((x)* 16 + (x))
#define REP_32(x) ((x)* 8 + (x)/4)
#define REP_64(x) ((x)* 4 + (x)/16)
for (int c = 0; c < 65536; c++) {
//565
decoded_colors[0][c] = 0xFF000000 | (REP_32((c >> 11) % 32) << 16) | (REP_64((c >> 5) % 64) << 8) | (REP_32((c >> 0) % 32) << 0);
//1555
decoded_colors[1][c] = ((c >> 0) % 2 * 255 << 24) | (REP_32((c >> 11) % 32) << 16) | (REP_32((c >> 6) % 32) << 8) | (REP_32((c >> 1) % 32) << 0);
//4444
decoded_colors[2][c] = (REP_16((c >> 0) % 16) << 24) | (REP_16((c >> 12) % 16) << 16) | (REP_16((c >> 8) % 16) << 8) | (REP_16((c >> 4) % 16) << 0);
}
{
RendtriangleFns[0][0][1][0][0][0] = &Rendtriangle<0, 0, 1, 0, 0, 0>;
RendtriangleFns[0][0][1][0][0][1] = &Rendtriangle<0, 0, 1, 0, 0, 1>;
RendtriangleFns[0][0][1][0][1][0] = &Rendtriangle<0, 0, 1, 0, 1, 0>;
RendtriangleFns[0][0][1][0][1][1] = &Rendtriangle<0, 0, 1, 0, 1, 1>;
RendtriangleFns[0][0][1][0][2][0] = &Rendtriangle<0, 0, 1, 0, 2, 0>;
RendtriangleFns[0][0][1][0][2][1] = &Rendtriangle<0, 0, 1, 0, 2, 1>;
RendtriangleFns[0][0][1][0][3][0] = &Rendtriangle<0, 0, 1, 0, 3, 0>;
RendtriangleFns[0][0][1][0][3][1] = &Rendtriangle<0, 0, 1, 0, 3, 1>;
RendtriangleFns[0][0][1][1][0][0] = &Rendtriangle<0, 0, 1, 1, 0, 0>;
RendtriangleFns[0][0][1][1][0][1] = &Rendtriangle<0, 0, 1, 1, 0, 1>;
RendtriangleFns[0][0][1][1][1][0] = &Rendtriangle<0, 0, 1, 1, 1, 0>;
RendtriangleFns[0][0][1][1][1][1] = &Rendtriangle<0, 0, 1, 1, 1, 1>;
RendtriangleFns[0][0][1][1][2][0] = &Rendtriangle<0, 0, 1, 1, 2, 0>;
RendtriangleFns[0][0][1][1][2][1] = &Rendtriangle<0, 0, 1, 1, 2, 1>;
RendtriangleFns[0][0][1][1][3][0] = &Rendtriangle<0, 0, 1, 1, 3, 0>;
RendtriangleFns[0][0][1][1][3][1] = &Rendtriangle<0, 0, 1, 1, 3, 1>;
RendtriangleFns[0][0][0][0][0][0] = &Rendtriangle<0, 0, 0, 0, 0, 0>;
RendtriangleFns[0][0][0][0][0][1] = &Rendtriangle<0, 0, 0, 0, 0, 1>;
RendtriangleFns[0][0][0][0][1][0] = &Rendtriangle<0, 0, 0, 0, 1, 0>;
RendtriangleFns[0][0][0][0][1][1] = &Rendtriangle<0, 0, 0, 0, 1, 1>;
RendtriangleFns[0][0][0][0][2][0] = &Rendtriangle<0, 0, 0, 0, 2, 0>;
RendtriangleFns[0][0][0][0][2][1] = &Rendtriangle<0, 0, 0, 0, 2, 1>;
RendtriangleFns[0][0][0][0][3][0] = &Rendtriangle<0, 0, 0, 0, 3, 0>;
RendtriangleFns[0][0][0][0][3][1] = &Rendtriangle<0, 0, 0, 0, 3, 1>;
RendtriangleFns[0][0][0][1][0][0] = &Rendtriangle<0, 0, 0, 1, 0, 0>;
RendtriangleFns[0][0][0][1][0][1] = &Rendtriangle<0, 0, 0, 1, 0, 1>;
RendtriangleFns[0][0][0][1][1][0] = &Rendtriangle<0, 0, 0, 1, 1, 0>;
RendtriangleFns[0][0][0][1][1][1] = &Rendtriangle<0, 0, 0, 1, 1, 1>;
RendtriangleFns[0][0][0][1][2][0] = &Rendtriangle<0, 0, 0, 1, 2, 0>;
RendtriangleFns[0][0][0][1][2][1] = &Rendtriangle<0, 0, 0, 1, 2, 1>;
RendtriangleFns[0][0][0][1][3][0] = &Rendtriangle<0, 0, 0, 1, 3, 0>;
RendtriangleFns[0][0][0][1][3][1] = &Rendtriangle<0, 0, 0, 1, 3, 1>;
RendtriangleFns[0][1][1][0][0][0] = &Rendtriangle<0, 1, 1, 0, 0, 0>;
RendtriangleFns[0][1][1][0][0][1] = &Rendtriangle<0, 1, 1, 0, 0, 1>;
RendtriangleFns[0][1][1][0][1][0] = &Rendtriangle<0, 1, 1, 0, 1, 0>;
RendtriangleFns[0][1][1][0][1][1] = &Rendtriangle<0, 1, 1, 0, 1, 1>;
RendtriangleFns[0][1][1][0][2][0] = &Rendtriangle<0, 1, 1, 0, 2, 0>;
RendtriangleFns[0][1][1][0][2][1] = &Rendtriangle<0, 1, 1, 0, 2, 1>;
RendtriangleFns[0][1][1][0][3][0] = &Rendtriangle<0, 1, 1, 0, 3, 0>;
RendtriangleFns[0][1][1][0][3][1] = &Rendtriangle<0, 1, 1, 0, 3, 1>;
RendtriangleFns[0][1][1][1][0][0] = &Rendtriangle<0, 1, 1, 1, 0, 0>;
RendtriangleFns[0][1][1][1][0][1] = &Rendtriangle<0, 1, 1, 1, 0, 1>;
RendtriangleFns[0][1][1][1][1][0] = &Rendtriangle<0, 1, 1, 1, 1, 0>;
RendtriangleFns[0][1][1][1][1][1] = &Rendtriangle<0, 1, 1, 1, 1, 1>;
RendtriangleFns[0][1][1][1][2][0] = &Rendtriangle<0, 1, 1, 1, 2, 0>;
RendtriangleFns[0][1][1][1][2][1] = &Rendtriangle<0, 1, 1, 1, 2, 1>;
RendtriangleFns[0][1][1][1][3][0] = &Rendtriangle<0, 1, 1, 1, 3, 0>;
RendtriangleFns[0][1][1][1][3][1] = &Rendtriangle<0, 1, 1, 1, 3, 1>;
RendtriangleFns[0][1][0][0][0][0] = &Rendtriangle<0, 1, 0, 0, 0, 0>;
RendtriangleFns[0][1][0][0][0][1] = &Rendtriangle<0, 1, 0, 0, 0, 1>;
RendtriangleFns[0][1][0][0][1][0] = &Rendtriangle<0, 1, 0, 0, 1, 0>;
RendtriangleFns[0][1][0][0][1][1] = &Rendtriangle<0, 1, 0, 0, 1, 1>;
RendtriangleFns[0][1][0][0][2][0] = &Rendtriangle<0, 1, 0, 0, 2, 0>;
RendtriangleFns[0][1][0][0][2][1] = &Rendtriangle<0, 1, 0, 0, 2, 1>;
RendtriangleFns[0][1][0][0][3][0] = &Rendtriangle<0, 1, 0, 0, 3, 0>;
RendtriangleFns[0][1][0][0][3][1] = &Rendtriangle<0, 1, 0, 0, 3, 1>;
RendtriangleFns[0][1][0][1][0][0] = &Rendtriangle<0, 1, 0, 1, 0, 0>;
RendtriangleFns[0][1][0][1][0][1] = &Rendtriangle<0, 1, 0, 1, 0, 1>;
RendtriangleFns[0][1][0][1][1][0] = &Rendtriangle<0, 1, 0, 1, 1, 0>;
RendtriangleFns[0][1][0][1][1][1] = &Rendtriangle<0, 1, 0, 1, 1, 1>;
RendtriangleFns[0][1][0][1][2][0] = &Rendtriangle<0, 1, 0, 1, 2, 0>;
RendtriangleFns[0][1][0][1][2][1] = &Rendtriangle<0, 1, 0, 1, 2, 1>;
RendtriangleFns[0][1][0][1][3][0] = &Rendtriangle<0, 1, 0, 1, 3, 0>;
RendtriangleFns[0][1][0][1][3][1] = &Rendtriangle<0, 1, 0, 1, 3, 1>;
RendtriangleFns[1][0][1][0][0][0] = &Rendtriangle<1, 0, 1, 0, 0, 0>;
RendtriangleFns[1][0][1][0][0][1] = &Rendtriangle<1, 0, 1, 0, 0, 1>;
RendtriangleFns[1][0][1][0][1][0] = &Rendtriangle<1, 0, 1, 0, 1, 0>;
RendtriangleFns[1][0][1][0][1][1] = &Rendtriangle<1, 0, 1, 0, 1, 1>;
RendtriangleFns[1][0][1][0][2][0] = &Rendtriangle<1, 0, 1, 0, 2, 0>;
RendtriangleFns[1][0][1][0][2][1] = &Rendtriangle<1, 0, 1, 0, 2, 1>;
RendtriangleFns[1][0][1][0][3][0] = &Rendtriangle<1, 0, 1, 0, 3, 0>;
RendtriangleFns[1][0][1][0][3][1] = &Rendtriangle<1, 0, 1, 0, 3, 1>;
RendtriangleFns[1][0][1][1][0][0] = &Rendtriangle<1, 0, 1, 1, 0, 0>;
RendtriangleFns[1][0][1][1][0][1] = &Rendtriangle<1, 0, 1, 1, 0, 1>;
RendtriangleFns[1][0][1][1][1][0] = &Rendtriangle<1, 0, 1, 1, 1, 0>;
RendtriangleFns[1][0][1][1][1][1] = &Rendtriangle<1, 0, 1, 1, 1, 1>;
RendtriangleFns[1][0][1][1][2][0] = &Rendtriangle<1, 0, 1, 1, 2, 0>;
RendtriangleFns[1][0][1][1][2][1] = &Rendtriangle<1, 0, 1, 1, 2, 1>;
RendtriangleFns[1][0][1][1][3][0] = &Rendtriangle<1, 0, 1, 1, 3, 0>;
RendtriangleFns[1][0][1][1][3][1] = &Rendtriangle<1, 0, 1, 1, 3, 1>;
RendtriangleFns[1][0][0][0][0][0] = &Rendtriangle<1, 0, 0, 0, 0, 0>;
RendtriangleFns[1][0][0][0][0][1] = &Rendtriangle<1, 0, 0, 0, 0, 1>;
RendtriangleFns[1][0][0][0][1][0] = &Rendtriangle<1, 0, 0, 0, 1, 0>;
RendtriangleFns[1][0][0][0][1][1] = &Rendtriangle<1, 0, 0, 0, 1, 1>;
RendtriangleFns[1][0][0][0][2][0] = &Rendtriangle<1, 0, 0, 0, 2, 0>;
RendtriangleFns[1][0][0][0][2][1] = &Rendtriangle<1, 0, 0, 0, 2, 1>;
RendtriangleFns[1][0][0][0][3][0] = &Rendtriangle<1, 0, 0, 0, 3, 0>;
RendtriangleFns[1][0][0][0][3][1] = &Rendtriangle<1, 0, 0, 0, 3, 1>;
RendtriangleFns[1][0][0][1][0][0] = &Rendtriangle<1, 0, 0, 1, 0, 0>;
RendtriangleFns[1][0][0][1][0][1] = &Rendtriangle<1, 0, 0, 1, 0, 1>;
RendtriangleFns[1][0][0][1][1][0] = &Rendtriangle<1, 0, 0, 1, 1, 0>;
RendtriangleFns[1][0][0][1][1][1] = &Rendtriangle<1, 0, 0, 1, 1, 1>;
RendtriangleFns[1][0][0][1][2][0] = &Rendtriangle<1, 0, 0, 1, 2, 0>;
RendtriangleFns[1][0][0][1][2][1] = &Rendtriangle<1, 0, 0, 1, 2, 1>;
RendtriangleFns[1][0][0][1][3][0] = &Rendtriangle<1, 0, 0, 1, 3, 0>;
RendtriangleFns[1][0][0][1][3][1] = &Rendtriangle<1, 0, 0, 1, 3, 1>;
RendtriangleFns[1][1][1][0][0][0] = &Rendtriangle<1, 1, 1, 0, 0, 0>;
RendtriangleFns[1][1][1][0][0][1] = &Rendtriangle<1, 1, 1, 0, 0, 1>;
RendtriangleFns[1][1][1][0][1][0] = &Rendtriangle<1, 1, 1, 0, 1, 0>;
RendtriangleFns[1][1][1][0][1][1] = &Rendtriangle<1, 1, 1, 0, 1, 1>;
RendtriangleFns[1][1][1][0][2][0] = &Rendtriangle<1, 1, 1, 0, 2, 0>;
RendtriangleFns[1][1][1][0][2][1] = &Rendtriangle<1, 1, 1, 0, 2, 1>;
RendtriangleFns[1][1][1][0][3][0] = &Rendtriangle<1, 1, 1, 0, 3, 0>;
RendtriangleFns[1][1][1][0][3][1] = &Rendtriangle<1, 1, 1, 0, 3, 1>;
RendtriangleFns[1][1][1][1][0][0] = &Rendtriangle<1, 1, 1, 1, 0, 0>;
RendtriangleFns[1][1][1][1][0][1] = &Rendtriangle<1, 1, 1, 1, 0, 1>;
RendtriangleFns[1][1][1][1][1][0] = &Rendtriangle<1, 1, 1, 1, 1, 0>;
RendtriangleFns[1][1][1][1][1][1] = &Rendtriangle<1, 1, 1, 1, 1, 1>;
RendtriangleFns[1][1][1][1][2][0] = &Rendtriangle<1, 1, 1, 1, 2, 0>;
RendtriangleFns[1][1][1][1][2][1] = &Rendtriangle<1, 1, 1, 1, 2, 1>;
RendtriangleFns[1][1][1][1][3][0] = &Rendtriangle<1, 1, 1, 1, 3, 0>;
RendtriangleFns[1][1][1][1][3][1] = &Rendtriangle<1, 1, 1, 1, 3, 1>;
RendtriangleFns[1][1][0][0][0][0] = &Rendtriangle<1, 1, 0, 0, 0, 0>;
RendtriangleFns[1][1][0][0][0][1] = &Rendtriangle<1, 1, 0, 0, 0, 1>;
RendtriangleFns[1][1][0][0][1][0] = &Rendtriangle<1, 1, 0, 0, 1, 0>;
RendtriangleFns[1][1][0][0][1][1] = &Rendtriangle<1, 1, 0, 0, 1, 1>;
RendtriangleFns[1][1][0][0][2][0] = &Rendtriangle<1, 1, 0, 0, 2, 0>;
RendtriangleFns[1][1][0][0][2][1] = &Rendtriangle<1, 1, 0, 0, 2, 1>;
RendtriangleFns[1][1][0][0][3][0] = &Rendtriangle<1, 1, 0, 0, 3, 0>;
RendtriangleFns[1][1][0][0][3][1] = &Rendtriangle<1, 1, 0, 0, 3, 1>;
RendtriangleFns[1][1][0][1][0][0] = &Rendtriangle<1, 1, 0, 1, 0, 0>;
RendtriangleFns[1][1][0][1][0][1] = &Rendtriangle<1, 1, 0, 1, 0, 1>;
RendtriangleFns[1][1][0][1][1][0] = &Rendtriangle<1, 1, 0, 1, 1, 0>;
RendtriangleFns[1][1][0][1][1][1] = &Rendtriangle<1, 1, 0, 1, 1, 1>;
RendtriangleFns[1][1][0][1][2][0] = &Rendtriangle<1, 1, 0, 1, 2, 0>;
RendtriangleFns[1][1][0][1][2][1] = &Rendtriangle<1, 1, 0, 1, 2, 1>;
RendtriangleFns[1][1][0][1][3][0] = &Rendtriangle<1, 1, 0, 1, 3, 0>;
RendtriangleFns[1][1][0][1][3][1] = &Rendtriangle<1, 1, 0, 1, 3, 1>;
RendtriangleFns[2][0][1][0][0][0] = &Rendtriangle<2, 0, 1, 0, 0, 0>;
RendtriangleFns[2][0][1][0][0][1] = &Rendtriangle<2, 0, 1, 0, 0, 1>;
RendtriangleFns[2][0][1][0][1][0] = &Rendtriangle<2, 0, 1, 0, 1, 0>;
RendtriangleFns[2][0][1][0][1][1] = &Rendtriangle<2, 0, 1, 0, 1, 1>;
RendtriangleFns[2][0][1][0][2][0] = &Rendtriangle<2, 0, 1, 0, 2, 0>;
RendtriangleFns[2][0][1][0][2][1] = &Rendtriangle<2, 0, 1, 0, 2, 1>;
RendtriangleFns[2][0][1][0][3][0] = &Rendtriangle<2, 0, 1, 0, 3, 0>;
RendtriangleFns[2][0][1][0][3][1] = &Rendtriangle<2, 0, 1, 0, 3, 1>;
RendtriangleFns[2][0][1][1][0][0] = &Rendtriangle<2, 0, 1, 1, 0, 0>;
RendtriangleFns[2][0][1][1][0][1] = &Rendtriangle<2, 0, 1, 1, 0, 1>;
RendtriangleFns[2][0][1][1][1][0] = &Rendtriangle<2, 0, 1, 1, 1, 0>;
RendtriangleFns[2][0][1][1][1][1] = &Rendtriangle<2, 0, 1, 1, 1, 1>;
RendtriangleFns[2][0][1][1][2][0] = &Rendtriangle<2, 0, 1, 1, 2, 0>;
RendtriangleFns[2][0][1][1][2][1] = &Rendtriangle<2, 0, 1, 1, 2, 1>;
RendtriangleFns[2][0][1][1][3][0] = &Rendtriangle<2, 0, 1, 1, 3, 0>;
RendtriangleFns[2][0][1][1][3][1] = &Rendtriangle<2, 0, 1, 1, 3, 1>;
RendtriangleFns[2][0][0][0][0][0] = &Rendtriangle<2, 0, 0, 0, 0, 0>;
RendtriangleFns[2][0][0][0][0][1] = &Rendtriangle<2, 0, 0, 0, 0, 1>;
RendtriangleFns[2][0][0][0][1][0] = &Rendtriangle<2, 0, 0, 0, 1, 0>;
RendtriangleFns[2][0][0][0][1][1] = &Rendtriangle<2, 0, 0, 0, 1, 1>;
RendtriangleFns[2][0][0][0][2][0] = &Rendtriangle<2, 0, 0, 0, 2, 0>;
RendtriangleFns[2][0][0][0][2][1] = &Rendtriangle<2, 0, 0, 0, 2, 1>;
RendtriangleFns[2][0][0][0][3][0] = &Rendtriangle<2, 0, 0, 0, 3, 0>;
RendtriangleFns[2][0][0][0][3][1] = &Rendtriangle<2, 0, 0, 0, 3, 1>;
RendtriangleFns[2][0][0][1][0][0] = &Rendtriangle<2, 0, 0, 1, 0, 0>;
RendtriangleFns[2][0][0][1][0][1] = &Rendtriangle<2, 0, 0, 1, 0, 1>;
RendtriangleFns[2][0][0][1][1][0] = &Rendtriangle<2, 0, 0, 1, 1, 0>;
RendtriangleFns[2][0][0][1][1][1] = &Rendtriangle<2, 0, 0, 1, 1, 1>;
RendtriangleFns[2][0][0][1][2][0] = &Rendtriangle<2, 0, 0, 1, 2, 0>;
RendtriangleFns[2][0][0][1][2][1] = &Rendtriangle<2, 0, 0, 1, 2, 1>;
RendtriangleFns[2][0][0][1][3][0] = &Rendtriangle<2, 0, 0, 1, 3, 0>;
RendtriangleFns[2][0][0][1][3][1] = &Rendtriangle<2, 0, 0, 1, 3, 1>;
RendtriangleFns[2][1][1][0][0][0] = &Rendtriangle<2, 1, 1, 0, 0, 0>;
RendtriangleFns[2][1][1][0][0][1] = &Rendtriangle<2, 1, 1, 0, 0, 1>;
RendtriangleFns[2][1][1][0][1][0] = &Rendtriangle<2, 1, 1, 0, 1, 0>;
RendtriangleFns[2][1][1][0][1][1] = &Rendtriangle<2, 1, 1, 0, 1, 1>;
RendtriangleFns[2][1][1][0][2][0] = &Rendtriangle<2, 1, 1, 0, 2, 0>;
RendtriangleFns[2][1][1][0][2][1] = &Rendtriangle<2, 1, 1, 0, 2, 1>;
RendtriangleFns[2][1][1][0][3][0] = &Rendtriangle<2, 1, 1, 0, 3, 0>;
RendtriangleFns[2][1][1][0][3][1] = &Rendtriangle<2, 1, 1, 0, 3, 1>;
RendtriangleFns[2][1][1][1][0][0] = &Rendtriangle<2, 1, 1, 1, 0, 0>;
RendtriangleFns[2][1][1][1][0][1] = &Rendtriangle<2, 1, 1, 1, 0, 1>;
RendtriangleFns[2][1][1][1][1][0] = &Rendtriangle<2, 1, 1, 1, 1, 0>;
RendtriangleFns[2][1][1][1][1][1] = &Rendtriangle<2, 1, 1, 1, 1, 1>;
RendtriangleFns[2][1][1][1][2][0] = &Rendtriangle<2, 1, 1, 1, 2, 0>;
RendtriangleFns[2][1][1][1][2][1] = &Rendtriangle<2, 1, 1, 1, 2, 1>;
RendtriangleFns[2][1][1][1][3][0] = &Rendtriangle<2, 1, 1, 1, 3, 0>;
RendtriangleFns[2][1][1][1][3][1] = &Rendtriangle<2, 1, 1, 1, 3, 1>;
RendtriangleFns[2][1][0][0][0][0] = &Rendtriangle<2, 1, 0, 0, 0, 0>;
RendtriangleFns[2][1][0][0][0][1] = &Rendtriangle<2, 1, 0, 0, 0, 1>;
RendtriangleFns[2][1][0][0][1][0] = &Rendtriangle<2, 1, 0, 0, 1, 0>;
RendtriangleFns[2][1][0][0][1][1] = &Rendtriangle<2, 1, 0, 0, 1, 1>;
RendtriangleFns[2][1][0][0][2][0] = &Rendtriangle<2, 1, 0, 0, 2, 0>;
RendtriangleFns[2][1][0][0][2][1] = &Rendtriangle<2, 1, 0, 0, 2, 1>;
RendtriangleFns[2][1][0][0][3][0] = &Rendtriangle<2, 1, 0, 0, 3, 0>;
RendtriangleFns[2][1][0][0][3][1] = &Rendtriangle<2, 1, 0, 0, 3, 1>;
RendtriangleFns[2][1][0][1][0][0] = &Rendtriangle<2, 1, 0, 1, 0, 0>;
RendtriangleFns[2][1][0][1][0][1] = &Rendtriangle<2, 1, 0, 1, 0, 1>;
RendtriangleFns[2][1][0][1][1][0] = &Rendtriangle<2, 1, 0, 1, 1, 0>;
RendtriangleFns[2][1][0][1][1][1] = &Rendtriangle<2, 1, 0, 1, 1, 1>;
RendtriangleFns[2][1][0][1][2][0] = &Rendtriangle<2, 1, 0, 1, 2, 0>;
RendtriangleFns[2][1][0][1][2][1] = &Rendtriangle<2, 1, 0, 1, 2, 1>;
RendtriangleFns[2][1][0][1][3][0] = &Rendtriangle<2, 1, 0, 1, 3, 0>;
RendtriangleFns[2][1][0][1][3][1] = &Rendtriangle<2, 1, 0, 1, 3, 1>;
}
return true;
}
// Resize notification from the renderer interface. This renderer takes no
// action: both dimensions are ignored.
virtual void Resize(int w, int h) {
	(void)w;
	(void)h;
}
virtual void Term() {
#if HOST_OS == OS_WINDOWS
if (hBMP) {
DeleteObject(SelectObject(hmem, holdBMP));
DeleteDC(hmem);
}
#endif
}
// Helpers that expand one 4-entry byte pattern into the 16 arguments of
// _mm_set_epi8: RR adds the lane base x to each entry, R repeats the pattern
// for the lanes based at bytes 12, 8, 4 and 0 (highest byte first, which is
// the argument order _mm_set_epi8 uses).
#define RR(x, a, b, c, d) ((x) + (a)), ((x) + (b)), ((x) + (c)), ((x) + (d))
#define R(a, b, c, d) RR(12, a, b, c, d), RR(8, a, b, c, d), RR(4, a, b, c, d), RR(0, a, b, c, d)
//R coefs should be adjusted to match pixel format
/*
	Reorders the bytes inside each of the four 32-bit lanes of v with a byte
	shuffle and returns the result.

	An index with bit 7 set (the 0x80 entry) produces a zero byte, so with the
	current coefficients (0x80, 2, 1, 0) bytes 0..2 of every lane are kept in
	place and byte 3 is cleared.

	The register is treated as raw bits: Present() loads u32 pixels through
	__m128 pointers, so the value must be bit-cast to and from the integer
	domain. A numeric float<->int conversion would alter the pixel bits before
	and after the shuffle.
*/
INLINE __m128 shuffle_pixel(__m128 v) {
	const __m128i byte_order = _mm_set_epi8(R(0x80, 2, 1, 0));
	return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(v), byte_order));
}
/*
	Copies the colour part of render_buffer into the linear `pixels` buffer
	and shows it in the host window.

	The source is read sequentially as 4x4 pixel tiles: each tile is four
	consecutive 4-pixel rows, and tiles follow each other left to right, then
	top to bottom. The destination is plain row-major, with the row order
	reversed on Windows (FLIP_Y). The image is drawn centred in the window at
	its native MAX_RENDER_WIDTH x MAX_RENDER_HEIGHT size.
*/
virtual void Present() {
	const __m128* psrc = (const __m128*)render_buffer;
	__m128* pdst = (__m128*)pixels;
#define SHUFFL(v) v
// #define SHUFFL(v) shuffle_pixel(v)
#if HOST_OS == OS_WINDOWS
#define FLIP_Y (MAX_RENDER_HEIGHT - 1) -
#else
#define FLIP_Y
#endif
	// Destination row pitch, in 4-pixel (__m128) units.
	const int stride = STRIDE_PIXEL_OFFSET / 4;
	for (int y = 0; y < MAX_RENDER_HEIGHT; y += 4)
	{
		for (int x = 0; x < MAX_RENDER_WIDTH; x += 4)
		{
			// One tile: four consecutive source vectors become four
			// vertically adjacent destination rows.
			for (int row = 0; row < 4; row++)
			{
				pdst[(FLIP_Y (y + row))*stride + x / 4] = SHUFFL(*psrc++);
			}
		}
	}
#if HOST_OS == OS_WINDOWS
	SetDIBits(hmem, hBMP, 0, MAX_RENDER_HEIGHT, pixels, (BITMAPINFO*)&bi, DIB_RGB_COLORS);
	RECT clientRect;
	GetClientRect(hWnd, &clientRect);
	HDC hdc = GetDC(hWnd);
	int w = clientRect.right - clientRect.left;
	int h = clientRect.bottom - clientRect.top;
	// Centre the image inside the client area.
	int dst_x = (w - MAX_RENDER_WIDTH) / 2;
	int dst_y = (h - MAX_RENDER_HEIGHT) / 2;
	BitBlt(hdc, dst_x, dst_y, MAX_RENDER_WIDTH, MAX_RENDER_HEIGHT, hmem, 0, 0, SRCCOPY);
	ReleaseDC(hWnd, hdc);
#else
	extern Window x11_win;
	extern Display* x11_disp;
	extern Visual* x11_vis;
	extern int x11_width;
	extern int x11_height;
	const int width = MAX_RENDER_WIDTH;
	const int height = MAX_RENDER_HEIGHT;
	XImage* ximage = XCreateImage(x11_disp, x11_vis, 24, ZPixmap, 0, (char *)pixels, width, height, 32, width * 4);
	// XCreateImage can fail; skip this frame rather than hand a null image
	// to XPutImage.
	if (ximage == NULL)
		return;
	GC gc = XCreateGC(x11_disp, x11_win, 0, 0);
	XPutImage(x11_disp, x11_win, gc, ximage, 0, 0, (x11_width - width)/2, (x11_height - height)/2, width, height);
	// Only the XImage header is released here: its data pointer refers to the
	// static `pixels` array, which must not be freed.
	XFree(ximage);
	XFreeGC(x11_disp, gc);
#endif
}
};
/*
	Creates the software renderer.

	The object is constructed with placement new inside 32-byte aligned
	storage obtained from _mm_malloc. Returns NULL when that allocation
	fails, instead of constructing on a null pointer.

	NOTE(review): because the storage comes from _mm_malloc, releasing the
	object with a plain `delete` may not match the allocator -- confirm how
	callers destroy the renderer.
*/
Renderer* rend_softrend() {
	void* storage = _mm_malloc(sizeof(softrend), 32);
	if (storage == NULL)
		return NULL;
	return new(storage) softrend();
}