IPU: MultiISA IPU

This commit is contained in:
TellowKrinkle 2022-10-20 21:49:29 -05:00 committed by TellowKrinkle
parent 44e69a9603
commit 1a383de5c4
16 changed files with 552 additions and 421 deletions

View File

@ -971,6 +971,31 @@ else()
)
endif()
# IPU sources
# pcsx2IPUSources: IPU translation units that are listed once, independent of the ISA variants.
set(pcsx2IPUSources
IPU/IPU.cpp
IPU/IPU_Fifo.cpp
IPU/IPUdma.cpp
)
# pcsx2IPUSourcesUnshared: IPU translation units that are built per ISA.
# With DISABLE_ADVANCE_SIMD they are added to every GS-${isa} static library (sse4/avx/avx2);
# otherwise they are simply appended to pcsx2IPUSources.
set(pcsx2IPUSourcesUnshared
IPU/IPU_MultiISA.cpp
IPU/IPUdither.cpp
IPU/mpeg2lib/Idct.cpp
IPU/mpeg2lib/Mpeg.cpp
IPU/yuv2rgb.cpp
)
# IPU headers
# Header list for the IPU sources above (both the shared and the per-ISA ones).
set(pcsx2IPUHeaders
IPU/IPU.h
IPU/IPU_Fifo.h
IPU/IPU_MultiISA.h
IPU/IPUdma.h
IPU/mpeg2lib/Mpeg.h
IPU/mpeg2lib/Vlc.h
IPU/yuv2rgb.h
)
if(DISABLE_ADVANCE_SIMD)
target_compile_definitions(PCSX2 PRIVATE MULTI_ISA_SHARED_COMPILATION)
@ -997,7 +1022,7 @@ if(DISABLE_ADVANCE_SIMD)
# Note: ld64 (macOS's linker) does not act the same way when presented with .a files, unless linked with `-force_load` (cmake WHOLE_ARCHIVE).
set(is_first_isa "1")
foreach(isa "sse4" "avx" "avx2")
add_library(GS-${isa} STATIC ${pcsx2GSSourcesUnshared})
add_library(GS-${isa} STATIC ${pcsx2GSSourcesUnshared} ${pcsx2IPUSourcesUnshared})
target_link_libraries(GS-${isa} PRIVATE PCSX2_FLAGS)
target_compile_definitions(GS-${isa} PRIVATE MULTI_ISA_UNSHARED_COMPILATION=isa_${isa} MULTI_ISA_IS_FIRST=${is_first_isa} ${pcsx2_defs_${isa}})
target_compile_options(GS-${isa} PRIVATE ${compile_options_${isa}})
@ -1012,6 +1037,7 @@ if(DISABLE_ADVANCE_SIMD)
endforeach()
else()
list(APPEND pcsx2GSSources ${pcsx2GSSourcesUnshared})
list(APPEND pcsx2IPUSources ${pcsx2IPUSourcesUnshared})
endif()
# DebugTools sources
@ -1341,26 +1367,6 @@ set(pcsx2GuiResources
${res_bin}/Breakpoint_Inactive.h
)
# IPU sources
set(pcsx2IPUSources
IPU/IPU.cpp
IPU/IPU_Fifo.cpp
IPU/IPUdither.cpp
IPU/IPUdma.cpp
IPU/mpeg2lib/Idct.cpp
IPU/mpeg2lib/Mpeg.cpp
IPU/yuv2rgb.cpp)
# IPU headers
set(pcsx2IPUHeaders
IPU/IPUdma.h
IPU/IPU_Fifo.h
IPU/IPU.h
IPU/mpeg2lib/Mpeg.h
IPU/mpeg2lib/Vlc.h
IPU/yuv2rgb.h
)
# Linux sources
set(pcsx2LinuxSources
CDVD/Linux/DriveUtility.cpp

View File

@ -18,6 +18,7 @@
#include "common/emitter/tools.h"
#include "common/General.h"
#include <string>
#include <vector>
class SettingsInterface;
class SettingsWrapper;

View File

@ -17,8 +17,8 @@
#include "Common.h"
#include "IPU.h"
#include "IPU_MultiISA.h"
#include "IPUdma.h"
#include "yuv2rgb.h"
#include "mpeg2lib/Mpeg.h"
#include <limits.h>
@ -31,7 +31,7 @@ alignas(16) tIPU_cmd ipu_cmd;
alignas(16) tIPU_BP g_BP;
alignas(16) decoder_t decoder;
void IPUWorker();
static void (*IPUWorker)();
// Color conversion stuff, the memory layout is a total hack
// convert_data_buffer is a pointer to the internal rgb struct (the first param in convert_init_t)
@ -40,11 +40,11 @@ void IPUWorker();
//u8 PCT[] = {'r', 'I', 'P', 'B', 'D', '-', '-', '-'}; // unused?
// Quantization matrix
static rgb16_t vqclut[16]; //clut conversion table
static u16 s_thresh[2]; //thresholds for color conversions
rgb16_t g_ipu_vqclut[16]; //clut conversion table
u16 g_ipu_thresh[2]; //thresholds for color conversions
int coded_block_pattern = 0;
alignas(16) static u8 indx4[16*16/2];
alignas(16) u8 g_ipu_indx4[16*16/2];
uint eecount_on_last_vdec = 0;
bool FMVstarted = false;
@ -67,6 +67,7 @@ __fi void IPUProcessInterrupt()
void ipuReset()
{
IPUWorker = MULTI_ISA_SELECT(IPUWorker);
memzero(ipuRegs);
memzero(g_BP);
memzero(decoder);
@ -84,8 +85,8 @@ void ReportIPU()
Console.WriteLn(ipu_fifo.in.desc());
Console.WriteLn(ipu_fifo.out.desc());
Console.WriteLn(g_BP.desc());
Console.WriteLn("vqclut = 0x%x.", vqclut);
Console.WriteLn("s_thresh = 0x%x.", s_thresh);
Console.WriteLn("vqclut = 0x%x.", g_ipu_vqclut);
Console.WriteLn("thresh = 0x%x.", g_ipu_thresh);
Console.WriteLn("coded_block_pattern = 0x%x.", coded_block_pattern);
Console.WriteLn("g_decoder = 0x%x.", &decoder);
Console.WriteLn("mpeg2_scan = 0x%x.", &mpeg2_scan);
@ -101,8 +102,8 @@ void SaveStateBase::ipuFreeze()
Freeze(ipu_fifo);
Freeze(g_BP);
Freeze(vqclut);
Freeze(s_thresh);
Freeze(g_ipu_vqclut);
Freeze(g_ipu_thresh);
Freeze(coded_block_pattern);
Freeze(decoder);
Freeze(ipu_cmd);
@ -408,305 +409,13 @@ static __ri void ipuBDEC(tIPU_CMD_BDEC bdec)
memzero_sse_a(decoder.mb16);
}
static __fi bool ipuVDEC(u32 val)
{
static int count = 0;
if (count++ > 5) {
if (!FMVstarted) {
EnableFMV = true;
FMVstarted = true;
}
count = 0;
}
eecount_on_last_vdec = cpuRegs.cycle;
switch (ipu_cmd.pos[0])
{
case 0:
if (!bitstream_init()) return false;
switch ((val >> 26) & 3)
{
case 0://Macroblock Address Increment
decoder.mpeg1 = ipuRegs.ctrl.MP1;
ipuRegs.cmd.DATA = get_macroblock_address_increment();
break;
case 1://Macroblock Type
decoder.frame_pred_frame_dct = 1;
decoder.coding_type = ipuRegs.ctrl.PCT > 0 ? ipuRegs.ctrl.PCT : 1; // Kaiketsu Zorro Mezase doesn't set a Picture type, seems happy with I
ipuRegs.cmd.DATA = get_macroblock_modes();
break;
case 2://Motion Code
ipuRegs.cmd.DATA = get_motion_delta(0);
break;
case 3://DMVector
ipuRegs.cmd.DATA = get_dmv();
break;
jNO_DEFAULT
}
// HACK ATTACK! This code OR's the MPEG decoder's bitstream position into the upper
// 16 bits of DATA; which really doesn't make sense since (a) we already rewound the bits
// back into the IPU internal buffer above, and (b) the IPU doesn't have an MPEG internal
// 32-bit decoder buffer of its own anyway. Furthermore, setting the upper 16 bits to
// any value other than zero appears to work fine. When set to zero, however, FMVs run
// very choppy (basically only decoding/updating every 30th frame or so). So yeah,
// someone with knowledge on the subject please feel free to explain this one. :) --air
// The upper bits are the "length" of the decoded command, where the lower is the address.
// This is due to differences with IPU and the MPEG standard. See get_macroblock_address_increment().
ipuRegs.ctrl.ECD = (ipuRegs.cmd.DATA == 0);
[[fallthrough]];
case 1:
if (!getBits32((u8*)&ipuRegs.top, 0))
{
ipu_cmd.pos[0] = 1;
return false;
}
ipuRegs.top = BigEndian(ipuRegs.top);
IPU_LOG("VDEC command data 0x%x(0x%x). Skip 0x%X bits/Table=%d (%s), pct %d",
ipuRegs.cmd.DATA, ipuRegs.cmd.DATA >> 16, val & 0x3f, (val >> 26) & 3, (val >> 26) & 1 ?
((val >> 26) & 2 ? "DMV" : "MBT") : (((val >> 26) & 2 ? "MC" : "MBAI")), ipuRegs.ctrl.PCT);
return true;
jNO_DEFAULT
}
return false;
}
static __ri bool ipuFDEC(u32 val)
{
if (!getBits32((u8*)&ipuRegs.cmd.DATA, 0)) return false;
ipuRegs.cmd.DATA = BigEndian(ipuRegs.cmd.DATA);
ipuRegs.top = ipuRegs.cmd.DATA;
IPU_LOG("FDEC read: 0x%08x", ipuRegs.top);
return true;
}
static bool ipuSETIQ(u32 val)
{
if ((val >> 27) & 1)
{
u8 (&niq)[64] = decoder.niq;
for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++)
{
if (!getBits64((u8*)niq + 8 * ipu_cmd.pos[0], 1)) return false;
}
IPU_LOG("Read non-intra quantization matrix from FIFO.");
for (uint i = 0; i < 8; i++)
{
IPU_LOG("%02X %02X %02X %02X %02X %02X %02X %02X",
niq[i * 8 + 0], niq[i * 8 + 1], niq[i * 8 + 2], niq[i * 8 + 3],
niq[i * 8 + 4], niq[i * 8 + 5], niq[i * 8 + 6], niq[i * 8 + 7]);
}
}
else
{
u8 (&iq)[64] = decoder.iq;
for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++)
{
if (!getBits64((u8*)iq + 8 * ipu_cmd.pos[0], 1)) return false;
}
IPU_LOG("Read intra quantization matrix from FIFO.");
for (uint i = 0; i < 8; i++)
{
IPU_LOG("%02X %02X %02X %02X %02X %02X %02X %02X",
iq[i * 8 + 0], iq[i * 8 + 1], iq[i * 8 + 2], iq[i *8 + 3],
iq[i * 8 + 4], iq[i * 8 + 5], iq[i * 8 + 6], iq[i *8 + 7]);
}
}
return true;
}
static bool ipuSETVQ(u32 val)
{
for(;ipu_cmd.pos[0] < 4; ipu_cmd.pos[0]++)
{
if (!getBits64(((u8*)vqclut) + 8 * ipu_cmd.pos[0], 1)) return false;
}
IPU_LOG("SETVQ command. Read VQCLUT table from FIFO.\n"
"%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
"%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
"%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
"%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d",
vqclut[0].r, vqclut[0].g, vqclut[0].b,
vqclut[1].r, vqclut[1].g, vqclut[1].b,
vqclut[2].r, vqclut[2].g, vqclut[2].b,
vqclut[3].r, vqclut[3].g, vqclut[3].b,
vqclut[4].r, vqclut[4].g, vqclut[4].b,
vqclut[5].r, vqclut[5].g, vqclut[5].b,
vqclut[6].r, vqclut[6].g, vqclut[6].b,
vqclut[7].r, vqclut[7].g, vqclut[7].b,
vqclut[8].r, vqclut[8].g, vqclut[8].b,
vqclut[9].r, vqclut[9].g, vqclut[9].b,
vqclut[10].r, vqclut[10].g, vqclut[10].b,
vqclut[11].r, vqclut[11].g, vqclut[11].b,
vqclut[12].r, vqclut[12].g, vqclut[12].b,
vqclut[13].r, vqclut[13].g, vqclut[13].b,
vqclut[14].r, vqclut[14].g, vqclut[14].b,
vqclut[15].r, vqclut[15].g, vqclut[15].b);
return true;
}
// IPU Transfers are split into 8Qwords so we need to send ALL the data
static __ri bool ipuCSC(tIPU_CMD_CSC csc)
{
csc.log_from_YCbCr();
for (;ipu_cmd.index < (int)csc.MBC; ipu_cmd.index++)
{
for(;ipu_cmd.pos[0] < 48; ipu_cmd.pos[0]++)
{
if (!getBits64((u8*)&decoder.mb8 + 8 * ipu_cmd.pos[0], 1)) return false;
}
ipu_csc(decoder.mb8, decoder.rgb32, 0);
if (csc.OFM) ipu_dither(decoder.rgb32, decoder.rgb16, csc.DTE);
if (csc.OFM)
{
ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & decoder.rgb16) + 4 * ipu_cmd.pos[1], 32 - ipu_cmd.pos[1]);
if (ipu_cmd.pos[1] < 32) return false;
}
else
{
ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & decoder.rgb32) + 4 * ipu_cmd.pos[1], 64 - ipu_cmd.pos[1]);
if (ipu_cmd.pos[1] < 64) return false;
}
ipu_cmd.pos[0] = 0;
ipu_cmd.pos[1] = 0;
}
return true;
}
static __ri bool ipuPACK(tIPU_CMD_CSC csc)
{
csc.log_from_RGB32();
for (;ipu_cmd.index < (int)csc.MBC; ipu_cmd.index++)
{
for(;ipu_cmd.pos[0] < (int)sizeof(macroblock_rgb32) / 8; ipu_cmd.pos[0]++)
{
if (!getBits64((u8*)&decoder.rgb32 + 8 * ipu_cmd.pos[0], 1)) return false;
}
ipu_dither(decoder.rgb32, decoder.rgb16, csc.DTE);
if (!csc.OFM) ipu_vq(decoder.rgb16, indx4);
if (csc.OFM)
{
ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & decoder.rgb16) + 4 * ipu_cmd.pos[1], 32 - ipu_cmd.pos[1]);
if (ipu_cmd.pos[1] < 32) return false;
}
else
{
ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*)indx4) + 4 * ipu_cmd.pos[1], 8 - ipu_cmd.pos[1]);
if (ipu_cmd.pos[1] < 8) return false;
}
ipu_cmd.pos[0] = 0;
ipu_cmd.pos[1] = 0;
}
return true;
}
static void ipuSETTH(u32 val)
{
s_thresh[0] = (val & 0x1ff);
s_thresh[1] = ((val >> 16) & 0x1ff);
g_ipu_thresh[0] = (val & 0x1ff);
g_ipu_thresh[1] = ((val >> 16) & 0x1ff);
IPU_LOG("SETTH (Set threshold value)command %x.", val&0x1ff01ff);
}
// --------------------------------------------------------------------------------------
// CORE Functions (referenced from MPEG library)
// --------------------------------------------------------------------------------------
__fi void ipu_csc(macroblock_8& mb8, macroblock_rgb32& rgb32, int sgn)
{
int i;
u8* p = (u8*)&rgb32;
yuv2rgb();
if (s_thresh[0] > 0)
{
for (i = 0; i < 16*16; i++, p += 4)
{
if ((p[0] < s_thresh[0]) && (p[1] < s_thresh[0]) && (p[2] < s_thresh[0]))
*(u32*)p = 0;
else if ((p[0] < s_thresh[1]) && (p[1] < s_thresh[1]) && (p[2] < s_thresh[1]))
p[3] = 0x40;
}
}
else if (s_thresh[1] > 0)
{
for (i = 0; i < 16*16; i++, p += 4)
{
if ((p[0] < s_thresh[1]) && (p[1] < s_thresh[1]) && (p[2] < s_thresh[1]))
p[3] = 0x40;
}
}
if (sgn)
{
for (i = 0; i < 16*16; i++, p += 4)
{
*(u32*)p ^= 0x808080;
}
}
}
__fi void ipu_vq(macroblock_rgb16& rgb16, u8* indx4)
{
const auto closest_index = [&](int i, int j) {
u8 index = 0;
int min_distance = std::numeric_limits<int>::max();
for (u8 k = 0; k < 16; ++k)
{
const int dr = rgb16.c[i][j].r - vqclut[k].r;
const int dg = rgb16.c[i][j].g - vqclut[k].g;
const int db = rgb16.c[i][j].b - vqclut[k].b;
const int distance = dr * dr + dg * dg + db * db;
// XXX: If two distances are the same which index is used?
if (min_distance > distance)
{
index = k;
min_distance = distance;
}
}
return index;
};
for (int i = 0; i < 16; ++i)
for (int j = 0; j < 8; ++j)
indx4[i * 8 + j] = closest_index(i, 2 * j + 1) << 4 | closest_index(i, 2 * j);
}
// --------------------------------------------------------------------------------------
// Buffer reader
// --------------------------------------------------------------------------------------
@ -902,7 +611,7 @@ __fi void IPUCMD_WRITE(u32 val)
break;
jNO_DEFAULT;
}
}
ipuRegs.ctrl.BUSY = 1;
@ -916,71 +625,3 @@ __fi void IPUCMD_WRITE(u32 val)
else
IPUWorker();
}
__noinline void IPUWorker()
{
pxAssert(ipuRegs.ctrl.BUSY);
switch (ipu_cmd.CMD)
{
// These are unreachable (BUSY will always be 0 for them)
//case SCE_IPU_BCLR:
//case SCE_IPU_SETTH:
//break;
case SCE_IPU_IDEC:
if (!mpeg2sliceIDEC()) return;
//ipuRegs.ctrl.OFC = 0;
ipuRegs.topbusy = 0;
ipuRegs.cmd.BUSY = 0;
break;
case SCE_IPU_BDEC:
if (!mpeg2_slice()) return;
ipuRegs.topbusy = 0;
ipuRegs.cmd.BUSY = 0;
//if (ipuRegs.ctrl.SCD || ipuRegs.ctrl.ECD) hwIntcIrq(INTC_IPU);
break;
case SCE_IPU_VDEC:
if (!ipuVDEC(ipu_cmd.current)) return;
ipuRegs.topbusy = 0;
ipuRegs.cmd.BUSY = 0;
break;
case SCE_IPU_FDEC:
if (!ipuFDEC(ipu_cmd.current)) return;
ipuRegs.topbusy = 0;
ipuRegs.cmd.BUSY = 0;
break;
case SCE_IPU_SETIQ:
if (!ipuSETIQ(ipu_cmd.current)) return;
break;
case SCE_IPU_SETVQ:
if (!ipuSETVQ(ipu_cmd.current)) return;
break;
case SCE_IPU_CSC:
if (!ipuCSC(ipu_cmd.current)) return;
break;
case SCE_IPU_PACK:
if (!ipuPACK(ipu_cmd.current)) return;
break;
jNO_DEFAULT
}
// success
IPU_LOG("IPU Command finished");
ipuRegs.ctrl.BUSY = 0;
//ipu_cmd.current = 0xffffffff;
hwIntcIrq(INTC_IPU);
}

View File

@ -288,7 +288,11 @@ union tIPU_cmd
static IPUregisters& ipuRegs = (IPUregisters&)eeHw[0x2000];
extern bool FMVstarted;
extern bool EnableFMV;
alignas(16) extern tIPU_cmd ipu_cmd;
extern uint eecount_on_last_vdec;
extern int coded_block_pattern;
extern bool CommandExecuteQueued;

387
pcsx2/IPU/IPU_MultiISA.cpp Normal file
View File

@ -0,0 +1,387 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2022 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "IPU_MultiISA.h"
#include "IPU.h"
#include "IPUdma.h"
#include "yuv2rgb.h"
MULTI_ISA_UNSHARED_START
//////////////////////////////////////////////////////
// IPU Commands (exec on worker thread only)
// Executes the VDEC command: decodes one variable-length code from the bitstream into
// ipuRegs.cmd.DATA, then loads the following 32 bits into ipuRegs.top.
//
// val: the raw command word; bits 26-27 select which VLC table is decoded.
// Returns true when the command has completed, false when it must be resumed later
// (bitstream_init() or getBits32() could not be satisfied yet).
//
// ipu_cmd.pos[0] is the resume point: 0 = nothing decoded yet, 1 = the code has been
// decoded and only the read of ipuRegs.top is still pending.
static __fi bool ipuVDEC(u32 val)
{
// FMV detection heuristic: once this function has been entered more than 5 times since the
// counter was last cleared, flag an FMV as started (only if not flagged already).
static int count = 0;
if (count++ > 5) {
if (!FMVstarted) {
EnableFMV = true;
FMVstarted = true;
}
count = 0;
}
// Remember the EE cycle count of the most recent VDEC.
eecount_on_last_vdec = cpuRegs.cycle;
switch (ipu_cmd.pos[0])
{
case 0:
if (!bitstream_init()) return false;
// Bits 26-27 of the command choose the table to decode with.
switch ((val >> 26) & 3)
{
case 0://Macroblock Address Increment
decoder.mpeg1 = ipuRegs.ctrl.MP1;
ipuRegs.cmd.DATA = get_macroblock_address_increment();
break;
case 1://Macroblock Type
decoder.frame_pred_frame_dct = 1;
decoder.coding_type = ipuRegs.ctrl.PCT > 0 ? ipuRegs.ctrl.PCT : 1; // Kaiketsu Zorro Mezase doesn't set a Picture type, seems happy with I
ipuRegs.cmd.DATA = get_macroblock_modes();
break;
case 2://Motion Code
ipuRegs.cmd.DATA = get_motion_delta(0);
break;
case 3://DMVector
ipuRegs.cmd.DATA = get_dmv();
break;
jNO_DEFAULT
}
// HACK ATTACK! This code OR's the MPEG decoder's bitstream position into the upper
// 16 bits of DATA; which really doesn't make sense since (a) we already rewound the bits
// back into the IPU internal buffer above, and (b) the IPU doesn't have an MPEG internal
// 32-bit decoder buffer of its own anyway. Furthermore, setting the upper 16 bits to
// any value other than zero appears to work fine. When set to zero, however, FMVs run
// very choppy (basically only decoding/updating every 30th frame or so). So yeah,
// someone with knowledge on the subject please feel free to explain this one. :) --air
// The upper bits are the "length" of the decoded command, where the lower is the address.
// This is due to differences with IPU and the MPEG standard. See get_macroblock_address_increment().
// A decoded value of 0 raises the ECD flag in the control register.
ipuRegs.ctrl.ECD = (ipuRegs.cmd.DATA == 0);
[[fallthrough]];
case 1:
// NOTE(review): the second argument (0) presumably means "peek, do not consume" — confirm against getBits32().
if (!getBits32((u8*)&ipuRegs.top, 0))
{
// Not enough input yet: remember that decoding is done so a retry resumes here.
ipu_cmd.pos[0] = 1;
return false;
}
ipuRegs.top = BigEndian(ipuRegs.top);
IPU_LOG("VDEC command data 0x%x(0x%x). Skip 0x%X bits/Table=%d (%s), pct %d",
ipuRegs.cmd.DATA, ipuRegs.cmd.DATA >> 16, val & 0x3f, (val >> 26) & 3, (val >> 26) & 1 ?
((val >> 26) & 2 ? "DMV" : "MBT") : (((val >> 26) & 2 ? "MC" : "MBAI")), ipuRegs.ctrl.PCT);
return true;
jNO_DEFAULT
}
return false;
}
// Executes the FDEC command: fetches 32 bits from the input stream into ipuRegs.cmd.DATA,
// byte-swaps them and mirrors the result into ipuRegs.top.
// val is unused. Returns false when getBits32() cannot deliver the bits yet, true otherwise.
static __ri bool ipuFDEC(u32 val)
{
	if (getBits32((u8*)&ipuRegs.cmd.DATA, 0))
	{
		const auto swapped = BigEndian(ipuRegs.cmd.DATA);
		ipuRegs.cmd.DATA = swapped;
		ipuRegs.top = ipuRegs.cmd.DATA;

		IPU_LOG("FDEC read: 0x%08x", ipuRegs.top);
		return true;
	}

	return false;
}
// Executes the SETIQ command: fills one of the decoder's 64-byte quantization matrices
// from the input FIFO, 8 bytes at a time.
// Bit 27 of val selects the target: set = non-intra matrix (decoder.niq), clear = intra
// matrix (decoder.iq).
// ipu_cmd.pos[0] counts the 8-byte rows already received, so the command can be resumed.
// Returns false while input is still missing, true once all 8 rows have been read.
static bool ipuSETIQ(u32 val)
{
	const bool non_intra = ((val >> 27) & 1) != 0;
	u8 (&matrix)[64] = non_intra ? decoder.niq : decoder.iq;

	while (ipu_cmd.pos[0] < 8)
	{
		if (!getBits64((u8*)matrix + 8 * ipu_cmd.pos[0], 1))
			return false;
		ipu_cmd.pos[0]++;
	}

	if (non_intra)
	{
		IPU_LOG("Read non-intra quantization matrix from FIFO.");
	}
	else
	{
		IPU_LOG("Read intra quantization matrix from FIFO.");
	}

	// Dump the matrix one row of 8 coefficients per log line.
	for (uint row = 0; row < 8; row++)
	{
		const u8* const coeff = &matrix[row * 8];
		IPU_LOG("%02X %02X %02X %02X %02X %02X %02X %02X",
			coeff[0], coeff[1], coeff[2], coeff[3],
			coeff[4], coeff[5], coeff[6], coeff[7]);
	}

	return true;
}
// Executes the SETVQ command: loads the 16-entry VQ colour lookup table (g_ipu_vqclut)
// from the input FIFO in four 8-byte reads.
// ipu_cmd.pos[0] counts the reads already completed, so the command can be resumed.
// val is unused. Returns false while input is still missing, true once the table is loaded.
static bool ipuSETVQ(u32 val)
{
	u8* const clut_bytes = (u8*)g_ipu_vqclut;

	while (ipu_cmd.pos[0] < 4)
	{
		if (!getBits64(clut_bytes + 8 * ipu_cmd.pos[0], 1))
			return false;
		ipu_cmd.pos[0]++;
	}

	IPU_LOG("SETVQ command. Read VQCLUT table from FIFO.\n"
		"%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
		"%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
		"%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
		"%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d",
		g_ipu_vqclut[ 0].r, g_ipu_vqclut[ 0].g, g_ipu_vqclut[ 0].b,
		g_ipu_vqclut[ 1].r, g_ipu_vqclut[ 1].g, g_ipu_vqclut[ 1].b,
		g_ipu_vqclut[ 2].r, g_ipu_vqclut[ 2].g, g_ipu_vqclut[ 2].b,
		g_ipu_vqclut[ 3].r, g_ipu_vqclut[ 3].g, g_ipu_vqclut[ 3].b,
		g_ipu_vqclut[ 4].r, g_ipu_vqclut[ 4].g, g_ipu_vqclut[ 4].b,
		g_ipu_vqclut[ 5].r, g_ipu_vqclut[ 5].g, g_ipu_vqclut[ 5].b,
		g_ipu_vqclut[ 6].r, g_ipu_vqclut[ 6].g, g_ipu_vqclut[ 6].b,
		g_ipu_vqclut[ 7].r, g_ipu_vqclut[ 7].g, g_ipu_vqclut[ 7].b,
		g_ipu_vqclut[ 8].r, g_ipu_vqclut[ 8].g, g_ipu_vqclut[ 8].b,
		g_ipu_vqclut[ 9].r, g_ipu_vqclut[ 9].g, g_ipu_vqclut[ 9].b,
		g_ipu_vqclut[10].r, g_ipu_vqclut[10].g, g_ipu_vqclut[10].b,
		g_ipu_vqclut[11].r, g_ipu_vqclut[11].g, g_ipu_vqclut[11].b,
		g_ipu_vqclut[12].r, g_ipu_vqclut[12].g, g_ipu_vqclut[12].b,
		g_ipu_vqclut[13].r, g_ipu_vqclut[13].g, g_ipu_vqclut[13].b,
		g_ipu_vqclut[14].r, g_ipu_vqclut[14].g, g_ipu_vqclut[14].b,
		g_ipu_vqclut[15].r, g_ipu_vqclut[15].g, g_ipu_vqclut[15].b);

	return true;
}
// Executes the CSC command: converts csc.MBC macroblocks from YCbCr to RGB and pushes them
// to the output FIFO, as RGB16 (dithered) when csc.OFM is set or RGB32 otherwise.
//
// IPU transfers are split into 8 qwords, so the command may run out of input or output
// space part-way through. Progress is kept in ipu_cmd so a later call resumes:
//   ipu_cmd.index  - macroblocks fully processed
//   ipu_cmd.pos[0] - 8-byte input units read for the current macroblock (48 per block)
//   ipu_cmd.pos[1] - output units written for the current macroblock (32 or 64 per block)
// Returns false when it has to wait for more input/output room, true when all blocks are done.
static __ri bool ipuCSC(tIPU_CMD_CSC csc)
{
	csc.log_from_YCbCr();

	// Output buffer and its size depend only on the requested output format.
	u32* const out_words = csc.OFM ? (u32*)&decoder.rgb16 : (u32*)&decoder.rgb32;
	const int out_units = csc.OFM ? 32 : 64;

	while (ipu_cmd.index < (int)csc.MBC)
	{
		// Gather the source macroblock.
		while (ipu_cmd.pos[0] < 48)
		{
			if (!getBits64((u8*)&decoder.mb8 + 8 * ipu_cmd.pos[0], 1))
				return false;
			ipu_cmd.pos[0]++;
		}

		ipu_csc(decoder.mb8, decoder.rgb32, 0);
		if (csc.OFM)
			ipu_dither(decoder.rgb32, decoder.rgb16, csc.DTE);

		// Send as much as the FIFO accepts; bail out if the block is not fully sent yet.
		ipu_cmd.pos[1] += ipu_fifo.out.write(out_words + 4 * ipu_cmd.pos[1], out_units - ipu_cmd.pos[1]);
		if (ipu_cmd.pos[1] < out_units)
			return false;

		// Block complete: reset the per-block cursors and move on.
		ipu_cmd.pos[0] = 0;
		ipu_cmd.pos[1] = 0;
		ipu_cmd.index++;
	}

	return true;
}
// Executes the PACK command: reads csc.MBC RGB32 macroblocks from the input FIFO, dithers
// them to RGB16 and outputs either the RGB16 data (csc.OFM set) or 4-bit VQ indices
// computed by ipu_vq() into g_ipu_indx4 (csc.OFM clear).
//
// The command is resumable; ipu_cmd.index / pos[0] / pos[1] hold the number of finished
// macroblocks, the 8-byte input units read and the output units written for the current one.
// Returns false when it has to wait for more input/output room, true when all blocks are done.
static __ri bool ipuPACK(tIPU_CMD_CSC csc)
{
	csc.log_from_RGB32();

	const int in_units = (int)sizeof(macroblock_rgb32) / 8;
	u32* const out_words = csc.OFM ? (u32*)&decoder.rgb16 : (u32*)g_ipu_indx4;
	const int out_units = csc.OFM ? 32 : 8;

	while (ipu_cmd.index < (int)csc.MBC)
	{
		// Gather the source macroblock.
		while (ipu_cmd.pos[0] < in_units)
		{
			if (!getBits64((u8*)&decoder.rgb32 + 8 * ipu_cmd.pos[0], 1))
				return false;
			ipu_cmd.pos[0]++;
		}

		ipu_dither(decoder.rgb32, decoder.rgb16, csc.DTE);
		if (!csc.OFM)
			ipu_vq(decoder.rgb16, g_ipu_indx4);

		// Send as much as the FIFO accepts; bail out if the block is not fully sent yet.
		ipu_cmd.pos[1] += ipu_fifo.out.write(out_words + 4 * ipu_cmd.pos[1], out_units - ipu_cmd.pos[1]);
		if (ipu_cmd.pos[1] < out_units)
			return false;

		// Block complete: reset the per-block cursors and move on.
		ipu_cmd.pos[0] = 0;
		ipu_cmd.pos[1] = 0;
		ipu_cmd.index++;
	}

	return true;
}
// --------------------------------------------------------------------------------------
// CORE Functions (referenced from MPEG library)
// --------------------------------------------------------------------------------------
// Colour-space converts the current macroblock and post-processes the 16x16 RGB32 result.
//
// mb8:   source macroblock (not read here directly; yuv2rgb() takes no arguments —
//        NOTE(review): presumably it works on the global decoder buffers, confirm).
// rgb32: destination pixels, treated as 16*16 entries of 4 bytes each.
// sgn:   when non-zero, the low three bytes of every pixel are XORed with 0x80.
//
// Threshold handling (g_ipu_thresh is set by SETTH):
//   - thresh[0] > 0: pixels whose first three bytes are all below thresh[0] are zeroed;
//     otherwise pixels with all three below thresh[1] get byte 3 set to 0x40.
//   - else thresh[1] > 0: only the second rule is applied.
//
// Each pass walks the macroblock from its first pixel. The previous version reused one
// running pointer, so the sgn pass started where the threshold pass ended and wrote past
// the end of rgb32 instead of touching the pixels.
__fi void ipu_csc(macroblock_8& mb8, macroblock_rgb32& rgb32, int sgn)
{
	constexpr int pixel_count = 16 * 16;
	u8* const pixels = (u8*)&rgb32;

	yuv2rgb();

	if (g_ipu_thresh[0] > 0)
	{
		u8* p = pixels;
		for (int i = 0; i < pixel_count; i++, p += 4)
		{
			if ((p[0] < g_ipu_thresh[0]) && (p[1] < g_ipu_thresh[0]) && (p[2] < g_ipu_thresh[0]))
				*(u32*)p = 0;
			else if ((p[0] < g_ipu_thresh[1]) && (p[1] < g_ipu_thresh[1]) && (p[2] < g_ipu_thresh[1]))
				p[3] = 0x40;
		}
	}
	else if (g_ipu_thresh[1] > 0)
	{
		u8* p = pixels;
		for (int i = 0; i < pixel_count; i++, p += 4)
		{
			if ((p[0] < g_ipu_thresh[1]) && (p[1] < g_ipu_thresh[1]) && (p[2] < g_ipu_thresh[1]))
				p[3] = 0x40;
		}
	}

	if (sgn)
	{
		// Restart at the first pixel regardless of whether a threshold pass ran above.
		u8* p = pixels;
		for (int i = 0; i < pixel_count; i++, p += 4)
		{
			*(u32*)p ^= 0x808080;
		}
	}
}
// Vector-quantizes a 16x16 RGB16 macroblock against the 16-entry table g_ipu_vqclut.
//
// rgb16: source pixels, addressed as rgb16.c[row][column].
// indx4: output buffer of 16*8 bytes; each byte packs two 4-bit table indices, the even
//        column in the low nibble and the following odd column in the high nibble.
//
// The chosen entry minimises the squared RGB distance. The comparison is strict, so when
// several entries are equally close the lowest index is kept.
// XXX: whether that tie-break is the right one is an open question carried over from the
// original code.
__fi void ipu_vq(macroblock_rgb16& rgb16, u8* indx4)
{
	const auto nearest_entry = [](const auto& pixel) -> u8 {
		u8 best = 0;
		int best_distance = std::numeric_limits<int>::max();

		for (u8 entry = 0; entry < 16; ++entry)
		{
			const int dr = pixel.r - g_ipu_vqclut[entry].r;
			const int dg = pixel.g - g_ipu_vqclut[entry].g;
			const int db = pixel.b - g_ipu_vqclut[entry].b;
			const int distance = dr * dr + dg * dg + db * db;

			if (distance < best_distance)
			{
				best_distance = distance;
				best = entry;
			}
		}

		return best;
	};

	for (int row = 0; row < 16; ++row)
	{
		for (int pair = 0; pair < 8; ++pair)
		{
			const u8 low = nearest_entry(rgb16.c[row][2 * pair]);
			const u8 high = nearest_entry(rgb16.c[row][2 * pair + 1]);
			indx4[row * 8 + pair] = high << 4 | low;
		}
	}
}
// Runs (or resumes) the IPU command currently stored in ipu_cmd.
//
// Must only be called while ipuRegs.ctrl.BUSY is set (asserted below). Each case calls the
// handler for ipu_cmd.CMD; if the handler reports that it could not finish, the function
// returns immediately and leaves BUSY set so the command can be resumed by a later call.
// When the handler completes, BUSY is cleared and the IPU interrupt is raised.
// The decode commands (IDEC, BDEC, VDEC, FDEC) additionally clear topbusy and cmd.BUSY.
__noinline void IPUWorker()
{
pxAssert(ipuRegs.ctrl.BUSY);
switch (ipu_cmd.CMD)
{
// These are unreachable (BUSY will always be 0 for them)
//case SCE_IPU_BCLR:
//case SCE_IPU_SETTH:
//break;
case SCE_IPU_IDEC:
if (!mpeg2sliceIDEC()) return;
//ipuRegs.ctrl.OFC = 0;
ipuRegs.topbusy = 0;
ipuRegs.cmd.BUSY = 0;
break;
case SCE_IPU_BDEC:
if (!mpeg2_slice()) return;
ipuRegs.topbusy = 0;
ipuRegs.cmd.BUSY = 0;
//if (ipuRegs.ctrl.SCD || ipuRegs.ctrl.ECD) hwIntcIrq(INTC_IPU);
break;
case SCE_IPU_VDEC:
if (!ipuVDEC(ipu_cmd.current)) return;
ipuRegs.topbusy = 0;
ipuRegs.cmd.BUSY = 0;
break;
case SCE_IPU_FDEC:
if (!ipuFDEC(ipu_cmd.current)) return;
ipuRegs.topbusy = 0;
ipuRegs.cmd.BUSY = 0;
break;
// The remaining commands only report completion; they do not touch topbusy / cmd.BUSY.
case SCE_IPU_SETIQ:
if (!ipuSETIQ(ipu_cmd.current)) return;
break;
case SCE_IPU_SETVQ:
if (!ipuSETVQ(ipu_cmd.current)) return;
break;
case SCE_IPU_CSC:
if (!ipuCSC(ipu_cmd.current)) return;
break;
case SCE_IPU_PACK:
if (!ipuPACK(ipu_cmd.current)) return;
break;
jNO_DEFAULT
}
// success
IPU_LOG("IPU Command finished");
ipuRegs.ctrl.BUSY = 0;
//ipu_cmd.current = 0xffffffff;
hwIntcIrq(INTC_IPU);
}
MULTI_ISA_UNSHARED_END

27
pcsx2/IPU/IPU_MultiISA.h Normal file
View File

@ -0,0 +1,27 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2022 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "GS/MultiISA.h"
#include "mpeg2lib/Mpeg.h"
// Entry point that executes/resumes the current IPU command. ipuReset() picks the
// implementation to call with MULTI_ISA_SELECT(IPUWorker).
// NOTE(review): MULTI_ISA_DEF presumably repeats the declaration for every ISA variant —
// confirm against GS/MultiISA.h.
MULTI_ISA_DEF(void IPUWorker();)
// State shared with the per-ISA IPU code:
// g_ipu_vqclut - 16-entry colour table loaded by SETVQ and searched by ipu_vq()
// g_ipu_thresh - the two thresholds stored by SETTH and applied by ipu_csc()
// g_ipu_indx4  - VQ index output of PACK, two 4-bit indices per byte for a 16x16 block
extern rgb16_t g_ipu_vqclut[16]; //clut conversion table
extern u16 g_ipu_thresh[2]; //thresholds for color conversions
alignas(16) extern u8 g_ipu_indx4[16*16/2];

View File

@ -21,6 +21,10 @@
#include "yuv2rgb.h"
#include "mpeg2lib/Mpeg.h"
#include "GS/MultiISA.h"
MULTI_ISA_UNSHARED_START
void ipu_dither_reference(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte);
void ipu_dither_sse2(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte);
@ -120,3 +124,5 @@ __ri void ipu_dither_sse2(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16
}
}
}
MULTI_ISA_UNSHARED_END

View File

@ -32,6 +32,8 @@
#include "IPU/IPU.h"
#include "Mpeg.h"
#include <array>
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
@ -45,9 +47,11 @@
* to +-3826 - this is the worst case for a column IDCT where the
* column inputs are 16-bit values.
*/
alignas(16) static u8 clip_lut[1024];
alignas(16) extern const std::array<u8, 1024> g_idct_clip_lut;
#define CLIP(i) ((clip_lut+384)[(i)])
#define CLIP(i) ((g_idct_clip_lut.data()+384)[(i)])
MULTI_ISA_UNSHARED_START
static __fi void BUTTERFLY(int& t0, int& t1, int w0, int w1, int d0, int d1)
{
@ -219,9 +223,21 @@ __ri void mpeg2_idct_add (const int last, s16 * block, s16 * dest, const int str
}
}
mpeg2_scan_pack::mpeg2_scan_pack()
MULTI_ISA_UNSHARED_END
#if MULTI_ISA_COMPILE_ONCE
static constexpr std::array<u8, 1024> make_clip_lut()
{
static const u8 mpeg2_scan_norm[64] = {
std::array<u8, 1024> lut = {};
for (int i = -384; i < 640; i++)
lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
return lut;
}
static constexpr mpeg2_scan_pack make_scan_pack()
{
constexpr u8 mpeg2_scan_norm[64] = {
/* Zig-Zag scan pattern */
0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5,
12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28,
@ -229,7 +245,7 @@ mpeg2_scan_pack::mpeg2_scan_pack()
58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
};
static const u8 mpeg2_scan_alt[64] = {
constexpr u8 mpeg2_scan_alt[64] = {
/* Alternate scan pattern */
0, 8, 16, 24, 1, 9, 2, 10, 17, 25, 32, 40, 48, 56, 57, 49,
41, 33, 26, 18, 3, 11, 4, 12, 19, 27, 34, 42, 50, 58, 35, 43,
@ -237,15 +253,19 @@ mpeg2_scan_pack::mpeg2_scan_pack()
53, 61, 22, 30, 7, 15, 23, 31, 38, 46, 54, 62, 39, 47, 55, 63
};
for (int i = -384; i < 640; i++)
clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
mpeg2_scan_pack pack = {};
for (int i = 0; i < 64; i++) {
int j = mpeg2_scan_norm[i];
norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
pack.norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
j = mpeg2_scan_alt[i];
alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
pack.alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
}
return pack;
}
alignas(16) const mpeg2_scan_pack mpeg2_scan;
alignas(16) constexpr std::array<u8, 1024> g_idct_clip_lut = make_clip_lut();
alignas(16) constexpr mpeg2_scan_pack mpeg2_scan = make_scan_pack();
#endif

View File

@ -33,8 +33,12 @@
#include "Mpeg.h"
#include "Vlc.h"
#include "GS/MultiISA.h"
#include "common/MemsetFast.inl"
#if MULTI_ISA_COMPILE_ONCE
const int non_linear_quantizer_scale [] =
{
0, 1, 2, 3, 4, 5, 6, 7,
@ -43,6 +47,10 @@ const int non_linear_quantizer_scale [] =
56, 64, 72, 80, 88, 96, 104, 112
};
#endif
MULTI_ISA_UNSHARED_START
/* Bitstream and buffer needs to be reallocated in order for successful
reading of the old data. Here the old data stored in the 2nd slot
of the internal buffer is copied to 1st slot, and the new data read
@ -1272,4 +1280,6 @@ __fi bool mpeg2_slice()
}
return true;
}
}
MULTI_ISA_UNSHARED_END

View File

@ -24,6 +24,12 @@
#pragma once
#include "IPU/IPU.h"
#include "GS/MultiISA.h"
#include "common/Assertions.h"
// the IPU is fixed to 16 byte strides (128-bit / QWC resolution):
static const uint decoder_stride = 16;
@ -184,30 +190,31 @@ struct mpeg2_scan_pack
{
u8 norm[64];
u8 alt[64];
mpeg2_scan_pack();
};
extern int bitstream_init ();
extern u32 UBITS(uint bits);
extern s32 SBITS(uint bits);
extern void mpeg2_idct_copy(s16 * block, u8* dest, int stride);
extern void mpeg2_idct_add(int last, s16 * block, s16* dest, int stride);
MULTI_ISA_DEF(
extern int bitstream_init();
extern bool mpeg2sliceIDEC();
extern bool mpeg2_slice();
extern int get_macroblock_address_increment();
extern int get_macroblock_modes();
extern void mpeg2_idct_copy(s16 * block, u8* dest, int stride);
extern void mpeg2_idct_add(int last, s16 * block, s16* dest, int stride);
extern int get_motion_delta(const int f_code);
extern int get_dmv();
extern bool mpeg2sliceIDEC();
extern bool mpeg2_slice();
extern int get_macroblock_address_increment();
extern int get_macroblock_modes();
extern void ipu_csc(macroblock_8& mb8, macroblock_rgb32& rgb32, int sgn);
extern void ipu_dither(const macroblock_rgb32& rgb32, macroblock_rgb16& rgb16, int dte);
extern void ipu_vq(macroblock_rgb16& rgb16, u8* indx4);
extern int get_motion_delta(const int f_code);
extern int get_dmv();
extern int slice (u8 * buffer);
extern void ipu_csc(macroblock_8& mb8, macroblock_rgb32& rgb32, int sgn);
extern void ipu_dither(const macroblock_rgb32& rgb32, macroblock_rgb16& rgb16, int dte);
extern void ipu_vq(macroblock_rgb16& rgb16, u8* indx4);
extern int slice (u8 * buffer);
)
#ifdef _MSC_VER
#define BigEndian(in) _byteswap_ulong(in)

View File

@ -36,6 +36,8 @@
#define IPU_RCR_COEFF 0xcc // 1.59375
#define IPU_BCB_COEFF 0x102 // 2.015625
MULTI_ISA_UNSHARED_START
// conforming implementation for reference, do not optimise
void yuv2rgb_reference(void)
{
@ -149,3 +151,5 @@ __ri void yuv2rgb_sse2()
}
}
}
MULTI_ISA_UNSHARED_END

View File

@ -15,7 +15,9 @@
#pragma once
extern void yuv2rgb_reference();
#include "GS/MultiISA.h"
MULTI_ISA_DEF(extern void yuv2rgb_reference();)
#define yuv2rgb yuv2rgb_sse2
extern void yuv2rgb_sse2();
MULTI_ISA_DEF(extern void yuv2rgb_sse2();)

View File

@ -576,6 +576,7 @@
<ClCompile Include="CDVD\CDVDisoReader.cpp" />
<ClCompile Include="Ipu\IPU.cpp" />
<ClCompile Include="Ipu\IPU_Fifo.cpp" />
<ClCompile Include="Ipu\IPU_MultiISA.cpp" />
<ClCompile Include="Ipu\yuv2rgb.cpp" />
<ClCompile Include="Ipu\mpeg2lib\Idct.cpp" />
<ClCompile Include="Ipu\mpeg2lib\Mpeg.cpp" />
@ -1017,6 +1018,7 @@
<ClInclude Include="CDVD\CDVDisoReader.h" />
<ClInclude Include="Ipu\IPU.h" />
<ClInclude Include="Ipu\IPU_Fifo.h" />
<ClInclude Include="Ipu\IPU_MultiISA.h" />
<ClInclude Include="Ipu\yuv2rgb.h" />
<ClInclude Include="Ipu\mpeg2lib\Mpeg.h" />
<ClInclude Include="Ipu\mpeg2lib\Vlc.h" />

View File

@ -692,6 +692,9 @@
<ClCompile Include="IPU\IPU_Fifo.cpp">
<Filter>System\Ps2\IPU</Filter>
</ClCompile>
<ClCompile Include="IPU\IPU_MultiISA.cpp">
<Filter>System\Ps2\IPU</Filter>
</ClCompile>
<ClCompile Include="IPU\yuv2rgb.cpp">
<Filter>System\Ps2\IPU</Filter>
</ClCompile>
@ -2011,6 +2014,9 @@
<ClInclude Include="IPU\IPU_Fifo.h">
<Filter>System\Ps2\IPU</Filter>
</ClInclude>
<ClInclude Include="IPU\IPU_MultiISA.h">
<Filter>System\Ps2\IPU</Filter>
</ClInclude>
<ClInclude Include="IPU\yuv2rgb.h">
<Filter>System\Ps2\IPU</Filter>
</ClInclude>

View File

@ -430,6 +430,7 @@
<ClCompile Include="CDVD\CDVDisoReader.cpp" />
<ClCompile Include="Ipu\IPU.cpp" />
<ClCompile Include="Ipu\IPU_Fifo.cpp" />
<ClCompile Include="Ipu\IPU_MultiISA.cpp" />
<ClCompile Include="Ipu\yuv2rgb.cpp" />
<ClCompile Include="Ipu\mpeg2lib\Idct.cpp" />
<ClCompile Include="Ipu\mpeg2lib\Mpeg.cpp" />
@ -739,6 +740,7 @@
<ClInclude Include="CDVD\CDVDcommon.h" />
<ClInclude Include="Ipu\IPU.h" />
<ClInclude Include="Ipu\IPU_Fifo.h" />
<ClInclude Include="Ipu\IPU_MultiISA.h" />
<ClInclude Include="Ipu\yuv2rgb.h" />
<ClInclude Include="Ipu\mpeg2lib\Mpeg.h" />
<ClInclude Include="Ipu\mpeg2lib\Vlc.h" />

View File

@ -617,6 +617,9 @@
<ClCompile Include="IPU\IPU_Fifo.cpp">
<Filter>System\Ps2\IPU</Filter>
</ClCompile>
<ClCompile Include="IPU\IPU_MultiISA.cpp">
<Filter>System\Ps2\IPU</Filter>
</ClCompile>
<ClCompile Include="IPU\yuv2rgb.cpp">
<Filter>System\Ps2\IPU</Filter>
</ClCompile>
@ -1532,6 +1535,9 @@
<ClInclude Include="IPU\IPU_Fifo.h">
<Filter>System\Ps2\IPU</Filter>
</ClInclude>
<ClInclude Include="IPU\IPU_MultiISA.h">
<Filter>System\Ps2\IPU</Filter>
</ClInclude>
<ClInclude Include="IPU\yuv2rgb.h">
<Filter>System\Ps2\IPU</Filter>
</ClInclude>