Update to v106r28 release.

byuu says:

Changelog:

  - SNES: started on skeleton of the new parallel PPU core

To build the new PPU core, set profile=fast via GNU make. The old core
is profile=accurate.

The names of the profiles, and the name of the folder for the fast PPU
are subject to change.

The new PPU core doesn't do anything but demonstrate the proof of
concept: every scanline, make a copy of all the PPU registers and CGRAM.
Share the VRAM and OAM. Batch render all scanlines at once using OpenMP
at the end of each frame and blit the result.

With no PPU core at all, bsnes runs 91% faster than with the accuracy
PPU (230fps vs 120fps.) That's the absolute theoretical best-case
scenario. With the skeleton in place, we're already around 220fps. It'll
go down more as the PPU line renderer starts to do real work. I don't
know where things will end up yet. I suppose we'll find out in time.

My own copy of TDM/GCC can't use OpenMP on Windows, so ... it won't
parallelize if you build with that. I'm going to have to switch to a
different MinGW distribution once this is complete, I suppose.
This commit is contained in:
Tim Allen 2018-05-26 13:29:14 +10:00
parent 8f5bc80f01
commit 6c8e3c885d
15 changed files with 373 additions and 32 deletions

View File

@ -3,9 +3,11 @@ include ../nall/GNUmakefile
binary := application
target := higan
profile := accurate
objects := libco emulator audio video resource
flags += -I. -I..
flags += $(if $(call streq,$(profile),accurate),-DPROFILE_ACCURATE,-DPROFILE_FAST)
ifeq ($(platform),windows)
ifeq ($(binary),application)

View File

@ -12,7 +12,7 @@ using namespace nall;
namespace Emulator {
static const string Name = "higan";
static const string Version = "106.27";
static const string Version = "106.28";
static const string Author = "byuu";
static const string License = "GPLv3";
static const string Website = "https://byuu.org/";

View File

@ -2,7 +2,7 @@ processors += wdc65816 spc700 arm7tdmi gsu hg51b upd96050
objects += sfc-interface sfc-system sfc-controller
objects += sfc-cartridge sfc-memory
objects += sfc-cpu sfc-smp sfc-dsp sfc-ppu
objects += sfc-cpu sfc-smp sfc-dsp $(if $(call streq,$(profile),accurate),sfc-ppu,sfc-ppu-fast)
objects += sfc-expansion sfc-satellaview sfc-21fx
objects += sfc-icd sfc-mcc sfc-dip sfc-event
objects += sfc-sa1 sfc-superfx
@ -22,6 +22,7 @@ obj/sfc-cpu.o: sfc/cpu/cpu.cpp
obj/sfc-smp.o: sfc/smp/smp.cpp
obj/sfc-dsp.o: sfc/dsp/dsp.cpp
obj/sfc-ppu.o: sfc/ppu/ppu.cpp
obj/sfc-ppu-fast.o: sfc/ppu-fast/ppu.cpp
obj/sfc-expansion.o: sfc/expansion/expansion.cpp
obj/sfc-satellaview.o: sfc/expansion/satellaview/satellaview.cpp

76
higan/sfc/ppu-fast/io.cpp Normal file
View File

@ -0,0 +1,76 @@
auto PPU::readIO(uint24 address, uint8 data) -> uint8 {
cpu.synchronize(ppu);
switch((uint16)address) {
case 0x2104: case 0x2105: case 0x2106: case 0x2108:
case 0x2109: case 0x210a: case 0x2114: case 0x2115:
case 0x2116: case 0x2118: case 0x2119: case 0x211a:
case 0x2124: case 0x2125: case 0x2126: case 0x2128:
case 0x2129: case 0x212a: {
return ppu1.mdr;
}
case 0x2134: { //MPYL
uint24 result = (int16)io.m7a * (int8)(io.m7b >> 8);
return ppu1.mdr = result.byte(0);
}
case 0x2135: { //MPYM
uint24 result = (int16)io.m7a * (int8)(io.m7b >> 8);
return ppu1.mdr = result.byte(1);
}
case 0x2136: { //MPYH
uint24 result = (int16)io.m7a * (int8)(io.m7b >> 8);
return ppu1.mdr = result.byte(2);
}
}
return data;
}
auto PPU::writeIO(uint24 address, uint8 data) -> void {
cpu.synchronize(ppu);
switch((uint16)address) {
case 0x211b: { //M7A
io.m7a = data << 8 | latch.mode7;
latch.mode7 = data;
return;
}
case 0x211c: { //M7B
io.m7b = data << 8 | latch.mode7;
latch.mode7 = data;
return;
}
case 0x211d: { //M7C
io.m7c = data << 8 | latch.mode7;
latch.mode7 = data;
return;
}
case 0x211e: { //M7D
io.m7d = data << 8 | latch.mode7;
latch.mode7 = data;
return;
}
case 0x211f: { //M7X
io.m7x = data << 8 | latch.mode7;
latch.mode7 = data;
return;
}
case 0x2120: { //M7Y
io.m7y = data << 8 | latch.mode7;
latch.mode7 = data;
return;
}
}
}

View File

@ -0,0 +1,6 @@
auto PPU::Line::render() -> void {
for(uint x : range(512)) {
outputLo[x] = 0x7ffff;
outputHi[x] = 0x7ffff;
}
}

View File

@ -0,0 +1,52 @@
auto PPU::readOAM(uint10 address) -> uint8 {
if(!address.bit(9)) {
uint n = address >> 2; //object#
address &= 3;
if(address == 0) return object[n].x.bits(0,7);
if(address == 1) return object[n].y;
if(address == 2) return object[n].character;
return (
object[n].nameselect << 0
| object[n].palette << 1
| object[n].priority << 4
| object[n].hflip << 6
| object[n].vflip << 7
);
} else {
uint n = (address & 0x1f) << 2; //object#
return (
object[n + 0].x.bit(8) << 0
| object[n + 0].size << 1
| object[n + 1].x.bit(8) << 2
| object[n + 1].size << 3
| object[n + 2].x.bit(8) << 4
| object[n + 2].size << 5
| object[n + 3].x.bit(8) << 6
| object[n + 3].size << 7
);
}
}
auto PPU::writeOAM(uint10 address, uint8 data) -> void {
if(!address.bit(9)) {
uint n = address >> 2; //object#
if(address == 0) { object[n].x.bits(0,7) = data; return; }
if(address == 1) { object[n].y = data; return; }
if(address == 2) { object[n].character = data; return; }
object[n].nameselect = data.bit (0);
object[n].palette = data.bits(1,3);
object[n].priority = data.bits(4,5);
object[n].hflip = data.bit (6);
object[n].vflip = data.bit (7);
} else {
uint n = (address & 0x1f) << 2; //object#
object[n + 0].x.bit(8) = data.bit(0);
object[n + 0].size = data.bit(1);
object[n + 1].x.bit(8) = data.bit(2);
object[n + 1].size = data.bit(3);
object[n + 2].x.bit(8) = data.bit(4);
object[n + 2].size = data.bit(5);
object[n + 3].x.bit(8) = data.bit(6);
object[n + 3].size = data.bit(7);
}
}

View File

@ -0,0 +1,92 @@
#include <sfc/sfc.hpp>
namespace SuperFamicom {
PPU ppu;
#include "io.cpp"
#include "object.cpp"
#include "line.cpp"
#include "serialization.cpp"
#include <sfc/ppu/counter/serialization.cpp>
PPU::PPU() {
output = new uint32[512 * 512];
output += 16 * 512; //overscan offset
for(uint y : range(240)) {
lines[y].y = y;
lines[y].outputLo = output + (y * 2 + 0) * 512;
lines[y].outputHi = output + (y * 2 + 1) * 512;
}
}
PPU::~PPU() {
output -= 16 * 512; //overscan offset
delete[] output;
}
auto PPU::Enter() -> void {
while(true) scheduler.synchronize(), ppu.main();
}
auto PPU::step(uint clocks) -> void {
tick(clocks);
Thread::step(clocks);
synchronize(cpu);
}
auto PPU::main() -> void {
scanline();
uint y = vcounter();
step(512);
if(y >= 1 && y <= vdisp()) {
memory::copy(&lines[y].cgram, &cgram, sizeof(cgram));
memory::copy(&lines[y].io, &io, sizeof(io));
}
step(624);
step(lineclocks() - 512 - 624);
}
auto PPU::scanline() -> void {
if(vcounter() == 0) {
frame();
}
if(vcounter() == 241) {
#pragma omp parallel for
for(uint y = 1; y < vdisp(); y++) {
lines[y].render();
}
scheduler.exit(Scheduler::Event::Frame);
}
}
auto PPU::frame() -> void {
}
auto PPU::refresh() -> void {
auto output = this->output;
if(!overscan()) output -= 14 * 512;
auto pitch = 512;
auto width = 512;
auto height = 480;
Emulator::video.refresh(output, pitch * sizeof(uint32), width, height);
}
auto PPU::load(Markup::Node node) -> bool {
return true;
}
auto PPU::power(bool reset) -> void {
create(Enter, system.cpuFrequency());
PPUcounter::reset();
memory::fill(output, 512 * 480 * sizeof(uint32));
function<auto (uint24, uint8) -> uint8> reader{&PPU::readIO, this};
function<auto (uint24, uint8) -> void> writer{&PPU::writeIO, this};
bus.map(reader, writer, "00-3f,80-bf:2100-213f");
}
}

104
higan/sfc/ppu-fast/ppu.hpp Normal file
View File

@ -0,0 +1,104 @@
struct PPU : Thread, PPUcounter {
alwaysinline auto interlace() const -> bool { return false; }
alwaysinline auto overscan() const -> bool { return false; }
alwaysinline auto vdisp() const -> uint { return 225; }
//ppu.cpp
PPU();
~PPU();
static auto Enter() -> void;
alwaysinline auto step(uint clocks) -> void;
auto main() -> void;
auto scanline() -> void;
auto frame() -> void;
auto refresh() -> void;
auto load(Markup::Node) -> bool;
auto power(bool reset) -> void;
auto latchCounters() -> void {}
//serialization.cpp
auto serialize(serializer&) -> void;
public:
uint32* output = nullptr;
uint16 vram[32 * 1024];
uint16 cgram[256];
struct {
uint4 version;
uint8 mdr;
} ppu1, ppu2;
struct Latch {
uint8 mode7;
} latch;
enum : uint {
INIDISP = 0x00, OBSEL = 0x01, OAMADDL = 0x02, OAMADDH = 0x03,
OAMDATA = 0x04, BGMODE = 0x05, MOSAIC = 0x06, BG1SC = 0x07,
BG2SC = 0x08, BG3SC = 0x09, BG4SC = 0x0a, BG12NBA = 0x0b,
BG34NBA = 0x0c, BG1HOFS = 0x0d, BG1VOFS = 0x0e, BG2HOFS = 0x0f,
BG2VOFS = 0x10, BG3HOFS = 0x11, BG3VOFS = 0x12, BG4HOFS = 0x13,
BG4VOFS = 0x14, VMAIN = 0x15, VMADDL = 0x16, VMADDH = 0x17,
VMDATAL = 0x18, VMDATAH = 0x19, M7SEL = 0x1a, M7A = 0x1b,
M7B = 0x1c, M7C = 0x1d, M7D = 0x1e, M7X = 0x1f,
M7Y = 0x20, CGADD = 0x21, CGDATA = 0x22, W12SEL = 0x23,
W34SEL = 0x24, WOBJSEL = 0x25, WH0 = 0x26, WH1 = 0x27,
WH2 = 0x28, WH3 = 0x29, WBGLOG = 0x2a, WOBJLOG = 0x2b,
TM = 0x2c, TS = 0x2d, TMW = 0x2e, TSW = 0x2f,
CGWSEL = 0x30, CGADDSUB = 0x31, COLDATA = 0x32, SETINI = 0x33,
MPYL = 0x34, MPYM = 0x35, MPYH = 0x36, SLHV = 0x37,
OAMDATAREAD = 0x38, VMDATALREAD = 0x39, VMDATAHREAD = 0x3a, CGDATAREAD = 0x3b,
OPHCT = 0x3c, OPVCT = 0x3d, STAT77 = 0x3e, STAT78 = 0x3f,
};
//io.cpp
auto readIO(uint24 address, uint8 data) -> uint8;
auto writeIO(uint24 address, uint8 data) -> void;
struct IO {
uint16 m7a;
uint16 m7b;
uint16 m7c;
uint16 m7d;
uint16 m7x;
uint16 m7y;
} io;
//object.cpp
auto readOAM(uint10 address) -> uint8;
auto writeOAM(uint10 address, uint8 data) -> void;
struct Object {
uint9 x;
uint8 y;
uint8 character;
uint1 nameselect;
uint1 vflip;
uint1 hflip;
uint2 priority;
uint3 palette;
uint1 size;
} object[128];
//bitplane -> bitmap tile caches
uint8 vram2bpp[4096 * 64];
uint8 vram4bpp[2048 * 64];
uint8 vram8bpp[1024 * 64];
struct Line {
//line.cpp
auto render() -> void;
uint y = 0;
uint32* outputLo = nullptr;
uint32* outputHi = nullptr;
uint15 cgram[256];
IO io;
} lines[240];
};
extern PPU ppu;

View File

@ -0,0 +1,4 @@
auto PPU::serialize(serializer& s) -> void {
Thread::serialize(s);
PPUcounter::serialize(s);
}

View File

@ -0,0 +1,11 @@
auto PPUcounter::serialize(serializer& s) -> void {
s.integer(status.interlace);
s.integer(status.field);
s.integer(status.vcounter);
s.integer(status.hcounter);
s.array(history.field);
s.array(history.vcounter);
s.array(history.hcounter);
s.integer(history.index);
}

View File

@ -1,10 +1,10 @@
auto PPU::OAM::read(uint10 addr) -> uint8 {
if(!addr.bit(9)) {
uint n = addr >> 2; //object#
addr &= 3;
if(addr == 0) return object[n].x.bits(0,7);
if(addr == 1) return object[n].y;
if(addr == 2) return object[n].character;
auto PPU::OAM::read(uint10 address) -> uint8 {
if(!address.bit(9)) {
uint n = address >> 2; //object#
address &= 3;
if(address == 0) return object[n].x.bits(0,7);
if(address == 1) return object[n].y;
if(address == 2) return object[n].character;
return (
object[n].nameselect << 0
| object[n].palette << 1
@ -13,7 +13,7 @@ auto PPU::OAM::read(uint10 addr) -> uint8 {
| object[n].vflip << 7
);
} else {
uint n = (addr & 0x1f) << 2; //object#
uint n = (address & 0x1f) << 2; //object#
return (
object[n + 0].x.bit(8) << 0
| object[n + 0].size << 1
@ -27,20 +27,20 @@ auto PPU::OAM::read(uint10 addr) -> uint8 {
}
}
auto PPU::OAM::write(uint10 addr, uint8 data) -> void {
if(!addr.bit(9)) {
uint n = addr >> 2; //object#
addr &= 3;
if(addr == 0) { object[n].x.bits(0,7) = data; return; }
if(addr == 1) { object[n].y = data; return; }
if(addr == 2) { object[n].character = data; return; }
auto PPU::OAM::write(uint10 address, uint8 data) -> void {
if(!address.bit(9)) {
uint n = address >> 2; //object#
address &= 3;
if(address == 0) { object[n].x.bits(0,7) = data; return; }
if(address == 1) { object[n].y = data; return; }
if(address == 2) { object[n].character = data; return; }
object[n].nameselect = data.bit (0);
object[n].palette = data.bits(1,3);
object[n].priority = data.bits(4,5);
object[n].hflip = data.bit (6);
object[n].vflip = data.bit (7);
} else {
uint n = (addr & 0x1f) << 2; //object#
uint n = (address & 0x1f) << 2; //object#
object[n + 0].x.bit(8) = data.bit(0);
object[n + 0].size = data.bit(1);
object[n + 1].x.bit(8) = data.bit(2);

View File

@ -1,6 +1,6 @@
struct OAM {
auto read(uint10 addr) -> uint8;
auto write(uint10 addr, uint8 data) -> void;
auto read(uint10 address) -> uint8;
auto write(uint10 address, uint8 data) -> void;
struct Object {
alwaysinline auto width() const -> uint;

View File

@ -10,6 +10,7 @@ PPU ppu;
#include "window/window.cpp"
#include "screen/screen.cpp"
#include "serialization.cpp"
#include "counter/serialization.cpp"
PPU::PPU() :
bg1(Background::ID::BG1),

View File

@ -1,15 +1,3 @@
auto PPUcounter::serialize(serializer& s) -> void {
s.integer(status.interlace);
s.integer(status.field);
s.integer(status.vcounter);
s.integer(status.hcounter);
s.array(history.field);
s.array(history.vcounter);
s.array(history.hcounter);
s.integer(history.index);
}
auto PPU::serialize(serializer& s) -> void {
Thread::serialize(s);
PPUcounter::serialize(s);

View File

@ -52,7 +52,11 @@ namespace SuperFamicom {
#include <sfc/cpu/cpu.hpp>
#include <sfc/smp/smp.hpp>
#include <sfc/dsp/dsp.hpp>
#if defined(PROFILE_ACCURATE)
#include <sfc/ppu/ppu.hpp>
#elif defined(PROFILE_FAST)
#include <sfc/ppu-fast/ppu.hpp>
#endif
#include <sfc/controller/controller.hpp>
#include <sfc/expansion/expansion.hpp>