Merge branch 'master' into vulkan

Triang3l 2021-06-08 12:15:34 +03:00
commit 6cd9d42fd0
607 changed files with 452347 additions and 127070 deletions

2
.gitattributes vendored
View File

@ -7,3 +7,5 @@
*.csproj text eol=crlf -whitespace merge=union
*.vcxproj text eol=crlf -whitespace merge=union
*.props text eol=crlf -whitespace merge=union
src/**/shaders/bytecode/** linguist-generated=true

8
.gitmodules vendored
View File

@ -7,9 +7,6 @@
[submodule "third_party/binutils-ppc-cygwin"]
path = third_party/binutils-ppc-cygwin
url = https://github.com/benvanik/binutils-ppc-cygwin.git
[submodule "third_party/libav"]
path = third_party/libav
url = https://github.com/xenia-project/libav.git
[submodule "third_party/catch"]
path = third_party/catch
url = https://github.com/catchorg/Catch2.git
@ -42,7 +39,7 @@
url = https://github.com/jarro2783/cxxopts.git
[submodule "third_party/SDL2"]
path = third_party/SDL2
url = https://github.com/spurious/SDL-mirror.git
url = https://github.com/JoelLinn/SDL.git
[submodule "third_party/utfcpp"]
path = third_party/utfcpp
url = https://github.com/xenia-project/utfcpp.git
@ -67,6 +64,9 @@
[submodule "third_party/xxhash"]
path = third_party/xxhash
url = https://github.com/Cyan4973/xxHash.git
[submodule "third_party/FFmpeg"]
path = third_party/FFmpeg
url = https://github.com/xenia-project/FFmpeg.git
[submodule "third_party/glslang"]
path = third_party/glslang
url = https://github.com/KhronosGroup/glslang.git

View File

@ -61,7 +61,7 @@ Fixes and optimizations are always welcome (please!), but in addition to
that there are some major work areas still untouched:
* Help work through [missing functionality/bugs in games](https://github.com/xenia-project/xenia/labels/compat)
* Add input drivers for [DualShock4 (PS4) controllers](https://github.com/xenia-project/xenia/issues/60) (or anything else)
* Add input drivers for [third-party controllers](https://github.com/xenia-project/xenia/issues/1333)
* Skilled with Linux? A strong contributor is needed to [help with porting](https://github.com/xenia-project/xenia/labels/platform-linux)
See more projects [good for contributors](https://github.com/xenia-project/xenia/labels/good%20first%20issue). It's a good idea to ask on Discord and check the issues page before beginning work on something.

View File

@ -42,7 +42,6 @@ end
characterset("Unicode")
flags({
--"ExtraWarnings", -- Sets the compiler's maximum warning level.
"FatalWarnings", -- Treat warnings as errors.
})
@ -100,8 +99,8 @@ filter("platforms:Linux")
toolset("clang")
buildoptions({
-- "-mlzcnt", -- (don't) Assume lzcnt is supported.
({os.outputof("pkg-config --cflags gtk+-x11-3.0")})[1],
})
pkg_config.all("gtk+-x11-3.0")
links({
"stdc++fs",
"dl",
@ -109,16 +108,11 @@ filter("platforms:Linux")
"pthread",
"rt",
})
linkoptions({
({os.outputof("pkg-config --libs gtk+-3.0")})[1],
})
filter({"platforms:Linux", "kind:*App"})
linkgroups("On")
filter({"platforms:Linux", "language:C++", "toolset:gcc"})
links({
})
disablewarnings({
"unused-result"
})
@ -136,10 +130,6 @@ filter({"platforms:Linux", "toolset:gcc"})
end
filter({"platforms:Linux", "language:C++", "toolset:clang"})
links({
"c++",
"c++abi"
})
disablewarnings({
"deprecated-register"
})
@ -196,6 +186,7 @@ filter("platforms:Windows")
"shcore",
"shlwapi",
"dxguid",
"bcrypt",
})
-- Create scratch/ path
@ -203,7 +194,7 @@ if not os.isdir("scratch") then
os.mkdir("scratch")
end
solution("xenia")
workspace("xenia")
uuid("931ef4b0-6170-4f7a-aaf2-0fece7632747")
startproject("xenia-app")
if os.istarget("android") then
@ -233,14 +224,32 @@ solution("xenia")
include("third_party/discord-rpc.lua")
include("third_party/cxxopts.lua")
include("third_party/cpptoml.lua")
include("third_party/FFmpeg/premake5.lua")
include("third_party/fmt.lua")
include("third_party/glslang-spirv.lua")
include("third_party/imgui.lua")
include("third_party/libav.lua")
include("third_party/mspack.lua")
include("third_party/snappy.lua")
include("third_party/xxhash.lua")
if not os.istarget("android") then
-- SDL2 requires sdl2-config, and as of November 2020 isn't high-quality on
-- Android yet, most importantly in game controllers - the keycode and axis
-- enums are being ruined during conversion to SDL2 enums resulting in only
-- one controller (Nvidia Shield) being supported, digital triggers are also
-- not supported; lifecycle management (especially surface loss) is also
-- complicated.
include("third_party/SDL2.lua")
end
-- Disable treating warnings as fatal errors for all third party projects:
for _, prj in ipairs(premake.api.scope.current.solution.projects) do
project(prj.name)
removeflags({
"FatalWarnings",
})
end
include("src/xenia")
include("src/xenia/app/discord")
include("src/xenia/apu")
@ -260,14 +269,6 @@ solution("xenia")
include("src/xenia/vfs")
if not os.istarget("android") then
-- SDL2 requires sdl2-config, and as of November 2020 isn't high-quality on
-- Android yet, most importantly in game controllers - the keycode and axis
-- enums are being ruined during conversion to SDL2 enums resulting in only
-- one controller (Nvidia Shield) being supported, digital triggers are also
-- not supported; lifecycle management (especially surface loss) is also
-- complicated.
include("third_party/SDL2.lua")
include("src/xenia/apu/sdl")
include("src/xenia/helper/sdl")
include("src/xenia/hid/sdl")

View File

@ -430,31 +430,46 @@ void EmulatorWindow::ShowCommitID() {
}
void EmulatorWindow::UpdateTitle() {
std::string title(base_title_);
xe::StringBuffer sb;
sb.Append(base_title_);
// Title information, if available
if (emulator()->is_title_open()) {
auto game_title = emulator()->game_title();
title += fmt::format(" | [{:08X}] {}", emulator()->title_id(), game_title);
sb.AppendFormat(u8" | [{:08X}", emulator()->title_id());
auto title_version = emulator()->title_version();
if (!title_version.empty()) {
sb.Append(u8" v");
sb.Append(title_version);
}
sb.Append(u8"]");
auto title_name = emulator()->title_name();
if (!title_name.empty()) {
sb.Append(u8" ");
sb.Append(title_name);
}
}
// Graphics system name, if available
auto graphics_system = emulator()->graphics_system();
if (graphics_system) {
auto graphics_name = graphics_system->name();
title += fmt::format(" <{}>", graphics_name);
if (!graphics_name.empty()) {
sb.Append(u8" <");
sb.Append(graphics_name);
sb.Append(u8">");
}
}
if (Clock::guest_time_scalar() != 1.0) {
title += fmt::format(" (@{:.2f}x)", Clock::guest_time_scalar());
sb.AppendFormat(u8" (@{:.2f}x)", Clock::guest_time_scalar());
}
if (initializing_shader_storage_) {
title +=
" (Preloading shaders"
u8"\u2026"
")";
sb.Append(u8" (Preloading shaders\u2026)");
}
window_->set_title(title);
window_->set_title(sb.to_string_view());
}
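
A minimal sketch of what the rewritten UpdateTitle() now assembles, with hypothetical title data (the StringBuffer calls mirror the ones above):

xe::StringBuffer sb;
sb.Append("xenia");                            // base_title_
sb.AppendFormat(u8" | [{:08X}", 0x4D5307E6u);  // title_id (hypothetical)
sb.Append(u8" v");
sb.Append("1.0.1");                            // title_version (hypothetical)
sb.Append(u8"]");
sb.Append(u8" ");
sb.Append("Some Game");                        // title_name (hypothetical)
// sb now holds: "xenia | [4D5307E6 v1.0.1] Some Game"; the graphics system
// name and time scalar are appended after this in the same way.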
void EmulatorWindow::SetInitializingShaderStorage(bool initializing) {

View File

@ -223,7 +223,7 @@ void AudioSystem::UnregisterClient(size_t index) {
}
bool AudioSystem::Save(ByteStream* stream) {
stream->Write('XAUD');
stream->Write(kAudioSaveSignature);
// Count the number of used clients first.
// Any gaps should be handled gracefully.
@ -251,7 +251,7 @@ bool AudioSystem::Save(ByteStream* stream) {
}
bool AudioSystem::Restore(ByteStream* stream) {
if (stream->Read<uint32_t>() != 'XAUD') {
if (stream->Read<uint32_t>() != kAudioSaveSignature) {
XELOGE("AudioSystem::Restore - Invalid magic value!");
return false;
}

View File

@ -23,6 +23,8 @@
namespace xe {
namespace apu {
constexpr fourcc_t kAudioSaveSignature = make_fourcc("XAUD");
class AudioDriver;
class XmaDecoder;

View File

@ -14,6 +14,6 @@ project("xenia-apu")
defines({
})
includedirs({
project_root.."/third_party/libav/",
project_root.."/third_party/FFmpeg/",
})
local_platform_files()

File diff suppressed because it is too large

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2015 Ben Vanik. All rights reserved. *
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -10,10 +10,11 @@
#ifndef XENIA_APU_XMA_CONTEXT_H_
#define XENIA_APU_XMA_CONTEXT_H_
#include <array>
#include <atomic>
#include <mutex>
#include <queue>
#include <vector>
//#include <vector>
#include "xenia/memory.h"
#include "xenia/xbox.h"
@ -30,6 +31,7 @@
// Forward declarations
struct AVCodec;
struct AVCodecParserContext;
struct AVCodecContext;
struct AVFrame;
struct AVPacket;
@ -121,29 +123,29 @@ struct XMA_CONTEXT_DATA {
static_assert_size(XMA_CONTEXT_DATA, 64);
#pragma pack(push, 1)
struct WmaProExtraData {
uint16_t bits_per_sample;
uint32_t channel_mask;
uint8_t unk06[8];
uint16_t decode_flags;
uint8_t unk10[2];
// XMA2WAVEFORMATEX
struct Xma2ExtraData {
uint8_t raw[34];
};
static_assert_size(WmaProExtraData, 18);
static_assert_size(Xma2ExtraData, 34);
#pragma pack(pop)
class XmaContext {
public:
static const uint32_t kBytesPerPacket = 2048;
static const uint32_t kBitsPerPacket = kBytesPerPacket * 8;
static const uint32_t kBitsPerHeader = 33;
static const uint32_t kBytesPerSample = 2;
static const uint32_t kSamplesPerFrame = 512;
static const uint32_t kSamplesPerSubframe = 128;
static const uint32_t kBytesPerFrame = kSamplesPerFrame * kBytesPerSample;
static const uint32_t kBytesPerSubframe =
static const uint32_t kBytesPerFrameChannel =
kSamplesPerFrame * kBytesPerSample;
static const uint32_t kBytesPerSubframeChannel =
kSamplesPerSubframe * kBytesPerSample;
static const uint32_t kOutputBytesPerBlock = 256;
static const uint32_t kOutputMaxSizeBytes = 31 * kOutputBytesPerBlock;
// static const uint32_t kOutputBytesPerBlock = 256;
// static const uint32_t kOutputMaxSizeBytes = 31 * kOutputBytesPerBlock;
explicit XmaContext();
~XmaContext();
@ -168,28 +170,29 @@ class XmaContext {
void set_is_enabled(bool is_enabled) { is_enabled_ = is_enabled; }
private:
static void SwapInputBuffer(XMA_CONTEXT_DATA* data);
static void NextPacket(XMA_CONTEXT_DATA* data);
static int GetSampleRate(int id);
// Get the offset of the next frame. Does not traverse packets.
static size_t GetNextFrame(uint8_t* block, size_t size, size_t bit_offset);
// Get the containing packet number of the frame pointed to by the offset.
static int GetFramePacketNumber(uint8_t* block, size_t size,
size_t bit_offset);
// Get the packet number and the index of the frame inside that packet
static std::tuple<int, int> GetFrameNumber(uint8_t* block, size_t size,
size_t bit_offset);
// Get the number of frames contained in the packet (including truncated) and
// if the last frame is split.
static std::tuple<int, bool> GetPacketFrameCount(uint8_t* packet);
// Convert sample format and swap bytes
static void ConvertFrame(const uint8_t** samples, bool is_two_channel,
uint8_t* output_buffer);
size_t SavePartial(uint8_t* packet, uint32_t frame_offset_bits,
size_t frame_size_bits, bool append);
bool ValidFrameOffset(uint8_t* block, size_t size_bytes,
size_t frame_offset_bits);
void DecodePackets(XMA_CONTEXT_DATA* data);
uint32_t GetFramePacketNumber(uint8_t* block, size_t size, size_t bit_offset);
int PrepareDecoder(uint8_t* block, size_t size, int sample_rate,
int channels);
bool ConvertFrame(const uint8_t** samples, int num_channels, int num_samples,
uint8_t* output_buffer);
int StartPacket(XMA_CONTEXT_DATA* data);
int PreparePacket(uint8_t* input, size_t seq_offset, size_t size,
int sample_rate, int channels);
void DiscardPacket();
int DecodePacket(uint8_t* output, size_t offset, size_t size,
size_t* read_bytes);
void Decode(XMA_CONTEXT_DATA* data);
int PrepareDecoder(uint8_t* packet, int sample_rate, bool is_two_channel);
Memory* memory_ = nullptr;
@ -198,22 +201,35 @@ class XmaContext {
std::mutex lock_;
bool is_allocated_ = false;
bool is_enabled_ = false;
// bool is_dirty_ = true;
// libav structures
AVCodec* codec_ = nullptr;
AVCodecContext* context_ = nullptr;
AVFrame* decoded_frame_ = nullptr;
AVPacket* packet_ = nullptr;
WmaProExtraData extra_data_;
// ffmpeg structures
AVPacket* av_packet_ = nullptr;
AVCodec* av_codec_ = nullptr;
AVCodecContext* av_context_ = nullptr;
AVFrame* av_frame_ = nullptr;
// uint32_t decoded_consumed_samples_ = 0; // TODO do this dynamically
// int decoded_idx_ = -1;
bool partial_frame_saved_ = false;
bool partial_frame_size_known_ = false;
size_t partial_frame_total_size_bits_ = 0;
size_t partial_frame_start_offset_bits_ = 0;
size_t partial_frame_offset_bits_ = 0; // blah internal don't use this
std::vector<uint8_t> partial_frame_buffer_;
// bool partial_frame_saved_ = false;
// bool partial_frame_size_known_ = false;
// size_t partial_frame_total_size_bits_ = 0;
// size_t partial_frame_start_offset_bits_ = 0;
// size_t partial_frame_offset_bits_ = 0; // blah internal don't use this
// std::vector<uint8_t> partial_frame_buffer_;
uint32_t packets_skip_ = 0;
uint8_t* current_frame_ = nullptr;
// bool split_frame_pending_ = false;
uint32_t split_frame_len_ = 0;
uint32_t split_frame_len_partial_ = 0;
uint8_t split_frame_padding_start_ = 0;
// first byte contains bit offset information
std::array<uint8_t, 1 + 4096> xma_frame_;
// uint8_t* current_frame_ = nullptr;
// conversion buffer for 2 channel frame
std::array<uint8_t, kBytesPerFrameChannel * 2> raw_frame_;
// std::vector<uint8_t> current_frame_ = std::vector<uint8_t>(0);
};
} // namespace apu

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2013 Ben Vanik. All rights reserved. *
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -21,7 +21,7 @@
#include "xenia/kernel/xthread.h"
extern "C" {
#include "third_party/libav/libavutil/log.h"
#include "third_party/FFmpeg/libavutil/log.h"
} // extern "C"
// As with normal Microsoft, there are like twelve different ways to access
@ -48,7 +48,7 @@ extern "C" {
// do this, it's likely they are either passing the context to XAudio or
// using the XMA* functions.
DEFINE_bool(libav_verbose, false, "Verbose libav output (debug and above)",
DEFINE_bool(ffmpeg_verbose, false, "Verbose FFmpeg output (debug and above)",
"APU");
namespace xe {
@ -60,7 +60,7 @@ XmaDecoder::XmaDecoder(cpu::Processor* processor)
XmaDecoder::~XmaDecoder() = default;
void av_log_callback(void* avcl, int level, const char* fmt, va_list va) {
if (!cvars::libav_verbose && level > AV_LOG_WARNING) {
if (!cvars::ffmpeg_verbose && level > AV_LOG_WARNING) {
return;
}
@ -101,12 +101,12 @@ void av_log_callback(void* avcl, int level, const char* fmt, va_list va) {
StringBuffer buff;
buff.AppendVarargs(fmt, va);
xe::logging::AppendLogLineFormat(log_level, level_char, "libav: {}",
xe::logging::AppendLogLineFormat(log_level, level_char, "ffmpeg: {}",
buff.to_string_view());
}
X_STATUS XmaDecoder::Setup(kernel::KernelState* kernel_state) {
// Setup libav logging callback
// Setup ffmpeg logging callback
av_log_set_callback(av_log_callback);
// Let the processor know we want register access callbacks.
@ -277,10 +277,10 @@ uint32_t XmaDecoder::ReadRegister(uint32_t addr) {
default: {
const auto register_info = register_file_.GetRegisterInfo(r);
if (register_info) {
XELOGE("XMA: Read from unhandled register ({:04X}, {})", r,
XELOGW("XMA: Read from unhandled register ({:04X}, {})", r,
register_info->name);
} else {
XELOGE("XMA: Read from unknown register ({:04X})", r);
XELOGW("XMA: Read from unknown register ({:04X})", r);
}
break;
}
@ -348,10 +348,10 @@ void XmaDecoder::WriteRegister(uint32_t addr, uint32_t value) {
default: {
const auto register_info = register_file_.GetRegisterInfo(r);
if (register_info) {
XELOGE("XMA: Write to unhandled register ({:04X}, {}): {:08X}", r,
XELOGW("XMA: Write to unhandled register ({:04X}, {}): {:08X}", r,
register_info->name, value);
} else {
XELOGE("XMA: Write to unknown register ({:04X}): {:08X}", r, value);
XELOGW("XMA: Write to unknown register ({:04X}): {:08X}", r, value);
}
break;
}

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2015 Ben Vanik. All rights reserved. *
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -18,6 +18,8 @@ namespace xe {
namespace apu {
namespace xma {
static const uint32_t kMaxFrameLength = 0x7FFF;
// Get number of frames that /begin/ in this packet.
uint32_t GetPacketFrameCount(uint8_t* packet) {
return (uint8_t)(packet[0] >> 2);
@ -27,11 +29,12 @@ uint32_t GetPacketFrameCount(uint8_t* packet) {
uint32_t GetPacketFrameOffset(uint8_t* packet) {
uint32_t val = (uint16_t)(((packet[0] & 0x3) << 13) | (packet[1] << 5) |
(packet[2] >> 3));
if (val == 0x7FFF) {
return -1;
} else {
return val + 32;
}
// if (val > kBitsPerPacket - kBitsPerHeader) {
// // There is no data in this packet
// return -1;
// } else {
return val + 32;
// }
}
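
A worked example of the bit packing read above, using hypothetical packet bytes:

// packet[0] = 0x09, packet[1] = 0x23, packet[2] = 0x48 (hypothetical)
// val = ((0x09 & 0x3) << 13) | (0x23 << 5) | (0x48 >> 3)
//     = 0x2000 | 0x0460 | 0x0009
//     = 0x2469 (9321)
// GetPacketFrameOffset() returns 9321 + 32 = 9353, i.e. the first frame
// begins 9353 bits into the packet, past the 32-bit packet header.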
uint32_t GetPacketMetadata(uint8_t* packet) {

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2013 Ben Vanik. All rights reserved. *
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -13,6 +13,7 @@
#include <memory>
#include "xenia/base/assert.h"
#include "xenia/base/math.h"
namespace xe {
@ -45,12 +46,25 @@ void Arena::DebugFill() {
}
}
void* Arena::Alloc(size_t size) {
void* Arena::Alloc(size_t size, size_t align) {
assert_true(
xe::bit_count(align) == 1 && align <= 16,
"align needs to be a power of 2 and not greater than Chunk alignment");
// for alignment
const auto get_padding = [this, align]() -> size_t {
const size_t mask = align - 1;
size_t deviation = active_chunk_->offset & mask;
return (align - deviation) & mask;
};
if (active_chunk_) {
if (active_chunk_->capacity - active_chunk_->offset < size + 4096) {
if (active_chunk_->capacity - active_chunk_->offset <
size + get_padding() + 4096) {
Chunk* next = active_chunk_->next;
if (!next) {
assert_true(size < chunk_size_, "need to support larger chunks");
assert_true(size + get_padding() < chunk_size_,
"need to support larger chunks");
next = new Chunk(chunk_size_);
active_chunk_->next = next;
}
@ -61,8 +75,11 @@ void* Arena::Alloc(size_t size) {
head_chunk_ = active_chunk_ = new Chunk(chunk_size_);
}
active_chunk_->offset += get_padding();
uint8_t* p = active_chunk_->buffer + active_chunk_->offset;
active_chunk_->offset += size;
assert_true((reinterpret_cast<size_t>(p) & (align - 1)) == 0,
"alignment failed");
return p;
}
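
A self-contained sketch of the padding arithmetic used by the get_padding lambda above; for a power-of-two align it advances the offset to the next aligned boundary:

#include <cassert>
#include <cstddef>

size_t padding_for(size_t offset, size_t align) {
  const size_t mask = align - 1;      // e.g. align = 16 -> mask = 0xF
  size_t deviation = offset & mask;   // distance past the previous boundary
  return (align - deviation) & mask;  // 0 when already aligned
}

int main() {
  assert(padding_for(0, 16) == 0);
  assert(padding_for(5, 16) == 11);  // 5 + 11 == 16
  assert(padding_for(32, 8) == 0);
  return 0;
}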
@ -113,6 +130,8 @@ void Arena::CloneContents(void* buffer, size_t buffer_length) {
Arena::Chunk::Chunk(size_t chunk_size)
: next(nullptr), capacity(chunk_size), buffer(0), offset(0) {
buffer = reinterpret_cast<uint8_t*>(malloc(capacity));
assert_true((reinterpret_cast<size_t>(buffer) & size_t(15)) == 0,
"16 byte alignment required");
}
Arena::Chunk::~Chunk() {

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2013 Ben Vanik. All rights reserved. *
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -24,11 +24,13 @@ class Arena {
void Reset();
void DebugFill();
void* Alloc(size_t size);
void* Alloc(size_t size, size_t align);
template <typename T>
T* Alloc() {
return reinterpret_cast<T*>(Alloc(sizeof(T)));
return reinterpret_cast<T*>(Alloc(sizeof(T), alignof(T)));
}
// When rewinding aligned allocations, any padding that was applied during
// allocation will be leaked
void Rewind(size_t size);
void* CloneContents();

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2015 Ben Vanik. All rights reserved. *
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -111,6 +111,8 @@ size_t BitStream::Copy(uint8_t* dest_buffer, size_t num_bits) {
// First: Copy the first few bits up to a byte boundary.
if (rel_offset_bits) {
uint64_t bits = Peek(8 - rel_offset_bits);
uint8_t clear_mask = ~((uint8_t(1) << rel_offset_bits) - 1);
dest_buffer[out_offset_bytes] &= clear_mask;
dest_buffer[out_offset_bytes] |= (uint8_t)bits;
bits_left -= 8 - rel_offset_bits;
@ -132,6 +134,8 @@ size_t BitStream::Copy(uint8_t* dest_buffer, size_t num_bits) {
uint64_t bits = Peek(bits_left);
bits <<= 8 - bits_left;
uint8_t clear_mask = ((uint8_t(1) << bits_left) - 1);
dest_buffer[out_offset_bytes] &= clear_mask;
dest_buffer[out_offset_bytes] |= (uint8_t)bits;
Advance(bits_left);
}
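
The clear_mask lines added here matter because Copy() ORs into the destination buffer; a generic sketch of why masking first is required:

uint8_t dest = 0xFF;                            // byte holding stale data
uint8_t bits = 0x05;                            // 3 valid bits: 0b101
uint8_t clear_mask = ~((uint8_t(1) << 3) - 1);  // 0xF8
dest &= clear_mask;  // 0xF8: stale low bits cleared
dest |= bits;        // 0xFD: low 3 bits are now exactly 0b101
// Without the &=, the |= alone would have left dest at 0xFF.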

View File

@ -11,103 +11,113 @@
#define XENIA_BASE_BYTE_ORDER_H_
#include <cstdint>
#if defined __has_include
#if __has_include(<version>)
#include <version>
#endif
#endif
#if __cpp_lib_endian
#include <bit>
#endif
#include "xenia/base/assert.h"
#include "xenia/base/platform.h"
#if XE_PLATFORM_LINUX
#include <byteswap.h>
#if !__cpp_lib_endian
// Polyfill
#ifdef __BYTE_ORDER__
namespace std {
enum class endian {
little = __ORDER_LITTLE_ENDIAN__,
big = __ORDER_BIG_ENDIAN__,
native = __BYTE_ORDER__
};
}
#else
// Hardcode to little endian for now
namespace std {
enum class endian { little = 0, big = 1, native = 0 };
}
#endif
#endif
// Check for mixed endian
static_assert((std::endian::native == std::endian::big) ||
(std::endian::native == std::endian::little));
namespace xe {
#if XE_PLATFORM_WIN32
#if XE_COMPILER_MSVC
#define XENIA_BASE_BYTE_SWAP_16 _byteswap_ushort
#define XENIA_BASE_BYTE_SWAP_32 _byteswap_ulong
#define XENIA_BASE_BYTE_SWAP_64 _byteswap_uint64
#elif XE_PLATFORM_MAC
#define XENIA_BASE_BYTE_SWAP_16 OSSwapInt16
#define XENIA_BASE_BYTE_SWAP_32 OSSwapInt32
#define XENIA_BASE_BYTE_SWAP_64 OSSwapInt64
#else
#define XENIA_BASE_BYTE_SWAP_16 bswap_16
#define XENIA_BASE_BYTE_SWAP_32 bswap_32
#define XENIA_BASE_BYTE_SWAP_64 bswap_64
#define XENIA_BASE_BYTE_SWAP_16 __builtin_bswap16
#define XENIA_BASE_BYTE_SWAP_32 __builtin_bswap32
#define XENIA_BASE_BYTE_SWAP_64 __builtin_bswap64
#endif // XE_PLATFORM_WIN32
inline int8_t byte_swap(int8_t value) { return value; }
inline uint8_t byte_swap(uint8_t value) { return value; }
inline int16_t byte_swap(int16_t value) {
return static_cast<int16_t>(
XENIA_BASE_BYTE_SWAP_16(static_cast<int16_t>(value)));
}
inline uint16_t byte_swap(uint16_t value) {
return XENIA_BASE_BYTE_SWAP_16(value);
}
inline uint16_t byte_swap(char16_t value) {
return static_cast<char16_t>(XENIA_BASE_BYTE_SWAP_16(value));
}
inline int32_t byte_swap(int32_t value) {
return static_cast<int32_t>(
XENIA_BASE_BYTE_SWAP_32(static_cast<int32_t>(value)));
}
inline uint32_t byte_swap(uint32_t value) {
return XENIA_BASE_BYTE_SWAP_32(value);
}
inline int64_t byte_swap(int64_t value) {
return static_cast<int64_t>(
XENIA_BASE_BYTE_SWAP_64(static_cast<int64_t>(value)));
}
inline uint64_t byte_swap(uint64_t value) {
return XENIA_BASE_BYTE_SWAP_64(value);
}
inline float byte_swap(float value) {
uint32_t temp = byte_swap(*reinterpret_cast<uint32_t*>(&value));
return *reinterpret_cast<float*>(&temp);
}
inline double byte_swap(double value) {
uint64_t temp = byte_swap(*reinterpret_cast<uint64_t*>(&value));
return *reinterpret_cast<double*>(&temp);
}
template <typename T>
template <class T>
inline T byte_swap(T value) {
if (sizeof(T) == 4) {
return static_cast<T>(byte_swap(static_cast<uint32_t>(value)));
} else if (sizeof(T) == 2) {
return static_cast<T>(byte_swap(static_cast<uint16_t>(value)));
} else {
assert_always("not handled");
static_assert(
sizeof(T) == 8 || sizeof(T) == 4 || sizeof(T) == 2 || sizeof(T) == 1,
"byte_swap(T value): Type T has illegal size");
if constexpr (sizeof(T) == 8) {
uint64_t temp =
XENIA_BASE_BYTE_SWAP_64(*reinterpret_cast<uint64_t*>(&value));
return *reinterpret_cast<T*>(&temp);
} else if constexpr (sizeof(T) == 4) {
uint32_t temp =
XENIA_BASE_BYTE_SWAP_32(*reinterpret_cast<uint32_t*>(&value));
return *reinterpret_cast<T*>(&temp);
} else if constexpr (sizeof(T) == 2) {
uint16_t temp =
XENIA_BASE_BYTE_SWAP_16(*reinterpret_cast<uint16_t*>(&value));
return *reinterpret_cast<T*>(&temp);
} else if constexpr (sizeof(T) == 1) {
return value;
}
}
template <typename T>
struct be {
be() = default;
be(const T& src) : value(xe::byte_swap(src)) {} // NOLINT(runtime/explicit)
be(const be& other) { value = other.value; } // NOLINT(runtime/explicit)
operator T() const { return xe::byte_swap(value); }
template <typename T, std::endian E>
struct endian_store {
endian_store() = default;
endian_store(const T& src) {
if constexpr (std::endian::native == E) {
value = src;
} else {
value = xe::byte_swap(src);
}
}
endian_store(const endian_store& other) { value = other.value; }
operator T() const {
if constexpr (std::endian::native == E) {
return value;
} else {
return xe::byte_swap(value);
}
}
be<T>& operator+=(int a) {
endian_store<T, E>& operator+=(int a) {
*this = *this + a;
return *this;
}
be<T>& operator-=(int a) {
endian_store<T, E>& operator-=(int a) {
*this = *this - a;
return *this;
}
be<T>& operator++() {
endian_store<T, E>& operator++() {
*this += 1;
return *this;
} // ++a
be<T> operator++(int) {
endian_store<T, E> operator++(int) {
*this += 1;
return (*this - 1);
} // a++
be<T>& operator--() {
endian_store<T, E>& operator--() {
*this -= 1;
return *this;
} // --a
be<T> operator--(int) {
endian_store<T, E> operator--(int) {
*this -= 1;
return (*this + 1);
} // a--
@ -115,6 +125,11 @@ struct be {
T value;
};
template <typename T>
using be = endian_store<T, std::endian::big>;
template <typename T>
using le = endian_store<T, std::endian::little>;
} // namespace xe
#endif // XENIA_BASE_BYTE_ORDER_H_
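
A usage sketch for the new be<T>/le<T> aliases (read_guest_u32 and guest_memory are invented): on little-endian hosts the implicit conversion byte-swaps, while on big-endian hosts the if constexpr branch in endian_store makes it a no-op.

#include <cstring>

uint32_t read_guest_u32(const void* guest_memory) {
  xe::be<uint32_t> value;                            // big-endian storage
  std::memcpy(&value, guest_memory, sizeof(value));  // raw guest bytes
  return value;  // operator T() swaps to host order only if needed
}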

View File

@ -23,6 +23,7 @@ namespace cvar {
cxxopts::Options options("xenia", "Xbox 360 Emulator");
std::map<std::string, ICommandVar*>* CmdVars;
std::map<std::string, IConfigVar*>* ConfigVars;
std::multimap<uint32_t, const IConfigVarUpdate*>* IConfigVarUpdate::updates_;
void PrintHelpAndExit() {
std::cout << options.help({""}) << std::endl;

View File

@ -17,6 +17,7 @@
#include "cpptoml/include/cpptoml.h"
#include "cxxopts/include/cxxopts.hpp"
#include "xenia/base/assert.h"
#include "xenia/base/filesystem.h"
#include "xenia/base/string_util.h"
@ -43,6 +44,7 @@ class IConfigVar : virtual public ICommandVar {
virtual std::string config_value() const = 0;
virtual void LoadConfigValue(std::shared_ptr<cpptoml::base> result) = 0;
virtual void LoadGameConfigValue(std::shared_ptr<cpptoml::base> result) = 0;
virtual void ResetConfigValueToDefault() = 0;
};
template <class T>
@ -75,6 +77,7 @@ class ConfigVar : public CommandVar<T>, virtual public IConfigVar {
ConfigVar<T>(const char* name, T* default_value, const char* description,
const char* category, bool is_transient);
std::string config_value() const override;
const T& GetTypedConfigValue() const;
const std::string& category() const override;
bool is_transient() const override;
void AddToLaunchOptions(cxxopts::Options* options) override;
@ -89,6 +92,7 @@ class ConfigVar : public CommandVar<T>, virtual public IConfigVar {
std::unique_ptr<T> config_value_ = nullptr;
std::unique_ptr<T> game_config_value_ = nullptr;
void UpdateValue() override;
void ResetConfigValueToDefault() override;
};
#pragma warning(pop)
@ -233,6 +237,10 @@ std::string ConfigVar<T>::config_value() const {
return this->ToString(this->default_value_);
}
template <class T>
const T& ConfigVar<T>::GetTypedConfigValue() const {
return config_value_ ? *config_value_ : this->default_value_;
}
template <class T>
void CommandVar<T>::SetCommandLineValue(const T val) {
commandline_value_ = std::make_unique<T>(val);
UpdateValue();
@ -247,36 +255,47 @@ void ConfigVar<T>::SetGameConfigValue(T val) {
game_config_value_ = std::make_unique<T>(val);
UpdateValue();
}
template <class T>
void ConfigVar<T>::ResetConfigValueToDefault() {
SetConfigValue(this->default_value_);
}
// CVars can be initialized before these, thus initialized on-demand using new.
extern std::map<std::string, ICommandVar*>* CmdVars;
extern std::map<std::string, IConfigVar*>* ConfigVars;
inline void AddConfigVar(IConfigVar* cv) {
if (!ConfigVars) ConfigVars = new std::map<std::string, IConfigVar*>();
ConfigVars->insert(std::pair<std::string, IConfigVar*>(cv->name(), cv));
if (!ConfigVars) {
ConfigVars = new std::map<std::string, IConfigVar*>;
}
ConfigVars->emplace(cv->name(), cv);
}
inline void AddCommandVar(ICommandVar* cv) {
if (!CmdVars) CmdVars = new std::map<std::string, ICommandVar*>();
CmdVars->insert(std::pair<std::string, ICommandVar*>(cv->name(), cv));
if (!CmdVars) {
CmdVars = new std::map<std::string, ICommandVar*>;
}
CmdVars->emplace(cv->name(), cv);
}
void ParseLaunchArguments(int& argc, char**& argv,
const std::string_view positional_help,
const std::vector<std::string>& positional_options);
template <typename T>
T* define_configvar(const char* name, T* default_value, const char* description,
const char* category, bool is_transient) {
IConfigVar* cfgVar = new ConfigVar<T>(name, default_value, description,
IConfigVar* define_configvar(const char* name, T* default_value,
const char* description, const char* category,
bool is_transient) {
IConfigVar* cfgvar = new ConfigVar<T>(name, default_value, description,
category, is_transient);
AddConfigVar(cfgVar);
return default_value;
AddConfigVar(cfgvar);
return cfgvar;
}
template <typename T>
T* define_cmdvar(const char* name, T* default_value, const char* description) {
ICommandVar* cmdVar = new CommandVar<T>(name, default_value, description);
AddCommandVar(cmdVar);
return default_value;
ICommandVar* define_cmdvar(const char* name, T* default_value,
const char* description) {
ICommandVar* cmdvar = new CommandVar<T>(name, default_value, description);
AddCommandVar(cmdvar);
return cmdvar;
}
#define DEFINE_bool(name, default_value, description, category) \
@ -285,6 +304,9 @@ T* define_cmdvar(const char* name, T* default_value, const char* description) {
#define DEFINE_int32(name, default_value, description, category) \
DEFINE_CVar(name, default_value, description, category, false, int32_t)
#define DEFINE_uint32(name, default_value, description, category) \
DEFINE_CVar(name, default_value, description, category, false, uint32_t)
#define DEFINE_uint64(name, default_value, description, category) \
DEFINE_CVar(name, default_value, description, category, false, uint64_t)
@ -314,7 +336,7 @@ T* define_cmdvar(const char* name, T* default_value, const char* description) {
type name = default_value; \
} \
namespace cv { \
static auto cv_##name = cvar::define_configvar( \
static cvar::IConfigVar* const cv_##name = cvar::define_configvar( \
#name, &cvars::name, description, category, is_transient); \
}
@ -324,7 +346,7 @@ T* define_cmdvar(const char* name, T* default_value, const char* description) {
std::string name = default_value; \
} \
namespace cv { \
static auto cv_##name = \
static cvar::ICommandVar* const cv_##name = \
cvar::define_cmdvar(#name, &cvars::name, description); \
}
@ -332,6 +354,8 @@ T* define_cmdvar(const char* name, T* default_value, const char* description) {
#define DECLARE_int32(name) DECLARE_CVar(name, int32_t)
#define DECLARE_uint32(name) DECLARE_CVar(name, uint32_t)
#define DECLARE_uint64(name) DECLARE_CVar(name, uint64_t)
#define DECLARE_double(name) DECLARE_CVar(name, double)
@ -345,6 +369,212 @@ T* define_cmdvar(const char* name, T* default_value, const char* description) {
extern type name; \
}
// Interface for changing the default value of a variable with auto-upgrading of
// users' configs (to distinguish between a leftover old default and an explicit
// override), without having to rename the variable.
//
// Two types of updates are supported:
// - Changing the value of the variable (UPDATE_from_type) from an explicitly
// specified previous default value to a new one, but keeping the
// user-specified value if it was not the default, and thus explicitly
// overridden.
// - Changing the meaning / domain of the variable (UPDATE_from_any), when
// previous user-specified overrides also stop making sense. Config variable
// type changes are also considered this type of update (though
// UPDATE_from_type, if the new type doesn't match the previous one, is also
// safe to use - it behaves like UPDATE_from_any in this case).
//
// Rules of using UPDATE_:
// - Do not remove previous UPDATE_ entries (both typed and from-any) if you're
// adding a new UPDATE_from_type.
// This ensures that if the default was changed from 1 to 2 and then to 3,
// both users who last launched Xenia when it was 1 and when it was 2 receive
// the update (however, those who have explicitly changed it from 2 to 1 when
// 2 was the default will have it kept at 1).
// It's safe to remove the history before a new UPDATE_from_any, however.
// - The date should preferably be in UTC+0 timezone.
// - No other recent pull requests should have the same date (since builds
// are made after every commit).
// - IConfigVarUpdate::kLastCommittedUpdateDate must be updated - see the
// comment near its declaration.
constexpr uint32_t MakeConfigVarUpdateDate(uint32_t year, uint32_t month,
uint32_t day, uint32_t utc_hour) {
// Written to the config as a decimal number - pack as decimal for user
// readability.
// Using 31 bits in the 3rd millennium already - don't add more digits.
return utc_hour + day * 100 + month * 10000 + year * 1000000;
}
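
A worked instance of the packing, matching the kLastCommittedUpdateDate value below:

// MakeConfigVarUpdateDate(2020, 12, 31, 13)
//   = 13 + 31 * 100 + 12 * 10000 + 2020 * 1000000
//   = 2020123113
static_assert(MakeConfigVarUpdateDate(2020, 12, 31, 13) == 2020123113u);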
class IConfigVarUpdate {
public:
// This global highest version constant is used to ensure that version (which
// is stored as one value for the whole config file) is monotonically
// increased when commits - primarily pull requests - are pushed to the main
// branch.
//
// This is to prevent the following situation:
// - Pull request #1 created on day 1.
// - Pull request #2 created on day 2.
// - Pull request #2 from day 2 merged on day 3.
// - User launches the latest version on day 4.
// CVar default changes from PR #2 (day 2) applied because the user's config
// version is day 0, which is < 2.
// User's config has day 2 version now.
// - Pull request #1 from day 1 merged on day 5.
// - User launches the latest version on day 5.
// CVar default changes from PR #1 (day 1) IGNORED because the user's config
// version is day 2, which is >= 1.
//
// If this constant is not updated, static_assert will be triggered for a new
// DEFINE_, requiring this constant to be raised. But changing this will
// result in merge conflicts in all other pull requests also changing cvar
// defaults - before they're merged, they will need to be updated, which will
// ensure monotonic growth of the versions of all cvars on the main branch. In
// the example above, PR #1 will need to be updated before it's merged.
//
// If you've encountered a merge conflict here in your pull request:
// 1) Update any UPDATE_s you've added in the pull request to the current
// date.
// 2) Change this value to the same date.
// If you're reviewing a pull request with a change here, check if 1) has been
// done by the submitter before merging.
static constexpr uint32_t kLastCommittedUpdateDate =
MakeConfigVarUpdateDate(2020, 12, 31, 13);
virtual ~IConfigVarUpdate() = default;
virtual void Apply() const = 0;
static void ApplyUpdates(uint32_t config_date) {
if (!updates_) {
return;
}
auto it_end = updates_->end();
for (auto it = updates_->upper_bound(config_date); it != it_end; ++it) {
it->second->Apply();
}
}
// More reliable than kLastCommittedUpdateDate for actual usage
// (kLastCommittedUpdateDate is just a pull request merge order guard). The
// two should usually be the same, but kLastCommittedUpdateDate may not
// account for the removal of cvars.
static uint32_t GetLastUpdateDate() {
return (updates_ && !updates_->empty()) ? updates_->crbegin()->first : 0;
}
protected:
IConfigVarUpdate(IConfigVar* const& config_var, uint32_t year, uint32_t month,
uint32_t day, uint32_t utc_hour)
: config_var_(config_var) {
if (!updates_) {
updates_ = new std::multimap<uint32_t, const IConfigVarUpdate*>;
}
updates_->emplace(MakeConfigVarUpdateDate(year, month, day, utc_hour),
this);
}
IConfigVar& config_var() const {
assert_not_null(config_var_);
return *config_var_;
}
private:
// Reference to pointer to loosen initialization order requirements.
IConfigVar* const& config_var_;
// Updates can be initialized before these, thus initialized on demand using
// `new`.
static std::multimap<uint32_t, const IConfigVarUpdate*>* updates_;
};
class ConfigVarUpdateFromAny : public IConfigVarUpdate {
public:
ConfigVarUpdateFromAny(IConfigVar* const& config_var, uint32_t year,
uint32_t month, uint32_t day, uint32_t utc_hour)
: IConfigVarUpdate(config_var, year, month, day, utc_hour) {}
void Apply() const override { config_var().ResetConfigValueToDefault(); }
};
template <typename T>
class ConfigVarUpdate : public IConfigVarUpdate {
public:
ConfigVarUpdate(IConfigVar* const& config_var, uint32_t year, uint32_t month,
uint32_t day, uint32_t utc_hour, const T& old_default_value)
: IConfigVarUpdate(config_var, year, month, day, utc_hour),
old_default_value_(old_default_value) {}
void Apply() const override {
IConfigVar& config_var_untyped = config_var();
ConfigVar<T>* config_var_typed =
dynamic_cast<ConfigVar<T>*>(&config_var_untyped);
// Update only from the previous default value if the same type,
// unconditionally reset if the type has been changed.
if (!config_var_typed ||
config_var_typed->GetTypedConfigValue() == old_default_value_) {
config_var_untyped.ResetConfigValueToDefault();
}
}
private:
T old_default_value_;
};
#define UPDATE_from_any(name, year, month, day, utc_hour) \
static_assert( \
cvar::MakeConfigVarUpdateDate(year, month, day, utc_hour) <= \
cvar::IConfigVarUpdate::kLastCommittedUpdateDate, \
"A new config variable default value update was added - raise " \
"cvar::IConfigVarUpdate::kLastCommittedUpdateDate to the same date in " \
"base/cvar.h to ensure coherence between different pull requests " \
"updating config variable defaults."); \
namespace cv { \
static const cvar::ConfigVarUpdateFromAny \
update_##name##_##year##_##month##_##day##_##utc_hour(cv_##name, year, month, \
day, utc_hour); \
}
#define UPDATE_CVar(name, year, month, day, utc_hour, old_default_value, type) \
static_assert( \
cvar::MakeConfigVarUpdateDate(year, month, day, utc_hour) <= \
cvar::IConfigVarUpdate::kLastCommittedUpdateDate, \
"A new config variable default value update was added - raise " \
"cvar::IConfigVarUpdate::kLastCommittedUpdateDate to the same date in " \
"base/cvar.h to ensure coherence between different pull requests " \
"updating config variable defaults."); \
namespace cv { \
static const cvar::ConfigVarUpdate<type> \
update_##name##_##year##_##month##_##day##_##utc_hour(cv_##name, year, month, \
day, utc_hour, \
old_default_value); \
}
#define UPDATE_from_bool(name, year, month, day, utc_hour, old_default_value) \
UPDATE_CVar(name, year, month, day, utc_hour, old_default_value, bool)
#define UPDATE_from_int32(name, year, month, day, utc_hour, old_default_value) \
UPDATE_CVar(name, year, month, day, utc_hour, old_default_value, int32_t)
#define UPDATE_from_uint32(name, year, month, day, utc_hour, \
old_default_value) \
UPDATE_CVar(name, year, month, day, utc_hour, old_default_value, uint32_t)
#define UPDATE_from_uint64(name, year, month, day, utc_hour, \
old_default_value) \
UPDATE_CVar(name, year, month, day, utc_hour, old_default_value, uint64_t)
#define UPDATE_from_double(name, year, month, day, utc_hour, \
old_default_value) \
UPDATE_CVar(name, year, month, day, utc_hour, old_default_value, double)
#define UPDATE_from_string(name, year, month, day, utc_hour, \
old_default_value) \
UPDATE_CVar(name, year, month, day, utc_hour, old_default_value, std::string)
#define UPDATE_from_path(name, year, month, day, utc_hour, old_default_value) \
UPDATE_CVar(name, year, month, day, utc_hour, old_default_value, \
std::filesystem::path)
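
A hypothetical usage sketch ("vsync" is an invented cvar): suppose its default flipped from false to true at 2020-12-31 13:00 UTC. Configs still carrying the old default are upgraded on load, while explicit user overrides are preserved:

DEFINE_bool(vsync, true, "Enable vertical sync.", "GPU")
UPDATE_from_bool(vsync, 2020, 12, 31, 13, false)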
} // namespace cvar
#endif // XENIA_CVAR_H_

View File

@ -9,6 +9,7 @@
#include "xenia/base/fuzzy.h"
#include <cctype>
#include <cstring>
#include <iostream>

View File

@ -12,6 +12,8 @@
#include <cstddef>
#include "xenia/base/xxhash.h"
namespace xe {
namespace hash {
@ -24,6 +26,13 @@ struct IdentityHasher {
size_t operator()(const Key& key) const { return static_cast<size_t>(key); }
};
template <typename Key>
struct XXHasher {
size_t operator()(const Key& key) const {
return static_cast<size_t>(XXH3_64bits(&key, sizeof(key)));
}
};
} // namespace hash
} // namespace xe
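
A usage sketch for the new XXHasher, e.g. as the hash functor of an unordered container keyed by a trivially copyable type:

#include <cstdint>
#include <unordered_map>

std::unordered_map<uint64_t, int, xe::hash::XXHasher<uint64_t>> counts;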

View File

@ -10,6 +10,7 @@
#ifndef XENIA_BASE_MAIN_H_
#define XENIA_BASE_MAIN_H_
#include <optional>
#include <string>
#include <vector>
@ -25,19 +26,26 @@ bool has_console_attached();
// launch.
struct EntryInfo {
std::string name;
std::string positional_usage;
std::vector<std::string> positional_options;
int (*entry_point)(const std::vector<std::string>& args);
bool transparent_options; // no argument parsing
std::optional<std::string> positional_usage;
std::optional<std::vector<std::string>> positional_options;
};
EntryInfo GetEntryInfo();
#define DEFINE_ENTRY_POINT(name, entry_point, positional_usage, ...) \
xe::EntryInfo xe::GetEntryInfo() { \
std::initializer_list<std::string> positional_options = {__VA_ARGS__}; \
return xe::EntryInfo( \
{name, positional_usage, \
std::vector<std::string>(std::move(positional_options)), \
entry_point}); \
return xe::EntryInfo{ \
name, entry_point, false, positional_usage, \
std::vector<std::string>(std::move(positional_options))}; \
}
// TODO(Joel Linn): Add some way to filter consumed arguments in
// cvar::ParseLaunchArguments()
#define DEFINE_ENTRY_POINT_TRANSPARENT(name, entry_point) \
xe::EntryInfo xe::GetEntryInfo() { \
return xe::EntryInfo{name, entry_point, true, std::nullopt, std::nullopt}; \
}
} // namespace xe
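
A hypothetical sketch of the two entry-point flavors after this change (my_main is invented; only one DEFINE_ can appear per binary, since both define xe::GetEntryInfo):

int my_main(const std::vector<std::string>& args) { return 0; }

// Parses cvars and positional options before dispatching to my_main:
DEFINE_ENTRY_POINT("xenia-app", my_main, "xenia-app [target]", "target")

// Or, to receive argv untouched (transparent_options == true):
// DEFINE_ENTRY_POINT_TRANSPARENT("xenia-app", my_main)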

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2020 Ben Vanik. All rights reserved. *
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -23,8 +23,10 @@ bool has_console_attached() { return true; }
extern "C" int main(int argc, char** argv) {
auto entry_info = xe::GetEntryInfo();
cvar::ParseLaunchArguments(argc, argv, entry_info.positional_usage,
entry_info.positional_options);
if (!entry_info.transparent_options) {
cvar::ParseLaunchArguments(argc, argv, entry_info.positional_usage.value(),
entry_info.positional_options.value());
}
std::vector<std::string> args;
for (int n = 0; n < argc; n++) {

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2020 Ben Vanik. All rights reserved. *
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -104,8 +104,10 @@ static bool parse_launch_arguments(const xe::EntryInfo& entry_info,
LocalFree(wargv);
cvar::ParseLaunchArguments(argc, argv, entry_info.positional_usage,
entry_info.positional_options);
if (!entry_info.transparent_options) {
cvar::ParseLaunchArguments(argc, argv, entry_info.positional_usage.value(),
entry_info.positional_options.value());
}
args.clear();
for (int n = 0; n < argc; n++) {

View File

@ -18,6 +18,11 @@
#include "xenia/base/memory.h"
#include "xenia/base/platform_win.h"
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP | \
WINAPI_PARTITION_SYSTEM | WINAPI_PARTITION_GAMES)
#define XE_BASE_MAPPED_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
#endif
namespace xe {
class Win32MappedMemory : public MappedMemory {
@ -70,7 +75,7 @@ class Win32MappedMemory : public MappedMemory {
size_t aligned_length = length + (offset - aligned_offset);
UnmapViewOfFile(data_);
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
#ifdef XE_BASE_MAPPED_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
data_ = MapViewOfFile(mapping_handle, view_access_, aligned_offset >> 32,
aligned_offset & 0xFFFFFFFF, aligned_length);
#else
@ -139,7 +144,7 @@ std::unique_ptr<MappedMemory> MappedMemory::Open(
return nullptr;
}
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
#ifdef XE_BASE_MAPPED_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
mm->mapping_handle = CreateFileMapping(
mm->file_handle, nullptr, mapping_protect, DWORD(aligned_length >> 32),
DWORD(aligned_length), nullptr);
@ -152,7 +157,7 @@ std::unique_ptr<MappedMemory> MappedMemory::Open(
return nullptr;
}
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
#ifdef XE_BASE_MAPPED_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
mm->data_ = reinterpret_cast<uint8_t*>(MapViewOfFile(
mm->mapping_handle, view_access, DWORD(aligned_offset >> 32),
DWORD(aligned_offset), aligned_length));
@ -257,7 +262,7 @@ class Win32ChunkedMappedMemoryWriter : public ChunkedMappedMemoryWriter {
return false;
}
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
#ifdef XE_BASE_MAPPED_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
mapping_handle_ =
CreateFileMapping(file_handle_, nullptr, mapping_protect,
DWORD(capacity_ >> 32), DWORD(capacity_), nullptr);
@ -275,11 +280,11 @@ class Win32ChunkedMappedMemoryWriter : public ChunkedMappedMemoryWriter {
if (low_address_space) {
bool successful = false;
data_ = reinterpret_cast<uint8_t*>(0x10000000);
#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
#ifndef XE_BASE_MAPPED_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
HANDLE process = GetCurrentProcess();
#endif
for (int i = 0; i < 1000; ++i) {
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
#ifdef XE_BASE_MAPPED_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
if (MapViewOfFileEx(mapping_handle_, view_access, 0, 0, capacity_,
data_)) {
successful = true;
@ -311,7 +316,7 @@ class Win32ChunkedMappedMemoryWriter : public ChunkedMappedMemoryWriter {
}
}
} else {
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
#ifdef XE_BASE_MAPPED_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
data_ = reinterpret_cast<uint8_t*>(
MapViewOfFile(mapping_handle_, view_access, 0, 0, capacity_));
#else

View File

@ -17,6 +17,16 @@
#include <limits>
#include <numeric>
#include <type_traits>
#if defined __has_include
#if __has_include(<version>)
#include <version>
#endif
#endif
#if __cpp_lib_bitops
#include <bit>
#endif
#include "xenia/base/platform.h"
#if XE_ARCH_AMD64
@ -50,8 +60,20 @@ constexpr T round_up(T value, V multiple, bool force_non_zero = true) {
return (value + multiple - 1) / multiple * multiple;
}
constexpr float saturate(float value) {
return std::max(std::min(1.0f, value), -1.0f);
// Using the same conventions as in shading languages, returning 0 for NaN.
// std::max is `a < b ? b : a`, thus in case of NaN, the first argument is
// always returned. Also -0 is not < +0, so +0 is also chosen for it.
template <typename T>
constexpr T saturate_unsigned(T value) {
return std::min(static_cast<T>(1.0f), std::max(static_cast<T>(0.0f), value));
}
// This diverges from the GPU NaN rules for signed normalized formats (NaN
// should be converted to 0, not to -1), but this expectation is not needed most
// of time, and cannot be met for free (unlike for 0...1 clamping).
template <typename T>
constexpr T saturate_signed(T value) {
return std::min(static_cast<T>(1.0f), std::max(static_cast<T>(-1.0f), value));
}
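
A quick compile-time check of the NaN conventions spelled out above (a sketch, assuming C++17 constexpr std::min/std::max):

#include <limits>

constexpr float kNaN = std::numeric_limits<float>::quiet_NaN();
static_assert(xe::saturate_unsigned(kNaN) == 0.0f);  // NaN -> 0, as in HLSL
static_assert(xe::saturate_signed(kNaN) == -1.0f);   // the documented divergence
static_assert(xe::saturate_unsigned(2.5f) == 1.0f);
static_assert(xe::saturate_signed(-3.0f) == -1.0f);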
// Gets the next power of two value that is greater than or equal to the given
@ -104,6 +126,23 @@ constexpr uint32_t select_bits(uint32_t value, uint32_t a, uint32_t b) {
return (value & make_bitmask(a, b)) >> a;
}
#if __cpp_lib_bitops
template <class T>
constexpr inline uint32_t bit_count(T v) {
return static_cast<uint32_t>(std::popcount(v));
}
#else
#if XE_COMPILER_MSVC || XE_COMPILER_INTEL
inline uint32_t bit_count(uint32_t v) { return __popcnt(v); }
inline uint32_t bit_count(uint64_t v) {
return static_cast<uint32_t>(__popcnt64(v));
}
#elif XE_COMPILER_GCC || XE_COMPILER_CLANG
static_assert(sizeof(unsigned int) == sizeof(uint32_t));
static_assert(sizeof(unsigned long long) == sizeof(uint64_t));
inline uint32_t bit_count(uint32_t v) { return __builtin_popcount(v); }
inline uint32_t bit_count(uint64_t v) { return __builtin_popcountll(v); }
#else
inline uint32_t bit_count(uint32_t v) {
v = v - ((v >> 1) & 0x55555555);
v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
@ -119,6 +158,8 @@ inline uint32_t bit_count(uint64_t v) {
v = v + (v >> 32) & 0x0000007F;
return static_cast<uint32_t>(v);
}
#endif
#endif
// lzcnt instruction, typed for integers of all sizes.
// The number of leading zero bits in the value parameter. If value is zero, the
@ -245,7 +286,7 @@ inline bool bit_scan_forward(uint32_t v, uint32_t* out_first_set_index) {
return i != 0;
}
inline bool bit_scan_forward(uint64_t v, uint32_t* out_first_set_index) {
int i = ffsll(v);
int i = __builtin_ffsll(v);
*out_first_set_index = i - 1;
return i != 0;
}

View File

@ -43,6 +43,16 @@ void copy_128_aligned(void* dest, const void* src, size_t count) {
}
#if XE_ARCH_AMD64
// This works around a GCC bug
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100801
// TODO(Joel Linn): Remove this when fixed GCC versions are common place.
#if XE_COMPILER_GNUC
#define XE_WORKAROUND_LOOP_KILL_MOD(x) \
if ((count % (x)) == 0) __builtin_unreachable();
#else
#define XE_WORKAROUND_LOOP_KILL_MOD(x)
#endif
void copy_and_swap_16_aligned(void* dest_ptr, const void* src_ptr,
size_t count) {
assert_zero(reinterpret_cast<uintptr_t>(dest_ptr) & 0xF);
@ -61,6 +71,7 @@ void copy_and_swap_16_aligned(void* dest_ptr, const void* src_ptr,
_mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
}
for (; i < count; ++i) { // handle residual elements
XE_WORKAROUND_LOOP_KILL_MOD(8);
dest[i] = byte_swap(src[i]);
}
}
@ -80,6 +91,7 @@ void copy_and_swap_16_unaligned(void* dest_ptr, const void* src_ptr,
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
}
for (; i < count; ++i) { // handle residual elements
XE_WORKAROUND_LOOP_KILL_MOD(8);
dest[i] = byte_swap(src[i]);
}
}
@ -102,6 +114,7 @@ void copy_and_swap_32_aligned(void* dest_ptr, const void* src_ptr,
_mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
}
for (; i < count; ++i) { // handle residual elements
XE_WORKAROUND_LOOP_KILL_MOD(4);
dest[i] = byte_swap(src[i]);
}
}
@ -121,6 +134,7 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
}
for (; i < count; ++i) { // handle residual elements
XE_WORKAROUND_LOOP_KILL_MOD(4);
dest[i] = byte_swap(src[i]);
}
}
@ -143,6 +157,7 @@ void copy_and_swap_64_aligned(void* dest_ptr, const void* src_ptr,
_mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
}
for (; i < count; ++i) { // handle residual elements
XE_WORKAROUND_LOOP_KILL_MOD(2);
dest[i] = byte_swap(src[i]);
}
}
@ -162,6 +177,7 @@ void copy_and_swap_64_unaligned(void* dest_ptr, const void* src_ptr,
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
}
for (; i < count; ++i) { // handle residual elements
XE_WORKAROUND_LOOP_KILL_MOD(2);
dest[i] = byte_swap(src[i]);
}
}
@ -178,6 +194,7 @@ void copy_and_swap_16_in_32_aligned(void* dest_ptr, const void* src_ptr,
_mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
}
for (; i < count; ++i) { // handle residual elements
XE_WORKAROUND_LOOP_KILL_MOD(4);
dest[i] = (src[i] >> 16) | (src[i] << 16);
}
}
@ -194,6 +211,7 @@ void copy_and_swap_16_in_32_unaligned(void* dest_ptr, const void* src_ptr,
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
}
for (; i < count; ++i) { // handle residual elements
XE_WORKAROUND_LOOP_KILL_MOD(4);
dest[i] = (src[i] >> 16) | (src[i] << 16);
}
}

View File

@ -15,6 +15,7 @@
#include <filesystem>
#include <functional>
#include <string>
#include <string_view>
#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
@ -441,6 +442,26 @@ inline void store_and_swap<std::u16string>(void* mem,
return store_and_swap<std::u16string_view>(mem, value);
}
using fourcc_t = uint32_t;
// Get FourCC in host byte order
// make_fourcc('a', 'b', 'c', 'd') == 0x61626364
constexpr inline fourcc_t make_fourcc(char a, char b, char c, char d) {
return fourcc_t((static_cast<fourcc_t>(a) << 24) |
(static_cast<fourcc_t>(b) << 16) |
(static_cast<fourcc_t>(c) << 8) | static_cast<fourcc_t>(d));
}
// Get FourCC in host byte order
// This overload requires fourcc.length() == 4
// make_fourcc("abcd") == 'abcd' == 0x61626364 for most compilers
constexpr inline fourcc_t make_fourcc(const std::string_view fourcc) {
if (fourcc.length() != 4) {
throw std::runtime_error("Invalid fourcc length");
}
return make_fourcc(fourcc[0], fourcc[1], fourcc[2], fourcc[3]);
}
} // namespace xe
#endif // XENIA_BASE_MEMORY_H_
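
A compile-time sketch tying make_fourcc to the audio save signature defined earlier in this commit:

static_assert(xe::make_fourcc('X', 'A', 'U', 'D') == xe::make_fourcc("XAUD"));
static_assert(xe::make_fourcc("abcd") == 0x61626364u);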

View File

@ -11,6 +11,11 @@
#include "xenia/base/platform_win.h"
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP | \
WINAPI_PARTITION_SYSTEM | WINAPI_PARTITION_GAMES)
#define XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
#endif
namespace xe {
namespace memory {
@ -75,12 +80,11 @@ PageAccess ToXeniaProtectFlags(DWORD access) {
}
bool IsWritableExecutableMemorySupported() {
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
#ifdef XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
return true;
#else
// To test FromApp functions on desktop, replace
// WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) with 0 in the #ifs and
// link to WindowsApp.lib.
// To test FromApp functions on desktop, undefine
// XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS and link to WindowsApp.lib.
return false;
#endif
}
@ -103,7 +107,7 @@ void* AllocFixed(void* base_address, size_t length,
break;
}
DWORD protect = ToWin32ProtectFlags(access);
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
#ifdef XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
return VirtualAlloc(base_address, length, alloc_type, protect);
#else
return VirtualAllocFromApp(base_address, length, ULONG(alloc_type),
@ -135,7 +139,7 @@ bool Protect(void* base_address, size_t length, PageAccess access,
*out_old_access = PageAccess::kNoAccess;
}
DWORD new_protect = ToWin32ProtectFlags(access);
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
#ifdef XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
DWORD old_protect = 0;
BOOL result = VirtualProtect(base_address, length, new_protect, &old_protect);
#else
@ -174,7 +178,7 @@ FileMappingHandle CreateFileMappingHandle(const std::filesystem::path& path,
DWORD protect =
ToWin32ProtectFlags(access) | (commit ? SEC_COMMIT : SEC_RESERVE);
auto full_path = "Local" / path;
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
#ifdef XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
return CreateFileMappingW(INVALID_HANDLE_VALUE, nullptr, protect,
static_cast<DWORD>(length >> 32),
static_cast<DWORD>(length), full_path.c_str());
@ -191,7 +195,7 @@ void CloseFileMappingHandle(FileMappingHandle handle,
void* MapFileView(FileMappingHandle handle, void* base_address, size_t length,
PageAccess access, size_t file_offset) {
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
#ifdef XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
DWORD target_address_low = static_cast<DWORD>(file_offset);
DWORD target_address_high = static_cast<DWORD>(file_offset >> 32);
DWORD file_access = 0;

View File

@ -85,18 +85,17 @@
#endif // XE_PLATFORM_MAC
#if XE_COMPILER_MSVC
#define XEPACKEDSTRUCT(name, value) \
__pragma(pack(push, 1)) struct name value __pragma(pack(pop));
#define XEPACKEDSTRUCTANONYMOUS(value) \
__pragma(pack(push, 1)) struct value __pragma(pack(pop));
#define XEPACKEDUNION(name, value) \
__pragma(pack(push, 1)) union name value __pragma(pack(pop));
#define _XEPACKEDSCOPE(body) __pragma(pack(push, 1)) body __pragma(pack(pop));
#else
#define XEPACKEDSTRUCT(name, value) struct __attribute__((packed)) name value;
#define XEPACKEDSTRUCTANONYMOUS(value) struct __attribute__((packed)) value;
#define XEPACKEDUNION(name, value) union __attribute__((packed)) name value;
#define _XEPACKEDSCOPE(body) \
_Pragma("pack(push, 1)") body; \
_Pragma("pack(pop)");
#endif // XE_PLATFORM_WIN32
#define XEPACKEDSTRUCT(name, value) _XEPACKEDSCOPE(struct name value)
#define XEPACKEDSTRUCTANONYMOUS(value) _XEPACKEDSCOPE(struct value)
#define XEPACKEDUNION(name, value) _XEPACKEDSCOPE(union name value)
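A short sketch of how the consolidated macro is consumed (hypothetical struct, assuming <cstdint>; the point is that the MSVC __pragma path and the GCC/Clang _Pragma path now emit the same pack(push, 1) scope):
// Hypothetical packed header; without pack(1) the compiler would pad
// 'magic' to align 'size', making sizeof() 8 instead of 5.
XEPACKEDSTRUCT(ExampleHeader, {
  uint8_t magic;
  uint32_t size;
});
static_assert(sizeof(ExampleHeader) == 5, "packing must remove padding");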
namespace xe {
#if XE_PLATFORM_WIN32

View File

@ -22,6 +22,7 @@
#define NOMINMAX
#include <ObjBase.h>
#include <SDKDDKVer.h>
#include <bcrypt.h>
#include <dwmapi.h>
#include <shellapi.h>
#include <shlwapi.h>

View File

@ -87,12 +87,12 @@ struct string_key_case : internal::string_key_base {
namespace std {
template <>
struct std::hash<xe::string_key> {
struct hash<xe::string_key> {
std::size_t operator()(const xe::string_key& t) const { return t.hash(); }
};
template <>
struct std::hash<xe::string_key_case> {
struct hash<xe::string_key_case> {
std::size_t operator()(const xe::string_key_case& t) const {
return t.hash();
}
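The point of specializing std::hash here is that the key types then slot into the standard unordered containers with no extra template arguments. A hypothetical sketch, assuming the usual <unordered_map>/<unordered_set> includes and that the key types provide equality, as their use in the tree implies:
// Hypothetical container declarations; lookup dispatches to t.hash() via
// the specializations above rather than rehashing the string on every probe.
std::unordered_map<xe::string_key, uint32_t> ordinals;
std::unordered_set<xe::string_key_case> seen_names;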

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2015 Ben Vanik. All rights reserved. *
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -18,7 +18,7 @@ namespace xe {
namespace base {
namespace test {
TEST_CASE("copy_128_aligned", "Copy and Swap") {
TEST_CASE("copy_128_aligned", "[copy_and_swap]") {
alignas(128) uint8_t src[256], dest[256];
for (uint8_t i = 0; i < 255; ++i) {
src[i] = 255 - i;
@ -37,7 +37,7 @@ TEST_CASE("copy_128_aligned", "Copy and Swap") {
REQUIRE(std::memcmp(dest, src + 1, 128));
}
TEST_CASE("copy_and_swap_16_aligned", "Copy and Swap") {
TEST_CASE("copy_and_swap_16_aligned", "[copy_and_swap]") {
alignas(16) uint16_t a = 0x1111, b = 0xABCD;
copy_and_swap_16_aligned(&a, &b, 1);
REQUIRE(a == 0xCDAB);
@ -93,7 +93,7 @@ TEST_CASE("copy_and_swap_16_aligned", "Copy and Swap") {
REQUIRE(std::strcmp(f, "s atdnra dlagimnne.t") == 0);
}
TEST_CASE("copy_and_swap_16_unaligned", "Copy and Swap") {
TEST_CASE("copy_and_swap_16_unaligned", "[copy_and_swap]") {
uint16_t a = 0x1111, b = 0xABCD;
copy_and_swap_16_unaligned(&a, &b, 1);
REQUIRE(a == 0xCDAB);
@ -139,7 +139,7 @@ TEST_CASE("copy_and_swap_16_unaligned", "Copy and Swap") {
"noeg rhtnas atdnra dlagimnne.t") == 0);
}
TEST_CASE("copy_and_swap_32_aligned", "Copy and Swap") {
TEST_CASE("copy_and_swap_32_aligned", "[copy_and_swap]") {
alignas(32) uint32_t a = 0x11111111, b = 0x89ABCDEF;
copy_and_swap_32_aligned(&a, &b, 1);
REQUIRE(a == 0xEFCDAB89);
@ -195,7 +195,7 @@ TEST_CASE("copy_and_swap_32_aligned", "Copy and Swap") {
REQUIRE(std::strcmp(f, "ats radnla dmngi.tne") == 0);
}
TEST_CASE("copy_and_swap_32_unaligned", "Copy and Swap") {
TEST_CASE("copy_and_swap_32_unaligned", "[copy_and_swap]") {
uint32_t a = 0x11111111, b = 0x89ABCDEF;
copy_and_swap_32_unaligned(&a, &b, 1);
REQUIRE(a == 0xEFCDAB89);
@ -259,7 +259,7 @@ TEST_CASE("copy_and_swap_32_unaligned", "Copy and Swap") {
"regnahtats radnla dmngi.tne") == 0);
}
TEST_CASE("copy_and_swap_64_aligned", "Copy and Swap") {
TEST_CASE("copy_and_swap_64_aligned", "[copy_and_swap]") {
alignas(64) uint64_t a = 0x1111111111111111, b = 0x0123456789ABCDEF;
copy_and_swap_64_aligned(&a, &b, 1);
REQUIRE(a == 0xEFCDAB8967452301);
@ -317,7 +317,7 @@ TEST_CASE("copy_and_swap_64_aligned", "Copy and Swap") {
REQUIRE(std::strcmp(f, "radnats mngila d") == 0);
}
TEST_CASE("copy_and_swap_64_unaligned", "Copy and Swap") {
TEST_CASE("copy_and_swap_64_unaligned", "[copy_and_swap]") {
uint64_t a = 0x1111111111111111, b = 0x0123456789ABCDEF;
copy_and_swap_64_unaligned(&a, &b, 1);
REQUIRE(a == 0xEFCDAB8967452301);
@ -407,12 +407,12 @@ TEST_CASE("copy_and_swap_64_unaligned", "Copy and Swap") {
"regradnats mngila d") == 0);
}
TEST_CASE("copy_and_swap_16_in_32_aligned", "Copy and Swap") {
TEST_CASE("copy_and_swap_16_in_32_aligned", "[copy_and_swap]") {
// TODO(bwrsandman): test once properly understood.
REQUIRE(true == true);
}
TEST_CASE("copy_and_swap_16_in_32_unaligned", "Copy and Swap") {
TEST_CASE("copy_and_swap_16_in_32_unaligned", "[copy_and_swap]") {
// TODO(bwrsandman): test once properly understood.
REQUIRE(true == true);
}
@ -425,7 +425,7 @@ TEST_CASE("create_and_close_file_mapping", "Virtual Memory Mapping") {
xe::memory::CloseFileMappingHandle(memory, path);
}
TEST_CASE("map_view", "Virtual Memory Mapping") {
TEST_CASE("map_view", "[virtual_memory_mapping]") {
auto path = fmt::format("xenia_test_{}", Clock::QueryHostTickCount());
const size_t length = 0x100;
auto memory = xe::memory::CreateFileMappingHandle(
@ -442,7 +442,7 @@ TEST_CASE("map_view", "Virtual Memory Mapping") {
xe::memory::CloseFileMappingHandle(memory, path);
}
TEST_CASE("read_write_view", "Virtual Memory Mapping") {
TEST_CASE("read_write_view", "[virtual_memory_mapping]") {
const size_t length = 0x100;
auto path = fmt::format("xenia_test_{}", Clock::QueryHostTickCount());
auto memory = xe::memory::CreateFileMappingHandle(
@ -469,6 +469,40 @@ TEST_CASE("read_write_view", "Virtual Memory Mapping") {
xe::memory::CloseFileMappingHandle(memory, path);
}
TEST_CASE("make_fourcc", "[fourcc]") {
SECTION("'1234'") {
const uint32_t fourcc_host = 0x31323334;
constexpr fourcc_t fourcc_1 = make_fourcc('1', '2', '3', '4');
constexpr fourcc_t fourcc_2 = make_fourcc("1234");
REQUIRE(fourcc_1 == fourcc_host);
REQUIRE(fourcc_2 == fourcc_host);
REQUIRE(fourcc_1 == fourcc_2);
REQUIRE(fourcc_2 == fourcc_1);
}
SECTION("'ABcd'") {
const uint32_t fourcc_host = 0x41426364;
constexpr fourcc_t fourcc_1 = make_fourcc('A', 'B', 'c', 'd');
constexpr fourcc_t fourcc_2 = make_fourcc("ABcd");
REQUIRE(fourcc_1 == fourcc_host);
REQUIRE(fourcc_2 == fourcc_host);
REQUIRE(fourcc_1 == fourcc_2);
REQUIRE(fourcc_2 == fourcc_1);
}
SECTION("'XEN\\0'") {
const uint32_t fourcc_host = 0x58454E00;
constexpr fourcc_t fourcc = make_fourcc('X', 'E', 'N', '\0');
REQUIRE(fourcc == fourcc_host);
}
SECTION("length()!=4") {
REQUIRE_THROWS(make_fourcc("AB\0\0"));
REQUIRE_THROWS(make_fourcc("AB\0\0AB"));
REQUIRE_THROWS(make_fourcc("ABCDEFGH"));
}
}
} // namespace test
} // namespace base
} // namespace xe

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2018 Ben Vanik. All rights reserved. *
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -84,17 +84,17 @@ TEST_CASE("Enable process to set thread affinity") {
EnableAffinityConfiguration();
}
TEST_CASE("Yield Current Thread", "MaybeYield") {
TEST_CASE("Yield Current Thread", "[maybe_yield]") {
// Run to see if there are any errors
MaybeYield();
}
TEST_CASE("Sync with Memory Barrier", "SyncMemory") {
TEST_CASE("Sync with Memory Barrier", "[sync_memory]") {
// Run to see if there are any errors
SyncMemory();
}
TEST_CASE("Sleep Current Thread", "Sleep") {
TEST_CASE("Sleep Current Thread", "[sleep]") {
auto wait_time = 50ms;
auto start = std::chrono::steady_clock::now();
Sleep(wait_time);
@ -102,7 +102,7 @@ TEST_CASE("Sleep Current Thread", "Sleep") {
REQUIRE(duration >= wait_time);
}
TEST_CASE("Sleep Current Thread in Alertable State", "Sleep") {
TEST_CASE("Sleep Current Thread in Alertable State", "[sleep]") {
auto wait_time = 50ms;
auto start = std::chrono::steady_clock::now();
auto result = threading::AlertableSleep(wait_time);
@ -154,7 +154,7 @@ TEST_CASE("HighResolutionTimer") {
// Time the actual sleep duration
{
const auto interval = 50ms;
std::atomic<uint64_t> counter;
std::atomic<uint64_t> counter(0);
auto start = std::chrono::steady_clock::now();
auto cb = [&counter] { ++counter; };
auto pTimer = HighResolutionTimer::CreateRepeating(interval, cb);
@ -201,7 +201,7 @@ TEST_CASE("HighResolutionTimer") {
// spawned from differing threads
}
TEST_CASE("Wait on Multiple Handles", "Wait") {
TEST_CASE("Wait on Multiple Handles", "[wait]") {
auto mutant = Mutant::Create(true);
auto semaphore = Semaphore::Create(10, 10);
auto event_ = Event::CreateManualResetEvent(false);
@ -244,7 +244,7 @@ TEST_CASE("Signal and Wait") {
REQUIRE(result == WaitResult::kSuccess);
}
TEST_CASE("Wait on Event", "Event") {
TEST_CASE("Wait on Event", "[event]") {
auto evt = Event::CreateAutoResetEvent(false);
WaitResult result;
@ -262,7 +262,7 @@ TEST_CASE("Wait on Event", "Event") {
REQUIRE(result == WaitResult::kTimeout);
}
TEST_CASE("Reset Event", "Event") {
TEST_CASE("Reset Event", "[event]") {
auto evt = Event::CreateAutoResetEvent(false);
WaitResult result;
@ -283,7 +283,7 @@ TEST_CASE("Reset Event", "Event") {
REQUIRE(result == WaitResult::kSuccess);
}
TEST_CASE("Wait on Multiple Events", "Event") {
TEST_CASE("Wait on Multiple Events", "[event]") {
auto events = std::array<std::unique_ptr<Event>, 4>{
Event::CreateAutoResetEvent(false),
Event::CreateAutoResetEvent(false),
@ -348,7 +348,7 @@ TEST_CASE("Wait on Multiple Events", "Event") {
// REQUIRE(order[3] == '3');
}
TEST_CASE("Wait on Semaphore", "Semaphore") {
TEST_CASE("Wait on Semaphore", "[semaphore]") {
WaitResult result;
std::unique_ptr<Semaphore> sem;
int previous_count = 0;
@ -406,9 +406,13 @@ TEST_CASE("Wait on Semaphore", "Semaphore") {
sem = Semaphore::Create(5, 5);
Sleep(10ms);
// Occupy the semaphore with 5 threads
auto func = [&sem] {
std::atomic<int> wait_count(0);
volatile bool threads_terminate(false);
auto func = [&sem, &wait_count, &threads_terminate] {
auto res = Wait(sem.get(), false, 100ms);
Sleep(500ms);
wait_count++;
while (!threads_terminate) {
}
if (res == WaitResult::kSuccess) {
sem->Release(1, nullptr);
}
@ -417,12 +421,14 @@ TEST_CASE("Wait on Semaphore", "Semaphore") {
std::thread(func), std::thread(func), std::thread(func),
std::thread(func), std::thread(func),
};
// Give threads time to acquire semaphore
Sleep(10ms);
// Wait for threads to finish semaphore calls
while (wait_count != 5) {
}
// Attempt to acquire full semaphore with current (6th) thread
result = Wait(sem.get(), false, 20ms);
REQUIRE(result == WaitResult::kTimeout);
// Give threads time to release semaphore
threads_terminate = true;
for (auto& t : threads) {
t.join();
}
@ -444,7 +450,7 @@ TEST_CASE("Wait on Semaphore", "Semaphore") {
// REQUIRE(sem.get() == nullptr);
}
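The synchronization shape the reworked test uses - a counted rendezvous plus a release flag instead of fixed Sleep() calls - shown in isolation as a minimal standalone sketch (plain std primitives rather than the xe wrappers; std::atomic<bool> is used where the test has a volatile bool):
#include <atomic>
#include <thread>
#include <vector>

std::atomic<int> ready_count(0);
std::atomic<bool> release_workers(false);

void Worker() {
  // ... acquire the contended resource here ...
  ready_count.fetch_add(1);        // signal the rendezvous
  while (!release_workers.load()) {
  }                                // hold the resource until told to stop
  // ... release the resource here ...
}

int main() {
  std::vector<std::thread> workers;
  for (int i = 0; i < 5; ++i) workers.emplace_back(Worker);
  while (ready_count.load() != 5) {
  }  // all workers now hold the resource; assertions about contention go here
  release_workers.store(true);
  for (auto& t : workers) t.join();
  return 0;
}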
TEST_CASE("Wait on Multiple Semaphores", "Semaphore") {
TEST_CASE("Wait on Multiple Semaphores", "[semaphore]") {
WaitResult all_result;
std::pair<WaitResult, size_t> any_result;
int previous_count;
@ -501,7 +507,7 @@ TEST_CASE("Wait on Multiple Semaphores", "Semaphore") {
REQUIRE(previous_count == 4);
}
TEST_CASE("Wait on Mutant", "Mutant") {
TEST_CASE("Wait on Mutant", "[mutant]") {
WaitResult result;
std::unique_ptr<Mutant> mut;
@ -558,7 +564,7 @@ TEST_CASE("Wait on Mutant", "Mutant") {
REQUIRE(mut->Release());
}
TEST_CASE("Wait on Multiple Mutants", "Mutant") {
TEST_CASE("Wait on Multiple Mutants", "[mutant]") {
WaitResult all_result;
std::pair<WaitResult, size_t> any_result;
std::unique_ptr<Mutant> mut0, mut1;
@ -621,7 +627,7 @@ TEST_CASE("Wait on Multiple Mutants", "Mutant") {
thread2.join();
}
TEST_CASE("Wait on Timer", "Timer") {
TEST_CASE("Wait on Timer", "[timer]") {
WaitResult result;
std::unique_ptr<Timer> timer;
@ -686,7 +692,7 @@ TEST_CASE("Wait on Timer", "Timer") {
REQUIRE(result == WaitResult::kTimeout); // No more signals from repeating
}
TEST_CASE("Wait on Multiple Timers", "Timer") {
TEST_CASE("Wait on Multiple Timers", "[timer]") {
WaitResult all_result;
std::pair<WaitResult, size_t> any_result;
@ -724,13 +730,13 @@ TEST_CASE("Wait on Multiple Timers", "Timer") {
REQUIRE(any_result.second == 1);
}
TEST_CASE("Create and Trigger Timer Callbacks", "Timer") {
TEST_CASE("Create and Trigger Timer Callbacks", "[timer]") {
// TODO(bwrsandman): Check which thread performs callback and timing of
// callback
REQUIRE(true);
}
TEST_CASE("Set and Test Current Thread ID", "Thread") {
TEST_CASE("Set and Test Current Thread ID", "[thread]") {
// System ID
auto system_id = current_thread_system_id();
REQUIRE(system_id > 0);
@ -763,71 +769,76 @@ TEST_CASE("Set and Test Current Thread Name", "Thread") {
REQUIRE_NOTHROW(set_name(old_thread_name));
}
TEST_CASE("Create and Run Thread", "Thread") {
TEST_CASE("Create and Run Thread", "[thread]") {
std::unique_ptr<Thread> thread;
WaitResult result;
Thread::CreationParameters params = {};
auto func = [] { Sleep(20ms); };
// Create most basic case of thread
thread = Thread::Create(params, func);
REQUIRE(thread->native_handle() != nullptr);
REQUIRE_NOTHROW(thread->affinity_mask());
REQUIRE(thread->name().empty());
result = Wait(thread.get(), false, 50ms);
REQUIRE(result == WaitResult::kSuccess);
SECTION("Create most basic case of thread") {
thread = Thread::Create(params, func);
REQUIRE(thread->native_handle() != nullptr);
REQUIRE_NOTHROW(thread->affinity_mask());
REQUIRE(thread->name().empty());
result = Wait(thread.get(), false, 50ms);
REQUIRE(result == WaitResult::kSuccess);
}
// Add thread name
std::string new_name = "Test thread name";
thread = Thread::Create(params, func);
auto name = thread->name();
INFO(name.c_str());
REQUIRE(name.empty());
thread->set_name(new_name);
REQUIRE(thread->name() == new_name);
result = Wait(thread.get(), false, 50ms);
REQUIRE(result == WaitResult::kSuccess);
SECTION("Add thread name") {
std::string new_name = "Test thread name";
thread = Thread::Create(params, func);
auto name = thread->name();
INFO(name.c_str());
REQUIRE(name.empty());
thread->set_name(new_name);
REQUIRE(thread->name() == new_name);
result = Wait(thread.get(), false, 50ms);
REQUIRE(result == WaitResult::kSuccess);
}
// Use Terminate to end an infinitely looping thread
thread = Thread::Create(params, [] {
while (true) {
Sleep(1ms);
}
});
result = Wait(thread.get(), false, 50ms);
REQUIRE(result == WaitResult::kTimeout);
thread->Terminate(-1);
result = Wait(thread.get(), false, 50ms);
REQUIRE(result == WaitResult::kSuccess);
SECTION("Use Terminate to end an infinitely looping thread") {
thread = Thread::Create(params, [] {
while (true) {
Sleep(1ms);
}
});
result = Wait(thread.get(), false, 50ms);
REQUIRE(result == WaitResult::kTimeout);
thread->Terminate(-1);
result = Wait(thread.get(), false, 50ms);
REQUIRE(result == WaitResult::kSuccess);
}
// Call Exit from inside an infinitely looping thread
thread = Thread::Create(params, [] {
while (true) {
SECTION("Call Exit from inside an infinitely looping thread") {
thread = Thread::Create(params, [] {
Thread::Exit(-1);
}
});
result = Wait(thread.get(), false, 50ms);
REQUIRE(result == WaitResult::kSuccess);
FAIL("Function must not return");
});
result = Wait(thread.get(), false, 50ms);
REQUIRE(result == WaitResult::kSuccess);
}
// Call timeout wait on self
result = Wait(Thread::GetCurrentThread(), false, 50ms);
REQUIRE(result == WaitResult::kTimeout);
SECTION("Call timeout wait on self") {
result = Wait(Thread::GetCurrentThread(), false, 50ms);
REQUIRE(result == WaitResult::kTimeout);
}
params.stack_size = 16 * 1024 * 1024;
thread = Thread::Create(params, [] {
while (true) {
SECTION("16kb stack size") {
params.stack_size = 16 * 1024 * 1024;
thread = Thread::Create(params, [] {
Thread::Exit(-1);
}
});
REQUIRE(thread != nullptr);
result = Wait(thread.get(), false, 50ms);
REQUIRE(result == WaitResult::kSuccess);
FAIL("Function must not return");
});
REQUIRE(thread != nullptr);
result = Wait(thread.get(), false, 50ms);
REQUIRE(result == WaitResult::kSuccess);
}
// TODO(bwrsandman): Test with different priorities
// TODO(bwrsandman): Test setting and getting thread affinity
}
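Note the semantic shift in this rework: Catch2 re-runs a TEST_CASE once per leaf SECTION, so params, thread, and result are reconstructed for every branch, where the old linear test reused them. A minimal sketch of that execution model, independent of the threading code and assuming the same Catch2 header:
TEST_CASE("sections rerun shared setup") {
  std::vector<int> v = {1, 2, 3};  // rebuilt before each SECTION below runs
  SECTION("pop") {
    v.pop_back();
    REQUIRE(v.size() == 2);
  }
  SECTION("push") {
    v.push_back(4);
    REQUIRE(v.size() == 4);  // the pop section did not leak into this run
  }
}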
TEST_CASE("Test Suspending Thread", "Thread") {
TEST_CASE("Test Suspending Thread", "[thread]") {
std::unique_ptr<Thread> thread;
WaitResult result;
Thread::CreationParameters params = {};
@ -888,7 +899,7 @@ TEST_CASE("Test Suspending Thread", "Thread") {
REQUIRE(result == threading::WaitResult::kSuccess);
}
TEST_CASE("Test Thread QueueUserCallback", "Thread") {
TEST_CASE("Test Thread QueueUserCallback", "[thread]") {
std::unique_ptr<Thread> thread;
WaitResult result;
Thread::CreationParameters params = {};

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2020 Ben Vanik. All rights reserved. *
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -16,16 +16,220 @@
namespace xe::base::test {
// TODO(gibbed): bit messy?
// TODO(gibbed): predicate variant?
#define TEST_EXAMPLE(func, left, right) REQUIRE(func(left) == right)
#define TEST_EXAMPLES_1(func, language, results) \
TEST_EXAMPLE(func, examples::k##language##Values[0], results.language[0])
#define TEST_EXAMPLES_2(func, language, results) \
TEST_EXAMPLE(func, examples::k##language##Values[0], results.language[0]); \
TEST_EXAMPLE(func, examples::k##language##Values[1], results.language[1])
#define TEST_EXAMPLES_3(func, language, results) \
TEST_EXAMPLE(func, examples::k##language##Values[0], results.language[0]); \
TEST_EXAMPLE(func, examples::k##language##Values[1], results.language[1]); \
TEST_EXAMPLE(func, examples::k##language##Values[2], results.language[2])
namespace examples {
// https://www.cl.cam.ac.uk/~mgk25/ucs/examples/quickbrown.txt
TEST_CASE("utf8::split", "UTF-8 Split") {
const size_t kDanishCount = 1;
const char* kDanishValues[kDanishCount] = {
u8"Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen Wolther "
u8"spillede på xylofon.",
};
#define TEST_LANGUAGE_EXAMPLES_Danish(func, results) \
TEST_EXAMPLES_1(func, Danish, results)
const size_t kGermanCount = 3;
const char* kGermanValues[kGermanCount] = {
u8"Falsches Üben von Xylophonmusik quält jeden größeren Zwerg",
u8"Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich",
u8"Heizölrückstoßabdämpfung",
};
#define TEST_LANGUAGE_EXAMPLES_German(func, results) \
TEST_EXAMPLES_2(func, German, results)
const size_t kGreekCount = 2;
const char* kGreekValues[kGreekCount] = {
u8"Γαζέες καὶ μυρτιὲς δὲν θὰ βρῶ πιὰ στὸ χρυσαφὶ ξέφωτο",
u8"Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία",
};
#define TEST_LANGUAGE_EXAMPLES_Greek(func, results) \
TEST_EXAMPLES_2(func, Greek, results)
const size_t kEnglishCount = 1;
const char* kEnglishValues[kEnglishCount] = {
u8"The quick brown fox jumps over the lazy dog",
};
#define TEST_LANGUAGE_EXAMPLES_English(func, results) \
TEST_EXAMPLES_1(func, English, results)
const size_t kSpanishCount = 1;
const char* kSpanishValues[kSpanishCount] = {
u8"El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, "
u8"añoraba a su querido cachorro.",
};
#define TEST_LANGUAGE_EXAMPLES_Spanish(func, results) \
TEST_EXAMPLES_1(func, Spanish, results)
const size_t kFrenchCount = 3;
const char* kFrenchValues[kFrenchCount] = {
u8"Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à "
u8"côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce qui "
u8"lui permet de penser à la cænogenèse de l'être dont il est question "
u8"dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui, "
u8"pense-t-il, diminue çà et là la qualité de son œuvre.",
u8"l'île exiguë\n"
u8"Où l'obèse jury mûr\n"
u8"Fête l'haï volapük,\n"
u8"Âne ex aéquo au whist,\n"
u8"Ôtez ce vœu déçu.",
u8"Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en canoë "
u8"au delà des îles, près du mälström où brûlent les novæ.",
};
#define TEST_LANGUAGE_EXAMPLES_French(func, results) \
TEST_EXAMPLES_3(func, French, results)
const size_t kIrishGaelicCount = 1;
const char* kIrishGaelicValues[kIrishGaelicCount] = {
u8"D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh",
};
#define TEST_LANGUAGE_EXAMPLES_IrishGaelic(func, results) \
TEST_EXAMPLES_1(func, IrishGaelic, results)
const size_t kHungarianCount = 1;
const char* kHungarianValues[kHungarianCount] = {
u8"Árvíztűrő tükörfúrógép",
};
#define TEST_LANGUAGE_EXAMPLES_Hungarian(func, results) \
TEST_EXAMPLES_1(func, Hungarian, results)
const size_t kIcelandicCount = 2;
const char* kIcelandicValues[kIcelandicCount] = {
u8"Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa",
u8"Sævör grét áðan því úlpan var ónýt",
};
#define TEST_LANGUAGE_EXAMPLES_Icelandic(func, results) \
TEST_EXAMPLES_2(func, Icelandic, results)
const size_t kJapaneseCount = 2;
const char* kJapaneseValues[kJapaneseCount] = {
u8"いろはにほへとちりぬるを\n"
u8"わかよたれそつねならむ\n"
u8"うゐのおくやまけふこえて\n"
u8"あさきゆめみしゑひもせす\n",
u8"イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム\n"
u8"ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン",
};
#define TEST_LANGUAGE_EXAMPLES_Japanese(func, results) \
TEST_EXAMPLES_2(func, Japanese, results)
const size_t kHebrewCount = 1;
const char* kHebrewValues[kHebrewCount] = {
u8"? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה",
};
#define TEST_LANGUAGE_EXAMPLES_Hebrew(func, results) \
TEST_EXAMPLES_1(func, Hebrew, results)
const size_t kPolishCount = 1;
const char* kPolishValues[kPolishCount] = {
u8"Pchnąć w tę łódź jeża lub ośm skrzyń fig",
};
#define TEST_LANGUAGE_EXAMPLES_Polish(func, results) \
TEST_EXAMPLES_1(func, Polish, results)
const size_t kRussianCount = 2;
const char* kRussianValues[kRussianCount] = {
u8"В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!",
u8"Съешь же ещё этих мягких французских булок да выпей чаю",
};
#define TEST_LANGUAGE_EXAMPLES_Russian(func, results) \
TEST_EXAMPLES_2(func, Russian, results)
const size_t kTurkishCount = 1;
const char* kTurkishValues[kTurkishCount] = {
u8"Pijamalı hasta, yağız şoföre çabucak güvendi.",
};
#define TEST_LANGUAGE_EXAMPLES_Turkish(func, results) \
TEST_EXAMPLES_1(func, Turkish, results)
#define TEST_LANGUAGE_EXAMPLES(func, results) \
TEST_LANGUAGE_EXAMPLES_Danish(func, results); \
TEST_LANGUAGE_EXAMPLES_German(func, results); \
TEST_LANGUAGE_EXAMPLES_Greek(func, results); \
TEST_LANGUAGE_EXAMPLES_English(func, results); \
TEST_LANGUAGE_EXAMPLES_Spanish(func, results); \
TEST_LANGUAGE_EXAMPLES_French(func, results); \
TEST_LANGUAGE_EXAMPLES_IrishGaelic(func, results); \
TEST_LANGUAGE_EXAMPLES_Hungarian(func, results); \
TEST_LANGUAGE_EXAMPLES_Icelandic(func, results); \
TEST_LANGUAGE_EXAMPLES_Japanese(func, results); \
TEST_LANGUAGE_EXAMPLES_Hebrew(func, results); \
TEST_LANGUAGE_EXAMPLES_Polish(func, results); \
TEST_LANGUAGE_EXAMPLES_Russian(func, results); \
TEST_LANGUAGE_EXAMPLES_Turkish(func, results)
} // namespace examples
#define TEST_EXAMPLE_RESULT(language) T language[examples::k##language##Count]
template <typename T>
struct example_results {
TEST_EXAMPLE_RESULT(Danish);
TEST_EXAMPLE_RESULT(German);
TEST_EXAMPLE_RESULT(Greek);
TEST_EXAMPLE_RESULT(English);
TEST_EXAMPLE_RESULT(Spanish);
TEST_EXAMPLE_RESULT(French);
TEST_EXAMPLE_RESULT(IrishGaelic);
TEST_EXAMPLE_RESULT(Hungarian);
TEST_EXAMPLE_RESULT(Icelandic);
TEST_EXAMPLE_RESULT(Japanese);
TEST_EXAMPLE_RESULT(Hebrew);
TEST_EXAMPLE_RESULT(Polish);
TEST_EXAMPLE_RESULT(Russian);
TEST_EXAMPLE_RESULT(Turkish);
};
#undef TEST_EXAMPLE_RESULT
TEST_CASE("UTF-8 Count", "[utf8]") {
example_results<size_t> results = {};
results.Danish[0] = 88;
results.German[0] = 58;
results.German[1] = 54;
results.Greek[0] = 52;
results.Greek[1] = 33;
results.English[0] = 43;
results.Spanish[0] = 99;
results.French[0] = 327;
results.French[1] = 93;
results.French[2] = 126;
results.IrishGaelic[0] = 68;
results.Hungarian[0] = 22;
results.Icelandic[0] = 50;
results.Icelandic[1] = 34;
results.Japanese[0] = 51;
results.Japanese[1] = 55;
results.Hebrew[0] = 52;
results.Polish[0] = 40;
results.Russian[0] = 54;
results.Russian[1] = 55;
results.Turkish[0] = 45;
TEST_LANGUAGE_EXAMPLES(utf8::count, results);
}
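The expected values are code-point counts, not byte counts, which is the whole reason utf8::count exists; the two only coincide for pure-ASCII input like the English pangram. A hypothetical illustration:
// u8"Heizölrückstoßabdämpfung" (the third German example) occupies 28 bytes,
// because ö, ü, ß and ä are two bytes each in UTF-8, but counts as 24 code
// points - count() walks the sequence instead of measuring storage.
const std::string_view german = u8"Heizölrückstoßabdämpfung";
// german.size()           == 28
// xe::utf8::count(german) == 24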
// TODO(gibbed): lower_ascii
// TODO(gibbed): upper_ascii
// TODO(gibbed): hash_fnv1a
// TODO(gibbed): hash_fnv1a_case
TEST_CASE("UTF-8 Split", "[utf8]") {
std::vector<std::string_view> parts;
// Danish
parts = utf8::split(
u8"Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen Wolther "
u8"spillede på xylofon.",
u8"æcå");
parts = utf8::split(examples::kDanishValues[0], u8"æcå");
REQUIRE(parts.size() == 4);
REQUIRE(parts[0] == u8"Quizdeltagerne spiste jordb");
REQUIRE(parts[1] == u8"r med fløde, mens ");
@ -33,43 +237,41 @@ TEST_CASE("utf8::split", "UTF-8 Split") {
REQUIRE(parts[3] == u8" xylofon.");
// German
parts = utf8::split(
u8"Falsches Üben von Xylophonmusik quält jeden größeren Zwerg\n"
u8"Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich\n"
u8"Heizölrückstoßabdämpfung",
u8"ßS");
REQUIRE(parts.size() == 4);
parts = utf8::split(examples::kGermanValues[0], u8"ßS");
REQUIRE(parts.size() == 2);
REQUIRE(parts[0] == u8"Falsches Üben von Xylophonmusik quält jeden grö");
REQUIRE(parts[1] ==
u8"eren Zwerg\nZwölf Boxkämpfer jagten Eva quer über den ");
REQUIRE(parts[2] == u8"ylter Deich\nHeizölrücksto");
REQUIRE(parts[3] == u8"abdämpfung");
REQUIRE(parts[1] == u8"eren Zwerg");
parts = utf8::split(examples::kGermanValues[1], u8"ßS");
REQUIRE(parts.size() == 2);
REQUIRE(parts[0] == u8"Zwölf Boxkämpfer jagten Eva quer über den ");
REQUIRE(parts[1] == u8"ylter Deich");
parts = utf8::split(examples::kGermanValues[2], u8"ßS");
REQUIRE(parts.size() == 2);
REQUIRE(parts[0] == u8"Heizölrücksto");
REQUIRE(parts[1] == u8"abdämpfung");
// Greek
parts = utf8::split(
u8"Γαζέες καὶ μυρτιὲς δὲν θὰ βρῶ πιὰ στὸ χρυσαφὶ ξέφωτο\n"
u8"Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία",
u8"πφ");
REQUIRE(parts.size() == 6);
parts = utf8::split(examples::kGreekValues[0], u8"πφ");
REQUIRE(parts.size() == 4);
REQUIRE(parts[0] == u8"Γαζέες καὶ μυρτιὲς δὲν θὰ βρῶ ");
REQUIRE(parts[1] == u8"ιὰ στὸ χρυσα");
REQUIRE(parts[2] == u8"ὶ ξέ");
REQUIRE(parts[3] == u8"ωτο\nΞεσκε");
REQUIRE(parts[4] == u8"άζω τὴν ψυχο");
REQUIRE(parts[5] == u8"θόρα βδελυγμία");
REQUIRE(parts[3] == u8"ωτο");
parts = utf8::split(examples::kGreekValues[1], u8"πφ");
REQUIRE(parts.size() == 3);
REQUIRE(parts[0] == u8"Ξεσκε");
REQUIRE(parts[1] == u8"άζω τὴν ψυχο");
REQUIRE(parts[2] == u8"θόρα βδελυγμία");
// English
parts = utf8::split("The quick brown fox jumps over the lazy dog", "xy");
parts = utf8::split(examples::kEnglishValues[0], "xy");
REQUIRE(parts.size() == 3);
REQUIRE(parts[0] == u8"The quick brown fo");
REQUIRE(parts[1] == u8" jumps over the laz");
REQUIRE(parts[2] == u8" dog");
// Spanish
parts = utf8::split(
u8"El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y "
u8"frío, añoraba a su querido cachorro.",
u8"ójd");
parts = utf8::split(examples::kSpanishValues[0], u8"ójd");
REQUIRE(parts.size() == 4);
REQUIRE(parts[0] == u8"El pingüino Wenceslao hizo kil");
REQUIRE(parts[1] == u8"metros ba");
@ -88,52 +290,254 @@ TEST_CASE("utf8::split", "UTF-8 Split") {
// TODO(gibbed): Turkish
}
TEST_CASE("utf8::equal_z", "UTF-8 Equal Z") {
TEST_CASE("UTF-8 Equal Z", "[utf8]") {
REQUIRE(utf8::equal_z(u8"foo", u8"foo\0"));
REQUIRE_FALSE(utf8::equal_z(u8"bar", u8"baz\0"));
}
TEST_CASE("utf8::equal_case_z", "UTF-8 Equal Case Z") {
REQUIRE(utf8::equal_z(u8"foo", u8"foo\0"));
REQUIRE_FALSE(utf8::equal_z(u8"bar", u8"baz\0"));
TEST_CASE("UTF-8 Equal Case", "[utf8]") {
REQUIRE(utf8::equal_case(u8"foo", u8"foo\0"));
REQUIRE_FALSE(utf8::equal_case(u8"bar", u8"baz\0"));
}
TEST_CASE("utf8::join_paths", "UTF-8 Join Paths") {
REQUIRE(utf8::join_paths({u8"X:", u8"foo", u8"bar", u8"baz", u8"qux"},
'\\') == "X:\\foo\\bar\\baz\\qux");
REQUIRE(utf8::join_paths({u8"X:", u8"foo", u8"bar", u8"baz", u8"qux"}, '/') ==
"X:/foo/bar/baz/qux");
TEST_CASE("UTF-8 Equal Case Z", "[utf8]") {
REQUIRE(utf8::equal_case_z(u8"foo", u8"foo\0"));
REQUIRE_FALSE(utf8::equal_case_z(u8"bar", u8"baz\0"));
}
TEST_CASE("utf8::fix_path_separators", "UTF-8 Fix Path Separators") {
REQUIRE(utf8::fix_path_separators("X:\\foo/bar\\baz/qux", '\\') ==
"X:\\foo\\bar\\baz\\qux");
REQUIRE(utf8::fix_path_separators("X:\\foo/bar\\baz/qux", '/') ==
"X:/foo/bar/baz/qux");
// TODO(gibbed): find_any_of
// TODO(gibbed): find_any_of_case
// TODO(gibbed): find_first_of
// TODO(gibbed): find_first_of_case
// TODO(gibbed): starts_with
// TODO(gibbed): starts_with_case
// TODO(gibbed): ends_with
// TODO(gibbed): ends_with_case
// TODO(gibbed): split_path
#define TEST_PATH(func, input, output) \
do { \
std::string input_value = input; \
std::string output_value = output; \
REQUIRE(func(input_value, '/') == output_value); \
std::replace(input_value.begin(), input_value.end(), '/', '\\'); \
std::replace(output_value.begin(), output_value.end(), '/', '\\'); \
REQUIRE(func(input_value, '\\') == output_value); \
} while (0)
#define TEST_PATH_RAW(func, input, output) \
do { \
std::string output_value = output; \
REQUIRE(func(input, '/') == output_value); \
std::replace(output_value.begin(), output_value.end(), '/', '\\'); \
REQUIRE(func(input, '\\') == output_value); \
} while (0)
#define TEST_PATHS(func, output, ...) \
do { \
std::vector<std::string> input_values = {__VA_ARGS__}; \
std::string output_value = output; \
REQUIRE(func(input_values, '/') == output_value); \
for (auto it = input_values.begin(); it != input_values.end(); ++it) { \
std::replace((*it).begin(), (*it).end(), '/', '\\'); \
} \
std::replace(output_value.begin(), output_value.end(), '/', '\\'); \
REQUIRE(func(input_values, '\\') == output_value); \
} while (0)
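The do { ... } while (0) wrapper is what lets each multi-statement helper expand as a single statement, so the macros compose safely with if/else at call sites. A minimal illustration of the failure mode it prevents (hypothetical functions):
void stmt_a();
void stmt_b();
#define UNWRAPPED_PAIR() \
  stmt_a();              \
  stmt_b()
#define WRAPPED_PAIR() \
  do {                 \
    stmt_a();          \
    stmt_b();          \
  } while (0)

void Use(bool cond) {
  // if (cond) UNWRAPPED_PAIR();  // stmt_b() would escape the if-guard
  if (cond) WRAPPED_PAIR();  // expands to exactly one guarded statement
}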
TEST_CASE("UTF-8 Join Paths", "[utf8]") {
TEST_PATHS(utf8::join_paths, u8"");
TEST_PATHS(utf8::join_paths, u8"foo", u8"foo");
TEST_PATHS(utf8::join_paths, u8"foo/bar", u8"foo", u8"bar");
TEST_PATHS(utf8::join_paths, "X:/foo/bar/baz/qux", u8"X:", u8"foo", u8"bar",
u8"baz", u8"qux");
}
TEST_CASE("utf8::find_name_from_path", "UTF-8 Find Name From Path") {
REQUIRE(utf8::find_name_from_path("X:\\foo\\bar\\baz\\qux", '\\') == "qux");
REQUIRE(utf8::find_name_from_path("X:/foo/bar/baz/qux", '/') == "qux");
// TODO(gibbed): join_guest_paths
TEST_CASE("UTF-8 Fix Path Separators", "[utf8]") {
TEST_PATH_RAW(utf8::fix_path_separators, "", "");
TEST_PATH_RAW(utf8::fix_path_separators, "\\", "/");
TEST_PATH_RAW(utf8::fix_path_separators, "/", "/");
TEST_PATH_RAW(utf8::fix_path_separators, "\\foo", "/foo");
TEST_PATH_RAW(utf8::fix_path_separators, "\\foo/", "/foo/");
TEST_PATH_RAW(utf8::fix_path_separators, "/foo", "/foo");
TEST_PATH_RAW(utf8::fix_path_separators, "\\foo/bar\\baz/qux",
"/foo/bar/baz/qux");
TEST_PATH_RAW(utf8::fix_path_separators, "\\\\foo//bar\\\\baz//qux",
"/foo/bar/baz/qux");
TEST_PATH_RAW(utf8::fix_path_separators, "foo", "foo");
TEST_PATH_RAW(utf8::fix_path_separators, "foo/", "foo/");
TEST_PATH_RAW(utf8::fix_path_separators, "foo/bar\\baz/qux",
"foo/bar/baz/qux");
TEST_PATH_RAW(utf8::fix_path_separators, "foo//bar\\\\baz//qux",
"foo/bar/baz/qux");
TEST_PATH_RAW(utf8::fix_path_separators, "X:", "X:");
TEST_PATH_RAW(utf8::fix_path_separators, "X:\\", "X:/");
TEST_PATH_RAW(utf8::fix_path_separators, "X:/", "X:/");
TEST_PATH_RAW(utf8::fix_path_separators, "X:\\foo", "X:/foo");
TEST_PATH_RAW(utf8::fix_path_separators, "X:\\foo/", "X:/foo/");
TEST_PATH_RAW(utf8::fix_path_separators, "X:/foo", "X:/foo");
TEST_PATH_RAW(utf8::fix_path_separators, "X:\\foo/bar\\baz/qux",
"X:/foo/bar/baz/qux");
TEST_PATH_RAW(utf8::fix_path_separators, "X:\\\\foo//bar\\\\baz//qux",
"X:/foo/bar/baz/qux");
}
TEST_CASE("utf8::find_base_path", "UTF-8 Find Base Path") {
REQUIRE(utf8::find_base_path("X:\\foo\\bar\\baz\\qux", '\\') ==
"X:\\foo\\bar\\baz");
REQUIRE(utf8::find_base_path("X:/foo/bar/baz/qux", '/') == "X:/foo/bar/baz");
// TODO(gibbed): fix_guest_path_separators
TEST_CASE("UTF-8 Find Name From Path", "[utf8]") {
TEST_PATH(utf8::find_name_from_path, "/", "");
TEST_PATH(utf8::find_name_from_path, "foo/bar/baz/qux/", "qux");
TEST_PATH(utf8::find_name_from_path, "foo/bar/baz/qux.txt", "qux.txt");
TEST_PATH(utf8::find_name_from_path, "ほげ/ぴよ/ふが/ほげら/ほげほげ/",
"ほげほげ");
TEST_PATH(utf8::find_name_from_path, "ほげ/ぴよ/ふが/ほげら/ほげほげ.txt",
"ほげほげ.txt");
TEST_PATH(utf8::find_name_from_path, "/foo/bar/baz/qux.txt", "qux.txt");
TEST_PATH(utf8::find_name_from_path, "/ほげ/ぴよ/ふが/ほげら/ほげほげ/",
"ほげほげ");
TEST_PATH(utf8::find_name_from_path, "/ほげ/ぴよ/ふが/ほげら/ほげほげ.txt",
"ほげほげ.txt");
TEST_PATH(utf8::find_name_from_path, "X:/foo/bar/baz/qux.txt", "qux.txt");
TEST_PATH(utf8::find_name_from_path, "X:/ほげ/ぴよ/ふが/ほげら/ほげほげ/",
"ほげほげ");
TEST_PATH(utf8::find_name_from_path, "X:/ほげ/ぴよ/ふが/ほげら/ほげほげ.txt",
"ほげほげ.txt");
TEST_PATH(utf8::find_name_from_path, "X:/ほげ/ぴよ/ふが/ほげら.ほげほげ",
"ほげら.ほげほげ");
}
TEST_CASE("utf8::canonicalize_path", "UTF-8 Canonicalize Path") {
REQUIRE(utf8::canonicalize_path("X:\\foo\\bar\\baz\\qux", '\\') ==
"X:\\foo\\bar\\baz\\qux");
REQUIRE(utf8::canonicalize_path("X:\\foo\\.\\baz\\qux", '\\') ==
"X:\\foo\\baz\\qux");
REQUIRE(utf8::canonicalize_path("X:\\foo\\..\\baz\\qux", '\\') ==
"X:\\baz\\qux");
REQUIRE(utf8::canonicalize_path("X:\\.\\bar\\baz\\qux", '\\') ==
"X:\\bar\\baz\\qux");
REQUIRE(utf8::canonicalize_path("X:\\..\\bar\\baz\\qux", '\\') ==
"X:\\bar\\baz\\qux");
// TODO(gibbed): find_name_from_guest_path
TEST_CASE("UTF-8 Find Base Name From Path", "[utf8]") {
TEST_PATH(utf8::find_base_name_from_path, "foo/bar/baz/qux.txt", "qux");
TEST_PATH(utf8::find_base_name_from_path, "foo/bar/baz/qux/", "qux");
TEST_PATH(utf8::find_base_name_from_path,
"ほげ/ぴよ/ふが/ほげら/ほげほげ.txt", "ほげほげ");
TEST_PATH(utf8::find_base_name_from_path, "ほげ/ぴよ/ふが/ほげら/ほげほげ/",
"ほげほげ");
TEST_PATH(utf8::find_base_name_from_path, "ほげ/ぴよ/ふが/ほげら.ほげほげ",
"ほげら");
TEST_PATH(utf8::find_base_name_from_path, "/foo/bar/baz/qux.txt", "qux");
TEST_PATH(utf8::find_base_name_from_path, "/foo/bar/baz/qux/", "qux");
TEST_PATH(utf8::find_base_name_from_path,
"/ほげ/ぴよ/ふが/ほげら/ほげほげ.txt", "ほげほげ");
TEST_PATH(utf8::find_base_name_from_path, "/ほげ/ぴよ/ふが/ほげら/ほげほげ/",
"ほげほげ");
TEST_PATH(utf8::find_base_name_from_path, "/ほげ/ぴよ/ふが/ほげら.ほげほげ",
"ほげら");
TEST_PATH(utf8::find_base_name_from_path, "X:/foo/bar/baz/qux.txt", "qux");
TEST_PATH(utf8::find_base_name_from_path, "X:/foo/bar/baz/qux/", "qux");
TEST_PATH(utf8::find_base_name_from_path,
"X:/ほげ/ぴよ/ふが/ほげら/ほげほげ.txt", "ほげほげ");
TEST_PATH(utf8::find_base_name_from_path,
"X:/ほげ/ぴよ/ふが/ほげら/ほげほげ/", "ほげほげ");
TEST_PATH(utf8::find_base_name_from_path, "X:/ほげ/ぴよ/ふが/ほげら.ほげほげ",
"ほげら");
}
// TODO(gibbed): find_base_name_from_guest_path
TEST_CASE("UTF-8 Find Base Path", "[utf8]") {
TEST_PATH(utf8::find_base_path, "", "");
TEST_PATH(utf8::find_base_path, "/", "");
TEST_PATH(utf8::find_base_path, "//", "");
TEST_PATH(utf8::find_base_path, "/foo", "");
TEST_PATH(utf8::find_base_path, "/foo/", "");
TEST_PATH(utf8::find_base_path, "/foo/bar", "/foo");
TEST_PATH(utf8::find_base_path, "/foo/bar/", "/foo");
TEST_PATH(utf8::find_base_path, "/foo/bar/baz/qux", "/foo/bar/baz");
TEST_PATH(utf8::find_base_path, "/foo/bar/baz/qux/", "/foo/bar/baz");
TEST_PATH(utf8::find_base_path, "/ほげ/ぴよ/ふが/ほげら/ほげほげ",
"/ほげ/ぴよ/ふが/ほげら");
TEST_PATH(utf8::find_base_path, "/ほげ/ぴよ/ふが/ほげら/ほげほげ/",
"/ほげ/ぴよ/ふが/ほげら");
TEST_PATH(utf8::find_base_path, "foo", "");
TEST_PATH(utf8::find_base_path, "foo/", "");
TEST_PATH(utf8::find_base_path, "foo/bar", "foo");
TEST_PATH(utf8::find_base_path, "foo/bar/", "foo");
TEST_PATH(utf8::find_base_path, "foo/bar/baz/qux", "foo/bar/baz");
TEST_PATH(utf8::find_base_path, "foo/bar/baz/qux/", "foo/bar/baz");
TEST_PATH(utf8::find_base_path, "ほげ/ぴよ/ふが/ほげら/ほげほげ",
"ほげ/ぴよ/ふが/ほげら");
TEST_PATH(utf8::find_base_path, "ほげ/ぴよ/ふが/ほげら/ほげほげ/",
"ほげ/ぴよ/ふが/ほげら");
TEST_PATH(utf8::find_base_path, "X:", "");
TEST_PATH(utf8::find_base_path, "X:/", "");
TEST_PATH(utf8::find_base_path, "X:/foo", "X:");
TEST_PATH(utf8::find_base_path, "X:/foo/", "X:");
TEST_PATH(utf8::find_base_path, "X:/foo/bar", "X:/foo");
TEST_PATH(utf8::find_base_path, "X:/foo/bar/", "X:/foo");
TEST_PATH(utf8::find_base_path, "X:/foo/bar/baz/qux", "X:/foo/bar/baz");
TEST_PATH(utf8::find_base_path, "X:/foo/bar/baz/qux/", "X:/foo/bar/baz");
TEST_PATH(utf8::find_base_path, "X:/ほげ/ぴよ/ふが/ほげら/ほげほげ",
"X:/ほげ/ぴよ/ふが/ほげら");
TEST_PATH(utf8::find_base_path, "X:/ほげ/ぴよ/ふが/ほげら/ほげほげ/",
"X:/ほげ/ぴよ/ふが/ほげら");
}
// TODO(gibbed): find_base_guest_path
TEST_CASE("UTF-8 Canonicalize Path", "[utf8]") {
TEST_PATH(utf8::canonicalize_path, "foo/bar/baz/qux", "foo/bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "foo/bar/baz/qux/", "foo/bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "foo/./baz/qux", "foo/baz/qux");
TEST_PATH(utf8::canonicalize_path, "foo/./baz/qux/", "foo/baz/qux");
TEST_PATH(utf8::canonicalize_path, "foo/../baz/qux", "baz/qux");
TEST_PATH(utf8::canonicalize_path, "foo/../baz/qux/", "baz/qux");
TEST_PATH(utf8::canonicalize_path, "foo/./baz/../qux", "foo/qux");
TEST_PATH(utf8::canonicalize_path, "foo/./baz/../qux/", "foo/qux");
TEST_PATH(utf8::canonicalize_path, "foo/./../baz/qux", "baz/qux");
TEST_PATH(utf8::canonicalize_path, "foo/./../baz/qux/", "baz/qux");
TEST_PATH(utf8::canonicalize_path, "./bar/baz/qux", "bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "./bar/baz/qux/", "bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "../bar/baz/qux", "bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "../bar/baz/qux/", "bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "ほげ/ぴよ/./ふが/../ほげら/ほげほげ",
"ほげ/ぴよ/ほげら/ほげほげ");
TEST_PATH(utf8::canonicalize_path, "ほげ/ぴよ/./ふが/../ほげら/ほげほげ/",
"ほげ/ぴよ/ほげら/ほげほげ");
TEST_PATH(utf8::canonicalize_path, "/foo/bar/baz/qux", "/foo/bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "/foo/bar/baz/qux/", "/foo/bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "/foo/./baz/qux", "/foo/baz/qux");
TEST_PATH(utf8::canonicalize_path, "/foo/./baz/qux/", "/foo/baz/qux");
TEST_PATH(utf8::canonicalize_path, "/foo/../baz/qux", "/baz/qux");
TEST_PATH(utf8::canonicalize_path, "/foo/../baz/qux/", "/baz/qux");
TEST_PATH(utf8::canonicalize_path, "/foo/./baz/../qux", "/foo/qux");
TEST_PATH(utf8::canonicalize_path, "/foo/./baz/../qux/", "/foo/qux");
TEST_PATH(utf8::canonicalize_path, "/foo/./../baz/qux", "/baz/qux");
TEST_PATH(utf8::canonicalize_path, "/foo/./../baz/qux/", "/baz/qux");
TEST_PATH(utf8::canonicalize_path, "/./bar/baz/qux", "/bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "/./bar/baz/qux/", "/bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "/../bar/baz/qux", "/bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "/../bar/baz/qux/", "/bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "/ほげ/ぴよ/./ふが/../ほげら/ほげほげ",
"/ほげ/ぴよ/ほげら/ほげほげ");
TEST_PATH(utf8::canonicalize_path, "/ほげ/ぴよ/./ふが/../ほげら/ほげほげ/",
"/ほげ/ぴよ/ほげら/ほげほげ");
TEST_PATH(utf8::canonicalize_path, "X:/foo/bar/baz/qux",
"X:/foo/bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "X:/foo/bar/baz/qux/",
"X:/foo/bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "X:/foo/./baz/qux", "X:/foo/baz/qux");
TEST_PATH(utf8::canonicalize_path, "X:/foo/./baz/qux/", "X:/foo/baz/qux");
TEST_PATH(utf8::canonicalize_path, "X:/foo/../baz/qux", "X:/baz/qux");
TEST_PATH(utf8::canonicalize_path, "X:/foo/../baz/qux/", "X:/baz/qux");
TEST_PATH(utf8::canonicalize_path, "X:/foo/./baz/../qux", "X:/foo/qux");
TEST_PATH(utf8::canonicalize_path, "X:/foo/./baz/../qux/", "X:/foo/qux");
TEST_PATH(utf8::canonicalize_path, "X:/foo/./../baz/qux", "X:/baz/qux");
TEST_PATH(utf8::canonicalize_path, "X:/foo/./../baz/qux/", "X:/baz/qux");
TEST_PATH(utf8::canonicalize_path, "X:/./bar/baz/qux", "X:/bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "X:/./bar/baz/qux/", "X:/bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "X:/../bar/baz/qux", "X:/bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "X:/../bar/baz/qux/", "X:/bar/baz/qux");
TEST_PATH(utf8::canonicalize_path, "X:/ほげ/ぴよ/./ふが/../ほげら/ほげほげ",
"X:/ほげ/ぴよ/ほげら/ほげほげ");
TEST_PATH(utf8::canonicalize_path, "X:/ほげ/ぴよ/./ふが/../ほげら/ほげほげ/",
"X:/ほげ/ぴよ/ほげら/ほげほげ");
}
// TODO(gibbed): canonicalize_guest_path
} // namespace xe::base::test

View File

@ -155,29 +155,36 @@ bool SetTlsValue(TlsHandle handle, uintptr_t value) {
class PosixHighResolutionTimer : public HighResolutionTimer {
public:
explicit PosixHighResolutionTimer(std::function<void()> callback)
: callback_(std::move(callback)), timer_(nullptr) {}
: callback_(std::move(callback)), valid_(false) {}
~PosixHighResolutionTimer() override {
if (timer_) timer_delete(timer_);
if (valid_) timer_delete(timer_);
}
bool Initialize(std::chrono::milliseconds period) {
if (valid_) {
// Double initialization
assert_always();
return false;
}
// Create timer
sigevent sev{};
sev.sigev_notify = SIGEV_SIGNAL;
sev.sigev_signo = GetSystemSignal(SignalType::kHighResolutionTimer);
sev.sigev_value.sival_ptr = (void*)&callback_;
if (timer_create(CLOCK_REALTIME, &sev, &timer_) == -1) return false;
if (timer_create(CLOCK_MONOTONIC, &sev, &timer_) == -1) return false;
// Start timer
itimerspec its{};
its.it_value = DurationToTimeSpec(period);
its.it_interval = its.it_value;
return timer_settime(timer_, 0, &its, nullptr) != -1;
valid_ = timer_settime(timer_, 0, &its, nullptr) != -1;
return valid_;
}
private:
std::function<void()> callback_;
timer_t timer_;
bool valid_; // All values of timer_t are legal, so an explicit validity flag is needed
};
std::unique_ptr<HighResolutionTimer> HighResolutionTimer::CreateRepeating(
@ -187,7 +194,7 @@ std::unique_ptr<HighResolutionTimer> HighResolutionTimer::CreateRepeating(
if (!timer->Initialize(period)) {
return nullptr;
}
return std::unique_ptr<HighResolutionTimer>(timer.release());
return std::move(timer);
}
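The replaced release()/re-wrap dance and the new std::move(timer) do the same thing, but the latter goes through unique_ptr's converting move constructor (derived to base); the explicit move also keeps older compilers happy, where implicit move on a converting return was not yet guaranteed. The shape in isolation:
#include <memory>

struct Base { virtual ~Base() = default; };
struct Derived : Base {};

std::unique_ptr<Base> Make() {
  auto p = std::make_unique<Derived>();
  return std::move(p);  // unique_ptr<Derived> -> unique_ptr<Base>
}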
class PosixConditionBase {
@ -419,7 +426,7 @@ class PosixCondition<Timer> : public PosixConditionBase {
sev.sigev_notify = SIGEV_SIGNAL;
sev.sigev_signo = GetSystemSignal(SignalType::kTimer);
sev.sigev_value.sival_ptr = this;
if (timer_create(CLOCK_REALTIME, &sev, &timer_) == -1) return false;
if (timer_create(CLOCK_MONOTONIC, &sev, &timer_) == -1) return false;
}
// Start timer
@ -728,31 +735,44 @@ class PosixCondition<Thread> : public PosixConditionBase {
}
void Terminate(int exit_code) {
bool is_current_thread = pthread_self() == thread_;
{
std::unique_lock<std::mutex> lock(state_mutex_);
if (state_ == State::kFinished) {
if (is_current_thread) {
// This is really bad. Some thread must have called Terminate() on us
// just before we decided to terminate ourselves
assert_always();
for (;;) {
// Wait for pthread_cancel() to actually happen.
}
}
return;
}
state_ = State::kFinished;
}
std::lock_guard<std::mutex> lock(mutex_);
// Sometimes the thread can call terminate twice before stopping
if (thread_ == 0) return;
auto thread = thread_;
exit_code_ = exit_code;
signaled_ = true;
cond_.notify_all();
{
std::lock_guard<std::mutex> lock(mutex_);
exit_code_ = exit_code;
signaled_ = true;
cond_.notify_all();
}
if (is_current_thread) {
pthread_exit(reinterpret_cast<void*>(exit_code));
} else {
#ifdef XE_PLATFORM_ANDROID
if (pthread_kill(thread, GetSystemSignal(SignalType::kThreadTerminate)) !=
0) {
assert_always();
}
if (pthread_kill(thread_,
GetSystemSignal(SignalType::kThreadTerminate)) != 0) {
assert_always();
}
#else
if (pthread_cancel(thread) != 0) {
assert_always();
}
if (pthread_cancel(thread_) != 0) {
assert_always();
}
#endif
}
}
void WaitStarted() const {
@ -778,7 +798,6 @@ class PosixCondition<Thread> : public PosixConditionBase {
inline void post_execution() override {
if (thread_) {
pthread_join(thread_, nullptr);
thread_ = 0;
}
}
pthread_t thread_;
@ -1115,13 +1134,12 @@ Thread* Thread::GetCurrentThread() {
void Thread::Exit(int exit_code) {
if (current_thread_) {
current_thread_->Terminate(exit_code);
// Sometimes the current thread keeps running after being cancelled.
// Prevent other calls from this thread from using current_thread_.
current_thread_ = nullptr;
} else {
// Should only happen with the main thread
pthread_exit(reinterpret_cast<void*>(exit_code));
}
// Function must not return
assert_always();
}
void set_name(const std::string_view name) {

View File

@ -111,30 +111,34 @@ bool SetTlsValue(TlsHandle handle, uintptr_t value) {
class Win32HighResolutionTimer : public HighResolutionTimer {
public:
Win32HighResolutionTimer(std::function<void()> callback)
: callback_(callback) {}
: callback_(std::move(callback)) {}
~Win32HighResolutionTimer() override {
if (handle_) {
if (valid_) {
DeleteTimerQueueTimer(nullptr, handle_, INVALID_HANDLE_VALUE);
handle_ = nullptr;
}
}
bool Initialize(std::chrono::milliseconds period) {
return CreateTimerQueueTimer(
&handle_, nullptr,
[](PVOID param, BOOLEAN timer_or_wait_fired) {
auto timer =
reinterpret_cast<Win32HighResolutionTimer*>(param);
timer->callback_();
},
this, 0, DWORD(period.count()), WT_EXECUTEINTIMERTHREAD)
? true
: false;
if (valid_) {
// Double initialization
assert_always();
return false;
}
valid_ = !!CreateTimerQueueTimer(
&handle_, nullptr,
[](PVOID param, BOOLEAN timer_or_wait_fired) {
auto timer = reinterpret_cast<Win32HighResolutionTimer*>(param);
timer->callback_();
},
this, 0, DWORD(period.count()), WT_EXECUTEINTIMERTHREAD);
return valid_;
}
private:
HANDLE handle_ = nullptr;
std::function<void()> callback_;
HANDLE handle_ = nullptr;
bool valid_ = false; // The documentation defines no invalid HANDLE sentinel, so track validity separately
};
std::unique_ptr<HighResolutionTimer> HighResolutionTimer::CreateRepeating(
@ -143,7 +147,7 @@ std::unique_ptr<HighResolutionTimer> HighResolutionTimer::CreateRepeating(
if (!timer->Initialize(period)) {
return nullptr;
}
return std::unique_ptr<HighResolutionTimer>(timer.release());
return std::move(timer);
}
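Both platform implementations now share the same contract: Initialize() may fail, CreateRepeating() surfaces that as nullptr, and destruction cancels the timer. A hypothetical call site against the interface above:
auto timer = xe::threading::HighResolutionTimer::CreateRepeating(
    std::chrono::milliseconds(10), [] { /* runs on a timer thread */ });
if (!timer) {
  // Initialization failed; no timer was armed and nothing needs cleanup.
}
// Letting the unique_ptr go out of scope stops and deletes the timer.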
template <typename T>

View File

@ -19,9 +19,7 @@
namespace utfcpp = utf8;
using citer = std::string_view::const_iterator;
using criter = std::string_view::const_reverse_iterator;
using utf8_citer = utfcpp::iterator<std::string_view::const_iterator>;
using utf8_criter = utfcpp::iterator<std::string_view::const_reverse_iterator>;
namespace xe::utf8 {
@ -54,25 +52,10 @@ std::pair<utf8_citer, utf8_citer> make_citer(const utf8_citer begin,
utf8_citer(end.base(), begin.base(), end.base())};
}
std::pair<utf8_criter, utf8_criter> make_criter(const std::string_view view) {
return {utf8_criter(view.crbegin(), view.crbegin(), view.crend()),
utf8_criter(view.crend(), view.crbegin(), view.crend())};
}
std::pair<utf8_criter, utf8_criter> make_criter(const utf8_criter begin,
const utf8_criter end) {
return {utf8_criter(begin.base(), begin.base(), end.base()),
utf8_criter(end.base(), begin.base(), end.base())};
}
size_t byte_length(utf8_citer begin, utf8_citer end) {
return size_t(std::distance(begin.base(), end.base()));
}
size_t byte_length(utf8_criter begin, utf8_criter end) {
return size_t(std::distance(begin.base(), end.base()));
}
size_t count(const std::string_view view) {
return size_t(utfcpp::distance(view.cbegin(), view.cend()));
}
@ -435,21 +418,23 @@ bool ends_with(const std::string_view haystack, const std::string_view needle) {
return false;
}
auto [haystack_begin, haystack_end] = make_criter(haystack);
auto [needle_begin, needle_end] = make_criter(needle);
auto [haystack_begin, haystack_end] = make_citer(haystack);
auto [needle_begin, needle_end] = make_citer(needle);
auto needle_count = count(needle);
auto it = haystack_begin;
auto it = haystack_end;
auto end = it;
for (size_t i = 0; i < needle_count; ++i) {
if (end == haystack_end) {
--it;
for (size_t i = 1; i < needle_count; ++i) {
if (it == haystack_begin) {
// not enough room in target for search
return false;
}
++end;
--it;
}
auto [sub_start, sub_end] = make_criter(it, end);
auto [sub_start, sub_end] = make_citer(it, end);
return std::equal(needle_begin, needle_end, sub_start, sub_end);
}
@ -461,21 +446,23 @@ bool ends_with_case(const std::string_view haystack,
return false;
}
auto [haystack_begin, haystack_end] = make_criter(haystack);
auto [needle_begin, needle_end] = make_criter(needle);
auto [haystack_begin, haystack_end] = make_citer(haystack);
auto [needle_begin, needle_end] = make_citer(needle);
auto needle_count = count(needle);
auto it = haystack_begin;
auto it = haystack_end;
auto end = it;
--it;
for (size_t i = 0; i < needle_count; ++i) {
if (end == haystack_end) {
if (it == haystack_begin) {
// not enough room in target for search
return false;
}
++end;
--it;
}
auto [sub_start, sub_end] = make_criter(it, end);
auto [sub_start, sub_end] = make_citer(it, end);
return std::equal(needle_begin, needle_end, sub_start, sub_end,
equal_ascii_case);
}
@ -492,7 +479,9 @@ std::string join_paths(const std::string_view left_path,
return std::string(left_path);
}
auto [it, end] = make_criter(left_path);
utf8_citer it;
std::tie(std::ignore, it) = make_citer(left_path);
--it;
std::string result = std::string(left_path);
if (*it != static_cast<uint32_t>(separator)) {
@ -501,7 +490,20 @@ std::string join_paths(const std::string_view left_path,
return result + std::string(right_path);
}
std::string join_paths(std::vector<std::string_view> paths,
std::string join_paths(const std::vector<std::string>& paths,
char32_t separator) {
std::string result;
auto it = paths.cbegin();
if (it != paths.cend()) {
result = *it++;
for (; it != paths.cend(); ++it) {
result = join_paths(result, *it, separator);
}
}
return result;
}
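With this overload and its string_view sibling just below, both common vector types join without converting element-by-element at the call site. A hypothetical sketch:
std::vector<std::string> owned = {"X:", "foo", "bar"};
std::vector<std::string_view> views = {"X:", "foo", "bar"};
// Both calls produce "X:/foo/bar"; neither requires building a temporary
// vector of the other element type first.
auto a = xe::utf8::join_paths(owned, '/');
auto b = xe::utf8::join_paths(views, '/');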
std::string join_paths(const std::vector<std::string_view>& paths,
char32_t separator) {
std::string result;
auto it = paths.cbegin();
@ -528,8 +530,20 @@ std::string fix_path_separators(const std::string_view path,
std::string result;
auto it = path_begin;
auto last = it;
auto is_separator = [old_separator, new_separator](char32_t c) {
return c == uint32_t(old_separator) || c == uint32_t(new_separator);
};
// Begins with a separator
if (is_separator(*it)) {
utfcpp::append(new_separator, result);
++it;
last = it;
}
for (;;) {
it = std::find(it, path_end, uint32_t(old_separator));
it = std::find_if(it, path_end, is_separator);
if (it == path_end) {
break;
}
@ -563,25 +577,40 @@ std::string find_name_from_path(const std::string_view path,
return std::string();
}
auto [begin, end] = make_criter(path);
auto [begin, end] = make_citer(path);
auto it = begin;
auto it = end;
--it;
// path is padded with separator
size_t padding = 0;
if (*it == uint32_t(separator)) {
++it;
if (it == begin) {
return std::string();
}
--it;
padding = 1;
}
if (it == end) {
// path is just separator
if (it == begin) {
return std::string();
}
it = std::find(it, end, uint32_t(separator));
if (it == end) {
// search for separator
while (it != begin) {
if (*it == uint32_t(separator)) {
break;
}
--it;
}
// no separator -- copy entire string (except trailing separator)
if (it == begin) {
return std::string(path.substr(0, path.size() - padding));
}
auto length = byte_length(begin, it);
auto length = byte_length(std::next(it), end);
auto offset = path.length() - length;
return std::string(path.substr(offset, length - padding));
}
@ -593,20 +622,25 @@ std::string find_base_name_from_path(const std::string_view path,
return std::string();
}
auto [begin, end] = make_criter(name);
auto [begin, end] = make_citer(name);
auto it = std::find(begin, end, uint32_t('.'));
if (it == end) {
auto it = end;
--it;
while (it != begin) {
if (*it == uint32_t('.')) {
break;
}
--it;
}
if (it == begin) {
return name;
}
it++;
if (it == end) {
return std::string();
}
auto length = name.length() - byte_length(begin, it);
return std::string(name.substr(0, length));
auto length = byte_length(it, end);
auto offset = name.length() - length;
return std::string(name.substr(0, offset));
}
std::string find_base_path(const std::string_view path, char32_t separator) {
@ -614,25 +648,33 @@ std::string find_base_path(const std::string_view path, char32_t separator) {
return std::string();
}
auto [begin, end] = make_criter(path);
auto [begin, end] = make_citer(path);
auto it = begin;
auto it = end;
--it;
// skip trailing separator
if (*it == uint32_t(separator)) {
++it;
if (it == begin) {
return std::string();
}
--it;
}
it = std::find(it, end, uint32_t(separator));
if (it == end) {
while (it != begin) {
if (*it == uint32_t(separator)) {
break;
}
--it;
}
if (it == begin) {
return std::string();
}
++it;
if (it == end) {
return std::string();
}
auto length = path.length() - byte_length(begin, it);
return std::string(path.substr(0, length));
auto length = byte_length(it, end);
auto offset = path.length() - length;
return std::string(path.substr(0, offset));
}
std::string canonicalize_path(const std::string_view path, char32_t separator) {

View File

@ -68,7 +68,10 @@ std::string join_paths(const std::string_view left_path,
const std::string_view right_path,
char32_t separator = kPathSeparator);
std::string join_paths(std::vector<std::string_view> paths,
std::string join_paths(const std::vector<std::string>& paths,
char32_t separator = kPathSeparator);
std::string join_paths(const std::vector<std::string_view>& paths,
char32_t separator = kPathSeparator);
inline std::string join_paths(
@ -86,7 +89,12 @@ inline std::string join_guest_paths(const std::string_view left_path,
return join_paths(left_path, right_path, kGuestPathSeparator);
}
inline std::string join_guest_paths(std::vector<std::string_view> paths) {
inline std::string join_guest_paths(const std::vector<std::string>& paths) {
return join_paths(paths, kGuestPathSeparator);
}
inline std::string join_guest_paths(
const std::vector<std::string_view>& paths) {
return join_paths(paths, kGuestPathSeparator);
}

View File

@ -106,18 +106,6 @@ typedef struct alignas(16) vec128_s {
};
};
vec128_s() = default;
vec128_s(const vec128_s& other) {
high = other.high;
low = other.low;
}
vec128_s& operator=(const vec128_s& b) {
high = b.high;
low = b.low;
return *this;
}
bool operator==(const vec128_s& b) const {
return low == b.low && high == b.high;
}
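Dropping the hand-written copy constructor and copy assignment lets the compiler generate identical member-wise copies while restoring trivial copyability, which the explicit versions suppressed. That property can be pinned down with a hypothetical assertion:
#include <type_traits>
static_assert(std::is_trivially_copyable_v<vec128_s>,
              "vec128_s must stay memcpy-safe and register-passable");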

View File

@ -11,6 +11,7 @@
#include "third_party/cpptoml/include/cpptoml.h"
#include "third_party/fmt/include/fmt/format.h"
#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/base/filesystem.h"
#include "xenia/base/logging.h"
@ -29,6 +30,13 @@ std::shared_ptr<cpptoml::table> ParseFile(
}
CmdVar(config, "", "Specifies the target config to load.");
DEFINE_uint32(
defaults_date, 0,
"Do not modify - internal version of the default values in the config, for "
"seamless updates if default value of any option is changed.",
"Config");
namespace config {
std::string config_name = "xenia.config.toml";
std::filesystem::path config_folder;
@ -46,8 +54,19 @@ std::shared_ptr<cpptoml::table> ParseConfig(
}
}
void ReadConfig(const std::filesystem::path& file_path) {
void ReadConfig(const std::filesystem::path& file_path,
bool update_if_no_version_stored) {
if (!cvar::ConfigVars) {
return;
}
const auto config = ParseConfig(file_path);
// Loading an actual global config file that exists - if there's no
// defaults_date in it, it's very old (before updating was added at all, thus
// all defaults need to be updated).
auto defaults_date_cvar =
dynamic_cast<cvar::ConfigVar<uint32_t>*>(cv::cv_defaults_date);
assert_not_null(defaults_date_cvar);
defaults_date_cvar->SetConfigValue(0);
for (auto& it : *cvar::ConfigVars) {
auto config_var = static_cast<cvar::IConfigVar*>(it.second);
auto config_key = config_var->category() + "." + config_var->name();
@ -55,10 +74,17 @@ void ReadConfig(const std::filesystem::path& file_path) {
config_var->LoadConfigValue(config->get_qualified(config_key));
}
}
uint32_t config_defaults_date = defaults_date_cvar->GetTypedConfigValue();
if (update_if_no_version_stored || config_defaults_date) {
cvar::IConfigVarUpdate::ApplyUpdates(config_defaults_date);
}
XELOGI("Loaded config: {}", xe::path_to_utf8(file_path));
}
void ReadGameConfig(const std::filesystem::path& file_path) {
if (!cvar::ConfigVars) {
return;
}
const auto config = ParseConfig(file_path);
for (auto& it : *cvar::ConfigVars) {
auto config_var = static_cast<cvar::IConfigVar*>(it.second);
@ -71,9 +97,18 @@ void ReadGameConfig(const std::filesystem::path& file_path) {
}
void SaveConfig() {
// All cvar defaults have been updated on loading - store the current date.
auto defaults_date_cvar =
dynamic_cast<cvar::ConfigVar<uint32_t>*>(cv::cv_defaults_date);
assert_not_null(defaults_date_cvar);
defaults_date_cvar->SetConfigValue(
cvar::IConfigVarUpdate::GetLastUpdateDate());
std::vector<cvar::IConfigVar*> vars;
for (const auto& s : *cvar::ConfigVars) {
vars.push_back(s.second);
if (cvar::ConfigVars) {
for (const auto& s : *cvar::ConfigVars) {
vars.push_back(s.second);
}
}
std::sort(vars.begin(), vars.end(), [](auto a, auto b) {
if (a->category() < b->category()) return true;
@ -167,7 +202,12 @@ void SetupConfig(const std::filesystem::path& config_folder) {
if (!cvars::config.empty()) {
config_path = xe::to_path(cvars::config);
if (std::filesystem::exists(config_path)) {
ReadConfig(config_path);
// An external config file may contain only explicit overrides - in that
// case, it will likely not contain the defaults version, so don't update
// from version 0. Or it may be a full config - if it's recent enough
// (created in 2021 or later), it will contain the version number, and the
// defaults stored in it will be updated.
ReadConfig(config_path, false);
return;
}
}
@ -176,10 +216,11 @@ void SetupConfig(const std::filesystem::path& config_folder) {
if (!config_folder.empty()) {
config_path = config_folder / config_name;
if (std::filesystem::exists(config_path)) {
ReadConfig(config_path);
ReadConfig(config_path, true);
}
// we only want to save the config if the user is using the default
// config, we don't want to override a user created specific config
// Re-save the loaded config to present the most up-to-date list of
// parameters to the user, if new options were added, descriptions were
// updated, or default values were changed.
SaveConfig();
}
}
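The defaults_date scheme in this file can be summarized outside the cvar machinery. Below is a minimal sketch under assumed names (DefaultUpdateRegistry and its members are hypothetical; only the idea matches the IConfigVarUpdate usage above): every default-value change is registered with the date it was made, and loading applies only the changes newer than the date stored in the config.

// Minimal sketch of date-keyed default updates; all names here are made up,
// only the mechanism mirrors IConfigVarUpdate::ApplyUpdates above.
#include <cstdint>
#include <functional>
#include <map>

class DefaultUpdateRegistry {
 public:
  // date is YYYYMMDD; apply() resets one cvar to its new default value.
  void Register(uint32_t date, std::function<void()> apply) {
    updates_.emplace(date, std::move(apply));
  }
  // Applies every default change registered after the config was last saved.
  void ApplyUpdates(uint32_t config_defaults_date) {
    for (auto it = updates_.upper_bound(config_defaults_date);
         it != updates_.end(); ++it) {
      it->second();
    }
  }
  // Stored back into defaults_date when the config is saved.
  uint32_t GetLastUpdateDate() const {
    return updates_.empty() ? 0 : updates_.rbegin()->first;
  }

 private:
  std::multimap<uint32_t, std::function<void()>> updates_;
};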

16
src/xenia/cpp.hint Normal file
View File

@ -0,0 +1,16 @@
// Hint files help the Visual Studio IDE interpret Visual C++ identifiers
// such as names of functions and macros.
// For more information see https://go.microsoft.com/fwlink/?linkid=865984
#define DECLARE_XAM_EXPORT_(name, category, tags)
#define DECLARE_XAM_EXPORT1(name, category, tag)
#define DECLARE_XAM_EXPORT2(name, category, tag1, tag2)
#define DECLARE_XBDM_EXPORT_(name, category, tags)
#define DECLARE_XBDM_EXPORT1(name, category, tag)
#define DECLARE_XBOXKRNL_EXPORT_(name, category, tags)
#define DECLARE_XBOXKRNL_EXPORT1(name, category, tag)
#define DECLARE_XBOXKRNL_EXPORT2(name, category, tag1, tag2)
#define DECLARE_XBOXKRNL_EXPORT3(name, category, tag1, tag2, tag3)
#define DECLARE_XBOXKRNL_EXPORT4(name, category, tag1, tag2, tag3, tag4)

View File

@ -519,7 +519,7 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
}
// X64Emitter handles actually resolving functions.
extern "C" uint64_t ResolveFunction(void* raw_context, uint32_t target_address);
uint64_t ResolveFunction(void* raw_context, uint64_t target_address);
ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() {
// ebx = target PPC address
@ -548,7 +548,7 @@ ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() {
mov(rcx, rsi); // context
mov(rdx, rbx);
mov(rax, uint64_t(&ResolveFunction));
mov(rax, reinterpret_cast<uint64_t>(&ResolveFunction));
call(rax);
EmitLoadVolatileRegs();

View File

@ -382,15 +382,14 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) {
}
// This is used by the X64ThunkEmitter's ResolveFunctionThunk.
extern "C" uint64_t ResolveFunction(void* raw_context,
uint64_t target_address) {
uint64_t ResolveFunction(void* raw_context, uint64_t target_address) {
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
// TODO(benvanik): required?
assert_not_zero(target_address);
auto fn =
thread_state->processor()->ResolveFunction((uint32_t)target_address);
auto fn = thread_state->processor()->ResolveFunction(
static_cast<uint32_t>(target_address));
assert_not_null(fn);
auto x64_fn = static_cast<X64Function*>(fn);
uint64_t addr = reinterpret_cast<uint64_t>(x64_fn->machine_code());
@ -801,7 +800,7 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
if (!v.low && !v.high) {
// 0000...
vpxor(dest, dest);
} else if (v.low == ~0ull && v.high == ~0ull) {
} else if (v.low == ~uint64_t(0) && v.high == ~uint64_t(0)) {
// 1111...
vpcmpeqb(dest, dest);
} else {
@ -818,10 +817,10 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, float v) {
float f;
uint32_t i;
} x = {v};
if (!v) {
// 0
if (!x.i) {
// +0.0f (but not -0.0f because it may be used to flip the sign via xor).
vpxor(dest, dest);
} else if (x.i == ~0U) {
} else if (x.i == ~uint32_t(0)) {
// 1111...
vpcmpeqb(dest, dest);
} else {
@ -837,10 +836,10 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, double v) {
double d;
uint64_t i;
} x = {v};
if (!v) {
// 0
if (!x.i) {
// +0.0 (but not -0.0 because it may be used to flip the sign via xor).
vpxor(dest, dest);
} else if (x.i == ~0ULL) {
} else if (x.i == ~uint64_t(0)) {
// 1111...
vpcmpeqb(dest, dest);
} else {
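The +0.0f special case is why the check moved from the float value to the bit pattern: -0.0f compares equal to 0.0f, yet zeroing the register for it with vpxor would lose the sign bit that may later be used to flip signs via xor. A standalone illustration:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  float positive_zero = 0.0f;
  float negative_zero = -0.0f;
  uint32_t positive_bits, negative_bits;
  std::memcpy(&positive_bits, &positive_zero, sizeof(positive_bits));
  std::memcpy(&negative_bits, &negative_zero, sizeof(negative_bits));
  // Prints "equal: 1, bits: 00000000 vs 80000000" - equal as floats, but
  // only +0.0f is all-zero and safe to materialize with vpxor.
  std::printf("equal: %d, bits: %08X vs %08X\n",
              positive_zero == negative_zero, positive_bits, negative_bits);
  return 0;
}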

View File

@ -105,8 +105,7 @@ struct Op : OpBase {
struct VoidOp : Op<VoidOp, KEY_TYPE_X> {
protected:
template <typename T, KeyType KEY_TYPE>
friend struct Op;
friend struct Op<VoidOp, KEY_TYPE_X>;
template <hir::Opcode OPCODE, typename... Ts>
friend struct I;
void Load(const Instr::Op& op) {}
@ -116,8 +115,7 @@ struct OffsetOp : Op<OffsetOp, KEY_TYPE_O> {
uint64_t value;
protected:
template <typename T, KeyType KEY_TYPE>
friend struct Op;
friend struct Op<OffsetOp, KEY_TYPE_O>;
template <hir::Opcode OPCODE, typename... Ts>
friend struct I;
void Load(const Instr::Op& op) { this->value = op.offset; }
@ -127,8 +125,7 @@ struct SymbolOp : Op<SymbolOp, KEY_TYPE_S> {
Function* value;
protected:
template <typename T, KeyType KEY_TYPE>
friend struct Op;
friend struct Op<SymbolOp, KEY_TYPE_S>;
template <hir::Opcode OPCODE, typename... Ts>
friend struct I;
bool Load(const Instr::Op& op) {
@ -141,8 +138,7 @@ struct LabelOp : Op<LabelOp, KEY_TYPE_L> {
hir::Label* value;
protected:
template <typename T, KeyType KEY_TYPE>
friend struct Op;
friend struct Op<LabelOp, KEY_TYPE_L>;
template <hir::Opcode OPCODE, typename... Ts>
friend struct I;
void Load(const Instr::Op& op) { this->value = op.label; }

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2014 Ben Vanik. All rights reserved. *
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -73,14 +73,14 @@ void DataFlowAnalysisPass::AnalyzeFlow(HIRBuilder* builder,
// Stash for value map. We may want to maintain this during building.
auto arena = builder->arena();
auto value_map = reinterpret_cast<Value**>(
arena->Alloc(sizeof(Value*) * max_value_estimate));
arena->Alloc(sizeof(Value*) * max_value_estimate, alignof(Value)));
// Allocate incoming bitvectors for use by blocks. We don't need outgoing
// because they are only used during the block iteration.
// Mapped by block ordinal.
// TODO(benvanik): cache this list, grow as needed, etc.
auto incoming_bitvectors =
(llvm::BitVector**)arena->Alloc(sizeof(llvm::BitVector*) * block_count);
auto incoming_bitvectors = (llvm::BitVector**)arena->Alloc(
sizeof(llvm::BitVector*) * block_count, alignof(llvm::BitVector));
for (auto n = 0u; n < block_count; n++) {
incoming_bitvectors[n] = new llvm::BitVector(max_value_estimate);
}
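The new alignment parameter on Arena::Alloc follows the usual round-up-and-bump pattern; a minimal sketch (not the actual Arena, which also grows its backing chunks rather than failing):

#include <cstddef>
#include <cstdint>

struct ArenaChunk {
  uint8_t* data;
  size_t capacity;
  size_t offset;

  // align must be a power of two, as with the alignof(Value) call above.
  void* Alloc(size_t size, size_t align) {
    // Round the current offset up to the requested alignment boundary.
    size_t aligned_offset = (offset + align - 1) & ~(align - 1);
    if (aligned_offset + size > capacity) {
      return nullptr;  // the real Arena would allocate a new chunk instead
    }
    offset = aligned_offset + size;
    return data + aligned_offset;
  }
};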

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2020 Ben Vanik. All rights reserved. *
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -45,7 +45,7 @@ bool FinalizationPass::Run(HIRBuilder* builder) {
while (label) {
if (!label->name) {
const size_t label_len = 6 + 4;
char* name = reinterpret_cast<char*>(arena->Alloc(label_len + 1));
char* name = reinterpret_cast<char*>(arena->Alloc(label_len + 1, 1));
assert_true(label->id <= 9999);
auto end = fmt::format_to_n(name, label_len, "_label{}", label->id);
name[end.size] = '\0';

View File

@ -39,7 +39,7 @@ enum class ExportCategory : uint8_t {
};
struct ExportTag {
typedef uint32_t type;
using type = uint32_t;
// packed like so:
// ll...... cccccccc ........ ..bihssi

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2020 Ben Vanik. All rights reserved. *
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -739,7 +739,7 @@ void HIRBuilder::Comment(std::string_view value) {
return;
}
auto size = value.size();
auto p = reinterpret_cast<char*>(arena_->Alloc(size + 1));
auto p = reinterpret_cast<char*>(arena_->Alloc(size + 1, 1));
std::memcpy(p, value.data(), size);
p[size] = '\0';
Instr* i = AppendInstr(OPCODE_COMMENT_info, 0);
@ -752,7 +752,7 @@ void HIRBuilder::Comment(const StringBuffer& value) {
return;
}
auto size = value.length();
auto p = reinterpret_cast<char*>(arena_->Alloc(size + 1));
auto p = reinterpret_cast<char*>(arena_->Alloc(size + 1, 1));
std::memcpy(p, value.buffer(), size);
p[size] = '\0';
Instr* i = AppendInstr(OPCODE_COMMENT_info, 0);

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2020 Ben Vanik. All rights reserved. *
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -75,7 +75,7 @@ class HIRBuilder {
template <typename... Args>
void CommentFormat(const std::string_view format, const Args&... args) {
static const uint32_t kMaxCommentSize = 1024;
char* p = reinterpret_cast<char*>(arena_->Alloc(kMaxCommentSize));
char* p = reinterpret_cast<char*>(arena_->Alloc(kMaxCommentSize, 1));
auto result = fmt::format_to_n(p, kMaxCommentSize - 1, format, args...);
p[result.size] = '\0';
size_t rewind = kMaxCommentSize - 1 - result.size;

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2020 Ben Vanik. All rights reserved. *
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -104,8 +104,8 @@ bool PPCHIRBuilder::Emit(GuestFunction* function, uint32_t flags) {
// instruction may have a label assigned to it if it hasn't been hit
// yet.
size_t list_size = instr_count_ * sizeof(void*);
instr_offset_list_ = (Instr**)arena_->Alloc(list_size);
label_list_ = (Label**)arena_->Alloc(list_size);
instr_offset_list_ = (Instr**)arena_->Alloc(list_size, alignof(void*));
label_list_ = (Label**)arena_->Alloc(list_size, alignof(void*));
std::memset(instr_offset_list_, 0, list_size);
std::memset(label_list_, 0, list_size);
@ -244,7 +244,7 @@ void PPCHIRBuilder::AnnotateLabel(uint32_t address, Label* label) {
char name_buffer[13];
auto format_result = fmt::format_to_n(name_buffer, 12, "loc_{:08X}", address);
name_buffer[format_result.size] = '\0';
label->name = (char*)arena_->Alloc(sizeof(name_buffer));
label->name = (char*)arena_->Alloc(sizeof(name_buffer), 1);
memcpy(label->name, name_buffer, sizeof(name_buffer));
}

View File

@ -432,12 +432,12 @@ void Processor::LowerIrql(Irql old_value) {
}
bool Processor::Save(ByteStream* stream) {
stream->Write('PROC');
stream->Write(kProcessorSaveSignature);
return true;
}
bool Processor::Restore(ByteStream* stream) {
if (stream->Read<uint32_t>() != 'PROC') {
if (stream->Read<uint32_t>() != kProcessorSaveSignature) {
XELOGE("Processor::Restore - Invalid magic value!");
return false;
}

View File

@ -34,6 +34,8 @@ DECLARE_bool(debug);
namespace xe {
namespace cpu {
constexpr fourcc_t kProcessorSaveSignature = make_fourcc("PROC");
class Breakpoint;
class StackWalker;
class XexModule;
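make_fourcc replaces the non-portable multi-character literals ('PROC', 'XEX2', 'SWAP') used before. A plausible shape of it, assuming the same packing the old literals get on common compilers (the actual definition lives elsewhere in the tree; both overloads below are inferred from the call sites in this diff):

#include <cstdint>

using fourcc_t = uint32_t;

constexpr fourcc_t make_fourcc(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (fourcc_t(a) << 24) | (fourcc_t(b) << 16) | (fourcc_t(c) << 8) |
         fourcc_t(d);
}

constexpr fourcc_t make_fourcc(const char (&fourcc)[5]) {
  return make_fourcc(fourcc[0], fourcc[1], fourcc[2], fourcc[3]);
}

// Matches the value of the old 'PROC' literal on common compilers.
static_assert(make_fourcc("PROC") == 0x50524F43u,
              "fourcc packing must match the legacy multi-char literals");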

View File

@ -20,7 +20,7 @@
#include "xenia/cpu/processor.h"
#include "xenia/cpu/test_module.h"
#include "third_party/catch/single_include/catch.hpp"
#include "third_party/catch/include/catch.hpp"
#define XENIA_TEST_X64 1

View File

@ -249,12 +249,11 @@ int XexModule::ApplyPatch(XexModule* module) {
// Patch base XEX header
uint32_t original_image_size = module->image_size();
uint32_t header_target_size = patch_header->delta_headers_target_offset +
patch_header->delta_headers_source_size;
uint32_t header_target_size = patch_header->size_of_target_headers;
if (!header_target_size) {
header_target_size =
patch_header->size_of_target_headers; // unsure which is more correct..
header_target_size = patch_header->delta_headers_target_offset +
patch_header->delta_headers_source_size;
}
size_t mem_size = module->xex_header_mem_.size();
@ -299,6 +298,9 @@ int XexModule::ApplyPatch(XexModule* module) {
module->xex_header_mem_.resize(header_target_size);
}
// Update security info context with latest security info data
module->ReadSecurityInfo();
uint32_t new_image_size = module->image_size();
// Check if we need to alloc new memory for the patched xex
@ -446,14 +448,9 @@ int XexModule::ApplyPatch(XexModule* module) {
}
}
// byteswap versions because of bitfields...
xex2_version source_ver, target_ver;
source_ver.value =
xe::byte_swap<uint32_t>(patch_header->source_version.value);
target_ver.value =
xe::byte_swap<uint32_t>(patch_header->target_version.value);
source_ver = patch_header->source_version();
target_ver = patch_header->target_version();
XELOGI(
"XEX patch applied successfully: base version: {}.{}.{}.{}, new "
"version: {}.{}.{}.{}",
@ -867,25 +864,7 @@ int XexModule::ReadPEHeaders() {
return 0;
}
bool XexModule::Load(const std::string_view name, const std::string_view path,
const void* xex_addr, size_t xex_length) {
auto src_header = reinterpret_cast<const xex2_header*>(xex_addr);
if (src_header->magic == 'XEX1') {
xex_format_ = kFormatXex1;
} else if (src_header->magic == 'XEX2') {
xex_format_ = kFormatXex2;
} else {
return false;
}
assert_false(loaded_);
loaded_ = true;
// Read in XEX headers
xex_header_mem_.resize(src_header->header_size);
std::memcpy(xex_header_mem_.data(), src_header, src_header->header_size);
void XexModule::ReadSecurityInfo() {
if (xex_format_ == kFormatXex1) {
const xex1_security_info* xex1_sec_info =
reinterpret_cast<const xex1_security_info*>(
@ -913,6 +892,29 @@ bool XexModule::Load(const std::string_view name, const std::string_view path,
security_info_.page_descriptor_count = xex2_sec_info->page_descriptor_count;
security_info_.page_descriptors = xex2_sec_info->page_descriptors;
}
}
bool XexModule::Load(const std::string_view name, const std::string_view path,
const void* xex_addr, size_t xex_length) {
auto src_header = reinterpret_cast<const xex2_header*>(xex_addr);
if (src_header->magic == kXEX1Signature) {
xex_format_ = kFormatXex1;
} else if (src_header->magic == kXEX2Signature) {
xex_format_ = kFormatXex2;
} else {
return false;
}
assert_false(loaded_);
loaded_ = true;
// Read in XEX headers
xex_header_mem_.resize(src_header->header_size);
std::memcpy(xex_header_mem_.data(), src_header, src_header->header_size);
// Read/convert XEX1/XEX2 security info to a common format
ReadSecurityInfo();
auto sec_header = xex_security_info();
@ -1104,8 +1106,8 @@ bool XexModule::SetupLibraryImports(const std::string_view name,
ImportLibrary library_info;
library_info.name = base_name;
library_info.id = library->id;
library_info.version.value = library->version.value;
library_info.min_version.value = library->version_min.value;
library_info.version.value = library->version().value;
library_info.min_version.value = library->version_min().value;
// Imports are stored as {import descriptor, thunk addr, import desc, ...}
// Even thunks have an import descriptor (albeit unused/useless)

View File

@ -25,6 +25,10 @@ class KernelState;
namespace xe {
namespace cpu {
constexpr fourcc_t kXEX1Signature = make_fourcc("XEX1");
constexpr fourcc_t kXEX2Signature = make_fourcc("XEX2");
constexpr fourcc_t kElfSignature = make_fourcc(0x7F, 'E', 'L', 'F');
class Runtime;
class XexModule : public xe::cpu::Module {
@ -170,6 +174,8 @@ class XexModule : public xe::cpu::Module {
std::unique_ptr<Function> CreateFunction(uint32_t address) override;
private:
void ReadSecurityInfo();
int ReadImage(const void* xex_addr, size_t xex_length, bool use_dev_key);
int ReadImageUncompressed(const void* xex_addr, size_t xex_length);
int ReadImageBasicCompressed(const void* xex_addr, size_t xex_length);

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2020 Ben Vanik. All rights reserved. *
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -68,7 +68,8 @@ Emulator::Emulator(const std::filesystem::path& command_line,
storage_root_(storage_root),
content_root_(content_root),
cache_root_(cache_root),
game_title_(),
title_name_(),
title_version_(),
display_window_(nullptr),
memory_(),
audio_system_(),
@ -78,7 +79,7 @@ Emulator::Emulator(const std::filesystem::path& command_line,
file_system_(),
kernel_state_(),
main_thread_(),
title_id_(0),
title_id_(std::nullopt),
paused_(false),
restoring_(false),
restore_fence_() {}
@ -246,8 +247,9 @@ X_STATUS Emulator::TerminateTitle() {
}
kernel_state_->TerminateTitle();
title_id_ = 0;
game_title_ = "";
title_id_ = std::nullopt;
title_name_ = "";
title_version_ = "";
on_terminate();
return X_STATUS_SUCCESS;
}
@ -418,8 +420,11 @@ bool Emulator::SaveToFile(const std::filesystem::path& path) {
// Save the emulator state to a file
ByteStream stream(map->data(), map->size());
stream.Write('XSAV');
stream.Write(title_id_);
stream.Write(kEmulatorSaveSignature);
stream.Write(title_id_.has_value());
if (title_id_.has_value()) {
stream.Write(title_id_.value());
}
// It's important we don't hold the global lock here! XThreads need to step
// forward (possibly through guarded regions) without worry!
@ -449,12 +454,19 @@ bool Emulator::RestoreFromFile(const std::filesystem::path& path) {
auto lock = global_critical_region::AcquireDirect();
ByteStream stream(map->data(), map->size());
if (stream.Read<uint32_t>() != 'XSAV') {
if (stream.Read<uint32_t>() != kEmulatorSaveSignature) {
return false;
}
auto title_id = stream.Read<uint32_t>();
if (title_id != title_id_) {
auto has_title_id = stream.Read<bool>();
std::optional<uint32_t> title_id;
if (!has_title_id) {
title_id = {};
} else {
title_id = stream.Read<uint32_t>();
}
// std::optional comparison handles the no-title case without throwing
// (calling .value() when both are empty would throw bad_optional_access).
if (title_id_ != title_id) {
// Swapping between titles is unsupported at the moment.
assert_always();
return false;
@ -642,11 +654,28 @@ std::string Emulator::FindLaunchModule() {
return path + default_module;
}
static std::string format_version(xex2_version version) {
// fmt::format doesn't like bit fields
uint32_t major, minor, build, qfe;
major = version.major;
minor = version.minor;
build = version.build;
qfe = version.qfe;
if (qfe) {
return fmt::format("{}.{}.{}.{}", major, minor, build, qfe);
}
if (build) {
return fmt::format("{}.{}.{}", major, minor, build);
}
return fmt::format("{}.{}", major, minor);
}
X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
const std::string_view module_path) {
// Reset state.
title_id_ = 0;
game_title_ = "";
title_id_ = std::nullopt;
title_name_ = "";
title_version_ = "";
display_window_->SetIcon(nullptr, 0);
// Allow xam to request module loads.
@ -662,8 +691,15 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
// Grab the current title ID.
xex2_opt_execution_info* info = nullptr;
module->GetOptHeader(XEX_HEADER_EXECUTION_INFO, &info);
if (info) {
if (!info) {
title_id_ = 0;
} else {
title_id_ = info->title_id;
auto title_version = info->version();
if (title_version.value != 0) {
title_version_ = format_version(title_version);
}
}
// Try and load the resource database (xex only).
@ -677,7 +713,12 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
kernel::util::XdbfGameData db(
module->memory()->TranslateVirtual(resource_data), resource_size);
if (db.is_valid()) {
game_title_ = db.title();
// TODO(gibbed): get title respective to user locale.
title_name_ = db.title(XLanguage::kEnglish);
if (title_name_.empty()) {
// If English title is unavailable, get the title in default locale.
title_name_ = db.title();
}
auto icon_block = db.icon();
if (icon_block) {
display_window_->SetIcon(icon_block.buffer, icon_block.size);
@ -691,7 +732,8 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
// playing before the video can be seen if doing this in parallel with the
// main thread.
on_shader_storage_initialization(true);
graphics_system_->InitializeShaderStorage(cache_root_, title_id_, true);
graphics_system_->InitializeShaderStorage(cache_root_, title_id_.value(),
true);
on_shader_storage_initialization(false);
auto main_thread = kernel_state_->LaunchModule(module);
@ -699,7 +741,7 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
return X_STATUS_UNSUCCESSFUL;
}
main_thread_ = main_thread;
on_launch(title_id_, game_title_);
on_launch(title_id_.value(), title_name_);
return X_STATUS_SUCCESS;
}
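The save-format change above (a has_value flag followed by the optional payload) is the standard way to flatten std::optional into a byte stream. In isolation, with a toy stream standing in for the real ByteStream:

#include <cstdint>
#include <cstring>
#include <optional>
#include <vector>

struct ToyStream {
  std::vector<uint8_t> bytes;
  size_t read_pos = 0;

  template <typename T>
  void Write(const T& value) {
    const auto* p = reinterpret_cast<const uint8_t*>(&value);
    bytes.insert(bytes.end(), p, p + sizeof(T));
  }
  template <typename T>
  T Read() {
    T value;
    std::memcpy(&value, bytes.data() + read_pos, sizeof(T));
    read_pos += sizeof(T);
    return value;
  }
};

void WriteOptionalTitleId(ToyStream& stream,
                          const std::optional<uint32_t>& title_id) {
  stream.Write(title_id.has_value());
  if (title_id.has_value()) {
    stream.Write(title_id.value());
  }
}

std::optional<uint32_t> ReadOptionalTitleId(ToyStream& stream) {
  if (!stream.Read<bool>()) {
    return std::nullopt;
  }
  return stream.Read<uint32_t>();
}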

View File

@ -11,6 +11,7 @@
#define XENIA_EMULATOR_H_
#include <functional>
#include <optional>
#include <string>
#include "xenia/base/delegate.h"
@ -43,6 +44,8 @@ class Window;
namespace xe {
constexpr fourcc_t kEmulatorSaveSignature = make_fourcc("XSAV");
// The main type that runs the whole emulator.
// This is responsible for initializing and managing all the various subsystems.
class Emulator {
@ -65,14 +68,19 @@ class Emulator {
// Folder files safe to remove without significant side effects are stored in.
const std::filesystem::path& cache_root() const { return cache_root_; }
// Title of the game in the default language.
const std::string& game_title() const { return game_title_; }
// Name of the title in the default language.
const std::string& title_name() const { return title_name_; }
// Version of the title as a string.
const std::string& title_version() const { return title_version_; }
// Currently running title ID
uint32_t title_id() const { return title_id_; }
uint32_t title_id() const { return title_id_.value_or(0); }
// Are we currently running a title?
bool is_title_open() const { return title_id_ != 0; }
bool is_title_open() const { return title_id_.has_value(); }
// Window used for displaying graphical output.
ui::Window* display_window() const { return display_window_; }
@ -172,7 +180,8 @@ class Emulator {
std::filesystem::path content_root_;
std::filesystem::path cache_root_;
std::string game_title_;
std::string title_name_;
std::string title_version_;
ui::Window* display_window_;
@ -188,7 +197,7 @@ class Emulator {
std::unique_ptr<kernel::KernelState> kernel_state_;
kernel::object_ref<kernel::XThread> main_thread_;
uint32_t title_id_; // Currently running title ID
std::optional<uint32_t> title_id_; // Currently running title ID
bool paused_;
bool restoring_;

View File

@ -257,22 +257,21 @@ bool CommandProcessor::SetupContext() { return true; }
void CommandProcessor::ShutdownContext() { context_.reset(); }
void CommandProcessor::InitializeRingBuffer(uint32_t ptr, uint32_t log2_size) {
void CommandProcessor::InitializeRingBuffer(uint32_t ptr, uint32_t size_log2) {
read_ptr_index_ = 0;
primary_buffer_ptr_ = ptr;
primary_buffer_size_ = 1 << log2_size;
primary_buffer_size_ = uint32_t(1) << (size_log2 + 3);
}
void CommandProcessor::EnableReadPointerWriteBack(uint32_t ptr,
uint32_t block_size) {
uint32_t block_size_log2) {
// CP_RB_RPTR_ADDR Ring Buffer Read Pointer Address 0x70C
// ptr = RB_RPTR_ADDR, pointer to write back the address to.
read_ptr_writeback_ptr_ = ptr;
// CP_RB_CNTL Ring Buffer Control 0x704
// block_size = RB_BLKSZ, number of quadwords read between updates of the
// read pointer.
read_ptr_update_freq_ =
static_cast<uint32_t>(pow(2.0, static_cast<double>(block_size)) / 4);
// block_size = RB_BLKSZ, log2 of number of quadwords read between updates of
// the read pointer.
read_ptr_update_freq_ = uint32_t(1) << block_size_log2 >> 2;
}
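The rewritten read_ptr_update_freq_ computation is the integer form of the old floating-point one; a quick standalone check of the equivalence over the plausible range of RB_BLKSZ values:

#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
  for (uint32_t block_size_log2 = 0; block_size_log2 < 20;
       ++block_size_log2) {
    // New form: 2^block_size_log2 quadwords, divided by 4.
    uint32_t shift_form = uint32_t(1) << block_size_log2 >> 2;
    // Old form, with pow() and a float-to-int truncation.
    uint32_t pow_form = static_cast<uint32_t>(
        std::pow(2.0, static_cast<double>(block_size_log2)) / 4);
    assert(shift_form == pow_form);
  }
  return 0;
}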
void CommandProcessor::UpdateWritePointer(uint32_t value) {
@ -825,8 +824,8 @@ bool CommandProcessor::ExecutePacketType3_XE_SWAP(RingBuffer* reader,
// VdSwap will post this to tell us we need to swap the screen/fire an
// interrupt.
// 63 words here, but only the first has any data.
uint32_t magic = reader->ReadAndSwap<uint32_t>();
assert_true(magic == 'SWAP');
uint32_t magic = reader->ReadAndSwap<fourcc_t>();
assert_true(magic == kSwapSignature);
// TODO(benvanik): only swap frontbuffer ptr.
uint32_t frontbuffer_ptr = reader->ReadAndSwap<uint32_t>();
@ -1146,6 +1145,8 @@ bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_EXT(RingBuffer* reader,
bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_ZPD(RingBuffer* reader,
uint32_t packet,
uint32_t count) {
// Set by D3D as big-endian, but the struct ABI is little-endian.
const uint32_t kQueryFinished = xe::byte_swap(0xFFFFFEED);
assert_true(count == 1);
uint32_t initiator = reader->ReadAndSwap<uint32_t>();
// Writeback initiator.
@ -1161,10 +1162,13 @@ bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_ZPD(RingBuffer* reader,
register_file_->values[XE_GPU_REG_RB_SAMPLE_COUNT_ADDR].u32);
// 0xFFFFFEED is written to these two locations by D3D only on D3DISSUE_END
// and used to detect a finished query.
bool isEnd = pSampleCounts->ZPass_A == xe::byte_swap(0xFFFFFEED) &&
pSampleCounts->ZPass_B == xe::byte_swap(0xFFFFFEED);
bool is_end_via_z_pass = pSampleCounts->ZPass_A == kQueryFinished &&
pSampleCounts->ZPass_B == kQueryFinished;
// Older versions of D3D also check for ZFail (the first Gears of War).
bool is_end_via_z_fail = pSampleCounts->ZFail_A == kQueryFinished &&
pSampleCounts->ZFail_B == kQueryFinished;
std::memset(pSampleCounts, 0, sizeof(xe_gpu_depth_sample_counts));
if (isEnd) {
if (is_end_via_z_pass || is_end_via_z_fail) {
pSampleCounts->ZPass_A = fake_sample_count;
pSampleCounts->Total_A = fake_sample_count;
}
@ -1173,40 +1177,77 @@ bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_ZPD(RingBuffer* reader,
return true;
}
bool CommandProcessor::ExecutePacketType3_DRAW_INDX(RingBuffer* reader,
uint32_t packet,
uint32_t count) {
// initiate fetch of index buffer and draw
// if dword0 != 0, this is a conditional draw based on viz query.
bool CommandProcessor::ExecutePacketType3Draw(RingBuffer* reader,
uint32_t packet,
const char* opcode_name,
uint32_t viz_query_condition,
uint32_t count_remaining) {
// if viz_query_condition != 0, this is a conditional draw based on viz query.
// This ID matches the one issued in PM4_VIZ_QUERY
uint32_t dword0 = reader->ReadAndSwap<uint32_t>(); // viz query info
// uint32_t viz_id = dword0 & 0x3F;
// uint32_t viz_id = viz_query_condition & 0x3F;
// when true, render conditionally based on query result
// uint32_t viz_use = dword0 & 0x100;
// uint32_t viz_use = viz_query_condition & 0x100;
assert_not_zero(count_remaining);
if (!count_remaining) {
XELOGE("{}: Packet too small, can't read VGT_DRAW_INITIATOR", opcode_name);
return false;
}
reg::VGT_DRAW_INITIATOR vgt_draw_initiator;
vgt_draw_initiator.value = reader->ReadAndSwap<uint32_t>();
--count_remaining;
WriteRegister(XE_GPU_REG_VGT_DRAW_INITIATOR, vgt_draw_initiator.value);
bool success = true;
// TODO(Triang3l): Remove IndexBufferInfo and replace handling of all this
// with PrimitiveProcessor when the old Vulkan renderer is removed.
bool is_indexed = false;
IndexBufferInfo index_buffer_info;
switch (vgt_draw_initiator.source_select) {
case xenos::SourceSelect::kDMA: {
// Indexed draw.
is_indexed = true;
index_buffer_info.guest_base = reader->ReadAndSwap<uint32_t>();
uint32_t index_size = reader->ReadAndSwap<uint32_t>();
index_buffer_info.endianness =
static_cast<xenos::Endian>(index_size >> 30);
index_size &= 0x00FFFFFF;
// Two separate bounds checks so if there's only one missing register
// value out of two, one uint32_t will be skipped in the command buffer,
// not two.
assert_not_zero(count_remaining);
if (!count_remaining) {
XELOGE("{}: Packet too small, can't read VGT_DMA_BASE", opcode_name);
return false;
}
uint32_t vgt_dma_base = reader->ReadAndSwap<uint32_t>();
--count_remaining;
WriteRegister(XE_GPU_REG_VGT_DMA_BASE, vgt_dma_base);
reg::VGT_DMA_SIZE vgt_dma_size;
assert_not_zero(count_remaining);
if (!count_remaining) {
XELOGE("{}: Packet too small, can't read VGT_DMA_SIZE", opcode_name);
return false;
}
vgt_dma_size.value = reader->ReadAndSwap<uint32_t>();
--count_remaining;
WriteRegister(XE_GPU_REG_VGT_DMA_SIZE, vgt_dma_size.value);
uint32_t index_size_bytes =
vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16
? sizeof(uint16_t)
: sizeof(uint32_t);
// The base address must already be word-aligned according to the R6xx
// documentation, but mask it anyway for safety.
index_buffer_info.guest_base = vgt_dma_base & ~(index_size_bytes - 1);
index_buffer_info.endianness = vgt_dma_size.swap_mode;
index_buffer_info.format = vgt_draw_initiator.index_size;
index_size *=
(vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32) ? 4 : 2;
index_buffer_info.length = index_size;
index_buffer_info.length = vgt_dma_size.num_words * index_size_bytes;
index_buffer_info.count = vgt_draw_initiator.num_indices;
} break;
case xenos::SourceSelect::kImmediate: {
// TODO(Triang3l): VGT_IMMED_DATA.
XELOGE(
"{}: Using immediate vertex indices, which are not supported yet. "
"Report the game to Xenia developers!",
opcode_name, uint32_t(vgt_draw_initiator.source_select));
success = false;
assert_always();
} break;
case xenos::SourceSelect::kAutoIndex: {
@ -1215,71 +1256,65 @@ bool CommandProcessor::ExecutePacketType3_DRAW_INDX(RingBuffer* reader,
index_buffer_info.length = 0;
} break;
default: {
// Invalid source select.
assert_always();
// Invalid source selection.
success = false;
assert_unhandled_case(vgt_draw_initiator.source_select);
} break;
}
auto viz_query = register_file_->Get<reg::PA_SC_VIZ_QUERY>();
if (viz_query.viz_query_ena && viz_query.kill_pix_post_hi_z) {
// TODO(Triang3l): Don't drop the draw call completely if the vertex shader
// has memexport.
// TODO(Triang3l || JoelLinn): Handle this properly in the render backends.
return true;
// Skip to the next command, for example, if there are immediate indices that
// we don't support yet.
reader->AdvanceRead(count_remaining * sizeof(uint32_t));
if (success) {
auto viz_query = register_file_->Get<reg::PA_SC_VIZ_QUERY>();
if (!(viz_query.viz_query_ena && viz_query.kill_pix_post_hi_z)) {
// TODO(Triang3l): Don't drop the draw call completely if the vertex
// shader has memexport.
// TODO(Triang3l || JoelLinn): Handle this properly in the render
// backends.
success = IssueDraw(
vgt_draw_initiator.prim_type, vgt_draw_initiator.num_indices,
is_indexed ? &index_buffer_info : nullptr,
xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode,
vgt_draw_initiator.prim_type));
if (!success) {
XELOGE("{}({}, {}, {}): Failed in backend", opcode_name,
vgt_draw_initiator.num_indices,
uint32_t(vgt_draw_initiator.prim_type),
uint32_t(vgt_draw_initiator.source_select));
}
}
}
bool success =
IssueDraw(vgt_draw_initiator.prim_type, vgt_draw_initiator.num_indices,
is_indexed ? &index_buffer_info : nullptr,
xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode,
vgt_draw_initiator.prim_type));
if (!success) {
XELOGE("PM4_DRAW_INDX({}, {}, {}): Failed in backend",
vgt_draw_initiator.num_indices,
uint32_t(vgt_draw_initiator.prim_type),
uint32_t(vgt_draw_initiator.source_select));
}
return success;
}
return true;
bool CommandProcessor::ExecutePacketType3_DRAW_INDX(RingBuffer* reader,
uint32_t packet,
uint32_t count) {
// "initiate fetch of index buffer and draw"
// Generally used by Xbox 360 Direct3D 9 for kDMA and kAutoIndex sources.
// With a viz query token as the first one.
uint32_t count_remaining = count;
assert_not_zero(count_remaining);
if (!count_remaining) {
XELOGE("PM4_DRAW_INDX: Packet too small, can't read the viz query token");
return false;
}
uint32_t viz_query_condition = reader->ReadAndSwap<uint32_t>();
--count_remaining;
return ExecutePacketType3Draw(reader, packet, "PM4_DRAW_INDX",
viz_query_condition, count_remaining);
}
bool CommandProcessor::ExecutePacketType3_DRAW_INDX_2(RingBuffer* reader,
uint32_t packet,
uint32_t count) {
// draw using supplied indices in packet
reg::VGT_DRAW_INITIATOR vgt_draw_initiator;
vgt_draw_initiator.value = reader->ReadAndSwap<uint32_t>();
WriteRegister(XE_GPU_REG_VGT_DRAW_INITIATOR, vgt_draw_initiator.value);
assert_true(vgt_draw_initiator.source_select ==
xenos::SourceSelect::kAutoIndex);
// Index buffer unused as automatic.
// uint32_t indices_size =
// vgt_draw_initiator.num_indices *
// (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32 ? 4
// : 2);
// uint32_t index_ptr = reader->ptr();
// TODO(Triang3l): VGT_IMMED_DATA.
reader->AdvanceRead((count - 1) * sizeof(uint32_t));
auto viz_query = register_file_->Get<reg::PA_SC_VIZ_QUERY>();
if (viz_query.viz_query_ena && viz_query.kill_pix_post_hi_z) {
// TODO(Triang3l): Don't drop the draw call completely if the vertex shader
// has memexport.
// TODO(Triang3l || JoelLinn): Handle this properly in the render backends.
return true;
}
bool success = IssueDraw(
vgt_draw_initiator.prim_type, vgt_draw_initiator.num_indices, nullptr,
xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode,
vgt_draw_initiator.prim_type));
if (!success) {
XELOGE("PM4_DRAW_INDX_IMM({}, {}): Failed in backend",
vgt_draw_initiator.num_indices,
uint32_t(vgt_draw_initiator.prim_type));
}
return true;
// "draw using supplied indices in packet"
// Generally used by Xbox 360 Direct3D 9 for kAutoIndex source.
// No viz query token.
return ExecutePacketType3Draw(reader, packet, "PM4_DRAW_INDX_2", 0, count);
}
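The count_remaining checks added throughout the draw path all follow one pattern; as a sketch, the same logic as a helper (hypothetical - the actual code above inlines the checks, and this reuses the RingBuffer and XELOGE from the surrounding file):

// Reads one packet word if any words remain, logging and failing cleanly on
// a truncated packet instead of reading into the next command.
bool ReadPacketWord(RingBuffer* reader, uint32_t& count_remaining,
                    const char* opcode_name, const char* register_name,
                    uint32_t& value_out) {
  if (!count_remaining) {
    XELOGE("{}: Packet too small, can't read {}", opcode_name, register_name);
    return false;
  }
  value_out = reader->ReadAndSwap<uint32_t>();
  --count_remaining;
  return true;
}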
bool CommandProcessor::ExecutePacketType3_SET_CONSTANT(RingBuffer* reader,

View File

@ -144,8 +144,8 @@ class CommandProcessor {
virtual void RestoreEdramSnapshot(const void* snapshot) = 0;
void InitializeRingBuffer(uint32_t ptr, uint32_t page_count);
void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size);
void InitializeRingBuffer(uint32_t ptr, uint32_t size_log2);
void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size_log2);
void UpdateWritePointer(uint32_t value);
@ -218,6 +218,10 @@ class CommandProcessor {
uint32_t count);
bool ExecutePacketType3_EVENT_WRITE_ZPD(RingBuffer* reader, uint32_t packet,
uint32_t count);
bool ExecutePacketType3Draw(RingBuffer* reader, uint32_t packet,
const char* opcode_name,
uint32_t viz_query_condition,
uint32_t count_remaining);
bool ExecutePacketType3_DRAW_INDX(RingBuffer* reader, uint32_t packet,
uint32_t count);
bool ExecutePacketType3_DRAW_INDX_2(RingBuffer* reader, uint32_t packet,

File diff suppressed because it is too large

View File

@ -20,11 +20,11 @@
#include "xenia/base/assert.h"
#include "xenia/gpu/command_processor.h"
#include "xenia/gpu/d3d12/d3d12_graphics_system.h"
#include "xenia/gpu/d3d12/d3d12_primitive_processor.h"
#include "xenia/gpu/d3d12/d3d12_render_target_cache.h"
#include "xenia/gpu/d3d12/d3d12_shared_memory.h"
#include "xenia/gpu/d3d12/deferred_command_list.h"
#include "xenia/gpu/d3d12/pipeline_cache.h"
#include "xenia/gpu/d3d12/primitive_converter.h"
#include "xenia/gpu/d3d12/render_target_cache.h"
#include "xenia/gpu/d3d12/texture_cache.h"
#include "xenia/gpu/draw_util.h"
#include "xenia/gpu/dxbc_shader.h"
@ -89,7 +89,7 @@ class D3D12CommandProcessor : public CommandProcessor {
// there are 4 render targets bound with the same EDRAM base (clearly not
// correct usage), but the shader only clears 1, and then EDRAM buffer stores
// conflict with each other.
uint32_t GetCurrentColorMask(const Shader* pixel_shader) const;
uint32_t GetCurrentColorMask(uint32_t shader_writes_color_targets) const;
void PushTransitionBarrier(
ID3D12Resource* resource, D3D12_RESOURCE_STATES old_state,
@ -131,7 +131,7 @@ class D3D12CommandProcessor : public CommandProcessor {
// descriptors, this must only be used to allocate SRVs, otherwise it won't
// work on Nvidia Fermi (root signature creation will fail)!
bool RequestOneUseSingleViewDescriptors(
uint32_t count, ui::d3d12::util::DescriptorCPUGPUHandlePair* handles_out);
uint32_t count, ui::d3d12::util::DescriptorCpuGpuHandlePair* handles_out);
// These are needed often, so they are always allocated.
enum class SystemBindlessView : uint32_t {
kSharedMemoryRawSRV,
@ -149,6 +149,7 @@ class D3D12CommandProcessor : public CommandProcessor {
kEdramR32G32B32A32UintSRV,
kEdramRawUAV,
kEdramR32UintUAV,
kEdramR32G32UintUAV,
kEdramR32G32B32A32UintUAV,
kGammaRampNormalSRV,
@ -164,16 +165,18 @@ class D3D12CommandProcessor : public CommandProcessor {
kCount,
};
ui::d3d12::util::DescriptorCPUGPUHandlePair GetSystemBindlessViewHandlePair(
ui::d3d12::util::DescriptorCpuGpuHandlePair GetSystemBindlessViewHandlePair(
SystemBindlessView view) const;
ui::d3d12::util::DescriptorCPUGPUHandlePair
ui::d3d12::util::DescriptorCpuGpuHandlePair
GetSharedMemoryUintPow2BindlessSRVHandlePair(
uint32_t element_size_bytes_pow2) const;
ui::d3d12::util::DescriptorCPUGPUHandlePair
ui::d3d12::util::DescriptorCpuGpuHandlePair
GetSharedMemoryUintPow2BindlessUAVHandlePair(
uint32_t element_size_bytes_pow2) const;
ui::d3d12::util::DescriptorCPUGPUHandlePair
ui::d3d12::util::DescriptorCpuGpuHandlePair
GetEdramUintPow2BindlessSRVHandlePair(uint32_t element_size_bytes_pow2) const;
ui::d3d12::util::DescriptorCpuGpuHandlePair
GetEdramUintPow2BindlessUAVHandlePair(uint32_t element_size_bytes_pow2) const;
// Returns a single temporary GPU-side buffer within a submission for tasks
// like texture untiling and resolving.
@ -185,19 +188,20 @@ class D3D12CommandProcessor : public CommandProcessor {
void ReleaseScratchGPUBuffer(ID3D12Resource* buffer,
D3D12_RESOURCE_STATES new_state);
// Sets the current SSAA sample positions, needs to be done before setting
// render targets or copying to depth render targets.
void SetSamplePositions(xenos::MsaaSamples sample_positions);
// Returns a pipeline with deferred creation by its handle. May return nullptr
// if failed to create the pipeline.
ID3D12PipelineState* GetD3D12PipelineByHandle(void* handle) const {
return pipeline_cache_->GetD3D12PipelineByHandle(handle);
}
// Sets the current pipeline to a compute one. This is for cache invalidation
// primarily. A submission must be open.
void SetComputePipeline(ID3D12PipelineState* pipeline);
// Sets the current cached values to external ones. This is for cache
// invalidation primarily. A submission must be open.
void SetExternalPipeline(ID3D12PipelineState* pipeline);
void SetExternalGraphicsRootSignature(ID3D12RootSignature* root_signature);
void SetViewport(const D3D12_VIEWPORT& viewport);
void SetScissorRect(const D3D12_RECT& scissor_rect);
void SetStencilReference(uint32_t stencil_ref);
void SetPrimitiveTopology(D3D12_PRIMITIVE_TOPOLOGY primitive_topology);
// For the pipeline cache to call when binding layout UIDs may be reused.
void NotifyShaderBindingsLayoutUIDsInvalidated();
@ -351,12 +355,13 @@ class D3D12CommandProcessor : public CommandProcessor {
void UpdateFixedFunctionState(const draw_util::ViewportInfo& viewport_info,
const draw_util::Scissor& scissor,
bool primitive_polygonal);
void UpdateSystemConstantValues(
bool shared_memory_is_uav, bool primitive_polygonal,
uint32_t line_loop_closing_index, xenos::Endian index_endian,
const draw_util::ViewportInfo& viewport_info, uint32_t pixel_size_x,
uint32_t pixel_size_y, uint32_t used_texture_mask, uint32_t color_mask,
const RenderTargetCache::PipelineRenderTarget render_targets[4]);
void UpdateSystemConstantValues(bool shared_memory_is_uav,
bool primitive_polygonal,
uint32_t line_loop_closing_index,
xenos::Endian index_endian,
const draw_util::ViewportInfo& viewport_info,
uint32_t used_texture_mask,
uint32_t color_mask);
bool UpdateBindings(const D3D12Shader* vertex_shader,
const D3D12Shader* pixel_shader,
ID3D12RootSignature* root_signature);
@ -418,10 +423,8 @@ class D3D12CommandProcessor : public CommandProcessor {
// bindful - mainly because of CopyDescriptorsSimple, which takes the majority
// of UpdateBindings time, and that's outside the emulator's control even).
bool bindless_resources_used_ = false;
// Should a rasterizer-ordered UAV of the EDRAM buffer with format conversion
// and blending performed in pixel shaders be used instead of host render
// targets.
bool edram_rov_used_ = false;
std::unique_ptr<D3D12RenderTargetCache> render_target_cache_;
std::unique_ptr<ui::d3d12::D3D12UploadBufferPool> constant_buffer_pool_;
@ -487,14 +490,12 @@ class D3D12CommandProcessor : public CommandProcessor {
std::unique_ptr<D3D12SharedMemory> shared_memory_;
std::unique_ptr<D3D12PrimitiveProcessor> primitive_processor_;
std::unique_ptr<PipelineCache> pipeline_cache_;
std::unique_ptr<TextureCache> texture_cache_;
std::unique_ptr<RenderTargetCache> render_target_cache_;
std::unique_ptr<PrimitiveConverter> primitive_converter_;
// Mip 0 contains the normal gamma ramp (256 entries), mip 1 contains the PWL
// ramp (128 entries). DXGI_FORMAT_R10G10B10A2_UNORM 1D.
ID3D12Resource* gamma_ramp_texture_ = nullptr;
@ -508,10 +509,9 @@ class D3D12CommandProcessor : public CommandProcessor {
static constexpr uint32_t kSwapTextureWidth = 1280;
static constexpr uint32_t kSwapTextureHeight = 720;
std::pair<uint32_t, uint32_t> GetSwapTextureSize() const {
if (texture_cache_->IsResolutionScale2X()) {
return std::make_pair(kSwapTextureWidth * 2, kSwapTextureHeight * 2);
}
return std::make_pair(kSwapTextureWidth, kSwapTextureHeight);
uint32_t resolution_scale = texture_cache_->GetDrawResolutionScale();
return std::make_pair(kSwapTextureWidth * resolution_scale,
kSwapTextureHeight * resolution_scale);
}
ID3D12Resource* swap_texture_ = nullptr;
D3D12_PLACED_SUBRESOURCE_FOOTPRINT swap_texture_copy_footprint_;
@ -549,9 +549,6 @@ class D3D12CommandProcessor : public CommandProcessor {
bool ff_blend_factor_update_needed_;
bool ff_stencil_ref_update_needed_;
// Current SSAA sample positions (to be updated by the render target cache).
xenos::MsaaSamples current_sample_positions_;
// Currently bound pipeline, either a graphics pipeline from the pipeline
// cache (with potentially deferred creation - current_external_pipeline_ is
// nullptr in this case) or a non-Xenos graphics or compute pipeline

View File

@ -9,6 +9,8 @@
#include "xenia/gpu/d3d12/d3d12_graphics_system.h"
#include <algorithm>
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
@ -20,10 +22,12 @@ namespace xe {
namespace gpu {
namespace d3d12 {
// Generated with `xb gendxbc`.
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/fullscreen_vs.h"
// Generated with `xb buildshaders`.
namespace shaders {
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/fullscreen_tc_vs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/stretch_gamma_ps.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/stretch_ps.h"
} // namespace shaders
D3D12GraphicsSystem::D3D12GraphicsSystem() {}
@ -138,10 +142,10 @@ X_STATUS D3D12GraphicsSystem::Setup(cpu::Processor* processor,
// Create the stretch pipelines.
D3D12_GRAPHICS_PIPELINE_STATE_DESC stretch_pipeline_desc = {};
stretch_pipeline_desc.pRootSignature = stretch_root_signature_;
stretch_pipeline_desc.VS.pShaderBytecode = fullscreen_vs;
stretch_pipeline_desc.VS.BytecodeLength = sizeof(fullscreen_vs);
stretch_pipeline_desc.PS.pShaderBytecode = stretch_ps;
stretch_pipeline_desc.PS.BytecodeLength = sizeof(stretch_ps);
stretch_pipeline_desc.VS.pShaderBytecode = shaders::fullscreen_tc_vs;
stretch_pipeline_desc.VS.BytecodeLength = sizeof(shaders::fullscreen_tc_vs);
stretch_pipeline_desc.PS.pShaderBytecode = shaders::stretch_ps;
stretch_pipeline_desc.PS.BytecodeLength = sizeof(shaders::stretch_ps);
// The shader will set alpha to 1, don't use output-merger to preserve it.
stretch_pipeline_desc.BlendState.RenderTarget[0].RenderTargetWriteMask =
D3D12_COLOR_WRITE_ENABLE_ALL;
@ -165,8 +169,8 @@ X_STATUS D3D12GraphicsSystem::Setup(cpu::Processor* processor,
return X_STATUS_UNSUCCESSFUL;
}
stretch_pipeline_desc.pRootSignature = stretch_gamma_root_signature_;
stretch_pipeline_desc.PS.pShaderBytecode = stretch_gamma_ps;
stretch_pipeline_desc.PS.BytecodeLength = sizeof(stretch_gamma_ps);
stretch_pipeline_desc.PS.pShaderBytecode = shaders::stretch_gamma_ps;
stretch_pipeline_desc.PS.BytecodeLength = sizeof(shaders::stretch_gamma_ps);
if (FAILED(device->CreateGraphicsPipelineState(
&stretch_pipeline_desc, IID_PPV_ARGS(&stretch_gamma_pipeline_)))) {
XELOGE(

View File

@ -0,0 +1,173 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include "xenia/gpu/d3d12/d3d12_primitive_processor.h"
#include <algorithm>
#include <cstdint>
#include <memory>
#include <utility>
#include "xenia/base/assert.h"
#include "xenia/base/logging.h"
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
#include "xenia/gpu/d3d12/deferred_command_list.h"
#include "xenia/ui/d3d12/d3d12_provider.h"
#include "xenia/ui/d3d12/d3d12_util.h"
namespace xe {
namespace gpu {
namespace d3d12 {
D3D12PrimitiveProcessor::~D3D12PrimitiveProcessor() { Shutdown(true); }
bool D3D12PrimitiveProcessor::Initialize() {
if (!InitializeCommon(true, false, false, true)) {
Shutdown();
return false;
}
frame_index_buffer_pool_ = std::make_unique<ui::d3d12::D3D12UploadBufferPool>(
command_processor_.GetD3D12Context().GetD3D12Provider(),
std::max(size_t(kMinRequiredConvertedIndexBufferSize),
ui::GraphicsUploadBufferPool::kDefaultPageSize));
return true;
}
void D3D12PrimitiveProcessor::Shutdown(bool from_destructor) {
frame_index_buffer_pool_.reset();
builtin_index_buffer_upload_.Reset();
builtin_index_buffer_gpu_address_ = 0;
builtin_index_buffer_.Reset();
if (!from_destructor) {
ShutdownCommon();
}
}
void D3D12PrimitiveProcessor::CompletedSubmissionUpdated() {
if (builtin_index_buffer_upload_ &&
command_processor_.GetCompletedSubmission() >=
builtin_index_buffer_upload_submission_) {
builtin_index_buffer_upload_.Reset();
}
}
void D3D12PrimitiveProcessor::BeginSubmission() {
if (builtin_index_buffer_upload_ &&
builtin_index_buffer_upload_submission_ == UINT64_MAX) {
// No need to submit deferred barriers - builtin_index_buffer_ has never
// been used yet, so it's in the initial state, and
// builtin_index_buffer_upload_ is in an upload heap, so it's GENERIC_READ.
command_processor_.GetDeferredCommandList().D3DCopyResource(
builtin_index_buffer_.Get(), builtin_index_buffer_upload_.Get());
command_processor_.PushTransitionBarrier(builtin_index_buffer_.Get(),
D3D12_RESOURCE_STATE_COPY_DEST,
D3D12_RESOURCE_STATE_INDEX_BUFFER);
builtin_index_buffer_upload_submission_ =
command_processor_.GetCurrentSubmission();
}
}
void D3D12PrimitiveProcessor::BeginFrame() {
frame_index_buffer_pool_->Reclaim(command_processor_.GetCompletedFrame());
}
void D3D12PrimitiveProcessor::EndFrame() {
ClearPerFrameCache();
frame_index_buffers_.clear();
}
bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
uint32_t index_count, std::function<void(uint16_t*)> fill_callback) {
assert_not_zero(index_count);
assert_null(builtin_index_buffer_);
assert_null(builtin_index_buffer_upload_);
const ui::d3d12::D3D12Provider& provider =
command_processor_.GetD3D12Context().GetD3D12Provider();
ID3D12Device* device = provider.GetDevice();
D3D12_RESOURCE_DESC resource_desc;
ui::d3d12::util::FillBufferResourceDesc(
resource_desc, UINT64(sizeof(uint16_t) * index_count),
D3D12_RESOURCE_FLAG_NONE);
Microsoft::WRL::ComPtr<ID3D12Resource> draw_resource;
if (FAILED(device->CreateCommittedResource(
&ui::d3d12::util::kHeapPropertiesDefault,
provider.GetHeapFlagCreateNotZeroed(), &resource_desc,
D3D12_RESOURCE_STATE_COPY_DEST, nullptr,
IID_PPV_ARGS(&draw_resource)))) {
XELOGE(
"D3D12 primitive processor: Failed to create the built-in index "
"buffer GPU resource with {} 16-bit indices",
index_count);
return false;
}
Microsoft::WRL::ComPtr<ID3D12Resource> upload_resource;
if (FAILED(device->CreateCommittedResource(
&ui::d3d12::util::kHeapPropertiesUpload,
provider.GetHeapFlagCreateNotZeroed(), &resource_desc,
D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
IID_PPV_ARGS(&upload_resource)))) {
XELOGE(
"D3D12 primitive processor: Failed to create the built-in index "
"buffer upload resource with {} 16-bit indices",
index_count);
return false;
}
D3D12_RANGE upload_read_range = {};
void* mapping;
if (FAILED(upload_resource->Map(0, &upload_read_range, &mapping))) {
XELOGE(
"D3D12 primitive processor: Failed to map the built-in index buffer "
"upload resource with {} 16-bit indices",
index_count);
return false;
}
fill_callback(reinterpret_cast<uint16_t*>(mapping));
upload_resource->Unmap(0, nullptr);
// Successfully created the buffer and wrote the data to upload.
builtin_index_buffer_ = std::move(draw_resource);
builtin_index_buffer_gpu_address_ =
builtin_index_buffer_->GetGPUVirtualAddress();
builtin_index_buffer_upload_ = std::move(upload_resource);
// Schedule uploading in the first submission.
builtin_index_buffer_upload_submission_ = UINT64_MAX;
return true;
}
void* D3D12PrimitiveProcessor::RequestHostConvertedIndexBufferForCurrentFrame(
xenos::IndexFormat format, uint32_t index_count, bool coalign_for_simd,
uint32_t coalignment_original_address, size_t& backend_handle_out) {
size_t index_size = format == xenos::IndexFormat::kInt16 ? sizeof(uint16_t)
: sizeof(uint32_t);
D3D12_GPU_VIRTUAL_ADDRESS gpu_address;
uint8_t* mapping = frame_index_buffer_pool_->Request(
command_processor_.GetCurrentFrame(),
index_size * index_count +
(coalign_for_simd ? XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE : 0),
index_size, nullptr, nullptr, &gpu_address);
if (!mapping) {
return nullptr;
}
if (coalign_for_simd) {
ptrdiff_t coalignment_offset =
GetSimdCoalignmentOffset(mapping, coalignment_original_address);
mapping += coalignment_offset;
gpu_address = D3D12_GPU_VIRTUAL_ADDRESS(gpu_address + coalignment_offset);
}
backend_handle_out = frame_index_buffers_.size();
frame_index_buffers_.push_back(gpu_address);
return mapping;
}
} // namespace d3d12
} // namespace gpu
} // namespace xe
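For context, InitializeBuiltin16BitIndexBuffer is a protected override driven by the common PrimitiveProcessor, which fills the mapped upload memory through the callback. A hypothetical sketch of such a call site (the real one, and the real index patterns, live in primitive_processor.cc; index_count here is a made-up size):

// Sketch only: write sequential 16-bit indices into the mapped upload heap.
uint32_t index_count = 3 * 1024;
bool ok = InitializeBuiltin16BitIndexBuffer(
    index_count, [index_count](uint16_t* mapping) {
      for (uint32_t i = 0; i < index_count; ++i) {
        mapping[i] = uint16_t(i);
      }
    });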

View File

@ -0,0 +1,90 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_GPU_D3D12_D3D12_PRIMITIVE_PROCESSOR_H_
#define XENIA_GPU_D3D12_D3D12_PRIMITIVE_PROCESSOR_H_
#include <cstdint>
#include <deque>
#include <memory>
#include "xenia/base/assert.h"
#include "xenia/gpu/primitive_processor.h"
#include "xenia/ui/d3d12/d3d12_api.h"
#include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h"
namespace xe {
namespace gpu {
namespace d3d12 {
class D3D12CommandProcessor;
class D3D12PrimitiveProcessor final : public PrimitiveProcessor {
public:
D3D12PrimitiveProcessor(const RegisterFile& register_file, Memory& memory,
TraceWriter& trace_writer,
SharedMemory& shared_memory,
D3D12CommandProcessor& command_processor)
: PrimitiveProcessor(register_file, memory, trace_writer, shared_memory),
command_processor_(command_processor) {}
~D3D12PrimitiveProcessor();
bool Initialize();
void Shutdown(bool from_destructor = false);
void ClearCache() { frame_index_buffer_pool_->ClearCache(); }
void CompletedSubmissionUpdated();
void BeginSubmission();
void BeginFrame();
void EndFrame();
D3D12_GPU_VIRTUAL_ADDRESS GetBuiltinIndexBufferGpuAddress(
size_t handle) const {
assert_not_null(builtin_index_buffer_);
return D3D12_GPU_VIRTUAL_ADDRESS(builtin_index_buffer_gpu_address_ +
GetBuiltinIndexBufferOffsetBytes(handle));
}
D3D12_GPU_VIRTUAL_ADDRESS GetConvertedIndexBufferGpuAddress(
size_t handle) const {
return frame_index_buffers_[handle];
}
protected:
bool InitializeBuiltin16BitIndexBuffer(
uint32_t index_count,
std::function<void(uint16_t*)> fill_callback) override;
void* RequestHostConvertedIndexBufferForCurrentFrame(
xenos::IndexFormat format, uint32_t index_count, bool coalign_for_simd,
uint32_t coalignment_original_address,
size_t& backend_handle_out) override;
private:
D3D12CommandProcessor& command_processor_;
Microsoft::WRL::ComPtr<ID3D12Resource> builtin_index_buffer_;
D3D12_GPU_VIRTUAL_ADDRESS builtin_index_buffer_gpu_address_ = 0;
// Temporary buffer copied at the beginning of the first submission for
// uploading to builtin_index_buffer_, destroyed once the submission in which
// it was uploaded is completed.
Microsoft::WRL::ComPtr<ID3D12Resource> builtin_index_buffer_upload_;
// UINT64_MAX means not uploaded yet and needs uploading in the first
// submission (if the upload buffer exists at all).
uint64_t builtin_index_buffer_upload_submission_ = UINT64_MAX;
std::unique_ptr<ui::d3d12::D3D12UploadBufferPool> frame_index_buffer_pool_;
// Indexed by the backend handles.
std::deque<D3D12_GPU_VIRTUAL_ADDRESS> frame_index_buffers_;
};
} // namespace d3d12
} // namespace gpu
} // namespace xe
#endif // XENIA_GPU_D3D12_D3D12_PRIMITIVE_PROCESSOR_H_

File diff suppressed because it is too large

View File

@ -0,0 +1,841 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_GPU_D3D12_D3D12_RENDER_TARGET_CACHE_H_
#define XENIA_GPU_D3D12_D3D12_RENDER_TARGET_CACHE_H_
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <deque>
#include <functional>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>
#include "xenia/base/assert.h"
#include "xenia/gpu/d3d12/d3d12_shared_memory.h"
#include "xenia/gpu/d3d12/texture_cache.h"
#include "xenia/gpu/draw_util.h"
#include "xenia/gpu/render_target_cache.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"
#include "xenia/memory.h"
#include "xenia/ui/d3d12/d3d12_cpu_descriptor_pool.h"
#include "xenia/ui/d3d12/d3d12_provider.h"
#include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h"
#include "xenia/ui/d3d12/d3d12_util.h"
namespace xe {
namespace gpu {
namespace d3d12 {
class D3D12CommandProcessor;
class D3D12RenderTargetCache final : public RenderTargetCache {
public:
D3D12RenderTargetCache(const RegisterFile& register_file,
D3D12CommandProcessor& command_processor,
TraceWriter& trace_writer,
bool bindless_resources_used)
: RenderTargetCache(register_file),
command_processor_(command_processor),
trace_writer_(trace_writer),
bindless_resources_used_(bindless_resources_used) {}
~D3D12RenderTargetCache() override;
bool Initialize();
void Shutdown(bool from_destructor = false);
void CompletedSubmissionUpdated();
void BeginSubmission();
Path GetPath() const override { return path_; }
uint32_t GetResolutionScale() const override { return resolution_scale_; }
bool Update(bool is_rasterization_done,
uint32_t shader_writes_color_targets) override;
void InvalidateCommandListRenderTargets() {
are_current_command_list_render_targets_valid_ = false;
}
bool msaa_2x_supported() const { return msaa_2x_supported_; }
void WriteEdramRawSRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle);
void WriteEdramRawUAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle);
void WriteEdramUintPow2SRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle,
uint32_t element_size_bytes_pow2);
void WriteEdramUintPow2UAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle,
uint32_t element_size_bytes_pow2);
// Performs the resolve to a shared memory area according to the current
// register values, and also clears the render targets if needed. Must be in a
// frame for calling.
bool Resolve(const Memory& memory, D3D12SharedMemory& shared_memory,
TextureCache& texture_cache, uint32_t& written_address_out,
uint32_t& written_length_out);
// Returns true if any downloads were submitted to the command processor.
bool InitializeTraceSubmitDownloads();
void InitializeTraceCompleteDownloads();
void RestoreEdramSnapshot(const void* snapshot);
// For host render targets.
bool gamma_render_target_as_srgb() const {
return gamma_render_target_as_srgb_;
}
// Using R16G16[B16A16]_SNORM, which are -1...1, not the needed -32...32.
// Persistent data doesn't depend on this, so it can be overridden by
// per-game configuration.
bool IsFixed16TruncatedToMinus1To1() const {
return GetPath() == Path::kHostRenderTargets &&
!cvars::snorm16_render_target_full_range;
}
DepthFloat24Conversion depth_float24_conversion() const {
return depth_float24_conversion_;
}
DXGI_FORMAT GetColorResourceDXGIFormat(
xenos::ColorRenderTargetFormat format) const;
DXGI_FORMAT GetColorDrawDXGIFormat(
xenos::ColorRenderTargetFormat format) const;
DXGI_FORMAT GetColorOwnershipTransferDXGIFormat(
xenos::ColorRenderTargetFormat format,
bool* is_integer_out = nullptr) const;
static DXGI_FORMAT GetDepthResourceDXGIFormat(
xenos::DepthRenderTargetFormat format);
static DXGI_FORMAT GetDepthDSVDXGIFormat(
xenos::DepthRenderTargetFormat format);
static DXGI_FORMAT GetDepthSRVDepthDXGIFormat(
xenos::DepthRenderTargetFormat format);
static DXGI_FORMAT GetDepthSRVStencilDXGIFormat(
xenos::DepthRenderTargetFormat format);
protected:
class D3D12RenderTarget final : public RenderTarget {
public:
// descriptor_draw_srgb is only used for k_8_8_8_8 render targets when host
// sRGB (gamma_render_target_as_srgb) is used. descriptor_load_separate is
// present when the DXGI formats are different for drawing and bit-exact
// loading (for NaN pattern preservation across EDRAM tile ownership transfers
// in floating-point formats, and to distinguish between two -1 representations
// in snorm formats).
D3D12RenderTarget(
RenderTargetKey key, D3D12RenderTargetCache& render_target_cache,
ID3D12Resource* resource,
ui::d3d12::D3D12CpuDescriptorPool::Descriptor&& descriptor_draw,
ui::d3d12::D3D12CpuDescriptorPool::Descriptor&& descriptor_draw_srgb,
ui::d3d12::D3D12CpuDescriptorPool::Descriptor&&
descriptor_load_separate,
ui::d3d12::D3D12CpuDescriptorPool::Descriptor&& descriptor_srv,
ui::d3d12::D3D12CpuDescriptorPool::Descriptor&& descriptor_srv_stencil,
D3D12_RESOURCE_STATES resource_state)
: RenderTarget(key),
render_target_cache_(render_target_cache),
resource_(resource),
descriptor_draw_(std::move(descriptor_draw)),
descriptor_draw_srgb_(std::move(descriptor_draw_srgb)),
descriptor_load_separate_(std::move(descriptor_load_separate)),
descriptor_srv_(std::move(descriptor_srv)),
descriptor_srv_stencil_(std::move(descriptor_srv_stencil)),
resource_state_(resource_state) {}
ID3D12Resource* resource() const { return resource_.Get(); }
const ui::d3d12::D3D12CpuDescriptorPool::Descriptor& descriptor_draw()
const {
return descriptor_draw_;
}
const ui::d3d12::D3D12CpuDescriptorPool::Descriptor& descriptor_draw_srgb()
const {
return descriptor_draw_srgb_;
}
const ui::d3d12::D3D12CpuDescriptorPool::Descriptor& descriptor_srv()
const {
return descriptor_srv_;
}
const ui::d3d12::D3D12CpuDescriptorPool::Descriptor&
descriptor_srv_stencil() const {
return descriptor_srv_stencil_;
}
const ui::d3d12::D3D12CpuDescriptorPool::Descriptor&
descriptor_load_separate() const {
return descriptor_load_separate_;
}
D3D12_RESOURCE_STATES SetResourceState(D3D12_RESOURCE_STATES new_state) {
D3D12_RESOURCE_STATES old_state = resource_state_;
resource_state_ = new_state;
return old_state;
}
uint32_t temporary_srv_descriptor_index() const {
return temporary_srv_descriptor_index_;
}
void SetTemporarySRVDescriptorIndex(uint32_t index) {
temporary_srv_descriptor_index_ = index;
}
uint32_t temporary_srv_descriptor_index_stencil() const {
return temporary_srv_descriptor_index_stencil_;
}
void SetTemporarySRVDescriptorIndexStencil(uint32_t index) {
temporary_srv_descriptor_index_stencil_ = index;
}
uint32_t temporary_sort_index() const { return temporary_sort_index_; }
void SetTemporarySortIndex(uint32_t index) {
temporary_sort_index_ = index;
}
private:
D3D12RenderTargetCache& render_target_cache_;
Microsoft::WRL::ComPtr<ID3D12Resource> resource_;
ui::d3d12::D3D12CpuDescriptorPool::Descriptor descriptor_draw_;
ui::d3d12::D3D12CpuDescriptorPool::Descriptor descriptor_draw_srgb_;
ui::d3d12::D3D12CpuDescriptorPool::Descriptor descriptor_load_separate_;
// Texture SRV non-shader-visible descriptors, to prepare shader-visible
// descriptors faster, by copying rather than by creating every time.
// TODO(Triang3l): With bindless resources, persistently store them in the
// heap.
ui::d3d12::D3D12CpuDescriptorPool::Descriptor descriptor_srv_;
ui::d3d12::D3D12CpuDescriptorPool::Descriptor descriptor_srv_stencil_;
D3D12_RESOURCE_STATES resource_state_;
// Temporary storage for indices in operations like transfers and dumps.
uint32_t temporary_srv_descriptor_index_ = UINT32_MAX;
uint32_t temporary_srv_descriptor_index_stencil_ = UINT32_MAX;
uint32_t temporary_sort_index_ = 0;
};
uint32_t GetMaxRenderTargetWidth() const override {
return D3D12_REQ_TEXTURE2D_U_OR_V_DIMENSION;
}
uint32_t GetMaxRenderTargetHeight() const override {
return D3D12_REQ_TEXTURE2D_U_OR_V_DIMENSION;
}
xenos::ColorRenderTargetFormat GetHostRelevantColorFormat(
xenos::ColorRenderTargetFormat format) const override;
RenderTarget* CreateRenderTarget(RenderTargetKey key) override;
bool IsHostDepthEncodingDifferent(
xenos::DepthRenderTargetFormat format) const override;
void RequestPixelShaderInterlockBarrier() override;
private:
enum class EdramBufferModificationStatus {
// No uncommitted ROV/UAV writes.
kUnmodified,
// Need to commit before the next ROV usage with overlap.
kAsROV,
// Need to commit before any next ROV usage.
kAsUAV,
};
void TransitionEdramBuffer(D3D12_RESOURCE_STATES new_state);
void MarkEdramBufferModified(
EdramBufferModificationStatus modification_status =
EdramBufferModificationStatus::kAsUAV);
void CommitEdramBufferUAVWrites(EdramBufferModificationStatus commit_status =
EdramBufferModificationStatus::kAsROV);
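// An illustrative sketch (not part of the original header) of how these three
// helpers are meant to be sequenced around an unordered-access write, assuming
// the command processor flushes the actual barriers:
//   TransitionEdramBuffer(D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
//   // ... record a dispatch that writes edram_buffer_ through a UAV ...
//   MarkEdramBufferModified(EdramBufferModificationStatus::kAsUAV);
//   // Before the next draw that reads or writes the buffer as an ROV:
//   CommitEdramBufferUAVWrites(EdramBufferModificationStatus::kAsROV);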
D3D12CommandProcessor& command_processor_;
TraceWriter& trace_writer_;
bool bindless_resources_used_;
Path path_ = Path::kHostRenderTargets;
uint32_t resolution_scale_ = 1;
// For host render targets, an EDRAM-sized scratch buffer for:
// - Guest render target data copied from host render targets during copying
// in resolves and in frame trace creation.
// - Host float32 depth in ownership transfers when the host depth texture and
// the destination are the same.
// For rasterizer-ordered view, the buffer containing the EDRAM data.
// (Note that if a hybrid RTV / DSV + ROV approach to color render targets is
// added, which is, however, unlikely as it would have very complicated
// interaction with depth / stencil testing, host depth will need to be
// copied to a different buffer - the same range may have ROV-owned color and
// host float32 depth at the same time).
ID3D12Resource* edram_buffer_ = nullptr;
D3D12_RESOURCE_STATES edram_buffer_state_;
EdramBufferModificationStatus edram_buffer_modification_status_ =
EdramBufferModificationStatus::kUnmodified;
// Non-shader-visible descriptor heap containing pre-created SRV and UAV
// descriptors of the EDRAM buffer, for faster binding (by copying rather
// than creation).
enum class EdramBufferDescriptorIndex : uint32_t {
kRawSRV,
kR32UintSRV,
kR32G32UintSRV,
kR32G32B32A32UintSRV,
kRawUAV,
kR32UintUAV,
kR32G32UintUAV,
kR32G32B32A32UintUAV,
kCount,
};
ID3D12DescriptorHeap* edram_buffer_descriptor_heap_ = nullptr;
D3D12_CPU_DESCRIPTOR_HANDLE edram_buffer_descriptor_heap_start_;
// Resolve copying root signature and pipelines.
// Parameter 0 - draw_util::ResolveCopyShaderConstants or its ::DestRelative.
// Parameter 1 - destination (shared memory or a part of it).
// Parameter 2 - source (EDRAM).
ID3D12RootSignature* resolve_copy_root_signature_ = nullptr;
static const std::pair<const void*, size_t>
kResolveCopyShaders[size_t(draw_util::ResolveCopyShaderIndex::kCount)];
ID3D12PipelineState* resolve_copy_pipelines_[size_t(
draw_util::ResolveCopyShaderIndex::kCount)] = {};
// For traces.
ID3D12Resource* edram_snapshot_download_buffer_ = nullptr;
std::unique_ptr<ui::d3d12::D3D12UploadBufferPool>
edram_snapshot_restore_pool_;
// For host render targets.
enum TransferCBVRegister : uint32_t {
kTransferCBVRegisterStencilMask,
kTransferCBVRegisterAddress,
kTransferCBVRegisterHostDepthAddress,
};
enum TransferSRVRegister : uint32_t {
kTransferSRVRegisterColor,
kTransferSRVRegisterDepth,
kTransferSRVRegisterStencil,
kTransferSRVRegisterHostDepth,
kTransferSRVRegisterCount,
};
enum TransferUsedRootParameter : uint32_t {
// Changed 8 times per transfer.
kTransferUsedRootParameterStencilMaskConstant,
kTransferUsedRootParameterColorSRV,
// Mutually exclusive with ColorSRV.
kTransferUsedRootParameterDepthSRV,
// Mutually exclusive with ColorSRV.
kTransferUsedRootParameterStencilSRV,
// May happen to be the same for different sources.
kTransferUsedRootParameterAddressConstant,
kTransferUsedRootParameterHostDepthSRV,
kTransferUsedRootParameterHostDepthAddressConstant,
kTransferUsedRootParameterCount,
kTransferUsedRootParameterStencilMaskConstantBit =
uint32_t(1) << kTransferUsedRootParameterStencilMaskConstant,
kTransferUsedRootParameterColorSRVBit =
uint32_t(1) << kTransferUsedRootParameterColorSRV,
kTransferUsedRootParameterDepthSRVBit =
uint32_t(1) << kTransferUsedRootParameterDepthSRV,
kTransferUsedRootParameterStencilSRVBit =
uint32_t(1) << kTransferUsedRootParameterStencilSRV,
kTransferUsedRootParameterAddressConstantBit =
uint32_t(1) << kTransferUsedRootParameterAddressConstant,
kTransferUsedRootParameterHostDepthSRVBit =
uint32_t(1) << kTransferUsedRootParameterHostDepthSRV,
kTransferUsedRootParameterHostDepthAddressConstantBit =
uint32_t(1) << kTransferUsedRootParameterHostDepthAddressConstant,
kTransferUsedRootParametersDescriptorMask =
kTransferUsedRootParameterColorSRVBit |
kTransferUsedRootParameterDepthSRVBit |
kTransferUsedRootParameterStencilSRVBit |
kTransferUsedRootParameterHostDepthSRVBit,
};
enum class TransferRootSignatureIndex {
kColor,
kDepth,
kDepthStencil,
kColorToStencilBit,
kStencilToStencilBit,
kColorAndHostDepth,
kDepthAndHostDepth,
kDepthStencilAndHostDepth,
kCount,
};
static const uint32_t
kTransferUsedRootParameters[size_t(TransferRootSignatureIndex::kCount)];
enum class TransferMode : uint32_t {
// 1 SRV (color texture), source constant.
kColorToDepth,
// 1 SRV (color texture), source constant.
kColorToColor,
// 1 or 2 SRVs (depth texture, stencil texture if SV_StencilRef is
// supported), source constant.
kDepthToDepth,
// 2 SRVs (depth texture, stencil texture), source constant.
kDepthToColor,
// 1 SRV (color texture), mask constant (most frequently changed, 8 times
// per transfer), source constant.
kColorToStencilBit,
// 1 SRV (stencil texture), mask constant, source constant.
kDepthToStencilBit,
// Two-source modes, using the host depth if it, when converted to the guest
// format, matches what's in the owner source (not modified, keep host
// precision), or the guest data otherwise (significantly modified, possibly
// cleared). Stencil for SV_StencilRef is always taken from the guest
// source.
// 2 SRVs (color texture, host depth texture or buffer), source constant,
// host depth source constant.
kColorAndHostDepthToDepth,
// When using different source and destination depth formats. 2 or 3 SRVs
// (depth texture, stencil texture if SV_StencilRef is supported, host depth
// texture or buffer), source constant, host depth source constant.
kDepthAndHostDepthToDepth,
kCount,
};
enum class TransferOutput {
kColor,
kDepth,
// With this output, kTransferCBVRegisterStencilMask is used.
kStencilBit,
};
struct TransferModeInfo {
TransferOutput output;
TransferRootSignatureIndex root_signature_no_stencil_ref;
TransferRootSignatureIndex root_signature_with_stencil_ref;
};
static const TransferModeInfo kTransferModes[size_t(TransferMode::kCount)];
union TransferShaderKey {
struct {
xenos::MsaaSamples dest_msaa_samples : xenos::kMsaaSamplesBits;
uint32_t dest_host_relevant_format : xenos::kRenderTargetFormatBits;
xenos::MsaaSamples source_msaa_samples : xenos::kMsaaSamplesBits;
// Always 1x when host_depth_source_is_copy is true, to avoid creating
// identical pipelines for different MSAA sample counts, as the sample count
// doesn't matter in this case.
xenos::MsaaSamples host_depth_source_msaa_samples
: xenos::kMsaaSamplesBits;
uint32_t source_host_relevant_format : xenos::kRenderTargetFormatBits;
// If host depth is also fetched, whether it's pre-copied to the EDRAM
// buffer (but since it's just a scratch buffer, with tiles laid out
// linearly with the same pitch as in the original render target; also no
// swapping of 40-sample columns as opposed to the host render target -
// this is done only for the color source).
uint32_t host_depth_source_is_copy : 1;
// Last bits because this affects the root signature - after sorting, change
// it as few times as possible. Depth buffers have an additional depth SRV.
static_assert(size_t(TransferMode::kCount) <= (size_t(1) << 3));
TransferMode mode : 3;
};
uint32_t key = 0;
struct Hasher {
size_t operator()(const TransferShaderKey& key) const {
return std::hash<uint32_t>{}(key.key);
}
};
bool operator==(const TransferShaderKey& other_key) const {
return key == other_key.key;
}
bool operator!=(const TransferShaderKey& other_key) const {
return !(*this == other_key);
}
bool operator<(const TransferShaderKey& other_key) const {
return key < other_key.key;
}
};
union TransferAddressConstant {
struct {
// All in tiles.
uint32_t dest_pitch : xenos::kEdramPitchTilesBits;
uint32_t source_pitch : xenos::kEdramPitchTilesBits;
// Safe to use 12 bits for signed difference - no ownership transfer can
// ever occur between render targets with EDRAM base >= 2048 as this would
// result in 0-length spans. 10 + 10 + 12 is exactly 32, any more bits,
// and more root 32-bit constants will be used.
// Destination base in tiles minus source base in tiles (not vice versa
// because this is a transform of the coordinate system, not addresses
// themselves).
// 0 for host_depth_source_is_copy (ignored in this case anyway, as
// destination == source).
int32_t source_to_dest : xenos::kEdramBaseTilesBits;
};
uint32_t constant = 0;
bool operator==(const TransferAddressConstant& other_constant) const {
return constant == other_constant.constant;
}
bool operator!=(const TransferAddressConstant& other_constant) const {
return !(*this == other_constant);
}
};
static_assert(sizeof(TransferAddressConstant) == sizeof(uint32_t));
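// A worked packing example (illustrative, with assumed values): transferring
// between render targets with an 80-tile pitch, from EDRAM base 160 to EDRAM
// base 320, fits the three fields in exactly 10 + 10 + 12 = 32 bits:
//   TransferAddressConstant address_constant;
//   address_constant.dest_pitch = 80;
//   address_constant.source_pitch = 80;
//   address_constant.source_to_dest = 320 - 160;  // Destination minus source.
//   // address_constant.constant is the packed 32-bit root constant value.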
struct TransferInvocation {
Transfer transfer;
TransferShaderKey shader_key;
TransferInvocation(const Transfer& transfer,
const TransferShaderKey& shader_key)
: transfer(transfer), shader_key(shader_key) {}
bool operator<(const TransferInvocation& other_invocation) {
// TODO(Triang3l): See if it may be better to sort by the source in the
// first place, especially when reading the same data multiple times (like
// to write the stencil bits after depth) for better read locality.
// Sort by the shader key primarily to reduce pipeline state (context)
// switches.
if (shader_key != other_invocation.shader_key) {
return shader_key < other_invocation.shader_key;
}
// Host depth render targets are changed rarely if they exist, won't save
// many binding changes, ignore them for simplicity (their existence is
// caught by the shader key change).
assert_not_null(transfer.source);
assert_not_null(other_invocation.transfer.source);
uint32_t source_index =
static_cast<const D3D12RenderTarget*>(transfer.source)
->temporary_sort_index();
uint32_t other_source_index = static_cast<const D3D12RenderTarget*>(
other_invocation.transfer.source)
->temporary_sort_index();
if (source_index != other_source_index) {
return source_index < other_source_index;
}
return transfer.start_tiles < other_invocation.transfer.start_tiles;
}
bool CanBeMergedIntoOneDraw(
const TransferInvocation& other_invocation) const {
return shader_key == other_invocation.shader_key &&
transfer.AreSourcesSame(other_invocation.transfer);
}
};
union HostDepthStoreRectangleConstant {
struct {
// - 1 because the maximum is 0x1FFF / 8, not 0x2000 / 8.
uint32_t x_pixels_div_8 : xenos::kResolveSizeBits - 1 -
xenos::kResolveAlignmentPixelsLog2;
uint32_t y_pixels_div_8 : xenos::kResolveSizeBits - 1 -
xenos::kResolveAlignmentPixelsLog2;
uint32_t width_pixels_div_8_minus_1 : xenos::kResolveSizeBits - 1 -
xenos::kResolveAlignmentPixelsLog2;
};
uint32_t constant = 0;
};
static_assert(sizeof(HostDepthStoreRectangleConstant) == sizeof(uint32_t));
union HostDepthStoreRenderTargetConstant {
struct {
uint32_t pitch_tiles : xenos::kEdramPitchTilesBits;
// 1 to 3.
uint32_t resolution_scale : 2;
// For native 2x MSAA vs. 2x over 4x.
uint32_t second_sample_index : 2;
};
uint32_t constant = 0;
};
static_assert(sizeof(HostDepthStoreRenderTargetConstant) == sizeof(uint32_t));
enum {
kHostDepthStoreRootParameterRectangleConstant,
kHostDepthStoreRootParameterRenderTargetConstant,
kHostDepthStoreRootParameterSource,
kHostDepthStoreRootParameterDest,
kHostDepthStoreRootParameterCount,
};
union DumpPipelineKey {
struct {
xenos::MsaaSamples msaa_samples : 2;
uint32_t host_relevant_format : 4;
// Last bit because this affects the root signature - after sorting, it
// changes at most once. Depth buffers have an additional stencil SRV.
uint32_t is_depth : 1;
};
uint32_t key = 0;
struct Hasher {
size_t operator()(const DumpPipelineKey& key) const {
return std::hash<uint32_t>{}(key.key);
}
};
bool operator==(const DumpPipelineKey& other_key) const {
return key == other_key.key;
}
bool operator!=(const DumpPipelineKey& other_key) const {
return !(*this == other_key);
}
bool operator<(const DumpPipelineKey& other_key) const {
return key < other_key.key;
}
xenos::ColorRenderTargetFormat GetColorFormat() const {
assert_false(is_depth);
return xenos::ColorRenderTargetFormat(host_relevant_format);
}
xenos::DepthRenderTargetFormat GetDepthFormat() const {
assert_true(is_depth);
return xenos::DepthRenderTargetFormat(host_relevant_format);
}
};
union DumpOffsets {
struct {
// Absolute index of the first thread group's tile within the source
// texture.
uint32_t first_group_tile_source_relative : xenos::kEdramBaseTilesBits;
uint32_t source_base_tiles : xenos::kEdramBaseTilesBits;
};
uint32_t offsets = 0;
bool operator==(const DumpOffsets& other_offsets) const {
return offsets == other_offsets.offsets;
}
bool operator!=(const DumpOffsets& other_offsets) const {
return !(*this == other_offsets);
}
};
static_assert(sizeof(DumpOffsets) == sizeof(uint32_t));
union DumpPitches {
struct {
// Both in tiles.
uint32_t source_pitch : xenos::kEdramPitchTilesBits;
uint32_t dest_pitch : xenos::kEdramPitchTilesBits;
};
uint32_t pitches = 0;
bool operator==(const DumpPitches& other_pitches) const {
return pitches == other_pitches.pitches;
}
bool operator!=(const DumpPitches& other_pitches) const {
return !(*this == other_pitches);
}
};
static_assert(sizeof(DumpPitches) == sizeof(uint32_t));
enum DumpCbuffer : uint32_t {
kDumpCbufferOffsets,
kDumpCbufferPitches,
kDumpCbufferCount,
};
enum DumpRootParameter : uint32_t {
// May be changed multiple times for the same source.
kDumpRootParameterOffsets,
// One resolve may need multiple sources.
kDumpRootParameterSource,
// May be different for different sources.
kDumpRootParameterColorPitches = kDumpRootParameterSource + 1,
// Only changed between 32bpp and 64bpp.
kDumpRootParameterColorEdram,
kDumpRootParameterColorCount,
// Same change frequency as the source (though currently the command processor
// can't contiguously allocate multiple descriptors with bindless; when such
// functionality is added, switch to one root signature).
kDumpRootParameterDepthStencil = kDumpRootParameterSource + 1,
kDumpRootParameterDepthPitches,
kDumpRootParameterDepthEdram,
kDumpRootParameterDepthCount,
};
struct DumpInvocation {
ResolveCopyDumpRectangle rectangle;
DumpPipelineKey pipeline_key;
DumpInvocation(const ResolveCopyDumpRectangle& rectangle,
const DumpPipelineKey& pipeline_key)
: rectangle(rectangle), pipeline_key(pipeline_key) {}
bool operator<(const DumpInvocation& other_invocation) {
// Sort by the pipeline key primarily to reduce pipeline state (context)
// switches.
if (pipeline_key != other_invocation.pipeline_key) {
return pipeline_key < other_invocation.pipeline_key;
}
assert_not_null(rectangle.render_target);
uint32_t render_target_index =
static_cast<const D3D12RenderTarget*>(rectangle.render_target)
->temporary_sort_index();
const ResolveCopyDumpRectangle& other_rectangle =
other_invocation.rectangle;
uint32_t other_render_target_index =
static_cast<const D3D12RenderTarget*>(other_rectangle.render_target)
->temporary_sort_index();
if (render_target_index != other_render_target_index) {
return render_target_index < other_render_target_index;
}
if (rectangle.row_first != other_rectangle.row_first) {
return rectangle.row_first < other_rectangle.row_first;
}
return rectangle.row_first_start < other_rectangle.row_first_start;
}
};
// Returns:
// - A pointer to 1 pipeline for writing color or depth (or stencil via
// SV_StencilRef).
// - A pointer to 8 pipelines for writing stencil by discarding samples
// depending on whether they have one bit set, from 1 << 0 to 1 << 7.
// - Null if failed to create.
ID3D12PipelineState* const* GetOrCreateTransferPipelines(
TransferShaderKey key);
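// An illustrative consumer of the contract described above (a sketch - the
// real call sites live in the .cc file):
//   ID3D12PipelineState* const* transfer_pipelines =
//       GetOrCreateTransferPipelines(key);
//   if (transfer_pipelines) {
//     if (kTransferModes[size_t(key.mode)].output ==
//         TransferOutput::kStencilBit) {
//       // 8 pipelines - draw once per stencil bit with transfer_pipelines[i],
//       // the stencil write mask set to 1 << i.
//     } else {
//       // A single pipeline - draw with transfer_pipelines[0].
//     }
//   }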
static TransferMode GetTransferMode(bool dest_is_stencil_bit,
bool dest_is_depth, bool source_is_depth,
bool source_has_host_depth) {
assert_true(dest_is_depth ||
(!dest_is_stencil_bit && !source_has_host_depth));
if (dest_is_stencil_bit) {
return source_is_depth ? TransferMode::kDepthToStencilBit
: TransferMode::kColorToStencilBit;
}
if (dest_is_depth) {
if (source_is_depth) {
return source_has_host_depth ? TransferMode::kDepthAndHostDepthToDepth
: TransferMode::kDepthToDepth;
}
return source_has_host_depth ? TransferMode::kColorAndHostDepthToDepth
: TransferMode::kColorToDepth;
}
return source_is_depth ? TransferMode::kDepthToColor
: TransferMode::kColorToColor;
}
// Performs ownership transfers for render targets - each render target /
// vector may be null / empty if there's nothing to do for it.
// resolve_clear_rectangle is expected to be provided by
// PrepareHostRenderTargetsResolveClear which should do all the needed size
// bound checks.
void PerformTransfersAndResolveClears(
uint32_t render_target_count, RenderTarget* const* render_targets,
const std::vector<Transfer>* render_target_transfers,
const uint64_t* render_target_resolve_clear_values = nullptr,
const Transfer::Rectangle* resolve_clear_rectangle = nullptr);
// Accepts an array of (1 + xenos::kMaxColorRenderTargets) render targets,
// first depth, then color.
void SetCommandListRenderTargets(
RenderTarget* const* depth_and_color_render_targets);
ID3D12PipelineState* GetOrCreateDumpPipeline(DumpPipelineKey key);
// Writes contents of host render targets within rectangles from
// ResolveInfo::GetCopyEdramTileSpan to edram_buffer_.
void DumpRenderTargets(uint32_t dump_base, uint32_t dump_row_length_used,
uint32_t dump_rows, uint32_t dump_pitch);
bool use_stencil_reference_output_ = false;
bool gamma_render_target_as_srgb_ = false;
DepthFloat24Conversion depth_float24_conversion_ =
DepthFloat24Conversion::kOnCopy;
bool msaa_2x_supported_ = false;
std::shared_ptr<ui::d3d12::D3D12CpuDescriptorPool> descriptor_pool_color_;
std::shared_ptr<ui::d3d12::D3D12CpuDescriptorPool> descriptor_pool_depth_;
std::shared_ptr<ui::d3d12::D3D12CpuDescriptorPool> descriptor_pool_srv_;
ui::d3d12::D3D12CpuDescriptorPool::Descriptor null_rtv_descriptor_ss_;
ui::d3d12::D3D12CpuDescriptorPool::Descriptor null_rtv_descriptor_ms_;
// Possible tile ownership transfer paths:
// - To color:
// - From color: 1 SRV (color).
// - From depth: 2 SRVs (depth, stencil).
// - To depth / stencil (with SV_StencilRef):
// - From color: 1 SRV (color).
// - From depth: 2 SRVs (depth, stencil).
// - From color and float32 depth: 2 SRVs (color with stencil, depth).
// - Different depth buffer: depth SRV is a texture.
// - Same depth buffer: depth SRV is a buffer (pre-copied).
// - To depth (no SV_StencilRef):
// - From color: 1 SRV (color).
// - From depth: 1 SRV (depth).
// - From color and float32 depth: 2 SRVs (color, depth).
// - Different depth buffer: depth SRV is a texture.
// - Same depth buffer: depth SRV is a buffer (pre-copied).
// - To stencil (no SV_StencilRef):
// - From color: 1 SRV (color).
// - From depth: 1 SRV (stencil).
const RenderTarget* const*
current_command_list_render_targets_[1 + xenos::kMaxColorRenderTargets];
uint32_t are_current_command_list_render_targets_srgb_ = 0;
bool are_current_command_list_render_targets_valid_ = false;
// Temporary storage for descriptors used in PerformTransfersAndResolveClears
// and DumpRenderTargets.
std::vector<D3D12_CPU_DESCRIPTOR_HANDLE> current_temporary_descriptors_cpu_;
std::vector<ui::d3d12::util::DescriptorCpuGpuHandlePair>
current_temporary_descriptors_gpu_;
ID3D12RootSignature* host_depth_store_root_signature_ = nullptr;
ID3D12PipelineState*
host_depth_store_pipelines_[size_t(xenos::MsaaSamples::k4X) + 1] = {};
std::unique_ptr<ui::d3d12::D3D12UploadBufferPool>
transfer_vertex_buffer_pool_;
ID3D12RootSignature* transfer_root_signatures_[size_t(
TransferRootSignatureIndex::kCount)] = {};
std::unordered_map<TransferShaderKey, ID3D12PipelineState*,
TransferShaderKey::Hasher>
transfer_pipelines_;
std::unordered_map<TransferShaderKey, std::array<ID3D12PipelineState*, 8>,
TransferShaderKey::Hasher>
transfer_stencil_bit_pipelines_;
// Temporary storage for PerformTransfersAndResolveClears.
std::vector<TransferInvocation> current_transfer_invocations_;
// Temporary storage for DumpRenderTargets.
std::vector<ResolveCopyDumpRectangle> dump_rectangles_;
std::vector<DumpInvocation> dump_invocations_;
ID3D12RootSignature* dump_root_signature_color_ = nullptr;
ID3D12RootSignature* dump_root_signature_depth_ = nullptr;
// Compute pipelines for copying host render target contents to the EDRAM
// buffer. May be null if failed to create.
std::unordered_map<DumpPipelineKey, ID3D12PipelineState*,
DumpPipelineKey::Hasher>
dump_pipelines_;
// Parameter 0 - 2 root constants (red, green).
ID3D12RootSignature* uint32_rtv_clear_root_signature_ = nullptr;
// [32 or 32_32][MSAA samples].
ID3D12PipelineState*
uint32_rtv_clear_pipelines_[2][size_t(xenos::MsaaSamples::k4X) + 1] = {};
std::vector<Transfer> clear_transfers_[2];
// Temporary storage for DXBC building.
std::vector<uint32_t> built_shader_;
// For rasterizer-ordered view (pixel shader interlock).
static const std::pair<const void*, size_t> kResolveROVClear32bppShaders[3];
static const std::pair<const void*, size_t> kResolveROVClear64bppShaders[3];
ID3D12RootSignature* resolve_rov_clear_root_signature_ = nullptr;
// Clearing 32bpp color or depth.
ID3D12PipelineState* resolve_rov_clear_32bpp_pipeline_ = nullptr;
// Clearing 64bpp color.
ID3D12PipelineState* resolve_rov_clear_64bpp_pipeline_ = nullptr;
};
} // namespace d3d12
} // namespace gpu
} // namespace xe
#endif // XENIA_GPU_D3D12_D3D12_RENDER_TARGET_CACHE_H_

View File

@ -99,7 +99,7 @@ void D3D12Shader::D3D12Translation::DisassembleDxbcAndDxil(
}
Shader::Translation* D3D12Shader::CreateTranslationInstance(
uint32_t modification) {
uint64_t modification) {
return new D3D12Translation(*this, modification);
}

View File

@ -23,7 +23,7 @@ class D3D12Shader : public DxbcShader {
public:
class D3D12Translation : public DxbcTranslation {
public:
D3D12Translation(D3D12Shader& shader, uint32_t modification)
D3D12Translation(D3D12Shader& shader, uint64_t modification)
: DxbcTranslation(shader, modification) {}
void DisassembleDxbcAndDxil(const ui::d3d12::D3D12Provider& provider,
@ -60,7 +60,7 @@ class D3D12Shader : public DxbcShader {
}
protected:
Translation* CreateTranslationInstance(uint32_t modification) override;
Translation* CreateTranslationInstance(uint64_t modification) override;
private:
std::atomic_flag binding_layout_user_uids_set_up_ = ATOMIC_FLAG_INIT;

View File

@ -192,6 +192,11 @@ void D3D12SharedMemory::CompletedSubmissionUpdated() {
upload_buffer_pool_->Reclaim(command_processor_.GetCompletedSubmission());
}
void D3D12SharedMemory::BeginSubmission() {
// ExecuteCommandLists is a full UAV barrier.
buffer_uav_writes_commit_needed_ = false;
}
void D3D12SharedMemory::CommitUAVWritesAndTransitionBuffer(
D3D12_RESOURCE_STATES new_state) {
if (buffer_state_ == new_state) {
@ -421,7 +426,7 @@ bool D3D12SharedMemory::UploadRanges(
return false;
}
MakeRangeValid(upload_range_start << page_size_log2(),
uint32_t(upload_buffer_size), false);
uint32_t(upload_buffer_size), false, false);
std::memcpy(
upload_buffer_mapping,
memory().TranslatePhysical(upload_range_start << page_size_log2()),

View File

@ -43,6 +43,7 @@ class D3D12SharedMemory : public SharedMemory {
}
void CompletedSubmissionUpdated();
void BeginSubmission();
// RequestRange may transition the buffer to copy destination - call it before
// UseForReading or UseForWriting.
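// An illustrative call order implied by the comment above (assumed names
// match the declarations in this header):
//   shared_memory.RequestRange(base_address, length);  // May transition the
//                                                      // buffer to COPY_DEST.
//   shared_memory.UseForReading();  // Then transition for shader reads.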

View File

@ -40,6 +40,23 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
stream += kCommandHeaderSizeElements;
stream_remaining -= kCommandHeaderSizeElements;
switch (header.command) {
case Command::kD3DClearDepthStencilView: {
auto& args =
*reinterpret_cast<const ClearDepthStencilViewHeader*>(stream);
command_list->ClearDepthStencilView(
args.depth_stencil_view, args.clear_flags, args.depth, args.stencil,
args.num_rects,
args.num_rects ? reinterpret_cast<const D3D12_RECT*>(&args + 1)
: nullptr);
} break;
case Command::kD3DClearRenderTargetView: {
auto& args =
*reinterpret_cast<const ClearRenderTargetViewHeader*>(stream);
command_list->ClearRenderTargetView(
args.render_target_view, args.color_rgba, args.num_rects,
args.num_rects ? reinterpret_cast<const D3D12_RECT*>(&args + 1)
: nullptr);
} break;
case Command::kD3DClearUnorderedAccessViewUint: {
auto& args =
*reinterpret_cast<const ClearUnorderedAccessViewHeader*>(stream);
@ -64,11 +81,12 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
auto& args = *reinterpret_cast<const CopyTextureArguments*>(stream);
command_list->CopyTextureRegion(&args.dst, 0, 0, 0, &args.src, nullptr);
} break;
case Command::kCopyTextureRegion: {
case Command::kD3DCopyTextureRegion: {
auto& args =
*reinterpret_cast<const CopyTextureRegionArguments*>(stream);
command_list->CopyTextureRegion(&args.dst, args.dst_x, args.dst_y,
args.dst_z, &args.src, &args.src_box);
*reinterpret_cast<const D3DCopyTextureRegionArguments*>(stream);
command_list->CopyTextureRegion(
&args.dst, args.dst_x, args.dst_y, args.dst_z, &args.src,
args.has_src_box ? &args.src_box : nullptr);
} break;
case Command::kD3DDispatch: {
if (current_pipeline_state != nullptr) {
@ -107,6 +125,17 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
command_list->IASetPrimitiveTopology(
*reinterpret_cast<const D3D12_PRIMITIVE_TOPOLOGY*>(stream));
} break;
case Command::kD3DIASetVertexBuffers: {
static_assert(alignof(D3D12_VERTEX_BUFFER_VIEW) <= alignof(uintmax_t));
auto& args =
*reinterpret_cast<const D3DIASetVertexBuffersHeader*>(stream);
command_list->IASetVertexBuffers(
args.start_slot, args.num_views,
reinterpret_cast<const D3D12_VERTEX_BUFFER_VIEW*>(
reinterpret_cast<const uint8_t*>(stream) +
xe::align(sizeof(D3DIASetVertexBuffersHeader),
alignof(D3D12_VERTEX_BUFFER_VIEW))));
} break;
case Command::kD3DOMSetBlendFactor: {
command_list->OMSetBlendFactor(reinterpret_cast<const FLOAT*>(stream));
} break;

View File

@ -15,6 +15,7 @@
#include <cstring>
#include <vector>
#include "xenia/base/assert.h"
#include "xenia/base/math.h"
#include "xenia/ui/d3d12/d3d12_api.h"
@ -33,6 +34,47 @@ class DeferredCommandList {
void Execute(ID3D12GraphicsCommandList* command_list,
ID3D12GraphicsCommandList1* command_list_1);
D3D12_RECT* ClearDepthStencilViewAllocatedRects(
D3D12_CPU_DESCRIPTOR_HANDLE depth_stencil_view,
D3D12_CLEAR_FLAGS clear_flags, FLOAT depth, UINT8 stencil,
UINT num_rects) {
auto args = reinterpret_cast<ClearDepthStencilViewHeader*>(WriteCommand(
Command::kD3DClearDepthStencilView,
sizeof(ClearDepthStencilViewHeader) + num_rects * sizeof(D3D12_RECT)));
args->depth_stencil_view = depth_stencil_view;
args->clear_flags = clear_flags;
args->depth = depth;
args->stencil = stencil;
args->num_rects = num_rects;
return num_rects ? reinterpret_cast<D3D12_RECT*>(args + 1) : nullptr;
}
void D3DClearDepthStencilView(D3D12_CPU_DESCRIPTOR_HANDLE depth_stencil_view,
D3D12_CLEAR_FLAGS clear_flags, FLOAT depth,
UINT8 stencil, UINT num_rects,
const D3D12_RECT* rects) {
D3D12_RECT* allocated_rects = ClearDepthStencilViewAllocatedRects(
depth_stencil_view, clear_flags, depth, stencil, num_rects);
if (num_rects) {
assert_not_null(allocated_rects);
std::memcpy(allocated_rects, rects, num_rects * sizeof(D3D12_RECT));
}
}
void D3DClearRenderTargetView(D3D12_CPU_DESCRIPTOR_HANDLE render_target_view,
const FLOAT color_rgba[4], UINT num_rects,
const D3D12_RECT* rects) {
auto args = reinterpret_cast<ClearRenderTargetViewHeader*>(WriteCommand(
Command::kD3DClearRenderTargetView,
sizeof(ClearRenderTargetViewHeader) + num_rects * sizeof(D3D12_RECT)));
args->render_target_view = render_target_view;
std::memcpy(args->color_rgba, color_rgba, 4 * sizeof(FLOAT));
args->num_rects = num_rects;
if (num_rects != 0) {
std::memcpy(args + 1, rects, num_rects * sizeof(D3D12_RECT));
}
}
void D3DClearUnorderedAccessViewUint(
D3D12_GPU_DESCRIPTOR_HANDLE view_gpu_handle_in_current_heap,
D3D12_CPU_DESCRIPTOR_HANDLE view_cpu_handle, ID3D12Resource* resource,
@ -79,18 +121,25 @@ class DeferredCommandList {
std::memcpy(&args.src, &src, sizeof(D3D12_TEXTURE_COPY_LOCATION));
}
void CopyTextureRegion(const D3D12_TEXTURE_COPY_LOCATION& dst, UINT dst_x,
UINT dst_y, UINT dst_z,
const D3D12_TEXTURE_COPY_LOCATION& src,
const D3D12_BOX& src_box) {
auto& args = *reinterpret_cast<CopyTextureRegionArguments*>(WriteCommand(
Command::kCopyTextureRegion, sizeof(CopyTextureRegionArguments)));
std::memcpy(&args.dst, &dst, sizeof(D3D12_TEXTURE_COPY_LOCATION));
void D3DCopyTextureRegion(const D3D12_TEXTURE_COPY_LOCATION* dst, UINT dst_x,
UINT dst_y, UINT dst_z,
const D3D12_TEXTURE_COPY_LOCATION* src,
const D3D12_BOX* src_box) {
assert_not_null(dst);
assert_not_null(src);
auto& args = *reinterpret_cast<D3DCopyTextureRegionArguments*>(WriteCommand(
Command::kD3DCopyTextureRegion, sizeof(D3DCopyTextureRegionArguments)));
std::memcpy(&args.dst, dst, sizeof(D3D12_TEXTURE_COPY_LOCATION));
args.dst_x = dst_x;
args.dst_y = dst_y;
args.dst_z = dst_z;
std::memcpy(&args.src, &src, sizeof(D3D12_TEXTURE_COPY_LOCATION));
args.src_box = src_box;
std::memcpy(&args.src, src, sizeof(D3D12_TEXTURE_COPY_LOCATION));
if (src_box) {
args.has_src_box = true;
args.src_box = *src_box;
} else {
args.has_src_box = false;
}
}
void D3DDispatch(UINT thread_group_count_x, UINT thread_group_count_y,
@ -147,6 +196,23 @@ class DeferredCommandList {
arg = primitive_topology;
}
void D3DIASetVertexBuffers(UINT start_slot, UINT num_views,
const D3D12_VERTEX_BUFFER_VIEW* views) {
if (num_views == 0) {
return;
}
static_assert(alignof(D3D12_VERTEX_BUFFER_VIEW) <= alignof(uintmax_t));
const size_t header_size = xe::align(sizeof(D3DIASetVertexBuffersHeader),
alignof(D3D12_VERTEX_BUFFER_VIEW));
auto args = reinterpret_cast<D3DIASetVertexBuffersHeader*>(WriteCommand(
Command::kD3DIASetVertexBuffers,
header_size + num_views * sizeof(D3D12_VERTEX_BUFFER_VIEW)));
args->start_slot = start_slot;
args->num_views = num_views;
std::memcpy(reinterpret_cast<uint8_t*>(args) + header_size, views,
sizeof(D3D12_VERTEX_BUFFER_VIEW) * num_views);
}
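// Resulting stream layout for the command recorded above (illustrative):
//   [CommandHeader][D3DIASetVertexBuffersHeader][padding up to
//   alignof(D3D12_VERTEX_BUFFER_VIEW)][num_views vertex buffer views]
// Execute locates the views with the same xe::align expression, so the writer
// and the reader agree on the amount of padding.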
void D3DOMSetBlendFactor(const FLOAT blend_factor[4]) {
auto args = reinterpret_cast<FLOAT*>(
WriteCommand(Command::kD3DOMSetBlendFactor, 4 * sizeof(FLOAT)));
@ -333,16 +399,19 @@ class DeferredCommandList {
private:
enum class Command {
kD3DClearDepthStencilView,
kD3DClearRenderTargetView,
kD3DClearUnorderedAccessViewUint,
kD3DCopyBufferRegion,
kD3DCopyResource,
kCopyTexture,
kCopyTextureRegion,
kD3DCopyTextureRegion,
kD3DDispatch,
kD3DDrawIndexedInstanced,
kD3DDrawInstanced,
kD3DIASetIndexBuffer,
kD3DIASetPrimitiveTopology,
kD3DIASetVertexBuffers,
kD3DOMSetBlendFactor,
kD3DOMSetRenderTargets,
kD3DOMSetStencilRef,
@ -370,12 +439,26 @@ class DeferredCommandList {
static constexpr size_t kCommandHeaderSizeElements =
(sizeof(CommandHeader) + sizeof(uintmax_t) - 1) / sizeof(uintmax_t);
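// Worked example (illustrative): assuming an 8-byte CommandHeader and an
// 8-byte uintmax_t, kCommandHeaderSizeElements = (8 + 8 - 1) / 8 = 1 - the
// ceiling division guarantees every command's arguments start on a uintmax_t
// boundary regardless of the header size.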
struct ClearDepthStencilViewHeader {
D3D12_CPU_DESCRIPTOR_HANDLE depth_stencil_view;
D3D12_CLEAR_FLAGS clear_flags;
FLOAT depth;
UINT8 stencil;
UINT num_rects;
};
struct ClearRenderTargetViewHeader {
D3D12_CPU_DESCRIPTOR_HANDLE render_target_view;
FLOAT color_rgba[4];
UINT num_rects;
};
struct ClearUnorderedAccessViewHeader {
D3D12_GPU_DESCRIPTOR_HANDLE view_gpu_handle_in_current_heap;
D3D12_CPU_DESCRIPTOR_HANDLE view_cpu_handle;
ID3D12Resource* resource;
union {
float values_float[4];
FLOAT values_float[4];
UINT values_uint[4];
};
UINT num_rects;
@ -399,13 +482,14 @@ class DeferredCommandList {
D3D12_TEXTURE_COPY_LOCATION src;
};
struct CopyTextureRegionArguments {
struct D3DCopyTextureRegionArguments {
D3D12_TEXTURE_COPY_LOCATION dst;
UINT dst_x;
UINT dst_y;
UINT dst_z;
D3D12_TEXTURE_COPY_LOCATION src;
D3D12_BOX src_box;
bool has_src_box;
};
struct D3DDispatchArguments {
@ -429,6 +513,11 @@ class DeferredCommandList {
UINT start_instance_location;
};
struct D3DIASetVertexBuffersHeader {
UINT start_slot;
UINT num_views;
};
struct D3DOMSetRenderTargetsArguments {
uint8_t num_render_target_descriptors;
bool rts_single_handle_to_descriptor_range;

File diff suppressed because it is too large

View File

@ -23,11 +23,13 @@
#include "xenia/base/hash.h"
#include "xenia/base/platform.h"
#include "xenia/base/string_buffer.h"
#include "xenia/base/threading.h"
#include "xenia/gpu/d3d12/d3d12_render_target_cache.h"
#include "xenia/gpu/d3d12/d3d12_shader.h"
#include "xenia/gpu/d3d12/render_target_cache.h"
#include "xenia/gpu/dxbc_shader_translator.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/primitive_processor.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/xenos.h"
#include "xenia/ui/d3d12/d3d12_api.h"
@ -43,10 +45,9 @@ class PipelineCache {
static constexpr size_t kLayoutUIDEmpty = 0;
PipelineCache(D3D12CommandProcessor& command_processor,
const RegisterFile& register_file, bool bindless_resources_used,
bool edram_rov_used,
flags::DepthFloat24Conversion depth_float24_conversion,
uint32_t resolution_scale);
const RegisterFile& register_file,
const D3D12RenderTargetCache& render_target_cache,
bool bindless_resources_used);
~PipelineCache();
bool Initialize();
@ -62,22 +63,28 @@ class PipelineCache {
D3D12Shader* LoadShader(xenos::ShaderType shader_type,
const uint32_t* host_address, uint32_t dword_count);
// Analyze shader microcode on the translator thread.
void AnalyzeShaderUcode(Shader& shader) {
shader.AnalyzeUcode(ucode_disasm_buffer_);
}
// Retrieves the shader modifications for the current state, and returns
// whether they are valid.
bool GetCurrentShaderModifications(
DxbcShaderTranslator::Modification& vertex_shader_modification_out,
DxbcShaderTranslator::Modification& pixel_shader_modification_out) const;
// Translates shaders if needed, also making shader info up to date.
bool EnsureShadersTranslated(D3D12Shader::D3D12Translation* vertex_shader,
D3D12Shader::D3D12Translation* pixel_shader);
// Retrieves the shader modification for the current state. The shader must
// have its microcode analyzed.
DxbcShaderTranslator::Modification GetCurrentVertexShaderModification(
const Shader& shader,
Shader::HostVertexShaderType host_vertex_shader_type) const;
DxbcShaderTranslator::Modification GetCurrentPixelShaderModification(
const Shader& shader) const;
// If draw_util::IsRasterizationPotentiallyDone is false, the pixel shader
// MUST be made nullptr BEFORE calling this!
bool ConfigurePipeline(
D3D12Shader::D3D12Translation* vertex_shader,
D3D12Shader::D3D12Translation* pixel_shader,
xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format,
const RenderTargetCache::PipelineRenderTarget render_targets[5],
const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
uint32_t bound_depth_and_color_render_target_bits,
const uint32_t* bound_depth_and_color_render_targets_formats,
void** pipeline_handle_out, ID3D12RootSignature** root_signature_out);
// Returns a pipeline with deferred creation by its handle. May return nullptr
@ -93,9 +100,7 @@ class PipelineCache {
uint32_t ucode_dword_count : 31;
xenos::ShaderType type : 1;
reg::SQ_PROGRAM_CNTL sq_program_cntl;
static constexpr uint32_t kVersion = 0x20201207;
static constexpr uint32_t kVersion = 0x20201219;
});
// Update PipelineDescription::kVersion if any of the Pipeline* enums are
@ -138,6 +143,8 @@ class PipelineCache {
kNone,
kFront,
kBack,
// Special case, handled via disabling the pixel shader and depth / stencil.
kDisableRasterization,
};
enum class PipelineBlendFactor : uint32_t {
@ -171,10 +178,10 @@ class PipelineCache {
XEPACKEDSTRUCT(PipelineDescription, {
uint64_t vertex_shader_hash;
uint64_t vertex_shader_modification;
// 0 if drawing without a pixel shader.
uint64_t pixel_shader_hash;
uint32_t vertex_shader_modification;
uint32_t pixel_shader_modification;
uint64_t pixel_shader_modification;
int32_t depth_bias;
float depth_bias_slope_scaled;
@ -189,12 +196,12 @@ class PipelineCache {
PipelineCullMode cull_mode : 2; // 9
uint32_t front_counter_clockwise : 1; // 10
uint32_t depth_clip : 1; // 11
uint32_t rov_msaa : 1; // 12
xenos::DepthRenderTargetFormat depth_format : 1; // 13
xenos::CompareFunction depth_func : 3; // 16
uint32_t depth_write : 1; // 17
uint32_t stencil_enable : 1; // 18
uint32_t stencil_read_mask : 8; // 26
xenos::MsaaSamples host_msaa_samples : 2; // 13
xenos::DepthRenderTargetFormat depth_format : 1; // 14
xenos::CompareFunction depth_func : 3; // 17
uint32_t depth_write : 1; // 18
uint32_t stencil_enable : 1; // 19
uint32_t stencil_read_mask : 8; // 27
uint32_t stencil_write_mask : 8; // 8
xenos::StencilOp stencil_front_fail_op : 3; // 11
@ -206,9 +213,9 @@ class PipelineCache {
xenos::StencilOp stencil_back_pass_op : 3; // 29
xenos::CompareFunction stencil_back_func : 3; // 32
PipelineRenderTarget render_targets[4];
PipelineRenderTarget render_targets[xenos::kMaxColorRenderTargets];
static constexpr uint32_t kVersion = 0x20201207;
static constexpr uint32_t kVersion = 0x20210425;
});
XEPACKEDSTRUCT(PipelineStoredDescription, {
@ -223,27 +230,26 @@ class PipelineCache {
PipelineDescription description;
};
// Returns the host vertex shader type for the current draw if it's valid and
// supported, or Shader::HostVertexShaderType(-1) if not.
Shader::HostVertexShaderType GetCurrentHostVertexShaderTypeIfValid() const;
D3D12Shader* LoadShader(xenos::ShaderType shader_type,
const uint32_t* host_address, uint32_t dword_count,
uint64_t data_hash);
// Can be called from multiple threads.
bool TranslateShader(DxbcShaderTranslator& translator,
D3D12Shader::D3D12Translation& translation,
reg::SQ_PROGRAM_CNTL cntl,
IDxbcConverter* dxbc_converter = nullptr,
IDxcUtils* dxc_utils = nullptr,
IDxcCompiler* dxc_compiler = nullptr);
bool TranslateAnalyzedShader(DxbcShaderTranslator& translator,
D3D12Shader::D3D12Translation& translation,
IDxbcConverter* dxbc_converter = nullptr,
IDxcUtils* dxc_utils = nullptr,
IDxcCompiler* dxc_compiler = nullptr);
// If draw_util::IsRasterizationPotentiallyDone is false, the pixel shader
// MUST be made nullptr BEFORE calling this! The shaders must be translated
// and valid.
bool GetCurrentStateDescription(
D3D12Shader::D3D12Translation* vertex_shader,
D3D12Shader::D3D12Translation* pixel_shader,
xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format,
const RenderTargetCache::PipelineRenderTarget render_targets[5],
const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
uint32_t bound_depth_and_color_render_target_bits,
const uint32_t* bound_depth_and_color_render_target_formats,
PipelineRuntimeDescription& runtime_description_out);
ID3D12PipelineState* CreateD3D12Pipeline(
@ -251,13 +257,12 @@ class PipelineCache {
D3D12CommandProcessor& command_processor_;
const RegisterFile& register_file_;
const D3D12RenderTargetCache& render_target_cache_;
bool bindless_resources_used_;
bool edram_rov_used_;
// 20e4 depth conversion mode to use for non-ROV output.
flags::DepthFloat24Conversion depth_float24_conversion_;
uint32_t resolution_scale_;
// Reusable shader translator.
// Temporary storage for AnalyzeUcode calls on the processor thread.
StringBuffer ucode_disasm_buffer_;
// Reusable shader translator for the processor thread.
std::unique_ptr<DxbcShaderTranslator> shader_translator_;
// Command processor thread DXIL conversion/disassembly interfaces, if DXIL
@ -332,8 +337,7 @@ class PipelineCache {
std::condition_variable storage_write_request_cond_;
// Storage thread input is protected with storage_write_request_lock_, and the
// thread is notified about its change via storage_write_request_cond_.
std::deque<std::pair<const Shader*, reg::SQ_PROGRAM_CNTL>>
storage_write_shader_queue_;
std::deque<const Shader*> storage_write_shader_queue_;
std::deque<PipelineStoredDescription> storage_write_pipeline_queue_;
bool storage_write_flush_shaders_ = false;
bool storage_write_flush_pipelines_ = false;

View File

@ -1,762 +0,0 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2018 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include "xenia/gpu/d3d12/primitive_converter.h"
#include <algorithm>
#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/platform.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
#include "xenia/ui/d3d12/d3d12_util.h"
DEFINE_bool(d3d12_convert_quads_to_triangles, false,
"Convert quad lists to triangle lists on the CPU instead of using "
"a geometry shader. Primarily for debugging (PIX fails to display "
"vertices when a geometry shader is used) rather than for playing, "
"as quads can't be discarded correctly this way when the game uses "
"vertex kill functionality.",
"D3D12");
namespace xe {
namespace gpu {
namespace d3d12 {
PrimitiveConverter::PrimitiveConverter(D3D12CommandProcessor& command_processor,
const RegisterFile& register_file,
Memory& memory,
TraceWriter& trace_writer)
: command_processor_(command_processor),
register_file_(register_file),
memory_(memory),
trace_writer_(trace_writer) {
system_page_size_ = uint32_t(memory::page_size());
}
PrimitiveConverter::~PrimitiveConverter() { Shutdown(); }
bool PrimitiveConverter::Initialize() {
auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
auto device = provider.GetDevice();
D3D12_HEAP_FLAGS heap_flag_create_not_zeroed =
provider.GetHeapFlagCreateNotZeroed();
// There can be at most 65535 indices in a Xenos draw call (16 bit index
// count), but they can be up to 4 bytes large, and conversion can add more
// indices (almost triple the count for triangle strips or fans, for
// instance).
buffer_pool_ = std::make_unique<ui::d3d12::D3D12UploadBufferPool>(
provider, std::max(sizeof(uint32_t) * 3 * 65535,
ui::d3d12::D3D12UploadBufferPool::kDefaultPageSize));
// Create the static index buffer for non-indexed drawing.
D3D12_RESOURCE_DESC static_ib_desc;
ui::d3d12::util::FillBufferResourceDesc(
static_ib_desc, kStaticIBTotalCount * sizeof(uint16_t),
D3D12_RESOURCE_FLAG_NONE);
if (FAILED(device->CreateCommittedResource(
&ui::d3d12::util::kHeapPropertiesUpload, heap_flag_create_not_zeroed,
&static_ib_desc, D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
IID_PPV_ARGS(&static_ib_upload_)))) {
XELOGE(
"Failed to create the upload buffer for the primitive conversion "
"static index buffer");
Shutdown();
return false;
}
D3D12_RANGE static_ib_read_range;
static_ib_read_range.Begin = 0;
static_ib_read_range.End = 0;
void* static_ib_mapping;
if (FAILED(static_ib_upload_->Map(0, &static_ib_read_range,
&static_ib_mapping))) {
XELOGE(
"Failed to map the upload buffer for the primitive conversion "
"static index buffer");
Shutdown();
return false;
}
uint16_t* static_ib_data = reinterpret_cast<uint16_t*>(static_ib_mapping);
// Triangle fans as triangle lists.
// https://docs.microsoft.com/en-us/windows/desktop/direct3d9/triangle-fans
// Ordered as (v1, v2, v0), (v2, v3, v0).
uint16_t* static_ib_data_pointer =
&static_ib_data[kStaticIBTriangleFanOffset];
for (uint32_t i = 2; i < kMaxNonIndexedVertices; ++i) {
*(static_ib_data_pointer++) = i - 1;
*(static_ib_data_pointer++) = i;
*(static_ib_data_pointer++) = 0;
}
static_ib_data_pointer = &static_ib_data[kStaticIBQuadOffset];
for (uint32_t i = 0; i < (kMaxNonIndexedVertices >> 2); ++i) {
uint32_t quad_index = i << 2;
*(static_ib_data_pointer++) = quad_index;
*(static_ib_data_pointer++) = quad_index + 1;
*(static_ib_data_pointer++) = quad_index + 2;
*(static_ib_data_pointer++) = quad_index;
*(static_ib_data_pointer++) = quad_index + 2;
*(static_ib_data_pointer++) = quad_index + 3;
}
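// Illustrative contents (not in the original file): for 5 non-indexed
// vertices, the triangle fan part above emits (1, 2, 0), (2, 3, 0), (3, 4, 0),
// and the quad part emits (0, 1, 2), (0, 2, 3) for the first quad - the two
// triangles covering quad vertices 0 through 3.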
static_ib_upload_->Unmap(0, nullptr);
// Not uploaded yet.
static_ib_upload_submission_ = UINT64_MAX;
if (FAILED(device->CreateCommittedResource(
&ui::d3d12::util::kHeapPropertiesDefault, heap_flag_create_not_zeroed,
&static_ib_desc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr,
IID_PPV_ARGS(&static_ib_)))) {
XELOGE("Failed to create the primitive conversion static index buffer");
Shutdown();
return false;
}
static_ib_gpu_address_ = static_ib_->GetGPUVirtualAddress();
memory_regions_invalidated_.store(0ull, std::memory_order_relaxed);
memory_invalidation_callback_handle_ =
memory_.RegisterPhysicalMemoryInvalidationCallback(
MemoryInvalidationCallbackThunk, this);
return true;
}
void PrimitiveConverter::Shutdown() {
if (memory_invalidation_callback_handle_ != nullptr) {
memory_.UnregisterPhysicalMemoryInvalidationCallback(
memory_invalidation_callback_handle_);
memory_invalidation_callback_handle_ = nullptr;
}
ui::d3d12::util::ReleaseAndNull(static_ib_);
ui::d3d12::util::ReleaseAndNull(static_ib_upload_);
buffer_pool_.reset();
}
void PrimitiveConverter::ClearCache() { buffer_pool_->ClearCache(); }
void PrimitiveConverter::CompletedSubmissionUpdated() {
if (static_ib_upload_ && command_processor_.GetCompletedSubmission() >=
static_ib_upload_submission_) {
// Completely uploaded - release the upload buffer.
static_ib_upload_->Release();
static_ib_upload_ = nullptr;
}
}
void PrimitiveConverter::BeginSubmission() {
// Got a command list now - upload and transition the static index buffer if
// needed.
if (static_ib_upload_ && static_ib_upload_submission_ == UINT64_MAX) {
command_processor_.GetDeferredCommandList().D3DCopyResource(
static_ib_, static_ib_upload_);
command_processor_.PushTransitionBarrier(static_ib_,
D3D12_RESOURCE_STATE_COPY_DEST,
D3D12_RESOURCE_STATE_INDEX_BUFFER);
static_ib_upload_submission_ = command_processor_.GetCurrentSubmission();
}
}
void PrimitiveConverter::BeginFrame() {
buffer_pool_->Reclaim(command_processor_.GetCompletedFrame());
converted_indices_cache_.clear();
memory_regions_used_ = 0;
}
xenos::PrimitiveType PrimitiveConverter::GetReplacementPrimitiveType(
xenos::PrimitiveType type) {
switch (type) {
case xenos::PrimitiveType::kTriangleFan:
return xenos::PrimitiveType::kTriangleList;
case xenos::PrimitiveType::kLineLoop:
return xenos::PrimitiveType::kLineStrip;
case xenos::PrimitiveType::kQuadList:
if (cvars::d3d12_convert_quads_to_triangles) {
return xenos::PrimitiveType::kTriangleList;
}
break;
default:
break;
}
return type;
}
PrimitiveConverter::ConversionResult PrimitiveConverter::ConvertPrimitives(
xenos::PrimitiveType source_type, uint32_t address, uint32_t index_count,
xenos::IndexFormat index_format, xenos::Endian index_endianness,
D3D12_GPU_VIRTUAL_ADDRESS& gpu_address_out, uint32_t& index_count_out) {
bool index_32bit = index_format == xenos::IndexFormat::kInt32;
const auto& regs = register_file_;
bool reset = regs.Get<reg::PA_SU_SC_MODE_CNTL>().multi_prim_ib_ena;
// Swap the reset index because we will be comparing unswapped values to it.
uint32_t reset_index = xenos::GpuSwap(
regs[XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX].u32, index_endianness);
// If the specified reset index is the same as the one used by Direct3D 12
// (0xFFFF or 0xFFFFFFFF - in the pipeline cache, we use the former for
// 16-bit and the latter for 32-bit indices), we can use the buffer directly.
uint32_t reset_index_host = index_32bit ? 0xFFFFFFFFu : 0xFFFFu;
// Degenerate line loops are just lines.
if (source_type == xenos::PrimitiveType::kLineLoop && index_count <= 2) {
source_type = xenos::PrimitiveType::kLineStrip;
}
// Check if conversion is needed at all.
if (source_type == xenos::PrimitiveType::kTriangleStrip ||
source_type == xenos::PrimitiveType::kLineStrip) {
if (!reset || reset_index == reset_index_host) {
return ConversionResult::kConversionNotNeeded;
}
} else if (source_type == xenos::PrimitiveType::kQuadList) {
if (!cvars::d3d12_convert_quads_to_triangles) {
return ConversionResult::kConversionNotNeeded;
}
} else if (source_type != xenos::PrimitiveType::kTriangleFan &&
source_type != xenos::PrimitiveType::kLineLoop) {
return ConversionResult::kConversionNotNeeded;
}
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
#endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
// Exit early for clearly empty draws, without even reading the memory.
uint32_t index_count_min;
if (source_type == xenos::PrimitiveType::kLineStrip ||
source_type == xenos::PrimitiveType::kLineLoop) {
index_count_min = 2;
} else if (source_type == xenos::PrimitiveType::kQuadList) {
index_count_min = 4;
} else {
index_count_min = 3;
}
if (index_count < index_count_min) {
return ConversionResult::kPrimitiveEmpty;
}
// Invalidate the cache if data behind any entry was modified.
if (memory_regions_invalidated_.exchange(0ull, std::memory_order_acquire) &
memory_regions_used_) {
converted_indices_cache_.clear();
memory_regions_used_ = 0;
}
address &= index_32bit ? 0x1FFFFFFC : 0x1FFFFFFE;
uint32_t index_size = index_32bit ? sizeof(uint32_t) : sizeof(uint16_t);
uint32_t index_buffer_size = index_size * index_count;
uint32_t address_last = address + index_size * (index_count - 1);
// Create the cache entry, currently only for the key.
ConvertedIndices converted_indices;
converted_indices.key.address = address;
converted_indices.key.source_type = source_type;
converted_indices.key.format = index_format;
converted_indices.key.count = index_count;
converted_indices.key.reset = reset ? 1 : 0;
converted_indices.reset_index = reset_index;
// Try to find the previously converted index buffer.
auto found_range =
converted_indices_cache_.equal_range(converted_indices.key.value);
for (auto iter = found_range.first; iter != found_range.second; ++iter) {
const ConvertedIndices& found_converted = iter->second;
if (reset && found_converted.reset_index != reset_index) {
continue;
}
if (found_converted.converted_index_count == 0) {
return ConversionResult::kPrimitiveEmpty;
}
if (!found_converted.gpu_address) {
return ConversionResult::kConversionNotNeeded;
}
gpu_address_out = found_converted.gpu_address;
index_count_out = found_converted.converted_index_count;
return ConversionResult::kConverted;
}
// Get the memory usage mask for cache invalidation.
// 1 bit = (512 / 64) MB = 8 MB.
uint64_t memory_regions_used_bits = ~((1ull << (address >> 23)) - 1);
if (address_last < (63 << 23)) {
memory_regions_used_bits &= (1ull << ((address_last >> 23) + 1)) - 1;
}
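// Worked example (illustrative): an index buffer spanning physical addresses
// 0x01000000 to 0x017FFFFF lies entirely in 8 MB region 2 (address >> 23 ==
// address_last >> 23 == 2), so the first line sets bits 2 and up, and the
// masking above then clears everything above bit 2, leaving only bit 2 set.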
union {
const void* source;
const uint8_t* source_8;
const uint16_t* source_16;
const uint32_t* source_32;
uintptr_t source_uintptr;
};
source = memory_.TranslatePhysical(address);
// Calculate the new index count, and also check if there's nothing to convert
// in the buffer (for instance, if primitive reset is not actually used).
uint32_t converted_index_count = 0;
bool conversion_needed = false;
bool simd = false;
// Optimization specific to primitive types - if the reset index is not found
// in the source index buffer, this can be set to false to use a faster way of
// copying.
bool reset_actually_used = reset;
if (source_type == xenos::PrimitiveType::kTriangleFan) {
// Triangle fans are not supported by Direct3D 12 at all.
conversion_needed = true;
trace_writer_.WriteMemoryRead(address, index_buffer_size);
if (reset) {
uint32_t current_fan_index_count = 0;
for (uint32_t i = 0; i < index_count; ++i) {
uint32_t index = index_format == xenos::IndexFormat::kInt32
? source_32[i]
: source_16[i];
if (index == reset_index) {
current_fan_index_count = 0;
continue;
}
if (++current_fan_index_count >= 3) {
converted_index_count += 3;
}
}
} else {
converted_index_count = 3 * (index_count - 2);
}
} else if (source_type == xenos::PrimitiveType::kTriangleStrip ||
source_type == xenos::PrimitiveType::kLineStrip) {
converted_index_count = index_count;
// Check if the restart index is used at all in this buffer because reading
// vertices from a default heap is faster than from an upload heap.
conversion_needed = false;
trace_writer_.WriteMemoryRead(address, index_buffer_size);
#if XE_ARCH_AMD64
// Will use SIMD to copy 16-byte blocks using _mm_or_si128.
simd = true;
union {
const void* check_source;
const uint16_t* check_source_16;
const uint32_t* check_source_32;
const __m128i* check_source_128;
uintptr_t check_source_uintptr;
};
check_source = source;
uint32_t check_indices_remaining = index_count;
alignas(16) uint64_t check_result[2];
if (index_format == xenos::IndexFormat::kInt32) {
while (check_indices_remaining != 0 && (check_source_uintptr & 15)) {
--check_indices_remaining;
if (*(check_source_32++) == reset_index) {
conversion_needed = true;
check_indices_remaining = 0;
}
}
__m128i check_reset_index_vector = _mm_set1_epi32(reset_index);
while (check_indices_remaining >= 4) {
check_indices_remaining -= 4;
_mm_store_si128(reinterpret_cast<__m128i*>(&check_result),
_mm_cmpeq_epi32(_mm_load_si128(check_source_128++),
check_reset_index_vector));
if (check_result[0] || check_result[1]) {
conversion_needed = true;
check_indices_remaining = 0;
}
}
while (check_indices_remaining != 0) {
--check_indices_remaining;
if (*(check_source_32++) == reset_index) {
conversion_needed = true;
check_indices_remaining = 0;
}
}
} else {
while (check_indices_remaining != 0 && (check_source_uintptr & 15)) {
--check_indices_remaining;
if (*(check_source_16++) == reset_index) {
conversion_needed = true;
check_indices_remaining = 0;
}
}
__m128i check_reset_index_vector = _mm_set1_epi16(reset_index);
while (check_indices_remaining >= 8) {
check_indices_remaining -= 8;
_mm_store_si128(reinterpret_cast<__m128i*>(&check_result),
_mm_cmpeq_epi16(_mm_load_si128(check_source_128++),
check_reset_index_vector));
if (check_result[0] || check_result[1]) {
conversion_needed = true;
check_indices_remaining = 0;
}
}
while (check_indices_remaining != 0) {
--check_indices_remaining;
if (*(check_source_16++) == reset_index) {
conversion_needed = true;
check_indices_remaining = 0;
}
}
}
#else
if (index_format == xenos::IndexFormat::kInt32) {
for (uint32_t i = 0; i < index_count; ++i) {
if (source_32[i] == reset_index) {
conversion_needed = true;
break;
}
}
} else {
for (uint32_t i = 0; i < index_count; ++i) {
if (source_16[i] == reset_index) {
conversion_needed = true;
break;
}
}
}
#endif // XE_ARCH_AMD64
} else if (source_type == xenos::PrimitiveType::kLineLoop) {
conversion_needed = true;
trace_writer_.WriteMemoryRead(address, index_buffer_size);
if (reset) {
reset_actually_used = false;
uint32_t current_strip_index_count = 0;
for (uint32_t i = 0; i < index_count; ++i) {
uint32_t index = index_format == xenos::IndexFormat::kInt32
? source_32[i]
: source_16[i];
if (index == reset_index) {
reset_actually_used = true;
// Close the loop for strips with more than 2 vertices.
if (current_strip_index_count > 2) {
++converted_index_count;
}
current_strip_index_count = 0;
continue;
}
// At the second vertex of a strip, add both endpoints of the first line;
// every subsequent vertex adds one more index.
if (++current_strip_index_count >= 2) {
converted_index_count += current_strip_index_count == 2 ? 2 : 1;
}
}
} else {
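// For example (a sketch, not in the original code): the loop [a, b, c, d] is
// drawn as the line strip [a, b, c, d, a] - the copy in the conversion phase
// below appends the first index to close the loop.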
converted_index_count = index_count + 1;
}
} else if (source_type == xenos::PrimitiveType::kQuadList) {
conversion_needed = true;
trace_writer_.WriteMemoryRead(address, index_buffer_size);
converted_index_count = (index_count >> 2) * 6;
}
converted_indices.converted_index_count = converted_index_count;
// If there is nothing to convert, store this result so the check won't be
// repeated every time, and exit.
if (!conversion_needed || converted_index_count == 0) {
converted_indices.gpu_address = 0;
converted_indices_cache_.emplace(converted_indices.key.value,
converted_indices);
memory_regions_used_ |= memory_regions_used_bits;
return converted_index_count == 0 ? ConversionResult::kPrimitiveEmpty
: ConversionResult::kConversionNotNeeded;
}
// Convert.
D3D12_GPU_VIRTUAL_ADDRESS gpu_address;
void* target = AllocateIndices(index_format, converted_index_count,
simd ? address & 15 : 0, gpu_address);
if (target == nullptr) {
return ConversionResult::kFailed;
}
if (source_type == xenos::PrimitiveType::kTriangleFan) {
// https://docs.microsoft.com/en-us/windows/desktop/direct3d9/triangle-fans
// Ordered as (v1, v2, v0), (v2, v3, v0).
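// Illustrative expansion (assumed example, not from the original comment):
// a fan over [v0, v1, v2, v3, v4] becomes the triangle list
// (v1, v2, v0), (v2, v3, v0), (v3, v4, v0) - 3 * (5 - 2) = 9 indices.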
if (reset) {
uint32_t current_fan_index_count = 0;
uint32_t current_fan_first_index = 0;
if (index_format == xenos::IndexFormat::kInt32) {
uint32_t* target_32 = reinterpret_cast<uint32_t*>(target);
for (uint32_t i = 0; i < index_count; ++i) {
uint32_t index = source_32[i];
if (index == reset_index) {
current_fan_index_count = 0;
continue;
}
if (current_fan_index_count == 0) {
current_fan_first_index = index;
}
if (++current_fan_index_count >= 3) {
*(target_32++) = source_32[i - 1];
*(target_32++) = index;
*(target_32++) = current_fan_first_index;
}
}
} else {
uint16_t* target_16 = reinterpret_cast<uint16_t*>(target);
for (uint32_t i = 0; i < index_count; ++i) {
uint16_t index = source_16[i];
if (index == reset_index) {
current_fan_index_count = 0;
continue;
}
if (current_fan_index_count == 0) {
current_fan_first_index = index;
}
if (++current_fan_index_count >= 3) {
*(target_16++) = source_16[i - 1];
*(target_16++) = index;
*(target_16++) = uint16_t(current_fan_first_index);
}
}
}
} else {
if (index_format == xenos::IndexFormat::kInt32) {
uint32_t* target_32 = reinterpret_cast<uint32_t*>(target);
for (uint32_t i = 2; i < index_count; ++i) {
*(target_32++) = source_32[i - 1];
*(target_32++) = source_32[i];
*(target_32++) = source_32[0];
}
} else {
uint16_t* target_16 = reinterpret_cast<uint16_t*>(target);
for (uint32_t i = 2; i < index_count; ++i) {
*(target_16++) = source_16[i - 1];
*(target_16++) = source_16[i];
*(target_16++) = source_16[0];
}
}
}
} else if (source_type == xenos::PrimitiveType::kTriangleStrip ||
source_type == xenos::PrimitiveType::kLineStrip) {
#if XE_ARCH_AMD64
// Replace the reset index with the maximum representable value - vector OR
// gives 0 or 0xFFFF/0xFFFFFFFF, which is exactly what is needed.
// Allocations in the target index buffer are aligned with 16-byte
// granularity, and within 16-byte vectors, both the source and the target
// start at the same offset.
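// For instance (sketch, not from the original comment), with 16-bit indices
// and reset_index 0xAAAA: _mm_cmpeq_epi16 yields 0xFFFF for a 0xAAAA lane and
// 0x0000 for a 0x0012 lane, so the OR produces 0xFFFF and 0x0012 respectively
// - reset indices become the 0xFFFF restart value that Direct3D 12 expects,
// while all other indices pass through unchanged.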
union {
const __m128i* source_aligned_128;
uintptr_t source_aligned_uintptr;
};
source_aligned_uintptr = source_uintptr & ~(uintptr_t(15));
union {
__m128i* target_aligned_128;
uintptr_t target_aligned_uintptr;
};
target_aligned_uintptr =
reinterpret_cast<uintptr_t>(target) & ~(uintptr_t(15));
uint32_t vector_count = (address_last >> 4) - (address >> 4) + 1;
if (index_format == xenos::IndexFormat::kInt32) {
__m128i reset_index_vector = _mm_set1_epi32(reset_index);
for (uint32_t i = 0; i < vector_count; ++i) {
__m128i indices_vector = _mm_load_si128(source_aligned_128++);
__m128i indices_are_reset_vector =
_mm_cmpeq_epi32(indices_vector, reset_index_vector);
_mm_store_si128(target_aligned_128++,
_mm_or_si128(indices_vector, indices_are_reset_vector));
}
} else {
__m128i reset_index_vector = _mm_set1_epi16(reset_index);
for (uint32_t i = 0; i < vector_count; ++i) {
__m128i indices_vector = _mm_load_si128(source_aligned_128++);
__m128i indices_are_reset_vector =
_mm_cmpeq_epi16(indices_vector, reset_index_vector);
_mm_store_si128(target_aligned_128++,
_mm_or_si128(indices_vector, indices_are_reset_vector));
}
}
#else
if (index_format == xenos::IndexFormat::kInt32) {
for (uint32_t i = 0; i < index_count; ++i) {
uint32_t index = source_32[i];
reinterpret_cast<uint32_t*>(target)[i] =
index == reset_index ? 0xFFFFFFFFu : index;
}
} else {
for (uint32_t i = 0; i < index_count; ++i) {
uint16_t index = source_16[i];
reinterpret_cast<uint16_t*>(target)[i] =
index == reset_index ? 0xFFFFu : index;
}
}
#endif // XE_ARCH_AMD64
} else if (source_type == xenos::PrimitiveType::kLineLoop) {
if (reset_actually_used) {
uint32_t current_strip_index_count = 0;
uint32_t current_strip_first_index = 0;
if (index_format == xenos::IndexFormat::kInt32) {
uint32_t* target_32 = reinterpret_cast<uint32_t*>(target);
for (uint32_t i = 0; i < index_count; ++i) {
uint32_t index = source_32[i];
if (index == reset_index) {
if (current_strip_index_count > 2) {
*(target_32++) = current_strip_first_index;
}
current_strip_index_count = 0;
continue;
}
if (current_strip_index_count == 0) {
current_strip_first_index = index;
}
++current_strip_index_count;
if (current_strip_index_count >= 2) {
if (current_strip_index_count == 2) {
*(target_32++) = current_strip_first_index;
}
*(target_32++) = index;
}
}
} else {
uint16_t* target_16 = reinterpret_cast<uint16_t*>(target);
for (uint32_t i = 0; i < index_count; ++i) {
uint16_t index = source_16[i];
if (index == reset_index) {
if (current_strip_index_count > 2) {
*(target_16++) = uint16_t(current_strip_first_index);
}
current_strip_index_count = 0;
continue;
}
if (current_strip_index_count == 0) {
current_strip_first_index = index;
}
++current_strip_index_count;
if (current_strip_index_count >= 2) {
if (current_strip_index_count == 2) {
*(target_16++) = uint16_t(current_strip_first_index);
}
*(target_16++) = index;
}
}
}
} else {
std::memcpy(target, source, index_count * index_size);
if (converted_index_count > index_count) {
if (index_format == xenos::IndexFormat::kInt32) {
reinterpret_cast<uint32_t*>(target)[index_count] = source_32[0];
} else {
reinterpret_cast<uint16_t*>(target)[index_count] = source_16[0];
}
}
}
} else if (source_type == xenos::PrimitiveType::kQuadList) {
uint32_t quad_count = index_count >> 2;
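// Each quad (q0, q1, q2, q3) is split into the triangles (q0, q1, q2) and
// (q0, q2, q3) by the loops below.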
if (index_format == xenos::IndexFormat::kInt32) {
uint32_t* target_32 = reinterpret_cast<uint32_t*>(target);
for (uint32_t i = 0; i < quad_count; ++i) {
uint32_t quad_index = i << 2;
*(target_32++) = source_32[quad_index];
*(target_32++) = source_32[quad_index + 1];
*(target_32++) = source_32[quad_index + 2];
*(target_32++) = source_32[quad_index];
*(target_32++) = source_32[quad_index + 2];
*(target_32++) = source_32[quad_index + 3];
}
} else {
uint16_t* target_16 = reinterpret_cast<uint16_t*>(target);
for (uint32_t i = 0; i < quad_count; ++i) {
uint32_t quad_index = i << 2;
*(target_16++) = source_16[quad_index];
*(target_16++) = source_16[quad_index + 1];
*(target_16++) = source_16[quad_index + 2];
*(target_16++) = source_16[quad_index];
*(target_16++) = source_16[quad_index + 2];
*(target_16++) = source_16[quad_index + 3];
}
}
}
// Cache and return the indices.
converted_indices.gpu_address = gpu_address;
converted_indices_cache_.emplace(converted_indices.key.value,
converted_indices);
memory_regions_used_ |= memory_regions_used_bits;
gpu_address_out = gpu_address;
index_count_out = converted_index_count;
return ConversionResult::kConverted;
}
void* PrimitiveConverter::AllocateIndices(
xenos::IndexFormat format, uint32_t count, uint32_t simd_offset,
D3D12_GPU_VIRTUAL_ADDRESS& gpu_address_out) {
if (count == 0) {
return nullptr;
}
uint32_t size =
count * (format == xenos::IndexFormat::kInt32 ? sizeof(uint32_t)
: sizeof(uint16_t));
// 16-align all index data because SIMD is used to replace the reset index
// (without that, 4-alignment would be required anyway to mix 16-bit and
// 32-bit indices in one buffer page).
size = xe::align(size, uint32_t(16));
// Add some space to align SIMD register components the same way in the source
// and the buffer.
simd_offset &= 15;
if (simd_offset != 0) {
size += 16;
}
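// For instance (illustrative, not from the original code), for a source at a
// guest address with simd_offset == 4, 16 extra bytes are requested so the
// returned pointer can be offset by 4 within the 16-aligned allocation,
// keeping source and target bytes in the same SIMD lane positions.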
D3D12_GPU_VIRTUAL_ADDRESS gpu_address;
uint8_t* mapping =
buffer_pool_->Request(command_processor_.GetCurrentFrame(), size, 16,
nullptr, nullptr, &gpu_address);
if (mapping == nullptr) {
XELOGE("Failed to allocate space for {} converted {}-bit vertex indices",
count, format == xenos::IndexFormat::kInt32 ? 32 : 16);
return nullptr;
}
gpu_address_out = gpu_address + simd_offset;
return mapping + simd_offset;
}
std::pair<uint32_t, uint32_t> PrimitiveConverter::MemoryInvalidationCallback(
uint32_t physical_address_start, uint32_t length, bool exact_range) {
// 1 bit = (512 / 64) MB = 8 MB. Invalidate a region of this size.
uint32_t bit_index_first = physical_address_start >> 23;
uint32_t bit_index_last = (physical_address_start + length - 1) >> 23;
uint64_t bits = ~((1ull << bit_index_first) - 1);
if (bit_index_last < 63) {
bits &= (1ull << (bit_index_last + 1)) - 1;
}
memory_regions_invalidated_ |= bits;
return std::make_pair<uint32_t, uint32_t>(0, UINT32_MAX);
}
std::pair<uint32_t, uint32_t>
PrimitiveConverter::MemoryInvalidationCallbackThunk(
void* context_ptr, uint32_t physical_address_start, uint32_t length,
bool exact_range) {
return reinterpret_cast<PrimitiveConverter*>(context_ptr)
->MemoryInvalidationCallback(physical_address_start, length, exact_range);
}
D3D12_GPU_VIRTUAL_ADDRESS PrimitiveConverter::GetStaticIndexBuffer(
xenos::PrimitiveType source_type, uint32_t index_count,
uint32_t& index_count_out) const {
if (index_count > kMaxNonIndexedVertices) {
assert_always();
return D3D12_GPU_VIRTUAL_ADDRESS(0);
}
if (source_type == xenos::PrimitiveType::kTriangleFan) {
index_count_out = (std::max(index_count, uint32_t(2)) - 2) * 3;
return static_ib_gpu_address_ +
kStaticIBTriangleFanOffset * sizeof(uint16_t);
}
if (source_type == xenos::PrimitiveType::kQuadList &&
cvars::d3d12_convert_quads_to_triangles) {
index_count_out = (index_count >> 2) * 6;
return static_ib_gpu_address_ + kStaticIBQuadOffset * sizeof(uint16_t);
}
return D3D12_GPU_VIRTUAL_ADDRESS(0);
}
void PrimitiveConverter::InitializeTrace() {
// Clear the cache so WriteMemoryRead won't be skipped when converting again
// for the trace.
converted_indices_cache_.clear();
memory_regions_used_ = 0;
}
} // namespace d3d12
} // namespace gpu
} // namespace xe

View File

@ -1,189 +0,0 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2018 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_GPU_D3D12_PRIMITIVE_CONVERTER_H_
#define XENIA_GPU_D3D12_PRIMITIVE_CONVERTER_H_
#include <atomic>
#include <memory>
#include <unordered_map>
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"
#include "xenia/memory.h"
#include "xenia/ui/d3d12/d3d12_context.h"
#include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h"
namespace xe {
namespace gpu {
namespace d3d12 {
class D3D12CommandProcessor;
// Index buffer cache for primitive types not natively supported by Direct3D 12:
// - Triangle and line strips with non-0xFFFF/0xFFFFFFFF reset index.
// - Triangle fans.
// - Line loops (only indexed ones - non-indexed are better handled in vertex
// shaders, otherwise a whole index buffer would have to be created for every
// vertex count value).
// - Quad lists (for debugging since geometry shaders break PIX - as an
// alternative to the geometry shader).
class PrimitiveConverter {
public:
PrimitiveConverter(D3D12CommandProcessor& command_processor,
const RegisterFile& register_file, Memory& memory,
TraceWriter& trace_writer);
~PrimitiveConverter();
bool Initialize();
void Shutdown();
void ClearCache();
void CompletedSubmissionUpdated();
void BeginSubmission();
void BeginFrame();
// Returns the primitive type that the original type will be converted to.
static xenos::PrimitiveType GetReplacementPrimitiveType(
xenos::PrimitiveType type);
enum class ConversionResult {
// Converted to a transient buffer.
kConverted,
// Conversion not required - use the index buffer in shared memory.
kConversionNotNeeded,
// No errors, but nothing to render.
kPrimitiveEmpty,
// Total failure of the draw call.
kFailed
};
// Converts an index buffer to the primitive type returned by
// GetReplacementPrimitiveType. If conversion has been performed, the returned
// buffer will be in the GENERIC_READ state (it's in an upload heap). The
// outputs are written only when kConverted is returned. The restart index is
// handled internally from the register values.
ConversionResult ConvertPrimitives(xenos::PrimitiveType source_type,
uint32_t address, uint32_t index_count,
xenos::IndexFormat index_format,
xenos::Endian index_endianness,
D3D12_GPU_VIRTUAL_ADDRESS& gpu_address_out,
uint32_t& index_count_out);
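// Example caller flow (an illustrative sketch - "converter" and the draw
// handling are assumptions, not part of this header):
//   D3D12_GPU_VIRTUAL_ADDRESS gpu_address;
//   uint32_t converted_count;
//   ConversionResult result = converter.ConvertPrimitives(
//       source_type, address, index_count, index_format, index_endianness,
//       gpu_address, converted_count);
//   if (result == ConversionResult::kConverted) {
//     // Draw converted_count indices from gpu_address.
//   } else if (result == ConversionResult::kConversionNotNeeded) {
//     // Use the guest index buffer from the shared memory.
//   }  // kPrimitiveEmpty - skip the draw; kFailed - drop it.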
// Returns the static 16-bit index buffer, in the INDEX_BUFFER state, for
// drawing unsupported primitive types without an index buffer. Returns 0 if
// conversion is not needed (the type can be drawn natively).
D3D12_GPU_VIRTUAL_ADDRESS GetStaticIndexBuffer(
xenos::PrimitiveType source_type, uint32_t index_count,
uint32_t& index_count_out) const;
// Callback for invalidating buffers mid-frame.
std::pair<uint32_t, uint32_t> MemoryInvalidationCallback(
uint32_t physical_address_start, uint32_t length, bool exact_range);
void InitializeTrace();
private:
// simd_offset is source address & 15 - if SIMD is used, the source and the
// target must have the same alignment within one register. 0 is optimal when
// not using SIMD.
void* AllocateIndices(xenos::IndexFormat format, uint32_t count,
uint32_t simd_offset,
D3D12_GPU_VIRTUAL_ADDRESS& gpu_address_out);
static std::pair<uint32_t, uint32_t> MemoryInvalidationCallbackThunk(
void* context_ptr, uint32_t physical_address_start, uint32_t length,
bool exact_range);
D3D12CommandProcessor& command_processor_;
const RegisterFile& register_file_;
Memory& memory_;
TraceWriter& trace_writer_;
std::unique_ptr<ui::d3d12::D3D12UploadBufferPool> buffer_pool_;
// Static index buffers for emulating unsupported primitive types when drawing
// without an index buffer.
// CPU-side, used only for uploading - destroyed once the copy commands have
// been completed.
ID3D12Resource* static_ib_upload_ = nullptr;
uint64_t static_ib_upload_submission_;
// GPU-side - used for drawing.
ID3D12Resource* static_ib_ = nullptr;
D3D12_GPU_VIRTUAL_ADDRESS static_ib_gpu_address_;
// In PM4 draw packets, 16 bits are used for the vertex count.
static constexpr uint32_t kMaxNonIndexedVertices = 65535;
static constexpr uint32_t kStaticIBTriangleFanOffset = 0;
static constexpr uint32_t kStaticIBTriangleFanCount =
(kMaxNonIndexedVertices - 2) * 3;
static constexpr uint32_t kStaticIBQuadOffset =
kStaticIBTriangleFanOffset + kStaticIBTriangleFanCount;
static constexpr uint32_t kStaticIBQuadCount =
(kMaxNonIndexedVertices >> 2) * 6;
static constexpr uint32_t kStaticIBTotalCount =
kStaticIBQuadOffset + kStaticIBQuadCount;
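// (Illustrative arithmetic, not in the original header: the triangle fan part
// is (65535 - 2) * 3 = 196599 indices, the quad part (65535 >> 2) * 6 = 98298,
// so the static buffer holds 294897 16-bit indices, about 576 KB.)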
// The key does not identify the index buffer uniquely - the reset index must
// also be checked if reset is enabled.
union ConvertedIndicesKey {
uint64_t value;
struct {
uint32_t address; // 32
xenos::PrimitiveType source_type : 6; // 38
xenos::IndexFormat format : 1; // 39
uint32_t count : 16; // 55
uint32_t reset : 1; // 56
};
// Clearing the unused bits.
ConvertedIndicesKey() : value(0) {}
ConvertedIndicesKey(const ConvertedIndicesKey& key) : value(key.value) {}
ConvertedIndicesKey& operator=(const ConvertedIndicesKey& key) {
value = key.value;
return *this;
}
bool operator==(const ConvertedIndicesKey& key) const {
return value == key.value;
}
bool operator!=(const ConvertedIndicesKey& key) const {
return value != key.value;
}
};
struct ConvertedIndices {
ConvertedIndicesKey key;
// If reset is enabled, this also must be checked to find cached indices.
uint32_t reset_index;
// Zero GPU address if conversion not needed or the resulting index buffer
// is empty.
D3D12_GPU_VIRTUAL_ADDRESS gpu_address;
// When conversion is not needed, this must be equal to the original index
// count.
uint32_t converted_index_count;
};
// Cache for a single frame.
std::unordered_multimap<uint64_t, ConvertedIndices> converted_indices_cache_;
// Very coarse cache invalidation - if something is modified in an 8 MB
// portion of the physical memory that also contains converted indices, the
// whole cache is invalidated.
uint64_t memory_regions_used_;
std::atomic<uint64_t> memory_regions_invalidated_ = 0;
void* memory_invalidation_callback_handle_ = nullptr;
uint32_t system_page_size_;
};
} // namespace d3d12
} // namespace gpu
} // namespace xe
#endif // XENIA_GPU_D3D12_PRIMITIVE_CONVERTER_H_

File diff suppressed because it is too large

View File

@ -1,574 +0,0 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2018 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_GPU_D3D12_RENDER_TARGET_CACHE_H_
#define XENIA_GPU_D3D12_RENDER_TARGET_CACHE_H_
#include <memory>
#include <unordered_map>
#include "xenia/base/cvar.h"
#include "xenia/gpu/d3d12/d3d12_shader.h"
#include "xenia/gpu/d3d12/d3d12_shared_memory.h"
#include "xenia/gpu/d3d12/texture_cache.h"
#include "xenia/gpu/draw_util.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"
#include "xenia/memory.h"
#include "xenia/ui/d3d12/d3d12_api.h"
#include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h"
DECLARE_bool(d3d12_16bit_rtv_full_range);
namespace xe {
namespace gpu {
namespace d3d12 {
class D3D12CommandProcessor;
// =============================================================================
// How EDRAM is used by Xenos:
// (Copied from the old version of the render target cache, so implementation
// info may differ from the way EDRAM is emulated now.)
// =============================================================================
//
// On the 360 the render target is an opaque block of memory in EDRAM that's
// only accessible via resolves. We use this to our advantage to simulate
// something like it as best we can by having a shared backing memory with
// a multitude of views for each tile location in EDRAM.
//
// This allows us to have the same base address write to the same memory
// regardless of framebuffer format. Resolving then uses whatever format the
// resolve requests straight from the backing memory.
//
// EDRAM is a beast and we only approximate it as best we can. Basically,
// the 10MiB of EDRAM is composed of 2048 5120b tiles. Each tile is 80x16px.
// +-----+-----+-----+---
// |tile0|tile1|tile2|... 2048 times
// +-----+-----+-----+---
// Operations dealing with EDRAM deal in tile offsets, so base 0x100 is tile
// offset 256, 256*5120=1310720b into the buffer. All rendering operations are
// aligned to tiles so trying to draw at 256px wide will have a real width of
// 320px by rounding up to the next tile.
//
// MSAA and other settings will modify the exact pixel sizes, like 4X makes
// each tile effectively 40x8px / 2X makes each tile 80x8px, but they are still
// all 5120b. As we try to emulate this we adjust our viewport when rendering to
// stretch pixels as needed.
//
// It appears that games also take advantage of MSAA stretching tiles when doing
// clears. Games will clear a view with 1/2X pitch/height and 4X MSAA and then
// later draw to that view with 1X pitch/height and 1X MSAA.
//
// The good news is that games cannot read EDRAM directly but must use a copy
// operation to get the data out. That gives us a chance to do whatever we
// need to (re-tile, etc) only when requested.
//
// To approximate the tiled EDRAM layout we use a single large chunk of memory.
// From this memory we create many VkImages (and VkImageViews) of various
// formats and dimensions as requested by the game. These are used as
// attachments during rendering and as sources during copies. They are also
// heavily aliased - lots of images will reference the same locations in the
// underlying EDRAM buffer. The only requirement is that there are no hazards
// with specific tiles (reading/writing the same tile through different images)
// and otherwise it should be ok *fingers crossed*.
//
// One complication is the copy/resolve process itself: we need to give back
// the data asked for in the format desired and where it goes is arbitrary
// (any address in physical memory). If the game is good we get resolves of
// EDRAM into fixed base addresses with scissored regions. If the game is bad
// we are broken.
//
// Resolves from EDRAM result in tiled textures - that's texture tiles, not
// EDRAM tiles. If we wanted to ensure byte-for-byte correctness we'd need to
// then tile the images as we wrote them out. For now, we just attempt to
// get the (X, Y) in linear space and do that. This really comes into play
// when multiple resolves write to the same texture or memory aliased by
// multiple textures - which is common due to predicated tiling. The examples
// below demonstrate what this looks like, but the important thing is that
// we are aware of partial textures and overlapping regions.
//
// Example with multiple render targets:
// Two color targets of 256x256px tightly packed in EDRAM:
// color target 0: base 0x0, pitch 320, scissor 0,0, 256x256
// starts at tile 0, buffer offset 0
// contains 64 tiles (320/80)*(256/16)
// color target 1: base 0x40, pitch 320, scissor 256,0, 256x256
// starts at tile 64 (after color target 0), buffer offset 327680b
// contains 64 tiles
// In EDRAM each set of 64 tiles is contiguous:
// +------+------+ +------+------+------+
// |ct0.0 |ct0.1 |...|ct0.63|ct1.0 |ct1.1 |...
// +------+------+ +------+------+------+
// To render into these, we setup two VkImages:
// image 0: bound to buffer offset 0, 320x256x4=327680b
// image 1: bound to buffer offset 327680b, 320x256x4=327680b
// So when we render to them:
// +------+-+ scissored to 256x256, actually 320x256
// | . | | <- . appears at some untiled offset in the buffer, but
// | | | consistent if aliased with the same format
// +------+-+
// In theory, this gives us proper aliasing in most cases.
//
// Example with horizontal predicated tiling:
// Trying to render 1024x576 @4X MSAA, splitting into two regions
// horizontally:
// +----------+
// | 1024x288 |
// +----------+
// | 1024x288 |
// +----------+
// EDRAM configured for 1056x288px with tile size 2112x576px (4X MSAA):
// color target 0: base 0x0, pitch 1080, 26x36 tiles
// First render (top):
// window offset 0,0
// scissor 0,0, 1024x288
// First resolve (top):
// RB_COPY_DEST_BASE 0x1F45D000
// RB_COPY_DEST_PITCH pitch=1024, height=576
// vertices: 0,0, 1024,0, 1024,288
// Second render (bottom):
// window offset 0,-288
// scissor 0,288, 1024x288
// Second resolve (bottom):
// RB_COPY_DEST_BASE 0x1F57D000 (+1179648b)
// RB_COPY_DEST_PITCH pitch=1024, height=576
// (exactly 1024x288*4b after first resolve)
// vertices: 0,288, 1024,288, 1024,576
// Resolving here is easy as the textures are contiguous in memory. We can
// snoop in the first resolve with the dest height to know the total size,
// and in the second resolve see that it overlaps and place it in the
// existing target.
//
// Example with vertical predicated tiling:
// Trying to render 1280x720 @2X MSAA, splitting into two regions
// vertically:
// +-----+-----+
// | 640 | 640 |
// | x | x |
// | 720 | 720 |
// +-----+-----+
// EDRAM configured for 640x736px with tile size 640x1472px (2X MSAA):
// color target 0: base 0x0, pitch 640, 8x92 tiles
// First render (left):
// window offset 0,0
// scissor 0,0, 640x720
// First resolve (left):
// RB_COPY_DEST_BASE 0x1BC6D000
// RB_COPY_DEST_PITCH pitch=1280, height=720
// vertices: 0,0, 640,0, 640,720
// Second render (right):
// window offset -640,0
// scissor 640,0, 640x720
// Second resolve (right):
// RB_COPY_DEST_BASE 0x1BC81000 (+81920b)
// RB_COPY_DEST_PITCH pitch=1280, height=720
// vertices: 640,0, 1280,0, 1280,720
// Resolving here is much more difficult as resolves are tiled and the right
// half of the texture is 81920b away:
// 81920/4bpp=20480px, /32 (texture tile size)=640px
// We know the texture size with the first resolve and with the second we
// must check for overlap then compute the offset (in both X and Y).
//
// =============================================================================
// Surface size:
// =============================================================================
//
// XGSurfaceSize code in game executables calculates the size in tiles in the
// following order:
// 1) If MSAA is >=2x, multiply the height by 2.
// 2) If MSAA is 4x, multiply the width by 2.
// 3) 80x16-align multisampled width and height.
// 4) Multiply width*height by 4 or 8 depending on the pixel format.
// 5) Divide the byte size by 5120.
// This means that when working with EDRAM surface sizes we should assume that a
// multisampled surface is the same as a single-sampled surface with 2x height
// and width - however, format size doesn't affect the dimensions. Surface pitch
// in the surface info register is single-sampled.
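// Worked example (illustrative, not from the original comment): a 1280x720
// k_8_8_8_8 surface with 2x MSAA. Height * 2 = 1440; 80x16 alignment keeps
// 1280x1440 (1280 = 16 * 80, 1440 = 90 * 16); 1280 * 1440 * 4 = 7372800 bytes;
// 7372800 / 5120 = 1440 tiles.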
//
// =============================================================================
// Rasterizer-ordered view usage:
// =============================================================================
//
// There is a separate output merger emulation path currently in development,
// using rasterizer-ordered views for writing directly to the 10 MB EDRAM buffer
// instead of the host output merger for render target output.
//
// The conventional method of implementing Xenos render targets via host render
// targets has various flaws that may be impossible to fix:
// - k_16_16 and k_16_16_16_16 have -32...32 range on Xenos, but there's no
// equivalent format on PC APIs. They may be emulated using snorm16 (by
// dividing shader color output by 32) or float32, however, blending behaves
// incorrectly for both. In the former case, multiplicative blending may not
// work correctly - 1 becomes 1/32, and instead of 1 * 1 = 1, you get
// 1/32 * 1/32 = 1/1024. For 32-bit floats, additive blending result may go up
// to infinity.
// - k_2_10_10_10_FLOAT has similar blending issues, though less prominent, when
// emulated via float16 render targets. In addition to a greater range for
// RGB (values can go up to 65504 and infinity rather than 31.875), alpha is
// represented totally differently - in k_2_10_10_10_FLOAT, it may have only
// 4 values, and adding, for example, 0.1 to 0.333 will still result in 0.333,
// while with float16, it will be increasing, and the limit is infinity.
// - Due to simultaneously bound host render targets being independent from each
// other, and because the height is unknown (and the viewport and scissor are
// not always present - D3DPT_RECTLIST is used very commonly, especially for
// clearing (Direct3D 9 Clear is implemented this way on the Xbox 360) and
// copying, and it's usually drawn without a viewport and with the scissor of
// the maximum possible size), there may be cases of simultaneously bound
// render targets overlapping each other in the EDRAM in a way that is
// difficult to resolve, and stores/loads may destroy data.
//
// =============================================================================
// 2x width and height scaling implementation:
// =============================================================================
//
// For ease of mapping EDRAM addresses, host pixels (top-left, top-right,
// bottom-left, bottom-right) within EACH GUEST SAMPLE are stored consecutively;
// this means that the address of each sample with 2x2 scaling enabled is 4x
// the address of it without increased resolution - and you only need to add
// (uint(SV_Position.y) * 2u + uint(SV_Position.x)) to the dword/qword index to
// get each of the 4 host pixels for each sample.
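// For instance (a sketch, not from the original comment), the guest sample at
// dword index 100 maps to host dwords 400-403: 400 for the top-left host pixel
// (x = 0, y = 0), 401 for the top-right (x = 1, y = 0), 402 for the
// bottom-left, and 403 for the bottom-right.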
class RenderTargetCache {
public:
// Direct3D 12 debug layer is giving errors that contradict each other when
// you use null RTV descriptors - if you set a valid format in RTVFormats in
// the pipeline state, it says that null descriptors can only be used if the
// format in the pipeline state is DXGI_FORMAT_UNKNOWN, however, if
// DXGI_FORMAT_UNKNOWN is set, it complains that the format in the pipeline
// state doesn't match the RTV format. So we have to make render target
// bindings consecutive and remap the output indices in pixel shaders.
struct PipelineRenderTarget {
uint32_t guest_render_target;
DXGI_FORMAT format;
};
RenderTargetCache(D3D12CommandProcessor& command_processor,
const RegisterFile& register_file,
TraceWriter& trace_writer, bool bindless_resources_used,
bool edram_rov_used);
~RenderTargetCache();
bool Initialize(const TextureCache& texture_cache);
void Shutdown();
void ClearCache();
flags::DepthFloat24Conversion depth_float24_conversion() const {
return depth_float24_conversion_;
}
void CompletedSubmissionUpdated();
void BeginSubmission();
void EndFrame();
// Called in the beginning of a draw call - may bind pipelines and change the
// view descriptor heap.
bool UpdateRenderTargets(const D3D12Shader* pixel_shader);
// Returns the host-to-guest mappings and host formats of currently bound
// render targets for pipeline creation and remapping in shaders. They are
// consecutive, and format DXGI_FORMAT_UNKNOWN terminates the list. Depth
// format is in the 5th render target.
const PipelineRenderTarget* GetCurrentPipelineRenderTargets() const {
return current_pipeline_render_targets_;
}
// Performs the resolve to a shared memory area according to the current
// register values, and also clears the EDRAM buffer if needed. Must be in a
// frame for calling.
bool Resolve(const Memory& memory, D3D12SharedMemory& shared_memory,
TextureCache& texture_cache, uint32_t& written_address_out,
uint32_t& written_length_out);
// Flushes the render targets to EDRAM and unbinds them, for instance, when
// the command processor takes over framebuffer bindings to draw something
// special. May change the CBV/SRV/UAV descriptor heap.
void FlushAndUnbindRenderTargets();
void WriteEdramRawSRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle);
void WriteEdramRawUAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle);
void WriteEdramUintPow2SRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle,
uint32_t element_size_bytes_pow2);
void WriteEdramUintPow2UAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle,
uint32_t element_size_bytes_pow2);
// Totally necessary to rely on the base format - Too Human switches between
// 2_10_10_10_FLOAT and 2_10_10_10_FLOAT_AS_16_16_16_16 every draw.
static xenos::ColorRenderTargetFormat GetBaseColorFormat(
xenos::ColorRenderTargetFormat format);
static DXGI_FORMAT GetColorDXGIFormat(xenos::ColorRenderTargetFormat format);
// Nvidia may have higher performance with 24-bit depth, AMD should have no
// performance difference, but with EDRAM loads/stores less conversion should
// be performed by the shaders if D24S8 is emulated as D24_UNORM_S8_UINT, and
// it's probably more accurate.
static DXGI_FORMAT GetDepthDXGIFormat(xenos::DepthRenderTargetFormat format) {
return format == xenos::DepthRenderTargetFormat::kD24FS8
? DXGI_FORMAT_D32_FLOAT_S8X24_UINT
: DXGI_FORMAT_D24_UNORM_S8_UINT;
}
// Returns true if any downloads were submitted to the command processor.
bool InitializeTraceSubmitDownloads();
void InitializeTraceCompleteDownloads();
void RestoreEdramSnapshot(const void* snapshot);
private:
enum class EdramLoadStoreMode {
kColor32bpp,
kColor64bpp,
kColor7e3,
kDepthUnorm,
kDepthFloat,
kDepthFloat24And32,
kCount
};
struct EdramLoadStoreModeInfo {
const void* load_shader;
size_t load_shader_size;
const WCHAR* load_pipeline_name;
const void* store_shader;
size_t store_shader_size;
const WCHAR* store_pipeline_name;
};
union RenderTargetKey {
struct {
// Supersampled (_ss - scaled 2x if needed) dimensions, divided by 80x16.
// The limit is 2560x2560 without AA, 2560x5120 with 2x AA, and 5120x5120
// with 4x AA, and twice as much (up to 10240x10240) with 2x resolution
// scaling.
uint32_t width_ss_div_80 : 8; // 8
uint32_t height_ss_div_16 : 10; // 18
uint32_t is_depth : 1; // 19
uint32_t format : 4; // 23
};
uint32_t value;
// Clearing the unused bits.
RenderTargetKey() : value(0) {}
RenderTargetKey(const RenderTargetKey& key) : value(key.value) {}
RenderTargetKey& operator=(const RenderTargetKey& key) {
value = key.value;
return *this;
}
bool operator==(const RenderTargetKey& key) const {
return value == key.value;
}
bool operator!=(const RenderTargetKey& key) const {
return value != key.value;
}
};
struct RenderTarget {
ID3D12Resource* resource;
D3D12_RESOURCE_STATES state;
D3D12_CPU_DESCRIPTOR_HANDLE handle;
RenderTargetKey key;
#if 0
// The first 4 MB page in the heaps.
uint32_t heap_page_first;
// The number of 4 MB pages this render target uses.
uint32_t heap_page_count;
#else
// Index of the render target when multiple render targets with the same key
// are bound simultaneously.
uint32_t instance;
#endif
// Color/depth and stencil layouts.
D3D12_PLACED_SUBRESOURCE_FOOTPRINT footprints[2];
// Buffer size needed to copy the render target to the EDRAM buffer.
uint32_t copy_buffer_size;
};
struct RenderTargetBinding {
// Whether this render target has been used since the last full update.
bool is_bound;
uint32_t edram_base;
// How many 16-pixel rows have already been drawn to the render target since
// the last full update.
uint32_t edram_dirty_rows;
union {
uint32_t format;
xenos::ColorRenderTargetFormat color_format;
xenos::DepthRenderTargetFormat depth_format;
};
RenderTarget* render_target;
};
uint32_t GetEdramBufferSize() const;
void TransitionEdramBuffer(D3D12_RESOURCE_STATES new_state);
void CommitEdramBufferUAVWrites(bool force);
void ClearBindings();
#if 0
// Checks if the heap for the render target exists and tries to create it if
// it's not.
bool MakeHeapResident(uint32_t heap_index);
#endif
// Creates a new RTV/DSV descriptor heap if needed to be able to allocate one
// descriptor in it.
bool EnsureRTVHeapAvailable(bool is_depth);
// Returns true if a render target with such key can be created.
static bool GetResourceDesc(RenderTargetKey key, D3D12_RESOURCE_DESC& desc);
#if 0
RenderTarget* FindOrCreateRenderTarget(RenderTargetKey key,
uint32_t heap_page_first);
#else
RenderTarget* FindOrCreateRenderTarget(RenderTargetKey key,
uint32_t instance);
#endif
EdramLoadStoreMode GetLoadStoreMode(bool is_depth, uint32_t format) const;
// Must be in a frame to call. Stores the dirty areas of the currently bound
// render targets and marks them as clean.
void StoreRenderTargetsToEdram();
// Must be in a frame to call. Loads the render targets from the EDRAM buffer,
// filling all the rows the render target can hold.
void LoadRenderTargetsFromEdram(uint32_t render_target_count,
RenderTarget* const* render_targets,
const uint32_t* edram_bases);
D3D12CommandProcessor& command_processor_;
const RegisterFile& register_file_;
TraceWriter& trace_writer_;
bool bindless_resources_used_;
bool edram_rov_used_;
// 20e4 depth conversion mode to use for non-ROV output.
flags::DepthFloat24Conversion depth_float24_conversion_;
// Whether 1 guest pixel is rendered as 2x2 host pixels (currently only
// supported with ROV).
bool resolution_scale_2x_ = false;
// The EDRAM buffer allowing color and depth data to be reinterpreted.
ID3D12Resource* edram_buffer_ = nullptr;
D3D12_RESOURCE_STATES edram_buffer_state_;
// Whether there have been any outstanding UAV writes and a UAV barrier is
// needed before accessing the EDRAM buffer in an unordered way again.
bool edram_buffer_modified_ = false;
// Non-shader-visible descriptor heap containing pre-created SRV and UAV
// descriptors of the EDRAM buffer, for faster binding (via copying rather
// than creation).
enum class EdramBufferDescriptorIndex : uint32_t {
kRawSRV,
kR32UintSRV,
kR32G32UintSRV,
kR32G32B32A32UintSRV,
kRawUAV,
kR32UintUAV,
kR32G32B32A32UintUAV,
kCount,
};
ID3D12DescriptorHeap* edram_buffer_descriptor_heap_ = nullptr;
D3D12_CPU_DESCRIPTOR_HANDLE edram_buffer_descriptor_heap_start_;
// EDRAM root signatures.
ID3D12RootSignature* edram_load_store_root_signature_ = nullptr;
struct EdramLoadStoreRootConstants {
uint32_t rt_color_depth_offset;
uint32_t rt_color_depth_pitch;
uint32_t rt_stencil_offset;
uint32_t rt_stencil_pitch;
// 0:10 - EDRAM base in tiles.
// 11 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA.
// 12 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA.
// 13 - whether 2x resolution scale is used.
// 14 - whether to apply the hack and duplicate the top/left
// half-row/half-column to reduce the impact of half-pixel offset with
// 2x resolution scale (obsolete since the move to the new resolve
// code).
// 15 - whether it's a depth render target.
// 16: - EDRAM pitch in tiles.
uint32_t base_samples_2x_depth_pitch;
};
// EDRAM pipelines for the RTV/DSV path.
static const EdramLoadStoreModeInfo
edram_load_store_mode_info_[size_t(EdramLoadStoreMode::kCount)];
ID3D12PipelineState*
edram_load_pipelines_[size_t(EdramLoadStoreMode::kCount)] = {};
// Store pipelines are not created with ROV.
ID3D12PipelineState*
edram_store_pipelines_[size_t(EdramLoadStoreMode::kCount)] = {};
// Resolve root signatures and pipelines.
ID3D12RootSignature* resolve_copy_root_signature_ = nullptr;
static const std::pair<const uint8_t*, size_t>
resolve_copy_shaders_[size_t(draw_util::ResolveCopyShaderIndex::kCount)];
ID3D12PipelineState* resolve_copy_pipelines_[size_t(
draw_util::ResolveCopyShaderIndex::kCount)] = {};
ID3D12RootSignature* resolve_clear_root_signature_ = nullptr;
// Clearing 32bpp color, depth with ROV, or unorm depth without ROV.
ID3D12PipelineState* resolve_clear_32bpp_pipeline_ = nullptr;
// Clearing 64bpp color.
ID3D12PipelineState* resolve_clear_64bpp_pipeline_ = nullptr;
// Clearing float depth without ROV, both the float24 and the host float32
// versions.
ID3D12PipelineState* resolve_clear_depth_24_32_pipeline_ = nullptr;
// FIXME(Triang3l): Investigate what's wrong with placed RTV/DSV aliasing on
// Nvidia Maxwell 1st generation and older.
#if 0
// 48 MB heaps backing used render targets resources, created when needed.
// 24 MB proved to be not enough to store a single render target occupying the
// entire EDRAM - a 32-bit depth/stencil one - at some resolution.
// But we also need more than 32 MB to be able to resolve the entire EDRAM
// into a k_32_32_32_32_FLOAT texture.
// TODO(Triang3l): With 2x resolution scale, render targets can take 4x more
// memory - won't fit in this heap size. Resolution scale support was added
// when placed resources already have been disabled, however.
ID3D12Heap* heaps_[5] = {};
static constexpr uint32_t kHeap4MBPages = 12;
#endif
static constexpr uint32_t kRenderTargetDescriptorHeapSize = 2048;
// Descriptor heap, for linear allocation of heaps and descriptors.
struct RenderTargetDescriptorHeap {
ID3D12DescriptorHeap* heap;
D3D12_CPU_DESCRIPTOR_HANDLE start_handle;
// When descriptors_used is >= kRenderTargetDescriptorHeapSize, a new heap
// must be allocated and linked to the one that became full now.
uint32_t descriptors_used;
RenderTargetDescriptorHeap* previous;
};
RenderTargetDescriptorHeap* descriptor_heaps_color_ = nullptr;
RenderTargetDescriptorHeap* descriptor_heaps_depth_ = nullptr;
std::unordered_multimap<uint32_t, RenderTarget*> render_targets_;
uint32_t current_surface_pitch_ = 0;
xenos::MsaaSamples current_msaa_samples_ = xenos::MsaaSamples::k1X;
// current_edram_max_rows_ is for RTV/DSV only (render target texture size).
uint32_t current_edram_max_rows_ = 0;
RenderTargetBinding current_bindings_[5] = {};
bool apply_to_command_list_ = true;
PipelineRenderTarget current_pipeline_render_targets_[5];
// For traces only.
ID3D12Resource* edram_snapshot_download_buffer_ = nullptr;
std::unique_ptr<ui::d3d12::D3D12UploadBufferPool>
edram_snapshot_restore_pool_;
};
} // namespace d3d12
} // namespace gpu
} // namespace xe
#endif // XENIA_GPU_D3D12_RENDER_TARGET_CACHE_H_

File diff suppressed because it is too large

View File

@ -10,18 +10,24 @@
#ifndef XENIA_GPU_D3D12_TEXTURE_CACHE_H_
#define XENIA_GPU_D3D12_TEXTURE_CACHE_H_
#include <array>
#include <atomic>
#include <cstring>
#include <unordered_map>
#include <utility>
#include <vector>
#include "xenia/base/assert.h"
#include "xenia/base/hash.h"
#include "xenia/base/mutex.h"
#include "xenia/gpu/d3d12/d3d12_shader.h"
#include "xenia/gpu/d3d12/d3d12_shared_memory.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/texture_info.h"
#include "xenia/gpu/texture_util.h"
#include "xenia/gpu/xenos.h"
#include "xenia/ui/d3d12/d3d12_api.h"
#include "xenia/ui/d3d12/d3d12_provider.h"
namespace xe {
namespace gpu {
@ -37,7 +43,7 @@ class D3D12CommandProcessor;
// found in game executables explaining the valid usage of BaseAddress when
// streaming the largest LOD (it says games should not use 0 as the base address
// when the largest LOD isn't loaded, but rather, either allocate a valid
// address for it or make it the same as MipAddress):
// address for it or make it the same as mip_address):
// - If the texture has a base address, but no mip address, it's not mipmapped -
// the host texture has only the largest level too.
// - If the texture has different non-zero base address and mip address, a host
@ -51,77 +57,59 @@ class D3D12CommandProcessor;
// the mip address, a mipmapped texture is created, but min/max LOD is clamped
// to the lower bound of 1 - the game is expected to do that anyway until the
// largest LOD is loaded.
// TODO(Triang3l): Check if there are any games with BaseAddress==MipAddress
// but min or max LOD being 0, especially check Modern Warfare 2/3.
// TODO(Triang3l): Attach the largest LOD to existing textures with a valid
// MipAddress but no BaseAddress to save memory because textures are streamed
// this way anyway.
// TODO(Triang3l): Attach the largest LOD to existing textures with a valid
// mip_address but no base ever used yet (no base_address) to save memory
// because textures are streamed this way anyway.
class TextureCache {
union TextureKey {
struct {
// Physical 4 KB page with the base mip level, disregarding A/C/E address
// range prefix.
uint32_t base_page : 17; // 17 total
xenos::DataDimension dimension : 2; // 19
uint32_t width : 13; // 32
struct TextureKey {
// Physical 4 KB page with the base mip level, disregarding A/C/E address
// range prefix.
uint32_t base_page : 17; // 17 total
xenos::DataDimension dimension : 2; // 19
uint32_t width : 13; // 32
uint32_t height : 13; // 45
uint32_t tiled : 1; // 46
uint32_t packed_mips : 1; // 47
// Physical 4 KB page with mip 1 and smaller.
uint32_t mip_page : 17; // 64
uint32_t height : 13; // 45
uint32_t tiled : 1; // 46
uint32_t packed_mips : 1; // 47
// Physical 4 KB page with mip 1 and smaller.
uint32_t mip_page : 17; // 64
// Layers for stacked and 3D, 6 for cube, 1 for other dimensions.
uint32_t depth : 10; // 74
uint32_t pitch : 9; // 83
uint32_t mip_max_level : 4; // 87
xenos::TextureFormat format : 6; // 93
xenos::Endian endianness : 2; // 95
// Whether this texture is signed and has a different host representation
// than an unsigned view of the same guest texture.
uint32_t signed_separate : 1; // 96
// Whether this texture is a 2x-scaled resolve target.
uint32_t scaled_resolve : 1; // 97
// Layers for stacked and 3D, 6 for cube, 1 for other dimensions.
uint32_t depth : 10; // 74
uint32_t mip_max_level : 4; // 78
xenos::TextureFormat format : 6; // 84
xenos::Endian endianness : 2; // 86
// Whether this texture is signed and has a different host representation
// than an unsigned view of the same guest texture.
uint32_t signed_separate : 1; // 87
// Whether this texture is a 2x-scaled resolve target.
uint32_t scaled_resolve : 1; // 88
};
struct {
// The key used for unordered_multimap lookup. Single uint32_t instead of
// a uint64_t so XXH hash can be calculated in a stable way due to no
// padding.
uint32_t map_key[2];
// The key used to identify one texture within unordered_multimap buckets.
uint32_t bucket_key;
};
TextureKey() { MakeInvalid(); }
TextureKey(const TextureKey& key) {
SetMapKey(key.GetMapKey());
bucket_key = key.bucket_key;
std::memcpy(this, &key, sizeof(*this));
}
TextureKey& operator=(const TextureKey& key) {
SetMapKey(key.GetMapKey());
bucket_key = key.bucket_key;
std::memcpy(this, &key, sizeof(*this));
return *this;
}
bool operator==(const TextureKey& key) const {
return GetMapKey() == key.GetMapKey() && bucket_key == key.bucket_key;
}
bool operator!=(const TextureKey& key) const {
return GetMapKey() != key.GetMapKey() || bucket_key != key.bucket_key;
}
uint64_t GetMapKey() const {
return uint64_t(map_key[0]) | (uint64_t(map_key[1]) << 32);
}
void SetMapKey(uint64_t key) {
map_key[0] = uint32_t(key);
map_key[1] = uint32_t(key >> 32);
}
bool IsInvalid() const {
// Zero base and zero width is enough for a binding to be invalid.
return map_key[0] == 0;
// Zero size is enough for a binding to be invalid (not possible on the
// real GPU since dimensions minus 1 are stored).
return !width;
}
void MakeInvalid() {
// Reset all for a stable hash.
SetMapKey(0);
bucket_key = 0;
// Zero everything, including the padding, for a stable hash.
std::memset(this, 0, sizeof(*this));
}
using Hasher = xe::hash::XXHasher<TextureKey>;
bool operator==(const TextureKey& key) const {
return !std::memcmp(this, &key, sizeof(*this));
}
bool operator!=(const TextureKey& key) const { return !(*this == key); }
};
public:
@ -168,16 +156,18 @@ class TextureCache {
};
TextureCache(D3D12CommandProcessor& command_processor,
const RegisterFile& register_file, bool bindless_resources_used,
D3D12SharedMemory& shared_memory);
const RegisterFile& register_file,
D3D12SharedMemory& shared_memory, bool bindless_resources_used,
uint32_t draw_resolution_scale);
~TextureCache();
bool Initialize(bool edram_rov_used);
bool Initialize();
void Shutdown();
void ClearCache();
void TextureFetchConstantWritten(uint32_t index);
void BeginSubmission();
void BeginFrame();
void EndFrame();
@ -196,19 +186,29 @@ class TextureCache {
bool AreActiveTextureSRVKeysUpToDate(
const TextureSRVKey* keys,
const D3D12Shader::TextureBinding* host_shader_bindings,
uint32_t host_shader_binding_count) const;
size_t host_shader_binding_count) const;
// Exports the current binding data to texture SRV keys so they can be stored
// for checking whether subsequent draw calls can keep using the same
// bindings. Write host_shader_binding_count keys.
void WriteActiveTextureSRVKeys(
TextureSRVKey* keys,
const D3D12Shader::TextureBinding* host_shader_bindings,
uint32_t host_shader_binding_count) const;
size_t host_shader_binding_count) const;
// Returns the post-swizzle signedness of a currently bound texture (must be
// called after RequestTextures).
uint8_t GetActiveTextureSwizzledSigns(uint32_t index) const {
return texture_bindings_[index].swizzled_signs;
}
bool IsActiveTextureResolved(uint32_t index) const {
const TextureBinding& binding = texture_bindings_[index];
if (binding.texture && binding.texture->IsResolved()) {
return true;
}
if (binding.texture_signed && binding.texture_signed->IsResolved()) {
return true;
}
return false;
}
void WriteActiveTextureBindfulSRV(
const D3D12Shader::TextureBinding& host_shader_binding,
D3D12_CPU_DESCRIPTOR_HANDLE handle);
@ -221,26 +221,37 @@ class TextureCache {
D3D12_CPU_DESCRIPTOR_HANDLE handle) const;
void MarkRangeAsResolved(uint32_t start_unscaled, uint32_t length_unscaled);
bool IsResolutionScale2X() const { return scaled_resolve_buffer_ != nullptr; }
ID3D12Resource* GetScaledResolveBuffer() const {
return scaled_resolve_buffer_;
}
// Ensures the buffer tiles backing the range are resident.
bool EnsureScaledResolveBufferResident(uint32_t start_unscaled,
uint32_t length_unscaled);
void UseScaledResolveBufferForReading();
void UseScaledResolveBufferForWriting();
void MarkScaledResolveBufferUAVWritesCommitNeeded() {
if (scaled_resolve_buffer_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
scaled_resolve_buffer_uav_writes_commit_needed_ = true;
static uint32_t GetMaxDrawResolutionScale(
const ui::d3d12::D3D12Provider& provider) {
// 31 because 2 GB buffers are used.
if (provider.GetTiledResourcesTier() < D3D12_TILED_RESOURCES_TIER_1 ||
provider.GetVirtualAddressBitsPerResource() < 31) {
return 1;
}
return kMaxDrawResolutionScale;
}
uint32_t GetDrawResolutionScale() const { return draw_resolution_scale_; }
// Ensures the tiles backing the range in the buffers are allocated.
bool EnsureScaledResolveMemoryCommitted(uint32_t start_unscaled,
uint32_t length_unscaled);
// Makes the specified range of up to 1-2 GB currently accessible on the GPU.
// One draw call can access only at most one range - the same memory is
// accessible through different buffers based on the range needed, so aliasing
// barriers are required.
bool MakeScaledResolveRangeCurrent(uint32_t start_unscaled,
uint32_t length_unscaled);
// These functions create a view of the range specified in the last successful
// MakeScaledResolveRangeCurrent call because that function must be called
// before this.
void CreateCurrentScaledResolveRangeUintPow2SRV(
D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2);
void CreateCurrentScaledResolveRangeUintPow2UAV(
D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2);
void TransitionCurrentScaledResolveRange(D3D12_RESOURCE_STATES new_state);
void MarkCurrentScaledResolveRangeUAVWritesCommitNeeded() {
assert_true(draw_resolution_scale_ > 1);
GetCurrentScaledResolveBuffer().SetUAVBarrierPending();
}
// Can't address more than 512 MB on Nvidia, so an offset is required.
void CreateScaledResolveBufferUintPow2UAV(D3D12_CPU_DESCRIPTOR_HANDLE handle,
uint32_t guest_address_bytes,
uint32_t guest_length_bytes,
uint32_t element_size_bytes_pow2);
// Returns the ID3D12Resource of the front buffer texture (in
// PIXEL_SHADER_RESOURCE state), or nullptr in case of failure, and writes the
@ -251,6 +262,8 @@ class TextureCache {
xenos::TextureFormat& format_out);
private:
static constexpr uint32_t kMaxDrawResolutionScale = 3;
enum class LoadMode {
k8bpb,
k16bpb,
@ -281,7 +294,82 @@ class TextureCache {
kUnknown = kCount
};
struct LoadModeInfo {
struct LoadShaderInfo {
// Rules of data access in load shaders:
// - Source reading (from the shared memory or the scaled resolve buffer):
// - Guest data may be stored in a sparsely-allocated buffer, or, in
// Direct3D 12 terms, a tiled buffer. This means that some regions of
// the buffer may not be mapped. On tiled resources tier 1 hardware,
// accessing unmapped tiles results in undefined behavior, including a
// GPU page fault and device removal. So, shaders must not try to access
// potentially unmapped regions (that are outside the texture memory
// extents calculated on the CPU, taking into account that Xenia can't
// overestimate texture sizes freely since it must not try to upload
// unallocated pages on the CPU).
// - Buffer tiles have 64 KB size on Direct3D 12. Vulkan has its own
// alignment requirements for sparse binding. But overall, we're
// allocating pretty large regions.
// - Resolution scaling disabled:
// - Shared memory allocates regions of power of two sizes that map
// directly to the same portions of the 512 MB of the console's
// physical memory. So, a 64 KB-aligned host buffer region is also 64
// KB-aligned in the guest address space.
// - Tiled textures: 32x32x4-block tiles are always resident each as a
// whole. If the width is bigger than the pitch, the overflowing
// 32x32x4 tiles are also loaded as entire tiles. We do not have
// separate shaders for 2D and 3D. So, for tiled textures, it's safe
// to consider that if any location within a 32x32-aligned portion is
// within the texture bounds, the entire 32x32 portion also can be
// read.
// - Linear textures: Pitch is aligned to 256 bytes. Row count, however,
// is not aligned to anything (unless the mip tail is being loaded).
// The overflowing last row in case `width > pitch`, however, is made
// resident up to the last texel in it. But row start alignment is
// 256, which is a power of two, and is smaller than the Direct3D 12
// tile size of 64 KB. So, if any block within a 256-aligned region is
// within the texture bounds, without resolution scaling, reading from
// any location in that 256-aligned region is safe.
// - Since we use the same shaders for tiled and linear textures (as
// well as 1D textures), this means that without resolution scaling,
// it's safe to access a min(256 bytes, 32 blocks)-aligned portion
// along X, but only within the same row of blocks, with bounds
// checking only for such portion as a whole, but without additional
// bounds checking inside of it.
// - Therefore, it's recommended that shaders read power-of-two amounts
// of blocks (so there will naturally be some alignment to some power
// of two), and this way, each thread may read at most 16 16bpb blocks
// or at most 32 8bpb or smaller blocks within a single
// `if (x < width)` check for the whole aligned range of the same length.
// - Resolution scaling enabled:
// - For simplicity, unlike in the shared memory, buffer tile boundaries
// are not aligned to powers of 2 the same way as guest addresses are.
// While for 2x resolution scaling it still happens to be the case
//     because `host address = guest address << 2`, for 3x, it's not - a
// 64 KB host tile would represent 7281.777 guest bytes (though we
// scale texels, not bytes, but that's what it would be for k_8
// textures).
// - The above would affect the `width > pitch` case for linear
// textures, requiring overestimating the width in calculation of the
// range of the tiles to map, while not doing this overestimation on
// the guest memory extent calculation side (otherwise it may result
// in attempting to upload unallocated memory on the CPU). For
//     example, let's take a look at an extreme case of a 369x28 k_8 texture
// with pitch of 256 bytes. The last row, in guest memory, would be
// loaded from the [7168, 7281) range, or, with 3x3 resolution
// scaling, from bytes [64512, 65529). However, if we try to
// unconditionally load 2 pixels, like the texture is 370x28, we will
// be accessing the bytes [64512, 65538). But bytes 65536 and 65537
//     will be in another 64 KB tile, which may not be mapped yet.
// However, none of this is an issue for one simple reason - resolving
// is only possible to tiled textures, so linear textures will never
// be resolution-scaled.
// - Tiled textures have potentially referenced guest 32x32-block tiles
// loaded in their entirety. So, just like for unscaled textures, if
// any block within a tile is available, the entire tile is as well.
// - Destination writing (to the linear buffer):
// - host_x_blocks_per_thread specifies how many pixels can be written
// without bounds checking within increments of that amount - the pitch
// of the destination buffer is manually overaligned if needed.
const void* shader;
size_t shader_size;
// Log2 of the sizes, in bytes, of the source (guest) SRV and the
@ -289,11 +377,15 @@ class TextureCache {
// may copy multiple blocks per one invocation.
uint32_t srv_bpe_log2;
uint32_t uav_bpe_log2;
// Optional shader for loading 2x-scaled resolve targets.
const void* shader_2x;
size_t shader_2x_size;
uint32_t srv_bpe_log2_2x;
uint32_t uav_bpe_log2_2x;
// Number of guest blocks (or texels for uncompressed) along X axis written
// by every compute shader thread - rows in the upload buffer are padded to
// at least this amount.
uint32_t host_x_blocks_per_thread;
};
struct LoadModeInfo {
// For different drawing resolution scales.
LoadShaderInfo shaders[kMaxDrawResolutionScale];
};
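// Illustrative helper (added as a sketch, not part of the original diff): the
// longest safe run of blocks, in bytes, that load shaders may access with a
// single bounds check without resolution scaling, following the
// min(256 bytes, 32 blocks) rule described in LoadShaderInfo. For example,
// 32 bytes for 8bpb (32 blocks) and 256 bytes for 128bpb (16 blocks).
static constexpr uint32_t GetUnscaledSafeXRunBytes(uint32_t bytes_per_block) {
  return bytes_per_block * 32 <= 256 ? bytes_per_block * 32 : 256;
}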
struct HostFormat {
@ -341,24 +433,19 @@ class TextureCache {
ID3D12Resource* resource;
uint64_t resource_size;
D3D12_RESOURCE_STATES state;
// Whether the most up-to-date base / mips contain pages with data from a
// resolve operation (rather than from the CPU or memexport), primarily for
// choosing between piecewise linear gamma and sRGB when the former is
// emulated with the latter.
bool base_resolved;
bool mips_resolved;
uint64_t last_usage_frame;
uint64_t last_usage_time;
Texture* used_previous;
Texture* used_next;
// Byte size of the top guest mip level.
uint32_t base_size;
// Byte size of mips between 1 and key.mip_max_level, containing all array
// slices.
uint32_t mip_size;
// Offsets of all the array slices on a mip level relative to mips_address
// (0 for mip 0, it's relative to base_address then, and for mip 1).
uint32_t mip_offsets[14];
// Byte sizes of an array slice on each mip level.
uint32_t slice_sizes[14];
// Row pitches on each mip level (for linear layout mainly).
uint32_t pitches[14];
texture_util::TextureGuestLayout guest_layout;
// For bindful - indices in the non-shader-visible descriptor cache for
// copying to the shader-visible heap (much faster than recreating, which,
@ -375,6 +462,14 @@ class TextureCache {
bool base_in_sync;
// Whether the recent mip data has been loaded from the memory.
bool mips_in_sync;
bool IsResolved() const { return base_resolved || mips_resolved; }
uint32_t GetGuestBaseSize() const {
return guest_layout.base.level_data_extent_bytes;
}
uint32_t GetGuestMipsSize() const {
return guest_layout.mips_total_extent_bytes;
}
};
struct SRVDescriptorCachePage {
@ -385,24 +480,24 @@ class TextureCache {
struct LoadConstants {
// vec4 0.
uint32_t is_tiled_3d_endian;
// Base offset in bytes.
uint32_t guest_base;
// For linear textures - row byte pitch.
uint32_t guest_pitch;
// In blocks - and for mipmaps, it's also power-of-two-aligned.
uint32_t guest_storage_width_height[2];
uint32_t guest_offset;
// For tiled textures - row pitch in blocks, aligned to 32.
// For linear textures - row pitch in bytes.
uint32_t guest_pitch_aligned;
// For 3D textures only (ignored otherwise) - aligned to 32.
uint32_t guest_z_stride_block_rows_aligned;
// vec4 1.
// If this is a packed mip tail, this is aligned to tile dimensions.
uint32_t size_blocks[3];
uint32_t is_3d_endian;
// Base offset in bytes.
uint32_t host_offset;
// vec4 2.
// Base offset in bytes.
uint32_t host_base;
uint32_t host_pitch;
uint32_t height_texels;
static constexpr uint32_t kGuestPitchTiled = UINT32_MAX;
};
struct TextureBinding {
@ -427,6 +522,66 @@ class TextureCache {
}
};
static uint32_t GetMaxHostTextureWidthHeight(xenos::DataDimension dimension) {
switch (dimension) {
case xenos::DataDimension::k1D:
case xenos::DataDimension::k2DOrStacked:
// 1D and 2D are emulated as 2D arrays.
return D3D12_REQ_TEXTURE2D_U_OR_V_DIMENSION;
case xenos::DataDimension::k3D:
return D3D12_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
case xenos::DataDimension::kCube:
return D3D12_REQ_TEXTURECUBE_DIMENSION;
default:
assert_unhandled_case(dimension);
return 0;
}
}
static uint32_t GetMaxHostTextureDepth(xenos::DataDimension dimension) {
switch (dimension) {
case xenos::DataDimension::k1D:
case xenos::DataDimension::k2DOrStacked:
// 1D and 2D are emulated as 2D arrays.
return D3D12_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION;
case xenos::DataDimension::k3D:
return D3D12_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
case xenos::DataDimension::kCube:
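// Round down to a whole number of cubes (6 faces per cube).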
return D3D12_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION / 6 * 6;
default:
assert_unhandled_case(dimension);
return 0;
}
}
class ScaledResolveVirtualBuffer {
public:
ScaledResolveVirtualBuffer(ID3D12Resource* resource,
D3D12_RESOURCE_STATES resource_state)
: resource_(resource), resource_state_(resource_state) {}
ID3D12Resource* resource() const { return resource_.Get(); }
D3D12_RESOURCE_STATES SetResourceState(D3D12_RESOURCE_STATES new_state) {
D3D12_RESOURCE_STATES old_state = resource_state_;
if (old_state == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
uav_barrier_pending_ = false;
}
resource_state_ = new_state;
return old_state;
}
// After writing through a UAV.
void SetUAVBarrierPending() {
if (resource_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
uav_barrier_pending_ = true;
}
}
// After an aliasing barrier (which is even stronger than a UAV barrier).
void ClearUAVBarrierPending() { uav_barrier_pending_ = false; }
private:
Microsoft::WRL::ComPtr<ID3D12Resource> resource_;
D3D12_RESOURCE_STATES resource_state_;
bool uav_barrier_pending_ = false;
};
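// A hedged usage sketch of the bookkeeping above (call sites assumed, not
// taken from this diff): after dispatching a compute shader that writes
// through a UAV of the buffer, call SetUAVBarrierPending(); before the next
// UAV access while still in D3D12_RESOURCE_STATE_UNORDERED_ACCESS, issue a
// D3D12_RESOURCE_BARRIER_TYPE_UAV barrier rather than a state transition and
// call ClearUAVBarrierPending(); leaving the UAV state through
// SetResourceState also drops the pending flag.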
// Whether the signed version of the texture has a different representation on
// the host than its unsigned version (for example, if it's a fixed-point
// texture emulated with a larger host pixel format).
@ -522,6 +677,42 @@ class TextureCache {
// an error.
void ClearBindings();
size_t GetScaledResolveBufferCount() const {
assert_true(draw_resolution_scale_ > 1);
// Make sure any range up to 1 GB is accessible through 1 or 2 buffers.
// 2x2 scale buffers - just one 2 GB buffer for all 2 GB.
// 3x3 scale buffers - 4 buffers:
// +0.0 +0.5 +1.0 +1.5 +2.0 +2.5 +3.0 +3.5 +4.0 +4.5
// |___________________|___________________|
// |___________________|______________|
// Buffer N has an offset of N * 1 GB in the scaled resolve address space.
// The logic is:
// - 2 GB can be accessed through a [0 GB ... 2 GB) buffer - only need one.
// - 2.1 GB needs [0 GB ... 2 GB) and [1 GB ... 2.1 GB) - two buffers.
// - 3 GB needs [0 GB ... 2 GB) and [1 GB ... 3 GB) - two buffers.
// - 3.1 GB needs [0 GB ... 2 GB), [1 GB ... 3 GB) and [2 GB ... 3.1 GB) -
// three buffers.
uint64_t address_space_size =
uint64_t(SharedMemory::kBufferSize) *
(draw_resolution_scale_ * draw_resolution_scale_);
return size_t((address_space_size - 1) >> 30);
}
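// Sanity checks of the formula above (added for illustration, not in the
// original diff; SharedMemory::kBufferSize is the 512 MB guest space):
// 2x2 scaling (2 GB) needs one buffer, 3x3 (4.5 GB) needs four.
static_assert(((uint64_t(SharedMemory::kBufferSize) * 2 * 2 - 1) >> 30) == 1,
              "2x2-scaled space is covered by a single 2 GB buffer");
static_assert(((uint64_t(SharedMemory::kBufferSize) * 3 * 3 - 1) >> 30) == 4,
              "3x3-scaled space needs four overlapping 2 GB buffers");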
// Returns indices of two scaled resolve virtual buffers that the location in
// memory may be accessible through. Both may be the same if the location is
// near the beginning or the end of the address space and is represented by
// only one buffer.
std::array<size_t, 2> GetPossibleScaledResolveBufferIndices(
uint64_t address_scaled) const {
assert_true(draw_resolution_scale_ > 1);
size_t address_gb = size_t(address_scaled >> 30);
size_t max_index = GetScaledResolveBufferCount() - 1;
// In different cases for 3x3:
// +0.0 +0.5 +1.0 +1.5 +2.0 +2.5 +3.0 +3.5 +4.0 +4.5
// |12________2________|1_________2________|
// |1_________2________|1_________12__|
return std::array<size_t, 2>{
std::min(address_gb, max_index),
std::min(std::max(address_gb, size_t(1)) - size_t(1), max_index)};
}
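// Usage example (values for 3x3 scaling, illustrative): a location 1.5 GB
// into the scaled space lies both in buffer 1 ([1 GB ... 3 GB)) and in
// buffer 0 ([0 GB ... 2 GB)):
//   GetPossibleScaledResolveBufferIndices(uint64_t(3) << 29) == {1, 0}
// so aliasing barriers may be needed against either owner.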
// Checks if there are any pages that contain scaled resolve data within the
// range.
bool IsRangeScaledResolved(uint32_t start_unscaled, uint32_t length_unscaled);
@ -534,6 +725,18 @@ class TextureCache {
void ScaledResolveGlobalWatchCallback(uint32_t address_first,
uint32_t address_last,
bool invalidated_by_gpu);
// The index is also the gigabyte offset of the buffer from the start of the
// scaled physical memory address space.
size_t GetCurrentScaledResolveBufferIndex() const {
return scaled_resolve_1gb_buffer_indices_
[scaled_resolve_current_range_start_scaled_ >> 30];
}
ScaledResolveVirtualBuffer& GetCurrentScaledResolveBuffer() {
ScaledResolveVirtualBuffer* scaled_resolve_buffer =
scaled_resolve_2gb_buffers_[GetCurrentScaledResolveBufferIndex()];
assert_not_null(scaled_resolve_buffer);
return *scaled_resolve_buffer;
}
static const HostFormat host_formats_[64];
@ -541,16 +744,16 @@ class TextureCache {
D3D12CommandProcessor& command_processor_;
const RegisterFile& register_file_;
bool bindless_resources_used_;
D3D12SharedMemory& shared_memory_;
bool bindless_resources_used_;
static const LoadModeInfo load_mode_info_[];
ID3D12RootSignature* load_root_signature_ = nullptr;
ID3D12PipelineState* load_pipelines_[size_t(LoadMode::kCount)] = {};
// Load pipelines for 2x-scaled resolved targets.
ID3D12PipelineState* load_pipelines_2x_[size_t(LoadMode::kCount)] = {};
// Load pipelines for resolution-scaled resolve targets.
ID3D12PipelineState* load_pipelines_scaled_[size_t(LoadMode::kCount)] = {};
std::unordered_multimap<uint64_t, Texture*> textures_;
std::unordered_map<TextureKey, Texture*, TextureKey::Hasher> textures_;
uint64_t textures_total_size_ = 0;
Texture* texture_used_first_ = nullptr;
Texture* texture_used_last_ = nullptr;
@ -592,37 +795,73 @@ class TextureCache {
};
uint8_t unsupported_format_features_used_[64];
// The 2 GB tiled buffer for resolved data with 2x resolution scale.
static constexpr uint32_t kScaledResolveBufferSizeLog2 = 31;
static constexpr uint32_t kScaledResolveBufferSize =
1u << kScaledResolveBufferSizeLog2;
ID3D12Resource* scaled_resolve_buffer_ = nullptr;
D3D12_RESOURCE_STATES scaled_resolve_buffer_state_ =
D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
bool scaled_resolve_buffer_uav_writes_commit_needed_ = false;
uint32_t draw_resolution_scale_ = 1;
// The tiled buffer for resolved data with resolution scaling.
// Because on Direct3D 12 (at least on Windows 10 2004) typed SRV or UAV
// creation fails for offsets above 4 GB, a single tiled 4.5 GB buffer can't
// be used for 3x resolution scaling.
// Instead, "sliding window" buffers that allow accessing a single range of up
// to 1 GB (or up to 2 GB, depending on the low bits) at any moment are used.
// Parts of 4.5 GB address space can be accessed through 2 GB buffers as:
// +0.0 +0.5 +1.0 +1.5 +2.0 +2.5 +3.0 +3.5 +4.0 +4.5
// |___________________|___________________| or
// |___________________|______________|
// (2 GB is also the amount of scaled physical memory with 2x resolution
// scale, and older Intel GPUs, while supporting tiled resources, only support
// 31
// virtual address bits per resource).
// Index is first gigabyte. Only including buffers containing over 1 GB
// (because otherwise the data will be fully contained in another).
// Size is calculated the same as in GetScaledResolveBufferCount.
ScaledResolveVirtualBuffer*
scaled_resolve_2gb_buffers_[(uint64_t(SharedMemory::kBufferSize) *
(kMaxDrawResolutionScale *
kMaxDrawResolutionScale) -
1) >>
30] = {};
// Not very big heaps (16 MB) because they are needed pretty sparsely. One
// scaled 1280x720x32bpp texture is slightly bigger than 14 MB.
// 2x-scaled 1280x720x32bpp texture is slightly bigger than 14 MB.
static constexpr uint32_t kScaledResolveHeapSizeLog2 = 24;
static constexpr uint32_t kScaledResolveHeapSize =
1 << kScaledResolveHeapSizeLog2;
uint32_t(1) << kScaledResolveHeapSizeLog2;
static_assert(
(kScaledResolveHeapSize % D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES) == 0,
"Scaled resolve heap size must be a multiple of Direct3D tile size");
static_assert(
kScaledResolveHeapSizeLog2 <= SharedMemory::kBufferSizeLog2,
"Scaled resolve heaps are assumed to be wholly mappable irrespective of "
"resolution scale, never truncated, for example, if the scaled resolve "
"address space is 4.5 GB, but the heap size is 1 GB");
static_assert(
kScaledResolveHeapSizeLog2 <= 30,
"Scaled resolve heaps are assumed to only be wholly mappable to up to "
"two 2 GB buffers");
// Resident portions of the tiled buffer.
ID3D12Heap* scaled_resolve_heaps_[kScaledResolveBufferSize >>
kScaledResolveHeapSizeLog2] = {};
std::vector<ID3D12Heap*> scaled_resolve_heaps_;
// Number of currently resident portions of the tiled buffer, for profiling.
uint32_t scaled_resolve_heap_count_ = 0;
// Global watch for scaled resolve data invalidation.
SharedMemory::GlobalWatchHandle scaled_resolve_global_watch_handle_ = nullptr;
// Current scaled resolve state.
// For aliasing barrier placement, the last owning buffer index for each 1 GB.
size_t
scaled_resolve_1gb_buffer_indices_[(uint64_t(SharedMemory::kBufferSize) *
kMaxDrawResolutionScale *
kMaxDrawResolutionScale +
((uint32_t(1) << 30) - 1)) >>
30];
// Range used in the last successful MakeScaledResolveRangeCurrent call.
uint64_t scaled_resolve_current_range_start_scaled_;
uint64_t scaled_resolve_current_range_length_scaled_;
xe::global_critical_region global_critical_region_;
// Bit vector storing whether each 4 KB physical memory page contains scaled
// resolve data. uint32_t rather than uint64_t because parts of it are sent to
// shaders.
// resolve data. uint32_t rather than uint64_t because parts of it can be sent
// to shaders.
uint32_t* scaled_resolve_pages_ = nullptr;
// Second level of the bit vector for faster rejection of non-scaled textures.
uint64_t scaled_resolve_pages_l2_[(512 << 20) >> (12 + 5 + 6)];
// >> 12 for 4 KB pages, >> 5 for uint32_t level 1 bits, >> 6 for uint64_t
// level 2 bits.
uint64_t scaled_resolve_pages_l2_[SharedMemory::kBufferSize >> (12 + 5 + 6)];
};
} // namespace d3d12

File diff suppressed because it is too large Load Diff

View File

@ -16,6 +16,7 @@
#include "xenia/base/assert.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"
#include "xenia/memory.h"
@ -33,16 +34,144 @@ namespace draw_util {
// for use with the top-left rasterization rule later.
int32_t FloatToD3D11Fixed16p8(float f32);
// Polygonal primitive types (not including points and lines) are rasterized as
// triangles, have front and back faces, and also support face culling and fill
// modes (polymode_front_ptype, polymode_back_ptype). Other primitive types are
// always "front" (but don't support front face and back face culling, according
// to OpenGL and Vulkan specifications - even if glCullFace is
// GL_FRONT_AND_BACK, points and lines are still drawn), and may in some cases
// use the "para" registers instead of "front" or "back" (for "parallelogram" -
// like poly_offset_para_enable).
constexpr bool IsPrimitivePolygonal(bool vgt_output_path_is_tessellation_enable,
xenos::PrimitiveType type) {
if (vgt_output_path_is_tessellation_enable &&
(type == xenos::PrimitiveType::kTrianglePatch ||
type == xenos::PrimitiveType::kQuadPatch)) {
// For patch primitive types, the major mode is always explicit, so just
// checking if VGT_OUTPUT_PATH_CNTL::path_select is kTessellationEnable is
// enough.
return true;
}
switch (type) {
case xenos::PrimitiveType::kTriangleList:
case xenos::PrimitiveType::kTriangleFan:
case xenos::PrimitiveType::kTriangleStrip:
case xenos::PrimitiveType::kTriangleWithWFlags:
case xenos::PrimitiveType::kQuadList:
case xenos::PrimitiveType::kQuadStrip:
case xenos::PrimitiveType::kPolygon:
return true;
default:
break;
}
// TODO(Triang3l): Investigate how kRectangleList should be treated - possibly
// actually drawn as two polygons on the console, however, the current
// geometry shader doesn't care about the winding order - allowing backface
// culling for rectangles currently breaks Gears of War 2.
return false;
}
inline bool IsPrimitivePolygonal(const RegisterFile& regs) {
return IsPrimitivePolygonal(
regs.Get<reg::VGT_OUTPUT_PATH_CNTL>().path_select ==
xenos::VGTOutputPath::kTessellationEnable,
regs.Get<reg::VGT_DRAW_INITIATOR>().prim_type);
}
// Whether with the current state, any samples to rasterize (for any reason, not
// only to write something to a render target, but also to do sample counting or
// pixel shader memexport) can be generated. Dropping draw calls entirely can
// only be done if the vertex shader doesn't memexport. Checks mostly special
// cases (for both the guest and usual host implementations), not everything
// like whether viewport / scissor are empty (until this truly matters in any
// game, of course).
bool IsRasterizationPotentiallyDone(const RegisterFile& regs,
bool primitive_polygonal);
// Direct3D 10.1+ standard sample positions, also used in Vulkan, for
// calculations related to host MSAA, in 1/16th of a pixel.
extern const int8_t kD3D10StandardSamplePositions2x[2][2];
extern const int8_t kD3D10StandardSamplePositions4x[4][2];
inline reg::RB_DEPTHCONTROL GetDepthControlForCurrentEdramMode(
const RegisterFile& regs) {
xenos::ModeControl edram_mode = regs.Get<reg::RB_MODECONTROL>().edram_mode;
if (edram_mode != xenos::ModeControl::kColorDepth &&
edram_mode != xenos::ModeControl::kDepth) {
// Both depth and stencil disabled (EDRAM depth and stencil ignored).
reg::RB_DEPTHCONTROL disabled;
disabled.value = 0;
return disabled;
}
return regs.Get<reg::RB_DEPTHCONTROL>();
}
constexpr float GetD3D10PolygonOffsetScale(
xenos::DepthRenderTargetFormat depth_format, bool float24_as_0_to_0_5) {
if (depth_format == xenos::DepthRenderTargetFormat::kD24S8) {
return float(1 << 24);
}
// 20 explicit + 1 implicit (1.) mantissa bits.
// 2^20 is not enough for Call of Duty 4 retail version's first mission F.N.G.
// shooting range floor (with the number 1) on Direct3D 12. Tested on Nvidia
// GeForce GTX 1070, the exact formula (taking into account the 0...1 to
// 0...0.5 remapping described below) used for testing is
// `int(ceil(offset * 2^20 * 0.5)) * sign(offset)`. With 2^20 * 0.5, there
// are various kinds of stripes depending on the view angle in that
// location. With 2^21 * 0.5, the issue is not present.
constexpr float kFloat24Scale = float(1 << 21);
// 0...0.5 range may be used on the host to represent the 0...1 guest depth
// range to be able to copy all possible encodings, which are [0, 2), via a
// [0, 1] depth output variable, during EDRAM contents reinterpretation.
// This is done by scaling the viewport depth bounds by 0.5. However, the
// depth bias is applied after the viewport. This adjustment is only needed
// for the constant bias - for slope-scaled, the derivatives of Z are
// calculated after the viewport as well, and will already include the 0.5
// scaling from the viewport.
return float24_as_0_to_0_5 ? kFloat24Scale * 0.5f : kFloat24Scale;
}
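// Hypothetical application of the scale (call site assumed, not part of this
// header): converting the guest's floating-point polygon offset into a
// Direct3D 10-style integer depth bias for the rasterizer state:
//   float scale =
//       GetD3D10PolygonOffsetScale(depth_format, float24_as_0_to_0_5);
//   int32_t depth_bias = int32_t(guest_polygon_offset * scale);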
inline bool DoesCoverageDependOnAlpha(reg::RB_COLORCONTROL rb_colorcontrol) {
return (rb_colorcontrol.alpha_test_enable &&
rb_colorcontrol.alpha_func != xenos::CompareFunction::kAlways) ||
rb_colorcontrol.alpha_to_mask_enable;
}
// Whether the pixel shader can be disabled on the host to speed up depth
// pre-passes and shadowmaps. The shader must have its ucode analyzed. If
// IsRasterizationPotentiallyDone returns false, this shouldn't be called, and
// the result should be assumed to be false instead. Helps reject the pixel
// shader in some cases - memexport draws in Halo 3, and also many 1-point
// draws covering nothing, done for some reason in different games with a
// leftover pixel shader from the previous draw, but with SQ_PROGRAM_CNTL
// destroyed, reducing the number of
// unpredictable unneeded translations of random shaders with different host
// modification bits, such as register count and depth format-related (though
// shaders with side effects on depth or memory export will still be preserved).
bool IsPixelShaderNeededWithRasterization(const Shader& shader,
const RegisterFile& regs);
struct ViewportInfo {
// The returned viewport will always be in the positive quarter-plane for
// simplicity of clamping to the maximum size supported by the host, negative
// offset will be applied via ndc_offset.
float left;
float top;
float width;
float height;
// Offset from render target UV = 0 to +UV.
// For simplicity of cropping to the maximum size on the host; to match the
// Direct3D 12 clipping / scissoring behavior with a fractional viewport, to
// floor(TopLeftXY) ... floor(TopLeftXY + WidthHeight), on the real AMD, Intel
// and Nvidia hardware (not WARP); as well as to hide the differences between
// 0 and 8+ viewportSubPixelBits on Vulkan, and to prevent any numerical error
// in bounds checking in host APIs, viewport bounds are returned as integers.
// They're also returned as non-negative, again to make cropping easier (so
// Vulkan maxViewportDimensions and viewportBoundsRange don't have to be
// handled separately - maxViewportDimensions is greater than or equal to the
// largest framebuffer image size, so it's safe, and viewportBoundsRange is
// always bigger than maxViewportDimensions). All fractional offsetting,
// including the half-pixel offset, and cropping are handled via ndc_scale and
// ndc_offset.
uint32_t xy_offset[2];
// Extent can be zero for an empty viewport - host APIs not supporting empty
// viewports need to use an empty scissor rectangle.
uint32_t xy_extent[2];
float z_min;
float z_max;
// The scale is applied before the offset (like using multiply-add).
float ndc_scale[3];
float ndc_offset[3];
};
@ -50,19 +179,31 @@ struct ViewportInfo {
// a viewport, plus values to multiply-add the returned position by, usable on
// host graphics APIs such as Direct3D 11+ and Vulkan, also forcing it to the
// Direct3D clip space with 0...W Z rather than -W...W.
void GetHostViewportInfo(const RegisterFile& regs, float pixel_size_x,
float pixel_size_y, bool origin_bottom_left,
float x_max, float y_max, bool allow_reverse_z,
bool convert_z_to_float24,
void GetHostViewportInfo(const RegisterFile& regs, uint32_t resolution_scale,
bool origin_bottom_left, uint32_t x_max,
uint32_t y_max, bool allow_reverse_z,
bool convert_z_to_float24, bool full_float24_in_0_to_1,
bool pixel_shader_writes_depth,
ViewportInfo& viewport_info_out);
struct Scissor {
uint32_t left;
uint32_t top;
uint32_t width;
uint32_t height;
// Offset from render target UV = 0 to +UV.
uint32_t offset[2];
// Extent can be zero.
uint32_t extent[2];
};
void GetScissor(const RegisterFile& regs, Scissor& scissor_out);
void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
bool clamp_to_surface_pitch = true);
// Scales, and shift amounts of the upper 32 bits of the 32x32=64-bit
// multiplication result, for fast division and multiplication by
// EDRAM-tile-related amounts.
constexpr uint32_t kDivideScale3 = 0xAAAAAAABu;
constexpr uint32_t kDivideUpperShift3 = 1;
constexpr uint32_t kDivideScale5 = 0xCCCCCCCDu;
constexpr uint32_t kDivideUpperShift5 = 2;
constexpr uint32_t kDivideScale15 = 0x88888889u;
constexpr uint32_t kDivideUpperShift15 = 3;
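// Reference use of the constants above (illustrative helper, not part of the
// original header): standard unsigned magic-number division, usable where
// integer division is slow or unavailable.
constexpr uint32_t DivideBy3(uint32_t x) {
  return uint32_t((uint64_t(x) * kDivideScale3) >> 32) >> kDivideUpperShift3;
}
static_assert(DivideBy3(7281) == 2427, "7281 / 3");
static_assert(DivideBy3(0xFFFFFFFFu) == 0xFFFFFFFFu / 3u, "exact for all x");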
// To avoid passing values that the shader won't understand (even though
// Direct3D 9 shouldn't pass them anyway).
@ -75,11 +216,11 @@ xenos::CopySampleSelect SanitizeCopySampleSelect(
union ResolveEdramPackedInfo {
struct {
// With offset to the 160x32 region that local_x/y_div_8 are relative to,
// and with 32bpp/64bpp taken into account.
// With 32bpp/64bpp taken into account.
uint32_t pitch_tiles : xenos::kEdramPitchTilesBits;
xenos::MsaaSamples msaa_samples : xenos::kMsaaSamplesBits;
uint32_t is_depth : 1;
// With offset to the 160x32 region that local_x/y_div_8 are relative to.
uint32_t base_tiles : xenos::kEdramBaseTilesBits;
uint32_t format : xenos::kRenderTargetFormatBits;
uint32_t format_is_64bpp : 1;
@ -122,26 +263,58 @@ union ResolveAddressPackedInfo {
static_assert(sizeof(ResolveAddressPackedInfo) <= sizeof(uint32_t),
"ResolveAddressPackedInfo must be packable in uint32_t");
// Returns the tiles actually covered by a resolve area. The row length used
// is the width of the area in tiles, but the pitch between rows is
// edram_info.pitch_tiles.
void GetResolveEdramTileSpan(ResolveEdramPackedInfo edram_info,
ResolveAddressPackedInfo address_info,
uint32_t& base_out, uint32_t& row_length_used_out,
uint32_t& rows_out);
union ResolveCopyDestPitchPackedInfo {
struct {
// 0...16384/32.
uint32_t pitch_aligned_div_32 : xenos::kTexture2DCubeMaxWidthHeightLog2 +
2 - xenos::kTextureTileWidthHeightLog2;
uint32_t height_aligned_div_32 : xenos::kTexture2DCubeMaxWidthHeightLog2 +
2 - xenos::kTextureTileWidthHeightLog2;
};
uint32_t packed;
};
static_assert(sizeof(ResolveCopyDestPitchPackedInfo) <= sizeof(uint32_t),
"ResolveAddressPackedInfo must be packable in uint32_t");
// For backends with Shader Model 5-like compute, host shaders to use to perform
// copying in resolve operations.
enum class ResolveCopyShaderIndex {
kFast32bpp1x2xMSAA,
kFast32bpp4xMSAA,
kFast32bpp2xRes,
kFast32bpp3xRes1x2xMSAA,
kFast32bpp3xRes4xMSAA,
kFast64bpp1x2xMSAA,
kFast64bpp4xMSAA,
kFast64bpp2xRes,
kFast64bpp3xRes,
kFull8bpp,
kFull8bpp2xRes,
kFull8bpp3xRes,
kFull16bpp,
kFull16bpp2xRes,
kFull16bppFrom32bpp3xRes,
kFull16bppFrom64bpp3xRes,
kFull32bpp,
kFull32bpp2xRes,
kFull32bppFrom32bpp3xRes,
kFull32bppFrom64bpp3xRes,
kFull64bpp,
kFull64bpp2xRes,
kFull64bppFrom32bpp3xRes,
kFull64bppFrom64bpp3xRes,
kFull128bpp,
kFull128bpp2xRes,
kFull128bppFrom32bpp3xRes,
kFull128bppFrom64bpp3xRes,
kCount,
kUnknown = kCount,
@ -182,7 +355,7 @@ struct ResolveCopyShaderConstants {
ResolveEdramPackedInfo edram_info;
ResolveAddressPackedInfo address_info;
reg::RB_COPY_DEST_INFO dest_info;
reg::RB_COPY_DEST_PITCH dest_pitch;
ResolveCopyDestPitchPackedInfo dest_pitch_aligned;
};
DestRelative dest_relative;
uint32_t dest_base;
@ -202,15 +375,23 @@ struct ResolveClearShaderConstants {
struct ResolveInfo {
reg::RB_COPY_CONTROL rb_copy_control;
// color_edram_info and depth_edram_info are set up if copying or clearing
// color and depth respectively, according to RB_COPY_CONTROL.
ResolveEdramPackedInfo color_edram_info;
// depth_edram_info / depth_original_base and color_edram_info /
// color_original_base are set up if copying or clearing color and depth
// respectively, according to RB_COPY_CONTROL.
ResolveEdramPackedInfo depth_edram_info;
ResolveEdramPackedInfo color_edram_info;
// Original bases, without adjustment to a 160x32 region for packed offsets,
// for locating host render targets to perform clears if host render targets
// are used for EDRAM emulation - the same as the base that the render target
// will likely be used with for drawing next, to prevent unneeded tile
// ownership
// transfers between clears and first usage if clearing a subregion.
uint32_t depth_original_base;
uint32_t color_original_base;
ResolveAddressPackedInfo address;
reg::RB_COPY_DEST_INFO rb_copy_dest_info;
reg::RB_COPY_DEST_PITCH rb_copy_dest_pitch;
reg::RB_COPY_DEST_INFO copy_dest_info;
ResolveCopyDestPitchPackedInfo copy_dest_pitch_aligned;
// Memory range that will potentially be modified by copying, with
// address.local_x/y_div_8 & 31 being the origin relative to it.
@ -228,6 +409,16 @@ struct ResolveInfo {
return rb_copy_control.copy_src_select >= xenos::kMaxColorRenderTargets;
}
// See GetResolveEdramTileSpan documentation for explanation.
void GetCopyEdramTileSpan(uint32_t& base_out, uint32_t& row_length_used_out,
uint32_t& rows_out, uint32_t& pitch_out) const {
ResolveEdramPackedInfo edram_info =
IsCopyingDepth() ? depth_edram_info : color_edram_info;
GetResolveEdramTileSpan(edram_info, address, base_out, row_length_used_out,
rows_out);
pitch_out = edram_info.pitch_tiles;
}
ResolveCopyShaderIndex GetCopyShader(
uint32_t resolution_scale, ResolveCopyShaderConstants& constants_out,
uint32_t& group_count_x_out, uint32_t& group_count_y_out) const;
@ -241,23 +432,10 @@ struct ResolveInfo {
}
void GetDepthClearShaderConstants(
bool has_float32_copy, ResolveClearShaderConstants& constants_out) const {
ResolveClearShaderConstants& constants_out) const {
assert_true(IsClearingDepth());
constants_out.rt_specific.clear_value[0] = rb_depth_clear;
if (has_float32_copy) {
float depth32;
uint32_t depth24 = rb_depth_clear >> 8;
if (xenos::DepthRenderTargetFormat(depth_edram_info.format) ==
xenos::DepthRenderTargetFormat::kD24S8) {
depth32 = depth24 * float(1.0f / 16777215.0f);
} else {
depth32 = xenos::Float20e4To32(depth24);
}
constants_out.rt_specific.clear_value[1] =
*reinterpret_cast<const uint32_t*>(&depth32);
} else {
constants_out.rt_specific.clear_value[1] = rb_depth_clear;
}
constants_out.rt_specific.clear_value[1] = rb_depth_clear;
constants_out.rt_specific.edram_info = depth_edram_info;
constants_out.address_info = address;
}
@ -266,9 +444,8 @@ struct ResolveInfo {
ResolveClearShaderConstants& constants_out) const {
assert_true(IsClearingColor());
// Not doing -32...32 to -1...1 clamping here as a hack for k_16_16 and
// k_16_16_16_16 blending emulation when using traditional host render
// targets as it would be inconsistent with the usual way of clearing with a
// quad.
// k_16_16_16_16 blending emulation when using host render targets as it
// would be inconsistent with the usual way of clearing with a depth quad.
// TODO(Triang3l): Check which 32-bit portion is in which register.
constants_out.rt_specific.clear_value[0] = rb_color_clear;
constants_out.rt_specific.clear_value[1] = rb_color_clear_lo;
@ -295,13 +472,14 @@ struct ResolveInfo {
};
// Returns false if there was an error obtaining the info making it totally
// invalid. edram_16_as_minus_1_to_1 is false if 16_16 and 16_16_16_16 color
// render target formats are properly emulated as -32...32, true if emulated as
// snorm, with range limited to -1...1, but with correct blending within that
// range.
// invalid. fixed_16_truncated_to_minus_1_to_1 is false if 16_16 and 16_16_16_16
// color render target formats are properly emulated as -32...32, true if
// emulated as snorm, with range limited to -1...1, but with correct blending
// within that range.
bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
TraceWriter& trace_writer, uint32_t resolution_scale,
bool edram_16_as_minus_1_to_1, ResolveInfo& info_out);
bool fixed_16_truncated_to_minus_1_to_1,
ResolveInfo& info_out);
// Taking user configuration - stretching or letterboxing, overscan region to
// crop to fill while maintaining the aspect ratio - into account, returns the

2355
src/xenia/gpu/dxbc.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -19,7 +19,7 @@ DxbcShader::DxbcShader(xenos::ShaderType shader_type, uint64_t data_hash,
: Shader(shader_type, data_hash, dword_ptr, dword_count) {}
Shader::Translation* DxbcShader::CreateTranslationInstance(
uint32_t modification) {
uint64_t modification) {
return new DxbcTranslation(*this, modification);
}

View File

@ -10,6 +10,7 @@
#ifndef XENIA_GPU_DXBC_SHADER_H_
#define XENIA_GPU_DXBC_SHADER_H_
#include <atomic>
#include <vector>
#include "xenia/gpu/dxbc_shader_translator.h"
@ -23,13 +24,17 @@ class DxbcShader : public Shader {
public:
class DxbcTranslation : public Translation {
public:
DxbcTranslation(DxbcShader& shader, uint32_t modification)
DxbcTranslation(DxbcShader& shader, uint64_t modification)
: Translation(shader, modification) {}
};
DxbcShader(xenos::ShaderType shader_type, uint64_t data_hash,
const uint32_t* dword_ptr, uint32_t dword_count);
// Resource bindings are gathered after the successful translation of any
// modification for simplicity of translation (and they don't depend on
// modification bits).
static constexpr uint32_t kMaxTextureBindingIndexBits =
DxbcShaderTranslator::kMaxTextureBindingIndexBits;
static constexpr uint32_t kMaxTextureBindings =
@ -43,11 +48,13 @@ class DxbcShader : public Shader {
bool is_signed;
};
// Safe to hash and compare with memcmp for layout hashing.
const TextureBinding* GetTextureBindings(uint32_t& count_out) const {
count_out = uint32_t(texture_bindings_.size());
return texture_bindings_.data();
const std::vector<TextureBinding>& GetTextureBindingsAfterTranslation()
const {
return texture_bindings_;
}
const uint32_t GetUsedTextureMaskAfterTranslation() const {
return used_texture_mask_;
}
const uint32_t GetUsedTextureMask() const { return used_texture_mask_; }
static constexpr uint32_t kMaxSamplerBindingIndexBits =
DxbcShaderTranslator::kMaxSamplerBindingIndexBits;
@ -61,17 +68,18 @@ class DxbcShader : public Shader {
xenos::TextureFilter mip_filter;
xenos::AnisoFilter aniso_filter;
};
const SamplerBinding* GetSamplerBindings(uint32_t& count_out) const {
count_out = uint32_t(sampler_bindings_.size());
return sampler_bindings_.data();
const std::vector<SamplerBinding>& GetSamplerBindingsAfterTranslation()
const {
return sampler_bindings_;
}
protected:
Translation* CreateTranslationInstance(uint32_t modification) override;
Translation* CreateTranslationInstance(uint64_t modification) override;
private:
friend class DxbcShaderTranslator;
std::atomic_flag bindings_setup_entered_ = ATOMIC_FLAG_INIT;
std::vector<TextureBinding> texture_bindings_;
std::vector<SamplerBinding> sampler_bindings_;
uint32_t used_texture_mask_ = 0;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -7,6 +7,8 @@
******************************************************************************
*/
#include "xenia/base/assert.h"
#include "xenia/gpu/draw_util.h"
#include "xenia/gpu/dxbc_shader_translator.h"
namespace xe {
@ -15,7 +17,7 @@ using namespace ucode;
void DxbcShaderTranslator::ExportToMemory_PackFixed32(
const uint32_t* eM_temps, uint32_t eM_count, const uint32_t bits[4],
const DxbcSrc& is_integer, const DxbcSrc& is_signed) {
const dxbc::Src& is_integer, const dxbc::Src& is_signed) {
// Will insert with BFI - sign extension of red will be overwritten, not
// truncated.
assert_not_zero(bits[0]);
@ -26,64 +28,64 @@ void DxbcShaderTranslator::ExportToMemory_PackFixed32(
mask |= 1 << i;
}
}
DxbcOpIf(true, is_signed);
a_.OpIf(true, is_signed);
{
float range[4];
for (uint32_t i = 0; i < 4; ++i) {
range[i] = bits[i] ? float((uint32_t(1) << (bits[i] - 1)) - 1) : 0.0f;
}
DxbcSrc range_src(DxbcSrc::LP(range));
DxbcOpIf(false, is_integer);
dxbc::Src range_src(dxbc::Src::LP(range));
a_.OpIf(false, is_integer);
for (uint32_t i = 0; i < eM_count; ++i) {
uint32_t eM_temp = eM_temps[i];
DxbcOpMul(DxbcDest::R(eM_temp, mask), DxbcSrc::R(eM_temp), range_src);
a_.OpMul(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp), range_src);
}
DxbcOpEndIf();
a_.OpEndIf();
for (uint32_t i = 0; i < eM_count; ++i) {
DxbcDest eM_dest(DxbcDest::R(eM_temps[i], mask));
DxbcSrc eM_src(DxbcSrc::R(eM_temps[i]));
dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[i], mask));
dxbc::Src eM_src(dxbc::Src::R(eM_temps[i]));
// TODO(Triang3l): NaN should become zero, not -range.
DxbcOpMax(eM_dest, eM_src, -range_src);
DxbcOpMin(eM_dest, eM_src, range_src);
a_.OpMax(eM_dest, eM_src, -range_src);
a_.OpMin(eM_dest, eM_src, range_src);
}
}
DxbcOpElse();
a_.OpElse();
{
float range[4];
for (uint32_t i = 0; i < 4; ++i) {
range[i] = float((uint32_t(1) << bits[i]) - 1);
}
DxbcSrc range_src(DxbcSrc::LP(range));
DxbcOpIf(false, is_integer);
dxbc::Src range_src(dxbc::Src::LP(range));
a_.OpIf(false, is_integer);
for (uint32_t i = 0; i < eM_count; ++i) {
uint32_t eM_temp = eM_temps[i];
DxbcOpMul(DxbcDest::R(eM_temp, mask), DxbcSrc::R(eM_temp), range_src);
a_.OpMul(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp), range_src);
}
DxbcOpEndIf();
a_.OpEndIf();
for (uint32_t i = 0; i < eM_count; ++i) {
DxbcDest eM_dest(DxbcDest::R(eM_temps[i], mask));
DxbcSrc eM_src(DxbcSrc::R(eM_temps[i]));
DxbcOpMax(eM_dest, eM_src, DxbcSrc::LF(0.0f));
DxbcOpMin(eM_dest, eM_src, range_src);
dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[i], mask));
dxbc::Src eM_src(dxbc::Src::R(eM_temps[i]));
a_.OpMax(eM_dest, eM_src, dxbc::Src::LF(0.0f));
a_.OpMin(eM_dest, eM_src, range_src);
}
}
DxbcOpEndIf();
a_.OpEndIf();
for (uint32_t i = 0; i < eM_count; ++i) {
uint32_t eM_temp = eM_temps[i];
// Round to the nearest integer, according to the rules of handling integer
// formats in Direct3D.
// TODO(Triang3l): Round by adding +-0.5, not with round_ne.
DxbcOpRoundNE(DxbcDest::R(eM_temp, mask), DxbcSrc::R(eM_temp));
DxbcOpFToI(DxbcDest::R(eM_temp, mask), DxbcSrc::R(eM_temp));
DxbcDest eM_packed_dest(DxbcDest::R(eM_temp, 0b0001));
DxbcSrc eM_packed_src(DxbcSrc::R(eM_temp, DxbcSrc::kXXXX));
a_.OpRoundNE(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp));
a_.OpFToI(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp));
dxbc::Dest eM_packed_dest(dxbc::Dest::R(eM_temp, 0b0001));
dxbc::Src eM_packed_src(dxbc::Src::R(eM_temp, dxbc::Src::kXXXX));
uint32_t offset = bits[0];
for (uint32_t j = 1; j < 4; ++j) {
if (!bits[j]) {
continue;
}
DxbcOpBFI(eM_packed_dest, DxbcSrc::LU(bits[j]), DxbcSrc::LU(offset),
DxbcSrc::R(eM_temp).Select(j), eM_packed_src);
a_.OpBFI(eM_packed_dest, dxbc::Src::LU(bits[j]), dxbc::Src::LU(offset),
dxbc::Src::R(eM_temp).Select(j), eM_packed_src);
offset += bits[j];
}
}
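// A CPU-side reference of the packing emitted above (illustrative only, not
// used by the translator; assumes <algorithm>, <cmath> and <cstdint> are
// available): scale if fractional, clamp to the representable range, round
// to nearest even, then insert the channels at their bit offsets as the BFIs
// do.
[[maybe_unused]] static uint32_t PackFixed32Reference(const float rgba[4],
                                                      const uint32_t bits[4],
                                                      bool is_signed,
                                                      bool is_integer) {
  uint32_t packed = 0;
  uint32_t offset = 0;
  for (uint32_t i = 0; i < 4; ++i) {
    if (!bits[i]) {
      continue;
    }
    float range = is_signed ? float((uint32_t(1) << (bits[i] - 1)) - 1)
                            : float((uint32_t(1) << bits[i]) - 1);
    float value = rgba[i];
    if (!is_integer) {
      // Fractional: scale to the representable range.
      value *= range;
    }
    // Like in the shader: NaN currently becomes the lower bound, not zero.
    value = std::min(std::max(value, is_signed ? -range : 0.0f), range);
    // round_ne, then ftoi, then the BFI insertion.
    int32_t fixed = int32_t(std::nearbyint(value));
    packed |= (uint32_t(fixed) & ((uint32_t(1) << bits[i]) - 1)) << offset;
    offset += bits[i];
  }
  return packed;
}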
@ -99,44 +101,94 @@ void DxbcShaderTranslator::ExportToMemory() {
uint32_t control_temp = PushSystemTemp();
// Safety check if the shared memory is bound as UAV.
system_constants_used_ |= 1ull << kSysConst_Flags_Index;
DxbcOpAnd(DxbcDest::R(control_temp, 0b0001),
DxbcSrc::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_Flags_Vec)
.Select(kSysConst_Flags_Comp),
DxbcSrc::LU(kSysFlag_SharedMemoryIsUAV));
a_.OpUBFE(dxbc::Dest::R(control_temp, 0b0001), dxbc::Src::LU(1),
dxbc::Src::LU(kSysFlag_SharedMemoryIsUAV_Shift),
LoadFlagsSystemConstant());
// Open the `if` with the uniform condition for the shared memory buffer being
// bound as a UAV (more fine-grained checks are vector and likely divergent).
a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kXXXX));
// Check more fine-grained limitations.
// The flag in control_temp.x can be 0 or 1 for simplicity, not necessarily
// 0 or 0xFFFFFFFF.
bool inner_condition_provided = false;
if (is_pixel_shader()) {
// Disable memexport in pixel shaders with supersampling since VPOS is
// ambiguous.
if (edram_rov_used_) {
system_constants_used_ |= 1ull
<< kSysConst_EdramResolutionSquareScale_Index;
DxbcOpULT(DxbcDest::R(control_temp, 0b0010),
DxbcSrc::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_EdramResolutionSquareScale_Vec)
.Select(kSysConst_EdramResolutionSquareScale_Comp),
DxbcSrc::LU(2));
DxbcOpAnd(DxbcDest::R(control_temp, 0b0001),
DxbcSrc::R(control_temp, DxbcSrc::kXXXX),
DxbcSrc::R(control_temp, DxbcSrc::kYYYY));
} else {
// Enough to check just Y because it's scaled for both 2x and 4x.
system_constants_used_ |= 1ull << kSysConst_SampleCountLog2_Index;
DxbcOpMovC(DxbcDest::R(control_temp, 0b0001),
DxbcSrc::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_SampleCountLog2_Vec)
.Select(kSysConst_SampleCountLog2_Comp + 1),
DxbcSrc::LU(0), DxbcSrc::R(control_temp, DxbcSrc::kXXXX));
if (draw_resolution_scale_ > 1) {
// Only do memexport for one host pixel within each guest pixel.
// For 2x - host pixel (1, 1), because it's the one covered by the
// half-pixel offset that becomes a full-pixel offset.
// For 3x - also (1, 1), because it's still covered by the half-pixel
// offset, and is close to the guest pixel center.
in_position_used_ |= 0b0011;
a_.OpFToU(
dxbc::Dest::R(control_temp, 0b0110),
dxbc::Src::V(uint32_t(InOutRegister::kPSInPosition), 0b0100 << 2));
switch (draw_resolution_scale_) {
case 2:
a_.OpAnd(dxbc::Dest::R(control_temp, 0b0110),
dxbc::Src::R(control_temp), dxbc::Src::LU(1));
// No need to do IEq - already 1 for right / bottom, 0 for left / top.
break;
case 3:
// xy % 3 == 1.
for (uint32_t i = 1; i <= 2; ++i) {
a_.OpUMul(dxbc::Dest::R(control_temp, 0b1000), dxbc::Dest::Null(),
dxbc::Src::R(control_temp).Select(i),
dxbc::Src::LU(draw_util::kDivideScale3));
a_.OpUShR(dxbc::Dest::R(control_temp, 0b1000),
dxbc::Src::R(control_temp, dxbc::Src::kWWWW),
dxbc::Src::LU(draw_util::kDivideUpperShift3));
a_.OpIMAd(dxbc::Dest::R(control_temp, 1 << i),
dxbc::Src::R(control_temp, dxbc::Src::kWWWW),
dxbc::Src::LI(-3), dxbc::Src::R(control_temp).Select(i));
}
a_.OpIEq(dxbc::Dest::R(control_temp, 0b0110),
dxbc::Src::R(control_temp), dxbc::Src::LU(1));
break;
default:
assert_unhandled_case(draw_resolution_scale_);
}
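// For reference, a hypothetical CPU equivalent of the integer math
// emitted for the 3x case above:
//   q = uint32_t((uint64_t(x) * draw_util::kDivideScale3) >> 32) >>
//       draw_util::kDivideUpperShift3;  // x / 3
//   r = x - 3 * q;                      // x % 3
// The host pixel with r == 1 in both axes is the one closest to the
// guest pixel center, so only it performs the memexport.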
a_.OpAnd(dxbc::Dest::R(control_temp,
inner_condition_provided ? 0b0010 : 0b0001),
dxbc::Src::R(control_temp, dxbc::Src::kYYYY),
dxbc::Src::R(control_temp, dxbc::Src::kZZZZ));
if (inner_condition_provided) {
// Merge with the previous condition in control_temp.x.
a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001),
dxbc::Src::R(control_temp, dxbc::Src::kXXXX),
dxbc::Src::R(control_temp, dxbc::Src::kYYYY));
}
inner_condition_provided = true;
}
// With sample-rate shading (with float24 conversion), only do memexport
// from one sample (as the shader is invoked multiple times for a pixel),
// if SV_SampleIndex == firstbit_lo(SV_Coverage). For zero coverage,
// firstbit_lo returns 0xFFFFFFFF.
if (IsSampleRate()) {
a_.OpFirstBitLo(dxbc::Dest::R(control_temp, 0b0010),
dxbc::Src::VCoverage());
a_.OpIEq(
dxbc::Dest::R(control_temp,
inner_condition_provided ? 0b0010 : 0b0001),
dxbc::Src::V(uint32_t(InOutRegister::kPSInFrontFaceAndSampleIndex),
dxbc::Src::kYYYY),
dxbc::Src::R(control_temp, dxbc::Src::kYYYY));
if (inner_condition_provided) {
// Merge with the previous condition in control_temp.x.
a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001),
dxbc::Src::R(control_temp, dxbc::Src::kXXXX),
dxbc::Src::R(control_temp, dxbc::Src::kYYYY));
}
inner_condition_provided = true;
}
}
// Check if memexport can be done.
DxbcOpIf(true, DxbcSrc::R(control_temp, DxbcSrc::kXXXX));
// Open the inner (vector) conditional if needed.
if (inner_condition_provided) {
a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kXXXX));
}
// control_temp.x is now free.
for (uint32_t i = 0; i < kMaxMemExports; ++i) {
for (uint32_t i = 0; i < Shader::kMaxMemExports; ++i) {
uint32_t eA_temp = system_temps_memexport_address_[i];
if (eA_temp == UINT32_MAX) {
// Export not used.
@ -160,21 +212,21 @@ void DxbcShaderTranslator::ExportToMemory() {
}
// Swap red and blue if needed.
DxbcOpAnd(DxbcDest::R(control_temp, 0b0001),
DxbcSrc::R(eA_temp, DxbcSrc::kZZZZ),
DxbcSrc::LU(uint32_t(1) << 19));
a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001),
dxbc::Src::R(eA_temp, dxbc::Src::kZZZZ),
dxbc::Src::LU(uint32_t(1) << 19));
for (uint32_t j = 0; j < eM_count; ++j) {
uint32_t eM_temp = eM_temps[j];
DxbcOpMovC(DxbcDest::R(eM_temp, 0b0101),
DxbcSrc::R(control_temp, DxbcSrc::kXXXX),
DxbcSrc::R(eM_temp, 0b000010), DxbcSrc::R(eM_temp));
a_.OpMovC(dxbc::Dest::R(eM_temp, 0b0101),
dxbc::Src::R(control_temp, dxbc::Src::kXXXX),
dxbc::Src::R(eM_temp, 0b000010), dxbc::Src::R(eM_temp));
}
// Initialize element size in control_temp.x to 4 bytes as this is the most
// common size.
DxbcDest element_size_dest(DxbcDest::R(control_temp, 0b0001));
DxbcSrc element_size_src(DxbcSrc::R(control_temp, DxbcSrc::kXXXX));
DxbcOpMov(element_size_dest, DxbcSrc::LU(4));
dxbc::Dest element_size_dest(dxbc::Dest::R(control_temp, 0b0001));
dxbc::Src element_size_src(dxbc::Src::R(control_temp, dxbc::Src::kXXXX));
a_.OpMov(element_size_dest, dxbc::Src::LU(4));
// Each eM should get a packed value in the destination format now.
@ -182,285 +234,293 @@ void DxbcShaderTranslator::ExportToMemory() {
// Y - signedness if fixed-point.
// Z - fractional/integer if fixed-point.
// W - color format.
DxbcOpUBFE(DxbcDest::R(control_temp, 0b1110), DxbcSrc::LU(0, 1, 1, 6),
DxbcSrc::LU(0, 16, 17, 8), DxbcSrc::R(eA_temp, DxbcSrc::kZZZZ));
DxbcSrc is_signed(DxbcSrc::R(control_temp, DxbcSrc::kYYYY));
DxbcSrc is_integer(DxbcSrc::R(control_temp, DxbcSrc::kZZZZ));
a_.OpUBFE(dxbc::Dest::R(control_temp, 0b1110), dxbc::Src::LU(0, 1, 1, 6),
dxbc::Src::LU(0, 16, 17, 8),
dxbc::Src::R(eA_temp, dxbc::Src::kZZZZ));
dxbc::Src is_signed(dxbc::Src::R(control_temp, dxbc::Src::kYYYY));
dxbc::Src is_integer(dxbc::Src::R(control_temp, dxbc::Src::kZZZZ));
// Convert and pack the format.
DxbcOpSwitch(DxbcSrc::R(control_temp, DxbcSrc::kWWWW));
a_.OpSwitch(dxbc::Src::R(control_temp, dxbc::Src::kWWWW));
// control_temp.w is now free.
{
// k_8_8_8_8
// k_8_8_8_8_AS_16_16_16_16
DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8)));
DxbcOpCase(
DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16)));
a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8)));
a_.OpCase(dxbc::Src::LU(
uint32_t(xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16)));
{
uint32_t bits[4] = {8, 8, 8, 8};
ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer,
is_signed);
}
DxbcOpBreak();
a_.OpBreak();
// k_2_10_10_10
// k_2_10_10_10_AS_16_16_16_16
DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_2_10_10_10)));
DxbcOpCase(DxbcSrc::LU(
a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_2_10_10_10)));
a_.OpCase(dxbc::Src::LU(
uint32_t(xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16)));
{
uint32_t bits[4] = {10, 10, 10, 2};
ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer,
is_signed);
}
DxbcOpBreak();
a_.OpBreak();
// k_10_11_11
// k_10_11_11_AS_16_16_16_16
DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_10_11_11)));
DxbcOpCase(
DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_10_11_11_AS_16_16_16_16)));
a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_10_11_11)));
a_.OpCase(dxbc::Src::LU(
uint32_t(xenos::ColorFormat::k_10_11_11_AS_16_16_16_16)));
{
uint32_t bits[4] = {11, 11, 10};
ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer,
is_signed);
}
DxbcOpBreak();
a_.OpBreak();
// k_11_11_10
// k_11_11_10_AS_16_16_16_16
DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_11_11_10)));
DxbcOpCase(
DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_11_11_10_AS_16_16_16_16)));
a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_11_11_10)));
a_.OpCase(dxbc::Src::LU(
uint32_t(xenos::ColorFormat::k_11_11_10_AS_16_16_16_16)));
{
uint32_t bits[4] = {10, 11, 11};
ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer,
is_signed);
}
DxbcOpBreak();
a_.OpBreak();
// k_16_16
DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_16_16)));
a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16)));
{
uint32_t bits[4] = {16, 16};
ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer,
is_signed);
}
DxbcOpBreak();
a_.OpBreak();
// k_16_16_16_16
DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16)));
DxbcOpMov(element_size_dest, DxbcSrc::LU(8));
DxbcOpIf(true, is_signed);
a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16)));
a_.OpMov(element_size_dest, dxbc::Src::LU(8));
a_.OpIf(true, is_signed);
{
DxbcOpIf(false, is_integer);
a_.OpIf(false, is_integer);
for (uint32_t j = 0; j < eM_count; ++j) {
uint32_t eM_temp = eM_temps[j];
DxbcOpMul(DxbcDest::R(eM_temp), DxbcSrc::R(eM_temp),
DxbcSrc::LF(32767.0f));
a_.OpMul(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp),
dxbc::Src::LF(32767.0f));
}
DxbcOpEndIf();
a_.OpEndIf();
for (uint32_t j = 0; j < eM_count; ++j) {
DxbcDest eM_dest(DxbcDest::R(eM_temps[j]));
DxbcSrc eM_src(DxbcSrc::R(eM_temps[j]));
dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j]));
dxbc::Src eM_src(dxbc::Src::R(eM_temps[j]));
// TODO(Triang3l): NaN should become zero, not -range.
DxbcOpMax(eM_dest, eM_src, DxbcSrc::LF(-32767.0f));
DxbcOpMin(eM_dest, eM_src, DxbcSrc::LF(32767.0f));
a_.OpMax(eM_dest, eM_src, dxbc::Src::LF(-32767.0f));
a_.OpMin(eM_dest, eM_src, dxbc::Src::LF(32767.0f));
}
}
DxbcOpElse();
a_.OpElse();
{
DxbcOpIf(false, is_integer);
a_.OpIf(false, is_integer);
for (uint32_t j = 0; j < eM_count; ++j) {
uint32_t eM_temp = eM_temps[j];
DxbcOpMul(DxbcDest::R(eM_temp), DxbcSrc::R(eM_temp),
DxbcSrc::LF(65535.0f));
a_.OpMul(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp),
dxbc::Src::LF(65535.0f));
}
DxbcOpEndIf();
a_.OpEndIf();
for (uint32_t j = 0; j < eM_count; ++j) {
DxbcDest eM_dest(DxbcDest::R(eM_temps[j]));
DxbcSrc eM_src(DxbcSrc::R(eM_temps[j]));
DxbcOpMax(eM_dest, eM_src, DxbcSrc::LF(0.0f));
DxbcOpMin(eM_dest, eM_src, DxbcSrc::LF(65535.0f));
dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j]));
dxbc::Src eM_src(dxbc::Src::R(eM_temps[j]));
a_.OpMax(eM_dest, eM_src, dxbc::Src::LF(0.0f));
a_.OpMin(eM_dest, eM_src, dxbc::Src::LF(65535.0f));
}
}
DxbcOpEndIf();
a_.OpEndIf();
for (uint32_t j = 0; j < eM_count; ++j) {
uint32_t eM_temp = eM_temps[j];
// Round to the nearest integer, according to the rules of handling
// integer formats in Direct3D.
// TODO(Triang3l): Round by adding +-0.5, not with round_ne.
DxbcOpRoundNE(DxbcDest::R(eM_temp), DxbcSrc::R(eM_temp));
DxbcOpFToI(DxbcDest::R(eM_temp), DxbcSrc::R(eM_temp));
DxbcOpBFI(DxbcDest::R(eM_temp, 0b0011), DxbcSrc::LU(16),
DxbcSrc::LU(16), DxbcSrc::R(eM_temp, 0b1101),
DxbcSrc::R(eM_temp, 0b1000));
a_.OpRoundNE(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp));
a_.OpFToI(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp));
a_.OpBFI(dxbc::Dest::R(eM_temp, 0b0011), dxbc::Src::LU(16),
dxbc::Src::LU(16), dxbc::Src::R(eM_temp, 0b1101),
dxbc::Src::R(eM_temp, 0b1000));
}
DxbcOpBreak();
a_.OpBreak();
// k_16_16_FLOAT
DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_16_16_FLOAT)));
a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_FLOAT)));
for (uint32_t j = 0; j < eM_count; ++j) {
uint32_t eM_temp = eM_temps[j];
DxbcOpF32ToF16(DxbcDest::R(eM_temp, 0b0011), DxbcSrc::R(eM_temp));
DxbcOpBFI(DxbcDest::R(eM_temp, 0b0001), DxbcSrc::LU(16),
DxbcSrc::LU(16), DxbcSrc::R(eM_temp, DxbcSrc::kYYYY),
DxbcSrc::R(eM_temp, DxbcSrc::kXXXX));
a_.OpF32ToF16(dxbc::Dest::R(eM_temp, 0b0011), dxbc::Src::R(eM_temp));
a_.OpBFI(dxbc::Dest::R(eM_temp, 0b0001), dxbc::Src::LU(16),
dxbc::Src::LU(16), dxbc::Src::R(eM_temp, dxbc::Src::kYYYY),
dxbc::Src::R(eM_temp, dxbc::Src::kXXXX));
}
DxbcOpBreak();
a_.OpBreak();
// k_16_16_16_16_FLOAT
DxbcOpCase(
DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16_FLOAT)));
DxbcOpMov(element_size_dest, DxbcSrc::LU(8));
a_.OpCase(
dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16_FLOAT)));
a_.OpMov(element_size_dest, dxbc::Src::LU(8));
for (uint32_t j = 0; j < eM_count; ++j) {
uint32_t eM_temp = eM_temps[j];
DxbcOpF32ToF16(DxbcDest::R(eM_temp), DxbcSrc::R(eM_temp));
DxbcOpBFI(DxbcDest::R(eM_temp, 0b0011), DxbcSrc::LU(16),
DxbcSrc::LU(16), DxbcSrc::R(eM_temp, 0b1101),
DxbcSrc::R(eM_temp, 0b1000));
a_.OpF32ToF16(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp));
a_.OpBFI(dxbc::Dest::R(eM_temp, 0b0011), dxbc::Src::LU(16),
dxbc::Src::LU(16), dxbc::Src::R(eM_temp, 0b1101),
dxbc::Src::R(eM_temp, 0b1000));
}
DxbcOpBreak();
a_.OpBreak();
// k_32_FLOAT
// Already in the destination format, 4 bytes per element already
// selected.
// k_32_32_FLOAT
DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_32_32_FLOAT)));
DxbcOpMov(element_size_dest, DxbcSrc::LU(8));
a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_32_FLOAT)));
a_.OpMov(element_size_dest, dxbc::Src::LU(8));
// Already in the destination format.
DxbcOpBreak();
a_.OpBreak();
// k_32_32_32_32_FLOAT
DxbcOpCase(
DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_32_32_32_32_FLOAT)));
DxbcOpMov(element_size_dest, DxbcSrc::LU(16));
a_.OpCase(
dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_32_32_32_FLOAT)));
a_.OpMov(element_size_dest, dxbc::Src::LU(16));
// Already in the destination format.
DxbcOpBreak();
a_.OpBreak();
}
DxbcOpEndSwitch();
a_.OpEndSwitch();
// control_temp.yz are now free.
// Do endian swap.
{
DxbcDest endian_dest(DxbcDest::R(control_temp, 0b0010));
DxbcSrc endian_src(DxbcSrc::R(control_temp, DxbcSrc::kYYYY));
dxbc::Dest endian_dest(dxbc::Dest::R(control_temp, 0b0010));
dxbc::Src endian_src(dxbc::Src::R(control_temp, dxbc::Src::kYYYY));
// Extract endianness into control_temp.y.
DxbcOpAnd(endian_dest, DxbcSrc::R(eA_temp, DxbcSrc::kZZZZ),
DxbcSrc::LU(0b111));
a_.OpAnd(endian_dest, dxbc::Src::R(eA_temp, dxbc::Src::kZZZZ),
dxbc::Src::LU(0b111));
// Change 8-in-64 and 8-in-128 to 8-in-32.
for (uint32_t j = 0; j < 2; ++j) {
DxbcOpIEq(DxbcDest::R(control_temp, 0b0100), endian_src,
DxbcSrc::LU(uint32_t(j ? xenos::Endian128::k8in128
: xenos::Endian128::k8in64)));
a_.OpIEq(dxbc::Dest::R(control_temp, 0b0100), endian_src,
dxbc::Src::LU(uint32_t(j ? xenos::Endian128::k8in128
: xenos::Endian128::k8in64)));
for (uint32_t k = 0; k < eM_count; ++k) {
uint32_t eM_temp = eM_temps[k];
DxbcOpMovC(DxbcDest::R(eM_temp),
DxbcSrc::R(control_temp, DxbcSrc::kZZZZ),
DxbcSrc::R(eM_temp, j ? 0b00011011 : 0b10110001),
DxbcSrc::R(eM_temp));
a_.OpMovC(dxbc::Dest::R(eM_temp),
dxbc::Src::R(control_temp, dxbc::Src::kZZZZ),
dxbc::Src::R(eM_temp, j ? 0b00011011 : 0b10110001),
dxbc::Src::R(eM_temp));
}
DxbcOpMovC(endian_dest, DxbcSrc::R(control_temp, DxbcSrc::kZZZZ),
DxbcSrc::LU(uint32_t(xenos::Endian128::k8in32)), endian_src);
a_.OpMovC(endian_dest, dxbc::Src::R(control_temp, dxbc::Src::kZZZZ),
dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32)),
endian_src);
}
uint32_t swap_temp = PushSystemTemp();
DxbcDest swap_temp_dest(DxbcDest::R(swap_temp));
DxbcSrc swap_temp_src(DxbcSrc::R(swap_temp));
dxbc::Dest swap_temp_dest(dxbc::Dest::R(swap_temp));
dxbc::Src swap_temp_src(dxbc::Src::R(swap_temp));
// 8-in-16 or one half of 8-in-32.
DxbcOpSwitch(endian_src);
DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::Endian128::k8in16)));
DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::Endian128::k8in32)));
a_.OpSwitch(endian_src);
a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in16)));
a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32)));
for (uint32_t j = 0; j < eM_count; ++j) {
DxbcDest eM_dest(DxbcDest::R(eM_temps[j]));
DxbcSrc eM_src(DxbcSrc::R(eM_temps[j]));
dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j]));
dxbc::Src eM_src(dxbc::Src::R(eM_temps[j]));
// Temp = X0Z0.
DxbcOpAnd(swap_temp_dest, eM_src, DxbcSrc::LU(0x00FF00FF));
a_.OpAnd(swap_temp_dest, eM_src, dxbc::Src::LU(0x00FF00FF));
// eM = YZW0.
DxbcOpUShR(eM_dest, eM_src, DxbcSrc::LU(8));
a_.OpUShR(eM_dest, eM_src, dxbc::Src::LU(8));
// eM = Y0W0.
DxbcOpAnd(eM_dest, eM_src, DxbcSrc::LU(0x00FF00FF));
a_.OpAnd(eM_dest, eM_src, dxbc::Src::LU(0x00FF00FF));
// eM = YXWZ.
DxbcOpUMAd(eM_dest, swap_temp_src, DxbcSrc::LU(256), eM_src);
a_.OpUMAd(eM_dest, swap_temp_src, dxbc::Src::LU(256), eM_src);
}
DxbcOpBreak();
DxbcOpEndSwitch();
a_.OpBreak();
a_.OpEndSwitch();
// 16-in-32 or another half of 8-in-32.
DxbcOpSwitch(endian_src);
DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::Endian128::k8in32)));
DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::Endian128::k16in32)));
a_.OpSwitch(endian_src);
a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32)));
a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k16in32)));
for (uint32_t j = 0; j < eM_count; ++j) {
DxbcDest eM_dest(DxbcDest::R(eM_temps[j]));
DxbcSrc eM_src(DxbcSrc::R(eM_temps[j]));
dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j]));
dxbc::Src eM_src(dxbc::Src::R(eM_temps[j]));
// Temp = ZW00.
DxbcOpUShR(swap_temp_dest, eM_src, DxbcSrc::LU(16));
a_.OpUShR(swap_temp_dest, eM_src, dxbc::Src::LU(16));
// eM = ZWXY.
DxbcOpBFI(eM_dest, DxbcSrc::LU(16), DxbcSrc::LU(16), eM_src,
swap_temp_src);
a_.OpBFI(eM_dest, dxbc::Src::LU(16), dxbc::Src::LU(16), eM_src,
swap_temp_src);
}
DxbcOpBreak();
DxbcOpEndSwitch();
a_.OpBreak();
a_.OpEndSwitch();
// Release swap_temp.
PopSystemTemp();
}
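// CPU-side equivalents of the swaps emitted above (illustrative only):
//   8-in-16:  ((v & 0x00FF00FF) << 8) | ((v >> 8) & 0x00FF00FF)
//   16-in-32: (v << 16) | (v >> 16)
// 8-in-32 applies both passes, reversing all four bytes of each dword.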
// control_temp.yz are now free.
DxbcDest address_dest(DxbcDest::R(eA_temp, 0b0001));
DxbcSrc address_src(DxbcSrc::R(eA_temp, DxbcSrc::kXXXX));
dxbc::Dest address_dest(dxbc::Dest::R(eA_temp, 0b0001));
dxbc::Src address_src(dxbc::Src::R(eA_temp, dxbc::Src::kXXXX));
// Multiply the base address by dword size, also dropping the 0x40000000
// bit.
DxbcOpIShL(address_dest, address_src, DxbcSrc::LU(2));
a_.OpIShL(address_dest, address_src, dxbc::Src::LU(2));
// Drop the exponent in the element index.
DxbcOpAnd(DxbcDest::R(eA_temp, 0b0010), DxbcSrc::R(eA_temp, DxbcSrc::kYYYY),
DxbcSrc::LU((1 << 23) - 1));
a_.OpAnd(dxbc::Dest::R(eA_temp, 0b0010),
dxbc::Src::R(eA_temp, dxbc::Src::kYYYY),
dxbc::Src::LU((1 << 23) - 1));
// Add the offset of the first written element to the base address.
DxbcOpUMAd(address_dest, DxbcSrc::R(eA_temp, DxbcSrc::kYYYY),
element_size_src, address_src);
a_.OpUMAd(address_dest, dxbc::Src::R(eA_temp, dxbc::Src::kYYYY),
element_size_src, address_src);
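// In scalar form, the three instructions above compute (a sketch, assuming
// eA.x holds the base in dwords with flag bit 30 set, and eA.y holds the bits
// of the float 2^23 + element offset):
//   uint32_t byte_address = eA_x << 2;  // To bytes; bit 30 shifts out.
//   uint32_t element_offset = eA_y & ((1u << 23) - 1);  // Drop the exponent.
//   byte_address += element_offset * element_size;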
// Do the writes.
DxbcSrc eM_written_src(
DxbcSrc::R(system_temp_memexport_written_).Select(i >> 2));
dxbc::Src eM_written_src(
dxbc::Src::R(system_temp_memexport_written_).Select(i >> 2));
uint32_t eM_written_base = 1u << ((i & 3) << 3);
for (uint32_t j = 0; j < eM_count; ++j) {
// Go to the next eM#.
uint32_t eM_relative_offset = eM_offsets[j] - (j ? eM_offsets[j - 1] : 0);
if (eM_relative_offset) {
if (eM_relative_offset == 1) {
DxbcOpIAdd(address_dest, element_size_src, address_src);
a_.OpIAdd(address_dest, element_size_src, address_src);
} else {
DxbcOpUMAd(address_dest, DxbcSrc::LU(eM_relative_offset),
element_size_src, address_src);
a_.OpUMAd(address_dest, dxbc::Src::LU(eM_relative_offset),
element_size_src, address_src);
}
}
// Check if the eM# was actually written to on the execution path.
DxbcOpAnd(DxbcDest::R(control_temp, 0b0010), eM_written_src,
DxbcSrc::LU(eM_written_base << eM_offsets[j]));
DxbcOpIf(true, DxbcSrc::R(control_temp, DxbcSrc::kYYYY));
a_.OpAnd(dxbc::Dest::R(control_temp, 0b0010), eM_written_src,
dxbc::Src::LU(eM_written_base << eM_offsets[j]));
a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kYYYY));
// Write the element of the needed size.
DxbcSrc eM_src(DxbcSrc::R(eM_temps[j]));
DxbcOpSwitch(element_size_src);
dxbc::Src eM_src(dxbc::Src::R(eM_temps[j]));
a_.OpSwitch(element_size_src);
for (uint32_t k = 1; k <= 4; k <<= 1) {
DxbcOpCase(DxbcSrc::LU(k * 4));
a_.OpCase(dxbc::Src::LU(k * 4));
if (uav_index_shared_memory_ == kBindingIndexUnallocated) {
uav_index_shared_memory_ = uav_count_++;
}
DxbcOpStoreRaw(
DxbcDest::U(uav_index_shared_memory_,
uint32_t(UAVRegister::kSharedMemory), (1 << k) - 1),
a_.OpStoreRaw(
dxbc::Dest::U(uav_index_shared_memory_,
uint32_t(UAVRegister::kSharedMemory), (1 << k) - 1),
address_src, eM_src);
DxbcOpBreak();
a_.OpBreak();
}
DxbcOpEndSwitch();
DxbcOpEndIf();
a_.OpEndSwitch();
a_.OpEndIf();
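// Note on the switch above: k is the element size in dwords (1, 2 or 4), so
// the cases are 4, 8 and 16 bytes, and (1 << k) - 1 produces the matching x,
// xy and xyzw write masks.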
}
// control_temp.y is now free.
}
// Close the memexport possibility check.
DxbcOpEndIf();
// Close the inner memexport possibility conditional.
if (inner_condition_provided) {
a_.OpEndIf();
}
// Close the outer memexport possibility conditional.
a_.OpEndIf();
// Release control_temp.
PopSystemTemp();

File diff suppressed because it is too large


View File

@ -40,63 +40,9 @@ DEFINE_bool(
"be fully covered when MSAA is used with fullscreen passes.",
"GPU");
DEFINE_string(
depth_float24_conversion, "",
"Method for converting 32-bit Z values to 20e4 floating point when using "
"host depth buffers without native 20e4 support (when not using rasterizer-"
"ordered views / fragment shader interlocks to perform depth testing "
"manually).\n"
"Use: [any, on_copy, truncate, round]\n"
" on_copy:\n"
" Do depth testing at host precision, converting when copying between "
"host depth buffers and the EDRAM buffer to support reinterpretation, "
"maintaining two copies, in both host and 20e4 formats, for reloading data "
"to host depth buffers when it wasn't overwritten.\n"
" + Highest performance, allows early depth test and writing.\n"
" + Host MSAA is possible with pixel-rate shading where supported.\n"
" - EDRAM > RAM > EDRAM depth buffer round trip done in certain games "
"(such as GTA IV) destroys precision irreparably, causing artifacts if "
"another rendering pass is done after the EDRAM reupload.\n"
" truncate:\n"
" Convert to 20e4 directly in pixel shaders, always rounding down.\n"
" + Good performance, conservative early depth test is possible.\n"
" + No precision loss when anything changes in the storage of the depth "
"buffer, EDRAM > RAM > EDRAM copying preserves precision.\n"
" - Rounding mode is incorrect, sometimes giving results smaller than "
"they should be - may cause inaccuracy especially in edge cases when the "
"game wants to write an exact value.\n"
" - Host MSAA is only possible at SSAA speed, with per-sample shading.\n"
" round:\n"
" Convert to 20e4 directly in pixel shaders, correctly rounding to the "
"nearest even.\n"
" + Highest accuracy.\n"
" - Significantly limited performance, early depth test is not possible.\n"
" - Host MSAA is only possible at SSAA speed, with per-sample shading.\n"
" Any other value:\n"
" Choose what is considered the most optimal (currently \"on_copy\").",
"GPU");
DEFINE_int32(query_occlusion_fake_sample_count, 1000,
"If set to -1, no sample counts are written and games may hang. "
"Otherwise, the sample count of every tile will be incremented "
"by this number on every EVENT_WRITE_ZPD. Setting this to 0 "
"means everything is reported as occluded.",
"GPU");
namespace xe {
namespace gpu {
namespace flags {
DepthFloat24Conversion GetDepthFloat24Conversion() {
if (cvars::depth_float24_conversion == "truncate") {
return DepthFloat24Conversion::kOnOutputTruncating;
}
if (cvars::depth_float24_conversion == "round") {
return DepthFloat24Conversion::kOnOutputRounding;
}
return DepthFloat24Conversion::kOnCopy;
}
} // namespace flags
} // namespace gpu
} // namespace xe

View File

@ -22,69 +22,6 @@ DECLARE_bool(gpu_allow_invalid_fetch_constants);
DECLARE_bool(half_pixel_offset);
DECLARE_string(depth_float24_conversion);
DECLARE_int32(query_occlusion_fake_sample_count);
namespace xe {
namespace gpu {
namespace flags {
enum class DepthFloat24Conversion {
// Doing depth test at the host precision, converting to 20e4 to support
// reinterpretation, but keeping a separate EDRAM view containing depth values
// in the host format. When copying from the EDRAM buffer to host depth
// buffers, writing the stored host pixel if stored_f24 == to_f24(stored_host)
// (otherwise it was overwritten by something else, like clearing, or a color
// buffer; this is inexact though, and will incorrectly load pixels that were
// overwritten by something else in the EDRAM, but turned out to have the same
// value on the guest as before - an outdated host-precision value will be
// loaded in these cases instead).
//
// EDRAM > RAM, then reusing the EDRAM region for something else > EDRAM round
// trip destroys precision beyond repair.
//
// Full host early Z and MSAA with pixel-rate shading are supported.
kOnCopy,
// Converting the depth to the closest host value representable exactly as a
// 20e4 float in pixel shaders, to support invariance in cases when the guest
// reuploads a previously resolved depth buffer to the EDRAM, rounding towards
// zero (which contradicts the rounding used by the Direct3D 9 reference
// rasterizer, but allows less-than-or-equal pixel shader depth output to be
// used to preserve most of early Z culling when the game is using reversed
// depth, which is the usual way of doing depth testing on the Xbox 360 and of
// utilizing the advantages of a floating-point encoding).
//
// With MSAA, pixel shaders must run at sample frequency - otherwise, if the
// depth is the same for the entire pixel, intersections of polygons cannot be
// antialiased.
//
// Important usage note: When using this mode, bounds of the fixed-function
// viewport must be converted to and back from float24 too (preferably using
// correct rounding to the nearest even, to reduce the error already caused by
// truncation rather than to amplify it). This ensures that clamping to the
// viewport bounds, which happens after the pixel shader even if it overwrites
// the resulting depth, is never done to a value not representable as float24
// (for example, if the minimum Z is a number too small to be represented as
// float24, but not zero, it won't be possible to write what should become
// 0x000000 to the depth buffer). Note that this may add some error to the
// depth values from the rasterizer; however, modifying Z in the vertex shader
// to make interpolated depth values would cause clipping to be done to
// different bounds, which may be more undesirable, especially in cases when Z
// is explicitly set to a value like 0 or W (in such cases, the adjusted
// polygon may go outside 0...W in clip space and disappear).
kOnOutputTruncating,
// Similar to kOnOutputTruncating, but rounding to the nearest even, which is
// more correct; however, because the resulting depth can be bigger than the
// original host value, early depth testing can't be used at all. The same
// viewport usage rules apply.
kOnOutputRounding,
};
DepthFloat24Conversion GetDepthFloat24Conversion();
} // namespace flags
} // namespace gpu
} // namespace xe
#endif // XENIA_GPU_GPU_FLAGS_H_

View File

@ -221,13 +221,13 @@ void GraphicsSystem::WriteRegister(uint32_t addr, uint32_t value) {
register_file_.values[r].u32 = value;
}
void GraphicsSystem::InitializeRingBuffer(uint32_t ptr, uint32_t log2_size) {
command_processor_->InitializeRingBuffer(ptr, log2_size + 0x3);
void GraphicsSystem::InitializeRingBuffer(uint32_t ptr, uint32_t size_log2) {
command_processor_->InitializeRingBuffer(ptr, size_log2);
}
void GraphicsSystem::EnableReadPointerWriteBack(uint32_t ptr,
uint32_t block_size) {
command_processor_->EnableReadPointerWriteBack(ptr, block_size);
uint32_t block_size_log2) {
command_processor_->EnableReadPointerWriteBack(ptr, block_size_log2);
}
void GraphicsSystem::SetInterruptCallback(uint32_t callback,

View File

@ -55,8 +55,9 @@ class GraphicsSystem {
return command_processor_.get();
}
virtual void InitializeRingBuffer(uint32_t ptr, uint32_t log2_size);
virtual void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size);
virtual void InitializeRingBuffer(uint32_t ptr, uint32_t size_log2);
virtual void EnableReadPointerWriteBack(uint32_t ptr,
uint32_t block_size_log2);
virtual void SetInterruptCallback(uint32_t callback, uint32_t user_data);
void DispatchInterruptCallback(uint32_t source, uint32_t cpu);

File diff suppressed because it is too large

View File

@ -0,0 +1,869 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2021 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_GPU_PRIMITIVE_PROCESSOR_H_
#define XENIA_GPU_PRIMITIVE_PROCESSOR_H_
#include <climits>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <deque>
#include <functional>
#include <mutex>
#include <unordered_map>
#include <utility>
#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/base/math.h"
#include "xenia/base/mutex.h"
#include "xenia/base/platform.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/shared_memory.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"
#include "xenia/memory.h"
#if XE_ARCH_AMD64
// 128-bit SSSE3-level (SSE2+ for integer comparison, SSSE3 for pshufb) or AVX
// (256-bit AVX only got integer operations such as comparison in AVX2, which is
// above the minimum requirements of Xenia).
#include <tmmintrin.h>
#define XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE 16
#elif XE_ARCH_ARM64
#include <arm_neon.h>
#define XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE 16
#else
#define XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE 0
#endif // XE_ARCH
// The idea behind this config variable is to force both indirection without
// primitive reset and pre-masking / pre-swapping with primitive reset,
// therefore this is supposed to be checked only by the host if it supports
// indirection. It's pretty pointless to do only half of this on backends that
// support full 32-bit indices unconditionally.
DECLARE_bool(ignore_32bit_vertex_index_support);
namespace xe {
namespace gpu {
// Normalizes primitive data in various ways for use with Direct3D 12 and Vulkan
// (down to its minimum requirements plus the portability subset).
//
// This solves various issues:
// - Triangle fans not supported on Direct3D 10+ and the Vulkan portability
// subset.
// - Converts to triangle lists, both with and without primitive reset.
// - Line loops are not supported on Direct3D 12 or Vulkan.
// - Converts to line strips.
// - Quads not reproducible with line lists with adjacency without geometry
// shaders (some Vulkan implementations), as well as being hard to debug in
// PIX due to "catastrophic failures".
// - Converts to triangle lists.
// - Vulkan requiring the 0xFFFF primitive restart index for 16-bit indices
// and 0xFFFFFFFF for 32-bit ones (Direct3D 12 slightly relaxes this, allowing
// 0xFFFF for 32-bit as well, but that's of no use to Xenia since guest
// indices are usually big-endian). Also, only the lower 24 bits of the vertex
// index are used on the guest (tested on an Adreno 200 phone with drawing,
// though not with primitive restart, as OpenGL ES 2.0 doesn't expose it), so
// the upper 8 bits likely have no effect on primitive restart (a guest reset
// index of 0xFFFFFF likely matching 0xFFFFFF, 0xFFFFFFFF, and 254 more
// values), while Vulkan and Direct3D 12 require exactly 0xFFFFFFFF.
// - For 16-bit indices with guest reset index other than 0xFFFF (passing
// 0xFFFF directly to the host is fine because it's the same irrespective of
// endianness), there are two possible solutions:
// - If the index buffer doesn't contain 0xFFFF otherwise (since it's a
// valid vertex index in this case), replacing the primitive reset
// index with 0xFFFF in the 16-bit buffer.
// - If the index buffer contains any usage of 0xFFFF as a real vertex
// index, converting the index buffer to 32-bit, and replacing the
// primitive reset index with 0xFFFFFFFF.
// - For 32-bit indices, there are two paths:
// - If the guest reset index is 0xFFFFFF, and the index buffer actually
// uses only 0xFFFFFFFF for reset, using it without changes.
// - If the guest uses something other than 0xFFFFFFFF for primitive reset,
// replacing elements where (index & 0xFFFFFF) == reset_index with
// 0xFFFFFFFF.
// - Some Vulkan implementations only support 24-bit indices. The guests usually
// pass big-endian indices, so we need all 32 bits (as the least significant
// bits will be in 24...31) to perform the byte swapping. For this reason, we
// load 32-bit indices indirectly, doing non-indexed draws and fetching the
// indices from the shared memory. This, however, is not compatible with
// primitive restart.
// - Pre-swapping, masking to 24 bits, and converting the reset index to
// 0xFFFFFFFF, resulting in an index buffer that can be used directly.
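// As a minimal illustration of the 32-bit reset rule above (hypothetical
// helper, endianness ignored for clarity - the real code works on
// guest-endian data): since the guest checks only the low 24 bits, any index
// matching the reset index there must become the exact 0xFFFFFFFF the host
// APIs require.
//   uint32_t NormalizeResetIndex32(uint32_t index, uint32_t reset_index) {
//     return (index & 0xFFFFFF) == (reset_index & 0xFFFFFF) ? 0xFFFFFFFF
//                                                           : index;
//   }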
class PrimitiveProcessor {
public:
enum ProcessedIndexBufferType {
// Auto-indexed on the host.
kNone,
// GPU DMA, from the shared memory.
// For 32-bit, indirection is needed if the host only supports 24-bit
// indices (even for non-endian-swapped, as the GPU should be ignoring the
// upper 8 bits completely, rather than exhibiting undefined behavior.
kGuest,
// Converted and stored in the primitive converter for the current draw
// command. For 32-bit indices, if the host doesn't support all 32 bits,
// this kind of an index buffer will always be pre-masked and pre-swapped.
kHostConverted,
// Auto-indexed on the guest, but with an adapter index buffer on the host.
kHostBuiltin,
};
struct ProcessingResult {
xenos::PrimitiveType guest_primitive_type;
xenos::PrimitiveType host_primitive_type;
// Includes whether tessellation is enabled (not kVertex) and the type of
// tessellation.
Shader::HostVertexShaderType host_vertex_shader_type;
// Only used for non-kVertex host_vertex_shader_type. For kAdaptive, the
// index buffer is always from the guest and fully 32-bit, and contains the
// floating-point tessellation factors.
xenos::TessellationMode tessellation_mode;
// TODO(Triang3l): If important, split into the index count and the actual
// index buffer size, using zeros for out-of-bounds indices.
uint32_t host_draw_vertex_count;
uint32_t line_loop_closing_index;
ProcessedIndexBufferType index_buffer_type;
uint32_t guest_index_base;
xenos::IndexFormat host_index_format;
xenos::Endian host_index_endian;
// The reset index, if enabled, is always 0xFFFF for host_index_format
// kInt16 and 0xFFFFFFFF for kInt32.
bool host_primitive_reset_enabled;
// Backend-specific handle for the index buffer valid for the current draw,
// only valid for index_buffer_type kHostConverted and kHostBuiltin.
size_t host_index_buffer_handle;
bool IsTessellated() const {
return host_vertex_shader_type != Shader::HostVertexShaderType::kVertex;
}
};
virtual ~PrimitiveProcessor();
bool AreFull32BitVertexIndicesUsed() const {
return full_32bit_vertex_indices_used_;
}
bool IsConvertingTriangleFansToLists() const {
return convert_triangle_fans_to_lists_;
}
bool IsConvertingLineLoopsToStrips() const {
return convert_line_loops_to_strips_;
}
// Quad lists may be emulated as line lists with adjacency and a geometry
// shader, but geometry shaders must be supported for this.
bool IsConvertingQuadListsToTriangleLists() const {
return convert_quad_lists_to_triangle_lists_;
}
// Submission must be open to call (may request the index buffer in the shared
// memory).
bool Process(ProcessingResult& result_out);
// Invalidates the cache within the range.
std::pair<uint32_t, uint32_t> MemoryInvalidationCallback(
uint32_t physical_address_start, uint32_t length, bool exact_range);
protected:
// For host-side index buffer creation, the biggest possibly needed contiguous
// allocation, in indices.
// - No conversion: up to 0xFFFF vertices (as the vertex count in
// VGT_DRAW_INITIATOR is 16-bit).
// - Triangle fans to lists: since the 3rd vertex, every guest vertex creates
// a triangle, thus the maximum is 3 * (UINT16_MAX - 2), or 0x2FFF7.
// Primitive reset can only slow down the amplification - the 3 vertices
// after a reset add 1 host vertex each, not 3 each.
// - Line loops to strips: adding 1 vertex if there are at least 2 vertices in
// the original primitive, either replacing the primitive reset index with
// this new closing vertex, or in case of the final primitive, just adding a
// vertex - thus the absolute limit is UINT16_MAX + 1, or 0x10000.
// - Quad lists to triangle lists: vertices are processed in groups of 4, each
// group converted to 6 vertices, so the limit is 1.5 * 0xFFFC, or 0x17FFA.
// Thus, the maximum vertex count is defined by triangle fan to list
// conversion.
// Also include padding for co-alignment of the source and the destination for
// SIMD.
static constexpr uint32_t kMinRequiredConvertedIndexBufferSize =
sizeof(uint32_t) * (UINT16_MAX - 2) * 3 +
XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE;
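// These figures can be sanity-checked at compile time (illustrative
// assertions, mirroring the arithmetic in the comment above):
//   static_assert((UINT16_MAX - 2) * 3 == 0x2FFF7, "triangle fans");
//   static_assert(UINT16_MAX + 1 == 0x10000, "line loops");
//   static_assert(0xFFFC / 4 * 6 == 0x17FFA, "quad lists");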
PrimitiveProcessor(const RegisterFile& register_file, Memory& memory,
TraceWriter& trace_writer, SharedMemory& shared_memory)
: register_file_(register_file),
memory_(memory),
trace_writer_(trace_writer),
shared_memory_(shared_memory) {}
// Call from the backend-specific initialization function.
// - full_32bit_vertex_indices_supported:
// - If the backend supports 32-bit indices unconditionally, and doesn't
// generate indirection logic in vertex shaders, pass hard-coded `true`.
// - Otherwise:
// - If the host doesn't support full 32-bit indices (but supports at
// least 24-bit indices), pass `false`.
// - If the host supports 32-bit indices, but the backend can handle both
// cases, pass `cvars::ignore_32bit_vertex_index_support`, and
// afterwards, check `AreFull32BitVertexIndicesUsed()` externally to see
// if indirection may be needed.
// - When full 32-bit indices are not supported, the host must be using
// auto-indexed draws for 32-bit indices of ProcessedIndexBufferType
// kGuest, while fetching the index data manually from the shared memory
// buffer and endian-swapping it.
// - Indirection, however, precludes primitive reset usage - so if
// primitive reset is needed, the primitive processor will pre-swap and
// pre-mask the index buffer so there are only host-endian 0x00###### or
// 0xFFFFFFFF values in it. In this case, a kHostConverted index buffer
// is returned from Process, and indirection is not needed (and
// impossible since the index buffer is not in the shared memory buffer
// anymore), though byte swap is still needed as 16-bit indices may also
// be kHostConverted, while they are completely unaffected by this. The
// same applies to primitive type conversion - if it happens for 32-bit
// guest indices, and kHostConverted is returned, they will be
// pre-swapped and pre-masked.
// - triangle_fans_supported, line_loops_supported, quad_lists_supported:
// - Pass true or false depending on whether the host actually supports
// those guest primitive types directly or through geometry shader
// emulation. Debug overriding will be resolved in the common code if
// needed.
bool InitializeCommon(bool full_32bit_vertex_indices_supported,
bool triangle_fans_supported, bool line_loops_supported,
bool quad_lists_supported);
// If any primitive type conversion is needed for auto-indexed draws, called
// from InitializeCommon (thus only once in the primitive processor's
// lifetime) to set up the backend's index buffer containing indices for
// primitive type remapping. The backend must allocate a `sizeof(uint16_t) *
// index_count` buffer and call fill_callback for its mapping if creation is
// successful. 16-bit indices are enough even if the backend has primitive
// reset enabled all the time (Metal) as auto-indexed draws are limited to
// UINT16_MAX vertices, not UINT16_MAX + 1.
virtual bool InitializeBuiltin16BitIndexBuffer(
uint32_t index_count, std::function<void(uint16_t*)> fill_callback) = 0;
// Call last in implementation-specific shutdown, also callable from the
// destructor.
void ShutdownCommon();
// Call at boundaries of lifespans of converted data (between frames,
// preferably in the end of a frame so between the swap and the next draw,
// access violation handlers need to do less work).
void ClearPerFrameCache();
static constexpr size_t GetBuiltinIndexBufferOffsetBytes(size_t handle) {
// For simplicity, just using the handles as byte offsets.
return handle;
}
// The destination allocation must have XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
// excess bytes.
static ptrdiff_t GetSimdCoalignmentOffset(const void* host_index_ptr,
uint32_t guest_index_base) {
#if XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
// Always moving the host pointer only forward into the allocation padding
// space of XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE bytes. Without relying on
// two's complement wrapping overflow behavior, the logic would look like:
// uintptr_t host_subalignment =
// reinterpret_cast<uintptr_t>(host_index_ptr) &
// (XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1);
// uint32_t guest_subalignment = guest_index_base &
// (XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1);
// uintptr_t host_index_address_aligned = host_index_address;
// if (guest_subalignment >= host_subalignment) {
// return guest_subalignment - host_subalignment;
// }
// return XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE -
// (host_subalignment - guest_subalignment);
return ptrdiff_t(
(guest_index_base - reinterpret_cast<uintptr_t>(host_index_ptr)) &
(XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1));
#else
return 0;
#endif
}
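// Worked example (hypothetical values): with 16-byte vectors, a destination
// pointer ending in ...4 and a guest_index_base ending in ...C give
// (0xC - 0x4) & 0xF = 8, so writing at host_index_ptr + 8 makes the two
// addresses congruent modulo 16 - aligned loads of whole guest vectors then
// line up with unaligned stores at the same relative offsets.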
// Requests a buffer to write the new transformed indices to. The lifetime of
// the returned buffer must be that of the current frame. Returns the mapping
// of the buffer to write to, or nullptr in case of failure, in addition to,
// if successful, a handle that can be used by the backend's command processor
// to access the backend-specific data for binding the buffer.
virtual void* RequestHostConvertedIndexBufferForCurrentFrame(
xenos::IndexFormat format, uint32_t index_count, bool coalign_for_simd,
uint32_t coalignment_original_address, size_t& backend_handle_out) = 0;
private:
#if XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
#if XE_ARCH_AMD64
// SSSE3 or AVX.
using SimdVectorU16 = __m128i;
using SimdVectorU32 = __m128i;
static SimdVectorU16 ReplicateU16(uint16_t value) {
return _mm_set1_epi16(int16_t(value));
}
static SimdVectorU32 ReplicateU32(uint32_t value) {
return _mm_set1_epi32(int32_t(value));
}
static SimdVectorU16 LoadAlignedVectorU16(const uint16_t* source) {
return _mm_load_si128(reinterpret_cast<const __m128i*>(source));
}
static SimdVectorU32 LoadAlignedVectorU32(const uint32_t* source) {
return _mm_load_si128(reinterpret_cast<const __m128i*>(source));
}
static void StoreUnalignedVectorU16(uint16_t* dest, SimdVectorU16 source) {
_mm_storeu_si128(reinterpret_cast<__m128i*>(dest), source);
}
static void StoreUnalignedVectorU32(uint32_t* dest, SimdVectorU32 source) {
_mm_storeu_si128(reinterpret_cast<__m128i*>(dest), source);
}
#elif XE_ARCH_ARM64
// NEON.
using SimdVectorU16 = uint16x8_t;
using SimdVectorU32 = uint32x4_t;
static SimdVectorU16 ReplicateU16(uint16_t value) {
return vdupq_n_u16(value);
}
static SimdVectorU32 ReplicateU32(uint32_t value) {
return vdupq_n_u32(value);
}
static SimdVectorU16 LoadAlignedVectorU16(const uint16_t* source) {
#if XE_COMPILER_MSVC
return vld1q_u16_ex(source, sizeof(uint16x8_t) * CHAR_BIT);
#else
return vld1q_u16(reinterpret_cast<const uint16_t*>(
__builtin_assume_aligned(source, sizeof(uint16x8_t))));
#endif
}
static SimdVectorU32 LoadAlignedVectorU32(const uint32_t* source) {
#if XE_COMPILER_MSVC
return vld1q_u32_ex(source, sizeof(uint32x4_t) * CHAR_BIT);
#else
return vld1q_u32(reinterpret_cast<const uint32_t*>(
__builtin_assume_aligned(source, sizeof(uint32x4_t))));
#endif
}
static void StoreUnalignedVectorU16(uint16_t* dest, SimdVectorU16 source) {
vst1q_u16(dest, source);
}
static void StoreUnalignedVectorU32(uint32_t* dest, SimdVectorU32 source) {
vst1q_u32(dest, source);
}
#else
#error SIMD vector types and constant loads not specified.
#endif // XE_ARCH
static_assert(
sizeof(SimdVectorU16) == XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE,
"XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE must reflect the vector size "
"actually used");
static_assert(
sizeof(SimdVectorU32) == XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE,
"XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE must reflect the vector size "
"actually used");
static constexpr uint32_t kSimdVectorU16Elements =
sizeof(SimdVectorU16) / sizeof(uint16_t);
static constexpr uint32_t kSimdVectorU32Elements =
sizeof(SimdVectorU32) / sizeof(uint32_t);
#endif // XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
static bool IsResetUsed(const uint16_t* source, uint32_t count,
uint16_t reset_index_guest_endian);
static void Get16BitResetIndexUsage(const uint16_t* source, uint32_t count,
uint16_t reset_index_guest_endian,
bool& is_reset_index_used_out,
bool& is_ffff_used_as_vertex_index_out);
static bool IsResetUsed(const uint32_t* source, uint32_t count,
uint32_t reset_index_guest_endian,
uint32_t low_bits_mask_guest_endian);
static void ReplaceResetIndex16To16(uint16_t* dest, const uint16_t* source,
uint32_t count,
uint16_t reset_index_guest_endian);
// For use when the reset index is not 0xFFFF, and 0xFFFF is also used as a
// valid index - keeps 0xFFFF as a real index and replaces the reset index
// with 0xFFFFFFFF instead.
static void ReplaceResetIndex16To24(uint32_t* dest, const uint16_t* source,
uint32_t count,
uint16_t reset_index_guest_endian);
// The reset index and the low 24 bits mask are taken explicitly because this
// function may be used two ways:
// - Passthrough - when the vertex shader swaps the indices (when 32-bit
// indices are supported on the host), in this case HostSwap is kNone, but
// the reset index and the guest low bits mask can be swapped according to
// the guest endian.
// - Swapping for the host - when only 24 bits of an index are supported on
// the host. In this case, masking and comparison are done before applying
// HostSwap, but according to HostSwap, if needed, the data is swapped from
// the PowerPC's big endianness to the host GPU little endianness that we
// assume, which matches the Xenos's little endianness.
template <xenos::Endian HostSwap>
static void ReplaceResetIndex32To24(uint32_t* dest, const uint32_t* source,
uint32_t count,
uint32_t reset_index_guest_endian,
uint32_t low_bits_mask_guest_endian) {
// The Xbox 360's GPU only uses the low 24 bits of the index - masking.
#if XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
while (count && (reinterpret_cast<uintptr_t>(source) &
(XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1))) {
--count;
uint32_t index = *(source++) & low_bits_mask_guest_endian;
*(dest++) = index != reset_index_guest_endian
? xenos::GpuSwap(index, HostSwap)
: UINT32_MAX;
}
if (count >= kSimdVectorU32Elements) {
SimdVectorU32 reset_index_guest_endian_simd =
ReplicateU32(reset_index_guest_endian);
SimdVectorU32 low_bits_mask_guest_endian_simd =
ReplicateU32(low_bits_mask_guest_endian);
#if XE_ARCH_AMD64
__m128i host_swap_shuffle;
if constexpr (HostSwap != xenos::Endian::kNone) {
host_swap_shuffle = _mm_set_epi32(
int32_t(xenos::GpuSwap(uint32_t(0x0F0E0D0C), HostSwap)),
int32_t(xenos::GpuSwap(uint32_t(0x0B0A0908), HostSwap)),
int32_t(xenos::GpuSwap(uint32_t(0x07060504), HostSwap)),
int32_t(xenos::GpuSwap(uint32_t(0x03020100), HostSwap)));
}
#endif // XE_ARCH_AMD64
while (count >= kSimdVectorU32Elements) {
count -= kSimdVectorU32Elements;
// Comparison produces 0 or 0xFFFFFFFF per lane on AVX and Neon - we need
// 0xFFFFFFFF as the result for the primitive reset indices, so the result
// is `index | (index == reset_index)`.
SimdVectorU32 source_simd = LoadAlignedVectorU32(source);
source += kSimdVectorU32Elements;
SimdVectorU32 result_simd;
#if XE_ARCH_AMD64
source_simd =
_mm_and_si128(source_simd, low_bits_mask_guest_endian_simd);
result_simd = _mm_or_si128(
source_simd,
_mm_cmpeq_epi32(source_simd, reset_index_guest_endian_simd));
if constexpr (HostSwap != xenos::Endian::kNone) {
result_simd = _mm_shuffle_epi8(result_simd, host_swap_shuffle);
}
#elif XE_ARCH_ARM64
source_simd = vandq_u32(source_simd, low_bits_mask_guest_endian_simd);
result_simd = vorrq_u32(
source_simd, vceqq_u32(source_simd, reset_index_guest_endian_simd));
if constexpr (HostSwap == xenos::Endian::k8in16) {
result_simd = vreinterpretq_u32_u8(
vrev16q_u8(vreinterpretq_u8_u32(result_simd)));
} else if constexpr (HostSwap == xenos::Endian::k8in32) {
result_simd = vreinterpretq_u32_u8(
vrev32q_u8(vreinterpretq_u8_u32(result_simd)));
} else if constexpr (HostSwap == xenos::Endian::k16in32) {
result_simd = vreinterpretq_u32_u16(
vrev32q_u16(vreinterpretq_u16_u32(result_simd)));
}
#else
#error SIMD ReplaceResetIndex32To24 not implemented.
#endif // XE_ARCH
StoreUnalignedVectorU32(dest, result_simd);
dest += kSimdVectorU32Elements;
}
}
#endif // XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
while (count--) {
uint32_t index = *(source++) & low_bits_mask_guest_endian;
*(dest++) = index != reset_index_guest_endian
? xenos::GpuSwap(index, HostSwap)
: UINT32_MAX;
}
}
// TODO(Triang3l): 16-bit > 32-bit primitive type conversion for Metal, where
// primitive reset is always enabled, if UINT16_MAX is used as a real vertex
// index.
struct PassthroughIndexTransform {
uint16_t operator()(uint16_t index) const { return index; }
uint32_t operator()(uint32_t index) const { return index; }
};
struct To24NonSwappingIndexTransform {
uint32_t operator()(uint32_t index) const {
return index & xenos::kVertexIndexMask;
}
};
struct To24Swapping8In16IndexTransform {
uint32_t operator()(uint32_t index) const {
return xenos::GpuSwap(index, xenos::Endian::k8in16) &
xenos::kVertexIndexMask;
}
};
struct To24Swapping8In32IndexTransform {
uint32_t operator()(uint32_t index) const {
return xenos::GpuSwap(index, xenos::Endian::k8in32) &
xenos::kVertexIndexMask;
}
};
struct To24Swapping16In32IndexTransform {
uint32_t operator()(uint32_t index) const {
return xenos::GpuSwap(index, xenos::Endian::k16in32) &
xenos::kVertexIndexMask;
}
};
// Triangle fans as triangle lists.
// Ordered as (v1, v2, v0), (v2, v3, v0) in Direct3D.
// https://docs.microsoft.com/en-us/windows/desktop/direct3d9/triangle-fans
static constexpr uint32_t GetTriangleFanListIndexCount(
uint32_t fan_index_count) {
return fan_index_count > 2 ? (fan_index_count - 2) * 3 : 0;
}
template <typename Index, typename IndexTransform>
static void TriangleFanToList(Index* dest, const Index* source,
uint32_t source_index_count,
const IndexTransform& index_transform) {
if (source_index_count <= 2) {
// To match GetTriangleFanListIndexCount.
return;
}
Index index_first = index_transform(source[0]);
Index index_previous = index_transform(source[1]);
for (uint32_t i = 2; i < source_index_count; ++i) {
Index index_current = index_transform(source[i]);
*(dest++) = index_previous;
*(dest++) = index_current;
*(dest++) = index_first;
index_previous = index_current;
}
}
static constexpr uint32_t GetLineLoopStripIndexCount(
uint32_t loop_index_count) {
// Even if 2 vertices are supplied, two lines are still drawn between them.
// https://www.khronos.org/opengl/wiki/Primitive
// "You get n lines for n input vertices"
// "If the user only specifies 1 vertex, the drawing command is ignored"
return loop_index_count > 1 ? loop_index_count + 1 : 0;
}
template <typename Index, typename IndexTransform>
static void LineLoopToStrip(Index* dest, const Index* source,
uint32_t source_index_count,
const IndexTransform& index_transform) {
if (source_index_count <= 1) {
// To match GetLineLoopStripIndexCount.
return;
}
Index index_first = index_transform(source[0]);
dest[0] = index_first;
for (uint32_t i = 1; i < source_index_count; ++i) {
dest[i] = index_transform(source[i]);
}
dest[source_index_count] = index_first;
}
static void LineLoopToStrip(uint16_t* dest, const uint16_t* source,
uint32_t source_index_count,
const PassthroughIndexTransform& index_transform);
static void LineLoopToStrip(uint32_t* dest, const uint32_t* source,
uint32_t source_index_count,
const PassthroughIndexTransform& index_transform);
static constexpr uint32_t GetQuadListTriangleListIndexCount(
uint32_t quad_list_index_count) {
return (quad_list_index_count / 4) * 6;
}
template <typename Index, typename IndexTransform>
static void QuadListToTriangleList(Index* dest, const Index* source,
uint32_t source_index_count,
const IndexTransform& index_transform) {
uint32_t quad_count = source_index_count / 4;
for (uint32_t i = 0; i < quad_count; ++i) {
// TODO(Triang3l): Find the correct order.
// v0, v1, v2.
Index common_index_0 = index_transform(*(source++));
*(dest++) = common_index_0;
*(dest++) = index_transform(*(source++));
Index common_index_2 = index_transform(*(source++));
*(dest++) = common_index_2;
// v0, v2, v3.
*(dest++) = common_index_0;
*(dest++) = common_index_2;
*(dest++) = index_transform(*(source++));
}
}
// Pre-gathering the ranges allows for usage of the same functions for
// conversion with and without reset. In addition, this increases safety in
// weird cases - there won't be mismatch between the pre-calculation of the
// post-conversion index count and the actual conversion if the game for some
// reason modifies the index buffer between the two and adds or removes reset
// indices in it.
struct SinglePrimitiveRange {
SinglePrimitiveRange(uint32_t guest_offset, uint32_t guest_index_count,
uint32_t host_index_count)
: guest_offset(guest_offset),
guest_index_count(guest_index_count),
host_index_count(host_index_count) {}
uint32_t guest_offset;
uint32_t guest_index_count;
uint32_t host_index_count;
};
static uint32_t GetMultiPrimitiveHostIndexCountAndRanges(
std::function<uint32_t(uint32_t)> single_primitive_guest_to_host_count,
const uint16_t* source, uint32_t source_index_count,
uint16_t reset_index_guest_endian,
std::deque<SinglePrimitiveRange>& ranges_append_out);
static uint32_t GetMultiPrimitiveHostIndexCountAndRanges(
std::function<uint32_t(uint32_t)> single_primitive_guest_to_host_count,
const uint32_t* source, uint32_t source_index_count,
uint32_t reset_index_guest_endian, uint32_t low_bits_mask_guest_endian,
std::deque<SinglePrimitiveRange>& ranges_append_out);
template <typename Index, typename IndexTransform,
typename PrimitiveRangeIterator>
static void ConvertSinglePrimitiveRanges(
Index* dest, const Index* source,
xenos::PrimitiveType source_primitive_type,
const IndexTransform& index_transform,
PrimitiveRangeIterator ranges_beginning,
PrimitiveRangeIterator ranges_end) {
Index* dest_write_ptr = dest;
switch (source_primitive_type) {
case xenos::PrimitiveType::kTriangleFan:
for (PrimitiveRangeIterator range_it = ranges_beginning;
range_it != ranges_end; ++range_it) {
TriangleFanToList(dest_write_ptr, source + range_it->guest_offset,
range_it->guest_index_count, index_transform);
dest_write_ptr += range_it->host_index_count;
}
break;
case xenos::PrimitiveType::kLineLoop:
for (PrimitiveRangeIterator range_it = ranges_beginning;
range_it != ranges_end; ++range_it) {
LineLoopToStrip(dest_write_ptr, source + range_it->guest_offset,
range_it->guest_index_count, index_transform);
dest_write_ptr += range_it->host_index_count;
}
break;
case xenos::PrimitiveType::kQuadList:
for (PrimitiveRangeIterator range_it = ranges_beginning;
range_it != ranges_end; ++range_it) {
QuadListToTriangleList(dest_write_ptr,
source + range_it->guest_offset,
range_it->guest_index_count, index_transform);
dest_write_ptr += range_it->host_index_count;
}
break;
default:
assert_unhandled_case(source_primitive_type);
}
}
const RegisterFile& register_file_;
Memory& memory_;
TraceWriter& trace_writer_;
SharedMemory& shared_memory_;
bool full_32bit_vertex_indices_used_ = false;
bool convert_triangle_fans_to_lists_ = false;
bool convert_line_loops_to_strips_ = false;
bool convert_quad_lists_to_triangle_lists_ = false;
// Byte offsets used, for simplicity, directly as handles.
size_t builtin_ib_offset_triangle_fans_to_lists_ = SIZE_MAX;
size_t builtin_ib_offset_quad_lists_to_triangle_lists_ = SIZE_MAX;
std::deque<SinglePrimitiveRange> single_primitive_ranges_;
// Caching for reuse of converted indices within a frame.
// The bucket size is 256 KB: the largest possible guest index buffer - 0xFFFF
// 32-bit indices, or 262140 bytes - is slightly smaller than 256 KB, thus
// cache entries need to store links within at most 2 buckets.
static constexpr uint32_t kCacheBucketSizeBytesLog2 = 18;
static constexpr uint32_t kCacheBucketSizeBytes =
uint32_t(1) << kCacheBucketSizeBytesLog2;
static constexpr uint32_t kCacheBucketCount =
xe::align(SharedMemory::kBufferSize, kCacheBucketSizeBytes) /
kCacheBucketSizeBytes;
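// For example, with the 512 MB physical address space as the shared memory
// buffer size, that works out to 512 MB / 256 KB = 2048 buckets.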
union CacheKey {
struct {
uint32_t base; // 32 total
uint32_t count : 16; // 48
xenos::IndexFormat format : 1; // 49
xenos::Endian endian : 2; // 51
uint32_t is_reset_enabled : 1; // 52
// kNone if not changing the type (like only processing the reset index).
xenos::PrimitiveType conversion_guest_primitive_type : 6; // 58
};
uint64_t key = 0;
CacheKey() = default;
CacheKey(uint32_t base, uint32_t count, xenos::IndexFormat format,
xenos::Endian endian, bool is_reset_enabled,
xenos::PrimitiveType conversion_guest_primitive_type =
xenos::PrimitiveType::kNone)
: base(base),
count(count),
format(format),
endian(endian),
is_reset_enabled(is_reset_enabled),
conversion_guest_primitive_type(conversion_guest_primitive_type) {}
struct Hasher {
size_t operator()(const CacheKey& key) const {
return std::hash<uint64_t>{}(key.key);
}
};
bool operator==(const CacheKey& other_key) const {
return key == other_key.key;
}
uint32_t GetSizeBytes() const {
return count * (format == xenos::IndexFormat::kInt16 ? sizeof(uint16_t)
: sizeof(uint32_t));
}
};
// Subset of ConversionResult that can be reused for different primitive types
// if the same result is used irrespective of one (like when only processing
// the reset index).
struct CachedResult {
uint32_t host_draw_vertex_count;
ProcessedIndexBufferType index_buffer_type;
xenos::IndexFormat host_index_format;
xenos::Endian host_index_endian;
bool host_primitive_reset_enabled;
size_t host_index_buffer_handle;
};
struct CacheEntry {
static_assert(
UINT16_MAX * sizeof(uint32_t) <=
(size_t(1) << kCacheBucketSizeBytesLog2),
"Assuming that primitive processor cache entries need to store to the "
"previous and to the next entries only within up to 2 buckets, so the "
"size of the cache buckets must be not smaller than the maximum guest "
"index buffer size");
union {
size_t free_next;
size_t buckets_prev[2];
};
size_t buckets_next[2];
CacheKey key;
CachedResult result;
static uint32_t GetBucketCount(CacheKey key) {
uint32_t count =
((key.base + (key.GetSizeBytes() - 1)) >> kCacheBucketSizeBytesLog2) -
(key.base >> kCacheBucketSizeBytesLog2) + 1;
assert_true(count <= 2,
"Cache entries only store list links within two buckets");
return count;
}
uint32_t GetBucketCount() const { return GetBucketCount(key); }
};
// A cache transaction performs a few operations in a RAII-like way (so
// processing may return an error for any reason, and won't have to clean up
// cache_currently_processing_base_ / size_bytes_ explicitly):
// - Transaction initialization:
// - Lookup of previously processed indices in the cache.
// - If not found, beginning to add a new entry that is going to be
// processed:
// - Marking the range as currently being processed, for slightly safer
// race condition handling if one happens - if invalidation happens
// during the transaction (but outside a global critical region lock,
// since processing may take a long time), the new cache entry won't be
// stored as it will already be invalid at the time of the completion of
// the transaction.
// - Enabling an access callback for the range.
// - Setting the new result after processing (if not found in the cache
// previously).
// - Transaction completion:
// - If the range wasn't invalidated during the transaction, storing the new
// entry in the cache.
// If an entry was found in the cache (GetFoundResult returns non-null), it
// MUST be used instead of processing - this class doesn't provide the
// possibility to replace existing entries.
class CacheTransaction final {
public:
CacheTransaction(PrimitiveProcessor& processor, CacheKey key);
const CachedResult* GetFoundResult() const {
return result_type_ == ResultType::kExisting ? &result_ : nullptr;
}
void SetNewResult(const CachedResult& new_result) {
// Replacement of an existing entry is not allowed.
assert_true(result_type_ != ResultType::kExisting);
result_ = new_result;
result_type_ = ResultType::kNewSet;
}
~CacheTransaction();
private:
PrimitiveProcessor& processor_;
// If key_.count == 0, this transaction shouldn't do anything - for empty
// ranges it's pointless, and it's unsafe to get the end pointer without
// special logic, and count == 0 is also used as a special indicator for
// vertex count below the cache usage threshold.
CacheKey key_;
CachedResult result_;
enum class ResultType {
kNewUnset,
kNewSet,
kExisting,
};
ResultType result_type_ = ResultType::kNewUnset;
};
std::deque<CacheEntry> cache_entry_pool_;
void* memory_invalidation_callback_handle_ = nullptr;
xe::global_critical_region global_critical_region_;
// Modified by both the processor and the invalidation callback.
std::unordered_map<CacheKey, size_t, CacheKey::Hasher> cache_map_;
// The conversion is performed while the lock is released since it may take a
// long time.
// If during the conversion the region currently being converted is
// invalidated, the current entry will not be added to the cache.
// Modified by the processor, read by the invalidation callback.
uint32_t cache_currently_processing_base_ = 0;
// 0 if not in a cache transaction that hasn't found an existing entry
// currently.
uint32_t cache_currently_processing_size_bytes_ = 0;
// Modified by both the processor and the invalidation callback.
size_t cache_bucket_free_first_entry_ = SIZE_MAX;
// Modified by both the processor and the invalidation callback.
uint64_t cache_buckets_non_empty_l1_[(kCacheBucketCount + 63) / 64] = {};
// For even faster handling of memory invalidation - whether any bit is set in
// each cache_buckets_non_empty_l1_.
// Modified by both the processor and the invalidation callback.
uint64_t cache_buckets_non_empty_l2_[(kCacheBucketCount + (64 * 64 - 1)) /
(64 * 64)] = {};
// Must be called in a global critical region.
void UpdateCacheBucketsNonEmptyL2(
uint32_t bucket_index_div_64,
[[maybe_unused]] const std::unique_lock<std::recursive_mutex>&
global_lock) {
uint64_t& cache_buckets_non_empty_l2_ref =
cache_buckets_non_empty_l2_[bucket_index_div_64 >> 6];
uint64_t cache_buckets_non_empty_l2_bit = uint64_t(1)
<< (bucket_index_div_64 & 63);
if (cache_buckets_non_empty_l1_[bucket_index_div_64]) {
cache_buckets_non_empty_l2_ref |= cache_buckets_non_empty_l2_bit;
} else {
cache_buckets_non_empty_l2_ref &= ~cache_buckets_non_empty_l2_bit;
}
}
// cache_buckets_non_empty_l1_ (along with cache_buckets_non_empty_l2_, which
// must be kept in sync) is used to indicate whether each entry is non-empty,
// for faster clearing (there's no special index here for an empty entry).
// Huge, so it's the last in the class.
// Modified by both the processor and the invalidation callback.
size_t cache_bucket_first_entries_[kCacheBucketCount];
static std::pair<uint32_t, uint32_t> MemoryInvalidationCallbackThunk(
void* context_ptr, uint32_t physical_address_start, uint32_t length,
bool exact_range);
};
} // namespace gpu
} // namespace xe
#endif // XENIA_GPU_PRIMITIVE_PROCESSOR_H_

View File

@ -342,6 +342,8 @@ XE_GPU_REGISTER(0x2184, kDword, SQ_WRAPPING_1)
// These five registers are set by the command processor.
XE_GPU_REGISTER(0x21F9, kDword, VGT_EVENT_INITIATOR)
XE_GPU_REGISTER(0x21FA, kDword, VGT_DMA_BASE)
XE_GPU_REGISTER(0x21FB, kDword, VGT_DMA_SIZE)
XE_GPU_REGISTER(0x21FC, kDword, VGT_DRAW_INITIATOR)
XE_GPU_REGISTER(0x21FD, kDword, VGT_IMMED_DATA)
@ -419,6 +421,11 @@ XE_GPU_REGISTER(0x2323, kDword, RB_COPY_SURFACE_SLICE)
XE_GPU_REGISTER(0x2324, kDword, RB_SAMPLE_COUNT_CTL)
XE_GPU_REGISTER(0x2325, kDword, RB_SAMPLE_COUNT_ADDR)
// Polygon offset scales and offsets are 32-bit floating-point.
// "slope computed in subpixels (1/12 or 1/16)" - R5xx Acceleration.
// But the correct scale for conversion of the slope scale (FRONT/BACK_SCALE)
// from subpixels to pixels is likely 1/16 according to:
// https://github.com/mesa3d/mesa/blob/54ad9b444c8e73da498211870e785239ad3ff1aa/src/gallium/drivers/radeonsi/si_state.c#L946
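// For example, under that 1/16 assumption, a guest slope scale of 32.0 would
// correspond to a host slope-scaled depth bias factor of 32.0 / 16 = 2.0.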
XE_GPU_REGISTER(0x2380, kFloat, PA_SU_POLY_OFFSET_FRONT_SCALE)
XE_GPU_REGISTER(0x2381, kFloat, PA_SU_POLY_OFFSET_FRONT_OFFSET)
XE_GPU_REGISTER(0x2382, kFloat, PA_SU_POLY_OFFSET_BACK_SCALE)

View File

@ -13,12 +13,19 @@
#include <cstdint>
#include <cstdlib>
#include "xenia/base/assert.h"
#include "xenia/gpu/xenos.h"
// Most registers can be found from:
// https://github.com/UDOOboard/Kernel_Unico/blob/master/drivers/mxc/amd-gpu/include/reg/yamato/14/yamato_registers.h
// Some registers were added on Adreno specifically and are not referenced in
// game .pdb files and never set by games.
// Only 32-bit types (uint32_t, int32_t, float or enums with uint32_t / int32_t
// as the underlying type) are allowed in the bit fields here, as Visual C++
// restarts packing when a field requires different alignment than the previous
// one.
namespace xe {
namespace gpu {
@ -38,7 +45,7 @@ namespace reg {
*******************************************************************************/
union COHER_STATUS_HOST {
union alignas(uint32_t) COHER_STATUS_HOST {
struct {
uint32_t matching_contexts : 8; // +0
uint32_t rb_copy_dest_base_ena : 1; // +8
@ -60,8 +67,9 @@ union COHER_STATUS_HOST {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_COHER_STATUS_HOST;
};
static_assert_size(COHER_STATUS_HOST, sizeof(uint32_t));
union WAIT_UNTIL {
union alignas(uint32_t) WAIT_UNTIL {
struct {
uint32_t : 1; // +0
uint32_t wait_re_vsync : 1; // +1
@ -83,6 +91,7 @@ union WAIT_UNTIL {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_WAIT_UNTIL;
};
static_assert_size(WAIT_UNTIL, sizeof(uint32_t));
/*******************************************************************************
___ ___ ___ _ _ ___ _ _ ___ ___ ___
@ -92,11 +101,12 @@ union WAIT_UNTIL {
*******************************************************************************/
union SQ_PROGRAM_CNTL {
union alignas(uint32_t) SQ_PROGRAM_CNTL {
struct {
// Note from a2xx.xml:
// Only 0x3F worth of valid register values for VS_NUM_REG and PS_NUM_REG,
// but high bit is set to indicate "0 registers used".
// (Register count = (num_reg & 0x80) ? 0 : (num_reg + 1))
uint32_t vs_num_reg : 8; // +0
uint32_t ps_num_reg : 8; // +8
uint32_t vs_resource : 1; // +16
@ -111,8 +121,9 @@ union SQ_PROGRAM_CNTL {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_SQ_PROGRAM_CNTL;
};
static_assert_size(SQ_PROGRAM_CNTL, sizeof(uint32_t));
union SQ_CONTEXT_MISC {
union alignas(uint32_t) SQ_CONTEXT_MISC {
struct {
uint32_t inst_pred_optimize : 1; // +0
uint32_t sc_output_screen_xy : 1; // +1
@ -142,8 +153,9 @@ union SQ_CONTEXT_MISC {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_SQ_CONTEXT_MISC;
};
static_assert_size(SQ_CONTEXT_MISC, sizeof(uint32_t));
union SQ_INTERPOLATOR_CNTL {
union alignas(uint32_t) SQ_INTERPOLATOR_CNTL {
struct {
uint32_t param_shade : 16; // +0
// SampleLocation bits - 0 for centroid, 1 for center, if
@ -153,6 +165,7 @@ union SQ_INTERPOLATOR_CNTL {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_SQ_INTERPOLATOR_CNTL;
};
static_assert_size(SQ_INTERPOLATOR_CNTL, sizeof(uint32_t));
/*******************************************************************************
__ _____ ___ _____ _____ __
@ -172,7 +185,17 @@ union SQ_INTERPOLATOR_CNTL {
*******************************************************************************/
union VGT_DRAW_INITIATOR {
union alignas(uint32_t) VGT_DMA_SIZE {
struct {
uint32_t num_words : 24; // +0
uint32_t : 6; // +24
xenos::Endian swap_mode : 2; // +30
};
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_VGT_DMA_SIZE;
};
union alignas(uint32_t) VGT_DRAW_INITIATOR {
// Different than on A2xx and R6xx/R7xx.
struct {
xenos::PrimitiveType prim_type : 6; // +0
@ -187,22 +210,88 @@ union VGT_DRAW_INITIATOR {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_VGT_DRAW_INITIATOR;
};
static_assert_size(VGT_DRAW_INITIATOR, sizeof(uint32_t));
union VGT_OUTPUT_PATH_CNTL {
// Unlike on R6xx (but closer to R5xx), and according to the Adreno 200 header,
// the registers related to the vertex index are 24-bit. Vertex indices are
// unsigned, and only the lower 24 bits of them are actually used by the GPU -
// this has been verified on an Adreno 200 phone (LG Optimus L7) on OpenGL ES
// using a GL_UNSIGNED_INT element array buffer with junk in the upper 8 bits
// that had no effect on drawing.
// The order of operations is primitive reset index checking -> offsetting ->
// clamping.
union alignas(uint32_t) VGT_MULTI_PRIM_IB_RESET_INDX {
struct {
// The upper 8 bits of the value from the index buffer are confirmed to be
// ignored. So, though this specifically is untested (because
// GL_PRIMITIVE_RESTART_FIXED_INDEX was added only in OpenGL ES 3.0, though
// it behaves conceptually close to our expectations anyway - uses the
// 0xFFFFFFFF restart index while GL_MAX_ELEMENT_INDEX may be 0xFFFFFF),
// the restart index check likely only involves the lower 24 bits of the
// vertex index - therefore, if reset_indx is 0xFFFFFF, likely 0xFFFFFF,
// 0x1FFFFFF, 0xFFFFFFFF all cause primitive reset.
uint32_t reset_indx : 24;
};
uint32_t value;
static constexpr Register register_index =
XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX;
};
static_assert_size(VGT_MULTI_PRIM_IB_RESET_INDX, sizeof(uint32_t));
union alignas(uint32_t) VGT_INDX_OFFSET {
struct {
// Unlike R5xx's VAP_INDEX_OFFSET, which is signed 25-bit, this is 24-bit -
// and signedness doesn't matter as index calculations are done in 24-bit
// integers, and ((0xFFFFFE + 3) & 0xFFFFFF) == 1 anyway, just like
// ((0xFFFFFFFE + 3) & 0xFFFFFF) == 1 if we treated it as signed by
// sign-extending on the host. Direct3D 9 just writes BaseVertexIndex as a
// signed int32 to the entire register, but the upper 8 bits are ignored
// anyway, and that has no effect on offsets that fit in 24 bits.
uint32_t indx_offset : 24;
};
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_VGT_INDX_OFFSET;
};
static_assert_size(VGT_INDX_OFFSET, sizeof(uint32_t));
union alignas(uint32_t) VGT_MIN_VTX_INDX {
struct {
uint32_t min_indx : 24;
};
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_VGT_MIN_VTX_INDX;
};
static_assert_size(VGT_MIN_VTX_INDX, sizeof(uint32_t));
union alignas(uint32_t) VGT_MAX_VTX_INDX {
struct {
// Usually 0xFFFF or 0xFFFFFF.
uint32_t max_indx : 24;
};
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_VGT_MAX_VTX_INDX;
};
static_assert_size(VGT_MAX_VTX_INDX, sizeof(uint32_t));
union alignas(uint32_t) VGT_OUTPUT_PATH_CNTL {
struct {
xenos::VGTOutputPath path_select : 2; // +0
};
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_VGT_OUTPUT_PATH_CNTL;
};
static_assert_size(VGT_OUTPUT_PATH_CNTL, sizeof(uint32_t));
union VGT_HOS_CNTL {
union alignas(uint32_t) VGT_HOS_CNTL {
struct {
xenos::TessellationMode tess_mode : 2; // +0
};
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_VGT_HOS_CNTL;
};
static_assert_size(VGT_HOS_CNTL, sizeof(uint32_t));
/*******************************************************************************
___ ___ ___ __ __ ___ _____ _____ _____
@ -217,7 +306,7 @@ union VGT_HOS_CNTL {
*******************************************************************************/
union PA_SU_POINT_MINMAX {
union alignas(uint32_t) PA_SU_POINT_MINMAX {
struct {
// Radius, 12.4 fixed point.
uint32_t min_size : 16; // +0
@ -226,8 +315,9 @@ union PA_SU_POINT_MINMAX {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_PA_SU_POINT_MINMAX;
};
static_assert_size(PA_SU_POINT_MINMAX, sizeof(uint32_t));
union PA_SU_POINT_SIZE {
union alignas(uint32_t) PA_SU_POINT_SIZE {
struct {
// 1/2 width or height, 12.4 fixed point.
uint32_t height : 16; // +0
@ -236,14 +326,19 @@ union PA_SU_POINT_SIZE {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_PA_SU_POINT_SIZE;
};
static_assert_size(PA_SU_POINT_SIZE, sizeof(uint32_t));
// Setup Unit / Scanline Converter mode cntl
union PA_SU_SC_MODE_CNTL {
union alignas(uint32_t) PA_SU_SC_MODE_CNTL {
struct {
uint32_t cull_front : 1; // +0
uint32_t cull_back : 1; // +1
// 0 - front is CCW, 1 - front is CW.
uint32_t face : 1; // +2
uint32_t face : 1; // +2
// The game Fuse uses poly_mode 2 for triangles, which is "reserved" on R6xx
// and not defined on Adreno 2xx, but polymode_front/back_ptype are 0
// (points) in this case in Fuse, which should not be respected for
// non-kDualMode as the game wants to draw filled triangles.
xenos::PolygonModeEnable poly_mode : 2; // +3
xenos::PolygonType polymode_front_ptype : 3; // +5
xenos::PolygonType polymode_back_ptype : 3; // +8
@ -267,9 +362,10 @@ union PA_SU_SC_MODE_CNTL {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_PA_SU_SC_MODE_CNTL;
};
static_assert_size(PA_SU_SC_MODE_CNTL, sizeof(uint32_t));
// Setup Unit Vertex Control
union PA_SU_VTX_CNTL {
union alignas(uint32_t) PA_SU_VTX_CNTL {
struct {
uint32_t pix_center : 1; // +0 1 = half pixel offset (OpenGL).
uint32_t round_mode : 2; // +1
@ -278,8 +374,9 @@ union PA_SU_VTX_CNTL {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_PA_SU_VTX_CNTL;
};
static_assert_size(PA_SU_VTX_CNTL, sizeof(uint32_t));
union PA_SC_MPASS_PS_CNTL {
union alignas(uint32_t) PA_SC_MPASS_PS_CNTL {
struct {
uint32_t mpass_pix_vec_per_pass : 20; // +0
uint32_t : 11; // +20
@ -288,9 +385,10 @@ union PA_SC_MPASS_PS_CNTL {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_PA_SC_MPASS_PS_CNTL;
};
static_assert_size(PA_SC_MPASS_PS_CNTL, sizeof(uint32_t));
// Scanline converter viz query, used by D3D for gpu side conditional rendering
union PA_SC_VIZ_QUERY {
union alignas(uint32_t) PA_SC_VIZ_QUERY {
struct {
// the visibility of draws should be evaluated
uint32_t viz_query_ena : 1; // +0
@ -303,9 +401,10 @@ union PA_SC_VIZ_QUERY {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_PA_SC_VIZ_QUERY;
};
static_assert_size(PA_SC_VIZ_QUERY, sizeof(uint32_t));
// Clipper clip control
union PA_CL_CLIP_CNTL {
union alignas(uint32_t) PA_CL_CLIP_CNTL {
struct {
uint32_t ucp_ena_0 : 1; // +0
uint32_t ucp_ena_1 : 1; // +1
@ -328,9 +427,10 @@ union PA_CL_CLIP_CNTL {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_PA_CL_CLIP_CNTL;
};
static_assert_size(PA_CL_CLIP_CNTL, sizeof(uint32_t));
// Viewport transform engine control
union PA_CL_VTE_CNTL {
union alignas(uint32_t) PA_CL_VTE_CNTL {
struct {
uint32_t vport_x_scale_ena : 1; // +0
uint32_t vport_x_offset_ena : 1; // +1
@ -347,8 +447,31 @@ union PA_CL_VTE_CNTL {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_PA_CL_VTE_CNTL;
};
static_assert_size(PA_CL_VTE_CNTL, sizeof(uint32_t));
union PA_SC_WINDOW_OFFSET {
union alignas(uint32_t) PA_SC_SCREEN_SCISSOR_TL {
struct {
int32_t tl_x : 15; // +0
uint32_t : 1; // +15
int32_t tl_y : 15; // +16
};
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_PA_SC_SCREEN_SCISSOR_TL;
};
static_assert_size(PA_SC_SCREEN_SCISSOR_TL, sizeof(uint32_t));
union alignas(uint32_t) PA_SC_SCREEN_SCISSOR_BR {
struct {
int32_t br_x : 15; // +0
uint32_t : 1; // +15
int32_t br_y : 15; // +16
};
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR;
};
static_assert_size(PA_SC_SCREEN_SCISSOR_BR, sizeof(uint32_t));
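
Unlike the window scissor below, the screen scissor coordinates are declared as signed 15-bit bitfields, so negative guest values come back correctly sign-extended when read. A small sketch with a made-up raw value:

// Sketch: int32_t bitfields sign-extend (two's complement) on read.
reg::PA_SC_SCREEN_SCISSOR_TL scissor_tl;
scissor_tl.value = 0x7FFF7FFF;  // Made-up dword: tl_x = tl_y = -1.
int32_t x = scissor_tl.tl_x;    // -1, not 32767.
int32_t y = scissor_tl.tl_y;    // -1.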
union alignas(uint32_t) PA_SC_WINDOW_OFFSET {
struct {
int32_t window_x_offset : 15; // +0
uint32_t : 1; // +15
@ -357,8 +480,9 @@ union PA_SC_WINDOW_OFFSET {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_PA_SC_WINDOW_OFFSET;
};
static_assert_size(PA_SC_WINDOW_OFFSET, sizeof(uint32_t));
union PA_SC_WINDOW_SCISSOR_TL {
union alignas(uint32_t) PA_SC_WINDOW_SCISSOR_TL {
struct {
uint32_t tl_x : 14; // +0
uint32_t : 2; // +14
@ -369,8 +493,9 @@ union PA_SC_WINDOW_SCISSOR_TL {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL;
};
static_assert_size(PA_SC_WINDOW_SCISSOR_TL, sizeof(uint32_t));
union PA_SC_WINDOW_SCISSOR_BR {
union alignas(uint32_t) PA_SC_WINDOW_SCISSOR_BR {
struct {
uint32_t br_x : 14; // +0
uint32_t : 2; // +14
@ -379,6 +504,7 @@ union PA_SC_WINDOW_SCISSOR_BR {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR;
};
static_assert_size(PA_SC_WINDOW_SCISSOR_BR, sizeof(uint32_t));
/*******************************************************************************
___ ___
@ -388,15 +514,16 @@ union PA_SC_WINDOW_SCISSOR_BR {
*******************************************************************************/
union RB_MODECONTROL {
union alignas(uint32_t) RB_MODECONTROL {
struct {
xenos::ModeControl edram_mode : 3; // +0
};
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_RB_MODECONTROL;
};
static_assert_size(RB_MODECONTROL, sizeof(uint32_t));
union RB_SURFACE_INFO {
union alignas(uint32_t) RB_SURFACE_INFO {
struct {
uint32_t surface_pitch : 14; // +0 in pixels.
uint32_t : 2; // +14
@ -406,8 +533,9 @@ union RB_SURFACE_INFO {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_RB_SURFACE_INFO;
};
static_assert_size(RB_SURFACE_INFO, sizeof(uint32_t));
union RB_COLORCONTROL {
union alignas(uint32_t) RB_COLORCONTROL {
struct {
xenos::CompareFunction alpha_func : 3; // +0
uint32_t alpha_test_enable : 1; // +3
@ -455,8 +583,9 @@ union RB_COLORCONTROL {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_RB_COLORCONTROL;
};
static_assert_size(RB_COLORCONTROL, sizeof(uint32_t));
union RB_COLOR_INFO {
union alignas(uint32_t) RB_COLOR_INFO {
struct {
uint32_t color_base : 12; // +0 in tiles.
uint32_t : 4; // +12
@ -468,8 +597,9 @@ union RB_COLOR_INFO {
// RB_COLOR[1-3]_INFO also use this format.
static const Register rt_register_indices[4];
};
static_assert_size(RB_COLOR_INFO, sizeof(uint32_t));
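
color_base (like depth_base below) addresses the render target in EDRAM tiles rather than bytes. A hedged sketch of turning it into a byte offset, assuming the commonly cited Xenos tile size of 5120 bytes (80x16 samples at 4 bytes each), which is not defined by this header:

// Sketch: EDRAM tile index -> byte offset, under an assumed tile size.
constexpr uint32_t kEdramTileSizeBytes = 80 * 16 * 4;  // 5120, assumption.
reg::RB_COLOR_INFO color_info;
color_info.value = 0x00000003;  // Made-up dword: color_base = 3 tiles.
uint32_t edram_byte_offset = color_info.color_base * kEdramTileSizeBytes;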
union RB_COLOR_MASK {
union alignas(uint32_t) RB_COLOR_MASK {
struct {
uint32_t write_red0 : 1; // +0
uint32_t write_green0 : 1; // +1
@ -491,8 +621,9 @@ union RB_COLOR_MASK {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_RB_COLOR_MASK;
};
static_assert_size(RB_COLOR_MASK, sizeof(uint32_t));
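
The mask packs one RGBA nibble per render target (red0 at bit 0, green0 at bit 1, and so on), so the bits for a given render target can be pulled out with a shift, as in this sketch with a made-up raw value:

// Sketch: extract the 4-bit RGBA write mask for one render target.
reg::RB_COLOR_MASK color_mask;
color_mask.value = 0x00000F21;  // Made-up dword.
uint32_t rt_index = 2;          // Example render target index (0-3).
uint32_t rt_write_mask = (color_mask.value >> (rt_index * 4)) & 0xF;
bool writes_red = (rt_write_mask & 0x1) != 0;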
union RB_BLENDCONTROL {
union alignas(uint32_t) RB_BLENDCONTROL {
struct {
xenos::BlendFactor color_srcblend : 5; // +0
xenos::BlendOp color_comb_fcn : 3; // +5
@ -508,8 +639,9 @@ union RB_BLENDCONTROL {
static constexpr Register register_index = XE_GPU_REG_RB_BLENDCONTROL0;
static const Register rt_register_indices[4];
};
static_assert_size(RB_BLENDCONTROL, sizeof(uint32_t));
union RB_DEPTHCONTROL {
union alignas(uint32_t) RB_DEPTHCONTROL {
struct {
uint32_t stencil_enable : 1; // +0
uint32_t z_enable : 1; // +1
@ -530,8 +662,9 @@ union RB_DEPTHCONTROL {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_RB_DEPTHCONTROL;
};
static_assert_size(RB_DEPTHCONTROL, sizeof(uint32_t));
union RB_STENCILREFMASK {
union alignas(uint32_t) RB_STENCILREFMASK {
struct {
uint32_t stencilref : 8; // +0
uint32_t stencilmask : 8; // +8
@ -541,8 +674,9 @@ union RB_STENCILREFMASK {
static constexpr Register register_index = XE_GPU_REG_RB_STENCILREFMASK;
// RB_STENCILREFMASK_BF also uses this format.
};
static_assert_size(RB_STENCILREFMASK, sizeof(uint32_t));
union RB_DEPTH_INFO {
union alignas(uint32_t) RB_DEPTH_INFO {
struct {
uint32_t depth_base : 12; // +0 in tiles.
uint32_t : 4; // +12
@ -551,10 +685,11 @@ union RB_DEPTH_INFO {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_RB_DEPTH_INFO;
};
static_assert_size(RB_DEPTH_INFO, sizeof(uint32_t));
// Copy registers are very different from those on Adreno.
union RB_COPY_CONTROL {
union alignas(uint32_t) RB_COPY_CONTROL {
struct {
uint32_t copy_src_select : 3; // +0 Depth is 4.
uint32_t : 1; // +3
@ -568,8 +703,9 @@ union RB_COPY_CONTROL {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_RB_COPY_CONTROL;
};
static_assert_size(RB_COPY_CONTROL, sizeof(uint32_t));
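
As the field comment notes, copy_src_select values 0-3 pick a color render target while 4 means depth, so a resolve path would branch on that, roughly as follows (raw value made up for illustration):

// Sketch: distinguishing color vs. depth resolve sources.
reg::RB_COPY_CONTROL copy_control;
copy_control.value = 0x00000004;  // Made-up dword: copy_src_select == 4.
if (copy_control.copy_src_select <= 3) {
  // Color resolve: copy_src_select picks render target 0-3.
} else {
  // copy_src_select == 4: resolving the depth buffer.
}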
union RB_COPY_DEST_INFO {
union alignas(uint32_t) RB_COPY_DEST_INFO {
struct {
xenos::Endian128 copy_dest_endian : 3; // +0
uint32_t copy_dest_array : 1; // +3
@ -583,8 +719,9 @@ union RB_COPY_DEST_INFO {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_RB_COPY_DEST_INFO;
};
static_assert_size(RB_COPY_DEST_INFO, sizeof(uint32_t));
union RB_COPY_DEST_PITCH {
union alignas(uint32_t) RB_COPY_DEST_PITCH {
struct {
uint32_t copy_dest_pitch : 14; // +0
uint32_t : 2; // +14
@ -593,6 +730,7 @@ union RB_COPY_DEST_PITCH {
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_RB_COPY_DEST_PITCH;
};
static_assert_size(RB_COPY_DEST_PITCH, sizeof(uint32_t));
} // namespace reg

File diff suppressed because it is too large
Some files were not shown because too many files have changed in this diff